{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999398785546805, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.04592296, "auxiliary_loss_mlp": 0.0257779, "balance_loss_clip": 2.47145319, "balance_loss_mlp": 2.09008121, "epoch": 0.00012024289063909097, "flos": 24932483919360.0, "grad_norm": 40.29560017448091, "language_loss": 2.5798173, "learning_rate": 0.0, "loss": 1.90189219, "num_input_tokens_seen": 20375, "step": 1, "time_per_iteration": 13.533031463623047 }, { "auxiliary_loss_clip": 0.03096462, "auxiliary_loss_mlp": 0.016449, "balance_loss_clip": 1.65451014, "balance_loss_mlp": 1.32103169, "epoch": 0.00024048578127818193, "flos": 30664624377600.0, "grad_norm": 55.94489656313446, "language_loss": 1.89316106, "learning_rate": 5.021476677069823e-07, "loss": 1.94057465, "num_input_tokens_seen": 39035, "step": 2, "time_per_iteration": 2.4680161476135254 }, { "auxiliary_loss_clip": 0.03069692, "auxiliary_loss_mlp": 0.01669571, "balance_loss_clip": 1.65272069, "balance_loss_mlp": 1.34818316, "epoch": 0.0003607286719172729, "flos": 19026227969280.0, "grad_norm": 40.107296391464416, "language_loss": 1.61673808, "learning_rate": 7.958852231401551e-07, "loss": 1.66413069, "num_input_tokens_seen": 57600, "step": 3, "time_per_iteration": 2.34382700920105 }, { "auxiliary_loss_clip": 0.03077994, "auxiliary_loss_mlp": 0.0173602, "balance_loss_clip": 1.65171075, "balance_loss_mlp": 1.41310573, "epoch": 0.00048097156255636386, "flos": 19316314206720.0, "grad_norm": 36.87690991080161, "language_loss": 1.64510322, "learning_rate": 1.0042953354139647e-06, "loss": 1.69324338, "num_input_tokens_seen": 76465, "step": 4, "time_per_iteration": 2.4014604091644287 }, { "auxiliary_loss_clip": 0.03073864, "auxiliary_loss_mlp": 0.01667524, "balance_loss_clip": 1.65356755, "balance_loss_mlp": 1.35071325, "epoch": 0.0006012144531954548, "flos": 13991264893440.0, "grad_norm": 55.24854794788945, "language_loss": 1.93569183, "learning_rate": 1.1659507774310057e-06, "loss": 1.98310578, "num_input_tokens_seen": 94350, "step": 5, "time_per_iteration": 2.6521339416503906 }, { "auxiliary_loss_clip": 0.03084522, "auxiliary_loss_mlp": 0.01682753, "balance_loss_clip": 1.65810752, "balance_loss_mlp": 1.36117351, "epoch": 0.0007214573438345458, "flos": 23148988225920.0, "grad_norm": 45.194418366608836, "language_loss": 1.61075056, "learning_rate": 1.2980328908471373e-06, "loss": 1.6584233, "num_input_tokens_seen": 114595, "step": 6, "time_per_iteration": 2.793560028076172 }, { "auxiliary_loss_clip": 0.03138458, "auxiliary_loss_mlp": 0.01606507, "balance_loss_clip": 1.79562807, "balance_loss_mlp": 1.40661681, "epoch": 0.0008417002344736367, "flos": 67663246170240.0, "grad_norm": 4.613852836749538, "language_loss": 0.81504482, "learning_rate": 1.4097067265369432e-06, "loss": 0.86249447, "num_input_tokens_seen": 179590, "step": 7, "time_per_iteration": 3.0909554958343506 }, { "auxiliary_loss_clip": 0.0305004, "auxiliary_loss_mlp": 0.0170907, "balance_loss_clip": 1.64211285, "balance_loss_mlp": 1.39588332, "epoch": 0.0009619431251127277, "flos": 21281381504640.0, "grad_norm": 49.659725106120916, "language_loss": 1.582201, "learning_rate": 1.506443003120947e-06, "loss": 1.62979209, "num_input_tokens_seen": 195090, "step": 8, "time_per_iteration": 2.709663152694702 }, { "auxiliary_loss_clip": 0.03055894, "auxiliary_loss_mlp": 0.01697121, "balance_loss_clip": 1.64996243, "balance_loss_mlp": 1.37897575, "epoch": 0.0010821860157518186, "flos": 23331342597120.0, "grad_norm": 17.508890825397188, "language_loss": 1.47852767, "learning_rate": 1.5917704462803102e-06, "loss": 1.52605772, "num_input_tokens_seen": 211635, "step": 9, "time_per_iteration": 2.653956174850464 }, { "auxiliary_loss_clip": 0.03042046, "auxiliary_loss_mlp": 0.01654968, "balance_loss_clip": 1.64881778, "balance_loss_mlp": 1.3379668, "epoch": 0.0012024289063909096, "flos": 17010166337280.0, "grad_norm": 13.44495182275048, "language_loss": 1.52919805, "learning_rate": 1.6680984451379884e-06, "loss": 1.57616818, "num_input_tokens_seen": 224705, "step": 10, "time_per_iteration": 2.69167423248291 }, { "auxiliary_loss_clip": 0.03049854, "auxiliary_loss_mlp": 0.01682941, "balance_loss_clip": 1.64465737, "balance_loss_mlp": 1.37852824, "epoch": 0.0013226717970300007, "flos": 21288133261440.0, "grad_norm": 14.412222420014784, "language_loss": 1.3266654, "learning_rate": 1.7371455188905097e-06, "loss": 1.3739934, "num_input_tokens_seen": 244635, "step": 11, "time_per_iteration": 2.680786371231079 }, { "auxiliary_loss_clip": 0.03064559, "auxiliary_loss_mlp": 0.0170214, "balance_loss_clip": 1.64962065, "balance_loss_mlp": 1.37445784, "epoch": 0.0014429146876690916, "flos": 27237884935680.0, "grad_norm": 11.478639317387843, "language_loss": 1.25377536, "learning_rate": 1.8001805585541196e-06, "loss": 1.30144238, "num_input_tokens_seen": 265765, "step": 12, "time_per_iteration": 2.7420272827148438 }, { "auxiliary_loss_clip": 0.03044011, "auxiliary_loss_mlp": 0.01663496, "balance_loss_clip": 1.64189076, "balance_loss_mlp": 1.35603178, "epoch": 0.0015631575783081825, "flos": 19062174504960.0, "grad_norm": 6.593714778584316, "language_loss": 1.29324901, "learning_rate": 1.8581671739548328e-06, "loss": 1.34032416, "num_input_tokens_seen": 283500, "step": 13, "time_per_iteration": 2.669037103652954 }, { "auxiliary_loss_clip": 0.03039403, "auxiliary_loss_mlp": 0.01618412, "balance_loss_clip": 1.63795567, "balance_loss_mlp": 1.30617929, "epoch": 0.0016834004689472734, "flos": 48139473985920.0, "grad_norm": 6.260191131609957, "language_loss": 1.1350081, "learning_rate": 1.9118543942439254e-06, "loss": 1.18158627, "num_input_tokens_seen": 305685, "step": 14, "time_per_iteration": 2.8571760654449463 }, { "auxiliary_loss_clip": 0.03015065, "auxiliary_loss_mlp": 0.01677578, "balance_loss_clip": 1.6330477, "balance_loss_mlp": 1.36076784, "epoch": 0.0018036433595863645, "flos": 34970026314240.0, "grad_norm": 5.78064015765431, "language_loss": 1.12634933, "learning_rate": 1.961836000571161e-06, "loss": 1.17327571, "num_input_tokens_seen": 327340, "step": 15, "time_per_iteration": 2.760812997817993 }, { "auxiliary_loss_clip": 0.03027068, "auxiliary_loss_mlp": 0.01439522, "balance_loss_clip": 1.77060544, "balance_loss_mlp": 1.25031328, "epoch": 0.0019238862502254555, "flos": 59768284440960.0, "grad_norm": 3.756658059109441, "language_loss": 0.64692342, "learning_rate": 2.0085906708279293e-06, "loss": 0.69158936, "num_input_tokens_seen": 382710, "step": 16, "time_per_iteration": 5.50650429725647 }, { "auxiliary_loss_clip": 0.03001511, "auxiliary_loss_mlp": 0.01635746, "balance_loss_clip": 1.63701475, "balance_loss_mlp": 1.32484829, "epoch": 0.0020441291408645466, "flos": 20814543417600.0, "grad_norm": 4.349171996444234, "language_loss": 1.160671, "learning_rate": 2.0525099325728135e-06, "loss": 1.20704341, "num_input_tokens_seen": 400890, "step": 17, "time_per_iteration": 2.6450772285461426 }, { "auxiliary_loss_clip": 0.02990377, "auxiliary_loss_mlp": 0.01400776, "balance_loss_clip": 1.76002371, "balance_loss_mlp": 1.21461892, "epoch": 0.0021643720315036373, "flos": 63857001582720.0, "grad_norm": 3.5101266357191325, "language_loss": 0.72124314, "learning_rate": 2.0939181139872922e-06, "loss": 0.76515466, "num_input_tokens_seen": 462605, "step": 18, "time_per_iteration": 3.085275173187256 }, { "auxiliary_loss_clip": 0.02970693, "auxiliary_loss_mlp": 0.01583358, "balance_loss_clip": 1.63045931, "balance_loss_mlp": 1.28276038, "epoch": 0.0022846149221427284, "flos": 31284981192960.0, "grad_norm": 4.7102787974699085, "language_loss": 1.01558769, "learning_rate": 2.1330868934640175e-06, "loss": 1.06112814, "num_input_tokens_seen": 483280, "step": 19, "time_per_iteration": 2.696502208709717 }, { "auxiliary_loss_clip": 0.02941308, "auxiliary_loss_mlp": 0.01372513, "balance_loss_clip": 1.74952602, "balance_loss_mlp": 1.19017053, "epoch": 0.002404857812781819, "flos": 51083648161920.0, "grad_norm": 3.555214763055384, "language_loss": 0.76406908, "learning_rate": 2.170246112844971e-06, "loss": 0.80720729, "num_input_tokens_seen": 537620, "step": 20, "time_per_iteration": 2.8700852394104004 }, { "auxiliary_loss_clip": 0.02914774, "auxiliary_loss_mlp": 0.01538777, "balance_loss_clip": 1.61850083, "balance_loss_mlp": 1.23951364, "epoch": 0.0025251007034209102, "flos": 15815347309440.0, "grad_norm": 4.735304505590101, "language_loss": 1.01388359, "learning_rate": 2.2055919496770983e-06, "loss": 1.05841899, "num_input_tokens_seen": 555760, "step": 21, "time_per_iteration": 2.647648811340332 }, { "auxiliary_loss_clip": 0.02901449, "auxiliary_loss_mlp": 0.01523067, "balance_loss_clip": 1.61415291, "balance_loss_mlp": 1.22533035, "epoch": 0.0026453435940600014, "flos": 37851857458560.0, "grad_norm": 5.2540440972606985, "language_loss": 0.8961674, "learning_rate": 2.2392931865974923e-06, "loss": 0.94041252, "num_input_tokens_seen": 578450, "step": 22, "time_per_iteration": 2.8207290172576904 }, { "auxiliary_loss_clip": 0.02862227, "auxiliary_loss_mlp": 0.01506672, "balance_loss_clip": 1.60537946, "balance_loss_mlp": 1.21026993, "epoch": 0.002765586484699092, "flos": 21141976821120.0, "grad_norm": 4.620206115742967, "language_loss": 1.01830506, "learning_rate": 2.271496085962064e-06, "loss": 1.06199408, "num_input_tokens_seen": 596145, "step": 23, "time_per_iteration": 2.69580078125 }, { "auxiliary_loss_clip": 0.02834849, "auxiliary_loss_mlp": 0.01483805, "balance_loss_clip": 1.59239292, "balance_loss_mlp": 1.19102716, "epoch": 0.002885829375338183, "flos": 20667381396480.0, "grad_norm": 3.2009452444972335, "language_loss": 1.02576804, "learning_rate": 2.3023282262611022e-06, "loss": 1.06895471, "num_input_tokens_seen": 614920, "step": 24, "time_per_iteration": 2.6719298362731934 }, { "auxiliary_loss_clip": 0.02845174, "auxiliary_loss_mlp": 0.01484913, "balance_loss_clip": 1.60035145, "balance_loss_mlp": 1.20014632, "epoch": 0.003006072265977274, "flos": 34823869873920.0, "grad_norm": 3.8772769835335987, "language_loss": 0.92433882, "learning_rate": 2.3319015548620114e-06, "loss": 0.96763968, "num_input_tokens_seen": 636060, "step": 25, "time_per_iteration": 2.8007843494415283 }, { "auxiliary_loss_clip": 0.02802626, "auxiliary_loss_mlp": 0.01451476, "balance_loss_clip": 1.58807039, "balance_loss_mlp": 1.17510104, "epoch": 0.003126315156616365, "flos": 24422021118720.0, "grad_norm": 2.271328937384169, "language_loss": 0.92861021, "learning_rate": 2.3603148416618152e-06, "loss": 0.97115123, "num_input_tokens_seen": 655575, "step": 26, "time_per_iteration": 2.705730438232422 }, { "auxiliary_loss_clip": 0.02808334, "auxiliary_loss_mlp": 0.01435545, "balance_loss_clip": 1.58833385, "balance_loss_mlp": 1.16241288, "epoch": 0.003246558047255456, "flos": 23622326674560.0, "grad_norm": 2.294819173954996, "language_loss": 1.00859427, "learning_rate": 2.3876556694204647e-06, "loss": 1.05103302, "num_input_tokens_seen": 675730, "step": 27, "time_per_iteration": 2.707465410232544 }, { "auxiliary_loss_clip": 0.02768539, "auxiliary_loss_mlp": 0.01436921, "balance_loss_clip": 1.58165073, "balance_loss_mlp": 1.15062785, "epoch": 0.003366800937894547, "flos": 17820275725440.0, "grad_norm": 2.5769542258057805, "language_loss": 0.90632999, "learning_rate": 2.414002061950908e-06, "loss": 0.94838458, "num_input_tokens_seen": 694605, "step": 28, "time_per_iteration": 2.656536340713501 }, { "auxiliary_loss_clip": 0.0275307, "auxiliary_loss_mlp": 0.01410387, "balance_loss_clip": 1.5744226, "balance_loss_mlp": 1.14259565, "epoch": 0.003487043828533638, "flos": 24426115269120.0, "grad_norm": 2.3808094420721275, "language_loss": 0.9981423, "learning_rate": 2.4394238264681557e-06, "loss": 1.0397768, "num_input_tokens_seen": 714340, "step": 29, "time_per_iteration": 2.68894624710083 }, { "auxiliary_loss_clip": 0.02727111, "auxiliary_loss_mlp": 0.01410366, "balance_loss_clip": 1.56732988, "balance_loss_mlp": 1.1368525, "epoch": 0.003607286719172729, "flos": 26140311002880.0, "grad_norm": 2.0666836282045256, "language_loss": 0.99509072, "learning_rate": 2.4639836682781433e-06, "loss": 1.03646541, "num_input_tokens_seen": 734470, "step": 30, "time_per_iteration": 2.7190186977386475 }, { "auxiliary_loss_clip": 0.02741221, "auxiliary_loss_mlp": 0.01397796, "balance_loss_clip": 1.5814749, "balance_loss_mlp": 1.11760616, "epoch": 0.00372752960981182, "flos": 20593082113920.0, "grad_norm": 3.2524685202632178, "language_loss": 1.00112808, "learning_rate": 2.487738122623307e-06, "loss": 1.04251838, "num_input_tokens_seen": 753380, "step": 31, "time_per_iteration": 2.6225290298461914 }, { "auxiliary_loss_clip": 0.02697098, "auxiliary_loss_mlp": 0.01375693, "balance_loss_clip": 1.56319976, "balance_loss_mlp": 1.10332382, "epoch": 0.003847772500450911, "flos": 22674608282880.0, "grad_norm": 2.45100144899735, "language_loss": 0.98840976, "learning_rate": 2.510738338534912e-06, "loss": 1.02913761, "num_input_tokens_seen": 772105, "step": 32, "time_per_iteration": 2.649449348449707 }, { "auxiliary_loss_clip": 0.02559674, "auxiliary_loss_mlp": 0.01359604, "balance_loss_clip": 1.52039504, "balance_loss_mlp": 1.09200263, "epoch": 0.003968015391090002, "flos": 17967796882560.0, "grad_norm": 2.721918852117027, "language_loss": 1.02625132, "learning_rate": 2.5330307420306648e-06, "loss": 1.06544399, "num_input_tokens_seen": 788955, "step": 33, "time_per_iteration": 2.6268293857574463 }, { "auxiliary_loss_clip": 0.02520656, "auxiliary_loss_mlp": 0.01344013, "balance_loss_clip": 1.51562619, "balance_loss_mlp": 1.10006297, "epoch": 0.004088258281729093, "flos": 27304103658240.0, "grad_norm": 2.643964959570546, "language_loss": 0.88083756, "learning_rate": 2.554657600279796e-06, "loss": 0.91948426, "num_input_tokens_seen": 810230, "step": 34, "time_per_iteration": 2.7650465965270996 }, { "auxiliary_loss_clip": 0.02501756, "auxiliary_loss_mlp": 0.01324716, "balance_loss_clip": 1.50747418, "balance_loss_mlp": 1.07313645, "epoch": 0.004208501172368184, "flos": 23258587599360.0, "grad_norm": 2.1435482384685067, "language_loss": 1.03435397, "learning_rate": 2.5756575039679493e-06, "loss": 1.07261872, "num_input_tokens_seen": 829780, "step": 35, "time_per_iteration": 2.741372585296631 }, { "auxiliary_loss_clip": 0.02464117, "auxiliary_loss_mlp": 0.01351179, "balance_loss_clip": 1.49650097, "balance_loss_mlp": 1.09998131, "epoch": 0.0043287440630072746, "flos": 17312104062720.0, "grad_norm": 1.9700471122117738, "language_loss": 0.94982445, "learning_rate": 2.5960657816942747e-06, "loss": 0.98797739, "num_input_tokens_seen": 848695, "step": 36, "time_per_iteration": 2.711890935897827 }, { "auxiliary_loss_clip": 0.02303775, "auxiliary_loss_mlp": 0.01393126, "balance_loss_clip": 1.57803583, "balance_loss_mlp": 1.25198257, "epoch": 0.004448986953646365, "flos": 53092491160320.0, "grad_norm": 1.398194431895382, "language_loss": 0.60974598, "learning_rate": 2.6159148575788668e-06, "loss": 0.64671493, "num_input_tokens_seen": 906730, "step": 37, "time_per_iteration": 3.112426996231079 }, { "auxiliary_loss_clip": 0.02412854, "auxiliary_loss_mlp": 0.013603, "balance_loss_clip": 1.48573279, "balance_loss_mlp": 1.1108191, "epoch": 0.004569229844285457, "flos": 13444165866240.0, "grad_norm": 2.5257416776216592, "language_loss": 0.98745996, "learning_rate": 2.635234561171e-06, "loss": 1.02519155, "num_input_tokens_seen": 925125, "step": 38, "time_per_iteration": 2.670102596282959 }, { "auxiliary_loss_clip": 0.02388864, "auxiliary_loss_mlp": 0.01327486, "balance_loss_clip": 1.47793186, "balance_loss_mlp": 1.09345436, "epoch": 0.0046894727349245475, "flos": 16209609966720.0, "grad_norm": 2.2874117518999193, "language_loss": 0.94215786, "learning_rate": 2.6540523970949877e-06, "loss": 0.97932136, "num_input_tokens_seen": 939970, "step": 39, "time_per_iteration": 2.7218737602233887 }, { "auxiliary_loss_clip": 0.02357519, "auxiliary_loss_mlp": 0.01333414, "balance_loss_clip": 1.47663307, "balance_loss_mlp": 1.09728432, "epoch": 0.004809715625563638, "flos": 23914244505600.0, "grad_norm": 2.6644391722813188, "language_loss": 0.92516059, "learning_rate": 2.6723937805519533e-06, "loss": 0.96206993, "num_input_tokens_seen": 957470, "step": 40, "time_per_iteration": 2.7147934436798096 }, { "auxiliary_loss_clip": 0.02345942, "auxiliary_loss_mlp": 0.01304425, "balance_loss_clip": 1.46480012, "balance_loss_mlp": 1.07916677, "epoch": 0.00492995851620273, "flos": 20773030273920.0, "grad_norm": 2.171423286546705, "language_loss": 0.92990512, "learning_rate": 2.690282243737839e-06, "loss": 0.96640879, "num_input_tokens_seen": 976405, "step": 41, "time_per_iteration": 2.6997878551483154 }, { "auxiliary_loss_clip": 0.02310364, "auxiliary_loss_mlp": 0.01327203, "balance_loss_clip": 1.45331836, "balance_loss_mlp": 1.09736788, "epoch": 0.0050502014068418205, "flos": 20338655103360.0, "grad_norm": 2.662969012086038, "language_loss": 0.99230605, "learning_rate": 2.7077396173840807e-06, "loss": 1.02868176, "num_input_tokens_seen": 994690, "step": 42, "time_per_iteration": 3.478790521621704 }, { "auxiliary_loss_clip": 0.02285866, "auxiliary_loss_mlp": 0.01314746, "balance_loss_clip": 1.44597411, "balance_loss_mlp": 1.09521067, "epoch": 0.005170444297480911, "flos": 25994872834560.0, "grad_norm": 2.6198556118564804, "language_loss": 0.92591667, "learning_rate": 2.7247861909342594e-06, "loss": 0.96192282, "num_input_tokens_seen": 1015615, "step": 43, "time_per_iteration": 4.0540406703948975 }, { "auxiliary_loss_clip": 0.02282108, "auxiliary_loss_mlp": 0.01309658, "balance_loss_clip": 1.44477665, "balance_loss_mlp": 1.09345984, "epoch": 0.005290687188120003, "flos": 20954055841920.0, "grad_norm": 2.286658410201937, "language_loss": 0.8293969, "learning_rate": 2.7414408543044743e-06, "loss": 0.86531454, "num_input_tokens_seen": 1031255, "step": 44, "time_per_iteration": 2.6883347034454346 }, { "auxiliary_loss_clip": 0.02231397, "auxiliary_loss_mlp": 0.01332251, "balance_loss_clip": 1.43015862, "balance_loss_mlp": 1.11261952, "epoch": 0.005410930078759093, "flos": 15851401585920.0, "grad_norm": 10.068546149385137, "language_loss": 0.79058111, "learning_rate": 2.7577212237113157e-06, "loss": 0.82621753, "num_input_tokens_seen": 1048295, "step": 45, "time_per_iteration": 2.682742118835449 }, { "auxiliary_loss_clip": 0.02217989, "auxiliary_loss_mlp": 0.01308043, "balance_loss_clip": 1.42400014, "balance_loss_mlp": 1.09499252, "epoch": 0.005531172969398184, "flos": 21104988791040.0, "grad_norm": 2.1281741110699595, "language_loss": 1.04237497, "learning_rate": 2.7736437536690466e-06, "loss": 1.07763529, "num_input_tokens_seen": 1067925, "step": 46, "time_per_iteration": 2.7103476524353027 }, { "auxiliary_loss_clip": 0.02207815, "auxiliary_loss_mlp": 0.01276385, "balance_loss_clip": 1.42449737, "balance_loss_mlp": 1.06781626, "epoch": 0.005651415860037276, "flos": 20844887431680.0, "grad_norm": 2.3266364602082144, "language_loss": 1.07720137, "learning_rate": 2.789223836941131e-06, "loss": 1.1120435, "num_input_tokens_seen": 1088060, "step": 47, "time_per_iteration": 2.6737117767333984 }, { "auxiliary_loss_clip": 0.02172153, "auxiliary_loss_mlp": 0.01285319, "balance_loss_clip": 1.41179073, "balance_loss_mlp": 1.08085096, "epoch": 0.005771658750676366, "flos": 13260195383040.0, "grad_norm": 2.388761675367142, "language_loss": 1.08692598, "learning_rate": 2.8044758939680847e-06, "loss": 1.12150085, "num_input_tokens_seen": 1104130, "step": 48, "time_per_iteration": 2.612104892730713 }, { "auxiliary_loss_clip": 0.02147161, "auxiliary_loss_mlp": 0.01282264, "balance_loss_clip": 1.41145968, "balance_loss_mlp": 1.07655692, "epoch": 0.005891901641315457, "flos": 24425396997120.0, "grad_norm": 7.205833187648408, "language_loss": 1.02130783, "learning_rate": 2.8194134530738863e-06, "loss": 1.05560207, "num_input_tokens_seen": 1122900, "step": 49, "time_per_iteration": 2.681382894515991 }, { "auxiliary_loss_clip": 0.02139021, "auxiliary_loss_mlp": 0.01290139, "balance_loss_clip": 1.40579212, "balance_loss_mlp": 1.09387314, "epoch": 0.006012144531954548, "flos": 23076197314560.0, "grad_norm": 2.558727094572549, "language_loss": 0.90087247, "learning_rate": 2.834049222568994e-06, "loss": 0.93516409, "num_input_tokens_seen": 1140250, "step": 50, "time_per_iteration": 2.67350697517395 }, { "auxiliary_loss_clip": 0.02135085, "auxiliary_loss_mlp": 0.01255022, "balance_loss_clip": 1.40258718, "balance_loss_mlp": 1.06400096, "epoch": 0.006132387422593639, "flos": 22528775064960.0, "grad_norm": 1.8717953273445211, "language_loss": 0.92423445, "learning_rate": 2.848395155712969e-06, "loss": 0.95813555, "num_input_tokens_seen": 1160470, "step": 51, "time_per_iteration": 2.6298584938049316 }, { "auxiliary_loss_clip": 0.02118313, "auxiliary_loss_mlp": 0.01293401, "balance_loss_clip": 1.40357351, "balance_loss_mlp": 1.09875619, "epoch": 0.00625263031323273, "flos": 27628340751360.0, "grad_norm": 2.312437719329922, "language_loss": 0.97569245, "learning_rate": 2.8624625093687977e-06, "loss": 1.00980961, "num_input_tokens_seen": 1177605, "step": 52, "time_per_iteration": 2.705409288406372 }, { "auxiliary_loss_clip": 0.02100515, "auxiliary_loss_mlp": 0.0125987, "balance_loss_clip": 1.39357424, "balance_loss_mlp": 1.07705033, "epoch": 0.006372873203871821, "flos": 23110671392640.0, "grad_norm": 2.413575028050906, "language_loss": 0.88998222, "learning_rate": 2.876261897070029e-06, "loss": 0.92358613, "num_input_tokens_seen": 1197735, "step": 53, "time_per_iteration": 2.630678415298462 }, { "auxiliary_loss_clip": 0.02099512, "auxiliary_loss_mlp": 0.01277437, "balance_loss_clip": 1.39772594, "balance_loss_mlp": 1.09194767, "epoch": 0.006493116094510912, "flos": 22856028900480.0, "grad_norm": 2.470427108327774, "language_loss": 0.92309314, "learning_rate": 2.889803337127447e-06, "loss": 0.95686257, "num_input_tokens_seen": 1216335, "step": 54, "time_per_iteration": 2.6569180488586426 }, { "auxiliary_loss_clip": 0.02068233, "auxiliary_loss_mlp": 0.01296732, "balance_loss_clip": 1.38537741, "balance_loss_mlp": 1.10141969, "epoch": 0.006613358985150003, "flos": 23071708114560.0, "grad_norm": 4.798285227810707, "language_loss": 0.84695476, "learning_rate": 2.903096296321516e-06, "loss": 0.88060445, "num_input_tokens_seen": 1234480, "step": 55, "time_per_iteration": 2.6526994705200195 }, { "auxiliary_loss_clip": 0.02069055, "auxiliary_loss_mlp": 0.01250531, "balance_loss_clip": 1.38686192, "balance_loss_mlp": 1.07524538, "epoch": 0.006733601875789094, "flos": 26537662229760.0, "grad_norm": 1.9678578099806474, "language_loss": 0.91687977, "learning_rate": 2.9161497296578907e-06, "loss": 0.95007569, "num_input_tokens_seen": 1253870, "step": 56, "time_per_iteration": 2.70420241355896 }, { "auxiliary_loss_clip": 0.02049681, "auxiliary_loss_mlp": 0.01253205, "balance_loss_clip": 1.38082504, "balance_loss_mlp": 1.07563078, "epoch": 0.006853844766428185, "flos": 15523178083200.0, "grad_norm": 2.169600513230549, "language_loss": 0.85848916, "learning_rate": 2.928972116604173e-06, "loss": 0.891518, "num_input_tokens_seen": 1270145, "step": 57, "time_per_iteration": 2.5955557823181152 }, { "auxiliary_loss_clip": 0.02021701, "auxiliary_loss_mlp": 0.01234647, "balance_loss_clip": 1.37206626, "balance_loss_mlp": 1.0678494, "epoch": 0.006974087657067276, "flos": 24243760897920.0, "grad_norm": 5.973243230344974, "language_loss": 1.02024364, "learning_rate": 2.9415714941751377e-06, "loss": 1.05280709, "num_input_tokens_seen": 1291365, "step": 58, "time_per_iteration": 2.651109218597412 }, { "auxiliary_loss_clip": 0.02038122, "auxiliary_loss_mlp": 0.01256678, "balance_loss_clip": 1.37371969, "balance_loss_mlp": 1.08892632, "epoch": 0.007094330547706367, "flos": 25772513690880.0, "grad_norm": 1.8730097132514536, "language_loss": 0.93521208, "learning_rate": 2.9539554871897396e-06, "loss": 0.96816009, "num_input_tokens_seen": 1311535, "step": 59, "time_per_iteration": 2.6681530475616455 }, { "auxiliary_loss_clip": 0.02006071, "auxiliary_loss_mlp": 0.01242539, "balance_loss_clip": 1.36593437, "balance_loss_mlp": 1.07907915, "epoch": 0.007214573438345458, "flos": 21319015979520.0, "grad_norm": 2.527464112918325, "language_loss": 0.97403991, "learning_rate": 2.9661313359851253e-06, "loss": 1.00652599, "num_input_tokens_seen": 1329420, "step": 60, "time_per_iteration": 2.6241652965545654 }, { "auxiliary_loss_clip": 0.01984811, "auxiliary_loss_mlp": 0.01236678, "balance_loss_clip": 1.36365652, "balance_loss_mlp": 1.07741404, "epoch": 0.007334816328984549, "flos": 24937088192640.0, "grad_norm": 6.292004761511977, "language_loss": 0.94155359, "learning_rate": 2.978105921839922e-06, "loss": 0.97376847, "num_input_tokens_seen": 1349965, "step": 61, "time_per_iteration": 2.7245492935180664 }, { "auxiliary_loss_clip": 0.01971468, "auxiliary_loss_mlp": 0.01249403, "balance_loss_clip": 1.35985589, "balance_loss_mlp": 1.08889973, "epoch": 0.00745505921962364, "flos": 18510586277760.0, "grad_norm": 2.265863591350538, "language_loss": 0.72096038, "learning_rate": 2.9898857903302893e-06, "loss": 0.75316906, "num_input_tokens_seen": 1368915, "step": 62, "time_per_iteration": 2.6413767337799072 }, { "auxiliary_loss_clip": 0.01974162, "auxiliary_loss_mlp": 0.01254663, "balance_loss_clip": 1.35824609, "balance_loss_mlp": 1.08986795, "epoch": 0.007575302110262731, "flos": 18477656484480.0, "grad_norm": 3.620721746446778, "language_loss": 0.8797034, "learning_rate": 3.001477172817253e-06, "loss": 0.9119916, "num_input_tokens_seen": 1386805, "step": 63, "time_per_iteration": 2.5936529636383057 }, { "auxiliary_loss_clip": 0.01950344, "auxiliary_loss_mlp": 0.01226837, "balance_loss_clip": 1.35019684, "balance_loss_mlp": 1.07834983, "epoch": 0.007695545000901822, "flos": 24973178382720.0, "grad_norm": 2.721068187518123, "language_loss": 0.9643023, "learning_rate": 3.012886006241894e-06, "loss": 0.99607414, "num_input_tokens_seen": 1406190, "step": 64, "time_per_iteration": 2.636819839477539 }, { "auxiliary_loss_clip": 0.01955315, "auxiliary_loss_mlp": 0.01230422, "balance_loss_clip": 1.35175109, "balance_loss_mlp": 1.07440138, "epoch": 0.007815787891540913, "flos": 21324223451520.0, "grad_norm": 2.0721214351494353, "language_loss": 0.88308674, "learning_rate": 3.0241179513858383e-06, "loss": 0.91494405, "num_input_tokens_seen": 1425500, "step": 65, "time_per_iteration": 2.624528646469116 }, { "auxiliary_loss_clip": 0.01937021, "auxiliary_loss_mlp": 0.01255913, "balance_loss_clip": 1.34111071, "balance_loss_mlp": 1.09264445, "epoch": 0.007936030782180003, "flos": 21575777374080.0, "grad_norm": 2.369998740161197, "language_loss": 0.87739629, "learning_rate": 3.035178409737647e-06, "loss": 0.9093256, "num_input_tokens_seen": 1442950, "step": 66, "time_per_iteration": 2.6210875511169434 }, { "auxiliary_loss_clip": 0.01916913, "auxiliary_loss_mlp": 0.01219577, "balance_loss_clip": 1.33460462, "balance_loss_mlp": 1.08339238, "epoch": 0.008056273672819095, "flos": 20120785159680.0, "grad_norm": 2.271438418590118, "language_loss": 0.88988554, "learning_rate": 3.046072539090907e-06, "loss": 0.92125046, "num_input_tokens_seen": 1460915, "step": 67, "time_per_iteration": 2.658783435821533 }, { "auxiliary_loss_clip": 0.01911898, "auxiliary_loss_mlp": 0.01215404, "balance_loss_clip": 1.33377588, "balance_loss_mlp": 1.07292461, "epoch": 0.008176516563458186, "flos": 18333116156160.0, "grad_norm": 2.5127764173045657, "language_loss": 1.04831719, "learning_rate": 3.056805267986779e-06, "loss": 1.07959032, "num_input_tokens_seen": 1478385, "step": 68, "time_per_iteration": 2.6134274005889893 }, { "auxiliary_loss_clip": 0.01894127, "auxiliary_loss_mlp": 0.01219922, "balance_loss_clip": 1.32747531, "balance_loss_mlp": 1.08106661, "epoch": 0.008296759454097276, "flos": 21872076664320.0, "grad_norm": 2.294801356368828, "language_loss": 0.95265126, "learning_rate": 3.0673813091022194e-06, "loss": 0.98379177, "num_input_tokens_seen": 1497605, "step": 69, "time_per_iteration": 3.514554500579834 }, { "auxiliary_loss_clip": 0.01747915, "auxiliary_loss_mlp": 0.0119753, "balance_loss_clip": 1.3496182, "balance_loss_mlp": 1.13153541, "epoch": 0.008417002344736368, "flos": 63408228036480.0, "grad_norm": 1.2700818634017939, "language_loss": 0.62012094, "learning_rate": 3.0778051716749317e-06, "loss": 0.64957529, "num_input_tokens_seen": 1561150, "step": 70, "time_per_iteration": 4.728844165802002 }, { "auxiliary_loss_clip": 0.01866523, "auxiliary_loss_mlp": 0.01209449, "balance_loss_clip": 1.30933714, "balance_loss_mlp": 1.07231104, "epoch": 0.008537245235375458, "flos": 22966454286720.0, "grad_norm": 2.358332988880353, "language_loss": 0.90341187, "learning_rate": 3.0880811730470094e-06, "loss": 0.93417162, "num_input_tokens_seen": 1580605, "step": 71, "time_per_iteration": 2.62558913230896 }, { "auxiliary_loss_clip": 0.01714101, "auxiliary_loss_mlp": 0.01158795, "balance_loss_clip": 1.32948065, "balance_loss_mlp": 1.09737861, "epoch": 0.008657488126014549, "flos": 61984046712960.0, "grad_norm": 1.1377262086437336, "language_loss": 0.58551824, "learning_rate": 3.098213449401257e-06, "loss": 0.6142472, "num_input_tokens_seen": 1647535, "step": 72, "time_per_iteration": 3.1147620677948 }, { "auxiliary_loss_clip": 0.01856985, "auxiliary_loss_mlp": 0.01213376, "balance_loss_clip": 1.30769348, "balance_loss_mlp": 1.08215058, "epoch": 0.00877773101665364, "flos": 30296791152000.0, "grad_norm": 2.159606984850392, "language_loss": 0.98988783, "learning_rate": 3.1082059657570015e-06, "loss": 1.0205915, "num_input_tokens_seen": 1666770, "step": 73, "time_per_iteration": 2.6980302333831787 }, { "auxiliary_loss_clip": 0.01828996, "auxiliary_loss_mlp": 0.01201452, "balance_loss_clip": 1.29911363, "balance_loss_mlp": 1.06755626, "epoch": 0.00889797390729273, "flos": 23514056104320.0, "grad_norm": 2.902644079794967, "language_loss": 0.96767902, "learning_rate": 3.1180625252858496e-06, "loss": 0.99798346, "num_input_tokens_seen": 1685200, "step": 74, "time_per_iteration": 2.63293194770813 }, { "auxiliary_loss_clip": 0.01812889, "auxiliary_loss_mlp": 0.01209509, "balance_loss_clip": 1.28972054, "balance_loss_mlp": 1.08362412, "epoch": 0.009018216797931822, "flos": 23075838178560.0, "grad_norm": 2.637849473753041, "language_loss": 0.80078471, "learning_rate": 3.1277867780021663e-06, "loss": 0.83100867, "num_input_tokens_seen": 1701835, "step": 75, "time_per_iteration": 2.6503360271453857 }, { "auxiliary_loss_clip": 0.01791304, "auxiliary_loss_mlp": 0.01180957, "balance_loss_clip": 1.28232956, "balance_loss_mlp": 1.06537127, "epoch": 0.009138459688570914, "flos": 15918877284480.0, "grad_norm": 2.788882816295574, "language_loss": 0.95595336, "learning_rate": 3.1373822288779824e-06, "loss": 0.98567593, "num_input_tokens_seen": 1718415, "step": 76, "time_per_iteration": 2.6364893913269043 }, { "auxiliary_loss_clip": 0.01790247, "auxiliary_loss_mlp": 0.01211052, "balance_loss_clip": 1.28385448, "balance_loss_mlp": 1.08821845, "epoch": 0.009258702579210003, "flos": 27016531372800.0, "grad_norm": 3.180093307409489, "language_loss": 0.79535246, "learning_rate": 3.1468522454274533e-06, "loss": 0.82536548, "num_input_tokens_seen": 1738770, "step": 77, "time_per_iteration": 2.7492098808288574 }, { "auxiliary_loss_clip": 0.01780251, "auxiliary_loss_mlp": 0.0119471, "balance_loss_clip": 1.27814174, "balance_loss_mlp": 1.07387948, "epoch": 0.009378945469849095, "flos": 26903196984960.0, "grad_norm": 1.9469720948869846, "language_loss": 0.91859007, "learning_rate": 3.15620006480197e-06, "loss": 0.9483397, "num_input_tokens_seen": 1758040, "step": 78, "time_per_iteration": 2.665693998336792 }, { "auxiliary_loss_clip": 0.01776904, "auxiliary_loss_mlp": 0.01187715, "balance_loss_clip": 1.27530289, "balance_loss_mlp": 1.06774259, "epoch": 0.009499188360488187, "flos": 35694236327040.0, "grad_norm": 4.061064857383708, "language_loss": 0.74933708, "learning_rate": 3.1654288004333087e-06, "loss": 0.77898324, "num_input_tokens_seen": 1776705, "step": 79, "time_per_iteration": 2.714592933654785 }, { "auxiliary_loss_clip": 0.01755734, "auxiliary_loss_mlp": 0.01180228, "balance_loss_clip": 1.2690351, "balance_loss_mlp": 1.07036471, "epoch": 0.009619431251127276, "flos": 21503201944320.0, "grad_norm": 2.4294124591910675, "language_loss": 0.76052099, "learning_rate": 3.1745414482589353e-06, "loss": 0.78988063, "num_input_tokens_seen": 1795915, "step": 80, "time_per_iteration": 2.632139205932617 }, { "auxiliary_loss_clip": 0.01745899, "auxiliary_loss_mlp": 0.01173838, "balance_loss_clip": 1.26470017, "balance_loss_mlp": 1.06268764, "epoch": 0.009739674141766368, "flos": 17421056991360.0, "grad_norm": 3.186390088460552, "language_loss": 0.87151778, "learning_rate": 3.1835408925606204e-06, "loss": 0.90071511, "num_input_tokens_seen": 1814055, "step": 81, "time_per_iteration": 2.6621835231781006 }, { "auxiliary_loss_clip": 0.01725373, "auxiliary_loss_mlp": 0.01186511, "balance_loss_clip": 1.25754762, "balance_loss_mlp": 1.07650506, "epoch": 0.00985991703240546, "flos": 27527109246720.0, "grad_norm": 5.285462304250921, "language_loss": 0.89346701, "learning_rate": 3.1924299114448214e-06, "loss": 0.92258584, "num_input_tokens_seen": 1834535, "step": 82, "time_per_iteration": 2.671375274658203 }, { "auxiliary_loss_clip": 0.01735232, "auxiliary_loss_mlp": 0.01187334, "balance_loss_clip": 1.2621932, "balance_loss_mlp": 1.07756591, "epoch": 0.00998015992304455, "flos": 13808084509440.0, "grad_norm": 2.534205593001497, "language_loss": 0.83327866, "learning_rate": 3.2012111819909055e-06, "loss": 0.86250436, "num_input_tokens_seen": 1851865, "step": 83, "time_per_iteration": 2.600539445877075 }, { "auxiliary_loss_clip": 0.01723025, "auxiliary_loss_mlp": 0.01179222, "balance_loss_clip": 1.25518084, "balance_loss_mlp": 1.07226741, "epoch": 0.010100402813683641, "flos": 20191385341440.0, "grad_norm": 2.138824840512966, "language_loss": 0.95058811, "learning_rate": 3.2098872850910627e-06, "loss": 0.97961056, "num_input_tokens_seen": 1868540, "step": 84, "time_per_iteration": 2.599152088165283 }, { "auxiliary_loss_clip": 0.01720412, "auxiliary_loss_mlp": 0.01179937, "balance_loss_clip": 1.25694168, "balance_loss_mlp": 1.07703519, "epoch": 0.010220645704322733, "flos": 17201642762880.0, "grad_norm": 2.1010529484399623, "language_loss": 0.89264512, "learning_rate": 3.2184607100038194e-06, "loss": 0.92164862, "num_input_tokens_seen": 1887180, "step": 85, "time_per_iteration": 2.614490270614624 }, { "auxiliary_loss_clip": 0.01718053, "auxiliary_loss_mlp": 0.01182105, "balance_loss_clip": 1.25722146, "balance_loss_mlp": 1.08063459, "epoch": 0.010340888594961822, "flos": 21470415805440.0, "grad_norm": 5.072650567508388, "language_loss": 0.93178141, "learning_rate": 3.2269338586412414e-06, "loss": 0.960783, "num_input_tokens_seen": 1904765, "step": 86, "time_per_iteration": 2.6865406036376953 }, { "auxiliary_loss_clip": 0.01704298, "auxiliary_loss_mlp": 0.01173039, "balance_loss_clip": 1.24989474, "balance_loss_mlp": 1.0777669, "epoch": 0.010461131485600914, "flos": 23002831785600.0, "grad_norm": 2.5050926268469746, "language_loss": 0.96704745, "learning_rate": 3.2353090496083106e-06, "loss": 0.99582082, "num_input_tokens_seen": 1922600, "step": 87, "time_per_iteration": 2.676307439804077 }, { "auxiliary_loss_clip": 0.01681535, "auxiliary_loss_mlp": 0.01172089, "balance_loss_clip": 1.24031413, "balance_loss_mlp": 1.082968, "epoch": 0.010581374376240005, "flos": 33546850571520.0, "grad_norm": 1.9442446928374768, "language_loss": 0.81370449, "learning_rate": 3.2435885220114572e-06, "loss": 0.84224069, "num_input_tokens_seen": 1943950, "step": 88, "time_per_iteration": 2.7299387454986572 }, { "auxiliary_loss_clip": 0.0169003, "auxiliary_loss_mlp": 0.01156504, "balance_loss_clip": 1.24699545, "balance_loss_mlp": 1.06290126, "epoch": 0.010701617266879095, "flos": 21763087822080.0, "grad_norm": 3.6004270120009805, "language_loss": 0.93844104, "learning_rate": 3.2517744390519113e-06, "loss": 0.96690637, "num_input_tokens_seen": 1962815, "step": 89, "time_per_iteration": 2.6142044067382812 }, { "auxiliary_loss_clip": 0.01676169, "auxiliary_loss_mlp": 0.01156133, "balance_loss_clip": 1.23301625, "balance_loss_mlp": 1.06782269, "epoch": 0.010821860157518187, "flos": 19060199256960.0, "grad_norm": 2.431057334734209, "language_loss": 0.75148028, "learning_rate": 3.259868891418298e-06, "loss": 0.77980328, "num_input_tokens_seen": 1980580, "step": 90, "time_per_iteration": 2.592013359069824 }, { "auxiliary_loss_clip": 0.01684397, "auxiliary_loss_mlp": 0.01192342, "balance_loss_clip": 1.24279797, "balance_loss_mlp": 1.10045588, "epoch": 0.010942103048157278, "flos": 25447378757760.0, "grad_norm": 2.02081501395768, "language_loss": 0.8498618, "learning_rate": 3.2678739004917757e-06, "loss": 0.87862921, "num_input_tokens_seen": 2000315, "step": 91, "time_per_iteration": 2.637687921524048 }, { "auxiliary_loss_clip": 0.0166731, "auxiliary_loss_mlp": 0.01170394, "balance_loss_clip": 1.23643279, "balance_loss_mlp": 1.08461094, "epoch": 0.011062345938796368, "flos": 27493928058240.0, "grad_norm": 3.0767886788724335, "language_loss": 0.92124963, "learning_rate": 3.275791421376029e-06, "loss": 0.94962668, "num_input_tokens_seen": 2023760, "step": 92, "time_per_iteration": 2.6677346229553223 }, { "auxiliary_loss_clip": 0.01656367, "auxiliary_loss_mlp": 0.01147862, "balance_loss_clip": 1.2288785, "balance_loss_mlp": 1.0688982, "epoch": 0.01118258882943546, "flos": 16071210864000.0, "grad_norm": 2.4559507604207944, "language_loss": 0.96172017, "learning_rate": 3.2836233457634622e-06, "loss": 0.98976243, "num_input_tokens_seen": 2041895, "step": 93, "time_per_iteration": 2.5751163959503174 }, { "auxiliary_loss_clip": 0.01653702, "auxiliary_loss_mlp": 0.01183245, "balance_loss_clip": 1.22804332, "balance_loss_mlp": 1.08911729, "epoch": 0.011302831720074551, "flos": 20668602458880.0, "grad_norm": 3.2201338679645133, "language_loss": 0.85535169, "learning_rate": 3.2913715046481135e-06, "loss": 0.88372111, "num_input_tokens_seen": 2061640, "step": 94, "time_per_iteration": 2.6139183044433594 }, { "auxiliary_loss_clip": 0.01650393, "auxiliary_loss_mlp": 0.0116086, "balance_loss_clip": 1.22666478, "balance_loss_mlp": 1.08041704, "epoch": 0.011423074610713641, "flos": 13072238490240.0, "grad_norm": 6.133333790693309, "language_loss": 0.88872576, "learning_rate": 3.299037670895023e-06, "loss": 0.91683829, "num_input_tokens_seen": 2078255, "step": 95, "time_per_iteration": 2.619093418121338 }, { "auxiliary_loss_clip": 0.01652527, "auxiliary_loss_mlp": 0.01147409, "balance_loss_clip": 1.23226905, "balance_loss_mlp": 1.06486893, "epoch": 0.011543317501352733, "flos": 30335646689280.0, "grad_norm": 3.7049022681075114, "language_loss": 0.80362546, "learning_rate": 3.3066235616750667e-06, "loss": 0.83162487, "num_input_tokens_seen": 2099490, "step": 96, "time_per_iteration": 2.6571922302246094 }, { "auxiliary_loss_clip": 0.01631608, "auxiliary_loss_mlp": 0.01143059, "balance_loss_clip": 1.21968937, "balance_loss_mlp": 1.06500113, "epoch": 0.011663560391991824, "flos": 15522962601600.0, "grad_norm": 4.102507524041367, "language_loss": 0.9242425, "learning_rate": 3.3141308407736276e-06, "loss": 0.95198917, "num_input_tokens_seen": 2116125, "step": 97, "time_per_iteration": 4.277625560760498 }, { "auxiliary_loss_clip": 0.01637193, "auxiliary_loss_mlp": 0.01148706, "balance_loss_clip": 1.21698785, "balance_loss_mlp": 1.07117188, "epoch": 0.011783803282630914, "flos": 19902125116800.0, "grad_norm": 2.409503682699795, "language_loss": 0.86788988, "learning_rate": 3.321561120780869e-06, "loss": 0.89574891, "num_input_tokens_seen": 2134835, "step": 98, "time_per_iteration": 3.4901506900787354 }, { "auxiliary_loss_clip": 0.01626835, "auxiliary_loss_mlp": 0.01143323, "balance_loss_clip": 1.21967435, "balance_loss_mlp": 1.07394278, "epoch": 0.011904046173270006, "flos": 22340674517760.0, "grad_norm": 12.859579686898757, "language_loss": 1.0142858, "learning_rate": 3.3289159651708192e-06, "loss": 1.04198742, "num_input_tokens_seen": 2152410, "step": 99, "time_per_iteration": 2.60176420211792 }, { "auxiliary_loss_clip": 0.01625277, "auxiliary_loss_mlp": 0.01142614, "balance_loss_clip": 1.21657133, "balance_loss_mlp": 1.06670177, "epoch": 0.012024289063909096, "flos": 19100060375040.0, "grad_norm": 1.9609932207630814, "language_loss": 0.97639257, "learning_rate": 3.3361968902759768e-06, "loss": 1.00407147, "num_input_tokens_seen": 2172090, "step": 100, "time_per_iteration": 2.5887444019317627 }, { "auxiliary_loss_clip": 0.01619465, "auxiliary_loss_mlp": 0.01132177, "balance_loss_clip": 1.21476483, "balance_loss_mlp": 1.06503844, "epoch": 0.012144531954548187, "flos": 15012205159680.0, "grad_norm": 2.274671944173216, "language_loss": 0.93972164, "learning_rate": 3.343405367163663e-06, "loss": 0.96723807, "num_input_tokens_seen": 2189020, "step": 101, "time_per_iteration": 2.5869266986846924 }, { "auxiliary_loss_clip": 0.01623648, "auxiliary_loss_mlp": 0.01137326, "balance_loss_clip": 1.21493495, "balance_loss_mlp": 1.06751716, "epoch": 0.012264774845187279, "flos": 15122020014720.0, "grad_norm": 13.200211338057656, "language_loss": 0.81224948, "learning_rate": 3.350542823419951e-06, "loss": 0.83985919, "num_input_tokens_seen": 2205620, "step": 102, "time_per_iteration": 2.601289987564087 }, { "auxiliary_loss_clip": 0.01617352, "auxiliary_loss_mlp": 0.01152063, "balance_loss_clip": 1.20911241, "balance_loss_mlp": 1.08277845, "epoch": 0.012385017735826368, "flos": 13949248959360.0, "grad_norm": 3.5181414275455443, "language_loss": 0.87406003, "learning_rate": 3.3576106448465615e-06, "loss": 0.90175414, "num_input_tokens_seen": 2219000, "step": 103, "time_per_iteration": 2.575157880783081 }, { "auxiliary_loss_clip": 0.01606251, "auxiliary_loss_mlp": 0.01139622, "balance_loss_clip": 1.20703244, "balance_loss_mlp": 1.06952691, "epoch": 0.01250526062646546, "flos": 23623260428160.0, "grad_norm": 2.010437276769021, "language_loss": 0.88113737, "learning_rate": 3.3646101770757797e-06, "loss": 0.9085961, "num_input_tokens_seen": 2237790, "step": 104, "time_per_iteration": 2.609377861022949 }, { "auxiliary_loss_clip": 0.01599122, "auxiliary_loss_mlp": 0.0114088, "balance_loss_clip": 1.20334709, "balance_loss_mlp": 1.06735206, "epoch": 0.012625503517104552, "flos": 34640078958720.0, "grad_norm": 1.7085932268712503, "language_loss": 0.85761094, "learning_rate": 3.371542727108104e-06, "loss": 0.88501102, "num_input_tokens_seen": 2259965, "step": 105, "time_per_iteration": 2.728022813796997 }, { "auxiliary_loss_clip": 0.01602075, "auxiliary_loss_mlp": 0.01180103, "balance_loss_clip": 1.20619822, "balance_loss_mlp": 1.1089586, "epoch": 0.012745746407743641, "flos": 17821891837440.0, "grad_norm": 2.6058663153912716, "language_loss": 0.90225899, "learning_rate": 3.3784095647770114e-06, "loss": 0.93008077, "num_input_tokens_seen": 2278610, "step": 106, "time_per_iteration": 2.6113648414611816 }, { "auxiliary_loss_clip": 0.01592173, "auxiliary_loss_mlp": 0.01142748, "balance_loss_clip": 1.19625306, "balance_loss_mlp": 1.07260489, "epoch": 0.012865989298382733, "flos": 20595057361920.0, "grad_norm": 2.5375172503219687, "language_loss": 0.88876402, "learning_rate": 3.3852119241449547e-06, "loss": 0.91611332, "num_input_tokens_seen": 2297730, "step": 107, "time_per_iteration": 2.6060562133789062 }, { "auxiliary_loss_clip": 0.01587286, "auxiliary_loss_mlp": 0.01131537, "balance_loss_clip": 1.19514942, "balance_loss_mlp": 1.06435037, "epoch": 0.012986232189021825, "flos": 23948969978880.0, "grad_norm": 2.1263203971495717, "language_loss": 0.96458864, "learning_rate": 3.3919510048344295e-06, "loss": 0.99177688, "num_input_tokens_seen": 2315740, "step": 108, "time_per_iteration": 2.6117992401123047 }, { "auxiliary_loss_clip": 0.01576402, "auxiliary_loss_mlp": 0.01131786, "balance_loss_clip": 1.19047308, "balance_loss_mlp": 1.06994045, "epoch": 0.013106475079660914, "flos": 23725425686400.0, "grad_norm": 4.195907912856149, "language_loss": 0.86752558, "learning_rate": 3.3986279732976907e-06, "loss": 0.89460742, "num_input_tokens_seen": 2334215, "step": 109, "time_per_iteration": 2.633835792541504 }, { "auxiliary_loss_clip": 0.01570618, "auxiliary_loss_mlp": 0.01112179, "balance_loss_clip": 1.18739426, "balance_loss_mlp": 1.051144, "epoch": 0.013226717970300006, "flos": 21102438925440.0, "grad_norm": 2.031351529118376, "language_loss": 0.9572528, "learning_rate": 3.4052439640284983e-06, "loss": 0.98408073, "num_input_tokens_seen": 2353130, "step": 110, "time_per_iteration": 2.610891580581665 }, { "auxiliary_loss_clip": 0.01571483, "auxiliary_loss_mlp": 0.01129761, "balance_loss_clip": 1.1909548, "balance_loss_mlp": 1.06691408, "epoch": 0.013346960860939098, "flos": 24863902231680.0, "grad_norm": 1.7554086602588028, "language_loss": 0.8135798, "learning_rate": 3.4118000807190217e-06, "loss": 0.84059227, "num_input_tokens_seen": 2374010, "step": 111, "time_per_iteration": 2.696084499359131 }, { "auxiliary_loss_clip": 0.01574202, "auxiliary_loss_mlp": 0.01128873, "balance_loss_clip": 1.18895674, "balance_loss_mlp": 1.06802821, "epoch": 0.013467203751578187, "flos": 28181940140160.0, "grad_norm": 1.8017197257528965, "language_loss": 0.76225603, "learning_rate": 3.4182973973648723e-06, "loss": 0.78928673, "num_input_tokens_seen": 2395220, "step": 112, "time_per_iteration": 2.6860828399658203 }, { "auxiliary_loss_clip": 0.01561408, "auxiliary_loss_mlp": 0.01148678, "balance_loss_clip": 1.18500376, "balance_loss_mlp": 1.08821487, "epoch": 0.013587446642217279, "flos": 18916233546240.0, "grad_norm": 2.3543122290576055, "language_loss": 0.95194882, "learning_rate": 3.424736959321014e-06, "loss": 0.97904968, "num_input_tokens_seen": 2413025, "step": 113, "time_per_iteration": 2.575143575668335 }, { "auxiliary_loss_clip": 0.01564182, "auxiliary_loss_mlp": 0.01142649, "balance_loss_clip": 1.18463874, "balance_loss_mlp": 1.08058834, "epoch": 0.01370768953285637, "flos": 23988615615360.0, "grad_norm": 2.4744194211300097, "language_loss": 0.889431, "learning_rate": 3.431119784311155e-06, "loss": 0.91649926, "num_input_tokens_seen": 2432700, "step": 114, "time_per_iteration": 2.608177661895752 }, { "auxiliary_loss_clip": 0.01549295, "auxiliary_loss_mlp": 0.01130157, "balance_loss_clip": 1.17954278, "balance_loss_mlp": 1.07250714, "epoch": 0.01382793242349546, "flos": 39202565512320.0, "grad_norm": 1.638271705124018, "language_loss": 0.77672195, "learning_rate": 3.43744686339307e-06, "loss": 0.80351645, "num_input_tokens_seen": 2455020, "step": 115, "time_per_iteration": 2.7269628047943115 }, { "auxiliary_loss_clip": 0.01544967, "auxiliary_loss_mlp": 0.010953, "balance_loss_clip": 1.17391562, "balance_loss_mlp": 1.0419656, "epoch": 0.013948175314134552, "flos": 41353506714240.0, "grad_norm": 2.368189704300822, "language_loss": 0.9097631, "learning_rate": 3.44371916188212e-06, "loss": 0.93616581, "num_input_tokens_seen": 2475775, "step": 116, "time_per_iteration": 2.746272563934326 }, { "auxiliary_loss_clip": 0.01539977, "auxiliary_loss_mlp": 0.01110158, "balance_loss_clip": 1.17387915, "balance_loss_mlp": 1.05901718, "epoch": 0.014068418204773643, "flos": 22453542028800.0, "grad_norm": 2.4300197062005022, "language_loss": 0.86365074, "learning_rate": 3.449937620235143e-06, "loss": 0.8901521, "num_input_tokens_seen": 2496370, "step": 117, "time_per_iteration": 2.637294292449951 }, { "auxiliary_loss_clip": 0.01541673, "auxiliary_loss_mlp": 0.01114464, "balance_loss_clip": 1.1748333, "balance_loss_mlp": 1.060987, "epoch": 0.014188661095412733, "flos": 23805147922560.0, "grad_norm": 2.1574447369944614, "language_loss": 0.89566547, "learning_rate": 3.456103154896722e-06, "loss": 0.92222685, "num_input_tokens_seen": 2517645, "step": 118, "time_per_iteration": 2.635037660598755 }, { "auxiliary_loss_clip": 0.01528601, "auxiliary_loss_mlp": 0.0112393, "balance_loss_clip": 1.16721666, "balance_loss_mlp": 1.0726459, "epoch": 0.014308903986051825, "flos": 23660248458240.0, "grad_norm": 1.87527939650359, "language_loss": 0.92715997, "learning_rate": 3.462216659109757e-06, "loss": 0.95368528, "num_input_tokens_seen": 2537825, "step": 119, "time_per_iteration": 2.5826480388641357 }, { "auxiliary_loss_clip": 0.01548501, "auxiliary_loss_mlp": 0.011336, "balance_loss_clip": 1.17659259, "balance_loss_mlp": 1.08133817, "epoch": 0.014429146876690916, "flos": 20667991927680.0, "grad_norm": 2.574267891817223, "language_loss": 0.85374397, "learning_rate": 3.4682790036921077e-06, "loss": 0.88056493, "num_input_tokens_seen": 2556485, "step": 120, "time_per_iteration": 2.568709135055542 }, { "auxiliary_loss_clip": 0.01522954, "auxiliary_loss_mlp": 0.01109574, "balance_loss_clip": 1.1682452, "balance_loss_mlp": 1.06606269, "epoch": 0.014549389767330006, "flos": 20229199384320.0, "grad_norm": 1.861321878702916, "language_loss": 0.83202308, "learning_rate": 3.4742910377810193e-06, "loss": 0.85834837, "num_input_tokens_seen": 2573945, "step": 121, "time_per_iteration": 2.618236541748047 }, { "auxiliary_loss_clip": 0.01520957, "auxiliary_loss_mlp": 0.01119775, "balance_loss_clip": 1.16583323, "balance_loss_mlp": 1.07340288, "epoch": 0.014669632657969098, "flos": 18004174381440.0, "grad_norm": 2.4903333954254143, "language_loss": 0.88693422, "learning_rate": 3.4802535895469042e-06, "loss": 0.91334158, "num_input_tokens_seen": 2592695, "step": 122, "time_per_iteration": 2.7141149044036865 }, { "auxiliary_loss_clip": 0.01523539, "auxiliary_loss_mlp": 0.0111212, "balance_loss_clip": 1.1650703, "balance_loss_mlp": 1.06543732, "epoch": 0.01478987554860819, "flos": 22741796672640.0, "grad_norm": 2.0415616141647654, "language_loss": 0.89808935, "learning_rate": 3.4861674668779934e-06, "loss": 0.92444593, "num_input_tokens_seen": 2610925, "step": 123, "time_per_iteration": 2.693521022796631 }, { "auxiliary_loss_clip": 0.01514595, "auxiliary_loss_mlp": 0.01106452, "balance_loss_clip": 1.16026092, "balance_loss_mlp": 1.05831575, "epoch": 0.01491011843924728, "flos": 17198590106880.0, "grad_norm": 5.187727928313824, "language_loss": 0.84371674, "learning_rate": 3.492033458037272e-06, "loss": 0.86992723, "num_input_tokens_seen": 2629495, "step": 124, "time_per_iteration": 3.479440450668335 }, { "auxiliary_loss_clip": 0.01510728, "auxiliary_loss_mlp": 0.01110825, "balance_loss_clip": 1.15768313, "balance_loss_mlp": 1.06760001, "epoch": 0.01503036132988637, "flos": 17673867889920.0, "grad_norm": 2.664610213426129, "language_loss": 0.87365061, "learning_rate": 3.497852332293018e-06, "loss": 0.8998661, "num_input_tokens_seen": 2645070, "step": 125, "time_per_iteration": 3.4887735843658447 }, { "auxiliary_loss_clip": 0.01510354, "auxiliary_loss_mlp": 0.0111473, "balance_loss_clip": 1.15976214, "balance_loss_mlp": 1.07255375, "epoch": 0.015150604220525462, "flos": 18878239935360.0, "grad_norm": 2.541710048970901, "language_loss": 0.96627098, "learning_rate": 3.5036248405242356e-06, "loss": 0.99252188, "num_input_tokens_seen": 2663825, "step": 126, "time_per_iteration": 2.642979621887207 }, { "auxiliary_loss_clip": 0.0151066, "auxiliary_loss_mlp": 0.01113067, "balance_loss_clip": 1.15819657, "balance_loss_mlp": 1.0669564, "epoch": 0.015270847111164552, "flos": 39420184060800.0, "grad_norm": 1.864338377495763, "language_loss": 0.82909191, "learning_rate": 3.509351715802146e-06, "loss": 0.85532922, "num_input_tokens_seen": 2684710, "step": 127, "time_per_iteration": 2.7415287494659424 }, { "auxiliary_loss_clip": 0.01508807, "auxiliary_loss_mlp": 0.01124026, "balance_loss_clip": 1.1564486, "balance_loss_mlp": 1.07691443, "epoch": 0.015391090001803644, "flos": 43762466286720.0, "grad_norm": 6.730558788297959, "language_loss": 0.78354734, "learning_rate": 3.5150336739488763e-06, "loss": 0.80987561, "num_input_tokens_seen": 2706995, "step": 128, "time_per_iteration": 2.8090872764587402 }, { "auxiliary_loss_clip": 0.01503317, "auxiliary_loss_mlp": 0.01090642, "balance_loss_clip": 1.15659976, "balance_loss_mlp": 1.05115986, "epoch": 0.015511332892442733, "flos": 18916341287040.0, "grad_norm": 1.9427516344510842, "language_loss": 0.84346092, "learning_rate": 3.5206714140744143e-06, "loss": 0.8694005, "num_input_tokens_seen": 2727050, "step": 129, "time_per_iteration": 2.6808488368988037 }, { "auxiliary_loss_clip": 0.01505657, "auxiliary_loss_mlp": 0.0111845, "balance_loss_clip": 1.15936875, "balance_loss_mlp": 1.07577288, "epoch": 0.015631575783081827, "flos": 24535283679360.0, "grad_norm": 4.73754184153692, "language_loss": 0.87491655, "learning_rate": 3.5262656190928208e-06, "loss": 0.90115762, "num_input_tokens_seen": 2745350, "step": 130, "time_per_iteration": 2.5958337783813477 }, { "auxiliary_loss_clip": 0.01452486, "auxiliary_loss_mlp": 0.01031608, "balance_loss_clip": 1.17128634, "balance_loss_mlp": 1.01797032, "epoch": 0.015751818673720917, "flos": 62328536098560.0, "grad_norm": 1.0704955359163306, "language_loss": 0.7151376, "learning_rate": 3.5318169562186737e-06, "loss": 0.73997855, "num_input_tokens_seen": 2814195, "step": 131, "time_per_iteration": 3.183711528778076 }, { "auxiliary_loss_clip": 0.01492113, "auxiliary_loss_mlp": 0.01122596, "balance_loss_clip": 1.15217769, "balance_loss_mlp": 1.08251762, "epoch": 0.015872061564360006, "flos": 23878549365120.0, "grad_norm": 1.7558376584742281, "language_loss": 0.82305408, "learning_rate": 3.5373260774446292e-06, "loss": 0.8492012, "num_input_tokens_seen": 2834645, "step": 132, "time_per_iteration": 2.642026901245117 }, { "auxiliary_loss_clip": 0.01489732, "auxiliary_loss_mlp": 0.01116723, "balance_loss_clip": 1.15059876, "balance_loss_mlp": 1.07655001, "epoch": 0.0159923044549991, "flos": 23367899664000.0, "grad_norm": 1.9892344404559494, "language_loss": 0.90380722, "learning_rate": 3.542793620000961e-06, "loss": 0.9298718, "num_input_tokens_seen": 2854120, "step": 133, "time_per_iteration": 2.566098928451538 }, { "auxiliary_loss_clip": 0.01487185, "auxiliary_loss_mlp": 0.01106252, "balance_loss_clip": 1.14991784, "balance_loss_mlp": 1.06557775, "epoch": 0.01611254734563819, "flos": 17858305249920.0, "grad_norm": 2.376586049494334, "language_loss": 0.86960846, "learning_rate": 3.5482202067978894e-06, "loss": 0.8955428, "num_input_tokens_seen": 2871330, "step": 134, "time_per_iteration": 2.5655272006988525 }, { "auxiliary_loss_clip": 0.01485696, "auxiliary_loss_mlp": 0.01100606, "balance_loss_clip": 1.1504935, "balance_loss_mlp": 1.06079006, "epoch": 0.01623279023627728, "flos": 20954774113920.0, "grad_norm": 2.9914096949009026, "language_loss": 0.76311105, "learning_rate": 3.553606446851471e-06, "loss": 0.78897405, "num_input_tokens_seen": 2888070, "step": 135, "time_per_iteration": 2.590481996536255 }, { "auxiliary_loss_clip": 0.01472415, "auxiliary_loss_mlp": 0.0109796, "balance_loss_clip": 1.14223146, "balance_loss_mlp": 1.05871665, "epoch": 0.016353033126916373, "flos": 15742412743680.0, "grad_norm": 1.9208589061965191, "language_loss": 0.83461642, "learning_rate": 3.5589529356937613e-06, "loss": 0.86032015, "num_input_tokens_seen": 2906465, "step": 136, "time_per_iteration": 2.5364911556243896 }, { "auxiliary_loss_clip": 0.0148191, "auxiliary_loss_mlp": 0.01101312, "balance_loss_clip": 1.14590597, "balance_loss_mlp": 1.0625453, "epoch": 0.016473276017555463, "flos": 18807280617600.0, "grad_norm": 2.457584216005113, "language_loss": 0.77103424, "learning_rate": 3.5642602557679627e-06, "loss": 0.79686648, "num_input_tokens_seen": 2924915, "step": 137, "time_per_iteration": 2.5617358684539795 }, { "auxiliary_loss_clip": 0.01475814, "auxiliary_loss_mlp": 0.01091411, "balance_loss_clip": 1.15158582, "balance_loss_mlp": 1.05798447, "epoch": 0.016593518908194552, "flos": 24352641999360.0, "grad_norm": 3.3367477928744576, "language_loss": 0.84288585, "learning_rate": 3.569528976809202e-06, "loss": 0.86855817, "num_input_tokens_seen": 2942130, "step": 138, "time_per_iteration": 2.5901236534118652 }, { "auxiliary_loss_clip": 0.0147646, "auxiliary_loss_mlp": 0.01108912, "balance_loss_clip": 1.14605927, "balance_loss_mlp": 1.06919122, "epoch": 0.016713761798833646, "flos": 22346133384960.0, "grad_norm": 1.7246101274504957, "language_loss": 0.89972842, "learning_rate": 3.5747596562115522e-06, "loss": 0.92558217, "num_input_tokens_seen": 2962745, "step": 139, "time_per_iteration": 2.6615800857543945 }, { "auxiliary_loss_clip": 0.01480523, "auxiliary_loss_mlp": 0.01106728, "balance_loss_clip": 1.14758432, "balance_loss_mlp": 1.06929624, "epoch": 0.016834004689472735, "flos": 17821820010240.0, "grad_norm": 2.316607481235765, "language_loss": 0.90950894, "learning_rate": 3.5799528393819138e-06, "loss": 0.93538153, "num_input_tokens_seen": 2981825, "step": 140, "time_per_iteration": 2.668705701828003 }, { "auxiliary_loss_clip": 0.01462849, "auxiliary_loss_mlp": 0.01100261, "balance_loss_clip": 1.13782465, "balance_loss_mlp": 1.06509459, "epoch": 0.016954247580111825, "flos": 20519501103360.0, "grad_norm": 2.218289125596974, "language_loss": 0.8820675, "learning_rate": 3.585109060081286e-06, "loss": 0.90769857, "num_input_tokens_seen": 3001625, "step": 141, "time_per_iteration": 2.556135416030884 }, { "auxiliary_loss_clip": 0.01469179, "auxiliary_loss_mlp": 0.011012, "balance_loss_clip": 1.14193761, "balance_loss_mlp": 1.06574762, "epoch": 0.017074490470750915, "flos": 22088869200000.0, "grad_norm": 1.7956846231971892, "language_loss": 0.78660738, "learning_rate": 3.590228840753992e-06, "loss": 0.81231105, "num_input_tokens_seen": 3022055, "step": 142, "time_per_iteration": 2.580550193786621 }, { "auxiliary_loss_clip": 0.01459938, "auxiliary_loss_mlp": 0.01103534, "balance_loss_clip": 1.13899851, "balance_loss_mlp": 1.07010746, "epoch": 0.01719473336139001, "flos": 15997270717440.0, "grad_norm": 2.028740299574435, "language_loss": 0.87280518, "learning_rate": 3.5953126928453423e-06, "loss": 0.89843988, "num_input_tokens_seen": 3039605, "step": 143, "time_per_iteration": 2.516853094100952 }, { "auxiliary_loss_clip": 0.01457036, "auxiliary_loss_mlp": 0.01084669, "balance_loss_clip": 1.13552809, "balance_loss_mlp": 1.05260146, "epoch": 0.017314976252029098, "flos": 22492038430080.0, "grad_norm": 1.9655212222122012, "language_loss": 0.80619258, "learning_rate": 3.600361117108239e-06, "loss": 0.83160961, "num_input_tokens_seen": 3059405, "step": 144, "time_per_iteration": 2.574777364730835 }, { "auxiliary_loss_clip": 0.01461691, "auxiliary_loss_mlp": 0.01090405, "balance_loss_clip": 1.13679838, "balance_loss_mlp": 1.05633473, "epoch": 0.017435219142668188, "flos": 22018053536640.0, "grad_norm": 2.0625245833236034, "language_loss": 0.97196126, "learning_rate": 3.6053746038991616e-06, "loss": 0.99748224, "num_input_tokens_seen": 3078490, "step": 145, "time_per_iteration": 2.5558927059173584 }, { "auxiliary_loss_clip": 0.01407729, "auxiliary_loss_mlp": 0.01010883, "balance_loss_clip": 1.15419316, "balance_loss_mlp": 0.99982017, "epoch": 0.01755546203330728, "flos": 72240526149120.0, "grad_norm": 1.0618487226235793, "language_loss": 0.58423042, "learning_rate": 3.6103536334639843e-06, "loss": 0.60841656, "num_input_tokens_seen": 3131755, "step": 146, "time_per_iteration": 3.109741687774658 }, { "auxiliary_loss_clip": 0.01451087, "auxiliary_loss_mlp": 0.01087139, "balance_loss_clip": 1.13362956, "balance_loss_mlp": 1.05540562, "epoch": 0.01767570492394637, "flos": 25337061112320.0, "grad_norm": 1.9974747287052388, "language_loss": 0.85786569, "learning_rate": 3.615298676214041e-06, "loss": 0.88324791, "num_input_tokens_seen": 3152035, "step": 147, "time_per_iteration": 2.617709159851074 }, { "auxiliary_loss_clip": 0.01449112, "auxiliary_loss_mlp": 0.010997, "balance_loss_clip": 1.13172388, "balance_loss_mlp": 1.06900418, "epoch": 0.01779594781458546, "flos": 20449188230400.0, "grad_norm": 2.1250273792771837, "language_loss": 0.88978142, "learning_rate": 3.6202101929928317e-06, "loss": 0.91526949, "num_input_tokens_seen": 3170625, "step": 148, "time_per_iteration": 2.5402638912200928 }, { "auxiliary_loss_clip": 0.01443985, "auxiliary_loss_mlp": 0.01093728, "balance_loss_clip": 1.12985492, "balance_loss_mlp": 1.06410456, "epoch": 0.017916190705224554, "flos": 16253601148800.0, "grad_norm": 2.0052308562285535, "language_loss": 0.88471079, "learning_rate": 3.6250886353337413e-06, "loss": 0.91008788, "num_input_tokens_seen": 3188155, "step": 149, "time_per_iteration": 2.5184099674224854 }, { "auxiliary_loss_clip": 0.01456577, "auxiliary_loss_mlp": 0.01095739, "balance_loss_clip": 1.13688076, "balance_loss_mlp": 1.06607938, "epoch": 0.018036433595863644, "flos": 23330588411520.0, "grad_norm": 2.5097661483820266, "language_loss": 0.86353678, "learning_rate": 3.6299344457091488e-06, "loss": 0.8890599, "num_input_tokens_seen": 3209015, "step": 150, "time_per_iteration": 2.6522512435913086 }, { "auxiliary_loss_clip": 0.01448062, "auxiliary_loss_mlp": 0.01085606, "balance_loss_clip": 1.1330452, "balance_loss_mlp": 1.05735326, "epoch": 0.018156676486502734, "flos": 18588010043520.0, "grad_norm": 2.1232483638502346, "language_loss": 0.93856239, "learning_rate": 3.634748057771256e-06, "loss": 0.96389908, "num_input_tokens_seen": 3224955, "step": 151, "time_per_iteration": 4.405697584152222 }, { "auxiliary_loss_clip": 0.01441102, "auxiliary_loss_mlp": 0.01090787, "balance_loss_clip": 1.13099337, "balance_loss_mlp": 1.06257057, "epoch": 0.018276919377141827, "flos": 25448707560960.0, "grad_norm": 1.8128308807158346, "language_loss": 0.85941714, "learning_rate": 3.639529896584965e-06, "loss": 0.88473606, "num_input_tokens_seen": 3246330, "step": 152, "time_per_iteration": 3.4095232486724854 }, { "auxiliary_loss_clip": 0.01442831, "auxiliary_loss_mlp": 0.01080838, "balance_loss_clip": 1.13019538, "balance_loss_mlp": 1.05102372, "epoch": 0.018397162267780917, "flos": 20047311889920.0, "grad_norm": 2.9186730443635307, "language_loss": 0.89245641, "learning_rate": 3.6442803788531233e-06, "loss": 0.91769302, "num_input_tokens_seen": 3264290, "step": 153, "time_per_iteration": 3.3136463165283203 }, { "auxiliary_loss_clip": 0.0144434, "auxiliary_loss_mlp": 0.01092801, "balance_loss_clip": 1.12938178, "balance_loss_mlp": 1.06197381, "epoch": 0.018517405158420007, "flos": 27565282425600.0, "grad_norm": 2.2904171111991043, "language_loss": 0.96016854, "learning_rate": 3.6489999131344357e-06, "loss": 0.98553991, "num_input_tokens_seen": 3287065, "step": 154, "time_per_iteration": 2.584468126296997 }, { "auxiliary_loss_clip": 0.01432768, "auxiliary_loss_mlp": 0.0109073, "balance_loss_clip": 1.12618279, "balance_loss_mlp": 1.06401515, "epoch": 0.0186376480490591, "flos": 19354056422400.0, "grad_norm": 1.7776320407094601, "language_loss": 0.9060998, "learning_rate": 3.653688900054313e-06, "loss": 0.93133479, "num_input_tokens_seen": 3305595, "step": 155, "time_per_iteration": 2.5302257537841797 }, { "auxiliary_loss_clip": 0.01435272, "auxiliary_loss_mlp": 0.01069098, "balance_loss_clip": 1.12378955, "balance_loss_mlp": 1.04096508, "epoch": 0.01875789093969819, "flos": 26687840993280.0, "grad_norm": 2.417757028032494, "language_loss": 0.76047444, "learning_rate": 3.6583477325089526e-06, "loss": 0.78551811, "num_input_tokens_seen": 3326135, "step": 156, "time_per_iteration": 2.585200548171997 }, { "auxiliary_loss_clip": 0.01429677, "auxiliary_loss_mlp": 0.01079277, "balance_loss_clip": 1.12295747, "balance_loss_mlp": 1.0518589, "epoch": 0.01887813383033728, "flos": 24353001135360.0, "grad_norm": 2.207466324006412, "language_loss": 1.04357588, "learning_rate": 3.6629767958628916e-06, "loss": 1.0686655, "num_input_tokens_seen": 3343510, "step": 157, "time_per_iteration": 2.5631606578826904 }, { "auxiliary_loss_clip": 0.01426265, "auxiliary_loss_mlp": 0.01080537, "balance_loss_clip": 1.12468541, "balance_loss_mlp": 1.05251074, "epoch": 0.018998376720976373, "flos": 14647532330880.0, "grad_norm": 2.4247202971029784, "language_loss": 0.85458946, "learning_rate": 3.667576468140291e-06, "loss": 0.87965751, "num_input_tokens_seen": 3361325, "step": 158, "time_per_iteration": 2.5459694862365723 }, { "auxiliary_loss_clip": 0.01419749, "auxiliary_loss_mlp": 0.01064302, "balance_loss_clip": 1.11828184, "balance_loss_mlp": 1.03892207, "epoch": 0.019118619611615463, "flos": 29305261146240.0, "grad_norm": 2.211349109085119, "language_loss": 0.88938743, "learning_rate": 3.672147120210184e-06, "loss": 0.91422796, "num_input_tokens_seen": 3377925, "step": 159, "time_per_iteration": 2.599454164505005 }, { "auxiliary_loss_clip": 0.01426579, "auxiliary_loss_mlp": 0.01076261, "balance_loss_clip": 1.12569165, "balance_loss_mlp": 1.0512867, "epoch": 0.019238862502254553, "flos": 20886723797760.0, "grad_norm": 1.9756263559848335, "language_loss": 0.86620474, "learning_rate": 3.6766891159659177e-06, "loss": 0.89123309, "num_input_tokens_seen": 3396335, "step": 160, "time_per_iteration": 2.537170886993408 }, { "auxiliary_loss_clip": 0.01426305, "auxiliary_loss_mlp": 0.01078336, "balance_loss_clip": 1.12688708, "balance_loss_mlp": 1.05296862, "epoch": 0.019359105392893646, "flos": 21360672777600.0, "grad_norm": 3.2657088674070787, "language_loss": 0.8801288, "learning_rate": 3.6812028124990075e-06, "loss": 0.90517521, "num_input_tokens_seen": 3413605, "step": 161, "time_per_iteration": 2.6275370121002197 }, { "auxiliary_loss_clip": 0.01421357, "auxiliary_loss_mlp": 0.01082785, "balance_loss_clip": 1.1231519, "balance_loss_mlp": 1.05837083, "epoch": 0.019479348283532736, "flos": 16283729681280.0, "grad_norm": 4.047007746984585, "language_loss": 0.81460655, "learning_rate": 3.6856885602676016e-06, "loss": 0.83964801, "num_input_tokens_seen": 3429640, "step": 162, "time_per_iteration": 2.507634162902832 }, { "auxiliary_loss_clip": 0.01420076, "auxiliary_loss_mlp": 0.01084318, "balance_loss_clip": 1.12288117, "balance_loss_mlp": 1.06010652, "epoch": 0.019599591174171826, "flos": 22091239497600.0, "grad_norm": 2.044509185201751, "language_loss": 0.94333065, "learning_rate": 3.6901467032597733e-06, "loss": 0.96837461, "num_input_tokens_seen": 3448125, "step": 163, "time_per_iteration": 2.6433823108673096 }, { "auxiliary_loss_clip": 0.01422195, "auxiliary_loss_mlp": 0.01070052, "balance_loss_clip": 1.12144852, "balance_loss_mlp": 1.04351616, "epoch": 0.01971983406481092, "flos": 19609668581760.0, "grad_norm": 2.2499308276572494, "language_loss": 0.87472332, "learning_rate": 3.694577579151804e-06, "loss": 0.89964581, "num_input_tokens_seen": 3466535, "step": 164, "time_per_iteration": 2.6180763244628906 }, { "auxiliary_loss_clip": 0.01421243, "auxiliary_loss_mlp": 0.01079805, "balance_loss_clip": 1.12326443, "balance_loss_mlp": 1.05434155, "epoch": 0.01984007695545001, "flos": 19099342103040.0, "grad_norm": 2.2160376331278835, "language_loss": 0.73723412, "learning_rate": 3.6989815194616703e-06, "loss": 0.76224458, "num_input_tokens_seen": 3483730, "step": 165, "time_per_iteration": 2.5633740425109863 }, { "auxiliary_loss_clip": 0.01420118, "auxiliary_loss_mlp": 0.01080578, "balance_loss_clip": 1.11949909, "balance_loss_mlp": 1.05358934, "epoch": 0.0199603198460891, "flos": 20848406964480.0, "grad_norm": 2.8469479377242544, "language_loss": 0.79968739, "learning_rate": 3.703358849697888e-06, "loss": 0.8246944, "num_input_tokens_seen": 3503640, "step": 166, "time_per_iteration": 2.601308584213257 }, { "auxiliary_loss_clip": 0.01417344, "auxiliary_loss_mlp": 0.01089618, "balance_loss_clip": 1.12307215, "balance_loss_mlp": 1.06593156, "epoch": 0.020080562736728192, "flos": 21870747861120.0, "grad_norm": 1.7661579960379223, "language_loss": 0.82708067, "learning_rate": 3.7077098895038803e-06, "loss": 0.85215032, "num_input_tokens_seen": 3523010, "step": 167, "time_per_iteration": 2.554431200027466 }, { "auxiliary_loss_clip": 0.01416286, "auxiliary_loss_mlp": 0.01076642, "balance_loss_clip": 1.12099493, "balance_loss_mlp": 1.05264544, "epoch": 0.020200805627367282, "flos": 21688788539520.0, "grad_norm": 2.228780715329253, "language_loss": 0.97167206, "learning_rate": 3.712034952798045e-06, "loss": 0.99660122, "num_input_tokens_seen": 3541125, "step": 168, "time_per_iteration": 2.521315097808838 }, { "auxiliary_loss_clip": 0.0141256, "auxiliary_loss_mlp": 0.01084325, "balance_loss_clip": 1.1159389, "balance_loss_mlp": 1.05980313, "epoch": 0.02032104851800637, "flos": 33543043729920.0, "grad_norm": 5.603565505184597, "language_loss": 0.84590304, "learning_rate": 3.7163343479096656e-06, "loss": 0.87087184, "num_input_tokens_seen": 3562700, "step": 169, "time_per_iteration": 2.6373016834259033 }, { "auxiliary_loss_clip": 0.01410202, "auxiliary_loss_mlp": 0.01076741, "balance_loss_clip": 1.11913192, "balance_loss_mlp": 1.05486584, "epoch": 0.020441291408645465, "flos": 31686965274240.0, "grad_norm": 2.948590484015234, "language_loss": 0.83056402, "learning_rate": 3.720608377710802e-06, "loss": 0.8554334, "num_input_tokens_seen": 3582790, "step": 170, "time_per_iteration": 2.628444194793701 }, { "auxiliary_loss_clip": 0.01403473, "auxiliary_loss_mlp": 0.01087102, "balance_loss_clip": 1.1130209, "balance_loss_mlp": 1.0624969, "epoch": 0.020561534299284555, "flos": 20886687884160.0, "grad_norm": 2.0495006095686255, "language_loss": 0.86342776, "learning_rate": 3.7248573397443277e-06, "loss": 0.88833356, "num_input_tokens_seen": 3601715, "step": 171, "time_per_iteration": 2.5745770931243896 }, { "auxiliary_loss_clip": 0.01409233, "auxiliary_loss_mlp": 0.0109003, "balance_loss_clip": 1.11958981, "balance_loss_mlp": 1.06468582, "epoch": 0.020681777189923645, "flos": 20996610480000.0, "grad_norm": 2.0797316398658534, "language_loss": 0.97813094, "learning_rate": 3.729081526348224e-06, "loss": 1.00312352, "num_input_tokens_seen": 3620245, "step": 172, "time_per_iteration": 2.6195945739746094 }, { "auxiliary_loss_clip": 0.01410013, "auxiliary_loss_mlp": 0.01068789, "balance_loss_clip": 1.11768854, "balance_loss_mlp": 1.04669952, "epoch": 0.020802020080562738, "flos": 28257532312320.0, "grad_norm": 1.936306971283965, "language_loss": 0.84899962, "learning_rate": 3.7332812247762777e-06, "loss": 0.87378764, "num_input_tokens_seen": 3641545, "step": 173, "time_per_iteration": 2.6253838539123535 }, { "auxiliary_loss_clip": 0.01410098, "auxiliary_loss_mlp": 0.01066071, "balance_loss_clip": 1.12051344, "balance_loss_mlp": 1.04311168, "epoch": 0.020922262971201828, "flos": 19681274344320.0, "grad_norm": 2.4853459240993656, "language_loss": 0.95718849, "learning_rate": 3.737456717315293e-06, "loss": 0.98195016, "num_input_tokens_seen": 3660510, "step": 174, "time_per_iteration": 2.6722049713134766 }, { "auxiliary_loss_clip": 0.01398191, "auxiliary_loss_mlp": 0.01087759, "balance_loss_clip": 1.11585402, "balance_loss_mlp": 1.06469226, "epoch": 0.021042505861840918, "flos": 15666353694720.0, "grad_norm": 1.7243215051599294, "language_loss": 0.9063713, "learning_rate": 3.7416082813989552e-06, "loss": 0.93123078, "num_input_tokens_seen": 3677505, "step": 175, "time_per_iteration": 2.5443568229675293 }, { "auxiliary_loss_clip": 0.01406896, "auxiliary_loss_mlp": 0.01079216, "balance_loss_clip": 1.11785543, "balance_loss_mlp": 1.05564809, "epoch": 0.02116274875248001, "flos": 21142012734720.0, "grad_norm": 1.9655892053854447, "language_loss": 0.89469743, "learning_rate": 3.745736189718439e-06, "loss": 0.91955858, "num_input_tokens_seen": 3696760, "step": 176, "time_per_iteration": 2.535235643386841 }, { "auxiliary_loss_clip": 0.01396913, "auxiliary_loss_mlp": 0.01065821, "balance_loss_clip": 1.11271834, "balance_loss_mlp": 1.04332662, "epoch": 0.0212829916431191, "flos": 24715770543360.0, "grad_norm": 2.7040320607491797, "language_loss": 0.72632468, "learning_rate": 3.749840710329894e-06, "loss": 0.75095206, "num_input_tokens_seen": 3717465, "step": 177, "time_per_iteration": 2.575251817703247 }, { "auxiliary_loss_clip": 0.01408161, "auxiliary_loss_mlp": 0.01085532, "balance_loss_clip": 1.11644828, "balance_loss_mlp": 1.06078386, "epoch": 0.02140323453375819, "flos": 16645493508480.0, "grad_norm": 3.6053867680093465, "language_loss": 0.98011255, "learning_rate": 3.7539221067588938e-06, "loss": 1.00504947, "num_input_tokens_seen": 3731440, "step": 178, "time_per_iteration": 4.1849565505981445 }, { "auxiliary_loss_clip": 0.01402694, "auxiliary_loss_mlp": 0.01084266, "balance_loss_clip": 1.11476171, "balance_loss_mlp": 1.0603168, "epoch": 0.021523477424397284, "flos": 20299332689280.0, "grad_norm": 4.752666604168373, "language_loss": 0.93415415, "learning_rate": 3.757980638101964e-06, "loss": 0.95902377, "num_input_tokens_seen": 3744935, "step": 179, "time_per_iteration": 3.2877705097198486 }, { "auxiliary_loss_clip": 0.01405218, "auxiliary_loss_mlp": 0.01076861, "balance_loss_clip": 1.11657786, "balance_loss_mlp": 1.05145788, "epoch": 0.021643720315036374, "flos": 26104005331200.0, "grad_norm": 2.4511298870031917, "language_loss": 0.8913734, "learning_rate": 3.7620165591252806e-06, "loss": 0.9161942, "num_input_tokens_seen": 3763035, "step": 180, "time_per_iteration": 3.358435869216919 }, { "auxiliary_loss_clip": 0.01394805, "auxiliary_loss_mlp": 0.01072653, "balance_loss_clip": 1.11491966, "balance_loss_mlp": 1.05084956, "epoch": 0.021763963205675464, "flos": 24787663614720.0, "grad_norm": 1.7665753306860168, "language_loss": 0.9454397, "learning_rate": 3.766030120360636e-06, "loss": 0.97011423, "num_input_tokens_seen": 3782665, "step": 181, "time_per_iteration": 2.548003911972046 }, { "auxiliary_loss_clip": 0.01401304, "auxiliary_loss_mlp": 0.01074823, "balance_loss_clip": 1.11475885, "balance_loss_mlp": 1.05270934, "epoch": 0.021884206096314557, "flos": 25813559957760.0, "grad_norm": 2.1208633989575887, "language_loss": 0.90331209, "learning_rate": 3.7700215681987578e-06, "loss": 0.92807329, "num_input_tokens_seen": 3802435, "step": 182, "time_per_iteration": 2.590991258621216 }, { "auxiliary_loss_clip": 0.01393931, "auxiliary_loss_mlp": 0.01085684, "balance_loss_clip": 1.11251068, "balance_loss_mlp": 1.06237805, "epoch": 0.022004448986953647, "flos": 20082719721600.0, "grad_norm": 1.9346909068289775, "language_loss": 0.82292002, "learning_rate": 3.7739911449800767e-06, "loss": 0.84771615, "num_input_tokens_seen": 3822490, "step": 183, "time_per_iteration": 2.6303699016571045 }, { "auxiliary_loss_clip": 0.01394469, "auxiliary_loss_mlp": 0.01085175, "balance_loss_clip": 1.11118197, "balance_loss_mlp": 1.06401491, "epoch": 0.022124691877592736, "flos": 20480609652480.0, "grad_norm": 2.4506076648523143, "language_loss": 0.80815208, "learning_rate": 3.7779390890830114e-06, "loss": 0.83294851, "num_input_tokens_seen": 3841140, "step": 184, "time_per_iteration": 2.582005500793457 }, { "auxiliary_loss_clip": 0.01394968, "auxiliary_loss_mlp": 0.01084229, "balance_loss_clip": 1.11145091, "balance_loss_mlp": 1.06118631, "epoch": 0.02224493476823183, "flos": 23586847015680.0, "grad_norm": 2.5213755960115054, "language_loss": 0.85919791, "learning_rate": 3.7818656350098723e-06, "loss": 0.88398993, "num_input_tokens_seen": 3862090, "step": 185, "time_per_iteration": 2.622337818145752 }, { "auxiliary_loss_clip": 0.0138934, "auxiliary_loss_mlp": 0.01075058, "balance_loss_clip": 1.10778046, "balance_loss_mlp": 1.05164576, "epoch": 0.02236517765887092, "flos": 16909940413440.0, "grad_norm": 2.5423084023343017, "language_loss": 0.7713865, "learning_rate": 3.7857710134704447e-06, "loss": 0.79603046, "num_input_tokens_seen": 3881025, "step": 186, "time_per_iteration": 2.530702829360962 }, { "auxiliary_loss_clip": 0.01388985, "auxiliary_loss_mlp": 0.01057624, "balance_loss_clip": 1.1122731, "balance_loss_mlp": 1.0367744, "epoch": 0.02248542054951001, "flos": 43508182930560.0, "grad_norm": 3.055214000823869, "language_loss": 0.79375297, "learning_rate": 3.7896554514633234e-06, "loss": 0.81821907, "num_input_tokens_seen": 3905310, "step": 187, "time_per_iteration": 2.703484296798706 }, { "auxiliary_loss_clip": 0.01387329, "auxiliary_loss_mlp": 0.01068577, "balance_loss_clip": 1.11025524, "balance_loss_mlp": 1.04735792, "epoch": 0.022605663440149103, "flos": 23367648268800.0, "grad_norm": 3.9384464579489027, "language_loss": 0.84562314, "learning_rate": 3.7935191723550955e-06, "loss": 0.87018216, "num_input_tokens_seen": 3924265, "step": 188, "time_per_iteration": 2.602079391479492 }, { "auxiliary_loss_clip": 0.01386209, "auxiliary_loss_mlp": 0.01071827, "balance_loss_clip": 1.10891056, "balance_loss_mlp": 1.05145431, "epoch": 0.022725906330788193, "flos": 29019915504000.0, "grad_norm": 2.1404268308171406, "language_loss": 0.88815933, "learning_rate": 3.797362395957408e-06, "loss": 0.91273969, "num_input_tokens_seen": 3944830, "step": 189, "time_per_iteration": 2.6051111221313477 }, { "auxiliary_loss_clip": 0.01396753, "auxiliary_loss_mlp": 0.01070363, "balance_loss_clip": 1.11555684, "balance_loss_mlp": 1.04886961, "epoch": 0.022846149221427282, "flos": 24496176746880.0, "grad_norm": 2.0484068975039684, "language_loss": 0.7848438, "learning_rate": 3.8011853386020055e-06, "loss": 0.809515, "num_input_tokens_seen": 3965735, "step": 190, "time_per_iteration": 2.59596586227417 }, { "auxiliary_loss_clip": 0.01392041, "auxiliary_loss_mlp": 0.01084012, "balance_loss_clip": 1.11264849, "balance_loss_mlp": 1.06164813, "epoch": 0.022966392112066376, "flos": 15523537219200.0, "grad_norm": 3.5710873371832825, "language_loss": 0.90078062, "learning_rate": 3.804988213213804e-06, "loss": 0.92554116, "num_input_tokens_seen": 3983975, "step": 191, "time_per_iteration": 2.5427801609039307 }, { "auxiliary_loss_clip": 0.01378642, "auxiliary_loss_mlp": 0.01020948, "balance_loss_clip": 1.15688062, "balance_loss_mlp": 1.00916982, "epoch": 0.023086635002705466, "flos": 55650408433920.0, "grad_norm": 1.0193135268128133, "language_loss": 0.6316992, "learning_rate": 3.808771229382049e-06, "loss": 0.65569508, "num_input_tokens_seen": 4043440, "step": 192, "time_per_iteration": 3.0310378074645996 }, { "auxiliary_loss_clip": 0.01383525, "auxiliary_loss_mlp": 0.01079605, "balance_loss_clip": 1.11008716, "balance_loss_mlp": 1.0592916, "epoch": 0.023206877893344555, "flos": 19313441118720.0, "grad_norm": 2.3585366127604876, "language_loss": 0.84396625, "learning_rate": 3.8125345934296324e-06, "loss": 0.86859757, "num_input_tokens_seen": 4061750, "step": 193, "time_per_iteration": 2.5186355113983154 }, { "auxiliary_loss_clip": 0.01384921, "auxiliary_loss_mlp": 0.01075543, "balance_loss_clip": 1.10954964, "balance_loss_mlp": 1.05296457, "epoch": 0.02332712078398365, "flos": 23072965090560.0, "grad_norm": 2.753611041070088, "language_loss": 0.88016462, "learning_rate": 3.81627850848061e-06, "loss": 0.9047693, "num_input_tokens_seen": 4082345, "step": 194, "time_per_iteration": 2.5787055492401123 }, { "auxiliary_loss_clip": 0.01379018, "auxiliary_loss_mlp": 0.01066263, "balance_loss_clip": 1.10537577, "balance_loss_mlp": 1.04605734, "epoch": 0.02344736367462274, "flos": 24425971614720.0, "grad_norm": 2.2039583775921376, "language_loss": 0.86284089, "learning_rate": 3.820003174525994e-06, "loss": 0.88729376, "num_input_tokens_seen": 4101770, "step": 195, "time_per_iteration": 2.538210868835449 }, { "auxiliary_loss_clip": 0.01383532, "auxiliary_loss_mlp": 0.01074209, "balance_loss_clip": 1.11042321, "balance_loss_mlp": 1.05376482, "epoch": 0.02356760656526183, "flos": 21579799697280.0, "grad_norm": 2.3914382843859765, "language_loss": 0.82955569, "learning_rate": 3.823708788487851e-06, "loss": 0.85413301, "num_input_tokens_seen": 4118770, "step": 196, "time_per_iteration": 2.5229756832122803 }, { "auxiliary_loss_clip": 0.01379316, "auxiliary_loss_mlp": 0.01082421, "balance_loss_clip": 1.10739207, "balance_loss_mlp": 1.06291866, "epoch": 0.02368784945590092, "flos": 25193598192000.0, "grad_norm": 2.6305134989326135, "language_loss": 0.84352314, "learning_rate": 3.827395544281781e-06, "loss": 0.86814046, "num_input_tokens_seen": 4141110, "step": 197, "time_per_iteration": 2.6391611099243164 }, { "auxiliary_loss_clip": 0.01386246, "auxiliary_loss_mlp": 0.01080686, "balance_loss_clip": 1.1109786, "balance_loss_mlp": 1.06018174, "epoch": 0.02380809234654001, "flos": 27562481164800.0, "grad_norm": 1.999182722210569, "language_loss": 0.78948164, "learning_rate": 3.831063632877802e-06, "loss": 0.81415093, "num_input_tokens_seen": 4161430, "step": 198, "time_per_iteration": 2.576688289642334 }, { "auxiliary_loss_clip": 0.01381736, "auxiliary_loss_mlp": 0.01072961, "balance_loss_clip": 1.11401629, "balance_loss_mlp": 1.05416203, "epoch": 0.0239283352371791, "flos": 18259786540800.0, "grad_norm": 2.379044868441493, "language_loss": 0.76057124, "learning_rate": 3.834713242359712e-06, "loss": 0.78511822, "num_input_tokens_seen": 4179260, "step": 199, "time_per_iteration": 2.490786552429199 }, { "auxiliary_loss_clip": 0.01384182, "auxiliary_loss_mlp": 0.01073777, "balance_loss_clip": 1.10767508, "balance_loss_mlp": 1.05190182, "epoch": 0.02404857812781819, "flos": 21395110942080.0, "grad_norm": 3.457610276393604, "language_loss": 0.87166214, "learning_rate": 3.838344557982959e-06, "loss": 0.89624172, "num_input_tokens_seen": 4200640, "step": 200, "time_per_iteration": 2.572953224182129 }, { "auxiliary_loss_clip": 0.01377653, "auxiliary_loss_mlp": 0.01075539, "balance_loss_clip": 1.10674405, "balance_loss_mlp": 1.05415249, "epoch": 0.024168821018457284, "flos": 16654256426880.0, "grad_norm": 3.3374394347500886, "language_loss": 0.84899002, "learning_rate": 3.841957762231063e-06, "loss": 0.87352198, "num_input_tokens_seen": 4218170, "step": 201, "time_per_iteration": 2.5260581970214844 }, { "auxiliary_loss_clip": 0.01374378, "auxiliary_loss_mlp": 0.01063737, "balance_loss_clip": 1.1042887, "balance_loss_mlp": 1.0435431, "epoch": 0.024289063909096374, "flos": 22820872464000.0, "grad_norm": 2.4385184604658967, "language_loss": 0.87626147, "learning_rate": 3.8455530348706454e-06, "loss": 0.90064269, "num_input_tokens_seen": 4237770, "step": 202, "time_per_iteration": 2.571443557739258 }, { "auxiliary_loss_clip": 0.01375074, "auxiliary_loss_mlp": 0.01073974, "balance_loss_clip": 1.10634232, "balance_loss_mlp": 1.05511534, "epoch": 0.024409306799735464, "flos": 17748598135680.0, "grad_norm": 2.153162568101511, "language_loss": 0.77302998, "learning_rate": 3.849130553005099e-06, "loss": 0.79752046, "num_input_tokens_seen": 4255985, "step": 203, "time_per_iteration": 2.48757266998291 }, { "auxiliary_loss_clip": 0.01375607, "auxiliary_loss_mlp": 0.01067165, "balance_loss_clip": 1.10425639, "balance_loss_mlp": 1.04836547, "epoch": 0.024529549690374557, "flos": 21616213109760.0, "grad_norm": 3.480862266978662, "language_loss": 0.8341502, "learning_rate": 3.852690491126933e-06, "loss": 0.85857791, "num_input_tokens_seen": 4276035, "step": 204, "time_per_iteration": 3.2992236614227295 }, { "auxiliary_loss_clip": 0.01370282, "auxiliary_loss_mlp": 0.01062416, "balance_loss_clip": 1.10067284, "balance_loss_mlp": 1.04217386, "epoch": 0.024649792581013647, "flos": 25551662918400.0, "grad_norm": 3.0605471564654763, "language_loss": 0.91415983, "learning_rate": 3.856233021168845e-06, "loss": 0.93848681, "num_input_tokens_seen": 4295730, "step": 205, "time_per_iteration": 3.370774984359741 }, { "auxiliary_loss_clip": 0.0136434, "auxiliary_loss_mlp": 0.01054383, "balance_loss_clip": 1.10094357, "balance_loss_mlp": 1.03669214, "epoch": 0.024770035471652737, "flos": 34495574544000.0, "grad_norm": 2.242911687815196, "language_loss": 0.91352785, "learning_rate": 3.859758312553544e-06, "loss": 0.93771505, "num_input_tokens_seen": 4317950, "step": 206, "time_per_iteration": 4.205277681350708 }, { "auxiliary_loss_clip": 0.01373758, "auxiliary_loss_mlp": 0.01069276, "balance_loss_clip": 1.1070987, "balance_loss_mlp": 1.05090547, "epoch": 0.02489027836229183, "flos": 21505428587520.0, "grad_norm": 2.119683256324153, "language_loss": 0.91625947, "learning_rate": 3.8632665322423735e-06, "loss": 0.9406898, "num_input_tokens_seen": 4337605, "step": 207, "time_per_iteration": 2.5087318420410156 }, { "auxiliary_loss_clip": 0.01371615, "auxiliary_loss_mlp": 0.01065658, "balance_loss_clip": 1.10397506, "balance_loss_mlp": 1.0465014, "epoch": 0.02501052125293092, "flos": 23219013790080.0, "grad_norm": 1.841662127677893, "language_loss": 0.86065757, "learning_rate": 3.866757844782762e-06, "loss": 0.88503033, "num_input_tokens_seen": 4358110, "step": 208, "time_per_iteration": 2.6086056232452393 }, { "auxiliary_loss_clip": 0.01371228, "auxiliary_loss_mlp": 0.01063733, "balance_loss_clip": 1.10509634, "balance_loss_mlp": 1.04489827, "epoch": 0.02513076414357001, "flos": 26388920010240.0, "grad_norm": 2.8526319740064374, "language_loss": 0.91058338, "learning_rate": 3.870232412354527e-06, "loss": 0.93493301, "num_input_tokens_seen": 4374955, "step": 209, "time_per_iteration": 2.523590326309204 }, { "auxiliary_loss_clip": 0.01367578, "auxiliary_loss_mlp": 0.01064445, "balance_loss_clip": 1.10237503, "balance_loss_mlp": 1.04549098, "epoch": 0.025251007034209103, "flos": 13590430047360.0, "grad_norm": 2.0385818608497526, "language_loss": 0.92587775, "learning_rate": 3.873690394815086e-06, "loss": 0.95019805, "num_input_tokens_seen": 4391535, "step": 210, "time_per_iteration": 2.495131015777588 }, { "auxiliary_loss_clip": 0.01365097, "auxiliary_loss_mlp": 0.01058858, "balance_loss_clip": 1.09899962, "balance_loss_mlp": 1.04014182, "epoch": 0.025371249924848193, "flos": 15049229103360.0, "grad_norm": 2.5820015441999047, "language_loss": 0.91305387, "learning_rate": 3.877131949743587e-06, "loss": 0.93729341, "num_input_tokens_seen": 4408400, "step": 211, "time_per_iteration": 2.478599786758423 }, { "auxiliary_loss_clip": 0.01367094, "auxiliary_loss_mlp": 0.01078591, "balance_loss_clip": 1.10253382, "balance_loss_mlp": 1.05948234, "epoch": 0.025491492815487283, "flos": 25553853648000.0, "grad_norm": 2.2464687423561025, "language_loss": 0.77946156, "learning_rate": 3.880557232483993e-06, "loss": 0.80391848, "num_input_tokens_seen": 4427840, "step": 212, "time_per_iteration": 2.5540060997009277 }, { "auxiliary_loss_clip": 0.01366147, "auxiliary_loss_mlp": 0.01061694, "balance_loss_clip": 1.09939647, "balance_loss_mlp": 1.04216766, "epoch": 0.025611735706126376, "flos": 20630752502400.0, "grad_norm": 2.013256768384452, "language_loss": 0.86888659, "learning_rate": 3.883966396187164e-06, "loss": 0.89316499, "num_input_tokens_seen": 4447110, "step": 213, "time_per_iteration": 2.5085840225219727 }, { "auxiliary_loss_clip": 0.0136812, "auxiliary_loss_mlp": 0.01060864, "balance_loss_clip": 1.10369563, "balance_loss_mlp": 1.04280365, "epoch": 0.025731978596765466, "flos": 19062282245760.0, "grad_norm": 2.048296030302431, "language_loss": 0.90014124, "learning_rate": 3.887359591851937e-06, "loss": 0.92443109, "num_input_tokens_seen": 4464715, "step": 214, "time_per_iteration": 2.543308734893799 }, { "auxiliary_loss_clip": 0.01363345, "auxiliary_loss_mlp": 0.01058674, "balance_loss_clip": 1.10138774, "balance_loss_mlp": 1.03925502, "epoch": 0.025852221487404556, "flos": 22163814927360.0, "grad_norm": 2.2926812510931076, "language_loss": 0.92373013, "learning_rate": 3.890736968365265e-06, "loss": 0.94795024, "num_input_tokens_seen": 4485030, "step": 215, "time_per_iteration": 2.5321481227874756 }, { "auxiliary_loss_clip": 0.01364322, "auxiliary_loss_mlp": 0.0106298, "balance_loss_clip": 1.10002589, "balance_loss_mlp": 1.04281032, "epoch": 0.02597246437804365, "flos": 26541971861760.0, "grad_norm": 1.9901051216940264, "language_loss": 0.85252273, "learning_rate": 3.894098672541412e-06, "loss": 0.87679577, "num_input_tokens_seen": 4505935, "step": 216, "time_per_iteration": 2.6220383644104004 }, { "auxiliary_loss_clip": 0.01364433, "auxiliary_loss_mlp": 0.01066895, "balance_loss_clip": 1.10046506, "balance_loss_mlp": 1.04668856, "epoch": 0.02609270726868274, "flos": 32671671696000.0, "grad_norm": 1.7493344947840754, "language_loss": 0.75335574, "learning_rate": 3.89744484916025e-06, "loss": 0.77766901, "num_input_tokens_seen": 4527045, "step": 217, "time_per_iteration": 2.65970516204834 }, { "auxiliary_loss_clip": 0.01366411, "auxiliary_loss_mlp": 0.01068807, "balance_loss_clip": 1.10224271, "balance_loss_mlp": 1.04861248, "epoch": 0.02621295015932183, "flos": 26243553669120.0, "grad_norm": 2.3008581604823672, "language_loss": 0.87159967, "learning_rate": 3.900775641004673e-06, "loss": 0.89595187, "num_input_tokens_seen": 4546360, "step": 218, "time_per_iteration": 2.546806573867798 }, { "auxiliary_loss_clip": 0.01371726, "auxiliary_loss_mlp": 0.01074983, "balance_loss_clip": 1.10462165, "balance_loss_mlp": 1.05239296, "epoch": 0.026333193049960922, "flos": 42921402353280.0, "grad_norm": 3.320789338224337, "language_loss": 0.73600757, "learning_rate": 3.904091188897156e-06, "loss": 0.76047468, "num_input_tokens_seen": 4565495, "step": 219, "time_per_iteration": 2.669745922088623 }, { "auxiliary_loss_clip": 0.01362539, "auxiliary_loss_mlp": 0.01073098, "balance_loss_clip": 1.10027313, "balance_loss_mlp": 1.05253482, "epoch": 0.026453435940600012, "flos": 17963846386560.0, "grad_norm": 2.020720471020208, "language_loss": 0.81937492, "learning_rate": 3.90739163173548e-06, "loss": 0.84373134, "num_input_tokens_seen": 4583330, "step": 220, "time_per_iteration": 2.5275635719299316 }, { "auxiliary_loss_clip": 0.01360372, "auxiliary_loss_mlp": 0.01069607, "balance_loss_clip": 1.09959698, "balance_loss_mlp": 1.05024719, "epoch": 0.026573678831239102, "flos": 18984319776000.0, "grad_norm": 2.9456030849177925, "language_loss": 0.88444632, "learning_rate": 3.910677106527646e-06, "loss": 0.90874612, "num_input_tokens_seen": 4600520, "step": 221, "time_per_iteration": 2.4854769706726074 }, { "auxiliary_loss_clip": 0.01358389, "auxiliary_loss_mlp": 0.01069227, "balance_loss_clip": 1.09946918, "balance_loss_mlp": 1.05108368, "epoch": 0.026693921721878195, "flos": 29241448634880.0, "grad_norm": 2.5311777765044643, "language_loss": 0.84321344, "learning_rate": 3.913947748426004e-06, "loss": 0.86748958, "num_input_tokens_seen": 4617340, "step": 222, "time_per_iteration": 2.6192891597747803 }, { "auxiliary_loss_clip": 0.01364779, "auxiliary_loss_mlp": 0.01070756, "balance_loss_clip": 1.10314584, "balance_loss_mlp": 1.05221844, "epoch": 0.026814164612517285, "flos": 14128083797760.0, "grad_norm": 2.854249974962943, "language_loss": 0.76695639, "learning_rate": 3.9172036907606136e-06, "loss": 0.79131174, "num_input_tokens_seen": 4630820, "step": 223, "time_per_iteration": 2.4835925102233887 }, { "auxiliary_loss_clip": 0.01362118, "auxiliary_loss_mlp": 0.01064354, "balance_loss_clip": 1.09909868, "balance_loss_mlp": 1.04522085, "epoch": 0.026934407503156375, "flos": 23511973115520.0, "grad_norm": 1.8045871885028086, "language_loss": 0.94935501, "learning_rate": 3.920445065071855e-06, "loss": 0.97361982, "num_input_tokens_seen": 4651985, "step": 224, "time_per_iteration": 2.5434398651123047 }, { "auxiliary_loss_clip": 0.01358694, "auxiliary_loss_mlp": 0.0107522, "balance_loss_clip": 1.09892631, "balance_loss_mlp": 1.05577648, "epoch": 0.027054650393795468, "flos": 28950356816640.0, "grad_norm": 2.3382265797310966, "language_loss": 0.7993409, "learning_rate": 3.923672001142322e-06, "loss": 0.82368004, "num_input_tokens_seen": 4672295, "step": 225, "time_per_iteration": 2.5866034030914307 }, { "auxiliary_loss_clip": 0.01355271, "auxiliary_loss_mlp": 0.01076068, "balance_loss_clip": 1.09733748, "balance_loss_mlp": 1.05650616, "epoch": 0.027174893284434558, "flos": 31431568596480.0, "grad_norm": 4.077587836933165, "language_loss": 0.84481573, "learning_rate": 3.926884627027996e-06, "loss": 0.86912912, "num_input_tokens_seen": 4696065, "step": 226, "time_per_iteration": 2.692692518234253 }, { "auxiliary_loss_clip": 0.01357076, "auxiliary_loss_mlp": 0.01070759, "balance_loss_clip": 1.09662664, "balance_loss_mlp": 1.05248427, "epoch": 0.027295136175073648, "flos": 22054466949120.0, "grad_norm": 2.107326939323988, "language_loss": 0.77363896, "learning_rate": 3.930083069088744e-06, "loss": 0.79791731, "num_input_tokens_seen": 4716065, "step": 227, "time_per_iteration": 2.503667116165161 }, { "auxiliary_loss_clip": 0.01322995, "auxiliary_loss_mlp": 0.01016696, "balance_loss_clip": 1.1227802, "balance_loss_mlp": 1.00658703, "epoch": 0.02741537906571274, "flos": 60800752972800.0, "grad_norm": 0.9749441330164644, "language_loss": 0.59297657, "learning_rate": 3.933267452018137e-06, "loss": 0.61637342, "num_input_tokens_seen": 4775860, "step": 228, "time_per_iteration": 3.094857692718506 }, { "auxiliary_loss_clip": 0.01355364, "auxiliary_loss_mlp": 0.01062031, "balance_loss_clip": 1.099015, "balance_loss_mlp": 1.04311216, "epoch": 0.02753562195635183, "flos": 24606278910720.0, "grad_norm": 2.208420289896797, "language_loss": 0.8430475, "learning_rate": 3.936437898872622e-06, "loss": 0.86722136, "num_input_tokens_seen": 4795835, "step": 229, "time_per_iteration": 2.54166841506958 }, { "auxiliary_loss_clip": 0.01357199, "auxiliary_loss_mlp": 0.0105513, "balance_loss_clip": 1.09940708, "balance_loss_mlp": 1.03717709, "epoch": 0.02765586484699092, "flos": 34094236907520.0, "grad_norm": 3.2385977808700495, "language_loss": 0.79844141, "learning_rate": 3.9395945311000525e-06, "loss": 0.8225646, "num_input_tokens_seen": 4817460, "step": 230, "time_per_iteration": 2.6339735984802246 }, { "auxiliary_loss_clip": 0.0135729, "auxiliary_loss_mlp": 0.01069433, "balance_loss_clip": 1.09914613, "balance_loss_mlp": 1.05052686, "epoch": 0.027776107737630014, "flos": 14829922615680.0, "grad_norm": 3.8376158589856786, "language_loss": 0.90692508, "learning_rate": 3.942737468567608e-06, "loss": 0.93119228, "num_input_tokens_seen": 4835475, "step": 231, "time_per_iteration": 2.497300863265991 }, { "auxiliary_loss_clip": 0.01355252, "auxiliary_loss_mlp": 0.01069152, "balance_loss_clip": 1.09883857, "balance_loss_mlp": 1.05079365, "epoch": 0.027896350628269104, "flos": 47920347066240.0, "grad_norm": 2.0804334918992344, "language_loss": 0.8606438, "learning_rate": 3.9458668295891026e-06, "loss": 0.88488781, "num_input_tokens_seen": 4857760, "step": 232, "time_per_iteration": 3.556023120880127 }, { "auxiliary_loss_clip": 0.01351136, "auxiliary_loss_mlp": 0.0106241, "balance_loss_clip": 1.09420633, "balance_loss_mlp": 1.04269302, "epoch": 0.028016593518908194, "flos": 21684550734720.0, "grad_norm": 2.3319487340446874, "language_loss": 0.86540693, "learning_rate": 3.948982730951712e-06, "loss": 0.88954234, "num_input_tokens_seen": 4875855, "step": 233, "time_per_iteration": 3.401474952697754 }, { "auxiliary_loss_clip": 0.01354923, "auxiliary_loss_mlp": 0.0106241, "balance_loss_clip": 1.09745383, "balance_loss_mlp": 1.04310989, "epoch": 0.028136836409547287, "flos": 18439483305600.0, "grad_norm": 2.467211104804349, "language_loss": 0.82109368, "learning_rate": 3.9520852879421254e-06, "loss": 0.84526706, "num_input_tokens_seen": 4893200, "step": 234, "time_per_iteration": 3.2872390747070312 }, { "auxiliary_loss_clip": 0.01349852, "auxiliary_loss_mlp": 0.01065635, "balance_loss_clip": 1.09696901, "balance_loss_mlp": 1.04802775, "epoch": 0.028257079300186377, "flos": 31576934937600.0, "grad_norm": 2.247225688848913, "language_loss": 0.81712079, "learning_rate": 3.955174614372137e-06, "loss": 0.84127569, "num_input_tokens_seen": 4912965, "step": 235, "time_per_iteration": 2.5721051692962646 }, { "auxiliary_loss_clip": 0.01352424, "auxiliary_loss_mlp": 0.01068071, "balance_loss_clip": 1.09749591, "balance_loss_mlp": 1.04923606, "epoch": 0.028377322190825467, "flos": 23513337832320.0, "grad_norm": 2.3448512452873995, "language_loss": 0.84337473, "learning_rate": 3.9582508226037045e-06, "loss": 0.8675797, "num_input_tokens_seen": 4933105, "step": 236, "time_per_iteration": 2.538154363632202 }, { "auxiliary_loss_clip": 0.01359055, "auxiliary_loss_mlp": 0.01070912, "balance_loss_clip": 1.09861803, "balance_loss_mlp": 1.05113506, "epoch": 0.02849756508146456, "flos": 20479604071680.0, "grad_norm": 2.440995852846626, "language_loss": 0.94367313, "learning_rate": 3.9613140235734636e-06, "loss": 0.96797276, "num_input_tokens_seen": 4950085, "step": 237, "time_per_iteration": 2.5157129764556885 }, { "auxiliary_loss_clip": 0.01350154, "auxiliary_loss_mlp": 0.01063875, "balance_loss_clip": 1.09532285, "balance_loss_mlp": 1.04457521, "epoch": 0.02861780797210365, "flos": 14283362292480.0, "grad_norm": 1.992663850482671, "language_loss": 0.80907106, "learning_rate": 3.96436432681674e-06, "loss": 0.83321142, "num_input_tokens_seen": 4968075, "step": 238, "time_per_iteration": 2.4828989505767822 }, { "auxiliary_loss_clip": 0.01349834, "auxiliary_loss_mlp": 0.01069475, "balance_loss_clip": 1.09542072, "balance_loss_mlp": 1.05052102, "epoch": 0.02873805086274274, "flos": 25808532053760.0, "grad_norm": 1.9908416034147012, "language_loss": 0.88966727, "learning_rate": 3.967401840491044e-06, "loss": 0.91386038, "num_input_tokens_seen": 4987355, "step": 239, "time_per_iteration": 2.5395402908325195 }, { "auxiliary_loss_clip": 0.01347916, "auxiliary_loss_mlp": 0.01063631, "balance_loss_clip": 1.09742951, "balance_loss_mlp": 1.04706097, "epoch": 0.028858293753381833, "flos": 17304238984320.0, "grad_norm": 2.672957067561469, "language_loss": 0.87890118, "learning_rate": 3.97042667139909e-06, "loss": 0.90301669, "num_input_tokens_seen": 5004680, "step": 240, "time_per_iteration": 2.4862513542175293 }, { "auxiliary_loss_clip": 0.01349318, "auxiliary_loss_mlp": 0.010588, "balance_loss_clip": 1.09697223, "balance_loss_mlp": 1.0407877, "epoch": 0.028978536644020923, "flos": 23038347358080.0, "grad_norm": 2.174193489347373, "language_loss": 0.87252986, "learning_rate": 3.973438925011327e-06, "loss": 0.89661103, "num_input_tokens_seen": 5022965, "step": 241, "time_per_iteration": 2.523435115814209 }, { "auxiliary_loss_clip": 0.01348963, "auxiliary_loss_mlp": 0.01051689, "balance_loss_clip": 1.09396458, "balance_loss_mlp": 1.03308034, "epoch": 0.029098779534660012, "flos": 28329712692480.0, "grad_norm": 9.429468356383522, "language_loss": 0.91451728, "learning_rate": 3.976438705488002e-06, "loss": 0.93852377, "num_input_tokens_seen": 5042625, "step": 242, "time_per_iteration": 2.5604324340820312 }, { "auxiliary_loss_clip": 0.01346862, "auxiliary_loss_mlp": 0.01061335, "balance_loss_clip": 1.09626973, "balance_loss_mlp": 1.04419231, "epoch": 0.029219022425299106, "flos": 13881665520000.0, "grad_norm": 8.943295230343054, "language_loss": 0.93283665, "learning_rate": 3.9794261157007744e-06, "loss": 0.9569186, "num_input_tokens_seen": 5060380, "step": 243, "time_per_iteration": 2.5408549308776855 }, { "auxiliary_loss_clip": 0.01352182, "auxiliary_loss_mlp": 0.01058749, "balance_loss_clip": 1.0975039, "balance_loss_mlp": 1.03918707, "epoch": 0.029339265315938196, "flos": 19422501788160.0, "grad_norm": 2.3217988013801363, "language_loss": 0.85028291, "learning_rate": 3.982401257253887e-06, "loss": 0.87439227, "num_input_tokens_seen": 5078720, "step": 244, "time_per_iteration": 2.4937031269073486 }, { "auxiliary_loss_clip": 0.01348208, "auxiliary_loss_mlp": 0.0105766, "balance_loss_clip": 1.0945996, "balance_loss_mlp": 1.04029155, "epoch": 0.029459508206577285, "flos": 15669550005120.0, "grad_norm": 2.3041590113838204, "language_loss": 0.89937472, "learning_rate": 3.985364230504893e-06, "loss": 0.92343342, "num_input_tokens_seen": 5096605, "step": 245, "time_per_iteration": 2.4814274311065674 }, { "auxiliary_loss_clip": 0.01354748, "auxiliary_loss_mlp": 0.01061326, "balance_loss_clip": 1.10030985, "balance_loss_mlp": 1.04437447, "epoch": 0.02957975109721638, "flos": 28220975245440.0, "grad_norm": 2.013585364659552, "language_loss": 0.84621513, "learning_rate": 3.988315134584976e-06, "loss": 0.87037593, "num_input_tokens_seen": 5116285, "step": 246, "time_per_iteration": 2.5597591400146484 }, { "auxiliary_loss_clip": 0.01352394, "auxiliary_loss_mlp": 0.01068742, "balance_loss_clip": 1.0980742, "balance_loss_mlp": 1.05041969, "epoch": 0.02969999398785547, "flos": 24315869450880.0, "grad_norm": 1.909379917416462, "language_loss": 0.80429095, "learning_rate": 3.991254067418851e-06, "loss": 0.8285023, "num_input_tokens_seen": 5136825, "step": 247, "time_per_iteration": 2.5331099033355713 }, { "auxiliary_loss_clip": 0.01342871, "auxiliary_loss_mlp": 0.01067055, "balance_loss_clip": 1.09528852, "balance_loss_mlp": 1.04994798, "epoch": 0.02982023687849456, "flos": 35078584193280.0, "grad_norm": 2.4163010074261524, "language_loss": 0.83139467, "learning_rate": 3.994181125744254e-06, "loss": 0.8554939, "num_input_tokens_seen": 5158630, "step": 248, "time_per_iteration": 2.6819405555725098 }, { "auxiliary_loss_clip": 0.01346289, "auxiliary_loss_mlp": 0.01057225, "balance_loss_clip": 1.09550762, "balance_loss_mlp": 1.04008269, "epoch": 0.02994047976913365, "flos": 26177155378560.0, "grad_norm": 2.3555214822289794, "language_loss": 0.74047112, "learning_rate": 3.99709640513106e-06, "loss": 0.76450622, "num_input_tokens_seen": 5179510, "step": 249, "time_per_iteration": 2.6884233951568604 }, { "auxiliary_loss_clip": 0.01347535, "auxiliary_loss_mlp": 0.01071898, "balance_loss_clip": 1.09291124, "balance_loss_mlp": 1.05293226, "epoch": 0.03006072265977274, "flos": 25625028447360.0, "grad_norm": 2.3579285643860426, "language_loss": 0.85599041, "learning_rate": 4e-06, "loss": 0.88018471, "num_input_tokens_seen": 5199345, "step": 250, "time_per_iteration": 2.6508238315582275 }, { "auxiliary_loss_clip": 0.01348462, "auxiliary_loss_mlp": 0.01056314, "balance_loss_clip": 1.09756136, "balance_loss_mlp": 1.03913593, "epoch": 0.03018096555041183, "flos": 22127078292480.0, "grad_norm": 6.856721845759342, "language_loss": 0.88815314, "learning_rate": 3.999999848300794e-06, "loss": 0.91220093, "num_input_tokens_seen": 5218330, "step": 251, "time_per_iteration": 2.561594009399414 }, { "auxiliary_loss_clip": 0.01340508, "auxiliary_loss_mlp": 0.0105576, "balance_loss_clip": 1.09116793, "balance_loss_mlp": 1.03841567, "epoch": 0.030301208441050925, "flos": 30188197359360.0, "grad_norm": 1.6813670069668534, "language_loss": 0.89249742, "learning_rate": 3.999999393203203e-06, "loss": 0.91646004, "num_input_tokens_seen": 5240740, "step": 252, "time_per_iteration": 2.600980520248413 }, { "auxiliary_loss_clip": 0.01340194, "auxiliary_loss_mlp": 0.01058007, "balance_loss_clip": 1.08968043, "balance_loss_mlp": 1.04090023, "epoch": 0.030421451331690014, "flos": 23621392920960.0, "grad_norm": 2.0489429878677483, "language_loss": 0.85192108, "learning_rate": 3.999998634707293e-06, "loss": 0.87590307, "num_input_tokens_seen": 5260290, "step": 253, "time_per_iteration": 2.5189058780670166 }, { "auxiliary_loss_clip": 0.01349752, "auxiliary_loss_mlp": 0.01063256, "balance_loss_clip": 1.09779787, "balance_loss_mlp": 1.04561341, "epoch": 0.030541694222329104, "flos": 27928446883200.0, "grad_norm": 3.2957676838366092, "language_loss": 0.96555686, "learning_rate": 3.999997572813182e-06, "loss": 0.98968697, "num_input_tokens_seen": 5278100, "step": 254, "time_per_iteration": 2.553295373916626 }, { "auxiliary_loss_clip": 0.01344141, "auxiliary_loss_mlp": 0.01069777, "balance_loss_clip": 1.09273148, "balance_loss_mlp": 1.05239654, "epoch": 0.030661937112968194, "flos": 18588441006720.0, "grad_norm": 1.8447062214383305, "language_loss": 0.87863976, "learning_rate": 3.999996207521028e-06, "loss": 0.90277898, "num_input_tokens_seen": 5296810, "step": 255, "time_per_iteration": 2.519864559173584 }, { "auxiliary_loss_clip": 0.01346642, "auxiliary_loss_mlp": 0.01058714, "balance_loss_clip": 1.09154677, "balance_loss_mlp": 1.03992653, "epoch": 0.030782180003607287, "flos": 12969139478400.0, "grad_norm": 2.1470557493906837, "language_loss": 0.82066244, "learning_rate": 3.999994538831039e-06, "loss": 0.84471607, "num_input_tokens_seen": 5313395, "step": 256, "time_per_iteration": 2.5047779083251953 }, { "auxiliary_loss_clip": 0.013442, "auxiliary_loss_mlp": 0.01059467, "balance_loss_clip": 1.09288216, "balance_loss_mlp": 1.04066777, "epoch": 0.030902422894246377, "flos": 23335364920320.0, "grad_norm": 2.4750030227833117, "language_loss": 0.85999751, "learning_rate": 3.99999256674347e-06, "loss": 0.88403416, "num_input_tokens_seen": 5333545, "step": 257, "time_per_iteration": 2.577547788619995 }, { "auxiliary_loss_clip": 0.01287211, "auxiliary_loss_mlp": 0.01008034, "balance_loss_clip": 1.09815097, "balance_loss_mlp": 0.99759179, "epoch": 0.031022665784885467, "flos": 55094151438720.0, "grad_norm": 1.004923587208518, "language_loss": 0.5349049, "learning_rate": 3.999990291258618e-06, "loss": 0.55785733, "num_input_tokens_seen": 5392235, "step": 258, "time_per_iteration": 3.0803494453430176 }, { "auxiliary_loss_clip": 0.01343399, "auxiliary_loss_mlp": 0.01061069, "balance_loss_clip": 1.09277654, "balance_loss_mlp": 1.0433073, "epoch": 0.03114290867552456, "flos": 19317786664320.0, "grad_norm": 2.3273693251046295, "language_loss": 0.86488926, "learning_rate": 3.999987712376829e-06, "loss": 0.8889339, "num_input_tokens_seen": 5410555, "step": 259, "time_per_iteration": 2.5353946685791016 }, { "auxiliary_loss_clip": 0.01342276, "auxiliary_loss_mlp": 0.01060603, "balance_loss_clip": 1.09426641, "balance_loss_mlp": 1.04290009, "epoch": 0.031263151566163654, "flos": 20959442881920.0, "grad_norm": 2.1982399831404953, "language_loss": 0.82236338, "learning_rate": 3.999984830098494e-06, "loss": 0.84639215, "num_input_tokens_seen": 5430135, "step": 260, "time_per_iteration": 4.185324668884277 }, { "auxiliary_loss_clip": 0.01339203, "auxiliary_loss_mlp": 0.01063006, "balance_loss_clip": 1.09065187, "balance_loss_mlp": 1.04500532, "epoch": 0.03138339445680274, "flos": 14793006412800.0, "grad_norm": 2.9356722304355114, "language_loss": 0.98046136, "learning_rate": 3.999981644424051e-06, "loss": 1.00448346, "num_input_tokens_seen": 5444935, "step": 261, "time_per_iteration": 3.1868677139282227 }, { "auxiliary_loss_clip": 0.01341159, "auxiliary_loss_mlp": 0.01069933, "balance_loss_clip": 1.0940057, "balance_loss_mlp": 1.05084765, "epoch": 0.03150363734744183, "flos": 11655599022720.0, "grad_norm": 2.645400996478755, "language_loss": 0.86089593, "learning_rate": 3.999978155353982e-06, "loss": 0.88500684, "num_input_tokens_seen": 5462080, "step": 262, "time_per_iteration": 2.502997636795044 }, { "auxiliary_loss_clip": 0.01338949, "auxiliary_loss_mlp": 0.01063926, "balance_loss_clip": 1.09073329, "balance_loss_mlp": 1.04542446, "epoch": 0.03162388023808092, "flos": 33727732485120.0, "grad_norm": 2.267351285021861, "language_loss": 0.80341858, "learning_rate": 3.9999743628888186e-06, "loss": 0.8274473, "num_input_tokens_seen": 5483870, "step": 263, "time_per_iteration": 2.6104564666748047 }, { "auxiliary_loss_clip": 0.01332334, "auxiliary_loss_mlp": 0.01060338, "balance_loss_clip": 1.08722758, "balance_loss_mlp": 1.0426352, "epoch": 0.03174412312872001, "flos": 20810952057600.0, "grad_norm": 2.7662914793297864, "language_loss": 0.89219964, "learning_rate": 3.999970267029133e-06, "loss": 0.91612631, "num_input_tokens_seen": 5502830, "step": 264, "time_per_iteration": 2.5291385650634766 }, { "auxiliary_loss_clip": 0.01335072, "auxiliary_loss_mlp": 0.01055993, "balance_loss_clip": 1.0904026, "balance_loss_mlp": 1.03866029, "epoch": 0.0318643660193591, "flos": 23727939638400.0, "grad_norm": 2.4709643370415564, "language_loss": 0.80019975, "learning_rate": 3.999965867775548e-06, "loss": 0.82411039, "num_input_tokens_seen": 5523225, "step": 265, "time_per_iteration": 2.5426151752471924 }, { "auxiliary_loss_clip": 0.01337859, "auxiliary_loss_mlp": 0.01066422, "balance_loss_clip": 1.09038353, "balance_loss_mlp": 1.04913712, "epoch": 0.0319846089099982, "flos": 13917863450880.0, "grad_norm": 2.5103618653069684, "language_loss": 0.86971104, "learning_rate": 3.9999611651287315e-06, "loss": 0.89375389, "num_input_tokens_seen": 5541380, "step": 266, "time_per_iteration": 2.5159478187561035 }, { "auxiliary_loss_clip": 0.01340565, "auxiliary_loss_mlp": 0.01060924, "balance_loss_clip": 1.09252357, "balance_loss_mlp": 1.04366207, "epoch": 0.03210485180063729, "flos": 14753253035520.0, "grad_norm": 2.393723200460593, "language_loss": 0.78739327, "learning_rate": 3.999956159089396e-06, "loss": 0.81140816, "num_input_tokens_seen": 5558830, "step": 267, "time_per_iteration": 2.4616403579711914 }, { "auxiliary_loss_clip": 0.0133729, "auxiliary_loss_mlp": 0.01065539, "balance_loss_clip": 1.09130144, "balance_loss_mlp": 1.04777694, "epoch": 0.03222509469127638, "flos": 28913153304960.0, "grad_norm": 2.3866526384486466, "language_loss": 0.79274267, "learning_rate": 3.999950849658302e-06, "loss": 0.81677091, "num_input_tokens_seen": 5577750, "step": 268, "time_per_iteration": 2.5542900562286377 }, { "auxiliary_loss_clip": 0.0134391, "auxiliary_loss_mlp": 0.01067759, "balance_loss_clip": 1.09357536, "balance_loss_mlp": 1.05040216, "epoch": 0.03234533758191547, "flos": 16946389739520.0, "grad_norm": 2.16188505563306, "language_loss": 0.841959, "learning_rate": 3.999945236836254e-06, "loss": 0.86607569, "num_input_tokens_seen": 5596715, "step": 269, "time_per_iteration": 2.50282883644104 }, { "auxiliary_loss_clip": 0.0134402, "auxiliary_loss_mlp": 0.01066178, "balance_loss_clip": 1.09516478, "balance_loss_mlp": 1.04737902, "epoch": 0.03246558047255456, "flos": 18989096284800.0, "grad_norm": 2.4772982015118155, "language_loss": 0.94841027, "learning_rate": 3.999939320624103e-06, "loss": 0.97251225, "num_input_tokens_seen": 5611865, "step": 270, "time_per_iteration": 2.509799003601074 }, { "auxiliary_loss_clip": 0.01341445, "auxiliary_loss_mlp": 0.0106344, "balance_loss_clip": 1.09389699, "balance_loss_mlp": 1.04583311, "epoch": 0.03258582336319365, "flos": 23728334688000.0, "grad_norm": 1.9824435272516618, "language_loss": 0.90065706, "learning_rate": 3.999933101022749e-06, "loss": 0.92470586, "num_input_tokens_seen": 5632270, "step": 271, "time_per_iteration": 2.596348285675049 }, { "auxiliary_loss_clip": 0.01337975, "auxiliary_loss_mlp": 0.01065526, "balance_loss_clip": 1.09219527, "balance_loss_mlp": 1.04808581, "epoch": 0.032706066253832745, "flos": 27670823562240.0, "grad_norm": 2.134231135976669, "language_loss": 0.86807013, "learning_rate": 3.999926578033132e-06, "loss": 0.8921051, "num_input_tokens_seen": 5652085, "step": 272, "time_per_iteration": 2.589371919631958 }, { "auxiliary_loss_clip": 0.01337107, "auxiliary_loss_mlp": 0.01063806, "balance_loss_clip": 1.08852267, "balance_loss_mlp": 1.04598451, "epoch": 0.032826309144471835, "flos": 45624685968000.0, "grad_norm": 1.9736985798727305, "language_loss": 0.63150817, "learning_rate": 3.999919751656244e-06, "loss": 0.65551728, "num_input_tokens_seen": 5678985, "step": 273, "time_per_iteration": 2.71744441986084 }, { "auxiliary_loss_clip": 0.01333368, "auxiliary_loss_mlp": 0.01057577, "balance_loss_clip": 1.08777916, "balance_loss_mlp": 1.03900385, "epoch": 0.032946552035110925, "flos": 25812374808960.0, "grad_norm": 2.8679144539871495, "language_loss": 0.75774014, "learning_rate": 3.9999126218931195e-06, "loss": 0.78164953, "num_input_tokens_seen": 5697020, "step": 274, "time_per_iteration": 2.5234222412109375 }, { "auxiliary_loss_clip": 0.01340223, "auxiliary_loss_mlp": 0.01052168, "balance_loss_clip": 1.09333205, "balance_loss_mlp": 1.03462029, "epoch": 0.033066794925750015, "flos": 15121984101120.0, "grad_norm": 2.1604332970218394, "language_loss": 0.89828002, "learning_rate": 3.99990518874484e-06, "loss": 0.92220396, "num_input_tokens_seen": 5713460, "step": 275, "time_per_iteration": 2.514495849609375 }, { "auxiliary_loss_clip": 0.01337893, "auxiliary_loss_mlp": 0.01068565, "balance_loss_clip": 1.09233379, "balance_loss_mlp": 1.05123186, "epoch": 0.033187037816389105, "flos": 22776593973120.0, "grad_norm": 2.295405872491499, "language_loss": 0.92337883, "learning_rate": 3.999897452212534e-06, "loss": 0.94744337, "num_input_tokens_seen": 5730790, "step": 276, "time_per_iteration": 2.5258681774139404 }, { "auxiliary_loss_clip": 0.01333459, "auxiliary_loss_mlp": 0.01064446, "balance_loss_clip": 1.0898875, "balance_loss_mlp": 1.0465169, "epoch": 0.033307280707028195, "flos": 23331414424320.0, "grad_norm": 2.170279915796394, "language_loss": 1.00139761, "learning_rate": 3.999889412297374e-06, "loss": 1.02537668, "num_input_tokens_seen": 5750215, "step": 277, "time_per_iteration": 2.5562474727630615 }, { "auxiliary_loss_clip": 0.01333571, "auxiliary_loss_mlp": 0.01043413, "balance_loss_clip": 1.08807111, "balance_loss_mlp": 1.02710533, "epoch": 0.03342752359766729, "flos": 28840290566400.0, "grad_norm": 2.066377876317811, "language_loss": 0.79029882, "learning_rate": 3.999881069000581e-06, "loss": 0.81406868, "num_input_tokens_seen": 5769945, "step": 278, "time_per_iteration": 2.58473539352417 }, { "auxiliary_loss_clip": 0.01335182, "auxiliary_loss_mlp": 0.01054719, "balance_loss_clip": 1.08848381, "balance_loss_mlp": 1.03621721, "epoch": 0.03354776648830638, "flos": 19384544090880.0, "grad_norm": 3.367739707248116, "language_loss": 0.87200749, "learning_rate": 3.99987242232342e-06, "loss": 0.89590657, "num_input_tokens_seen": 5784950, "step": 279, "time_per_iteration": 2.507972240447998 }, { "auxiliary_loss_clip": 0.0133659, "auxiliary_loss_mlp": 0.010664, "balance_loss_clip": 1.09167504, "balance_loss_mlp": 1.04811358, "epoch": 0.03366800937894547, "flos": 17858628472320.0, "grad_norm": 1.9479442345584634, "language_loss": 0.79613489, "learning_rate": 3.9998634722672026e-06, "loss": 0.8201648, "num_input_tokens_seen": 5805005, "step": 280, "time_per_iteration": 2.526787757873535 }, { "auxiliary_loss_clip": 0.01336722, "auxiliary_loss_mlp": 0.010581, "balance_loss_clip": 1.09226966, "balance_loss_mlp": 1.04107666, "epoch": 0.03378825226958456, "flos": 35951033635200.0, "grad_norm": 2.120507500201445, "language_loss": 0.78565633, "learning_rate": 3.999854218833286e-06, "loss": 0.80960453, "num_input_tokens_seen": 5825825, "step": 281, "time_per_iteration": 2.6457369327545166 }, { "auxiliary_loss_clip": 0.01335434, "auxiliary_loss_mlp": 0.01062124, "balance_loss_clip": 1.09196496, "balance_loss_mlp": 1.04435015, "epoch": 0.03390849516022365, "flos": 25702488126720.0, "grad_norm": 2.24343246384953, "language_loss": 0.81778347, "learning_rate": 3.999844662023075e-06, "loss": 0.84175909, "num_input_tokens_seen": 5845700, "step": 282, "time_per_iteration": 2.545872688293457 }, { "auxiliary_loss_clip": 0.0132775, "auxiliary_loss_mlp": 0.01058647, "balance_loss_clip": 1.08758342, "balance_loss_mlp": 1.04140937, "epoch": 0.03402873805086274, "flos": 21284505987840.0, "grad_norm": 1.8115762606009012, "language_loss": 0.92019212, "learning_rate": 3.999834801838018e-06, "loss": 0.94405603, "num_input_tokens_seen": 5864680, "step": 283, "time_per_iteration": 2.5256025791168213 }, { "auxiliary_loss_clip": 0.01329167, "auxiliary_loss_mlp": 0.01056436, "balance_loss_clip": 1.08850718, "balance_loss_mlp": 1.03924584, "epoch": 0.03414898094150183, "flos": 22710913954560.0, "grad_norm": 1.8912749750533082, "language_loss": 0.74029875, "learning_rate": 3.9998246382796115e-06, "loss": 0.76415479, "num_input_tokens_seen": 5884260, "step": 284, "time_per_iteration": 2.5002176761627197 }, { "auxiliary_loss_clip": 0.01333349, "auxiliary_loss_mlp": 0.01052163, "balance_loss_clip": 1.08690763, "balance_loss_mlp": 1.03386402, "epoch": 0.03426922383214093, "flos": 18879927874560.0, "grad_norm": 2.256044220588054, "language_loss": 0.90905637, "learning_rate": 3.999814171349399e-06, "loss": 0.93291152, "num_input_tokens_seen": 5902120, "step": 285, "time_per_iteration": 2.501950979232788 }, { "auxiliary_loss_clip": 0.0132887, "auxiliary_loss_mlp": 0.01055901, "balance_loss_clip": 1.08813155, "balance_loss_mlp": 1.03940225, "epoch": 0.03438946672278002, "flos": 34752012716160.0, "grad_norm": 1.6021543378440304, "language_loss": 0.73486495, "learning_rate": 3.9998034010489655e-06, "loss": 0.75871265, "num_input_tokens_seen": 5925810, "step": 286, "time_per_iteration": 2.619114875793457 }, { "auxiliary_loss_clip": 0.01328761, "auxiliary_loss_mlp": 0.01061034, "balance_loss_clip": 1.08955002, "balance_loss_mlp": 1.04446411, "epoch": 0.03450970961341911, "flos": 22164102236160.0, "grad_norm": 2.637695448636838, "language_loss": 0.76006019, "learning_rate": 3.999792327379946e-06, "loss": 0.78395808, "num_input_tokens_seen": 5945185, "step": 287, "time_per_iteration": 3.3553969860076904 }, { "auxiliary_loss_clip": 0.01334865, "auxiliary_loss_mlp": 0.01063927, "balance_loss_clip": 1.09394956, "balance_loss_mlp": 1.04707134, "epoch": 0.034629952504058197, "flos": 21725740656000.0, "grad_norm": 2.1228582628350092, "language_loss": 0.96314919, "learning_rate": 3.999780950344021e-06, "loss": 0.98713708, "num_input_tokens_seen": 5963375, "step": 288, "time_per_iteration": 3.356580972671509 }, { "auxiliary_loss_clip": 0.01336544, "auxiliary_loss_mlp": 0.01065429, "balance_loss_clip": 1.09137404, "balance_loss_mlp": 1.04733264, "epoch": 0.034750195394697286, "flos": 20047994248320.0, "grad_norm": 1.923355404044138, "language_loss": 0.82856649, "learning_rate": 3.999769269942916e-06, "loss": 0.85258615, "num_input_tokens_seen": 5983415, "step": 289, "time_per_iteration": 3.273599863052368 }, { "auxiliary_loss_clip": 0.01329649, "auxiliary_loss_mlp": 0.01055401, "balance_loss_clip": 1.08848834, "balance_loss_mlp": 1.03786576, "epoch": 0.034870438285336376, "flos": 27965865876480.0, "grad_norm": 1.7660963300444175, "language_loss": 0.8114922, "learning_rate": 3.999757286178402e-06, "loss": 0.83534265, "num_input_tokens_seen": 6005850, "step": 290, "time_per_iteration": 2.571743965148926 }, { "auxiliary_loss_clip": 0.01332818, "auxiliary_loss_mlp": 0.01050877, "balance_loss_clip": 1.09096825, "balance_loss_mlp": 1.03334117, "epoch": 0.03499068117597547, "flos": 22017514832640.0, "grad_norm": 2.817466840867797, "language_loss": 0.90789264, "learning_rate": 3.999744999052299e-06, "loss": 0.93172961, "num_input_tokens_seen": 6027240, "step": 291, "time_per_iteration": 2.5321621894836426 }, { "auxiliary_loss_clip": 0.0127556, "auxiliary_loss_mlp": 0.01032921, "balance_loss_clip": 1.09456384, "balance_loss_mlp": 1.02276409, "epoch": 0.03511092406661456, "flos": 57242147725440.0, "grad_norm": 0.9579212115780174, "language_loss": 0.61122328, "learning_rate": 3.9997324085664675e-06, "loss": 0.6343081, "num_input_tokens_seen": 6087470, "step": 292, "time_per_iteration": 3.0541954040527344 }, { "auxiliary_loss_clip": 0.0132714, "auxiliary_loss_mlp": 0.01057525, "balance_loss_clip": 1.08639526, "balance_loss_mlp": 1.03988171, "epoch": 0.03523116695725365, "flos": 22928065626240.0, "grad_norm": 2.8206649418372063, "language_loss": 0.92031407, "learning_rate": 3.999719514722821e-06, "loss": 0.94416064, "num_input_tokens_seen": 6107600, "step": 293, "time_per_iteration": 2.5209949016571045 }, { "auxiliary_loss_clip": 0.01324131, "auxiliary_loss_mlp": 0.0105454, "balance_loss_clip": 1.08604956, "balance_loss_mlp": 1.0381012, "epoch": 0.03535140984789274, "flos": 36903241226880.0, "grad_norm": 2.9812749147255326, "language_loss": 0.74878967, "learning_rate": 3.999706317523314e-06, "loss": 0.77257627, "num_input_tokens_seen": 6126160, "step": 294, "time_per_iteration": 2.658825635910034 }, { "auxiliary_loss_clip": 0.01325559, "auxiliary_loss_mlp": 0.01051473, "balance_loss_clip": 1.08720243, "balance_loss_mlp": 1.03529596, "epoch": 0.03547165273853183, "flos": 20449152316800.0, "grad_norm": 2.0086204547311945, "language_loss": 0.85961372, "learning_rate": 3.999692816969948e-06, "loss": 0.88338405, "num_input_tokens_seen": 6145695, "step": 295, "time_per_iteration": 2.555886745452881 }, { "auxiliary_loss_clip": 0.01264517, "auxiliary_loss_mlp": 0.01014689, "balance_loss_clip": 1.08577347, "balance_loss_mlp": 1.00429428, "epoch": 0.03559189562917092, "flos": 69850564871040.0, "grad_norm": 0.9962205863226998, "language_loss": 0.69369656, "learning_rate": 3.999679013064772e-06, "loss": 0.71648866, "num_input_tokens_seen": 6212440, "step": 296, "time_per_iteration": 3.1484200954437256 }, { "auxiliary_loss_clip": 0.01328386, "auxiliary_loss_mlp": 0.01055972, "balance_loss_clip": 1.08866513, "balance_loss_mlp": 1.03918695, "epoch": 0.03571213851981002, "flos": 21651944163840.0, "grad_norm": 2.7196133241091807, "language_loss": 0.85931814, "learning_rate": 3.99966490580988e-06, "loss": 0.88316172, "num_input_tokens_seen": 6229800, "step": 297, "time_per_iteration": 2.5340359210968018 }, { "auxiliary_loss_clip": 0.01329534, "auxiliary_loss_mlp": 0.01061466, "balance_loss_clip": 1.0877459, "balance_loss_mlp": 1.04449058, "epoch": 0.03583238141044911, "flos": 43945610757120.0, "grad_norm": 2.2396501534292472, "language_loss": 0.65846479, "learning_rate": 3.999650495207411e-06, "loss": 0.68237484, "num_input_tokens_seen": 6255825, "step": 298, "time_per_iteration": 2.708862543106079 }, { "auxiliary_loss_clip": 0.01322169, "auxiliary_loss_mlp": 0.01058964, "balance_loss_clip": 1.08597827, "balance_loss_mlp": 1.0416193, "epoch": 0.0359526243010882, "flos": 18910810592640.0, "grad_norm": 3.2348747223661514, "language_loss": 0.90328115, "learning_rate": 3.999635781259553e-06, "loss": 0.92709249, "num_input_tokens_seen": 6271090, "step": 299, "time_per_iteration": 2.509911298751831 }, { "auxiliary_loss_clip": 0.0124728, "auxiliary_loss_mlp": 0.01009398, "balance_loss_clip": 1.07158875, "balance_loss_mlp": 0.99881184, "epoch": 0.03607286719172729, "flos": 61668892782720.0, "grad_norm": 0.9174435658753497, "language_loss": 0.5222618, "learning_rate": 3.999620763968535e-06, "loss": 0.54482859, "num_input_tokens_seen": 6329965, "step": 300, "time_per_iteration": 2.985281467437744 }, { "auxiliary_loss_clip": 0.01322019, "auxiliary_loss_mlp": 0.01054684, "balance_loss_clip": 1.08697486, "balance_loss_mlp": 1.0374819, "epoch": 0.03619311008236638, "flos": 27819062991360.0, "grad_norm": 1.6924198064025273, "language_loss": 0.86637765, "learning_rate": 3.999605443336638e-06, "loss": 0.89014471, "num_input_tokens_seen": 6352095, "step": 301, "time_per_iteration": 2.6531150341033936 }, { "auxiliary_loss_clip": 0.01328685, "auxiliary_loss_mlp": 0.01060367, "balance_loss_clip": 1.08937466, "balance_loss_mlp": 1.04327273, "epoch": 0.03631335297300547, "flos": 13621133197440.0, "grad_norm": 2.5111007791976414, "language_loss": 0.8970139, "learning_rate": 3.999589819366185e-06, "loss": 0.9209044, "num_input_tokens_seen": 6365885, "step": 302, "time_per_iteration": 2.500898599624634 }, { "auxiliary_loss_clip": 0.01328616, "auxiliary_loss_mlp": 0.01056838, "balance_loss_clip": 1.08870649, "balance_loss_mlp": 1.03904057, "epoch": 0.036433595863644565, "flos": 27631788456960.0, "grad_norm": 2.09520069352677, "language_loss": 0.84832335, "learning_rate": 3.999573892059547e-06, "loss": 0.87217796, "num_input_tokens_seen": 6385015, "step": 303, "time_per_iteration": 2.5832653045654297 }, { "auxiliary_loss_clip": 0.01331332, "auxiliary_loss_mlp": 0.0106247, "balance_loss_clip": 1.08946693, "balance_loss_mlp": 1.04401696, "epoch": 0.036553838754283655, "flos": 24572020314240.0, "grad_norm": 2.161686598713949, "language_loss": 0.81187534, "learning_rate": 3.999557661419138e-06, "loss": 0.8358134, "num_input_tokens_seen": 6405165, "step": 304, "time_per_iteration": 2.521466016769409 }, { "auxiliary_loss_clip": 0.01329276, "auxiliary_loss_mlp": 0.01057483, "balance_loss_clip": 1.09052372, "balance_loss_mlp": 1.04122305, "epoch": 0.036674081644922744, "flos": 23404313076480.0, "grad_norm": 1.9459934439852062, "language_loss": 0.81442571, "learning_rate": 3.9995411274474225e-06, "loss": 0.83829331, "num_input_tokens_seen": 6424445, "step": 305, "time_per_iteration": 2.5334625244140625 }, { "auxiliary_loss_clip": 0.01327331, "auxiliary_loss_mlp": 0.01065835, "balance_loss_clip": 1.08737397, "balance_loss_mlp": 1.047858, "epoch": 0.036794324535561834, "flos": 27489690253440.0, "grad_norm": 2.0954074933042506, "language_loss": 0.81599385, "learning_rate": 3.999524290146908e-06, "loss": 0.83992547, "num_input_tokens_seen": 6444650, "step": 306, "time_per_iteration": 2.6260616779327393 }, { "auxiliary_loss_clip": 0.01325214, "auxiliary_loss_mlp": 0.01066193, "balance_loss_clip": 1.08915496, "balance_loss_mlp": 1.04903829, "epoch": 0.036914567426200924, "flos": 19463476227840.0, "grad_norm": 2.635763558910293, "language_loss": 0.92800236, "learning_rate": 3.9995071495201485e-06, "loss": 0.95191634, "num_input_tokens_seen": 6461755, "step": 307, "time_per_iteration": 2.5212206840515137 }, { "auxiliary_loss_clip": 0.01325563, "auxiliary_loss_mlp": 0.01061206, "balance_loss_clip": 1.08881021, "balance_loss_mlp": 1.04326475, "epoch": 0.037034810316840014, "flos": 22309324922880.0, "grad_norm": 2.9360639537587425, "language_loss": 0.97812188, "learning_rate": 3.999489705569744e-06, "loss": 1.0019896, "num_input_tokens_seen": 6479455, "step": 308, "time_per_iteration": 2.5483994483947754 }, { "auxiliary_loss_clip": 0.0132289, "auxiliary_loss_mlp": 0.01058299, "balance_loss_clip": 1.08516765, "balance_loss_mlp": 1.04157436, "epoch": 0.03715505320747911, "flos": 18588333265920.0, "grad_norm": 2.6182183455842285, "language_loss": 0.86391532, "learning_rate": 3.999471958298341e-06, "loss": 0.88772726, "num_input_tokens_seen": 6498365, "step": 309, "time_per_iteration": 2.507850408554077 }, { "auxiliary_loss_clip": 0.01329665, "auxiliary_loss_mlp": 0.01067838, "balance_loss_clip": 1.09073079, "balance_loss_mlp": 1.04976583, "epoch": 0.0372752960981182, "flos": 35955343267200.0, "grad_norm": 2.0334745302381405, "language_loss": 0.76096392, "learning_rate": 3.999453907708631e-06, "loss": 0.78493893, "num_input_tokens_seen": 6520770, "step": 310, "time_per_iteration": 2.6196281909942627 }, { "auxiliary_loss_clip": 0.01326735, "auxiliary_loss_mlp": 0.01053964, "balance_loss_clip": 1.08909392, "balance_loss_mlp": 1.03751278, "epoch": 0.03739553898875729, "flos": 20814040627200.0, "grad_norm": 1.7142251968670252, "language_loss": 0.81299865, "learning_rate": 3.999435553803353e-06, "loss": 0.83680564, "num_input_tokens_seen": 6540170, "step": 311, "time_per_iteration": 2.539167642593384 }, { "auxiliary_loss_clip": 0.01323714, "auxiliary_loss_mlp": 0.01061315, "balance_loss_clip": 1.0877887, "balance_loss_mlp": 1.04450667, "epoch": 0.03751578187939638, "flos": 20264140339200.0, "grad_norm": 2.3178425497429402, "language_loss": 0.8306613, "learning_rate": 3.999416896585292e-06, "loss": 0.85451156, "num_input_tokens_seen": 6557200, "step": 312, "time_per_iteration": 2.4963958263397217 }, { "auxiliary_loss_clip": 0.01325557, "auxiliary_loss_mlp": 0.01055698, "balance_loss_clip": 1.08774781, "balance_loss_mlp": 1.0384481, "epoch": 0.03763602477003547, "flos": 20668063754880.0, "grad_norm": 7.972621094044825, "language_loss": 0.85312146, "learning_rate": 3.9993979360572775e-06, "loss": 0.87693405, "num_input_tokens_seen": 6577340, "step": 313, "time_per_iteration": 3.361898422241211 }, { "auxiliary_loss_clip": 0.01332534, "auxiliary_loss_mlp": 0.0106335, "balance_loss_clip": 1.09138894, "balance_loss_mlp": 1.04605293, "epoch": 0.03775626766067456, "flos": 16691352197760.0, "grad_norm": 2.9572263982059153, "language_loss": 0.8326993, "learning_rate": 3.999378672222185e-06, "loss": 0.8566581, "num_input_tokens_seen": 6595125, "step": 314, "time_per_iteration": 2.488062620162964 }, { "auxiliary_loss_clip": 0.01327943, "auxiliary_loss_mlp": 0.01056861, "balance_loss_clip": 1.09060633, "balance_loss_mlp": 1.03814507, "epoch": 0.03787651055131366, "flos": 21141797253120.0, "grad_norm": 2.348276936918884, "language_loss": 0.82739699, "learning_rate": 3.9993591050829385e-06, "loss": 0.85124505, "num_input_tokens_seen": 6612990, "step": 315, "time_per_iteration": 4.209430932998657 }, { "auxiliary_loss_clip": 0.01327291, "auxiliary_loss_mlp": 0.01065795, "balance_loss_clip": 1.09015453, "balance_loss_mlp": 1.04814053, "epoch": 0.037996753441952746, "flos": 22018089450240.0, "grad_norm": 2.3789878680614844, "language_loss": 0.79256058, "learning_rate": 3.999339234642506e-06, "loss": 0.81649148, "num_input_tokens_seen": 6632740, "step": 316, "time_per_iteration": 2.517179489135742 }, { "auxiliary_loss_clip": 0.01328657, "auxiliary_loss_mlp": 0.01050433, "balance_loss_clip": 1.09153104, "balance_loss_mlp": 1.03221774, "epoch": 0.038116996332591836, "flos": 27709391790720.0, "grad_norm": 3.3013038311176386, "language_loss": 0.83545929, "learning_rate": 3.9993190609038994e-06, "loss": 0.85925019, "num_input_tokens_seen": 6651505, "step": 317, "time_per_iteration": 3.337282419204712 }, { "auxiliary_loss_clip": 0.01319379, "auxiliary_loss_mlp": 0.01049781, "balance_loss_clip": 1.08588779, "balance_loss_mlp": 1.03243613, "epoch": 0.038237239223230926, "flos": 21178067011200.0, "grad_norm": 2.0830086482639034, "language_loss": 0.833009, "learning_rate": 3.999298583870182e-06, "loss": 0.85670054, "num_input_tokens_seen": 6671090, "step": 318, "time_per_iteration": 2.5150322914123535 }, { "auxiliary_loss_clip": 0.01322628, "auxiliary_loss_mlp": 0.01057801, "balance_loss_clip": 1.08708167, "balance_loss_mlp": 1.04056382, "epoch": 0.038357482113870016, "flos": 25556618995200.0, "grad_norm": 2.7204237437115717, "language_loss": 0.77655065, "learning_rate": 3.999277803544458e-06, "loss": 0.80035496, "num_input_tokens_seen": 6691245, "step": 319, "time_per_iteration": 2.5506980419158936 }, { "auxiliary_loss_clip": 0.01230239, "auxiliary_loss_mlp": 0.01019467, "balance_loss_clip": 1.0673337, "balance_loss_mlp": 1.01012063, "epoch": 0.038477725004509106, "flos": 59227578034560.0, "grad_norm": 0.9630927809641395, "language_loss": 0.6238597, "learning_rate": 3.999256719929882e-06, "loss": 0.64635676, "num_input_tokens_seen": 6752520, "step": 320, "time_per_iteration": 3.092270612716675 }, { "auxiliary_loss_clip": 0.01228995, "auxiliary_loss_mlp": 0.01011204, "balance_loss_clip": 1.06638336, "balance_loss_mlp": 1.00195336, "epoch": 0.0385979678951482, "flos": 67317676398720.0, "grad_norm": 1.2175475917935272, "language_loss": 0.67108715, "learning_rate": 3.999235333029651e-06, "loss": 0.69348913, "num_input_tokens_seen": 6806460, "step": 321, "time_per_iteration": 2.9851176738739014 }, { "auxiliary_loss_clip": 0.01322092, "auxiliary_loss_mlp": 0.01058274, "balance_loss_clip": 1.08937657, "balance_loss_mlp": 1.04152548, "epoch": 0.03871821078578729, "flos": 22746752749440.0, "grad_norm": 1.8569989273025194, "language_loss": 0.81926775, "learning_rate": 3.999213642847009e-06, "loss": 0.84307146, "num_input_tokens_seen": 6827045, "step": 322, "time_per_iteration": 2.5512428283691406 }, { "auxiliary_loss_clip": 0.0132256, "auxiliary_loss_mlp": 0.01057646, "balance_loss_clip": 1.08720958, "balance_loss_mlp": 1.04084945, "epoch": 0.03883845367642638, "flos": 26280613526400.0, "grad_norm": 1.7577695843165855, "language_loss": 0.91229081, "learning_rate": 3.999191649385247e-06, "loss": 0.93609285, "num_input_tokens_seen": 6848220, "step": 323, "time_per_iteration": 2.6603574752807617 }, { "auxiliary_loss_clip": 0.0122134, "auxiliary_loss_mlp": 0.01009497, "balance_loss_clip": 1.06197786, "balance_loss_mlp": 1.00077116, "epoch": 0.03895869656706547, "flos": 56962835568000.0, "grad_norm": 0.906988018940969, "language_loss": 0.59800816, "learning_rate": 3.999169352647702e-06, "loss": 0.62031662, "num_input_tokens_seen": 6909400, "step": 324, "time_per_iteration": 2.9916763305664062 }, { "auxiliary_loss_clip": 0.01323623, "auxiliary_loss_mlp": 0.01081837, "balance_loss_clip": 1.08758521, "balance_loss_mlp": 1.06322801, "epoch": 0.03907893945770456, "flos": 24863363527680.0, "grad_norm": 1.8507078007250286, "language_loss": 0.83109725, "learning_rate": 3.999146752637755e-06, "loss": 0.85515183, "num_input_tokens_seen": 6930445, "step": 325, "time_per_iteration": 2.549497604370117 }, { "auxiliary_loss_clip": 0.01322288, "auxiliary_loss_mlp": 0.01060797, "balance_loss_clip": 1.08705592, "balance_loss_mlp": 1.04311848, "epoch": 0.03919918234834365, "flos": 18368595815040.0, "grad_norm": 2.8756009460010863, "language_loss": 0.89516294, "learning_rate": 3.999123849358836e-06, "loss": 0.91899383, "num_input_tokens_seen": 6948110, "step": 326, "time_per_iteration": 2.487290382385254 }, { "auxiliary_loss_clip": 0.01322495, "auxiliary_loss_mlp": 0.01065955, "balance_loss_clip": 1.08741999, "balance_loss_mlp": 1.04758453, "epoch": 0.03931942523898275, "flos": 25225414663680.0, "grad_norm": 2.2656996093550297, "language_loss": 0.74592853, "learning_rate": 3.999100642814418e-06, "loss": 0.76981294, "num_input_tokens_seen": 6968550, "step": 327, "time_per_iteration": 2.5387938022613525 }, { "auxiliary_loss_clip": 0.01321406, "auxiliary_loss_mlp": 0.01061238, "balance_loss_clip": 1.08763158, "balance_loss_mlp": 1.04345202, "epoch": 0.03943966812962184, "flos": 23257905240960.0, "grad_norm": 2.3694148212369535, "language_loss": 0.88497484, "learning_rate": 3.999077133008022e-06, "loss": 0.90880126, "num_input_tokens_seen": 6987135, "step": 328, "time_per_iteration": 2.5143589973449707 }, { "auxiliary_loss_clip": 0.0132301, "auxiliary_loss_mlp": 0.01064153, "balance_loss_clip": 1.08795619, "balance_loss_mlp": 1.04462671, "epoch": 0.03955991102026093, "flos": 29168837291520.0, "grad_norm": 2.2706797785465076, "language_loss": 0.90551168, "learning_rate": 3.9990533199432145e-06, "loss": 0.92938328, "num_input_tokens_seen": 7008630, "step": 329, "time_per_iteration": 2.554593563079834 }, { "auxiliary_loss_clip": 0.01320879, "auxiliary_loss_mlp": 0.01057919, "balance_loss_clip": 1.08672369, "balance_loss_mlp": 1.04010868, "epoch": 0.03968015391090002, "flos": 17602441695360.0, "grad_norm": 2.5874890831835398, "language_loss": 0.7574122, "learning_rate": 3.999029203623608e-06, "loss": 0.78120023, "num_input_tokens_seen": 7026350, "step": 330, "time_per_iteration": 2.4613447189331055 }, { "auxiliary_loss_clip": 0.01317444, "auxiliary_loss_mlp": 0.01055333, "balance_loss_clip": 1.08607543, "balance_loss_mlp": 1.03749931, "epoch": 0.03980039680153911, "flos": 21799285752960.0, "grad_norm": 2.173477655795099, "language_loss": 0.86607158, "learning_rate": 3.99900478405286e-06, "loss": 0.88979924, "num_input_tokens_seen": 7045660, "step": 331, "time_per_iteration": 2.490781545639038 }, { "auxiliary_loss_clip": 0.01318404, "auxiliary_loss_mlp": 0.01061005, "balance_loss_clip": 1.08941865, "balance_loss_mlp": 1.04522133, "epoch": 0.0399206396921782, "flos": 15195134148480.0, "grad_norm": 2.5433866667122227, "language_loss": 0.82356119, "learning_rate": 3.998980061234676e-06, "loss": 0.84735525, "num_input_tokens_seen": 7063575, "step": 332, "time_per_iteration": 2.4944329261779785 }, { "auxiliary_loss_clip": 0.01325771, "auxiliary_loss_mlp": 0.01053483, "balance_loss_clip": 1.08843422, "balance_loss_mlp": 1.03524363, "epoch": 0.040040882582817294, "flos": 14422910630400.0, "grad_norm": 2.6891676369336652, "language_loss": 0.75439119, "learning_rate": 3.9989550351728055e-06, "loss": 0.77818376, "num_input_tokens_seen": 7080505, "step": 333, "time_per_iteration": 2.4763119220733643 }, { "auxiliary_loss_clip": 0.01319959, "auxiliary_loss_mlp": 0.01056229, "balance_loss_clip": 1.08849204, "balance_loss_mlp": 1.0393132, "epoch": 0.040161125473456384, "flos": 19280906375040.0, "grad_norm": 2.4471815199456306, "language_loss": 0.84606612, "learning_rate": 3.998929705871046e-06, "loss": 0.86982799, "num_input_tokens_seen": 7097860, "step": 334, "time_per_iteration": 2.4974420070648193 }, { "auxiliary_loss_clip": 0.01318843, "auxiliary_loss_mlp": 0.01057949, "balance_loss_clip": 1.08930421, "balance_loss_mlp": 1.04095018, "epoch": 0.040281368364095474, "flos": 17821101738240.0, "grad_norm": 2.692403906654916, "language_loss": 0.88927627, "learning_rate": 3.99890407333324e-06, "loss": 0.91304421, "num_input_tokens_seen": 7116390, "step": 335, "time_per_iteration": 2.4892184734344482 }, { "auxiliary_loss_clip": 0.01315432, "auxiliary_loss_mlp": 0.01056401, "balance_loss_clip": 1.08302426, "balance_loss_mlp": 1.03863895, "epoch": 0.040401611254734564, "flos": 19573757959680.0, "grad_norm": 1.9567657765697963, "language_loss": 0.87152272, "learning_rate": 3.998878137563275e-06, "loss": 0.89524102, "num_input_tokens_seen": 7135940, "step": 336, "time_per_iteration": 2.5106372833251953 }, { "auxiliary_loss_clip": 0.01317852, "auxiliary_loss_mlp": 0.0105336, "balance_loss_clip": 1.08562124, "balance_loss_mlp": 1.03564513, "epoch": 0.040521854145373654, "flos": 22054466949120.0, "grad_norm": 1.943193987780698, "language_loss": 0.85139, "learning_rate": 3.998851898565085e-06, "loss": 0.8751021, "num_input_tokens_seen": 7155745, "step": 337, "time_per_iteration": 2.5173726081848145 }, { "auxiliary_loss_clip": 0.01314726, "auxiliary_loss_mlp": 0.01048657, "balance_loss_clip": 1.08427358, "balance_loss_mlp": 1.03190768, "epoch": 0.04064209703601274, "flos": 22674644196480.0, "grad_norm": 2.1707250638736135, "language_loss": 0.82978356, "learning_rate": 3.998825356342653e-06, "loss": 0.85341734, "num_input_tokens_seen": 7175920, "step": 338, "time_per_iteration": 2.5226211547851562 }, { "auxiliary_loss_clip": 0.0131692, "auxiliary_loss_mlp": 0.01064318, "balance_loss_clip": 1.0839572, "balance_loss_mlp": 1.04700851, "epoch": 0.04076233992665183, "flos": 38582172783360.0, "grad_norm": 2.348032330359331, "language_loss": 0.73048615, "learning_rate": 3.998798510900003e-06, "loss": 0.75429857, "num_input_tokens_seen": 7198720, "step": 339, "time_per_iteration": 2.6424903869628906 }, { "auxiliary_loss_clip": 0.01317186, "auxiliary_loss_mlp": 0.01053957, "balance_loss_clip": 1.08507109, "balance_loss_mlp": 1.03671885, "epoch": 0.04088258281729093, "flos": 25885309374720.0, "grad_norm": 4.171316213342774, "language_loss": 0.83692461, "learning_rate": 3.998771362241207e-06, "loss": 0.86063612, "num_input_tokens_seen": 7219125, "step": 340, "time_per_iteration": 2.5377330780029297 }, { "auxiliary_loss_clip": 0.01311549, "auxiliary_loss_mlp": 0.01054975, "balance_loss_clip": 1.08336401, "balance_loss_mlp": 1.03810692, "epoch": 0.04100282570793002, "flos": 19789832223360.0, "grad_norm": 1.8624021442822696, "language_loss": 0.876616, "learning_rate": 3.998743910370385e-06, "loss": 0.90028119, "num_input_tokens_seen": 7237985, "step": 341, "time_per_iteration": 3.3244218826293945 }, { "auxiliary_loss_clip": 0.01322695, "auxiliary_loss_mlp": 0.01049569, "balance_loss_clip": 1.09355974, "balance_loss_mlp": 1.03086543, "epoch": 0.04112306859856911, "flos": 22565152563840.0, "grad_norm": 2.0866266673048233, "language_loss": 0.73385412, "learning_rate": 3.998716155291702e-06, "loss": 0.75757676, "num_input_tokens_seen": 7255825, "step": 342, "time_per_iteration": 3.295494556427002 }, { "auxiliary_loss_clip": 0.01316787, "auxiliary_loss_mlp": 0.0105937, "balance_loss_clip": 1.08823419, "balance_loss_mlp": 1.04179811, "epoch": 0.0412433114892082, "flos": 25040654081280.0, "grad_norm": 1.7813420202434957, "language_loss": 0.90508056, "learning_rate": 3.998688097009366e-06, "loss": 0.92884207, "num_input_tokens_seen": 7276590, "step": 343, "time_per_iteration": 3.392993688583374 }, { "auxiliary_loss_clip": 0.01317468, "auxiliary_loss_mlp": 0.01054606, "balance_loss_clip": 1.08683801, "balance_loss_mlp": 1.03829789, "epoch": 0.04136355437984729, "flos": 25191371548800.0, "grad_norm": 2.13551725510914, "language_loss": 0.80172688, "learning_rate": 3.998659735527636e-06, "loss": 0.82544762, "num_input_tokens_seen": 7295680, "step": 344, "time_per_iteration": 3.27579665184021 }, { "auxiliary_loss_clip": 0.01314617, "auxiliary_loss_mlp": 0.0105403, "balance_loss_clip": 1.08498883, "balance_loss_mlp": 1.03670919, "epoch": 0.04148379727048638, "flos": 22966777509120.0, "grad_norm": 1.8327557282777152, "language_loss": 0.77637661, "learning_rate": 3.998631070850813e-06, "loss": 0.80006313, "num_input_tokens_seen": 7316300, "step": 345, "time_per_iteration": 2.50992488861084 }, { "auxiliary_loss_clip": 0.01312904, "auxiliary_loss_mlp": 0.01064336, "balance_loss_clip": 1.08687079, "balance_loss_mlp": 1.04860044, "epoch": 0.041604040161125476, "flos": 14063481187200.0, "grad_norm": 2.960064737292621, "language_loss": 0.83638465, "learning_rate": 3.9986021029832455e-06, "loss": 0.86015701, "num_input_tokens_seen": 7333615, "step": 346, "time_per_iteration": 2.4711108207702637 }, { "auxiliary_loss_clip": 0.01312436, "auxiliary_loss_mlp": 0.01055771, "balance_loss_clip": 1.08320427, "balance_loss_mlp": 1.03686416, "epoch": 0.041724283051764566, "flos": 12091877614080.0, "grad_norm": 4.551994444278132, "language_loss": 0.91755617, "learning_rate": 3.9985728319293285e-06, "loss": 0.94123816, "num_input_tokens_seen": 7347590, "step": 347, "time_per_iteration": 2.4509387016296387 }, { "auxiliary_loss_clip": 0.01317267, "auxiliary_loss_mlp": 0.01054278, "balance_loss_clip": 1.08412385, "balance_loss_mlp": 1.03663468, "epoch": 0.041844525942403656, "flos": 12385303816320.0, "grad_norm": 2.3224384490317864, "language_loss": 0.8525244, "learning_rate": 3.998543257693501e-06, "loss": 0.8762399, "num_input_tokens_seen": 7364345, "step": 348, "time_per_iteration": 2.4601216316223145 }, { "auxiliary_loss_clip": 0.01314363, "auxiliary_loss_mlp": 0.01065657, "balance_loss_clip": 1.08635926, "balance_loss_mlp": 1.0492177, "epoch": 0.041964768833042745, "flos": 23769345041280.0, "grad_norm": 2.248021269233625, "language_loss": 0.87798762, "learning_rate": 3.998513380280251e-06, "loss": 0.90178782, "num_input_tokens_seen": 7384625, "step": 349, "time_per_iteration": 2.522496461868286 }, { "auxiliary_loss_clip": 0.01317368, "auxiliary_loss_mlp": 0.01069265, "balance_loss_clip": 1.08583426, "balance_loss_mlp": 1.05029845, "epoch": 0.042085011723681835, "flos": 11875336473600.0, "grad_norm": 2.8138389743529975, "language_loss": 0.95031047, "learning_rate": 3.99848319969411e-06, "loss": 0.97417688, "num_input_tokens_seen": 7402225, "step": 350, "time_per_iteration": 2.458815097808838 }, { "auxiliary_loss_clip": 0.01319429, "auxiliary_loss_mlp": 0.01063121, "balance_loss_clip": 1.08800197, "balance_loss_mlp": 1.04487002, "epoch": 0.042205254614320925, "flos": 16873957964160.0, "grad_norm": 2.282321927813428, "language_loss": 0.79577434, "learning_rate": 3.9984527159396564e-06, "loss": 0.81959981, "num_input_tokens_seen": 7420865, "step": 351, "time_per_iteration": 2.467747449874878 }, { "auxiliary_loss_clip": 0.01313021, "auxiliary_loss_mlp": 0.01055937, "balance_loss_clip": 1.08246279, "balance_loss_mlp": 1.03935456, "epoch": 0.04232549750496002, "flos": 25118508810240.0, "grad_norm": 2.2321062629986983, "language_loss": 0.84833091, "learning_rate": 3.9984219290215154e-06, "loss": 0.87202048, "num_input_tokens_seen": 7441040, "step": 352, "time_per_iteration": 2.5033609867095947 }, { "auxiliary_loss_clip": 0.01311988, "auxiliary_loss_mlp": 0.01048693, "balance_loss_clip": 1.08593845, "balance_loss_mlp": 1.03308797, "epoch": 0.04244574039559911, "flos": 26724541714560.0, "grad_norm": 3.782901108407086, "language_loss": 0.89161533, "learning_rate": 3.998390838944356e-06, "loss": 0.91522217, "num_input_tokens_seen": 7462545, "step": 353, "time_per_iteration": 2.5348618030548096 }, { "auxiliary_loss_clip": 0.01313427, "auxiliary_loss_mlp": 0.01060902, "balance_loss_clip": 1.08533001, "balance_loss_mlp": 1.04473722, "epoch": 0.0425659832862382, "flos": 20923244951040.0, "grad_norm": 2.1331455820739187, "language_loss": 0.90390348, "learning_rate": 3.998359445712895e-06, "loss": 0.92764676, "num_input_tokens_seen": 7481650, "step": 354, "time_per_iteration": 2.5024843215942383 }, { "auxiliary_loss_clip": 0.01310854, "auxiliary_loss_mlp": 0.01050327, "balance_loss_clip": 1.08206868, "balance_loss_mlp": 1.03448427, "epoch": 0.04268622617687729, "flos": 23331127115520.0, "grad_norm": 2.2933715559087076, "language_loss": 0.81207919, "learning_rate": 3.9983277493318955e-06, "loss": 0.83569098, "num_input_tokens_seen": 7500945, "step": 355, "time_per_iteration": 2.5418589115142822 }, { "auxiliary_loss_clip": 0.01314101, "auxiliary_loss_mlp": 0.01053424, "balance_loss_clip": 1.08197987, "balance_loss_mlp": 1.03690147, "epoch": 0.04280646906751638, "flos": 25994010908160.0, "grad_norm": 1.6714723407154795, "language_loss": 0.81337923, "learning_rate": 3.998295749806165e-06, "loss": 0.83705449, "num_input_tokens_seen": 7522170, "step": 356, "time_per_iteration": 2.523563861846924 }, { "auxiliary_loss_clip": 0.01314028, "auxiliary_loss_mlp": 0.01068793, "balance_loss_clip": 1.08721352, "balance_loss_mlp": 1.05179369, "epoch": 0.04292671195815547, "flos": 26906824258560.0, "grad_norm": 1.8169540640186919, "language_loss": 0.83275414, "learning_rate": 3.998263447140558e-06, "loss": 0.85658234, "num_input_tokens_seen": 7542370, "step": 357, "time_per_iteration": 2.538599729537964 }, { "auxiliary_loss_clip": 0.01309809, "auxiliary_loss_mlp": 0.0104728, "balance_loss_clip": 1.08093023, "balance_loss_mlp": 1.03097248, "epoch": 0.04304695484879457, "flos": 39457315745280.0, "grad_norm": 1.753371143794966, "language_loss": 0.81925076, "learning_rate": 3.998230841339976e-06, "loss": 0.84282172, "num_input_tokens_seen": 7564380, "step": 358, "time_per_iteration": 2.637113332748413 }, { "auxiliary_loss_clip": 0.01309956, "auxiliary_loss_mlp": 0.01051402, "balance_loss_clip": 1.08537078, "balance_loss_mlp": 1.03527308, "epoch": 0.04316719773943366, "flos": 19646297475840.0, "grad_norm": 2.2657939925781188, "language_loss": 0.84906995, "learning_rate": 3.998197932409363e-06, "loss": 0.87268353, "num_input_tokens_seen": 7582390, "step": 359, "time_per_iteration": 2.501049041748047 }, { "auxiliary_loss_clip": 0.01303976, "auxiliary_loss_mlp": 0.01058092, "balance_loss_clip": 1.08091342, "balance_loss_mlp": 1.04223669, "epoch": 0.04328744063007275, "flos": 22452320966400.0, "grad_norm": 2.193100691262511, "language_loss": 0.86434275, "learning_rate": 3.9981647203537125e-06, "loss": 0.88796341, "num_input_tokens_seen": 7599890, "step": 360, "time_per_iteration": 2.4809176921844482 }, { "auxiliary_loss_clip": 0.01307359, "auxiliary_loss_mlp": 0.01062598, "balance_loss_clip": 1.08094239, "balance_loss_mlp": 1.04727912, "epoch": 0.04340768352071184, "flos": 21283033530240.0, "grad_norm": 2.0729731843907726, "language_loss": 0.96005911, "learning_rate": 3.998131205178063e-06, "loss": 0.98375863, "num_input_tokens_seen": 7618360, "step": 361, "time_per_iteration": 2.465364694595337 }, { "auxiliary_loss_clip": 0.01307294, "auxiliary_loss_mlp": 0.01057581, "balance_loss_clip": 1.08154821, "balance_loss_mlp": 1.04179764, "epoch": 0.04352792641135093, "flos": 11583705951360.0, "grad_norm": 4.862272840517238, "language_loss": 0.76420879, "learning_rate": 3.998097386887498e-06, "loss": 0.78785759, "num_input_tokens_seen": 7635435, "step": 362, "time_per_iteration": 2.4743475914001465 }, { "auxiliary_loss_clip": 0.01304918, "auxiliary_loss_mlp": 0.01067394, "balance_loss_clip": 1.08116436, "balance_loss_mlp": 1.05105066, "epoch": 0.04364816930199002, "flos": 23623547736960.0, "grad_norm": 1.9716536780055414, "language_loss": 0.85103494, "learning_rate": 3.998063265487148e-06, "loss": 0.87475806, "num_input_tokens_seen": 7656485, "step": 363, "time_per_iteration": 2.5508499145507812 }, { "auxiliary_loss_clip": 0.01308042, "auxiliary_loss_mlp": 0.01056062, "balance_loss_clip": 1.08364844, "balance_loss_mlp": 1.04043388, "epoch": 0.043768412192629114, "flos": 14429734214400.0, "grad_norm": 2.209335022683545, "language_loss": 0.80973411, "learning_rate": 3.99802884098219e-06, "loss": 0.8333751, "num_input_tokens_seen": 7674595, "step": 364, "time_per_iteration": 2.4609665870666504 }, { "auxiliary_loss_clip": 0.01307313, "auxiliary_loss_mlp": 0.01047458, "balance_loss_clip": 1.08088434, "balance_loss_mlp": 1.03095937, "epoch": 0.043888655083268203, "flos": 26468893641600.0, "grad_norm": 2.1858082866538378, "language_loss": 0.82345247, "learning_rate": 3.997994113377845e-06, "loss": 0.84700024, "num_input_tokens_seen": 7693495, "step": 365, "time_per_iteration": 2.530149459838867 }, { "auxiliary_loss_clip": 0.01306787, "auxiliary_loss_mlp": 0.01048853, "balance_loss_clip": 1.08145404, "balance_loss_mlp": 1.03215146, "epoch": 0.04400889797390729, "flos": 27235263242880.0, "grad_norm": 2.3087204289123235, "language_loss": 0.83136582, "learning_rate": 3.9979590826793815e-06, "loss": 0.85492224, "num_input_tokens_seen": 7714685, "step": 366, "time_per_iteration": 2.5117695331573486 }, { "auxiliary_loss_clip": 0.01310481, "auxiliary_loss_mlp": 0.01053061, "balance_loss_clip": 1.08402312, "balance_loss_mlp": 1.03628802, "epoch": 0.04412914086454638, "flos": 20119528183680.0, "grad_norm": 2.37421501344759, "language_loss": 0.80873638, "learning_rate": 3.997923748892113e-06, "loss": 0.83237183, "num_input_tokens_seen": 7734005, "step": 367, "time_per_iteration": 3.252614974975586 }, { "auxiliary_loss_clip": 0.01305054, "auxiliary_loss_mlp": 0.01051596, "balance_loss_clip": 1.08323932, "balance_loss_mlp": 1.03655171, "epoch": 0.04424938375518547, "flos": 22604618632320.0, "grad_norm": 2.060065623354843, "language_loss": 0.88556302, "learning_rate": 3.9978881120214015e-06, "loss": 0.90912956, "num_input_tokens_seen": 7755525, "step": 368, "time_per_iteration": 2.515429973602295 }, { "auxiliary_loss_clip": 0.01307272, "auxiliary_loss_mlp": 0.01050379, "balance_loss_clip": 1.08134317, "balance_loss_mlp": 1.03366542, "epoch": 0.04436962664582456, "flos": 24132365844480.0, "grad_norm": 1.9910126897436693, "language_loss": 0.79411018, "learning_rate": 3.997852172072652e-06, "loss": 0.81768668, "num_input_tokens_seen": 7776740, "step": 369, "time_per_iteration": 2.5036027431488037 }, { "auxiliary_loss_clip": 0.0130762, "auxiliary_loss_mlp": 0.01061772, "balance_loss_clip": 1.08186865, "balance_loss_mlp": 1.04572678, "epoch": 0.04448986953646366, "flos": 18222906251520.0, "grad_norm": 4.881557466520376, "language_loss": 0.8943032, "learning_rate": 3.9978159290513155e-06, "loss": 0.91799712, "num_input_tokens_seen": 7794820, "step": 370, "time_per_iteration": 3.3193178176879883 }, { "auxiliary_loss_clip": 0.01307958, "auxiliary_loss_mlp": 0.01065044, "balance_loss_clip": 1.0822525, "balance_loss_mlp": 1.04829526, "epoch": 0.04461011242710275, "flos": 30117920400000.0, "grad_norm": 1.6630904649713403, "language_loss": 0.80331135, "learning_rate": 3.997779382962892e-06, "loss": 0.82704139, "num_input_tokens_seen": 7817705, "step": 371, "time_per_iteration": 2.547529458999634 }, { "auxiliary_loss_clip": 0.0130169, "auxiliary_loss_mlp": 0.01055319, "balance_loss_clip": 1.0793817, "balance_loss_mlp": 1.03841484, "epoch": 0.04473035531774184, "flos": 29752529299200.0, "grad_norm": 2.1486411377727923, "language_loss": 0.73549217, "learning_rate": 3.997742533812924e-06, "loss": 0.75906223, "num_input_tokens_seen": 7840970, "step": 372, "time_per_iteration": 3.419358730316162 }, { "auxiliary_loss_clip": 0.01307919, "auxiliary_loss_mlp": 0.01067117, "balance_loss_clip": 1.08419704, "balance_loss_mlp": 1.05107129, "epoch": 0.04485059820838093, "flos": 13151565676800.0, "grad_norm": 2.450927137963643, "language_loss": 0.92658639, "learning_rate": 3.997705381607001e-06, "loss": 0.95033681, "num_input_tokens_seen": 7857785, "step": 373, "time_per_iteration": 2.4556984901428223 }, { "auxiliary_loss_clip": 0.01213861, "auxiliary_loss_mlp": 0.01050382, "balance_loss_clip": 1.06126595, "balance_loss_mlp": 1.04227602, "epoch": 0.04497084109902002, "flos": 68094209548800.0, "grad_norm": 0.9790704418458376, "language_loss": 0.60295194, "learning_rate": 3.997667926350761e-06, "loss": 0.62559438, "num_input_tokens_seen": 7916115, "step": 374, "time_per_iteration": 2.9758031368255615 }, { "auxiliary_loss_clip": 0.01212173, "auxiliary_loss_mlp": 0.01044617, "balance_loss_clip": 1.06006777, "balance_loss_mlp": 1.036129, "epoch": 0.04509108398965911, "flos": 64342263346560.0, "grad_norm": 0.9080632105557515, "language_loss": 0.57768965, "learning_rate": 3.997630168049886e-06, "loss": 0.60025764, "num_input_tokens_seen": 7974480, "step": 375, "time_per_iteration": 3.054396629333496 }, { "auxiliary_loss_clip": 0.01308536, "auxiliary_loss_mlp": 0.01060521, "balance_loss_clip": 1.08260226, "balance_loss_mlp": 1.04392719, "epoch": 0.045211326880298205, "flos": 22271115830400.0, "grad_norm": 1.8107792463460122, "language_loss": 0.77470303, "learning_rate": 3.997592106710101e-06, "loss": 0.79839367, "num_input_tokens_seen": 7993940, "step": 376, "time_per_iteration": 2.5062403678894043 }, { "auxiliary_loss_clip": 0.01301868, "auxiliary_loss_mlp": 0.01051322, "balance_loss_clip": 1.08013248, "balance_loss_mlp": 1.03538346, "epoch": 0.045331569770937295, "flos": 32159441796480.0, "grad_norm": 2.8265780724162384, "language_loss": 0.65737206, "learning_rate": 3.997553742337182e-06, "loss": 0.68090397, "num_input_tokens_seen": 8013365, "step": 377, "time_per_iteration": 2.5672574043273926 }, { "auxiliary_loss_clip": 0.01304414, "auxiliary_loss_mlp": 0.01055704, "balance_loss_clip": 1.08102703, "balance_loss_mlp": 1.03939605, "epoch": 0.045451812661576385, "flos": 22163455791360.0, "grad_norm": 1.8365341766960068, "language_loss": 0.912081, "learning_rate": 3.997515074936949e-06, "loss": 0.93568218, "num_input_tokens_seen": 8034240, "step": 378, "time_per_iteration": 2.5110716819763184 }, { "auxiliary_loss_clip": 0.01304302, "auxiliary_loss_mlp": 0.01059454, "balance_loss_clip": 1.0807929, "balance_loss_mlp": 1.0435034, "epoch": 0.045572055552215475, "flos": 16581968305920.0, "grad_norm": 3.127611100388022, "language_loss": 0.86878395, "learning_rate": 3.997476104515268e-06, "loss": 0.89242154, "num_input_tokens_seen": 8052430, "step": 379, "time_per_iteration": 2.490489959716797 }, { "auxiliary_loss_clip": 0.01303696, "auxiliary_loss_mlp": 0.01058352, "balance_loss_clip": 1.08355606, "balance_loss_mlp": 1.04277122, "epoch": 0.045692298442854565, "flos": 17603375448960.0, "grad_norm": 3.5862771205039743, "language_loss": 0.77378345, "learning_rate": 3.9974368310780485e-06, "loss": 0.79740387, "num_input_tokens_seen": 8069605, "step": 380, "time_per_iteration": 2.4650542736053467 }, { "auxiliary_loss_clip": 0.01313976, "auxiliary_loss_mlp": 0.01059389, "balance_loss_clip": 1.0852859, "balance_loss_mlp": 1.04187727, "epoch": 0.045812541333493655, "flos": 26761098781440.0, "grad_norm": 2.969493865473316, "language_loss": 0.74292934, "learning_rate": 3.997397254631251e-06, "loss": 0.76666296, "num_input_tokens_seen": 8090225, "step": 381, "time_per_iteration": 2.5366361141204834 }, { "auxiliary_loss_clip": 0.01201911, "auxiliary_loss_mlp": 0.01041464, "balance_loss_clip": 1.05471504, "balance_loss_mlp": 1.03407311, "epoch": 0.04593278422413275, "flos": 60250349894400.0, "grad_norm": 0.826749808815751, "language_loss": 0.60024762, "learning_rate": 3.997357375180878e-06, "loss": 0.62268138, "num_input_tokens_seen": 8154505, "step": 382, "time_per_iteration": 3.1870150566101074 }, { "auxiliary_loss_clip": 0.01308093, "auxiliary_loss_mlp": 0.01050148, "balance_loss_clip": 1.08306384, "balance_loss_mlp": 1.03308892, "epoch": 0.04605302711477184, "flos": 21799249839360.0, "grad_norm": 1.9575818561244382, "language_loss": 0.75227451, "learning_rate": 3.997317192732979e-06, "loss": 0.77585691, "num_input_tokens_seen": 8173285, "step": 383, "time_per_iteration": 2.5041611194610596 }, { "auxiliary_loss_clip": 0.01307153, "auxiliary_loss_mlp": 0.01062596, "balance_loss_clip": 1.08269322, "balance_loss_mlp": 1.04579926, "epoch": 0.04617327000541093, "flos": 19459705299840.0, "grad_norm": 15.082697588225773, "language_loss": 0.82512033, "learning_rate": 3.99727670729365e-06, "loss": 0.84881777, "num_input_tokens_seen": 8191845, "step": 384, "time_per_iteration": 2.483057975769043 }, { "auxiliary_loss_clip": 0.01305237, "auxiliary_loss_mlp": 0.01059073, "balance_loss_clip": 1.08576119, "balance_loss_mlp": 1.0434922, "epoch": 0.04629351289605002, "flos": 25411468135680.0, "grad_norm": 1.8832234573273792, "language_loss": 0.77701378, "learning_rate": 3.997235918869033e-06, "loss": 0.80065686, "num_input_tokens_seen": 8212880, "step": 385, "time_per_iteration": 2.584770679473877 }, { "auxiliary_loss_clip": 0.01307316, "auxiliary_loss_mlp": 0.0104868, "balance_loss_clip": 1.08620346, "balance_loss_mlp": 1.03336191, "epoch": 0.04641375578668911, "flos": 20558284813440.0, "grad_norm": 2.1588079040337074, "language_loss": 0.82640243, "learning_rate": 3.997194827465315e-06, "loss": 0.84996235, "num_input_tokens_seen": 8231475, "step": 386, "time_per_iteration": 2.5023181438446045 }, { "auxiliary_loss_clip": 0.01305475, "auxiliary_loss_mlp": 0.01048771, "balance_loss_clip": 1.08197701, "balance_loss_mlp": 1.03318989, "epoch": 0.0465339986773282, "flos": 13188661447680.0, "grad_norm": 3.255443470714345, "language_loss": 0.91170299, "learning_rate": 3.997153433088728e-06, "loss": 0.93524545, "num_input_tokens_seen": 8248600, "step": 387, "time_per_iteration": 2.462891101837158 }, { "auxiliary_loss_clip": 0.01306422, "auxiliary_loss_mlp": 0.01054352, "balance_loss_clip": 1.08443904, "balance_loss_mlp": 1.03742421, "epoch": 0.0466542415679673, "flos": 25556547168000.0, "grad_norm": 2.0165489000311903, "language_loss": 0.8136313, "learning_rate": 3.997111735745554e-06, "loss": 0.83723903, "num_input_tokens_seen": 8271570, "step": 388, "time_per_iteration": 2.5356483459472656 }, { "auxiliary_loss_clip": 0.01302901, "auxiliary_loss_mlp": 0.01060489, "balance_loss_clip": 1.08219302, "balance_loss_mlp": 1.0427146, "epoch": 0.04677448445860639, "flos": 22236749493120.0, "grad_norm": 1.849563944640564, "language_loss": 0.82464463, "learning_rate": 3.997069735442118e-06, "loss": 0.84827852, "num_input_tokens_seen": 8291265, "step": 389, "time_per_iteration": 2.526449680328369 }, { "auxiliary_loss_clip": 0.0130254, "auxiliary_loss_mlp": 0.0105639, "balance_loss_clip": 1.08192551, "balance_loss_mlp": 1.04064286, "epoch": 0.04689472734924548, "flos": 28147825198080.0, "grad_norm": 1.4598805329766036, "language_loss": 0.80379522, "learning_rate": 3.997027432184792e-06, "loss": 0.82738447, "num_input_tokens_seen": 8315925, "step": 390, "time_per_iteration": 2.6069769859313965 }, { "auxiliary_loss_clip": 0.0130467, "auxiliary_loss_mlp": 0.010543, "balance_loss_clip": 1.08359206, "balance_loss_mlp": 1.03882658, "epoch": 0.04701497023988457, "flos": 23148952312320.0, "grad_norm": 2.013483234248443, "language_loss": 0.89315629, "learning_rate": 3.99698482597999e-06, "loss": 0.91674602, "num_input_tokens_seen": 8333605, "step": 391, "time_per_iteration": 2.4999606609344482 }, { "auxiliary_loss_clip": 0.01194121, "auxiliary_loss_mlp": 0.01014802, "balance_loss_clip": 1.05286396, "balance_loss_mlp": 1.00836456, "epoch": 0.04713521313052366, "flos": 64827668764800.0, "grad_norm": 0.8743215866590295, "language_loss": 0.63898301, "learning_rate": 3.99694191683418e-06, "loss": 0.66107219, "num_input_tokens_seen": 8394405, "step": 392, "time_per_iteration": 3.077955484390259 }, { "auxiliary_loss_clip": 0.0130794, "auxiliary_loss_mlp": 0.01055877, "balance_loss_clip": 1.08684731, "balance_loss_mlp": 1.03896117, "epoch": 0.047255456021162746, "flos": 18771585477120.0, "grad_norm": 1.9766125883336993, "language_loss": 0.81709909, "learning_rate": 3.996898704753867e-06, "loss": 0.84073722, "num_input_tokens_seen": 8412355, "step": 393, "time_per_iteration": 2.467668056488037 }, { "auxiliary_loss_clip": 0.01301119, "auxiliary_loss_mlp": 0.0105127, "balance_loss_clip": 1.0808208, "balance_loss_mlp": 1.03555799, "epoch": 0.04737569891180184, "flos": 22053820504320.0, "grad_norm": 2.1282780018454623, "language_loss": 0.87553906, "learning_rate": 3.996855189745609e-06, "loss": 0.89906299, "num_input_tokens_seen": 8431620, "step": 394, "time_per_iteration": 3.3214595317840576 }, { "auxiliary_loss_clip": 0.01301429, "auxiliary_loss_mlp": 0.01058369, "balance_loss_clip": 1.08062744, "balance_loss_mlp": 1.04173899, "epoch": 0.04749594180244093, "flos": 29057370410880.0, "grad_norm": 1.7742712768962188, "language_loss": 0.92587304, "learning_rate": 3.996811371816007e-06, "loss": 0.949471, "num_input_tokens_seen": 8454045, "step": 395, "time_per_iteration": 2.5340468883514404 }, { "auxiliary_loss_clip": 0.01304723, "auxiliary_loss_mlp": 0.01062, "balance_loss_clip": 1.08485949, "balance_loss_mlp": 1.04672945, "epoch": 0.04761618469308002, "flos": 35112268172160.0, "grad_norm": 1.9140049090450078, "language_loss": 0.78198045, "learning_rate": 3.996767250971707e-06, "loss": 0.80564767, "num_input_tokens_seen": 8476785, "step": 396, "time_per_iteration": 2.604710340499878 }, { "auxiliary_loss_clip": 0.01306915, "auxiliary_loss_mlp": 0.01052324, "balance_loss_clip": 1.08561552, "balance_loss_mlp": 1.03587341, "epoch": 0.04773642758371911, "flos": 25630702796160.0, "grad_norm": 1.8965995587752826, "language_loss": 0.86975163, "learning_rate": 3.996722827219403e-06, "loss": 0.89334404, "num_input_tokens_seen": 8498400, "step": 397, "time_per_iteration": 3.320405960083008 }, { "auxiliary_loss_clip": 0.01310238, "auxiliary_loss_mlp": 0.01061852, "balance_loss_clip": 1.08772373, "balance_loss_mlp": 1.0454371, "epoch": 0.0478566704743582, "flos": 20631506688000.0, "grad_norm": 2.7339659543636303, "language_loss": 0.82932961, "learning_rate": 3.996678100565833e-06, "loss": 0.85305053, "num_input_tokens_seen": 8517455, "step": 398, "time_per_iteration": 2.5041348934173584 }, { "auxiliary_loss_clip": 0.0129915, "auxiliary_loss_mlp": 0.01055837, "balance_loss_clip": 1.08027911, "balance_loss_mlp": 1.03838503, "epoch": 0.04797691336499729, "flos": 18835721210880.0, "grad_norm": 2.2288814645322828, "language_loss": 0.88556743, "learning_rate": 3.996633071017783e-06, "loss": 0.90911722, "num_input_tokens_seen": 8534085, "step": 399, "time_per_iteration": 4.04193902015686 }, { "auxiliary_loss_clip": 0.01300727, "auxiliary_loss_mlp": 0.01055607, "balance_loss_clip": 1.08265805, "balance_loss_mlp": 1.0391798, "epoch": 0.04809715625563638, "flos": 21099673578240.0, "grad_norm": 2.366517592869167, "language_loss": 0.81760651, "learning_rate": 3.996587738582084e-06, "loss": 0.84116983, "num_input_tokens_seen": 8550885, "step": 400, "time_per_iteration": 2.4908688068389893 }, { "auxiliary_loss_clip": 0.01298287, "auxiliary_loss_mlp": 0.01045888, "balance_loss_clip": 1.07882285, "balance_loss_mlp": 1.03060579, "epoch": 0.04821739914627548, "flos": 23805650712960.0, "grad_norm": 13.411460199123866, "language_loss": 0.86008286, "learning_rate": 3.9965421032656115e-06, "loss": 0.88352466, "num_input_tokens_seen": 8570815, "step": 401, "time_per_iteration": 2.5356838703155518 }, { "auxiliary_loss_clip": 0.0130131, "auxiliary_loss_mlp": 0.01051442, "balance_loss_clip": 1.08075023, "balance_loss_mlp": 1.03390598, "epoch": 0.04833764203691457, "flos": 22200587475840.0, "grad_norm": 2.7015538418090874, "language_loss": 0.94228053, "learning_rate": 3.99649616507529e-06, "loss": 0.96580803, "num_input_tokens_seen": 8589910, "step": 402, "time_per_iteration": 2.515148639678955 }, { "auxiliary_loss_clip": 0.01191461, "auxiliary_loss_mlp": 0.01014412, "balance_loss_clip": 1.05255461, "balance_loss_mlp": 1.00852287, "epoch": 0.04845788492755366, "flos": 65904376896000.0, "grad_norm": 0.8948925615385385, "language_loss": 0.63133913, "learning_rate": 3.996449924018088e-06, "loss": 0.65339786, "num_input_tokens_seen": 8650370, "step": 403, "time_per_iteration": 3.001668930053711 }, { "auxiliary_loss_clip": 0.01297493, "auxiliary_loss_mlp": 0.01055731, "balance_loss_clip": 1.08124352, "balance_loss_mlp": 1.04083014, "epoch": 0.04857812781819275, "flos": 19281301424640.0, "grad_norm": 3.3128949042104976, "language_loss": 0.79501402, "learning_rate": 3.99640338010102e-06, "loss": 0.81854618, "num_input_tokens_seen": 8669475, "step": 404, "time_per_iteration": 2.5077438354492188 }, { "auxiliary_loss_clip": 0.01296881, "auxiliary_loss_mlp": 0.01050455, "balance_loss_clip": 1.07876885, "balance_loss_mlp": 1.03388453, "epoch": 0.04869837070883184, "flos": 24062376193920.0, "grad_norm": 1.8264005464572708, "language_loss": 0.78732586, "learning_rate": 3.996356533331146e-06, "loss": 0.81079918, "num_input_tokens_seen": 8691345, "step": 405, "time_per_iteration": 2.5813164710998535 }, { "auxiliary_loss_clip": 0.01309015, "auxiliary_loss_mlp": 0.01047016, "balance_loss_clip": 1.0822556, "balance_loss_mlp": 1.03093433, "epoch": 0.04881861359947093, "flos": 25187169657600.0, "grad_norm": 2.299877226344788, "language_loss": 0.61913782, "learning_rate": 3.996309383715573e-06, "loss": 0.64269817, "num_input_tokens_seen": 8710125, "step": 406, "time_per_iteration": 2.5506973266601562 }, { "auxiliary_loss_clip": 0.01305136, "auxiliary_loss_mlp": 0.0104492, "balance_loss_clip": 1.08353484, "balance_loss_mlp": 1.02932703, "epoch": 0.048938856490110025, "flos": 16362913213440.0, "grad_norm": 4.542308600645671, "language_loss": 0.73766047, "learning_rate": 3.996261931261454e-06, "loss": 0.76116097, "num_input_tokens_seen": 8728705, "step": 407, "time_per_iteration": 2.4768905639648438 }, { "auxiliary_loss_clip": 0.01301929, "auxiliary_loss_mlp": 0.01050798, "balance_loss_clip": 1.08295596, "balance_loss_mlp": 1.03481197, "epoch": 0.049059099380749115, "flos": 29895094379520.0, "grad_norm": 1.69931226147769, "language_loss": 0.86397779, "learning_rate": 3.996214175975987e-06, "loss": 0.88750505, "num_input_tokens_seen": 8749225, "step": 408, "time_per_iteration": 2.541062831878662 }, { "auxiliary_loss_clip": 0.01305683, "auxiliary_loss_mlp": 0.01056994, "balance_loss_clip": 1.08446574, "balance_loss_mlp": 1.04096055, "epoch": 0.049179342271388204, "flos": 35918858027520.0, "grad_norm": 16.323599984219786, "language_loss": 0.79003918, "learning_rate": 3.996166117866417e-06, "loss": 0.81366599, "num_input_tokens_seen": 8771160, "step": 409, "time_per_iteration": 2.6045706272125244 }, { "auxiliary_loss_clip": 0.01296232, "auxiliary_loss_mlp": 0.01049345, "balance_loss_clip": 1.07948112, "balance_loss_mlp": 1.03385949, "epoch": 0.049299585162027294, "flos": 14611226659200.0, "grad_norm": 1.9343237973452194, "language_loss": 0.86551613, "learning_rate": 3.996117756940035e-06, "loss": 0.88897181, "num_input_tokens_seen": 8787845, "step": 410, "time_per_iteration": 2.472079277038574 }, { "auxiliary_loss_clip": 0.0130136, "auxiliary_loss_mlp": 0.01048057, "balance_loss_clip": 1.08308649, "balance_loss_mlp": 1.03274441, "epoch": 0.049419828052666384, "flos": 19567939956480.0, "grad_norm": 2.5200190517528114, "language_loss": 0.97680199, "learning_rate": 3.996069093204175e-06, "loss": 1.00029612, "num_input_tokens_seen": 8803805, "step": 411, "time_per_iteration": 2.5612008571624756 }, { "auxiliary_loss_clip": 0.01307525, "auxiliary_loss_mlp": 0.01053742, "balance_loss_clip": 1.08575988, "balance_loss_mlp": 1.0374223, "epoch": 0.049540070943305474, "flos": 13659916907520.0, "grad_norm": 2.5097680591367753, "language_loss": 0.88016677, "learning_rate": 3.996020126666221e-06, "loss": 0.90377945, "num_input_tokens_seen": 8820785, "step": 412, "time_per_iteration": 2.5755655765533447 }, { "auxiliary_loss_clip": 0.01299896, "auxiliary_loss_mlp": 0.01047871, "balance_loss_clip": 1.08220887, "balance_loss_mlp": 1.03295815, "epoch": 0.04966031383394457, "flos": 21832035978240.0, "grad_norm": 1.955007345608373, "language_loss": 0.81952274, "learning_rate": 3.995970857333601e-06, "loss": 0.84300047, "num_input_tokens_seen": 8841195, "step": 413, "time_per_iteration": 2.533642530441284 }, { "auxiliary_loss_clip": 0.01301584, "auxiliary_loss_mlp": 0.01051497, "balance_loss_clip": 1.08071661, "balance_loss_mlp": 1.03554714, "epoch": 0.04978055672458366, "flos": 28618793349120.0, "grad_norm": 1.8510990305408832, "language_loss": 0.79738045, "learning_rate": 3.995921285213789e-06, "loss": 0.82091129, "num_input_tokens_seen": 8861455, "step": 414, "time_per_iteration": 2.533730983734131 }, { "auxiliary_loss_clip": 0.01296805, "auxiliary_loss_mlp": 0.0104877, "balance_loss_clip": 1.08063483, "balance_loss_mlp": 1.034024, "epoch": 0.04990079961522275, "flos": 19828220883840.0, "grad_norm": 2.488698298117378, "language_loss": 0.80567664, "learning_rate": 3.995871410314305e-06, "loss": 0.82913238, "num_input_tokens_seen": 8880015, "step": 415, "time_per_iteration": 2.4749958515167236 }, { "auxiliary_loss_clip": 0.01173741, "auxiliary_loss_mlp": 0.01007018, "balance_loss_clip": 1.04972339, "balance_loss_mlp": 1.0009625, "epoch": 0.05002104250586184, "flos": 62735045293440.0, "grad_norm": 0.9054442068518652, "language_loss": 0.59635961, "learning_rate": 3.995821232642714e-06, "loss": 0.61816722, "num_input_tokens_seen": 8938420, "step": 416, "time_per_iteration": 3.158643960952759 }, { "auxiliary_loss_clip": 0.01281183, "auxiliary_loss_mlp": 0.01051766, "balance_loss_clip": 1.08201373, "balance_loss_mlp": 1.03673339, "epoch": 0.05014128539650093, "flos": 27928518710400.0, "grad_norm": 2.1831508310459284, "language_loss": 0.82651401, "learning_rate": 3.995770752206629e-06, "loss": 0.8498435, "num_input_tokens_seen": 8959495, "step": 417, "time_per_iteration": 2.61844539642334 }, { "auxiliary_loss_clip": 0.01300244, "auxiliary_loss_mlp": 0.01045056, "balance_loss_clip": 1.08212149, "balance_loss_mlp": 1.02893901, "epoch": 0.05026152828714002, "flos": 17705576620800.0, "grad_norm": 2.3843534208319035, "language_loss": 0.9731673, "learning_rate": 3.995719969013709e-06, "loss": 0.99662036, "num_input_tokens_seen": 8976675, "step": 418, "time_per_iteration": 2.510394811630249 }, { "auxiliary_loss_clip": 0.01264372, "auxiliary_loss_mlp": 0.010525, "balance_loss_clip": 1.07839465, "balance_loss_mlp": 1.0364666, "epoch": 0.05038177117777912, "flos": 19133277477120.0, "grad_norm": 3.010087020556891, "language_loss": 0.85372448, "learning_rate": 3.995668883071655e-06, "loss": 0.87689316, "num_input_tokens_seen": 8992900, "step": 419, "time_per_iteration": 2.5459470748901367 }, { "auxiliary_loss_clip": 0.01300471, "auxiliary_loss_mlp": 0.01051948, "balance_loss_clip": 1.08223248, "balance_loss_mlp": 1.03637969, "epoch": 0.050502014068418206, "flos": 20667704618880.0, "grad_norm": 2.3014059850181887, "language_loss": 0.9099651, "learning_rate": 3.995617494388219e-06, "loss": 0.93348932, "num_input_tokens_seen": 9011020, "step": 420, "time_per_iteration": 2.4808101654052734 }, { "auxiliary_loss_clip": 0.01261402, "auxiliary_loss_mlp": 0.01044376, "balance_loss_clip": 1.07446694, "balance_loss_mlp": 1.02864623, "epoch": 0.050622256959057296, "flos": 21361103740800.0, "grad_norm": 1.989831026765851, "language_loss": 0.80376565, "learning_rate": 3.995565802971196e-06, "loss": 0.82682347, "num_input_tokens_seen": 9030995, "step": 421, "time_per_iteration": 3.3927552700042725 }, { "auxiliary_loss_clip": 0.01258431, "auxiliary_loss_mlp": 0.01052933, "balance_loss_clip": 1.07435346, "balance_loss_mlp": 1.03863978, "epoch": 0.050742499849696386, "flos": 27673588909440.0, "grad_norm": 1.925790732415257, "language_loss": 0.67646933, "learning_rate": 3.995513808828427e-06, "loss": 0.69958293, "num_input_tokens_seen": 9053790, "step": 422, "time_per_iteration": 2.646677255630493 }, { "auxiliary_loss_clip": 0.01260708, "auxiliary_loss_mlp": 0.01048243, "balance_loss_clip": 1.07567978, "balance_loss_mlp": 1.03291273, "epoch": 0.050862742740335476, "flos": 19865999013120.0, "grad_norm": 1.8116825290558, "language_loss": 0.76393795, "learning_rate": 3.9954615119678e-06, "loss": 0.78702748, "num_input_tokens_seen": 9072345, "step": 423, "time_per_iteration": 2.561293363571167 }, { "auxiliary_loss_clip": 0.01269033, "auxiliary_loss_mlp": 0.01056699, "balance_loss_clip": 1.07558346, "balance_loss_mlp": 1.04059434, "epoch": 0.050982985630974566, "flos": 22085098272000.0, "grad_norm": 2.03521907066593, "language_loss": 0.80695105, "learning_rate": 3.995408912397248e-06, "loss": 0.83020842, "num_input_tokens_seen": 9090240, "step": 424, "time_per_iteration": 2.5259456634521484 }, { "auxiliary_loss_clip": 0.01266463, "auxiliary_loss_mlp": 0.01051782, "balance_loss_clip": 1.07880783, "balance_loss_mlp": 1.03574848, "epoch": 0.05110322852161366, "flos": 20740962407040.0, "grad_norm": 2.153202176186237, "language_loss": 0.93311572, "learning_rate": 3.99535601012475e-06, "loss": 0.95629811, "num_input_tokens_seen": 9105570, "step": 425, "time_per_iteration": 4.150558233261108 }, { "auxiliary_loss_clip": 0.01245075, "auxiliary_loss_mlp": 0.00766993, "balance_loss_clip": 1.07676184, "balance_loss_mlp": 1.00095129, "epoch": 0.05122347141225275, "flos": 28547295327360.0, "grad_norm": 1.7456692394336903, "language_loss": 0.75426227, "learning_rate": 3.995302805158333e-06, "loss": 0.77438295, "num_input_tokens_seen": 9128225, "step": 426, "time_per_iteration": 2.6686456203460693 }, { "auxiliary_loss_clip": 0.01255958, "auxiliary_loss_mlp": 0.01051416, "balance_loss_clip": 1.07547879, "balance_loss_mlp": 1.03403497, "epoch": 0.05134371430289184, "flos": 19722679747200.0, "grad_norm": 1.952348852847772, "language_loss": 0.83475518, "learning_rate": 3.9952492975060665e-06, "loss": 0.85782886, "num_input_tokens_seen": 9148295, "step": 427, "time_per_iteration": 3.3942737579345703 }, { "auxiliary_loss_clip": 0.01276961, "auxiliary_loss_mlp": 0.01041478, "balance_loss_clip": 1.07808042, "balance_loss_mlp": 1.02670765, "epoch": 0.05146395719353093, "flos": 34458945649920.0, "grad_norm": 2.593341049968763, "language_loss": 0.84843683, "learning_rate": 3.995195487176067e-06, "loss": 0.87162125, "num_input_tokens_seen": 9168525, "step": 428, "time_per_iteration": 2.6617722511291504 }, { "auxiliary_loss_clip": 0.01295977, "auxiliary_loss_mlp": 0.01050163, "balance_loss_clip": 1.07999504, "balance_loss_mlp": 1.03482068, "epoch": 0.05158420008417002, "flos": 21760286561280.0, "grad_norm": 1.9204342854519867, "language_loss": 0.85410869, "learning_rate": 3.995141374176499e-06, "loss": 0.87757009, "num_input_tokens_seen": 9186920, "step": 429, "time_per_iteration": 2.5337750911712646 }, { "auxiliary_loss_clip": 0.01143837, "auxiliary_loss_mlp": 0.00756563, "balance_loss_clip": 1.04659235, "balance_loss_mlp": 1.00036418, "epoch": 0.05170444297480911, "flos": 72553956226560.0, "grad_norm": 0.8703592539281128, "language_loss": 0.63139963, "learning_rate": 3.995086958515572e-06, "loss": 0.65040362, "num_input_tokens_seen": 9244940, "step": 430, "time_per_iteration": 3.196502208709717 }, { "auxiliary_loss_clip": 0.0118113, "auxiliary_loss_mlp": 0.007568, "balance_loss_clip": 1.04612827, "balance_loss_mlp": 1.00034904, "epoch": 0.05182468586544821, "flos": 62416159326720.0, "grad_norm": 0.8555098447788403, "language_loss": 0.59934545, "learning_rate": 3.995032240201538e-06, "loss": 0.61872476, "num_input_tokens_seen": 9307335, "step": 431, "time_per_iteration": 3.0434532165527344 }, { "auxiliary_loss_clip": 0.01153987, "auxiliary_loss_mlp": 0.01007454, "balance_loss_clip": 1.03965926, "balance_loss_mlp": 1.00194609, "epoch": 0.0519449287560873, "flos": 41225989432320.0, "grad_norm": 0.9454745863448065, "language_loss": 0.63146764, "learning_rate": 3.9949772192427e-06, "loss": 0.65308207, "num_input_tokens_seen": 9353960, "step": 432, "time_per_iteration": 2.7855050563812256 }, { "auxiliary_loss_clip": 0.01259463, "auxiliary_loss_mlp": 0.01047102, "balance_loss_clip": 1.07326114, "balance_loss_mlp": 1.03160453, "epoch": 0.05206517164672639, "flos": 17494530261120.0, "grad_norm": 1.966456443919025, "language_loss": 0.79672104, "learning_rate": 3.994921895647405e-06, "loss": 0.81978667, "num_input_tokens_seen": 9372130, "step": 433, "time_per_iteration": 2.5267980098724365 }, { "auxiliary_loss_clip": 0.01176159, "auxiliary_loss_mlp": 0.01007939, "balance_loss_clip": 1.0428772, "balance_loss_mlp": 1.00245512, "epoch": 0.05218541453736548, "flos": 64002762973440.0, "grad_norm": 0.8373059307137443, "language_loss": 0.55360562, "learning_rate": 3.994866269424043e-06, "loss": 0.57544661, "num_input_tokens_seen": 9428500, "step": 434, "time_per_iteration": 2.9598982334136963 }, { "auxiliary_loss_clip": 0.01201427, "auxiliary_loss_mlp": 0.01053205, "balance_loss_clip": 1.05995917, "balance_loss_mlp": 1.0375762, "epoch": 0.05230565742800457, "flos": 19317319787520.0, "grad_norm": 2.507600517771012, "language_loss": 0.78426898, "learning_rate": 3.9948103405810545e-06, "loss": 0.80681527, "num_input_tokens_seen": 9447450, "step": 435, "time_per_iteration": 2.6395328044891357 }, { "auxiliary_loss_clip": 0.01230608, "auxiliary_loss_mlp": 0.01053478, "balance_loss_clip": 1.06978583, "balance_loss_mlp": 1.03942299, "epoch": 0.05242590031864366, "flos": 25298636538240.0, "grad_norm": 4.149633701889057, "language_loss": 0.86005664, "learning_rate": 3.994754109126923e-06, "loss": 0.88289744, "num_input_tokens_seen": 9468945, "step": 436, "time_per_iteration": 2.645141839981079 }, { "auxiliary_loss_clip": 0.0120372, "auxiliary_loss_mlp": 0.01043157, "balance_loss_clip": 1.06939352, "balance_loss_mlp": 1.02864957, "epoch": 0.052546143209282754, "flos": 26211629456640.0, "grad_norm": 1.6682583084824458, "language_loss": 0.93285549, "learning_rate": 3.994697575070181e-06, "loss": 0.95532429, "num_input_tokens_seen": 9488405, "step": 437, "time_per_iteration": 2.8471429347991943 }, { "auxiliary_loss_clip": 0.01259901, "auxiliary_loss_mlp": 0.01054144, "balance_loss_clip": 1.07766271, "balance_loss_mlp": 1.03888524, "epoch": 0.052666386099921844, "flos": 22158140578560.0, "grad_norm": 1.7520408156355585, "language_loss": 0.91556644, "learning_rate": 3.994640738419402e-06, "loss": 0.93870687, "num_input_tokens_seen": 9507780, "step": 438, "time_per_iteration": 2.873599052429199 }, { "auxiliary_loss_clip": 0.01274003, "auxiliary_loss_mlp": 0.01043856, "balance_loss_clip": 1.07774472, "balance_loss_mlp": 1.02959228, "epoch": 0.052786628990560934, "flos": 23881817502720.0, "grad_norm": 2.007857097295353, "language_loss": 0.80960095, "learning_rate": 3.9945835991832075e-06, "loss": 0.83277953, "num_input_tokens_seen": 9529665, "step": 439, "time_per_iteration": 2.573204517364502 }, { "auxiliary_loss_clip": 0.01294579, "auxiliary_loss_mlp": 0.01057563, "balance_loss_clip": 1.08350658, "balance_loss_mlp": 1.04297209, "epoch": 0.052906871881200024, "flos": 24605021934720.0, "grad_norm": 2.7126775079446124, "language_loss": 0.92883289, "learning_rate": 3.994526157370268e-06, "loss": 0.95235425, "num_input_tokens_seen": 9548280, "step": 440, "time_per_iteration": 2.5361196994781494 }, { "auxiliary_loss_clip": 0.01149993, "auxiliary_loss_mlp": 0.01006307, "balance_loss_clip": 1.03666568, "balance_loss_mlp": 1.00089502, "epoch": 0.053027114771839114, "flos": 56461631143680.0, "grad_norm": 0.8956927470746533, "language_loss": 0.59263247, "learning_rate": 3.994468412989296e-06, "loss": 0.61419547, "num_input_tokens_seen": 9609690, "step": 441, "time_per_iteration": 3.219245672225952 }, { "auxiliary_loss_clip": 0.01233992, "auxiliary_loss_mlp": 0.01049743, "balance_loss_clip": 1.06908166, "balance_loss_mlp": 1.03502071, "epoch": 0.053147357662478203, "flos": 17311098481920.0, "grad_norm": 2.0762146149015854, "language_loss": 0.92690444, "learning_rate": 3.994410366049052e-06, "loss": 0.94974178, "num_input_tokens_seen": 9627550, "step": 442, "time_per_iteration": 2.536608934402466 }, { "auxiliary_loss_clip": 0.01272897, "auxiliary_loss_mlp": 0.0104361, "balance_loss_clip": 1.07576668, "balance_loss_mlp": 1.02873874, "epoch": 0.0532676005531173, "flos": 17164977955200.0, "grad_norm": 2.1637109291866286, "language_loss": 0.83134085, "learning_rate": 3.994352016558341e-06, "loss": 0.8545059, "num_input_tokens_seen": 9644855, "step": 443, "time_per_iteration": 2.4811205863952637 }, { "auxiliary_loss_clip": 0.01274636, "auxiliary_loss_mlp": 0.01051717, "balance_loss_clip": 1.07829463, "balance_loss_mlp": 1.03706574, "epoch": 0.05338784344375639, "flos": 27819960831360.0, "grad_norm": 1.9838096726617758, "language_loss": 0.73960245, "learning_rate": 3.994293364526014e-06, "loss": 0.76286602, "num_input_tokens_seen": 9665740, "step": 444, "time_per_iteration": 2.6801724433898926 }, { "auxiliary_loss_clip": 0.01249816, "auxiliary_loss_mlp": 0.01047337, "balance_loss_clip": 1.07561362, "balance_loss_mlp": 1.03107703, "epoch": 0.05350808633439548, "flos": 21507691144320.0, "grad_norm": 2.3305621249999358, "language_loss": 0.84785217, "learning_rate": 3.99423440996097e-06, "loss": 0.87082368, "num_input_tokens_seen": 9685280, "step": 445, "time_per_iteration": 2.5926566123962402 }, { "auxiliary_loss_clip": 0.01259055, "auxiliary_loss_mlp": 0.01051048, "balance_loss_clip": 1.07913065, "balance_loss_mlp": 1.03587234, "epoch": 0.05362832922503457, "flos": 20084299920000.0, "grad_norm": 6.29055922851008, "language_loss": 0.81651968, "learning_rate": 3.994175152872152e-06, "loss": 0.83962071, "num_input_tokens_seen": 9704365, "step": 446, "time_per_iteration": 2.569978713989258 }, { "auxiliary_loss_clip": 0.01275756, "auxiliary_loss_mlp": 0.01041242, "balance_loss_clip": 1.07585716, "balance_loss_mlp": 1.02719927, "epoch": 0.05374857211567366, "flos": 26137222433280.0, "grad_norm": 2.196225724100749, "language_loss": 0.79002571, "learning_rate": 3.994115593268548e-06, "loss": 0.81319571, "num_input_tokens_seen": 9724145, "step": 447, "time_per_iteration": 3.436283826828003 }, { "auxiliary_loss_clip": 0.01291874, "auxiliary_loss_mlp": 0.01053269, "balance_loss_clip": 1.07938385, "balance_loss_mlp": 1.03897595, "epoch": 0.05386881500631275, "flos": 27486817165440.0, "grad_norm": 1.9615877816766543, "language_loss": 0.82201803, "learning_rate": 3.994055731159195e-06, "loss": 0.84546947, "num_input_tokens_seen": 9741615, "step": 448, "time_per_iteration": 2.530015230178833 }, { "auxiliary_loss_clip": 0.01277408, "auxiliary_loss_mlp": 0.01060105, "balance_loss_clip": 1.08094954, "balance_loss_mlp": 1.0459249, "epoch": 0.053989057896951846, "flos": 23585087249280.0, "grad_norm": 1.8797501118212996, "language_loss": 0.8712545, "learning_rate": 3.993995566553172e-06, "loss": 0.8946296, "num_input_tokens_seen": 9760580, "step": 449, "time_per_iteration": 2.6010842323303223 }, { "auxiliary_loss_clip": 0.01235954, "auxiliary_loss_mlp": 0.01046144, "balance_loss_clip": 1.06595063, "balance_loss_mlp": 1.03150511, "epoch": 0.054109300787590936, "flos": 25228862369280.0, "grad_norm": 1.6976254695350672, "language_loss": 0.772946, "learning_rate": 3.993935099459607e-06, "loss": 0.79576695, "num_input_tokens_seen": 9782195, "step": 450, "time_per_iteration": 2.612295389175415 }, { "auxiliary_loss_clip": 0.01284598, "auxiliary_loss_mlp": 0.01048673, "balance_loss_clip": 1.07867634, "balance_loss_mlp": 1.03505337, "epoch": 0.054229543678230026, "flos": 23841525421440.0, "grad_norm": 2.1527376349446117, "language_loss": 0.74068034, "learning_rate": 3.993874329887673e-06, "loss": 0.76401305, "num_input_tokens_seen": 9800850, "step": 451, "time_per_iteration": 2.5610320568084717 }, { "auxiliary_loss_clip": 0.01275216, "auxiliary_loss_mlp": 0.01055472, "balance_loss_clip": 1.07793713, "balance_loss_mlp": 1.04037964, "epoch": 0.054349786568869116, "flos": 16320933192960.0, "grad_norm": 2.43952839340982, "language_loss": 0.86216784, "learning_rate": 3.993813257846589e-06, "loss": 0.88547456, "num_input_tokens_seen": 9817605, "step": 452, "time_per_iteration": 3.3321056365966797 }, { "auxiliary_loss_clip": 0.01273227, "auxiliary_loss_mlp": 0.01045703, "balance_loss_clip": 1.07825232, "balance_loss_mlp": 1.03110576, "epoch": 0.054470029459508205, "flos": 18660729127680.0, "grad_norm": 2.4540345738589604, "language_loss": 0.92340803, "learning_rate": 3.993751883345619e-06, "loss": 0.94659734, "num_input_tokens_seen": 9835965, "step": 453, "time_per_iteration": 3.3967556953430176 }, { "auxiliary_loss_clip": 0.01253371, "auxiliary_loss_mlp": 0.0105064, "balance_loss_clip": 1.0768224, "balance_loss_mlp": 1.03570938, "epoch": 0.054590272350147295, "flos": 17785298856960.0, "grad_norm": 2.4492687650873255, "language_loss": 0.87209421, "learning_rate": 3.993690206394073e-06, "loss": 0.89513433, "num_input_tokens_seen": 9852265, "step": 454, "time_per_iteration": 3.352182626724243 }, { "auxiliary_loss_clip": 0.01260155, "auxiliary_loss_mlp": 0.01048789, "balance_loss_clip": 1.07582223, "balance_loss_mlp": 1.03435254, "epoch": 0.054710515240786385, "flos": 17785945301760.0, "grad_norm": 2.0167613805190956, "language_loss": 0.87748712, "learning_rate": 3.993628227001307e-06, "loss": 0.90057659, "num_input_tokens_seen": 9870465, "step": 455, "time_per_iteration": 2.5466055870056152 }, { "auxiliary_loss_clip": 0.01255651, "auxiliary_loss_mlp": 0.01054282, "balance_loss_clip": 1.07518387, "balance_loss_mlp": 1.04006076, "epoch": 0.05483075813142548, "flos": 48210900180480.0, "grad_norm": 1.8777234743699365, "language_loss": 0.71359921, "learning_rate": 3.993565945176726e-06, "loss": 0.73669851, "num_input_tokens_seen": 9891490, "step": 456, "time_per_iteration": 2.798192262649536 }, { "auxiliary_loss_clip": 0.01247937, "auxiliary_loss_mlp": 0.01047025, "balance_loss_clip": 1.0742749, "balance_loss_mlp": 1.03255939, "epoch": 0.05495100102206457, "flos": 19682244011520.0, "grad_norm": 1.958825381402083, "language_loss": 0.84441829, "learning_rate": 3.993503360929776e-06, "loss": 0.86736798, "num_input_tokens_seen": 9910375, "step": 457, "time_per_iteration": 2.5571701526641846 }, { "auxiliary_loss_clip": 0.01186518, "auxiliary_loss_mlp": 0.01048125, "balance_loss_clip": 1.06417263, "balance_loss_mlp": 1.03300881, "epoch": 0.05507124391270366, "flos": 26360048453760.0, "grad_norm": 1.7567421591039132, "language_loss": 0.81142128, "learning_rate": 3.99344047426995e-06, "loss": 0.83376771, "num_input_tokens_seen": 9931635, "step": 458, "time_per_iteration": 2.8402340412139893 }, { "auxiliary_loss_clip": 0.01225332, "auxiliary_loss_mlp": 0.01049963, "balance_loss_clip": 1.06929886, "balance_loss_mlp": 1.03465652, "epoch": 0.05519148680334275, "flos": 22601314581120.0, "grad_norm": 2.104397228739993, "language_loss": 0.93625522, "learning_rate": 3.993377285206789e-06, "loss": 0.95900822, "num_input_tokens_seen": 9951420, "step": 459, "time_per_iteration": 2.9285833835601807 }, { "auxiliary_loss_clip": 0.01216721, "auxiliary_loss_mlp": 0.01056765, "balance_loss_clip": 1.06795645, "balance_loss_mlp": 1.04189968, "epoch": 0.05531172969398184, "flos": 40552519380480.0, "grad_norm": 1.7810400891534426, "language_loss": 0.86242628, "learning_rate": 3.99331379374988e-06, "loss": 0.8851611, "num_input_tokens_seen": 9975025, "step": 460, "time_per_iteration": 2.7733118534088135 }, { "auxiliary_loss_clip": 0.01260238, "auxiliary_loss_mlp": 0.01044803, "balance_loss_clip": 1.07015657, "balance_loss_mlp": 1.03131461, "epoch": 0.05543197258462093, "flos": 23477894087040.0, "grad_norm": 2.0558160232231293, "language_loss": 0.80142117, "learning_rate": 3.993249999908852e-06, "loss": 0.82447153, "num_input_tokens_seen": 9995175, "step": 461, "time_per_iteration": 2.61435604095459 }, { "auxiliary_loss_clip": 0.01287158, "auxiliary_loss_mlp": 0.01047705, "balance_loss_clip": 1.07595766, "balance_loss_mlp": 1.03390098, "epoch": 0.05555221547526003, "flos": 18624603024000.0, "grad_norm": 2.0611482450465903, "language_loss": 0.87428278, "learning_rate": 3.993185903693384e-06, "loss": 0.89763141, "num_input_tokens_seen": 10011975, "step": 462, "time_per_iteration": 2.4678704738616943 }, { "auxiliary_loss_clip": 0.0125292, "auxiliary_loss_mlp": 0.01039594, "balance_loss_clip": 1.07375169, "balance_loss_mlp": 1.02593231, "epoch": 0.05567245836589912, "flos": 23587098410880.0, "grad_norm": 2.2576931871790533, "language_loss": 0.82325077, "learning_rate": 3.9931215051131995e-06, "loss": 0.84617591, "num_input_tokens_seen": 10032620, "step": 463, "time_per_iteration": 2.614413022994995 }, { "auxiliary_loss_clip": 0.01256273, "auxiliary_loss_mlp": 0.01046013, "balance_loss_clip": 1.06942487, "balance_loss_mlp": 1.03173161, "epoch": 0.05579270125653821, "flos": 27746667129600.0, "grad_norm": 1.5333354564677812, "language_loss": 0.80077308, "learning_rate": 3.993056804178068e-06, "loss": 0.82379591, "num_input_tokens_seen": 10054165, "step": 464, "time_per_iteration": 2.625532865524292 }, { "auxiliary_loss_clip": 0.01215156, "auxiliary_loss_mlp": 0.01045022, "balance_loss_clip": 1.06792319, "balance_loss_mlp": 1.03006136, "epoch": 0.0559129441471773, "flos": 27014161075200.0, "grad_norm": 2.051909912610453, "language_loss": 0.84421498, "learning_rate": 3.992991800897803e-06, "loss": 0.86681676, "num_input_tokens_seen": 10073970, "step": 465, "time_per_iteration": 2.793992757797241 }, { "auxiliary_loss_clip": 0.01286037, "auxiliary_loss_mlp": 0.01047208, "balance_loss_clip": 1.07677484, "balance_loss_mlp": 1.03202069, "epoch": 0.05603318703781639, "flos": 15229787794560.0, "grad_norm": 2.3041596877673123, "language_loss": 0.89988774, "learning_rate": 3.9929264952822665e-06, "loss": 0.92322022, "num_input_tokens_seen": 10091505, "step": 466, "time_per_iteration": 2.5002784729003906 }, { "auxiliary_loss_clip": 0.0127289, "auxiliary_loss_mlp": 0.01050952, "balance_loss_clip": 1.07353401, "balance_loss_mlp": 1.03664696, "epoch": 0.05615342992845548, "flos": 22266482976000.0, "grad_norm": 2.3220236947354196, "language_loss": 0.88353026, "learning_rate": 3.992860887341366e-06, "loss": 0.90676868, "num_input_tokens_seen": 10109675, "step": 467, "time_per_iteration": 2.573352575302124 }, { "auxiliary_loss_clip": 0.01225384, "auxiliary_loss_mlp": 0.01042764, "balance_loss_clip": 1.06847656, "balance_loss_mlp": 1.02748108, "epoch": 0.056273672819094574, "flos": 23584979508480.0, "grad_norm": 2.2303254393803043, "language_loss": 0.81188333, "learning_rate": 3.992794977085052e-06, "loss": 0.83456481, "num_input_tokens_seen": 10127675, "step": 468, "time_per_iteration": 2.624342679977417 }, { "auxiliary_loss_clip": 0.01241007, "auxiliary_loss_mlp": 0.01053444, "balance_loss_clip": 1.0728451, "balance_loss_mlp": 1.03932929, "epoch": 0.056393915709733664, "flos": 19858708552320.0, "grad_norm": 2.1551598959605265, "language_loss": 0.84982693, "learning_rate": 3.992728764523326e-06, "loss": 0.87277138, "num_input_tokens_seen": 10146620, "step": 469, "time_per_iteration": 2.6475706100463867 }, { "auxiliary_loss_clip": 0.01254025, "auxiliary_loss_mlp": 0.01047833, "balance_loss_clip": 1.07240641, "balance_loss_mlp": 1.03345656, "epoch": 0.05651415860037275, "flos": 22163779013760.0, "grad_norm": 2.1054534597155476, "language_loss": 0.80992544, "learning_rate": 3.99266224966623e-06, "loss": 0.8329441, "num_input_tokens_seen": 10167535, "step": 470, "time_per_iteration": 2.619994640350342 }, { "auxiliary_loss_clip": 0.01244018, "auxiliary_loss_mlp": 0.01046113, "balance_loss_clip": 1.07288432, "balance_loss_mlp": 1.03129482, "epoch": 0.05663440149101184, "flos": 19463548055040.0, "grad_norm": 1.9288716141113582, "language_loss": 0.87809944, "learning_rate": 3.992595432523855e-06, "loss": 0.90100068, "num_input_tokens_seen": 10184825, "step": 471, "time_per_iteration": 2.569845676422119 }, { "auxiliary_loss_clip": 0.01227736, "auxiliary_loss_mlp": 0.01050565, "balance_loss_clip": 1.06888604, "balance_loss_mlp": 1.03649819, "epoch": 0.05675464438165093, "flos": 22670226823680.0, "grad_norm": 1.9535716613615486, "language_loss": 0.86224347, "learning_rate": 3.992528313106338e-06, "loss": 0.88502645, "num_input_tokens_seen": 10203025, "step": 472, "time_per_iteration": 2.633270502090454 }, { "auxiliary_loss_clip": 0.0128748, "auxiliary_loss_mlp": 0.00766872, "balance_loss_clip": 1.08005643, "balance_loss_mlp": 1.00075042, "epoch": 0.05687488727229002, "flos": 16901177495040.0, "grad_norm": 2.305118764708682, "language_loss": 0.81970888, "learning_rate": 3.9924608914238595e-06, "loss": 0.8402524, "num_input_tokens_seen": 10218020, "step": 473, "time_per_iteration": 2.520453691482544 }, { "auxiliary_loss_clip": 0.01270703, "auxiliary_loss_mlp": 0.01049679, "balance_loss_clip": 1.07682896, "balance_loss_mlp": 1.03543305, "epoch": 0.05699513016292912, "flos": 29168980945920.0, "grad_norm": 2.2892678541469316, "language_loss": 0.84009999, "learning_rate": 3.992393167486648e-06, "loss": 0.86330378, "num_input_tokens_seen": 10237170, "step": 474, "time_per_iteration": 3.406655788421631 }, { "auxiliary_loss_clip": 0.01289301, "auxiliary_loss_mlp": 0.01055027, "balance_loss_clip": 1.07881534, "balance_loss_mlp": 1.03964925, "epoch": 0.05711537305356821, "flos": 18916197632640.0, "grad_norm": 2.5701574061421413, "language_loss": 0.80658674, "learning_rate": 3.992325141304977e-06, "loss": 0.83002996, "num_input_tokens_seen": 10255125, "step": 475, "time_per_iteration": 2.5102367401123047 }, { "auxiliary_loss_clip": 0.01224152, "auxiliary_loss_mlp": 0.01048825, "balance_loss_clip": 1.06943631, "balance_loss_mlp": 1.03480554, "epoch": 0.0572356159442073, "flos": 26758979879040.0, "grad_norm": 2.231398342184023, "language_loss": 0.86522353, "learning_rate": 3.992256812889166e-06, "loss": 0.88795334, "num_input_tokens_seen": 10271230, "step": 476, "time_per_iteration": 2.6430747509002686 }, { "auxiliary_loss_clip": 0.01287087, "auxiliary_loss_mlp": 0.01047905, "balance_loss_clip": 1.07895398, "balance_loss_mlp": 1.03381395, "epoch": 0.05735585883484639, "flos": 35116146840960.0, "grad_norm": 4.912103906002928, "language_loss": 0.76712555, "learning_rate": 3.992188182249582e-06, "loss": 0.79047549, "num_input_tokens_seen": 10293125, "step": 477, "time_per_iteration": 2.6121628284454346 }, { "auxiliary_loss_clip": 0.01252452, "auxiliary_loss_mlp": 0.01055691, "balance_loss_clip": 1.0760603, "balance_loss_mlp": 1.04084325, "epoch": 0.05747610172548548, "flos": 18734381965440.0, "grad_norm": 7.740941948675379, "language_loss": 0.90576172, "learning_rate": 3.992119249396633e-06, "loss": 0.92884308, "num_input_tokens_seen": 10311810, "step": 478, "time_per_iteration": 2.526930332183838 }, { "auxiliary_loss_clip": 0.01243971, "auxiliary_loss_mlp": 0.00766679, "balance_loss_clip": 1.06933284, "balance_loss_mlp": 1.000741, "epoch": 0.05759634461612457, "flos": 27964752554880.0, "grad_norm": 1.9002993415101694, "language_loss": 0.82179916, "learning_rate": 3.992050014340778e-06, "loss": 0.84190571, "num_input_tokens_seen": 10332165, "step": 479, "time_per_iteration": 3.464747190475464 }, { "auxiliary_loss_clip": 0.01147847, "auxiliary_loss_mlp": 0.01009889, "balance_loss_clip": 1.03669572, "balance_loss_mlp": 1.00461996, "epoch": 0.057716587506763666, "flos": 69292009405440.0, "grad_norm": 0.8640049590043475, "language_loss": 0.55069149, "learning_rate": 3.99198047709252e-06, "loss": 0.57226884, "num_input_tokens_seen": 10393685, "step": 480, "time_per_iteration": 3.161722183227539 }, { "auxiliary_loss_clip": 0.01232759, "auxiliary_loss_mlp": 0.0105072, "balance_loss_clip": 1.06522787, "balance_loss_mlp": 1.03571713, "epoch": 0.057836830397402755, "flos": 25009196745600.0, "grad_norm": 1.8616834802833198, "language_loss": 0.78794473, "learning_rate": 3.991910637662408e-06, "loss": 0.81077951, "num_input_tokens_seen": 10413975, "step": 481, "time_per_iteration": 3.4052248001098633 }, { "auxiliary_loss_clip": 0.01285052, "auxiliary_loss_mlp": 0.01040041, "balance_loss_clip": 1.07858062, "balance_loss_mlp": 1.02562273, "epoch": 0.057957073288041845, "flos": 25593894334080.0, "grad_norm": 1.8138002061911072, "language_loss": 0.80608875, "learning_rate": 3.9918404960610355e-06, "loss": 0.82933968, "num_input_tokens_seen": 10433005, "step": 482, "time_per_iteration": 2.5188021659851074 }, { "auxiliary_loss_clip": 0.01277709, "auxiliary_loss_mlp": 0.01051611, "balance_loss_clip": 1.07918572, "balance_loss_mlp": 1.03740668, "epoch": 0.058077316178680935, "flos": 20777411733120.0, "grad_norm": 2.261458503024001, "language_loss": 0.77691972, "learning_rate": 3.991770052299043e-06, "loss": 0.80021298, "num_input_tokens_seen": 10451235, "step": 483, "time_per_iteration": 2.569852828979492 }, { "auxiliary_loss_clip": 0.01250886, "auxiliary_loss_mlp": 0.010403, "balance_loss_clip": 1.07015252, "balance_loss_mlp": 1.0271039, "epoch": 0.058197559069320025, "flos": 18916484941440.0, "grad_norm": 2.1821964473996855, "language_loss": 0.87601757, "learning_rate": 3.991699306387118e-06, "loss": 0.89892948, "num_input_tokens_seen": 10469705, "step": 484, "time_per_iteration": 2.537954330444336 }, { "auxiliary_loss_clip": 0.01269438, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.07506013, "balance_loss_mlp": 1.03837669, "epoch": 0.058317801959959115, "flos": 24863327614080.0, "grad_norm": 1.8435456857959243, "language_loss": 0.78118783, "learning_rate": 3.991628258335991e-06, "loss": 0.8044073, "num_input_tokens_seen": 10491910, "step": 485, "time_per_iteration": 2.5575599670410156 }, { "auxiliary_loss_clip": 0.01228738, "auxiliary_loss_mlp": 0.01045503, "balance_loss_clip": 1.06771517, "balance_loss_mlp": 1.03119779, "epoch": 0.05843804485059821, "flos": 23257977068160.0, "grad_norm": 3.9403707254477287, "language_loss": 0.8729986, "learning_rate": 3.991556908156442e-06, "loss": 0.89574105, "num_input_tokens_seen": 10508435, "step": 486, "time_per_iteration": 2.5650887489318848 }, { "auxiliary_loss_clip": 0.01256629, "auxiliary_loss_mlp": 0.01053806, "balance_loss_clip": 1.07344568, "balance_loss_mlp": 1.03997207, "epoch": 0.0585582877412373, "flos": 23150532510720.0, "grad_norm": 2.5836211887872502, "language_loss": 0.87835133, "learning_rate": 3.9914852558592914e-06, "loss": 0.90145564, "num_input_tokens_seen": 10529485, "step": 487, "time_per_iteration": 2.5938074588775635 }, { "auxiliary_loss_clip": 0.01269577, "auxiliary_loss_mlp": 0.01045475, "balance_loss_clip": 1.0771538, "balance_loss_mlp": 1.03089583, "epoch": 0.05867853063187639, "flos": 23506406507520.0, "grad_norm": 3.245791908158948, "language_loss": 0.80608124, "learning_rate": 3.991413301455413e-06, "loss": 0.82923174, "num_input_tokens_seen": 10545935, "step": 488, "time_per_iteration": 2.5468804836273193 }, { "auxiliary_loss_clip": 0.01237281, "auxiliary_loss_mlp": 0.01043722, "balance_loss_clip": 1.06981635, "balance_loss_mlp": 1.0305438, "epoch": 0.05879877352251548, "flos": 29495803818240.0, "grad_norm": 2.8241493027910707, "language_loss": 0.78130937, "learning_rate": 3.991341044955719e-06, "loss": 0.80411935, "num_input_tokens_seen": 10565690, "step": 489, "time_per_iteration": 2.7312850952148438 }, { "auxiliary_loss_clip": 0.01265329, "auxiliary_loss_mlp": 0.00767242, "balance_loss_clip": 1.0730257, "balance_loss_mlp": 1.00077808, "epoch": 0.05891901641315457, "flos": 20157485880960.0, "grad_norm": 1.960062980502636, "language_loss": 0.81499922, "learning_rate": 3.991268486371172e-06, "loss": 0.83532488, "num_input_tokens_seen": 10584245, "step": 490, "time_per_iteration": 2.597562074661255 }, { "auxiliary_loss_clip": 0.01251835, "auxiliary_loss_mlp": 0.01052381, "balance_loss_clip": 1.07094288, "balance_loss_mlp": 1.03585863, "epoch": 0.05903925930379366, "flos": 24644200694400.0, "grad_norm": 2.4332300998667087, "language_loss": 0.87856263, "learning_rate": 3.991195625712779e-06, "loss": 0.90160477, "num_input_tokens_seen": 10601210, "step": 491, "time_per_iteration": 2.646657943725586 }, { "auxiliary_loss_clip": 0.01283379, "auxiliary_loss_mlp": 0.01044919, "balance_loss_clip": 1.07858241, "balance_loss_mlp": 1.03049469, "epoch": 0.05915950219443276, "flos": 21250391045760.0, "grad_norm": 1.9560043557368332, "language_loss": 0.81519854, "learning_rate": 3.991122462991592e-06, "loss": 0.83848155, "num_input_tokens_seen": 10620730, "step": 492, "time_per_iteration": 2.55846905708313 }, { "auxiliary_loss_clip": 0.01288231, "auxiliary_loss_mlp": 0.01050067, "balance_loss_clip": 1.07664061, "balance_loss_mlp": 1.03595257, "epoch": 0.05927974508507185, "flos": 9902727319680.0, "grad_norm": 4.138479272791149, "language_loss": 0.81250608, "learning_rate": 3.991048998218712e-06, "loss": 0.8358891, "num_input_tokens_seen": 10634035, "step": 493, "time_per_iteration": 2.4798851013183594 }, { "auxiliary_loss_clip": 0.01265389, "auxiliary_loss_mlp": 0.01050066, "balance_loss_clip": 1.07141185, "balance_loss_mlp": 1.0360117, "epoch": 0.05939998797571094, "flos": 18259499232000.0, "grad_norm": 2.6924464145983067, "language_loss": 0.76272655, "learning_rate": 3.990975231405281e-06, "loss": 0.78588104, "num_input_tokens_seen": 10652485, "step": 494, "time_per_iteration": 2.540430784225464 }, { "auxiliary_loss_clip": 0.01264716, "auxiliary_loss_mlp": 0.01046595, "balance_loss_clip": 1.0755918, "balance_loss_mlp": 1.03237343, "epoch": 0.05952023086635003, "flos": 28256598558720.0, "grad_norm": 1.7629613226632177, "language_loss": 0.78670263, "learning_rate": 3.990901162562491e-06, "loss": 0.80981576, "num_input_tokens_seen": 10673175, "step": 495, "time_per_iteration": 2.63199782371521 }, { "auxiliary_loss_clip": 0.01227425, "auxiliary_loss_mlp": 0.00767761, "balance_loss_clip": 1.06459618, "balance_loss_mlp": 1.00068176, "epoch": 0.05964047375698912, "flos": 14902498045440.0, "grad_norm": 2.473320868503803, "language_loss": 0.90764403, "learning_rate": 3.9908267917015765e-06, "loss": 0.92759585, "num_input_tokens_seen": 10691235, "step": 496, "time_per_iteration": 2.6073436737060547 }, { "auxiliary_loss_clip": 0.01254178, "auxiliary_loss_mlp": 0.01055666, "balance_loss_clip": 1.06982493, "balance_loss_mlp": 1.04096711, "epoch": 0.059760716647628206, "flos": 23185581206400.0, "grad_norm": 2.4768764183403613, "language_loss": 0.93161809, "learning_rate": 3.990752118833821e-06, "loss": 0.95471656, "num_input_tokens_seen": 10708675, "step": 497, "time_per_iteration": 2.5714454650878906 }, { "auxiliary_loss_clip": 0.01283475, "auxiliary_loss_mlp": 0.01045611, "balance_loss_clip": 1.07747734, "balance_loss_mlp": 1.03153205, "epoch": 0.0598809595382673, "flos": 22746968231040.0, "grad_norm": 1.816147997487087, "language_loss": 0.78227592, "learning_rate": 3.990677143970553e-06, "loss": 0.80556679, "num_input_tokens_seen": 10729485, "step": 498, "time_per_iteration": 2.589621067047119 }, { "auxiliary_loss_clip": 0.01230367, "auxiliary_loss_mlp": 0.01055318, "balance_loss_clip": 1.07252336, "balance_loss_mlp": 1.0402143, "epoch": 0.06000120242890639, "flos": 22127221946880.0, "grad_norm": 2.3668604459595377, "language_loss": 0.81221884, "learning_rate": 3.990601867123144e-06, "loss": 0.83507574, "num_input_tokens_seen": 10749210, "step": 499, "time_per_iteration": 2.6350436210632324 }, { "auxiliary_loss_clip": 0.0121815, "auxiliary_loss_mlp": 0.01049183, "balance_loss_clip": 1.06963706, "balance_loss_mlp": 1.03477049, "epoch": 0.06012144531954548, "flos": 19171773878400.0, "grad_norm": 2.065752786043113, "language_loss": 0.84776658, "learning_rate": 3.990526288303014e-06, "loss": 0.87043989, "num_input_tokens_seen": 10768000, "step": 500, "time_per_iteration": 2.630920171737671 }, { "auxiliary_loss_clip": 0.01245623, "auxiliary_loss_mlp": 0.00766268, "balance_loss_clip": 1.07011294, "balance_loss_mlp": 1.00060201, "epoch": 0.06024168821018457, "flos": 22783345729920.0, "grad_norm": 1.711959340986966, "language_loss": 0.91168308, "learning_rate": 3.9904504075216295e-06, "loss": 0.93180203, "num_input_tokens_seen": 10788760, "step": 501, "time_per_iteration": 3.4149577617645264 }, { "auxiliary_loss_clip": 0.01232014, "auxiliary_loss_mlp": 0.01051664, "balance_loss_clip": 1.06641507, "balance_loss_mlp": 1.03676319, "epoch": 0.06036193110082366, "flos": 18770687637120.0, "grad_norm": 2.0796848827466285, "language_loss": 0.94184071, "learning_rate": 3.990374224790501e-06, "loss": 0.96467751, "num_input_tokens_seen": 10806965, "step": 502, "time_per_iteration": 2.604987621307373 }, { "auxiliary_loss_clip": 0.01248037, "auxiliary_loss_mlp": 0.01052614, "balance_loss_clip": 1.07285142, "balance_loss_mlp": 1.03800464, "epoch": 0.06048217399146275, "flos": 17201570935680.0, "grad_norm": 1.8303229565552828, "language_loss": 0.70841467, "learning_rate": 3.990297740121185e-06, "loss": 0.73142111, "num_input_tokens_seen": 10824900, "step": 503, "time_per_iteration": 2.5631203651428223 }, { "auxiliary_loss_clip": 0.0126291, "auxiliary_loss_mlp": 0.00766852, "balance_loss_clip": 1.07250094, "balance_loss_mlp": 1.00057173, "epoch": 0.06060241688210185, "flos": 24024131187840.0, "grad_norm": 1.7632762460295015, "language_loss": 0.78359771, "learning_rate": 3.990220953525284e-06, "loss": 0.80389529, "num_input_tokens_seen": 10842010, "step": 504, "time_per_iteration": 2.5622775554656982 }, { "auxiliary_loss_clip": 0.01236032, "auxiliary_loss_mlp": 0.01049064, "balance_loss_clip": 1.06633329, "balance_loss_mlp": 1.03552783, "epoch": 0.06072265977274094, "flos": 14611190745600.0, "grad_norm": 3.417936454705504, "language_loss": 0.74035621, "learning_rate": 3.9901438650144465e-06, "loss": 0.76320714, "num_input_tokens_seen": 10858260, "step": 505, "time_per_iteration": 3.3221349716186523 }, { "auxiliary_loss_clip": 0.01255656, "auxiliary_loss_mlp": 0.01044378, "balance_loss_clip": 1.07094288, "balance_loss_mlp": 1.03138399, "epoch": 0.06084290266338003, "flos": 20558284813440.0, "grad_norm": 4.427473181387212, "language_loss": 0.91775775, "learning_rate": 3.990066474600367e-06, "loss": 0.94075811, "num_input_tokens_seen": 10876230, "step": 506, "time_per_iteration": 3.4471898078918457 }, { "auxiliary_loss_clip": 0.01248885, "auxiliary_loss_mlp": 0.01047921, "balance_loss_clip": 1.06613219, "balance_loss_mlp": 1.03276944, "epoch": 0.06096314555401912, "flos": 22309217182080.0, "grad_norm": 2.097791928212606, "language_loss": 0.67857343, "learning_rate": 3.989988782294786e-06, "loss": 0.70154148, "num_input_tokens_seen": 10896320, "step": 507, "time_per_iteration": 2.5640347003936768 }, { "auxiliary_loss_clip": 0.01216106, "auxiliary_loss_mlp": 0.01053604, "balance_loss_clip": 1.06556737, "balance_loss_mlp": 1.03891158, "epoch": 0.06108338844465821, "flos": 19131374056320.0, "grad_norm": 1.7265108818948975, "language_loss": 0.95145917, "learning_rate": 3.989910788109489e-06, "loss": 0.97415626, "num_input_tokens_seen": 10912970, "step": 508, "time_per_iteration": 3.319075584411621 }, { "auxiliary_loss_clip": 0.01225846, "auxiliary_loss_mlp": 0.01045771, "balance_loss_clip": 1.06460392, "balance_loss_mlp": 1.03205013, "epoch": 0.0612036313352973, "flos": 33584018169600.0, "grad_norm": 4.653434425353272, "language_loss": 0.75043541, "learning_rate": 3.989832492056307e-06, "loss": 0.77315158, "num_input_tokens_seen": 10933995, "step": 509, "time_per_iteration": 2.7116730213165283 }, { "auxiliary_loss_clip": 0.0126432, "auxiliary_loss_mlp": 0.01049766, "balance_loss_clip": 1.07376266, "balance_loss_mlp": 1.03516316, "epoch": 0.06132387422593639, "flos": 27490552179840.0, "grad_norm": 2.30611056388382, "language_loss": 0.8086307, "learning_rate": 3.989753894147119e-06, "loss": 0.83177161, "num_input_tokens_seen": 10954120, "step": 510, "time_per_iteration": 2.5530357360839844 }, { "auxiliary_loss_clip": 0.01258622, "auxiliary_loss_mlp": 0.01046003, "balance_loss_clip": 1.07547843, "balance_loss_mlp": 1.03217459, "epoch": 0.061444117116575485, "flos": 25885057979520.0, "grad_norm": 1.8684002224500142, "language_loss": 0.79945993, "learning_rate": 3.989674994393846e-06, "loss": 0.82250619, "num_input_tokens_seen": 10973595, "step": 511, "time_per_iteration": 2.58402943611145 }, { "auxiliary_loss_clip": 0.01262331, "auxiliary_loss_mlp": 0.01041539, "balance_loss_clip": 1.07509351, "balance_loss_mlp": 1.02762151, "epoch": 0.061564360007214575, "flos": 28512031150080.0, "grad_norm": 2.1365770111365485, "language_loss": 0.94072628, "learning_rate": 3.98959579280846e-06, "loss": 0.96376503, "num_input_tokens_seen": 10991995, "step": 512, "time_per_iteration": 2.5635733604431152 }, { "auxiliary_loss_clip": 0.01196261, "auxiliary_loss_mlp": 0.01045152, "balance_loss_clip": 1.06744456, "balance_loss_mlp": 1.03076339, "epoch": 0.061684602897853665, "flos": 12094355652480.0, "grad_norm": 2.2795350665917007, "language_loss": 0.83073515, "learning_rate": 3.989516289402973e-06, "loss": 0.85314929, "num_input_tokens_seen": 11007625, "step": 513, "time_per_iteration": 2.649857759475708 }, { "auxiliary_loss_clip": 0.01177261, "auxiliary_loss_mlp": 0.01048162, "balance_loss_clip": 1.0571543, "balance_loss_mlp": 1.03379762, "epoch": 0.061804845788492754, "flos": 19532639865600.0, "grad_norm": 2.8027560640377995, "language_loss": 0.80464005, "learning_rate": 3.989436484189447e-06, "loss": 0.82689428, "num_input_tokens_seen": 11025570, "step": 514, "time_per_iteration": 2.6532979011535645 }, { "auxiliary_loss_clip": 0.01263404, "auxiliary_loss_mlp": 0.01041708, "balance_loss_clip": 1.06956124, "balance_loss_mlp": 1.02771306, "epoch": 0.061925088679131844, "flos": 15341111020800.0, "grad_norm": 2.9314761155770612, "language_loss": 0.80997372, "learning_rate": 3.9893563771799885e-06, "loss": 0.83302486, "num_input_tokens_seen": 11042045, "step": 515, "time_per_iteration": 2.526691198348999 }, { "auxiliary_loss_clip": 0.01279233, "auxiliary_loss_mlp": 0.01048377, "balance_loss_clip": 1.07494998, "balance_loss_mlp": 1.03396511, "epoch": 0.062045331569770934, "flos": 25919927107200.0, "grad_norm": 2.3013100180536035, "language_loss": 0.85945821, "learning_rate": 3.989275968386749e-06, "loss": 0.8827343, "num_input_tokens_seen": 11059955, "step": 516, "time_per_iteration": 2.540703773498535 }, { "auxiliary_loss_clip": 0.01238818, "auxiliary_loss_mlp": 0.01051251, "balance_loss_clip": 1.06632495, "balance_loss_mlp": 1.03590882, "epoch": 0.06216557446041003, "flos": 28110621686400.0, "grad_norm": 2.2433027139282955, "language_loss": 0.76641953, "learning_rate": 3.989195257821926e-06, "loss": 0.78932023, "num_input_tokens_seen": 11078440, "step": 517, "time_per_iteration": 2.687325954437256 }, { "auxiliary_loss_clip": 0.01242201, "auxiliary_loss_mlp": 0.01050637, "balance_loss_clip": 1.07084441, "balance_loss_mlp": 1.03628373, "epoch": 0.06228581735104912, "flos": 23478181395840.0, "grad_norm": 2.1961413645515186, "language_loss": 0.84141314, "learning_rate": 3.989114245497765e-06, "loss": 0.86434156, "num_input_tokens_seen": 11098240, "step": 518, "time_per_iteration": 2.602976083755493 }, { "auxiliary_loss_clip": 0.01261688, "auxiliary_loss_mlp": 0.0104308, "balance_loss_clip": 1.06707501, "balance_loss_mlp": 1.02932966, "epoch": 0.06240606024168821, "flos": 15195205975680.0, "grad_norm": 2.5054984938972042, "language_loss": 0.94887328, "learning_rate": 3.989032931426554e-06, "loss": 0.97192097, "num_input_tokens_seen": 11115395, "step": 519, "time_per_iteration": 2.5378425121307373 }, { "auxiliary_loss_clip": 0.01237419, "auxiliary_loss_mlp": 0.01044545, "balance_loss_clip": 1.06759143, "balance_loss_mlp": 1.03063893, "epoch": 0.06252630313232731, "flos": 20631829910400.0, "grad_norm": 2.209568759441187, "language_loss": 0.86931837, "learning_rate": 3.9889513156206295e-06, "loss": 0.892138, "num_input_tokens_seen": 11134835, "step": 520, "time_per_iteration": 2.536693572998047 }, { "auxiliary_loss_clip": 0.01234608, "auxiliary_loss_mlp": 0.01047263, "balance_loss_clip": 1.06920779, "balance_loss_mlp": 1.03227854, "epoch": 0.06264654602296639, "flos": 20778058177920.0, "grad_norm": 2.9212118464256167, "language_loss": 0.73774111, "learning_rate": 3.988869398092371e-06, "loss": 0.7605598, "num_input_tokens_seen": 11154745, "step": 521, "time_per_iteration": 2.6357240676879883 }, { "auxiliary_loss_clip": 0.01246808, "auxiliary_loss_mlp": 0.01047027, "balance_loss_clip": 1.07113934, "balance_loss_mlp": 1.03257942, "epoch": 0.06276678891360549, "flos": 29605798241280.0, "grad_norm": 2.363192753656143, "language_loss": 0.79168439, "learning_rate": 3.988787178854206e-06, "loss": 0.81462276, "num_input_tokens_seen": 11174280, "step": 522, "time_per_iteration": 2.5789244174957275 }, { "auxiliary_loss_clip": 0.01278253, "auxiliary_loss_mlp": 0.01048427, "balance_loss_clip": 1.07506073, "balance_loss_mlp": 1.03434825, "epoch": 0.06288703180424457, "flos": 22126288193280.0, "grad_norm": 2.292919083044179, "language_loss": 0.8775146, "learning_rate": 3.988704657918608e-06, "loss": 0.90078139, "num_input_tokens_seen": 11193340, "step": 523, "time_per_iteration": 2.5421876907348633 }, { "auxiliary_loss_clip": 0.01260593, "auxiliary_loss_mlp": 0.01053341, "balance_loss_clip": 1.07486653, "balance_loss_mlp": 1.03994203, "epoch": 0.06300727469488367, "flos": 14976689587200.0, "grad_norm": 2.8851721886215547, "language_loss": 0.80009913, "learning_rate": 3.988621835298094e-06, "loss": 0.82323843, "num_input_tokens_seen": 11210555, "step": 524, "time_per_iteration": 2.4824345111846924 }, { "auxiliary_loss_clip": 0.01274082, "auxiliary_loss_mlp": 0.01045944, "balance_loss_clip": 1.07479048, "balance_loss_mlp": 1.0320915, "epoch": 0.06312751758552275, "flos": 24535391420160.0, "grad_norm": 1.9399650971274744, "language_loss": 0.91913724, "learning_rate": 3.988538711005229e-06, "loss": 0.94233757, "num_input_tokens_seen": 11230010, "step": 525, "time_per_iteration": 2.5533370971679688 }, { "auxiliary_loss_clip": 0.01254739, "auxiliary_loss_mlp": 0.01045345, "balance_loss_clip": 1.07180393, "balance_loss_mlp": 1.03244102, "epoch": 0.06324776047616185, "flos": 21507008785920.0, "grad_norm": 2.257703989251811, "language_loss": 0.88241124, "learning_rate": 3.988455285052622e-06, "loss": 0.90541208, "num_input_tokens_seen": 11246190, "step": 526, "time_per_iteration": 2.486172914505005 }, { "auxiliary_loss_clip": 0.01257162, "auxiliary_loss_mlp": 0.010524, "balance_loss_clip": 1.07308877, "balance_loss_mlp": 1.03854191, "epoch": 0.06336800336680094, "flos": 21688034353920.0, "grad_norm": 2.0333813020777622, "language_loss": 0.84032071, "learning_rate": 3.98837155745293e-06, "loss": 0.86341631, "num_input_tokens_seen": 11264230, "step": 527, "time_per_iteration": 2.557490825653076 }, { "auxiliary_loss_clip": 0.01263769, "auxiliary_loss_mlp": 0.01045589, "balance_loss_clip": 1.07662797, "balance_loss_mlp": 1.03129542, "epoch": 0.06348824625744003, "flos": 19500895221120.0, "grad_norm": 2.0420687094624435, "language_loss": 0.7600944, "learning_rate": 3.988287528218854e-06, "loss": 0.78318799, "num_input_tokens_seen": 11283015, "step": 528, "time_per_iteration": 3.268552303314209 }, { "auxiliary_loss_clip": 0.01259022, "auxiliary_loss_mlp": 0.01044304, "balance_loss_clip": 1.07505417, "balance_loss_mlp": 1.0313158, "epoch": 0.06360848914807912, "flos": 15481233976320.0, "grad_norm": 2.07627932200372, "language_loss": 0.90610939, "learning_rate": 3.98820319736314e-06, "loss": 0.92914271, "num_input_tokens_seen": 11299630, "step": 529, "time_per_iteration": 2.5093588829040527 }, { "auxiliary_loss_clip": 0.01228938, "auxiliary_loss_mlp": 0.01044071, "balance_loss_clip": 1.06564808, "balance_loss_mlp": 1.03015924, "epoch": 0.0637287320387182, "flos": 20593369422720.0, "grad_norm": 1.8016543805090741, "language_loss": 0.85254645, "learning_rate": 3.988118564898582e-06, "loss": 0.87527651, "num_input_tokens_seen": 11319170, "step": 530, "time_per_iteration": 2.573732852935791 }, { "auxiliary_loss_clip": 0.01223118, "auxiliary_loss_mlp": 0.0076726, "balance_loss_clip": 1.07045996, "balance_loss_mlp": 1.00039816, "epoch": 0.0638489749293573, "flos": 17412222245760.0, "grad_norm": 2.2854967834248274, "language_loss": 0.89324105, "learning_rate": 3.988033630838019e-06, "loss": 0.91314483, "num_input_tokens_seen": 11333210, "step": 531, "time_per_iteration": 2.605701446533203 }, { "auxiliary_loss_clip": 0.01263484, "auxiliary_loss_mlp": 0.0105179, "balance_loss_clip": 1.07501411, "balance_loss_mlp": 1.03842676, "epoch": 0.0639692178199964, "flos": 23807661874560.0, "grad_norm": 1.6475820456493724, "language_loss": 0.88092899, "learning_rate": 3.987948395194334e-06, "loss": 0.9040817, "num_input_tokens_seen": 11355590, "step": 532, "time_per_iteration": 4.175787925720215 }, { "auxiliary_loss_clip": 0.01250721, "auxiliary_loss_mlp": 0.01052487, "balance_loss_clip": 1.06814122, "balance_loss_mlp": 1.03903401, "epoch": 0.06408946071063548, "flos": 18477225521280.0, "grad_norm": 2.1057366672911475, "language_loss": 0.76576364, "learning_rate": 3.987862857980458e-06, "loss": 0.78879577, "num_input_tokens_seen": 11371535, "step": 533, "time_per_iteration": 2.5321125984191895 }, { "auxiliary_loss_clip": 0.01228782, "auxiliary_loss_mlp": 0.01045632, "balance_loss_clip": 1.06811428, "balance_loss_mlp": 1.03112459, "epoch": 0.06420970360127458, "flos": 27162220936320.0, "grad_norm": 2.634596840346206, "language_loss": 0.76803386, "learning_rate": 3.987777019209368e-06, "loss": 0.79077792, "num_input_tokens_seen": 11392050, "step": 534, "time_per_iteration": 3.3541109561920166 }, { "auxiliary_loss_clip": 0.01278138, "auxiliary_loss_mlp": 0.01042612, "balance_loss_clip": 1.07637906, "balance_loss_mlp": 1.02886152, "epoch": 0.06432994649191366, "flos": 23659673840640.0, "grad_norm": 4.810119883205886, "language_loss": 0.81286293, "learning_rate": 3.987690878894084e-06, "loss": 0.83607042, "num_input_tokens_seen": 11411765, "step": 535, "time_per_iteration": 2.5653088092803955 }, { "auxiliary_loss_clip": 0.01250204, "auxiliary_loss_mlp": 0.01036321, "balance_loss_clip": 1.07265449, "balance_loss_mlp": 1.02244473, "epoch": 0.06445018938255276, "flos": 23403953940480.0, "grad_norm": 2.6439322320632006, "language_loss": 0.85333127, "learning_rate": 3.987604437047673e-06, "loss": 0.8761965, "num_input_tokens_seen": 11431565, "step": 536, "time_per_iteration": 2.558048725128174 }, { "auxiliary_loss_clip": 0.01256634, "auxiliary_loss_mlp": 0.01043786, "balance_loss_clip": 1.07157469, "balance_loss_mlp": 1.03023195, "epoch": 0.06457043227319184, "flos": 19646692525440.0, "grad_norm": 2.2994892071675324, "language_loss": 0.77566165, "learning_rate": 3.987517693683251e-06, "loss": 0.79866588, "num_input_tokens_seen": 11450140, "step": 537, "time_per_iteration": 2.570960760116577 }, { "auxiliary_loss_clip": 0.01240312, "auxiliary_loss_mlp": 0.01056201, "balance_loss_clip": 1.07161307, "balance_loss_mlp": 1.04209828, "epoch": 0.06469067516383094, "flos": 16978744915200.0, "grad_norm": 3.178924399386921, "language_loss": 0.9600358, "learning_rate": 3.9874306488139745e-06, "loss": 0.98300099, "num_input_tokens_seen": 11465400, "step": 538, "time_per_iteration": 2.5246057510375977 }, { "auxiliary_loss_clip": 0.01225697, "auxiliary_loss_mlp": 0.01047211, "balance_loss_clip": 1.06984317, "balance_loss_mlp": 1.03340697, "epoch": 0.06481091805447003, "flos": 23296401642240.0, "grad_norm": 2.022463506911064, "language_loss": 0.87789249, "learning_rate": 3.987343302453049e-06, "loss": 0.90062153, "num_input_tokens_seen": 11486675, "step": 539, "time_per_iteration": 2.6185922622680664 }, { "auxiliary_loss_clip": 0.01242706, "auxiliary_loss_mlp": 0.0104951, "balance_loss_clip": 1.07214081, "balance_loss_mlp": 1.03572941, "epoch": 0.06493116094510912, "flos": 29172356824320.0, "grad_norm": 1.646683429518235, "language_loss": 0.82634234, "learning_rate": 3.987255654613724e-06, "loss": 0.8492645, "num_input_tokens_seen": 11510440, "step": 540, "time_per_iteration": 2.6307239532470703 }, { "auxiliary_loss_clip": 0.01220481, "auxiliary_loss_mlp": 0.01046343, "balance_loss_clip": 1.06522894, "balance_loss_mlp": 1.03256226, "epoch": 0.06505140383574821, "flos": 19865065259520.0, "grad_norm": 2.8280155344709574, "language_loss": 0.70145231, "learning_rate": 3.987167705309296e-06, "loss": 0.72412056, "num_input_tokens_seen": 11529715, "step": 541, "time_per_iteration": 2.5936086177825928 }, { "auxiliary_loss_clip": 0.01259629, "auxiliary_loss_mlp": 0.00766089, "balance_loss_clip": 1.07191896, "balance_loss_mlp": 1.00040436, "epoch": 0.0651716467263873, "flos": 17924703540480.0, "grad_norm": 2.014998082808893, "language_loss": 0.95142359, "learning_rate": 3.987079454553108e-06, "loss": 0.97168076, "num_input_tokens_seen": 11547665, "step": 542, "time_per_iteration": 2.5201218128204346 }, { "auxiliary_loss_clip": 0.01223939, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.07018447, "balance_loss_mlp": 1.02989042, "epoch": 0.0652918896170264, "flos": 20842840356480.0, "grad_norm": 1.8811402526990344, "language_loss": 0.91133386, "learning_rate": 3.986990902358546e-06, "loss": 0.93400711, "num_input_tokens_seen": 11564605, "step": 543, "time_per_iteration": 2.5884342193603516 }, { "auxiliary_loss_clip": 0.01259024, "auxiliary_loss_mlp": 0.01048271, "balance_loss_clip": 1.07132578, "balance_loss_mlp": 1.03453207, "epoch": 0.06541213250766549, "flos": 21872507627520.0, "grad_norm": 2.4480662023516198, "language_loss": 0.93326581, "learning_rate": 3.986902048739045e-06, "loss": 0.95633876, "num_input_tokens_seen": 11584550, "step": 544, "time_per_iteration": 2.540523052215576 }, { "auxiliary_loss_clip": 0.01244425, "auxiliary_loss_mlp": 0.0104968, "balance_loss_clip": 1.07057416, "balance_loss_mlp": 1.03486204, "epoch": 0.06553237539830457, "flos": 23110743219840.0, "grad_norm": 2.685322106617403, "language_loss": 0.79816675, "learning_rate": 3.986812893708082e-06, "loss": 0.82110775, "num_input_tokens_seen": 11600740, "step": 545, "time_per_iteration": 2.569758415222168 }, { "auxiliary_loss_clip": 0.01242421, "auxiliary_loss_mlp": 0.01048732, "balance_loss_clip": 1.06779075, "balance_loss_mlp": 1.03405702, "epoch": 0.06565261828894367, "flos": 17923769786880.0, "grad_norm": 2.3005544598330743, "language_loss": 0.81333226, "learning_rate": 3.9867234372791826e-06, "loss": 0.83624381, "num_input_tokens_seen": 11618695, "step": 546, "time_per_iteration": 2.511117935180664 }, { "auxiliary_loss_clip": 0.01255258, "auxiliary_loss_mlp": 0.0104612, "balance_loss_clip": 1.07118583, "balance_loss_mlp": 1.03232718, "epoch": 0.06577286117958275, "flos": 22783058421120.0, "grad_norm": 3.6692112474506358, "language_loss": 0.87165922, "learning_rate": 3.986633679465918e-06, "loss": 0.89467299, "num_input_tokens_seen": 11638850, "step": 547, "time_per_iteration": 2.5443801879882812 }, { "auxiliary_loss_clip": 0.01211543, "auxiliary_loss_mlp": 0.01052541, "balance_loss_clip": 1.06769454, "balance_loss_mlp": 1.038939, "epoch": 0.06589310407022185, "flos": 23696194993920.0, "grad_norm": 2.111359443670224, "language_loss": 0.80513996, "learning_rate": 3.986543620281904e-06, "loss": 0.82778072, "num_input_tokens_seen": 11658500, "step": 548, "time_per_iteration": 2.636155366897583 }, { "auxiliary_loss_clip": 0.01223896, "auxiliary_loss_mlp": 0.01034971, "balance_loss_clip": 1.06502867, "balance_loss_mlp": 1.02123213, "epoch": 0.06601334696086093, "flos": 26864772410880.0, "grad_norm": 1.927825081736006, "language_loss": 0.91351926, "learning_rate": 3.986453259740802e-06, "loss": 0.93610793, "num_input_tokens_seen": 11676670, "step": 549, "time_per_iteration": 2.6165995597839355 }, { "auxiliary_loss_clip": 0.01240177, "auxiliary_loss_mlp": 0.01049228, "balance_loss_clip": 1.07355571, "balance_loss_mlp": 1.03520274, "epoch": 0.06613358985150003, "flos": 12567694101120.0, "grad_norm": 2.7127872046687806, "language_loss": 0.79120648, "learning_rate": 3.986362597856319e-06, "loss": 0.8141005, "num_input_tokens_seen": 11693170, "step": 550, "time_per_iteration": 2.559262990951538 }, { "auxiliary_loss_clip": 0.01237732, "auxiliary_loss_mlp": 0.00767832, "balance_loss_clip": 1.06790137, "balance_loss_mlp": 1.00048375, "epoch": 0.06625383274213913, "flos": 18332505624960.0, "grad_norm": 2.8655993041631063, "language_loss": 0.81506467, "learning_rate": 3.986271634642211e-06, "loss": 0.83512026, "num_input_tokens_seen": 11710150, "step": 551, "time_per_iteration": 2.5514111518859863 }, { "auxiliary_loss_clip": 0.01271234, "auxiliary_loss_mlp": 0.01046563, "balance_loss_clip": 1.07411563, "balance_loss_mlp": 1.03225207, "epoch": 0.06637407563277821, "flos": 15375585098880.0, "grad_norm": 1.99877330424272, "language_loss": 0.81640399, "learning_rate": 3.986180370112274e-06, "loss": 0.83958191, "num_input_tokens_seen": 11726670, "step": 552, "time_per_iteration": 2.4699618816375732 }, { "auxiliary_loss_clip": 0.01257554, "auxiliary_loss_mlp": 0.00767611, "balance_loss_clip": 1.07208824, "balance_loss_mlp": 1.00044155, "epoch": 0.0664943185234173, "flos": 24025244509440.0, "grad_norm": 1.748967635523427, "language_loss": 0.74675941, "learning_rate": 3.986088804280354e-06, "loss": 0.76701099, "num_input_tokens_seen": 11746400, "step": 553, "time_per_iteration": 2.5647308826446533 }, { "auxiliary_loss_clip": 0.012432, "auxiliary_loss_mlp": 0.01046937, "balance_loss_clip": 1.07146311, "balance_loss_mlp": 1.03258979, "epoch": 0.06661456141405639, "flos": 20957503547520.0, "grad_norm": 2.530207107619296, "language_loss": 0.94020468, "learning_rate": 3.985996937160342e-06, "loss": 0.96310604, "num_input_tokens_seen": 11765590, "step": 554, "time_per_iteration": 3.3341805934906006 }, { "auxiliary_loss_clip": 0.01251465, "auxiliary_loss_mlp": 0.01051324, "balance_loss_clip": 1.07013559, "balance_loss_mlp": 1.03773403, "epoch": 0.06673480430469549, "flos": 52223953322880.0, "grad_norm": 1.9353637869000901, "language_loss": 0.68715233, "learning_rate": 3.985904768766173e-06, "loss": 0.71018022, "num_input_tokens_seen": 11788365, "step": 555, "time_per_iteration": 2.8013808727264404 }, { "auxiliary_loss_clip": 0.0122862, "auxiliary_loss_mlp": 0.01047409, "balance_loss_clip": 1.06846809, "balance_loss_mlp": 1.03291917, "epoch": 0.06685504719533458, "flos": 16217079995520.0, "grad_norm": 2.907012445313646, "language_loss": 0.75966603, "learning_rate": 3.98581229911183e-06, "loss": 0.7824263, "num_input_tokens_seen": 11807285, "step": 556, "time_per_iteration": 2.5596134662628174 }, { "auxiliary_loss_clip": 0.01257191, "auxiliary_loss_mlp": 0.01045985, "balance_loss_clip": 1.06949735, "balance_loss_mlp": 1.03170407, "epoch": 0.06697529008597367, "flos": 22491535639680.0, "grad_norm": 1.7139576351672947, "language_loss": 0.91908765, "learning_rate": 3.985719528211341e-06, "loss": 0.94211942, "num_input_tokens_seen": 11826655, "step": 557, "time_per_iteration": 2.5499160289764404 }, { "auxiliary_loss_clip": 0.01143971, "auxiliary_loss_mlp": 0.0100841, "balance_loss_clip": 1.03919816, "balance_loss_mlp": 1.00252092, "epoch": 0.06709553297661276, "flos": 62688216936960.0, "grad_norm": 0.8447410932211391, "language_loss": 0.63014245, "learning_rate": 3.985626456078777e-06, "loss": 0.65166628, "num_input_tokens_seen": 11891310, "step": 558, "time_per_iteration": 4.009221076965332 }, { "auxiliary_loss_clip": 0.01228934, "auxiliary_loss_mlp": 0.01045226, "balance_loss_clip": 1.07019997, "balance_loss_mlp": 1.03140962, "epoch": 0.06721577586725185, "flos": 11216590997760.0, "grad_norm": 2.1453007329263425, "language_loss": 0.86267483, "learning_rate": 3.985533082728259e-06, "loss": 0.88541645, "num_input_tokens_seen": 11906965, "step": 559, "time_per_iteration": 3.4730141162872314 }, { "auxiliary_loss_clip": 0.01277107, "auxiliary_loss_mlp": 0.01037639, "balance_loss_clip": 1.07436895, "balance_loss_mlp": 1.02359629, "epoch": 0.06733601875789094, "flos": 25922189664000.0, "grad_norm": 1.9090270022415432, "language_loss": 0.75014746, "learning_rate": 3.985439408173951e-06, "loss": 0.77329493, "num_input_tokens_seen": 11927190, "step": 560, "time_per_iteration": 2.5410356521606445 }, { "auxiliary_loss_clip": 0.0127515, "auxiliary_loss_mlp": 0.0105379, "balance_loss_clip": 1.0754025, "balance_loss_mlp": 1.03944969, "epoch": 0.06745626164853002, "flos": 20813645577600.0, "grad_norm": 2.430005953821239, "language_loss": 0.70951903, "learning_rate": 3.9853454324300634e-06, "loss": 0.73280847, "num_input_tokens_seen": 11946400, "step": 561, "time_per_iteration": 3.2623889446258545 }, { "auxiliary_loss_clip": 0.01199154, "auxiliary_loss_mlp": 0.01042931, "balance_loss_clip": 1.06361532, "balance_loss_mlp": 1.0283339, "epoch": 0.06757650453916912, "flos": 19829262378240.0, "grad_norm": 2.075310094903047, "language_loss": 0.77732491, "learning_rate": 3.985251155510852e-06, "loss": 0.7997458, "num_input_tokens_seen": 11965430, "step": 562, "time_per_iteration": 2.6828272342681885 }, { "auxiliary_loss_clip": 0.01209868, "auxiliary_loss_mlp": 0.01043947, "balance_loss_clip": 1.07043219, "balance_loss_mlp": 1.02976108, "epoch": 0.06769674742980822, "flos": 25739224761600.0, "grad_norm": 1.775925135573773, "language_loss": 0.80364096, "learning_rate": 3.98515657743062e-06, "loss": 0.82617909, "num_input_tokens_seen": 11984895, "step": 563, "time_per_iteration": 2.674323320388794 }, { "auxiliary_loss_clip": 0.01235705, "auxiliary_loss_mlp": 0.01054189, "balance_loss_clip": 1.06648397, "balance_loss_mlp": 1.04071283, "epoch": 0.0678169903204473, "flos": 13074788355840.0, "grad_norm": 2.2398220849929085, "language_loss": 0.77869093, "learning_rate": 3.985061698203711e-06, "loss": 0.80158991, "num_input_tokens_seen": 12002010, "step": 564, "time_per_iteration": 2.5196056365966797 }, { "auxiliary_loss_clip": 0.01161138, "auxiliary_loss_mlp": 0.01009265, "balance_loss_clip": 1.03621054, "balance_loss_mlp": 1.00354278, "epoch": 0.0679372332110864, "flos": 70865830788480.0, "grad_norm": 0.8904670521634256, "language_loss": 0.63779444, "learning_rate": 3.984966517844523e-06, "loss": 0.65949851, "num_input_tokens_seen": 12057255, "step": 565, "time_per_iteration": 3.0515527725219727 }, { "auxiliary_loss_clip": 0.01274531, "auxiliary_loss_mlp": 0.01049362, "balance_loss_clip": 1.07415497, "balance_loss_mlp": 1.03561151, "epoch": 0.06805747610172548, "flos": 28256418990720.0, "grad_norm": 2.4938047860429164, "language_loss": 0.80811733, "learning_rate": 3.984871036367492e-06, "loss": 0.83135623, "num_input_tokens_seen": 12077280, "step": 566, "time_per_iteration": 2.541935920715332 }, { "auxiliary_loss_clip": 0.01254022, "auxiliary_loss_mlp": 0.00766778, "balance_loss_clip": 1.07211626, "balance_loss_mlp": 1.00041664, "epoch": 0.06817771899236458, "flos": 20120533764480.0, "grad_norm": 1.9718423772549256, "language_loss": 0.83261657, "learning_rate": 3.984775253787102e-06, "loss": 0.85282451, "num_input_tokens_seen": 12095570, "step": 567, "time_per_iteration": 2.543840169906616 }, { "auxiliary_loss_clip": 0.01259013, "auxiliary_loss_mlp": 0.01040242, "balance_loss_clip": 1.0702976, "balance_loss_mlp": 1.02662253, "epoch": 0.06829796188300366, "flos": 17930629284480.0, "grad_norm": 2.9638526082362917, "language_loss": 0.87789202, "learning_rate": 3.984679170117885e-06, "loss": 0.90088463, "num_input_tokens_seen": 12111775, "step": 568, "time_per_iteration": 2.4911394119262695 }, { "auxiliary_loss_clip": 0.01253339, "auxiliary_loss_mlp": 0.01041753, "balance_loss_clip": 1.06978703, "balance_loss_mlp": 1.02701879, "epoch": 0.06841820477364276, "flos": 14501627285760.0, "grad_norm": 2.599977270728226, "language_loss": 0.78607631, "learning_rate": 3.984582785374415e-06, "loss": 0.80902725, "num_input_tokens_seen": 12129215, "step": 569, "time_per_iteration": 2.512500524520874 }, { "auxiliary_loss_clip": 0.01240847, "auxiliary_loss_mlp": 0.00766995, "balance_loss_clip": 1.07188869, "balance_loss_mlp": 1.00038803, "epoch": 0.06853844766428185, "flos": 21938474954880.0, "grad_norm": 2.106959947043695, "language_loss": 0.80572426, "learning_rate": 3.9844860995713155e-06, "loss": 0.82580268, "num_input_tokens_seen": 12148755, "step": 570, "time_per_iteration": 2.5807104110717773 }, { "auxiliary_loss_clip": 0.01257449, "auxiliary_loss_mlp": 0.01043734, "balance_loss_clip": 1.07771003, "balance_loss_mlp": 1.03032315, "epoch": 0.06865869055492094, "flos": 16800628348800.0, "grad_norm": 2.3690490243003826, "language_loss": 0.83040488, "learning_rate": 3.9843891127232524e-06, "loss": 0.8534168, "num_input_tokens_seen": 12166290, "step": 571, "time_per_iteration": 2.5020804405212402 }, { "auxiliary_loss_clip": 0.0119598, "auxiliary_loss_mlp": 0.0104394, "balance_loss_clip": 1.06299663, "balance_loss_mlp": 1.02921748, "epoch": 0.06877893344556003, "flos": 19937281553280.0, "grad_norm": 2.2342563898718804, "language_loss": 0.66607964, "learning_rate": 3.984291824844938e-06, "loss": 0.68847883, "num_input_tokens_seen": 12181385, "step": 572, "time_per_iteration": 2.6187117099761963 }, { "auxiliary_loss_clip": 0.0127414, "auxiliary_loss_mlp": 0.01042764, "balance_loss_clip": 1.07464194, "balance_loss_mlp": 1.02866173, "epoch": 0.06889917633619912, "flos": 23039388852480.0, "grad_norm": 2.5559593446396294, "language_loss": 0.85313123, "learning_rate": 3.984194235951132e-06, "loss": 0.87630028, "num_input_tokens_seen": 12197530, "step": 573, "time_per_iteration": 2.500725746154785 }, { "auxiliary_loss_clip": 0.01277281, "auxiliary_loss_mlp": 0.01057823, "balance_loss_clip": 1.07881522, "balance_loss_mlp": 1.04467988, "epoch": 0.06901941922683821, "flos": 20960556203520.0, "grad_norm": 2.6776634110256503, "language_loss": 0.84668672, "learning_rate": 3.9840963460566375e-06, "loss": 0.87003773, "num_input_tokens_seen": 12216310, "step": 574, "time_per_iteration": 2.4887170791625977 }, { "auxiliary_loss_clip": 0.01180166, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.06158209, "balance_loss_mlp": 1.02635908, "epoch": 0.06913966211747731, "flos": 24821850384000.0, "grad_norm": 2.8095923957813396, "language_loss": 0.89605409, "learning_rate": 3.983998155176305e-06, "loss": 0.91825771, "num_input_tokens_seen": 12236670, "step": 575, "time_per_iteration": 2.8775432109832764 }, { "auxiliary_loss_clip": 0.01160083, "auxiliary_loss_mlp": 0.01004491, "balance_loss_clip": 1.03653765, "balance_loss_mlp": 0.99853033, "epoch": 0.06925990500811639, "flos": 58367446957440.0, "grad_norm": 0.8249431166627818, "language_loss": 0.57010722, "learning_rate": 3.9838996633250305e-06, "loss": 0.59175301, "num_input_tokens_seen": 12297185, "step": 576, "time_per_iteration": 3.157111883163452 }, { "auxiliary_loss_clip": 0.01255641, "auxiliary_loss_mlp": 0.01048447, "balance_loss_clip": 1.07015228, "balance_loss_mlp": 1.03566158, "epoch": 0.06938014789875549, "flos": 12749940731520.0, "grad_norm": 2.202914150551851, "language_loss": 0.88080609, "learning_rate": 3.983800870517753e-06, "loss": 0.90384698, "num_input_tokens_seen": 12313975, "step": 577, "time_per_iteration": 2.5130162239074707 }, { "auxiliary_loss_clip": 0.01252016, "auxiliary_loss_mlp": 0.01045524, "balance_loss_clip": 1.07390738, "balance_loss_mlp": 1.03321528, "epoch": 0.06950039078939457, "flos": 22820226019200.0, "grad_norm": 2.539474317481431, "language_loss": 0.78479326, "learning_rate": 3.983701776769463e-06, "loss": 0.80776858, "num_input_tokens_seen": 12331385, "step": 578, "time_per_iteration": 2.5052263736724854 }, { "auxiliary_loss_clip": 0.01250283, "auxiliary_loss_mlp": 0.01043673, "balance_loss_clip": 1.07302582, "balance_loss_mlp": 1.02979124, "epoch": 0.06962063368003367, "flos": 21941348042880.0, "grad_norm": 1.8980668948412203, "language_loss": 0.85984695, "learning_rate": 3.9836023820951885e-06, "loss": 0.88278651, "num_input_tokens_seen": 12350600, "step": 579, "time_per_iteration": 2.526423692703247 }, { "auxiliary_loss_clip": 0.01216759, "auxiliary_loss_mlp": 0.01049247, "balance_loss_clip": 1.06326473, "balance_loss_mlp": 1.03671849, "epoch": 0.06974087657067275, "flos": 20706021452160.0, "grad_norm": 1.789721971271885, "language_loss": 0.68311691, "learning_rate": 3.983502686510011e-06, "loss": 0.70577699, "num_input_tokens_seen": 12371430, "step": 580, "time_per_iteration": 2.6056437492370605 }, { "auxiliary_loss_clip": 0.01258208, "auxiliary_loss_mlp": 0.00766454, "balance_loss_clip": 1.06969035, "balance_loss_mlp": 1.00043762, "epoch": 0.06986111946131185, "flos": 22638230784000.0, "grad_norm": 2.542945349135428, "language_loss": 0.73738778, "learning_rate": 3.9834026900290525e-06, "loss": 0.75763446, "num_input_tokens_seen": 12390825, "step": 581, "time_per_iteration": 3.3825559616088867 }, { "auxiliary_loss_clip": 0.01270935, "auxiliary_loss_mlp": 0.01044763, "balance_loss_clip": 1.07250535, "balance_loss_mlp": 1.03175092, "epoch": 0.06998136235195095, "flos": 26943453152640.0, "grad_norm": 2.0011242746998557, "language_loss": 1.00358772, "learning_rate": 3.983302392667482e-06, "loss": 1.0267446, "num_input_tokens_seen": 12411670, "step": 582, "time_per_iteration": 2.5829575061798096 }, { "auxiliary_loss_clip": 0.01254151, "auxiliary_loss_mlp": 0.01043282, "balance_loss_clip": 1.07271314, "balance_loss_mlp": 1.03006136, "epoch": 0.07010160524259003, "flos": 22492505306880.0, "grad_norm": 1.8228352045585985, "language_loss": 0.93882, "learning_rate": 3.983201794440517e-06, "loss": 0.96179426, "num_input_tokens_seen": 12431245, "step": 583, "time_per_iteration": 2.4973533153533936 }, { "auxiliary_loss_clip": 0.01227027, "auxiliary_loss_mlp": 0.01040701, "balance_loss_clip": 1.06721604, "balance_loss_mlp": 1.02745128, "epoch": 0.07022184813322913, "flos": 18332541538560.0, "grad_norm": 1.6934672667998625, "language_loss": 0.67659652, "learning_rate": 3.9831008953634165e-06, "loss": 0.69927382, "num_input_tokens_seen": 12450535, "step": 584, "time_per_iteration": 3.3078415393829346 }, { "auxiliary_loss_clip": 0.01189039, "auxiliary_loss_mlp": 0.0104552, "balance_loss_clip": 1.05903637, "balance_loss_mlp": 1.03095865, "epoch": 0.07034209102386821, "flos": 24675550289280.0, "grad_norm": 2.2925334378134368, "language_loss": 0.8127383, "learning_rate": 3.9829996954514864e-06, "loss": 0.83508384, "num_input_tokens_seen": 12469675, "step": 585, "time_per_iteration": 2.6435909271240234 }, { "auxiliary_loss_clip": 0.01244677, "auxiliary_loss_mlp": 0.01047812, "balance_loss_clip": 1.06925857, "balance_loss_mlp": 1.03337574, "epoch": 0.0704623339145073, "flos": 25995878415360.0, "grad_norm": 1.8228520911126989, "language_loss": 0.84309483, "learning_rate": 3.982898194720079e-06, "loss": 0.86601979, "num_input_tokens_seen": 12490405, "step": 586, "time_per_iteration": 3.4164116382598877 }, { "auxiliary_loss_clip": 0.01236621, "auxiliary_loss_mlp": 0.00767169, "balance_loss_clip": 1.07208943, "balance_loss_mlp": 1.00044227, "epoch": 0.0705825768051464, "flos": 25338318088320.0, "grad_norm": 2.1328378285480496, "language_loss": 0.82623172, "learning_rate": 3.982796393184592e-06, "loss": 0.84626967, "num_input_tokens_seen": 12509485, "step": 587, "time_per_iteration": 3.375338554382324 }, { "auxiliary_loss_clip": 0.0114264, "auxiliary_loss_mlp": 0.01018892, "balance_loss_clip": 1.03358495, "balance_loss_mlp": 1.01264572, "epoch": 0.07070281969578548, "flos": 66047552507520.0, "grad_norm": 0.7932328357860626, "language_loss": 0.62630105, "learning_rate": 3.98269429086047e-06, "loss": 0.64791638, "num_input_tokens_seen": 12567325, "step": 588, "time_per_iteration": 2.972320318222046 }, { "auxiliary_loss_clip": 0.01227739, "auxiliary_loss_mlp": 0.01047875, "balance_loss_clip": 1.06882036, "balance_loss_mlp": 1.03336143, "epoch": 0.07082306258642458, "flos": 23653568528640.0, "grad_norm": 3.1409108826991807, "language_loss": 0.8650409, "learning_rate": 3.982591887763199e-06, "loss": 0.887797, "num_input_tokens_seen": 12584785, "step": 589, "time_per_iteration": 2.5690934658050537 }, { "auxiliary_loss_clip": 0.01199029, "auxiliary_loss_mlp": 0.01041068, "balance_loss_clip": 1.05703413, "balance_loss_mlp": 1.02635217, "epoch": 0.07094330547706366, "flos": 13880049408000.0, "grad_norm": 2.259644323531196, "language_loss": 0.81569409, "learning_rate": 3.982489183908316e-06, "loss": 0.83809513, "num_input_tokens_seen": 12601205, "step": 590, "time_per_iteration": 2.539745807647705 }, { "auxiliary_loss_clip": 0.01162121, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.05206561, "balance_loss_mlp": 1.03160095, "epoch": 0.07106354836770276, "flos": 24645098534400.0, "grad_norm": 2.361266753145197, "language_loss": 0.84465516, "learning_rate": 3.982386179311399e-06, "loss": 0.86671686, "num_input_tokens_seen": 12621725, "step": 591, "time_per_iteration": 2.6699905395507812 }, { "auxiliary_loss_clip": 0.01258631, "auxiliary_loss_mlp": 0.01047805, "balance_loss_clip": 1.07177889, "balance_loss_mlp": 1.03220034, "epoch": 0.07118379125834184, "flos": 16217223649920.0, "grad_norm": 2.277611624363638, "language_loss": 0.87452149, "learning_rate": 3.982282873988075e-06, "loss": 0.89758587, "num_input_tokens_seen": 12639600, "step": 592, "time_per_iteration": 2.4876914024353027 }, { "auxiliary_loss_clip": 0.01238607, "auxiliary_loss_mlp": 0.01042214, "balance_loss_clip": 1.07014549, "balance_loss_mlp": 1.02961385, "epoch": 0.07130403414898094, "flos": 19719986227200.0, "grad_norm": 1.6624626919673313, "language_loss": 0.87099338, "learning_rate": 3.982179267954016e-06, "loss": 0.89380157, "num_input_tokens_seen": 12660030, "step": 593, "time_per_iteration": 2.58138108253479 }, { "auxiliary_loss_clip": 0.0126898, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.07175934, "balance_loss_mlp": 1.02510643, "epoch": 0.07142427703962004, "flos": 21871933009920.0, "grad_norm": 2.132711499028081, "language_loss": 0.96150446, "learning_rate": 3.982075361224937e-06, "loss": 0.98458219, "num_input_tokens_seen": 12678395, "step": 594, "time_per_iteration": 2.519207239151001 }, { "auxiliary_loss_clip": 0.0124978, "auxiliary_loss_mlp": 0.00766412, "balance_loss_clip": 1.07181573, "balance_loss_mlp": 1.00041747, "epoch": 0.07154451993025912, "flos": 18296595002880.0, "grad_norm": 2.029373387923085, "language_loss": 0.87810063, "learning_rate": 3.981971153816602e-06, "loss": 0.89826262, "num_input_tokens_seen": 12696000, "step": 595, "time_per_iteration": 2.512521743774414 }, { "auxiliary_loss_clip": 0.01269919, "auxiliary_loss_mlp": 0.01041849, "balance_loss_clip": 1.07637262, "balance_loss_mlp": 1.02937961, "epoch": 0.07166476282089822, "flos": 22160690444160.0, "grad_norm": 1.6195154270940286, "language_loss": 0.96278137, "learning_rate": 3.981866645744819e-06, "loss": 0.98589909, "num_input_tokens_seen": 12716715, "step": 596, "time_per_iteration": 2.5581467151641846 }, { "auxiliary_loss_clip": 0.01272034, "auxiliary_loss_mlp": 0.00767275, "balance_loss_clip": 1.07455778, "balance_loss_mlp": 1.00038826, "epoch": 0.0717850057115373, "flos": 14136343925760.0, "grad_norm": 2.0429279469758113, "language_loss": 0.81561875, "learning_rate": 3.9817618370254416e-06, "loss": 0.83601189, "num_input_tokens_seen": 12733370, "step": 597, "time_per_iteration": 2.490833282470703 }, { "auxiliary_loss_clip": 0.01270889, "auxiliary_loss_mlp": 0.01050715, "balance_loss_clip": 1.07419133, "balance_loss_mlp": 1.03690469, "epoch": 0.0719052486021764, "flos": 30917794412160.0, "grad_norm": 2.035687491896235, "language_loss": 0.87465435, "learning_rate": 3.9816567276743684e-06, "loss": 0.89787042, "num_input_tokens_seen": 12753235, "step": 598, "time_per_iteration": 2.560598134994507 }, { "auxiliary_loss_clip": 0.01233461, "auxiliary_loss_mlp": 0.01042264, "balance_loss_clip": 1.06928658, "balance_loss_mlp": 1.02846587, "epoch": 0.0720254914928155, "flos": 21287019939840.0, "grad_norm": 2.1583285288588976, "language_loss": 0.77268213, "learning_rate": 3.9815513177075466e-06, "loss": 0.79543942, "num_input_tokens_seen": 12772020, "step": 599, "time_per_iteration": 2.5347886085510254 }, { "auxiliary_loss_clip": 0.01243331, "auxiliary_loss_mlp": 0.01043086, "balance_loss_clip": 1.06968951, "balance_loss_mlp": 1.0309267, "epoch": 0.07214573438345458, "flos": 27819170732160.0, "grad_norm": 1.6157611217976484, "language_loss": 0.70185947, "learning_rate": 3.9814456071409646e-06, "loss": 0.72472358, "num_input_tokens_seen": 12792555, "step": 600, "time_per_iteration": 2.6127421855926514 }, { "auxiliary_loss_clip": 0.01207243, "auxiliary_loss_mlp": 0.01054259, "balance_loss_clip": 1.06441975, "balance_loss_mlp": 1.03988266, "epoch": 0.07226597727409367, "flos": 25483576688640.0, "grad_norm": 3.3255513682903772, "language_loss": 0.85243708, "learning_rate": 3.981339595990659e-06, "loss": 0.87505209, "num_input_tokens_seen": 12811085, "step": 601, "time_per_iteration": 2.6214487552642822 }, { "auxiliary_loss_clip": 0.01252778, "auxiliary_loss_mlp": 0.01051634, "balance_loss_clip": 1.07165539, "balance_loss_mlp": 1.03695917, "epoch": 0.07238622016473276, "flos": 23513840622720.0, "grad_norm": 2.047304202507383, "language_loss": 0.80958486, "learning_rate": 3.981233284272713e-06, "loss": 0.83262897, "num_input_tokens_seen": 12830830, "step": 602, "time_per_iteration": 2.525503396987915 }, { "auxiliary_loss_clip": 0.0122027, "auxiliary_loss_mlp": 0.01042446, "balance_loss_clip": 1.06605244, "balance_loss_mlp": 1.03034663, "epoch": 0.07250646305537185, "flos": 25453519983360.0, "grad_norm": 2.1507699758599914, "language_loss": 0.90146458, "learning_rate": 3.981126672003253e-06, "loss": 0.92409182, "num_input_tokens_seen": 12853505, "step": 603, "time_per_iteration": 2.6561989784240723 }, { "auxiliary_loss_clip": 0.01239124, "auxiliary_loss_mlp": 0.01045948, "balance_loss_clip": 1.06450915, "balance_loss_mlp": 1.03307962, "epoch": 0.07262670594601094, "flos": 27155038216320.0, "grad_norm": 4.26484111265464, "language_loss": 0.78068703, "learning_rate": 3.981019759198451e-06, "loss": 0.80353779, "num_input_tokens_seen": 12872455, "step": 604, "time_per_iteration": 2.6214585304260254 }, { "auxiliary_loss_clip": 0.012373, "auxiliary_loss_mlp": 0.01048402, "balance_loss_clip": 1.06864572, "balance_loss_mlp": 1.03468108, "epoch": 0.07274694883665003, "flos": 26651607148800.0, "grad_norm": 2.466745686178935, "language_loss": 0.84249026, "learning_rate": 3.980912545874528e-06, "loss": 0.86534727, "num_input_tokens_seen": 12892620, "step": 605, "time_per_iteration": 2.586289405822754 }, { "auxiliary_loss_clip": 0.01248823, "auxiliary_loss_mlp": 0.007667, "balance_loss_clip": 1.07069409, "balance_loss_mlp": 1.00038338, "epoch": 0.07286719172728913, "flos": 29862344154240.0, "grad_norm": 1.8941259392666263, "language_loss": 0.85113156, "learning_rate": 3.980805032047746e-06, "loss": 0.87128687, "num_input_tokens_seen": 12914090, "step": 606, "time_per_iteration": 2.5893375873565674 }, { "auxiliary_loss_clip": 0.01230261, "auxiliary_loss_mlp": 0.01045049, "balance_loss_clip": 1.06495249, "balance_loss_mlp": 1.02938533, "epoch": 0.07298743461792821, "flos": 17382057799680.0, "grad_norm": 2.07065328759031, "language_loss": 0.81085038, "learning_rate": 3.980697217734415e-06, "loss": 0.8336035, "num_input_tokens_seen": 12931830, "step": 607, "time_per_iteration": 3.341517925262451 }, { "auxiliary_loss_clip": 0.01208733, "auxiliary_loss_mlp": 0.00766282, "balance_loss_clip": 1.06585383, "balance_loss_mlp": 1.00038433, "epoch": 0.07310767750856731, "flos": 19498201701120.0, "grad_norm": 3.847156902932048, "language_loss": 0.91369158, "learning_rate": 3.980589102950891e-06, "loss": 0.93344176, "num_input_tokens_seen": 12949995, "step": 608, "time_per_iteration": 2.586209535598755 }, { "auxiliary_loss_clip": 0.01237563, "auxiliary_loss_mlp": 0.01045981, "balance_loss_clip": 1.07175493, "balance_loss_mlp": 1.03175354, "epoch": 0.07322792039920639, "flos": 29168693637120.0, "grad_norm": 2.4805693067264927, "language_loss": 0.76412082, "learning_rate": 3.9804806877135755e-06, "loss": 0.78695619, "num_input_tokens_seen": 12968040, "step": 609, "time_per_iteration": 2.6467506885528564 }, { "auxiliary_loss_clip": 0.01258454, "auxiliary_loss_mlp": 0.00767288, "balance_loss_clip": 1.07046413, "balance_loss_mlp": 1.00043476, "epoch": 0.07334816328984549, "flos": 23477822259840.0, "grad_norm": 2.431415471061829, "language_loss": 0.85902739, "learning_rate": 3.980371972038915e-06, "loss": 0.87928486, "num_input_tokens_seen": 12988530, "step": 610, "time_per_iteration": 2.5906574726104736 }, { "auxiliary_loss_clip": 0.01270548, "auxiliary_loss_mlp": 0.01048316, "balance_loss_clip": 1.0740211, "balance_loss_mlp": 1.03436267, "epoch": 0.07346840618048459, "flos": 22962467877120.0, "grad_norm": 1.8416321857646867, "language_loss": 0.84427935, "learning_rate": 3.980262955943399e-06, "loss": 0.867468, "num_input_tokens_seen": 13008195, "step": 611, "time_per_iteration": 3.3923685550689697 }, { "auxiliary_loss_clip": 0.01229167, "auxiliary_loss_mlp": 0.01045747, "balance_loss_clip": 1.07031846, "balance_loss_mlp": 1.03315806, "epoch": 0.07358864907112367, "flos": 17673903803520.0, "grad_norm": 2.529981481700748, "language_loss": 0.86986494, "learning_rate": 3.980153639443569e-06, "loss": 0.89261407, "num_input_tokens_seen": 13024180, "step": 612, "time_per_iteration": 3.392343759536743 }, { "auxiliary_loss_clip": 0.01241957, "auxiliary_loss_mlp": 0.01045824, "balance_loss_clip": 1.07091379, "balance_loss_mlp": 1.03235972, "epoch": 0.07370889196176277, "flos": 24097029840000.0, "grad_norm": 2.0301275125406475, "language_loss": 0.8016212, "learning_rate": 3.980044022556005e-06, "loss": 0.82449901, "num_input_tokens_seen": 13043865, "step": 613, "time_per_iteration": 2.5738940238952637 }, { "auxiliary_loss_clip": 0.01253469, "auxiliary_loss_mlp": 0.01054136, "balance_loss_clip": 1.07109725, "balance_loss_mlp": 1.04085612, "epoch": 0.07382913485240185, "flos": 25885919905920.0, "grad_norm": 2.2365504979392195, "language_loss": 0.72833198, "learning_rate": 3.9799341052973375e-06, "loss": 0.75140798, "num_input_tokens_seen": 13063700, "step": 614, "time_per_iteration": 3.3543448448181152 }, { "auxiliary_loss_clip": 0.01238887, "auxiliary_loss_mlp": 0.01041745, "balance_loss_clip": 1.07326174, "balance_loss_mlp": 1.02729702, "epoch": 0.07394937774304094, "flos": 16873850223360.0, "grad_norm": 2.370760727714136, "language_loss": 0.74629426, "learning_rate": 3.979823887684241e-06, "loss": 0.76910055, "num_input_tokens_seen": 13082640, "step": 615, "time_per_iteration": 2.532843828201294 }, { "auxiliary_loss_clip": 0.01268693, "auxiliary_loss_mlp": 0.01048816, "balance_loss_clip": 1.07419467, "balance_loss_mlp": 1.03496361, "epoch": 0.07406962063368003, "flos": 20703471586560.0, "grad_norm": 6.031963824455091, "language_loss": 0.84616119, "learning_rate": 3.979713369733434e-06, "loss": 0.86933625, "num_input_tokens_seen": 13100505, "step": 616, "time_per_iteration": 2.4737789630889893 }, { "auxiliary_loss_clip": 0.01249145, "auxiliary_loss_mlp": 0.01053868, "balance_loss_clip": 1.07168019, "balance_loss_mlp": 1.04010522, "epoch": 0.07418986352431912, "flos": 21430985650560.0, "grad_norm": 2.1586248099889898, "language_loss": 0.84475207, "learning_rate": 3.979602551461683e-06, "loss": 0.86778224, "num_input_tokens_seen": 13121285, "step": 617, "time_per_iteration": 2.545459747314453 }, { "auxiliary_loss_clip": 0.01232963, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.06911814, "balance_loss_mlp": 1.03206229, "epoch": 0.07431010641495822, "flos": 12021133777920.0, "grad_norm": 2.211790457161869, "language_loss": 0.91610944, "learning_rate": 3.979491432885799e-06, "loss": 0.93889308, "num_input_tokens_seen": 13137550, "step": 618, "time_per_iteration": 2.505262613296509 }, { "auxiliary_loss_clip": 0.01199363, "auxiliary_loss_mlp": 0.00766176, "balance_loss_clip": 1.06141984, "balance_loss_mlp": 1.00033629, "epoch": 0.0744303493055973, "flos": 20957575374720.0, "grad_norm": 3.1984461305388243, "language_loss": 0.83165497, "learning_rate": 3.97938001402264e-06, "loss": 0.85131025, "num_input_tokens_seen": 13156675, "step": 619, "time_per_iteration": 2.5636367797851562 }, { "auxiliary_loss_clip": 0.01214358, "auxiliary_loss_mlp": 0.01044044, "balance_loss_clip": 1.06799436, "balance_loss_mlp": 1.0309844, "epoch": 0.0745505921962364, "flos": 16253134272000.0, "grad_norm": 2.762055545839595, "language_loss": 0.79720354, "learning_rate": 3.979268294889105e-06, "loss": 0.81978762, "num_input_tokens_seen": 13172225, "step": 620, "time_per_iteration": 2.5558083057403564 }, { "auxiliary_loss_clip": 0.01268026, "auxiliary_loss_mlp": 0.01045921, "balance_loss_clip": 1.07303298, "balance_loss_mlp": 1.03282547, "epoch": 0.07467083508687548, "flos": 50944635550080.0, "grad_norm": 1.8737530166268968, "language_loss": 0.7385307, "learning_rate": 3.979156275502143e-06, "loss": 0.76167017, "num_input_tokens_seen": 13195885, "step": 621, "time_per_iteration": 2.7360219955444336 }, { "auxiliary_loss_clip": 0.0122435, "auxiliary_loss_mlp": 0.01049868, "balance_loss_clip": 1.06858814, "balance_loss_mlp": 1.03562284, "epoch": 0.07479107797751458, "flos": 17529686697600.0, "grad_norm": 2.2542555759247103, "language_loss": 0.91447878, "learning_rate": 3.979043955878749e-06, "loss": 0.93722099, "num_input_tokens_seen": 13213730, "step": 622, "time_per_iteration": 2.5534796714782715 }, { "auxiliary_loss_clip": 0.0123374, "auxiliary_loss_mlp": 0.01041248, "balance_loss_clip": 1.06887007, "balance_loss_mlp": 1.02829587, "epoch": 0.07491132086815366, "flos": 23473943591040.0, "grad_norm": 2.323915351861144, "language_loss": 0.83450806, "learning_rate": 3.978931336035959e-06, "loss": 0.85725796, "num_input_tokens_seen": 13232540, "step": 623, "time_per_iteration": 2.5568387508392334 }, { "auxiliary_loss_clip": 0.0125447, "auxiliary_loss_mlp": 0.01048807, "balance_loss_clip": 1.07419753, "balance_loss_mlp": 1.03507972, "epoch": 0.07503156375879276, "flos": 20157557708160.0, "grad_norm": 2.1762620619161925, "language_loss": 0.82393086, "learning_rate": 3.9788184159908595e-06, "loss": 0.84696364, "num_input_tokens_seen": 13249670, "step": 624, "time_per_iteration": 2.513934373855591 }, { "auxiliary_loss_clip": 0.01230364, "auxiliary_loss_mlp": 0.01049586, "balance_loss_clip": 1.06845868, "balance_loss_mlp": 1.03695536, "epoch": 0.07515180664943186, "flos": 15115519653120.0, "grad_norm": 2.3595351152660653, "language_loss": 0.81973308, "learning_rate": 3.97870519576058e-06, "loss": 0.84253263, "num_input_tokens_seen": 13266095, "step": 625, "time_per_iteration": 2.5559334754943848 }, { "auxiliary_loss_clip": 0.01218141, "auxiliary_loss_mlp": 0.00767158, "balance_loss_clip": 1.06659019, "balance_loss_mlp": 1.0003103, "epoch": 0.07527204954007094, "flos": 21287702298240.0, "grad_norm": 2.801635387591967, "language_loss": 0.80702305, "learning_rate": 3.978591675362295e-06, "loss": 0.82687598, "num_input_tokens_seen": 13284810, "step": 626, "time_per_iteration": 2.595956802368164 }, { "auxiliary_loss_clip": 0.01201154, "auxiliary_loss_mlp": 0.01044592, "balance_loss_clip": 1.06840372, "balance_loss_mlp": 1.03134155, "epoch": 0.07539229243071004, "flos": 21324187537920.0, "grad_norm": 1.6730813388501025, "language_loss": 0.87549746, "learning_rate": 3.978477854813226e-06, "loss": 0.89795494, "num_input_tokens_seen": 13304150, "step": 627, "time_per_iteration": 2.600982666015625 }, { "auxiliary_loss_clip": 0.01253248, "auxiliary_loss_mlp": 0.01048684, "balance_loss_clip": 1.07085347, "balance_loss_mlp": 1.03615475, "epoch": 0.07551253532134912, "flos": 13042540920960.0, "grad_norm": 1.9558530616412129, "language_loss": 0.82543725, "learning_rate": 3.97836373413064e-06, "loss": 0.8484565, "num_input_tokens_seen": 13322205, "step": 628, "time_per_iteration": 2.5177395343780518 }, { "auxiliary_loss_clip": 0.01266088, "auxiliary_loss_mlp": 0.01043854, "balance_loss_clip": 1.0708847, "balance_loss_mlp": 1.03028774, "epoch": 0.07563277821198822, "flos": 19208761908480.0, "grad_norm": 1.9345176139540943, "language_loss": 0.74608195, "learning_rate": 3.978249313331848e-06, "loss": 0.76918143, "num_input_tokens_seen": 13340435, "step": 629, "time_per_iteration": 2.465623378753662 }, { "auxiliary_loss_clip": 0.01258161, "auxiliary_loss_mlp": 0.00766786, "balance_loss_clip": 1.06967878, "balance_loss_mlp": 1.00033998, "epoch": 0.07575302110262731, "flos": 19537200892800.0, "grad_norm": 3.378119891573478, "language_loss": 0.61919975, "learning_rate": 3.978134592434208e-06, "loss": 0.63944924, "num_input_tokens_seen": 13358185, "step": 630, "time_per_iteration": 2.564579486846924 }, { "auxiliary_loss_clip": 0.01114498, "auxiliary_loss_mlp": 0.01025124, "balance_loss_clip": 1.04549789, "balance_loss_mlp": 1.01830506, "epoch": 0.0758732639932664, "flos": 67961808017280.0, "grad_norm": 1.0213358593015807, "language_loss": 0.59412086, "learning_rate": 3.978019571455123e-06, "loss": 0.61551702, "num_input_tokens_seen": 13410130, "step": 631, "time_per_iteration": 3.167510509490967 }, { "auxiliary_loss_clip": 0.01265159, "auxiliary_loss_mlp": 0.01043403, "balance_loss_clip": 1.07312822, "balance_loss_mlp": 1.0312674, "epoch": 0.07599350688390549, "flos": 18989204025600.0, "grad_norm": 2.048854635764491, "language_loss": 0.84041762, "learning_rate": 3.977904250412042e-06, "loss": 0.86350322, "num_input_tokens_seen": 13429085, "step": 632, "time_per_iteration": 2.4932351112365723 }, { "auxiliary_loss_clip": 0.0121114, "auxiliary_loss_mlp": 0.01043917, "balance_loss_clip": 1.06574404, "balance_loss_mlp": 1.03105998, "epoch": 0.07611374977454458, "flos": 21069006341760.0, "grad_norm": 2.2991946161603662, "language_loss": 0.85610855, "learning_rate": 3.97778862932246e-06, "loss": 0.87865913, "num_input_tokens_seen": 13446250, "step": 633, "time_per_iteration": 2.5709784030914307 }, { "auxiliary_loss_clip": 0.01163959, "auxiliary_loss_mlp": 0.01039104, "balance_loss_clip": 1.05389297, "balance_loss_mlp": 1.02615809, "epoch": 0.07623399266518367, "flos": 18514536773760.0, "grad_norm": 2.2726817816136817, "language_loss": 0.94089937, "learning_rate": 3.9776727082039144e-06, "loss": 0.96293008, "num_input_tokens_seen": 13463220, "step": 634, "time_per_iteration": 3.648066520690918 }, { "auxiliary_loss_clip": 0.01176437, "auxiliary_loss_mlp": 0.01020059, "balance_loss_clip": 1.05035758, "balance_loss_mlp": 1.01281059, "epoch": 0.07635423555582276, "flos": 44663036077440.0, "grad_norm": 0.82104065496255, "language_loss": 0.55479491, "learning_rate": 3.977556487073991e-06, "loss": 0.57675982, "num_input_tokens_seen": 13517775, "step": 635, "time_per_iteration": 3.402374505996704 }, { "auxiliary_loss_clip": 0.01223577, "auxiliary_loss_mlp": 0.0104852, "balance_loss_clip": 1.06167388, "balance_loss_mlp": 1.03644371, "epoch": 0.07647447844646185, "flos": 21761148487680.0, "grad_norm": 1.946885550848919, "language_loss": 0.81662524, "learning_rate": 3.97743996595032e-06, "loss": 0.83934617, "num_input_tokens_seen": 13537815, "step": 636, "time_per_iteration": 2.5996227264404297 }, { "auxiliary_loss_clip": 0.01265604, "auxiliary_loss_mlp": 0.01049177, "balance_loss_clip": 1.0724721, "balance_loss_mlp": 1.03498518, "epoch": 0.07659472133710095, "flos": 23806799948160.0, "grad_norm": 1.6803141202601504, "language_loss": 0.8170054, "learning_rate": 3.9773231448505804e-06, "loss": 0.84015316, "num_input_tokens_seen": 13559605, "step": 637, "time_per_iteration": 2.537360191345215 }, { "auxiliary_loss_clip": 0.01230394, "auxiliary_loss_mlp": 0.0076715, "balance_loss_clip": 1.06851327, "balance_loss_mlp": 1.00029206, "epoch": 0.07671496422774003, "flos": 21469984842240.0, "grad_norm": 2.2124122697110415, "language_loss": 0.78051829, "learning_rate": 3.977206023792491e-06, "loss": 0.80049372, "num_input_tokens_seen": 13579495, "step": 638, "time_per_iteration": 3.310861349105835 }, { "auxiliary_loss_clip": 0.0125072, "auxiliary_loss_mlp": 0.01058308, "balance_loss_clip": 1.07406616, "balance_loss_mlp": 1.04571342, "epoch": 0.07683520711837913, "flos": 16980971558400.0, "grad_norm": 2.2335488895141045, "language_loss": 0.81255984, "learning_rate": 3.97708860279382e-06, "loss": 0.83565015, "num_input_tokens_seen": 13597605, "step": 639, "time_per_iteration": 3.3759865760803223 }, { "auxiliary_loss_clip": 0.01214848, "auxiliary_loss_mlp": 0.01053252, "balance_loss_clip": 1.06397736, "balance_loss_mlp": 1.03928065, "epoch": 0.07695545000901821, "flos": 23476744851840.0, "grad_norm": 1.8175020274815743, "language_loss": 0.78140259, "learning_rate": 3.97697088187238e-06, "loss": 0.80408359, "num_input_tokens_seen": 13618120, "step": 640, "time_per_iteration": 2.6446046829223633 }, { "auxiliary_loss_clip": 0.01230928, "auxiliary_loss_mlp": 0.01050001, "balance_loss_clip": 1.07108378, "balance_loss_mlp": 1.03768706, "epoch": 0.07707569289965731, "flos": 17634258167040.0, "grad_norm": 2.831044167807043, "language_loss": 0.92103654, "learning_rate": 3.976852861046029e-06, "loss": 0.94384587, "num_input_tokens_seen": 13634735, "step": 641, "time_per_iteration": 3.3709909915924072 }, { "auxiliary_loss_clip": 0.01201538, "auxiliary_loss_mlp": 0.01045002, "balance_loss_clip": 1.06465745, "balance_loss_mlp": 1.03225219, "epoch": 0.0771959357902964, "flos": 25775674087680.0, "grad_norm": 1.579703201189408, "language_loss": 0.80200726, "learning_rate": 3.97673454033267e-06, "loss": 0.82447267, "num_input_tokens_seen": 13656835, "step": 642, "time_per_iteration": 2.7125182151794434 }, { "auxiliary_loss_clip": 0.01229893, "auxiliary_loss_mlp": 0.01050755, "balance_loss_clip": 1.06527793, "balance_loss_mlp": 1.03763044, "epoch": 0.07731617868093549, "flos": 19828651847040.0, "grad_norm": 1.9451512243597169, "language_loss": 0.82321537, "learning_rate": 3.976615919750254e-06, "loss": 0.84602189, "num_input_tokens_seen": 13674535, "step": 643, "time_per_iteration": 2.5751240253448486 }, { "auxiliary_loss_clip": 0.01246034, "auxiliary_loss_mlp": 0.01049205, "balance_loss_clip": 1.06955707, "balance_loss_mlp": 1.03494167, "epoch": 0.07743642157157458, "flos": 21324654414720.0, "grad_norm": 2.091017236888172, "language_loss": 0.86677492, "learning_rate": 3.976496999316775e-06, "loss": 0.88972723, "num_input_tokens_seen": 13693290, "step": 644, "time_per_iteration": 2.5353384017944336 }, { "auxiliary_loss_clip": 0.01229732, "auxiliary_loss_mlp": 0.01049438, "balance_loss_clip": 1.07174492, "balance_loss_mlp": 1.03572929, "epoch": 0.07755666446221367, "flos": 19969133938560.0, "grad_norm": 2.1687205140379144, "language_loss": 0.84138197, "learning_rate": 3.976377779050271e-06, "loss": 0.86417365, "num_input_tokens_seen": 13711420, "step": 645, "time_per_iteration": 2.529776096343994 }, { "auxiliary_loss_clip": 0.01238387, "auxiliary_loss_mlp": 0.01052681, "balance_loss_clip": 1.06626809, "balance_loss_mlp": 1.03955626, "epoch": 0.07767690735285276, "flos": 23623224514560.0, "grad_norm": 2.204351951587358, "language_loss": 0.84668046, "learning_rate": 3.976258258968831e-06, "loss": 0.86959118, "num_input_tokens_seen": 13729965, "step": 646, "time_per_iteration": 2.5297060012817383 }, { "auxiliary_loss_clip": 0.01214031, "auxiliary_loss_mlp": 0.01049297, "balance_loss_clip": 1.06744981, "balance_loss_mlp": 1.03694677, "epoch": 0.07779715024349185, "flos": 22236246702720.0, "grad_norm": 2.2872897961186633, "language_loss": 0.7417531, "learning_rate": 3.976138439090583e-06, "loss": 0.76438642, "num_input_tokens_seen": 13748045, "step": 647, "time_per_iteration": 2.563839912414551 }, { "auxiliary_loss_clip": 0.01218976, "auxiliary_loss_mlp": 0.01041613, "balance_loss_clip": 1.06795955, "balance_loss_mlp": 1.02790403, "epoch": 0.07791739313413094, "flos": 20955097336320.0, "grad_norm": 2.080131721883903, "language_loss": 0.85294449, "learning_rate": 3.976018319433706e-06, "loss": 0.87555039, "num_input_tokens_seen": 13765590, "step": 648, "time_per_iteration": 2.6100010871887207 }, { "auxiliary_loss_clip": 0.01244642, "auxiliary_loss_mlp": 0.01045595, "balance_loss_clip": 1.06948924, "balance_loss_mlp": 1.03258371, "epoch": 0.07803763602477004, "flos": 19312327797120.0, "grad_norm": 2.919402372990991, "language_loss": 0.90911055, "learning_rate": 3.9758979000164205e-06, "loss": 0.93201292, "num_input_tokens_seen": 13782410, "step": 649, "time_per_iteration": 2.493290424346924 }, { "auxiliary_loss_clip": 0.01222736, "auxiliary_loss_mlp": 0.01038453, "balance_loss_clip": 1.06792569, "balance_loss_mlp": 1.02435088, "epoch": 0.07815787891540912, "flos": 22710806213760.0, "grad_norm": 1.9905593002458517, "language_loss": 0.71715081, "learning_rate": 3.975777180856995e-06, "loss": 0.73976272, "num_input_tokens_seen": 13801530, "step": 650, "time_per_iteration": 2.5844192504882812 }, { "auxiliary_loss_clip": 0.01269164, "auxiliary_loss_mlp": 0.01052243, "balance_loss_clip": 1.07210231, "balance_loss_mlp": 1.03818274, "epoch": 0.07827812180604822, "flos": 22711129436160.0, "grad_norm": 1.9965005916549772, "language_loss": 0.86122298, "learning_rate": 3.975656161973742e-06, "loss": 0.88443708, "num_input_tokens_seen": 13820615, "step": 651, "time_per_iteration": 2.4871466159820557 }, { "auxiliary_loss_clip": 0.01266259, "auxiliary_loss_mlp": 0.01048124, "balance_loss_clip": 1.0703311, "balance_loss_mlp": 1.03427768, "epoch": 0.0783983646966873, "flos": 21725597001600.0, "grad_norm": 2.5839022706032084, "language_loss": 0.89153659, "learning_rate": 3.9755348433850194e-06, "loss": 0.91468048, "num_input_tokens_seen": 13835955, "step": 652, "time_per_iteration": 2.4620301723480225 }, { "auxiliary_loss_clip": 0.01129248, "auxiliary_loss_mlp": 0.0102131, "balance_loss_clip": 1.03415215, "balance_loss_mlp": 1.01501548, "epoch": 0.0785186075873264, "flos": 60640877537280.0, "grad_norm": 0.9887619338597119, "language_loss": 0.63607579, "learning_rate": 3.975413225109232e-06, "loss": 0.65758133, "num_input_tokens_seen": 13896505, "step": 653, "time_per_iteration": 3.128957748413086 }, { "auxiliary_loss_clip": 0.01247449, "auxiliary_loss_mlp": 0.01043969, "balance_loss_clip": 1.06966519, "balance_loss_mlp": 1.03072464, "epoch": 0.0786388504779655, "flos": 23877902920320.0, "grad_norm": 3.090177528024159, "language_loss": 0.93495977, "learning_rate": 3.975291307164829e-06, "loss": 0.95787394, "num_input_tokens_seen": 13915150, "step": 654, "time_per_iteration": 2.518588066101074 }, { "auxiliary_loss_clip": 0.01203009, "auxiliary_loss_mlp": 0.01042327, "balance_loss_clip": 1.06245482, "balance_loss_mlp": 1.03022122, "epoch": 0.07875909336860458, "flos": 15158684822400.0, "grad_norm": 2.3712801821022276, "language_loss": 0.85193503, "learning_rate": 3.975169089570306e-06, "loss": 0.8743884, "num_input_tokens_seen": 13933525, "step": 655, "time_per_iteration": 2.5760207176208496 }, { "auxiliary_loss_clip": 0.01231263, "auxiliary_loss_mlp": 0.01044512, "balance_loss_clip": 1.06589484, "balance_loss_mlp": 1.03122044, "epoch": 0.07887933625924368, "flos": 22236857233920.0, "grad_norm": 2.017975083997081, "language_loss": 0.91828609, "learning_rate": 3.975046572344202e-06, "loss": 0.94104385, "num_input_tokens_seen": 13949985, "step": 656, "time_per_iteration": 2.5114076137542725 }, { "auxiliary_loss_clip": 0.01208942, "auxiliary_loss_mlp": 0.0105043, "balance_loss_clip": 1.06128097, "balance_loss_mlp": 1.0372932, "epoch": 0.07899957914988276, "flos": 20777734955520.0, "grad_norm": 1.9462314733049724, "language_loss": 0.71072853, "learning_rate": 3.974923755505103e-06, "loss": 0.73332226, "num_input_tokens_seen": 13969215, "step": 657, "time_per_iteration": 2.5772500038146973 }, { "auxiliary_loss_clip": 0.01204601, "auxiliary_loss_mlp": 0.01043775, "balance_loss_clip": 1.06278753, "balance_loss_mlp": 1.03082943, "epoch": 0.07911982204052186, "flos": 23003047267200.0, "grad_norm": 1.7194952265079781, "language_loss": 0.91309035, "learning_rate": 3.974800639071641e-06, "loss": 0.93557405, "num_input_tokens_seen": 13989935, "step": 658, "time_per_iteration": 2.632525682449341 }, { "auxiliary_loss_clip": 0.01170711, "auxiliary_loss_mlp": 0.0076629, "balance_loss_clip": 1.05856371, "balance_loss_mlp": 1.00033998, "epoch": 0.07924006493116094, "flos": 23111389664640.0, "grad_norm": 2.136139036446059, "language_loss": 1.00600171, "learning_rate": 3.974677223062492e-06, "loss": 1.02537167, "num_input_tokens_seen": 14007150, "step": 659, "time_per_iteration": 2.723534107208252 }, { "auxiliary_loss_clip": 0.0122892, "auxiliary_loss_mlp": 0.01038667, "balance_loss_clip": 1.06812882, "balance_loss_mlp": 1.02604234, "epoch": 0.07936030782180004, "flos": 16472153450880.0, "grad_norm": 1.9440289711396954, "language_loss": 0.74405819, "learning_rate": 3.974553507496378e-06, "loss": 0.76673406, "num_input_tokens_seen": 14025725, "step": 660, "time_per_iteration": 3.3443922996520996 }, { "auxiliary_loss_clip": 0.01222159, "auxiliary_loss_mlp": 0.01042804, "balance_loss_clip": 1.0670166, "balance_loss_mlp": 1.02789712, "epoch": 0.07948055071243913, "flos": 23733290764800.0, "grad_norm": 2.3643717075132424, "language_loss": 0.88975132, "learning_rate": 3.974429492392068e-06, "loss": 0.91240096, "num_input_tokens_seen": 14045750, "step": 661, "time_per_iteration": 2.5975606441497803 }, { "auxiliary_loss_clip": 0.01260346, "auxiliary_loss_mlp": 0.00766076, "balance_loss_clip": 1.07208467, "balance_loss_mlp": 1.0003016, "epoch": 0.07960079360307822, "flos": 19573326996480.0, "grad_norm": 2.0034913793044646, "language_loss": 0.91186309, "learning_rate": 3.974305177768373e-06, "loss": 0.93212724, "num_input_tokens_seen": 14063960, "step": 662, "time_per_iteration": 2.5311028957366943 }, { "auxiliary_loss_clip": 0.0120549, "auxiliary_loss_mlp": 0.01049673, "balance_loss_clip": 1.06485152, "balance_loss_mlp": 1.03619099, "epoch": 0.07972103649371731, "flos": 23513409659520.0, "grad_norm": 3.2858681107034564, "language_loss": 0.86359072, "learning_rate": 3.974180563644152e-06, "loss": 0.88614237, "num_input_tokens_seen": 14082525, "step": 663, "time_per_iteration": 2.5958173274993896 }, { "auxiliary_loss_clip": 0.01234112, "auxiliary_loss_mlp": 0.01045875, "balance_loss_clip": 1.06768823, "balance_loss_mlp": 1.03302395, "epoch": 0.0798412793843564, "flos": 16726867770240.0, "grad_norm": 2.529548067039062, "language_loss": 0.89450055, "learning_rate": 3.97405565003831e-06, "loss": 0.91730046, "num_input_tokens_seen": 14098610, "step": 664, "time_per_iteration": 3.256565570831299 }, { "auxiliary_loss_clip": 0.01213461, "auxiliary_loss_mlp": 0.01037467, "balance_loss_clip": 1.06377316, "balance_loss_mlp": 1.02453303, "epoch": 0.07996152227499549, "flos": 18223337214720.0, "grad_norm": 1.9961695254687806, "language_loss": 0.78604126, "learning_rate": 3.973930436969794e-06, "loss": 0.80855048, "num_input_tokens_seen": 14117065, "step": 665, "time_per_iteration": 2.5939548015594482 }, { "auxiliary_loss_clip": 0.01220187, "auxiliary_loss_mlp": 0.01050269, "balance_loss_clip": 1.06355524, "balance_loss_mlp": 1.03709602, "epoch": 0.08008176516563459, "flos": 20594877793920.0, "grad_norm": 1.9619162619694925, "language_loss": 0.85930985, "learning_rate": 3.973804924457602e-06, "loss": 0.88201445, "num_input_tokens_seen": 14135145, "step": 666, "time_per_iteration": 3.353271484375 }, { "auxiliary_loss_clip": 0.01221835, "auxiliary_loss_mlp": 0.01058227, "balance_loss_clip": 1.06583881, "balance_loss_mlp": 1.04523349, "epoch": 0.08020200805627367, "flos": 31834306863360.0, "grad_norm": 1.9799605433536214, "language_loss": 0.85940409, "learning_rate": 3.973679112520771e-06, "loss": 0.88220471, "num_input_tokens_seen": 14156860, "step": 667, "time_per_iteration": 2.6472995281219482 }, { "auxiliary_loss_clip": 0.01203249, "auxiliary_loss_mlp": 0.01038871, "balance_loss_clip": 1.06058013, "balance_loss_mlp": 1.02604365, "epoch": 0.08032225094691277, "flos": 17783503176960.0, "grad_norm": 1.9834034320635612, "language_loss": 0.99088603, "learning_rate": 3.973553001178389e-06, "loss": 1.01330733, "num_input_tokens_seen": 14174365, "step": 668, "time_per_iteration": 3.356459379196167 }, { "auxiliary_loss_clip": 0.01215875, "auxiliary_loss_mlp": 0.01037173, "balance_loss_clip": 1.06615376, "balance_loss_mlp": 1.02455425, "epoch": 0.08044249383755185, "flos": 24061693835520.0, "grad_norm": 2.304705174710886, "language_loss": 0.75877726, "learning_rate": 3.973426590449585e-06, "loss": 0.7813077, "num_input_tokens_seen": 14192320, "step": 669, "time_per_iteration": 2.5939626693725586 }, { "auxiliary_loss_clip": 0.01198574, "auxiliary_loss_mlp": 0.01039915, "balance_loss_clip": 1.06314945, "balance_loss_mlp": 1.02717161, "epoch": 0.08056273672819095, "flos": 18223624523520.0, "grad_norm": 1.8725881784779344, "language_loss": 0.75441134, "learning_rate": 3.9732998803535364e-06, "loss": 0.77679622, "num_input_tokens_seen": 14210380, "step": 670, "time_per_iteration": 2.5770716667175293 }, { "auxiliary_loss_clip": 0.01262434, "auxiliary_loss_mlp": 0.01044146, "balance_loss_clip": 1.0697782, "balance_loss_mlp": 1.03130746, "epoch": 0.08068297961883003, "flos": 19676856971520.0, "grad_norm": 2.540563615216749, "language_loss": 0.85565472, "learning_rate": 3.973172870909465e-06, "loss": 0.87872052, "num_input_tokens_seen": 14225145, "step": 671, "time_per_iteration": 2.4702680110931396 }, { "auxiliary_loss_clip": 0.01235763, "auxiliary_loss_mlp": 0.01040664, "balance_loss_clip": 1.06584334, "balance_loss_mlp": 1.02686596, "epoch": 0.08080322250946913, "flos": 23148736830720.0, "grad_norm": 2.9818086536384993, "language_loss": 0.80207551, "learning_rate": 3.973045562136638e-06, "loss": 0.82483977, "num_input_tokens_seen": 14241960, "step": 672, "time_per_iteration": 2.5467593669891357 }, { "auxiliary_loss_clip": 0.01249875, "auxiliary_loss_mlp": 0.01041783, "balance_loss_clip": 1.06946349, "balance_loss_mlp": 1.02912903, "epoch": 0.08092346540010822, "flos": 21763626526080.0, "grad_norm": 2.2014684530543898, "language_loss": 0.91550684, "learning_rate": 3.972917954054368e-06, "loss": 0.93842345, "num_input_tokens_seen": 14260515, "step": 673, "time_per_iteration": 2.5179567337036133 }, { "auxiliary_loss_clip": 0.01229938, "auxiliary_loss_mlp": 0.01047212, "balance_loss_clip": 1.07049966, "balance_loss_mlp": 1.03244793, "epoch": 0.08104370829074731, "flos": 21032485188480.0, "grad_norm": 2.1421099713565432, "language_loss": 0.81740284, "learning_rate": 3.972790046682013e-06, "loss": 0.84017432, "num_input_tokens_seen": 14279190, "step": 674, "time_per_iteration": 2.5228404998779297 }, { "auxiliary_loss_clip": 0.01214516, "auxiliary_loss_mlp": 0.01041553, "balance_loss_clip": 1.06217706, "balance_loss_mlp": 1.02849913, "epoch": 0.0811639511813864, "flos": 20083186598400.0, "grad_norm": 1.803704715733776, "language_loss": 0.78986847, "learning_rate": 3.972661840038977e-06, "loss": 0.81242919, "num_input_tokens_seen": 14299480, "step": 675, "time_per_iteration": 2.58609676361084 }, { "auxiliary_loss_clip": 0.01248848, "auxiliary_loss_mlp": 0.01040152, "balance_loss_clip": 1.07150388, "balance_loss_mlp": 1.02758765, "epoch": 0.08128419407202549, "flos": 16836718538880.0, "grad_norm": 2.1729115301359716, "language_loss": 0.83291638, "learning_rate": 3.972533334144707e-06, "loss": 0.85580635, "num_input_tokens_seen": 14316405, "step": 676, "time_per_iteration": 2.4742016792297363 }, { "auxiliary_loss_clip": 0.01250297, "auxiliary_loss_mlp": 0.01043874, "balance_loss_clip": 1.06717348, "balance_loss_mlp": 1.03067183, "epoch": 0.08140443696266458, "flos": 23769273214080.0, "grad_norm": 2.422506621434802, "language_loss": 0.783234, "learning_rate": 3.972404529018699e-06, "loss": 0.80617571, "num_input_tokens_seen": 14336265, "step": 677, "time_per_iteration": 2.538540840148926 }, { "auxiliary_loss_clip": 0.01223953, "auxiliary_loss_mlp": 0.01035828, "balance_loss_clip": 1.06134868, "balance_loss_mlp": 1.02343643, "epoch": 0.08152467985330367, "flos": 24390132819840.0, "grad_norm": 1.984735276641255, "language_loss": 0.85500354, "learning_rate": 3.972275424680493e-06, "loss": 0.87760133, "num_input_tokens_seen": 14356375, "step": 678, "time_per_iteration": 2.5549771785736084 }, { "auxiliary_loss_clip": 0.01259167, "auxiliary_loss_mlp": 0.01037917, "balance_loss_clip": 1.06892157, "balance_loss_mlp": 1.0253942, "epoch": 0.08164492274394276, "flos": 19317750750720.0, "grad_norm": 2.101815434680312, "language_loss": 0.91848153, "learning_rate": 3.972146021149673e-06, "loss": 0.94145238, "num_input_tokens_seen": 14374650, "step": 679, "time_per_iteration": 2.4407427310943604 }, { "auxiliary_loss_clip": 0.01213828, "auxiliary_loss_mlp": 0.01040496, "balance_loss_clip": 1.06540799, "balance_loss_mlp": 1.02911127, "epoch": 0.08176516563458186, "flos": 14830461319680.0, "grad_norm": 3.178083159867089, "language_loss": 0.7836771, "learning_rate": 3.972016318445868e-06, "loss": 0.80622035, "num_input_tokens_seen": 14392650, "step": 680, "time_per_iteration": 2.5333454608917236 }, { "auxiliary_loss_clip": 0.01243865, "auxiliary_loss_mlp": 0.01046605, "balance_loss_clip": 1.0676018, "balance_loss_mlp": 1.03423095, "epoch": 0.08188540852522094, "flos": 22602320161920.0, "grad_norm": 2.239080391683848, "language_loss": 0.92591882, "learning_rate": 3.971886316588757e-06, "loss": 0.94882351, "num_input_tokens_seen": 14413155, "step": 681, "time_per_iteration": 2.5087900161743164 }, { "auxiliary_loss_clip": 0.01204447, "auxiliary_loss_mlp": 0.01049683, "balance_loss_clip": 1.06499052, "balance_loss_mlp": 1.03581882, "epoch": 0.08200565141586004, "flos": 19463727623040.0, "grad_norm": 2.4840795439774612, "language_loss": 0.73761892, "learning_rate": 3.9717560155980595e-06, "loss": 0.76016021, "num_input_tokens_seen": 14428805, "step": 682, "time_per_iteration": 2.557297706604004 }, { "auxiliary_loss_clip": 0.01245373, "auxiliary_loss_mlp": 0.01045271, "balance_loss_clip": 1.06866574, "balance_loss_mlp": 1.03249168, "epoch": 0.08212589430649912, "flos": 20594662312320.0, "grad_norm": 2.6635650868617953, "language_loss": 0.92139184, "learning_rate": 3.971625415493542e-06, "loss": 0.94429833, "num_input_tokens_seen": 14447125, "step": 683, "time_per_iteration": 2.495816469192505 }, { "auxiliary_loss_clip": 0.0120769, "auxiliary_loss_mlp": 0.01042311, "balance_loss_clip": 1.06395745, "balance_loss_mlp": 1.02907932, "epoch": 0.08224613719713822, "flos": 25953611086080.0, "grad_norm": 1.8735624448812276, "language_loss": 0.87464941, "learning_rate": 3.971494516295017e-06, "loss": 0.89714932, "num_input_tokens_seen": 14466575, "step": 684, "time_per_iteration": 2.5975239276885986 }, { "auxiliary_loss_clip": 0.01216489, "auxiliary_loss_mlp": 0.0104531, "balance_loss_clip": 1.06358957, "balance_loss_mlp": 1.03220904, "epoch": 0.08236638008777732, "flos": 23768734510080.0, "grad_norm": 1.977049678289977, "language_loss": 0.85380328, "learning_rate": 3.971363318022341e-06, "loss": 0.87642121, "num_input_tokens_seen": 14487915, "step": 685, "time_per_iteration": 2.5909695625305176 }, { "auxiliary_loss_clip": 0.0123084, "auxiliary_loss_mlp": 0.01047554, "balance_loss_clip": 1.06277061, "balance_loss_mlp": 1.03398764, "epoch": 0.0824866229784164, "flos": 38799144887040.0, "grad_norm": 2.320995925729637, "language_loss": 0.68715346, "learning_rate": 3.971231820695417e-06, "loss": 0.70993733, "num_input_tokens_seen": 14511530, "step": 686, "time_per_iteration": 2.6729483604431152 }, { "auxiliary_loss_clip": 0.01236806, "auxiliary_loss_mlp": 0.01045352, "balance_loss_clip": 1.06897497, "balance_loss_mlp": 1.03226888, "epoch": 0.0826068658690555, "flos": 23107762391040.0, "grad_norm": 1.7689594334367413, "language_loss": 0.81249619, "learning_rate": 3.971100024334193e-06, "loss": 0.83531773, "num_input_tokens_seen": 14529050, "step": 687, "time_per_iteration": 3.4973368644714355 }, { "auxiliary_loss_clip": 0.01195503, "auxiliary_loss_mlp": 0.01045408, "balance_loss_clip": 1.05743408, "balance_loss_mlp": 1.03322434, "epoch": 0.08272710875969458, "flos": 21136374299520.0, "grad_norm": 2.556138440075114, "language_loss": 0.86235952, "learning_rate": 3.970967928958663e-06, "loss": 0.88476861, "num_input_tokens_seen": 14546165, "step": 688, "time_per_iteration": 2.646243095397949 }, { "auxiliary_loss_clip": 0.01202286, "auxiliary_loss_mlp": 0.01048323, "balance_loss_clip": 1.06436276, "balance_loss_mlp": 1.03584182, "epoch": 0.08284735165033368, "flos": 19063000517760.0, "grad_norm": 1.6641429196113793, "language_loss": 0.83409357, "learning_rate": 3.970835534588865e-06, "loss": 0.85659969, "num_input_tokens_seen": 14563660, "step": 689, "time_per_iteration": 2.590881109237671 }, { "auxiliary_loss_clip": 0.01231876, "auxiliary_loss_mlp": 0.0104943, "balance_loss_clip": 1.06994677, "balance_loss_mlp": 1.03721166, "epoch": 0.08296759454097276, "flos": 16727442387840.0, "grad_norm": 1.9299228935292398, "language_loss": 0.85633647, "learning_rate": 3.970702841244883e-06, "loss": 0.87914944, "num_input_tokens_seen": 14581980, "step": 690, "time_per_iteration": 2.5300991535186768 }, { "auxiliary_loss_clip": 0.01249433, "auxiliary_loss_mlp": 0.01044814, "balance_loss_clip": 1.07060182, "balance_loss_mlp": 1.03250003, "epoch": 0.08308783743161186, "flos": 18004928567040.0, "grad_norm": 2.3234322801547704, "language_loss": 0.82538581, "learning_rate": 3.970569848946847e-06, "loss": 0.84832823, "num_input_tokens_seen": 14601795, "step": 691, "time_per_iteration": 3.330634355545044 }, { "auxiliary_loss_clip": 0.01230133, "auxiliary_loss_mlp": 0.01039325, "balance_loss_clip": 1.06582785, "balance_loss_mlp": 1.02703404, "epoch": 0.08320808032225095, "flos": 15079788599040.0, "grad_norm": 2.413250600106508, "language_loss": 0.82537591, "learning_rate": 3.970436557714932e-06, "loss": 0.8480705, "num_input_tokens_seen": 14618315, "step": 692, "time_per_iteration": 2.54731822013855 }, { "auxiliary_loss_clip": 0.01222118, "auxiliary_loss_mlp": 0.01039079, "balance_loss_clip": 1.06346726, "balance_loss_mlp": 1.02607346, "epoch": 0.08332832321289003, "flos": 22383085501440.0, "grad_norm": 8.100572751541788, "language_loss": 0.86280978, "learning_rate": 3.970302967569358e-06, "loss": 0.88542175, "num_input_tokens_seen": 14636905, "step": 693, "time_per_iteration": 3.327517509460449 }, { "auxiliary_loss_clip": 0.01247305, "auxiliary_loss_mlp": 0.01045359, "balance_loss_clip": 1.07215595, "balance_loss_mlp": 1.0325501, "epoch": 0.08344856610352913, "flos": 24717386655360.0, "grad_norm": 1.901300033673213, "language_loss": 0.68107098, "learning_rate": 3.9701690785303896e-06, "loss": 0.70399761, "num_input_tokens_seen": 14656100, "step": 694, "time_per_iteration": 2.547891139984131 }, { "auxiliary_loss_clip": 0.01250563, "auxiliary_loss_mlp": 0.01042239, "balance_loss_clip": 1.06959915, "balance_loss_mlp": 1.02993703, "epoch": 0.08356880899416821, "flos": 25370206387200.0, "grad_norm": 2.2751908071743117, "language_loss": 0.88450772, "learning_rate": 3.970034890618339e-06, "loss": 0.90743572, "num_input_tokens_seen": 14675790, "step": 695, "time_per_iteration": 3.253553867340088 }, { "auxiliary_loss_clip": 0.01229564, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.06536865, "balance_loss_mlp": 1.02533412, "epoch": 0.08368905188480731, "flos": 24353072962560.0, "grad_norm": 1.9940795528678446, "language_loss": 0.88021863, "learning_rate": 3.969900403853562e-06, "loss": 0.90288603, "num_input_tokens_seen": 14694830, "step": 696, "time_per_iteration": 2.5634326934814453 }, { "auxiliary_loss_clip": 0.01265723, "auxiliary_loss_mlp": 0.01052848, "balance_loss_clip": 1.07358658, "balance_loss_mlp": 1.03993201, "epoch": 0.08380929477544641, "flos": 18037319656320.0, "grad_norm": 1.5595384805472767, "language_loss": 0.77765125, "learning_rate": 3.96976561825646e-06, "loss": 0.80083692, "num_input_tokens_seen": 14711920, "step": 697, "time_per_iteration": 2.472980260848999 }, { "auxiliary_loss_clip": 0.01198585, "auxiliary_loss_mlp": 0.01038486, "balance_loss_clip": 1.06354189, "balance_loss_mlp": 1.0268271, "epoch": 0.08392953766608549, "flos": 26286287875200.0, "grad_norm": 2.1700018346639642, "language_loss": 0.87154067, "learning_rate": 3.969630533847479e-06, "loss": 0.89391136, "num_input_tokens_seen": 14730880, "step": 698, "time_per_iteration": 2.649151563644409 }, { "auxiliary_loss_clip": 0.01246313, "auxiliary_loss_mlp": 0.01039335, "balance_loss_clip": 1.06890726, "balance_loss_mlp": 1.02702081, "epoch": 0.08404978055672459, "flos": 22492146170880.0, "grad_norm": 2.1303424371162727, "language_loss": 0.84012669, "learning_rate": 3.969495150647113e-06, "loss": 0.86298317, "num_input_tokens_seen": 14749050, "step": 699, "time_per_iteration": 2.5055344104766846 }, { "auxiliary_loss_clip": 0.01210867, "auxiliary_loss_mlp": 0.01037475, "balance_loss_clip": 1.06637263, "balance_loss_mlp": 1.02541149, "epoch": 0.08417002344736367, "flos": 24826878288000.0, "grad_norm": 1.7501081576618442, "language_loss": 0.76538599, "learning_rate": 3.969359468675899e-06, "loss": 0.78786945, "num_input_tokens_seen": 14769180, "step": 700, "time_per_iteration": 2.6334521770477295 }, { "auxiliary_loss_clip": 0.01241755, "auxiliary_loss_mlp": 0.01038554, "balance_loss_clip": 1.06796682, "balance_loss_mlp": 1.02598906, "epoch": 0.08429026633800277, "flos": 16945922862720.0, "grad_norm": 2.099904727483784, "language_loss": 0.89646226, "learning_rate": 3.969223487954418e-06, "loss": 0.91926539, "num_input_tokens_seen": 14786640, "step": 701, "time_per_iteration": 2.493617057800293 }, { "auxiliary_loss_clip": 0.01199374, "auxiliary_loss_mlp": 0.01045654, "balance_loss_clip": 1.06746805, "balance_loss_mlp": 1.03342295, "epoch": 0.08441050922864185, "flos": 23841920471040.0, "grad_norm": 2.2777933330410383, "language_loss": 0.82761592, "learning_rate": 3.969087208503301e-06, "loss": 0.85006618, "num_input_tokens_seen": 14806720, "step": 702, "time_per_iteration": 2.6200878620147705 }, { "auxiliary_loss_clip": 0.01201294, "auxiliary_loss_mlp": 0.01043453, "balance_loss_clip": 1.06502533, "balance_loss_mlp": 1.03105485, "epoch": 0.08453075211928095, "flos": 25520205582720.0, "grad_norm": 2.426950836105903, "language_loss": 0.84487534, "learning_rate": 3.968950630343219e-06, "loss": 0.8673228, "num_input_tokens_seen": 14823705, "step": 703, "time_per_iteration": 2.6058740615844727 }, { "auxiliary_loss_clip": 0.01226597, "auxiliary_loss_mlp": 0.01044622, "balance_loss_clip": 1.06400526, "balance_loss_mlp": 1.03252172, "epoch": 0.08465099500992004, "flos": 19532496211200.0, "grad_norm": 1.9550391882105205, "language_loss": 0.93532425, "learning_rate": 3.968813753494892e-06, "loss": 0.95803642, "num_input_tokens_seen": 14841865, "step": 704, "time_per_iteration": 2.548114538192749 }, { "auxiliary_loss_clip": 0.01200686, "auxiliary_loss_mlp": 0.00766973, "balance_loss_clip": 1.05830419, "balance_loss_mlp": 1.00026751, "epoch": 0.08477123790055913, "flos": 29351299403520.0, "grad_norm": 2.5948731469874735, "language_loss": 0.75383914, "learning_rate": 3.968676577979084e-06, "loss": 0.77351582, "num_input_tokens_seen": 14861415, "step": 705, "time_per_iteration": 2.655470609664917 }, { "auxiliary_loss_clip": 0.0118913, "auxiliary_loss_mlp": 0.01050092, "balance_loss_clip": 1.05848932, "balance_loss_mlp": 1.03756881, "epoch": 0.08489148079119822, "flos": 18624495283200.0, "grad_norm": 13.204197203013443, "language_loss": 0.78260064, "learning_rate": 3.968539103816605e-06, "loss": 0.80499291, "num_input_tokens_seen": 14879215, "step": 706, "time_per_iteration": 2.5678460597991943 }, { "auxiliary_loss_clip": 0.01228512, "auxiliary_loss_mlp": 0.00766381, "balance_loss_clip": 1.06937885, "balance_loss_mlp": 1.00028443, "epoch": 0.0850117236818373, "flos": 23471393725440.0, "grad_norm": 1.8222376117592827, "language_loss": 0.89680481, "learning_rate": 3.9684013310283085e-06, "loss": 0.91675377, "num_input_tokens_seen": 14897900, "step": 707, "time_per_iteration": 2.6264874935150146 }, { "auxiliary_loss_clip": 0.01224558, "auxiliary_loss_mlp": 0.01050407, "balance_loss_clip": 1.06930017, "balance_loss_mlp": 1.03790176, "epoch": 0.0851319665724764, "flos": 40625058896640.0, "grad_norm": 1.8237705906278956, "language_loss": 0.63923478, "learning_rate": 3.9682632596350956e-06, "loss": 0.66198444, "num_input_tokens_seen": 14919065, "step": 708, "time_per_iteration": 2.7082176208496094 }, { "auxiliary_loss_clip": 0.01240995, "auxiliary_loss_mlp": 0.01039717, "balance_loss_clip": 1.06945312, "balance_loss_mlp": 1.02738512, "epoch": 0.0852522094631155, "flos": 15879554870400.0, "grad_norm": 2.556451634365381, "language_loss": 0.78258598, "learning_rate": 3.968124889657911e-06, "loss": 0.80539304, "num_input_tokens_seen": 14934165, "step": 709, "time_per_iteration": 2.4888973236083984 }, { "auxiliary_loss_clip": 0.01193257, "auxiliary_loss_mlp": 0.0104827, "balance_loss_clip": 1.06094432, "balance_loss_mlp": 1.03685558, "epoch": 0.08537245235375458, "flos": 14567091822720.0, "grad_norm": 2.1951517339270437, "language_loss": 0.90588379, "learning_rate": 3.967986221117746e-06, "loss": 0.92829907, "num_input_tokens_seen": 14950105, "step": 710, "time_per_iteration": 2.567375421524048 }, { "auxiliary_loss_clip": 0.01169918, "auxiliary_loss_mlp": 0.01038101, "balance_loss_clip": 1.05987644, "balance_loss_mlp": 1.02613211, "epoch": 0.08549269524439368, "flos": 26468929555200.0, "grad_norm": 1.762491144624555, "language_loss": 0.86440134, "learning_rate": 3.967847254035635e-06, "loss": 0.88648152, "num_input_tokens_seen": 14969490, "step": 711, "time_per_iteration": 2.763526201248169 }, { "auxiliary_loss_clip": 0.01210719, "auxiliary_loss_mlp": 0.01041439, "balance_loss_clip": 1.0624547, "balance_loss_mlp": 1.02914262, "epoch": 0.08561293813503276, "flos": 13590214565760.0, "grad_norm": 2.1984781828452333, "language_loss": 0.86634451, "learning_rate": 3.967707988432661e-06, "loss": 0.88886607, "num_input_tokens_seen": 14987195, "step": 712, "time_per_iteration": 2.5990025997161865 }, { "auxiliary_loss_clip": 0.01261284, "auxiliary_loss_mlp": 0.01046279, "balance_loss_clip": 1.06909609, "balance_loss_mlp": 1.03321981, "epoch": 0.08573318102567186, "flos": 26943524979840.0, "grad_norm": 2.3147700761463725, "language_loss": 0.87544751, "learning_rate": 3.967568424329949e-06, "loss": 0.89852315, "num_input_tokens_seen": 15007620, "step": 713, "time_per_iteration": 3.3077073097229004 }, { "auxiliary_loss_clip": 0.01133771, "auxiliary_loss_mlp": 0.01011697, "balance_loss_clip": 1.03357041, "balance_loss_mlp": 1.00630915, "epoch": 0.08585342391631094, "flos": 67302739319040.0, "grad_norm": 0.8213084999241819, "language_loss": 0.55536014, "learning_rate": 3.967428561748671e-06, "loss": 0.57681483, "num_input_tokens_seen": 15075590, "step": 714, "time_per_iteration": 3.2335643768310547 }, { "auxiliary_loss_clip": 0.01188326, "auxiliary_loss_mlp": 0.01046159, "balance_loss_clip": 1.05715561, "balance_loss_mlp": 1.03285527, "epoch": 0.08597366680695004, "flos": 22456594684800.0, "grad_norm": 1.7662692165734508, "language_loss": 0.87422907, "learning_rate": 3.967288400710045e-06, "loss": 0.8965739, "num_input_tokens_seen": 15095055, "step": 715, "time_per_iteration": 2.6219892501831055 }, { "auxiliary_loss_clip": 0.0120543, "auxiliary_loss_mlp": 0.01042058, "balance_loss_clip": 1.06550384, "balance_loss_mlp": 1.02917147, "epoch": 0.08609390969758914, "flos": 23550505430400.0, "grad_norm": 6.4115412399782326, "language_loss": 0.8852272, "learning_rate": 3.9671479412353335e-06, "loss": 0.90770209, "num_input_tokens_seen": 15113520, "step": 716, "time_per_iteration": 2.5895464420318604 }, { "auxiliary_loss_clip": 0.01245746, "auxiliary_loss_mlp": 0.01046487, "balance_loss_clip": 1.07027149, "balance_loss_mlp": 1.03395176, "epoch": 0.08621415258822822, "flos": 25885848078720.0, "grad_norm": 2.286610213923094, "language_loss": 0.74080592, "learning_rate": 3.967007183345843e-06, "loss": 0.76372826, "num_input_tokens_seen": 15133375, "step": 717, "time_per_iteration": 2.538978338241577 }, { "auxiliary_loss_clip": 0.01238085, "auxiliary_loss_mlp": 0.01041016, "balance_loss_clip": 1.06754804, "balance_loss_mlp": 1.0286417, "epoch": 0.08633439547886732, "flos": 13589568120960.0, "grad_norm": 5.229286347001644, "language_loss": 0.89719927, "learning_rate": 3.966866127062927e-06, "loss": 0.91999024, "num_input_tokens_seen": 15150500, "step": 718, "time_per_iteration": 3.2775516510009766 }, { "auxiliary_loss_clip": 0.01138275, "auxiliary_loss_mlp": 0.01004857, "balance_loss_clip": 1.03685832, "balance_loss_mlp": 0.99963522, "epoch": 0.0864546383695064, "flos": 57767342434560.0, "grad_norm": 0.8650890060221531, "language_loss": 0.62711769, "learning_rate": 3.966724772407982e-06, "loss": 0.64854902, "num_input_tokens_seen": 15208015, "step": 719, "time_per_iteration": 3.7964155673980713 }, { "auxiliary_loss_clip": 0.01203392, "auxiliary_loss_mlp": 0.01040317, "balance_loss_clip": 1.06223512, "balance_loss_mlp": 1.02830648, "epoch": 0.0865748812601455, "flos": 20046952753920.0, "grad_norm": 2.6066224651585856, "language_loss": 0.88814062, "learning_rate": 3.966583119402454e-06, "loss": 0.91057777, "num_input_tokens_seen": 15224780, "step": 720, "time_per_iteration": 2.5559685230255127 }, { "auxiliary_loss_clip": 0.0123805, "auxiliary_loss_mlp": 0.00766036, "balance_loss_clip": 1.06715417, "balance_loss_mlp": 1.0003736, "epoch": 0.08669512415078459, "flos": 35262446935680.0, "grad_norm": 1.8037432730052723, "language_loss": 0.82261783, "learning_rate": 3.9664411680678305e-06, "loss": 0.8426587, "num_input_tokens_seen": 15246535, "step": 721, "time_per_iteration": 3.4028894901275635 }, { "auxiliary_loss_clip": 0.0111008, "auxiliary_loss_mlp": 0.0100618, "balance_loss_clip": 1.03080273, "balance_loss_mlp": 1.00062466, "epoch": 0.08681536704142367, "flos": 65654870048640.0, "grad_norm": 0.846456965627966, "language_loss": 0.61413199, "learning_rate": 3.966298918425644e-06, "loss": 0.63529462, "num_input_tokens_seen": 15304025, "step": 722, "time_per_iteration": 3.0168802738189697 }, { "auxiliary_loss_clip": 0.01242945, "auxiliary_loss_mlp": 0.01044157, "balance_loss_clip": 1.06585038, "balance_loss_mlp": 1.03106213, "epoch": 0.08693560993206277, "flos": 34529940881280.0, "grad_norm": 1.7754324483920783, "language_loss": 0.8268522, "learning_rate": 3.966156370497476e-06, "loss": 0.84972322, "num_input_tokens_seen": 15327635, "step": 723, "time_per_iteration": 2.634556293487549 }, { "auxiliary_loss_clip": 0.01244077, "auxiliary_loss_mlp": 0.01039826, "balance_loss_clip": 1.0660311, "balance_loss_mlp": 1.02744007, "epoch": 0.08705585282270185, "flos": 23149419189120.0, "grad_norm": 1.7772117672542782, "language_loss": 0.88431096, "learning_rate": 3.96601352430495e-06, "loss": 0.90714991, "num_input_tokens_seen": 15347405, "step": 724, "time_per_iteration": 2.532996654510498 }, { "auxiliary_loss_clip": 0.01226274, "auxiliary_loss_mlp": 0.01048428, "balance_loss_clip": 1.06837964, "balance_loss_mlp": 1.03544569, "epoch": 0.08717609571334095, "flos": 29497599498240.0, "grad_norm": 1.7896550083133873, "language_loss": 0.83266312, "learning_rate": 3.965870379869735e-06, "loss": 0.85541016, "num_input_tokens_seen": 15369450, "step": 725, "time_per_iteration": 2.60739803314209 }, { "auxiliary_loss_clip": 0.01238696, "auxiliary_loss_mlp": 0.01042638, "balance_loss_clip": 1.06270838, "balance_loss_mlp": 1.03075862, "epoch": 0.08729633860398003, "flos": 20667489137280.0, "grad_norm": 2.141342025028866, "language_loss": 0.86694145, "learning_rate": 3.965726937213547e-06, "loss": 0.88975489, "num_input_tokens_seen": 15388085, "step": 726, "time_per_iteration": 2.5231199264526367 }, { "auxiliary_loss_clip": 0.01239395, "auxiliary_loss_mlp": 0.01049424, "balance_loss_clip": 1.06296325, "balance_loss_mlp": 1.03657281, "epoch": 0.08741658149461913, "flos": 18369493655040.0, "grad_norm": 2.0968770109349633, "language_loss": 0.81280845, "learning_rate": 3.965583196358144e-06, "loss": 0.83569664, "num_input_tokens_seen": 15407120, "step": 727, "time_per_iteration": 2.6597909927368164 }, { "auxiliary_loss_clip": 0.01260028, "auxiliary_loss_mlp": 0.0104298, "balance_loss_clip": 1.06877446, "balance_loss_mlp": 1.02906823, "epoch": 0.08753682438525823, "flos": 18729677283840.0, "grad_norm": 2.689444276680461, "language_loss": 0.74456441, "learning_rate": 3.965439157325335e-06, "loss": 0.76759458, "num_input_tokens_seen": 15424485, "step": 728, "time_per_iteration": 2.538700819015503 }, { "auxiliary_loss_clip": 0.01218934, "auxiliary_loss_mlp": 0.01036936, "balance_loss_clip": 1.05975735, "balance_loss_mlp": 1.02298844, "epoch": 0.08765706727589731, "flos": 27776113303680.0, "grad_norm": 1.9871436051786833, "language_loss": 0.76060796, "learning_rate": 3.965294820136968e-06, "loss": 0.78316665, "num_input_tokens_seen": 15446285, "step": 729, "time_per_iteration": 2.654282331466675 }, { "auxiliary_loss_clip": 0.01229199, "auxiliary_loss_mlp": 0.01038602, "balance_loss_clip": 1.06583357, "balance_loss_mlp": 1.0264132, "epoch": 0.08777731016653641, "flos": 24389127239040.0, "grad_norm": 2.2427772571083304, "language_loss": 0.87208581, "learning_rate": 3.965150184814938e-06, "loss": 0.89476383, "num_input_tokens_seen": 15465770, "step": 730, "time_per_iteration": 2.6372649669647217 }, { "auxiliary_loss_clip": 0.01215954, "auxiliary_loss_mlp": 0.01045195, "balance_loss_clip": 1.06418157, "balance_loss_mlp": 1.03205848, "epoch": 0.08789755305717549, "flos": 21981855605760.0, "grad_norm": 3.629207783504735, "language_loss": 0.76280951, "learning_rate": 3.965005251381189e-06, "loss": 0.78542101, "num_input_tokens_seen": 15483705, "step": 731, "time_per_iteration": 2.636958360671997 }, { "auxiliary_loss_clip": 0.01133541, "auxiliary_loss_mlp": 0.01005898, "balance_loss_clip": 1.03007054, "balance_loss_mlp": 1.00070095, "epoch": 0.08801779594781459, "flos": 58360120583040.0, "grad_norm": 0.9085074219945398, "language_loss": 0.64629811, "learning_rate": 3.964860019857705e-06, "loss": 0.66769254, "num_input_tokens_seen": 15548620, "step": 732, "time_per_iteration": 3.1310997009277344 }, { "auxiliary_loss_clip": 0.01260248, "auxiliary_loss_mlp": 0.01045512, "balance_loss_clip": 1.07243061, "balance_loss_mlp": 1.03412759, "epoch": 0.08813803883845367, "flos": 23294785530240.0, "grad_norm": 1.8580667711307786, "language_loss": 0.84247398, "learning_rate": 3.964714490266518e-06, "loss": 0.86553162, "num_input_tokens_seen": 15569265, "step": 733, "time_per_iteration": 2.5205700397491455 }, { "auxiliary_loss_clip": 0.0112836, "auxiliary_loss_mlp": 0.01007157, "balance_loss_clip": 1.02875125, "balance_loss_mlp": 1.00191176, "epoch": 0.08825828172909277, "flos": 63424924882560.0, "grad_norm": 0.8963673826631723, "language_loss": 0.6459381, "learning_rate": 3.964568662629706e-06, "loss": 0.66729331, "num_input_tokens_seen": 15630570, "step": 734, "time_per_iteration": 3.007103204727173 }, { "auxiliary_loss_clip": 0.01234452, "auxiliary_loss_mlp": 0.01042552, "balance_loss_clip": 1.06225801, "balance_loss_mlp": 1.03089297, "epoch": 0.08837852461973186, "flos": 26720986268160.0, "grad_norm": 3.237547215292283, "language_loss": 0.84352016, "learning_rate": 3.9644225369693895e-06, "loss": 0.86629015, "num_input_tokens_seen": 15650870, "step": 735, "time_per_iteration": 2.5715677738189697 }, { "auxiliary_loss_clip": 0.01255842, "auxiliary_loss_mlp": 0.01039215, "balance_loss_clip": 1.06903756, "balance_loss_mlp": 1.02711511, "epoch": 0.08849876751037095, "flos": 27265427688960.0, "grad_norm": 1.9027809355522933, "language_loss": 0.86782962, "learning_rate": 3.964276113307735e-06, "loss": 0.89078015, "num_input_tokens_seen": 15670835, "step": 736, "time_per_iteration": 2.5143232345581055 }, { "auxiliary_loss_clip": 0.012099, "auxiliary_loss_mlp": 0.01050886, "balance_loss_clip": 1.06421185, "balance_loss_mlp": 1.0377847, "epoch": 0.08861901040101004, "flos": 19828759587840.0, "grad_norm": 1.8659619759796535, "language_loss": 0.80636764, "learning_rate": 3.9641293916669574e-06, "loss": 0.8289755, "num_input_tokens_seen": 15689795, "step": 737, "time_per_iteration": 2.5595498085021973 }, { "auxiliary_loss_clip": 0.01205053, "auxiliary_loss_mlp": 0.01037356, "balance_loss_clip": 1.06331253, "balance_loss_mlp": 1.02405179, "epoch": 0.08873925329164913, "flos": 23658704173440.0, "grad_norm": 1.8882247729683301, "language_loss": 0.82776213, "learning_rate": 3.9639823720693115e-06, "loss": 0.85018617, "num_input_tokens_seen": 15711650, "step": 738, "time_per_iteration": 2.6314258575439453 }, { "auxiliary_loss_clip": 0.01109011, "auxiliary_loss_mlp": 0.01019373, "balance_loss_clip": 1.04003358, "balance_loss_mlp": 1.01491451, "epoch": 0.08885949618228822, "flos": 71831541893760.0, "grad_norm": 0.8482495326162162, "language_loss": 0.59987473, "learning_rate": 3.963835054537102e-06, "loss": 0.62115854, "num_input_tokens_seen": 15780615, "step": 739, "time_per_iteration": 3.193509817123413 }, { "auxiliary_loss_clip": 0.01220775, "auxiliary_loss_mlp": 0.01055811, "balance_loss_clip": 1.06141114, "balance_loss_mlp": 1.0437355, "epoch": 0.08897973907292732, "flos": 22346169298560.0, "grad_norm": 2.2139264102670073, "language_loss": 0.61381447, "learning_rate": 3.963687439092676e-06, "loss": 0.63658041, "num_input_tokens_seen": 15801300, "step": 740, "time_per_iteration": 3.3782408237457275 }, { "auxiliary_loss_clip": 0.01240986, "auxiliary_loss_mlp": 0.01046714, "balance_loss_clip": 1.06674385, "balance_loss_mlp": 1.03450692, "epoch": 0.0890999819635664, "flos": 21251827589760.0, "grad_norm": 3.9173797294777475, "language_loss": 0.80378187, "learning_rate": 3.963539525758427e-06, "loss": 0.8266589, "num_input_tokens_seen": 15820860, "step": 741, "time_per_iteration": 2.5305728912353516 }, { "auxiliary_loss_clip": 0.01226877, "auxiliary_loss_mlp": 0.01039917, "balance_loss_clip": 1.0660212, "balance_loss_mlp": 1.0269649, "epoch": 0.0892202248542055, "flos": 25370888745600.0, "grad_norm": 2.430622964395486, "language_loss": 0.67870855, "learning_rate": 3.9633913145567925e-06, "loss": 0.7013765, "num_input_tokens_seen": 15841350, "step": 742, "time_per_iteration": 2.564605951309204 }, { "auxiliary_loss_clip": 0.01225065, "auxiliary_loss_mlp": 0.01034582, "balance_loss_clip": 1.06720603, "balance_loss_mlp": 1.02282858, "epoch": 0.08934046774484458, "flos": 24457895827200.0, "grad_norm": 2.2087618900411017, "language_loss": 0.81364214, "learning_rate": 3.9632428055102575e-06, "loss": 0.83623862, "num_input_tokens_seen": 15861360, "step": 743, "time_per_iteration": 2.5781798362731934 }, { "auxiliary_loss_clip": 0.01246759, "auxiliary_loss_mlp": 0.01046256, "balance_loss_clip": 1.07117701, "balance_loss_mlp": 1.03242755, "epoch": 0.08946071063548368, "flos": 35772773414400.0, "grad_norm": 2.4649839207881596, "language_loss": 0.66916597, "learning_rate": 3.9630939986413495e-06, "loss": 0.69209611, "num_input_tokens_seen": 15883160, "step": 744, "time_per_iteration": 2.6227333545684814 }, { "auxiliary_loss_clip": 0.01197922, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.06317616, "balance_loss_mlp": 1.0368731, "epoch": 0.08958095352612276, "flos": 14356584167040.0, "grad_norm": 1.8499385600165459, "language_loss": 0.78343606, "learning_rate": 3.962944893972643e-06, "loss": 0.80591255, "num_input_tokens_seen": 15901610, "step": 745, "time_per_iteration": 3.3605949878692627 }, { "auxiliary_loss_clip": 0.01223982, "auxiliary_loss_mlp": 0.01044765, "balance_loss_clip": 1.06447363, "balance_loss_mlp": 1.03206921, "epoch": 0.08970119641676186, "flos": 17853277345920.0, "grad_norm": 3.0551993831848607, "language_loss": 0.91105425, "learning_rate": 3.962795491526756e-06, "loss": 0.93374175, "num_input_tokens_seen": 15918770, "step": 746, "time_per_iteration": 3.4027154445648193 }, { "auxiliary_loss_clip": 0.01264162, "auxiliary_loss_mlp": 0.01056027, "balance_loss_clip": 1.07289803, "balance_loss_mlp": 1.04190671, "epoch": 0.08982143930740095, "flos": 20811670329600.0, "grad_norm": 2.1750555823974267, "language_loss": 0.89316124, "learning_rate": 3.962645791326354e-06, "loss": 0.91636312, "num_input_tokens_seen": 15938025, "step": 747, "time_per_iteration": 2.490537405014038 }, { "auxiliary_loss_clip": 0.01239971, "auxiliary_loss_mlp": 0.01041097, "balance_loss_clip": 1.06860781, "balance_loss_mlp": 1.02948022, "epoch": 0.08994168219804004, "flos": 24097712198400.0, "grad_norm": 1.9740534642002414, "language_loss": 0.82780039, "learning_rate": 3.962495793394146e-06, "loss": 0.85061109, "num_input_tokens_seen": 15957215, "step": 748, "time_per_iteration": 3.3045437335968018 }, { "auxiliary_loss_clip": 0.01143532, "auxiliary_loss_mlp": 0.01040568, "balance_loss_clip": 1.03070569, "balance_loss_mlp": 1.03584719, "epoch": 0.09006192508867913, "flos": 57188893812480.0, "grad_norm": 0.7418661616341179, "language_loss": 0.61205, "learning_rate": 3.9623454977528864e-06, "loss": 0.63389093, "num_input_tokens_seen": 16015870, "step": 749, "time_per_iteration": 2.89220929145813 }, { "auxiliary_loss_clip": 0.01214954, "auxiliary_loss_mlp": 0.01051043, "balance_loss_clip": 1.06468487, "balance_loss_mlp": 1.03827572, "epoch": 0.09018216797931822, "flos": 20487505063680.0, "grad_norm": 1.638919864447389, "language_loss": 0.85080326, "learning_rate": 3.962194904425375e-06, "loss": 0.87346327, "num_input_tokens_seen": 16036500, "step": 750, "time_per_iteration": 2.6000778675079346 }, { "auxiliary_loss_clip": 0.01236507, "auxiliary_loss_mlp": 0.01042042, "balance_loss_clip": 1.06613481, "balance_loss_mlp": 1.029459, "epoch": 0.09030241086995731, "flos": 22638123043200.0, "grad_norm": 3.524315116278463, "language_loss": 0.6796189, "learning_rate": 3.9620440134344566e-06, "loss": 0.70240438, "num_input_tokens_seen": 16054655, "step": 751, "time_per_iteration": 2.5078063011169434 }, { "auxiliary_loss_clip": 0.01207956, "auxiliary_loss_mlp": 0.01050457, "balance_loss_clip": 1.0646832, "balance_loss_mlp": 1.03704023, "epoch": 0.09042265376059641, "flos": 21871502046720.0, "grad_norm": 2.90199615579018, "language_loss": 0.82166553, "learning_rate": 3.9618928248030215e-06, "loss": 0.84424967, "num_input_tokens_seen": 16074165, "step": 752, "time_per_iteration": 2.5974485874176025 }, { "auxiliary_loss_clip": 0.01239493, "auxiliary_loss_mlp": 0.01049187, "balance_loss_clip": 1.06807065, "balance_loss_mlp": 1.03687274, "epoch": 0.0905428966512355, "flos": 24316192673280.0, "grad_norm": 2.1489688556392674, "language_loss": 0.83093143, "learning_rate": 3.961741338554005e-06, "loss": 0.85381818, "num_input_tokens_seen": 16092505, "step": 753, "time_per_iteration": 2.5624661445617676 }, { "auxiliary_loss_clip": 0.01233045, "auxiliary_loss_mlp": 0.01055188, "balance_loss_clip": 1.06765342, "balance_loss_mlp": 1.04195607, "epoch": 0.09066313954187459, "flos": 35845061535360.0, "grad_norm": 2.1602943930785923, "language_loss": 0.75770986, "learning_rate": 3.9615895547103865e-06, "loss": 0.7805922, "num_input_tokens_seen": 16116150, "step": 754, "time_per_iteration": 2.742488384246826 }, { "auxiliary_loss_clip": 0.01223138, "auxiliary_loss_mlp": 0.01050512, "balance_loss_clip": 1.06320918, "balance_loss_mlp": 1.03761959, "epoch": 0.09078338243251367, "flos": 29168729550720.0, "grad_norm": 1.839582196697024, "language_loss": 0.77699804, "learning_rate": 3.961437473295193e-06, "loss": 0.79973447, "num_input_tokens_seen": 16136295, "step": 755, "time_per_iteration": 2.5963313579559326 }, { "auxiliary_loss_clip": 0.01178461, "auxiliary_loss_mlp": 0.01046698, "balance_loss_clip": 1.05443311, "balance_loss_mlp": 1.03419936, "epoch": 0.09090362532315277, "flos": 21907699977600.0, "grad_norm": 2.41548011638984, "language_loss": 0.72252071, "learning_rate": 3.961285094331495e-06, "loss": 0.74477232, "num_input_tokens_seen": 16154210, "step": 756, "time_per_iteration": 2.641754388809204 }, { "auxiliary_loss_clip": 0.01251217, "auxiliary_loss_mlp": 0.01041559, "balance_loss_clip": 1.06597853, "balance_loss_mlp": 1.029948, "epoch": 0.09102386821379185, "flos": 27344503480320.0, "grad_norm": 4.562012824283252, "language_loss": 0.86085641, "learning_rate": 3.961132417842406e-06, "loss": 0.88378417, "num_input_tokens_seen": 16173995, "step": 757, "time_per_iteration": 2.521122455596924 }, { "auxiliary_loss_clip": 0.01232284, "auxiliary_loss_mlp": 0.01050963, "balance_loss_clip": 1.06513047, "balance_loss_mlp": 1.03897023, "epoch": 0.09114411110443095, "flos": 20813501923200.0, "grad_norm": 3.489379949445828, "language_loss": 0.75569755, "learning_rate": 3.960979443851089e-06, "loss": 0.77853, "num_input_tokens_seen": 16191020, "step": 758, "time_per_iteration": 2.5540335178375244 }, { "auxiliary_loss_clip": 0.01222789, "auxiliary_loss_mlp": 0.01042283, "balance_loss_clip": 1.06476259, "balance_loss_mlp": 1.02890193, "epoch": 0.09126435399507005, "flos": 26145949438080.0, "grad_norm": 1.712151297559235, "language_loss": 0.7875455, "learning_rate": 3.96082617238075e-06, "loss": 0.81019622, "num_input_tokens_seen": 16213645, "step": 759, "time_per_iteration": 2.618605136871338 }, { "auxiliary_loss_clip": 0.0122199, "auxiliary_loss_mlp": 0.01038585, "balance_loss_clip": 1.06349874, "balance_loss_mlp": 1.02709365, "epoch": 0.09138459688570913, "flos": 24388911757440.0, "grad_norm": 2.581203286872189, "language_loss": 0.79955423, "learning_rate": 3.960672603454639e-06, "loss": 0.82215995, "num_input_tokens_seen": 16233625, "step": 760, "time_per_iteration": 2.5661139488220215 }, { "auxiliary_loss_clip": 0.01234321, "auxiliary_loss_mlp": 0.01047414, "balance_loss_clip": 1.06649208, "balance_loss_mlp": 1.03436661, "epoch": 0.09150483977634823, "flos": 21032664756480.0, "grad_norm": 2.908520015453707, "language_loss": 0.77360773, "learning_rate": 3.960518737096054e-06, "loss": 0.7964251, "num_input_tokens_seen": 16253255, "step": 761, "time_per_iteration": 2.5108797550201416 }, { "auxiliary_loss_clip": 0.01239668, "auxiliary_loss_mlp": 0.01039487, "balance_loss_clip": 1.06747317, "balance_loss_mlp": 1.02695203, "epoch": 0.09162508266698731, "flos": 22856998567680.0, "grad_norm": 2.1656183213501445, "language_loss": 0.72762758, "learning_rate": 3.960364573328334e-06, "loss": 0.75041914, "num_input_tokens_seen": 16272580, "step": 762, "time_per_iteration": 2.537421941757202 }, { "auxiliary_loss_clip": 0.01208382, "auxiliary_loss_mlp": 0.01040424, "balance_loss_clip": 1.06091428, "balance_loss_mlp": 1.02698278, "epoch": 0.0917453255576264, "flos": 21724411852800.0, "grad_norm": 1.9035893829647503, "language_loss": 0.8847158, "learning_rate": 3.9602101121748675e-06, "loss": 0.90720385, "num_input_tokens_seen": 16293075, "step": 763, "time_per_iteration": 2.5745084285736084 }, { "auxiliary_loss_clip": 0.01223701, "auxiliary_loss_mlp": 0.01044249, "balance_loss_clip": 1.06795657, "balance_loss_mlp": 1.03285289, "epoch": 0.0918655684482655, "flos": 14609215497600.0, "grad_norm": 3.0841465319861605, "language_loss": 0.72333789, "learning_rate": 3.960055353659085e-06, "loss": 0.7460174, "num_input_tokens_seen": 16310185, "step": 764, "time_per_iteration": 2.49641489982605 }, { "auxiliary_loss_clip": 0.01211061, "auxiliary_loss_mlp": 0.01035208, "balance_loss_clip": 1.06531608, "balance_loss_mlp": 1.02291179, "epoch": 0.09198581133890459, "flos": 23435016226560.0, "grad_norm": 1.8135206928884973, "language_loss": 0.83774364, "learning_rate": 3.959900297804465e-06, "loss": 0.86020637, "num_input_tokens_seen": 16330355, "step": 765, "time_per_iteration": 2.6304118633270264 }, { "auxiliary_loss_clip": 0.01210006, "auxiliary_loss_mlp": 0.01039267, "balance_loss_clip": 1.06236756, "balance_loss_mlp": 1.02711332, "epoch": 0.09210605422954368, "flos": 16795887753600.0, "grad_norm": 1.7923047136102985, "language_loss": 0.77408326, "learning_rate": 3.9597449446345276e-06, "loss": 0.79657596, "num_input_tokens_seen": 16347600, "step": 766, "time_per_iteration": 2.6017959117889404 }, { "auxiliary_loss_clip": 0.01209076, "auxiliary_loss_mlp": 0.01037431, "balance_loss_clip": 1.05900049, "balance_loss_mlp": 1.02579629, "epoch": 0.09222629712018277, "flos": 22674249146880.0, "grad_norm": 2.241517571459111, "language_loss": 0.8341403, "learning_rate": 3.95958929417284e-06, "loss": 0.85660541, "num_input_tokens_seen": 16365755, "step": 767, "time_per_iteration": 3.4614627361297607 }, { "auxiliary_loss_clip": 0.01131984, "auxiliary_loss_mlp": 0.0100879, "balance_loss_clip": 1.0303998, "balance_loss_mlp": 1.00435531, "epoch": 0.09234654001082186, "flos": 69976756327680.0, "grad_norm": 0.7498201645231182, "language_loss": 0.58803904, "learning_rate": 3.9594333464430145e-06, "loss": 0.60944676, "num_input_tokens_seen": 16435245, "step": 768, "time_per_iteration": 3.232046365737915 }, { "auxiliary_loss_clip": 0.0114631, "auxiliary_loss_mlp": 0.0104232, "balance_loss_clip": 1.05068898, "balance_loss_mlp": 1.03140604, "epoch": 0.09246678290146094, "flos": 20011437181440.0, "grad_norm": 1.877993792484274, "language_loss": 0.88569391, "learning_rate": 3.959277101468709e-06, "loss": 0.90758014, "num_input_tokens_seen": 16454795, "step": 769, "time_per_iteration": 2.844501495361328 }, { "auxiliary_loss_clip": 0.01205583, "auxiliary_loss_mlp": 0.01048922, "balance_loss_clip": 1.06103909, "balance_loss_mlp": 1.03714979, "epoch": 0.09258702579210004, "flos": 17747448900480.0, "grad_norm": 2.381511027482938, "language_loss": 0.78577614, "learning_rate": 3.959120559273624e-06, "loss": 0.80832124, "num_input_tokens_seen": 16472580, "step": 770, "time_per_iteration": 2.8863327503204346 }, { "auxiliary_loss_clip": 0.01205286, "auxiliary_loss_mlp": 0.01042845, "balance_loss_clip": 1.0624392, "balance_loss_mlp": 1.0311209, "epoch": 0.09270726868273914, "flos": 20886544229760.0, "grad_norm": 1.7712475182230991, "language_loss": 0.83321321, "learning_rate": 3.958963719881509e-06, "loss": 0.85569459, "num_input_tokens_seen": 16490670, "step": 771, "time_per_iteration": 3.3953518867492676 }, { "auxiliary_loss_clip": 0.01238179, "auxiliary_loss_mlp": 0.01036271, "balance_loss_clip": 1.06904769, "balance_loss_mlp": 1.02334261, "epoch": 0.09282751157337822, "flos": 17015697031680.0, "grad_norm": 2.4160427784363647, "language_loss": 0.93895793, "learning_rate": 3.958806583316154e-06, "loss": 0.96170247, "num_input_tokens_seen": 16508640, "step": 772, "time_per_iteration": 2.5044260025024414 }, { "auxiliary_loss_clip": 0.01253304, "auxiliary_loss_mlp": 0.01032895, "balance_loss_clip": 1.07007515, "balance_loss_mlp": 1.02168894, "epoch": 0.09294775446401732, "flos": 32523647748480.0, "grad_norm": 2.2935244338408043, "language_loss": 0.78857535, "learning_rate": 3.9586491496013985e-06, "loss": 0.81143737, "num_input_tokens_seen": 16531035, "step": 773, "time_per_iteration": 3.506129503250122 }, { "auxiliary_loss_clip": 0.01242441, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.06782627, "balance_loss_mlp": 1.03559327, "epoch": 0.0930679973546564, "flos": 18259750627200.0, "grad_norm": 2.443343993646761, "language_loss": 0.82959175, "learning_rate": 3.958491418761124e-06, "loss": 0.85248786, "num_input_tokens_seen": 16548605, "step": 774, "time_per_iteration": 2.484351873397827 }, { "auxiliary_loss_clip": 0.01220364, "auxiliary_loss_mlp": 0.01039879, "balance_loss_clip": 1.060323, "balance_loss_mlp": 1.02854264, "epoch": 0.0931882402452955, "flos": 21099745405440.0, "grad_norm": 3.511223698697841, "language_loss": 0.72452903, "learning_rate": 3.958333390819258e-06, "loss": 0.74713147, "num_input_tokens_seen": 16565535, "step": 775, "time_per_iteration": 3.320323944091797 }, { "auxiliary_loss_clip": 0.01252348, "auxiliary_loss_mlp": 0.01040878, "balance_loss_clip": 1.06885231, "balance_loss_mlp": 1.02976751, "epoch": 0.0933084831359346, "flos": 24207275658240.0, "grad_norm": 2.162574745285735, "language_loss": 0.80319798, "learning_rate": 3.9581750657997754e-06, "loss": 0.82613021, "num_input_tokens_seen": 16584900, "step": 776, "time_per_iteration": 2.5163486003875732 }, { "auxiliary_loss_clip": 0.01218066, "auxiliary_loss_mlp": 0.0103836, "balance_loss_clip": 1.06113338, "balance_loss_mlp": 1.02718973, "epoch": 0.09342872602657368, "flos": 25480272637440.0, "grad_norm": 1.8029756803353811, "language_loss": 0.89400685, "learning_rate": 3.95801644372669e-06, "loss": 0.91657108, "num_input_tokens_seen": 16604805, "step": 777, "time_per_iteration": 2.582204818725586 }, { "auxiliary_loss_clip": 0.01227318, "auxiliary_loss_mlp": 0.01041934, "balance_loss_clip": 1.06274056, "balance_loss_mlp": 1.03037691, "epoch": 0.09354896891721277, "flos": 23149060053120.0, "grad_norm": 2.7455108891514133, "language_loss": 0.84638035, "learning_rate": 3.957857524624068e-06, "loss": 0.86907291, "num_input_tokens_seen": 16623685, "step": 778, "time_per_iteration": 2.5764808654785156 }, { "auxiliary_loss_clip": 0.01220554, "auxiliary_loss_mlp": 0.01041214, "balance_loss_clip": 1.06451082, "balance_loss_mlp": 1.02994251, "epoch": 0.09366921180785186, "flos": 24279563779200.0, "grad_norm": 1.7713504253024963, "language_loss": 0.89744759, "learning_rate": 3.957698308516016e-06, "loss": 0.92006528, "num_input_tokens_seen": 16644985, "step": 779, "time_per_iteration": 2.5860352516174316 }, { "auxiliary_loss_clip": 0.01232785, "auxiliary_loss_mlp": 0.00765174, "balance_loss_clip": 1.06731415, "balance_loss_mlp": 1.00008166, "epoch": 0.09378945469849095, "flos": 18730036419840.0, "grad_norm": 1.9528626408057206, "language_loss": 0.82340336, "learning_rate": 3.957538795426688e-06, "loss": 0.84338295, "num_input_tokens_seen": 16662410, "step": 780, "time_per_iteration": 2.4987144470214844 }, { "auxiliary_loss_clip": 0.01222187, "auxiliary_loss_mlp": 0.01045749, "balance_loss_clip": 1.06392038, "balance_loss_mlp": 1.0328387, "epoch": 0.09390969758913004, "flos": 23218834222080.0, "grad_norm": 2.775819738756101, "language_loss": 0.7731142, "learning_rate": 3.9573789853802804e-06, "loss": 0.79579353, "num_input_tokens_seen": 16680885, "step": 781, "time_per_iteration": 2.5398433208465576 }, { "auxiliary_loss_clip": 0.01222441, "auxiliary_loss_mlp": 0.00764928, "balance_loss_clip": 1.0673008, "balance_loss_mlp": 1.00009966, "epoch": 0.09402994047976913, "flos": 19646728439040.0, "grad_norm": 2.2231868911922863, "language_loss": 0.75059783, "learning_rate": 3.957218878401037e-06, "loss": 0.77047157, "num_input_tokens_seen": 16699375, "step": 782, "time_per_iteration": 2.52759051322937 }, { "auxiliary_loss_clip": 0.01253558, "auxiliary_loss_mlp": 0.01044325, "balance_loss_clip": 1.06937087, "balance_loss_mlp": 1.03211188, "epoch": 0.09415018337040823, "flos": 29420463041280.0, "grad_norm": 2.1228779026288866, "language_loss": 0.89221221, "learning_rate": 3.957058474513246e-06, "loss": 0.91519105, "num_input_tokens_seen": 16719230, "step": 783, "time_per_iteration": 2.536959409713745 }, { "auxiliary_loss_clip": 0.0123381, "auxiliary_loss_mlp": 0.01047226, "balance_loss_clip": 1.06738698, "balance_loss_mlp": 1.03640795, "epoch": 0.09427042626104731, "flos": 24572092141440.0, "grad_norm": 2.098397684064178, "language_loss": 0.78701937, "learning_rate": 3.956897773741241e-06, "loss": 0.80982971, "num_input_tokens_seen": 16738220, "step": 784, "time_per_iteration": 2.520097494125366 }, { "auxiliary_loss_clip": 0.01211045, "auxiliary_loss_mlp": 0.01045078, "balance_loss_clip": 1.06269038, "balance_loss_mlp": 1.03306139, "epoch": 0.09439066915168641, "flos": 26359581576960.0, "grad_norm": 1.803975054384304, "language_loss": 0.71477062, "learning_rate": 3.956736776109398e-06, "loss": 0.73733187, "num_input_tokens_seen": 16759395, "step": 785, "time_per_iteration": 2.5932247638702393 }, { "auxiliary_loss_clip": 0.01227363, "auxiliary_loss_mlp": 0.00765931, "balance_loss_clip": 1.06248522, "balance_loss_mlp": 1.00014472, "epoch": 0.09451091204232549, "flos": 19427278296960.0, "grad_norm": 1.96090050861924, "language_loss": 0.83667952, "learning_rate": 3.956575481642143e-06, "loss": 0.85661244, "num_input_tokens_seen": 16778285, "step": 786, "time_per_iteration": 2.526232957839966 }, { "auxiliary_loss_clip": 0.01180531, "auxiliary_loss_mlp": 0.01035462, "balance_loss_clip": 1.05511522, "balance_loss_mlp": 1.02349329, "epoch": 0.09463115493296459, "flos": 25368051571200.0, "grad_norm": 2.9720776375749427, "language_loss": 0.74837685, "learning_rate": 3.956413890363943e-06, "loss": 0.77053678, "num_input_tokens_seen": 16795265, "step": 787, "time_per_iteration": 2.658855676651001 }, { "auxiliary_loss_clip": 0.01234635, "auxiliary_loss_mlp": 0.01039224, "balance_loss_clip": 1.06727445, "balance_loss_mlp": 1.02813196, "epoch": 0.09475139782360369, "flos": 10123254869760.0, "grad_norm": 1.9638130848844293, "language_loss": 0.81917673, "learning_rate": 3.956252002299312e-06, "loss": 0.84191531, "num_input_tokens_seen": 16811165, "step": 788, "time_per_iteration": 2.5442371368408203 }, { "auxiliary_loss_clip": 0.01251322, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.06852293, "balance_loss_mlp": 1.02258253, "epoch": 0.09487164071424277, "flos": 17231088936960.0, "grad_norm": 1.8412229805445395, "language_loss": 0.90676838, "learning_rate": 3.956089817472807e-06, "loss": 0.92962176, "num_input_tokens_seen": 16828470, "step": 789, "time_per_iteration": 2.519622802734375 }, { "auxiliary_loss_clip": 0.01220214, "auxiliary_loss_mlp": 0.01038964, "balance_loss_clip": 1.06710017, "balance_loss_mlp": 1.02788401, "epoch": 0.09499188360488187, "flos": 30849564528000.0, "grad_norm": 2.086778008739669, "language_loss": 0.85708684, "learning_rate": 3.955927335909032e-06, "loss": 0.87967861, "num_input_tokens_seen": 16851680, "step": 790, "time_per_iteration": 2.6684296131134033 }, { "auxiliary_loss_clip": 0.01188987, "auxiliary_loss_mlp": 0.01040374, "balance_loss_clip": 1.06550288, "balance_loss_mlp": 1.02929318, "epoch": 0.09511212649552095, "flos": 29351694453120.0, "grad_norm": 2.037735741394394, "language_loss": 0.75897175, "learning_rate": 3.955764557632634e-06, "loss": 0.78126538, "num_input_tokens_seen": 16871490, "step": 791, "time_per_iteration": 2.6928322315216064 }, { "auxiliary_loss_clip": 0.01215442, "auxiliary_loss_mlp": 0.01038112, "balance_loss_clip": 1.06282735, "balance_loss_mlp": 1.02656031, "epoch": 0.09523236938616005, "flos": 10378687461120.0, "grad_norm": 2.083291490242172, "language_loss": 0.94643569, "learning_rate": 3.955601482668309e-06, "loss": 0.96897125, "num_input_tokens_seen": 16889350, "step": 792, "time_per_iteration": 2.5200576782226562 }, { "auxiliary_loss_clip": 0.01183547, "auxiliary_loss_mlp": 0.01040419, "balance_loss_clip": 1.05602777, "balance_loss_mlp": 1.02699637, "epoch": 0.09535261227679913, "flos": 19061815368960.0, "grad_norm": 1.7384333115757473, "language_loss": 0.88378692, "learning_rate": 3.955438111040794e-06, "loss": 0.9060266, "num_input_tokens_seen": 16907625, "step": 793, "time_per_iteration": 2.6130173206329346 }, { "auxiliary_loss_clip": 0.01183201, "auxiliary_loss_mlp": 0.0104513, "balance_loss_clip": 1.05778408, "balance_loss_mlp": 1.034199, "epoch": 0.09547285516743823, "flos": 20922993555840.0, "grad_norm": 2.007447161959936, "language_loss": 0.79972219, "learning_rate": 3.955274442774873e-06, "loss": 0.82200551, "num_input_tokens_seen": 16926205, "step": 794, "time_per_iteration": 3.4259731769561768 }, { "auxiliary_loss_clip": 0.01234252, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.06554341, "balance_loss_mlp": 1.03181863, "epoch": 0.09559309805807732, "flos": 30154405639680.0, "grad_norm": 2.8176491728850506, "language_loss": 0.70629764, "learning_rate": 3.9551104778953725e-06, "loss": 0.72908038, "num_input_tokens_seen": 16946500, "step": 795, "time_per_iteration": 2.583831310272217 }, { "auxiliary_loss_clip": 0.01203554, "auxiliary_loss_mlp": 0.010329, "balance_loss_clip": 1.06006777, "balance_loss_mlp": 1.02169394, "epoch": 0.0957133409487164, "flos": 21066743784960.0, "grad_norm": 1.8080487606794478, "language_loss": 0.85488039, "learning_rate": 3.954946216427167e-06, "loss": 0.87724489, "num_input_tokens_seen": 16966960, "step": 796, "time_per_iteration": 2.593061923980713 }, { "auxiliary_loss_clip": 0.01102803, "auxiliary_loss_mlp": 0.01008399, "balance_loss_clip": 1.0285213, "balance_loss_mlp": 1.0036782, "epoch": 0.0958335838393555, "flos": 71297979315840.0, "grad_norm": 0.9598719269203762, "language_loss": 0.61594796, "learning_rate": 3.954781658395176e-06, "loss": 0.63705993, "num_input_tokens_seen": 17023215, "step": 797, "time_per_iteration": 3.097337245941162 }, { "auxiliary_loss_clip": 0.01224686, "auxiliary_loss_mlp": 0.01038883, "balance_loss_clip": 1.06469059, "balance_loss_mlp": 1.02677083, "epoch": 0.09595382672999458, "flos": 21872974504320.0, "grad_norm": 3.0385188852364826, "language_loss": 0.92234981, "learning_rate": 3.95461680382436e-06, "loss": 0.94498551, "num_input_tokens_seen": 17042140, "step": 798, "time_per_iteration": 3.3518388271331787 }, { "auxiliary_loss_clip": 0.01240478, "auxiliary_loss_mlp": 0.01042074, "balance_loss_clip": 1.06897712, "balance_loss_mlp": 1.02977157, "epoch": 0.09607406962063368, "flos": 18695562341760.0, "grad_norm": 3.224595268756587, "language_loss": 0.86197603, "learning_rate": 3.9544516527397295e-06, "loss": 0.88480151, "num_input_tokens_seen": 17058490, "step": 799, "time_per_iteration": 2.55964994430542 }, { "auxiliary_loss_clip": 0.01204065, "auxiliary_loss_mlp": 0.01035895, "balance_loss_clip": 1.06159639, "balance_loss_mlp": 1.02425396, "epoch": 0.09619431251127276, "flos": 22568456615040.0, "grad_norm": 2.21245382892145, "language_loss": 0.80572426, "learning_rate": 3.954286205166338e-06, "loss": 0.82812387, "num_input_tokens_seen": 17079655, "step": 800, "time_per_iteration": 3.525792360305786 }, { "auxiliary_loss_clip": 0.012428, "auxiliary_loss_mlp": 0.01043224, "balance_loss_clip": 1.07241571, "balance_loss_mlp": 1.03049839, "epoch": 0.09631455540191186, "flos": 14246230608000.0, "grad_norm": 8.926001993125734, "language_loss": 0.83849692, "learning_rate": 3.954120461129282e-06, "loss": 0.86135709, "num_input_tokens_seen": 17097065, "step": 801, "time_per_iteration": 3.286698818206787 }, { "auxiliary_loss_clip": 0.01256027, "auxiliary_loss_mlp": 0.010466, "balance_loss_clip": 1.07260573, "balance_loss_mlp": 1.03525114, "epoch": 0.09643479829255096, "flos": 20740387789440.0, "grad_norm": 2.315680098338758, "language_loss": 0.84151202, "learning_rate": 3.953954420653706e-06, "loss": 0.86453825, "num_input_tokens_seen": 17114090, "step": 802, "time_per_iteration": 2.5248186588287354 }, { "auxiliary_loss_clip": 0.01232975, "auxiliary_loss_mlp": 0.01043224, "balance_loss_clip": 1.06682491, "balance_loss_mlp": 1.03179741, "epoch": 0.09655504118319004, "flos": 24420476833920.0, "grad_norm": 2.0181071838890965, "language_loss": 0.88420212, "learning_rate": 3.953788083764798e-06, "loss": 0.90696412, "num_input_tokens_seen": 17133325, "step": 803, "time_per_iteration": 2.5352556705474854 }, { "auxiliary_loss_clip": 0.01189739, "auxiliary_loss_mlp": 0.01048035, "balance_loss_clip": 1.06242228, "balance_loss_mlp": 1.0364002, "epoch": 0.09667528407382914, "flos": 18441961344000.0, "grad_norm": 2.496995321964076, "language_loss": 0.92321628, "learning_rate": 3.953621450487792e-06, "loss": 0.94559401, "num_input_tokens_seen": 17151945, "step": 804, "time_per_iteration": 2.595557928085327 }, { "auxiliary_loss_clip": 0.01142567, "auxiliary_loss_mlp": 0.0101372, "balance_loss_clip": 1.03245211, "balance_loss_mlp": 1.00902319, "epoch": 0.09679552696446822, "flos": 70816455544320.0, "grad_norm": 0.8332030199490637, "language_loss": 0.61185551, "learning_rate": 3.953454520847964e-06, "loss": 0.63341832, "num_input_tokens_seen": 17216790, "step": 805, "time_per_iteration": 3.22357439994812 }, { "auxiliary_loss_clip": 0.0121738, "auxiliary_loss_mlp": 0.01045988, "balance_loss_clip": 1.06415582, "balance_loss_mlp": 1.03172421, "epoch": 0.09691576985510732, "flos": 21945514020480.0, "grad_norm": 1.9788814838925464, "language_loss": 0.73775649, "learning_rate": 3.9532872948706395e-06, "loss": 0.76039016, "num_input_tokens_seen": 17236285, "step": 806, "time_per_iteration": 2.7168655395507812 }, { "auxiliary_loss_clip": 0.01220711, "auxiliary_loss_mlp": 0.01047229, "balance_loss_clip": 1.06429696, "balance_loss_mlp": 1.03431857, "epoch": 0.09703601274574641, "flos": 17965211103360.0, "grad_norm": 2.846603981391811, "language_loss": 0.83077765, "learning_rate": 3.9531197725811845e-06, "loss": 0.85345703, "num_input_tokens_seen": 17251670, "step": 807, "time_per_iteration": 2.551278591156006 }, { "auxiliary_loss_clip": 0.01252512, "auxiliary_loss_mlp": 0.01048074, "balance_loss_clip": 1.07249331, "balance_loss_mlp": 1.03608131, "epoch": 0.0971562556363855, "flos": 22162162901760.0, "grad_norm": 2.3128161223949384, "language_loss": 0.88030207, "learning_rate": 3.952951954005013e-06, "loss": 0.90330791, "num_input_tokens_seen": 17271355, "step": 808, "time_per_iteration": 2.4848601818084717 }, { "auxiliary_loss_clip": 0.01216005, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.06033742, "balance_loss_mlp": 1.0267663, "epoch": 0.0972764985270246, "flos": 25848716394240.0, "grad_norm": 1.8299890768925609, "language_loss": 0.84859526, "learning_rate": 3.952783839167584e-06, "loss": 0.8711338, "num_input_tokens_seen": 17291400, "step": 809, "time_per_iteration": 2.5771279335021973 }, { "auxiliary_loss_clip": 0.01233059, "auxiliary_loss_mlp": 0.01050485, "balance_loss_clip": 1.06492054, "balance_loss_mlp": 1.03826046, "epoch": 0.09739674141766368, "flos": 20339373375360.0, "grad_norm": 2.4756381520636106, "language_loss": 0.74346733, "learning_rate": 3.952615428094398e-06, "loss": 0.76630276, "num_input_tokens_seen": 17310920, "step": 810, "time_per_iteration": 2.5418622493743896 }, { "auxiliary_loss_clip": 0.01179405, "auxiliary_loss_mlp": 0.0104426, "balance_loss_clip": 1.05624688, "balance_loss_mlp": 1.03253555, "epoch": 0.09751698430830277, "flos": 15743059188480.0, "grad_norm": 1.8843149909741272, "language_loss": 0.73117381, "learning_rate": 3.952446720811004e-06, "loss": 0.75341046, "num_input_tokens_seen": 17329245, "step": 811, "time_per_iteration": 2.5892035961151123 }, { "auxiliary_loss_clip": 0.01098297, "auxiliary_loss_mlp": 0.01012904, "balance_loss_clip": 1.02645946, "balance_loss_mlp": 1.00827909, "epoch": 0.09763722719894186, "flos": 63716806800000.0, "grad_norm": 0.9533060309205846, "language_loss": 0.63613534, "learning_rate": 3.952277717342995e-06, "loss": 0.65724736, "num_input_tokens_seen": 17395680, "step": 812, "time_per_iteration": 3.256331205368042 }, { "auxiliary_loss_clip": 0.01225391, "auxiliary_loss_mlp": 0.01047311, "balance_loss_clip": 1.06686258, "balance_loss_mlp": 1.03509188, "epoch": 0.09775747008958095, "flos": 22090916275200.0, "grad_norm": 3.7191780324000128, "language_loss": 0.85725152, "learning_rate": 3.952108417716009e-06, "loss": 0.87997854, "num_input_tokens_seen": 17415135, "step": 813, "time_per_iteration": 2.5758559703826904 }, { "auxiliary_loss_clip": 0.01238119, "auxiliary_loss_mlp": 0.01037425, "balance_loss_clip": 1.06992388, "balance_loss_mlp": 1.02531981, "epoch": 0.09787771298022005, "flos": 21286050272640.0, "grad_norm": 2.002374886920885, "language_loss": 0.85238481, "learning_rate": 3.951938821955727e-06, "loss": 0.87514025, "num_input_tokens_seen": 17434535, "step": 814, "time_per_iteration": 2.526736259460449 }, { "auxiliary_loss_clip": 0.01220191, "auxiliary_loss_mlp": 0.01054191, "balance_loss_clip": 1.06682432, "balance_loss_mlp": 1.04103017, "epoch": 0.09799795587085913, "flos": 22054574689920.0, "grad_norm": 1.761950191157259, "language_loss": 0.76614404, "learning_rate": 3.9517689300878786e-06, "loss": 0.78888786, "num_input_tokens_seen": 17454270, "step": 815, "time_per_iteration": 2.563603162765503 }, { "auxiliary_loss_clip": 0.0124907, "auxiliary_loss_mlp": 0.0104743, "balance_loss_clip": 1.06668615, "balance_loss_mlp": 1.035074, "epoch": 0.09811819876149823, "flos": 22163743100160.0, "grad_norm": 1.6154304050276231, "language_loss": 0.78369677, "learning_rate": 3.951598742138236e-06, "loss": 0.80666184, "num_input_tokens_seen": 17472995, "step": 816, "time_per_iteration": 2.5028696060180664 }, { "auxiliary_loss_clip": 0.01222801, "auxiliary_loss_mlp": 0.01044022, "balance_loss_clip": 1.0601058, "balance_loss_mlp": 1.03222632, "epoch": 0.09823844165213731, "flos": 22231111057920.0, "grad_norm": 2.087037870464484, "language_loss": 0.79446566, "learning_rate": 3.951428258132615e-06, "loss": 0.81713378, "num_input_tokens_seen": 17491115, "step": 817, "time_per_iteration": 2.542043685913086 }, { "auxiliary_loss_clip": 0.01221113, "auxiliary_loss_mlp": 0.00766135, "balance_loss_clip": 1.06701481, "balance_loss_mlp": 1.000049, "epoch": 0.09835868454277641, "flos": 22487728798080.0, "grad_norm": 1.8101882484421883, "language_loss": 0.84654403, "learning_rate": 3.951257478096879e-06, "loss": 0.86641657, "num_input_tokens_seen": 17509480, "step": 818, "time_per_iteration": 2.5610127449035645 }, { "auxiliary_loss_clip": 0.01225898, "auxiliary_loss_mlp": 0.00767, "balance_loss_clip": 1.06874037, "balance_loss_mlp": 1.00004125, "epoch": 0.0984789274334155, "flos": 16362554077440.0, "grad_norm": 11.664055505286736, "language_loss": 0.68325204, "learning_rate": 3.951086402056936e-06, "loss": 0.70318103, "num_input_tokens_seen": 17524080, "step": 819, "time_per_iteration": 2.517002582550049 }, { "auxiliary_loss_clip": 0.01151403, "auxiliary_loss_mlp": 0.01040122, "balance_loss_clip": 1.05917859, "balance_loss_mlp": 1.02807021, "epoch": 0.09859917032405459, "flos": 24243545416320.0, "grad_norm": 1.6781942512696926, "language_loss": 0.83340466, "learning_rate": 3.950915030038735e-06, "loss": 0.85531998, "num_input_tokens_seen": 17543875, "step": 820, "time_per_iteration": 2.673495292663574 }, { "auxiliary_loss_clip": 0.01231708, "auxiliary_loss_mlp": 0.0104036, "balance_loss_clip": 1.06697381, "balance_loss_mlp": 1.02834392, "epoch": 0.09871941321469369, "flos": 17420195064960.0, "grad_norm": 2.1538299088470545, "language_loss": 0.83764315, "learning_rate": 3.9507433620682765e-06, "loss": 0.86036384, "num_input_tokens_seen": 17560810, "step": 821, "time_per_iteration": 3.302797794342041 }, { "auxiliary_loss_clip": 0.01204123, "auxiliary_loss_mlp": 0.01044282, "balance_loss_clip": 1.06205118, "balance_loss_mlp": 1.03183103, "epoch": 0.09883965610533277, "flos": 28477341590400.0, "grad_norm": 1.8392728428405152, "language_loss": 0.88368249, "learning_rate": 3.9505713981716e-06, "loss": 0.90616655, "num_input_tokens_seen": 17583640, "step": 822, "time_per_iteration": 2.639491319656372 }, { "auxiliary_loss_clip": 0.01217438, "auxiliary_loss_mlp": 0.01040398, "balance_loss_clip": 1.06719327, "balance_loss_mlp": 1.02942514, "epoch": 0.09895989899597187, "flos": 23693932437120.0, "grad_norm": 1.7954747837844185, "language_loss": 0.8082515, "learning_rate": 3.950399138374795e-06, "loss": 0.83082992, "num_input_tokens_seen": 17602720, "step": 823, "time_per_iteration": 2.5586977005004883 }, { "auxiliary_loss_clip": 0.01235169, "auxiliary_loss_mlp": 0.0104385, "balance_loss_clip": 1.06695068, "balance_loss_mlp": 1.03111267, "epoch": 0.09908014188661095, "flos": 24679608526080.0, "grad_norm": 1.5718540344801786, "language_loss": 0.74346745, "learning_rate": 3.95022658270399e-06, "loss": 0.76625764, "num_input_tokens_seen": 17623085, "step": 824, "time_per_iteration": 2.5423552989959717 }, { "auxiliary_loss_clip": 0.01218675, "auxiliary_loss_mlp": 0.01043008, "balance_loss_clip": 1.06728852, "balance_loss_mlp": 1.03121233, "epoch": 0.09920038477725004, "flos": 14064307200000.0, "grad_norm": 1.8990821428257043, "language_loss": 0.78294086, "learning_rate": 3.9500537311853635e-06, "loss": 0.80555767, "num_input_tokens_seen": 17641040, "step": 825, "time_per_iteration": 3.282687187194824 }, { "auxiliary_loss_clip": 0.0123229, "auxiliary_loss_mlp": 0.01039592, "balance_loss_clip": 1.06240439, "balance_loss_mlp": 1.02661586, "epoch": 0.09932062766788914, "flos": 13407070095360.0, "grad_norm": 2.3387405094626996, "language_loss": 0.83182621, "learning_rate": 3.949880583845136e-06, "loss": 0.854545, "num_input_tokens_seen": 17659115, "step": 826, "time_per_iteration": 2.4942092895507812 }, { "auxiliary_loss_clip": 0.01218665, "auxiliary_loss_mlp": 0.01036333, "balance_loss_clip": 1.06479549, "balance_loss_mlp": 1.02437007, "epoch": 0.09944087055852822, "flos": 19500751566720.0, "grad_norm": 1.7635335002744208, "language_loss": 0.8168053, "learning_rate": 3.949707140709575e-06, "loss": 0.83935529, "num_input_tokens_seen": 17678845, "step": 827, "time_per_iteration": 3.41426157951355 }, { "auxiliary_loss_clip": 0.0123503, "auxiliary_loss_mlp": 0.01041133, "balance_loss_clip": 1.06363916, "balance_loss_mlp": 1.02837777, "epoch": 0.09956111344916732, "flos": 17749100926080.0, "grad_norm": 2.188741142924056, "language_loss": 0.83308917, "learning_rate": 3.949533401804991e-06, "loss": 0.85585082, "num_input_tokens_seen": 17695750, "step": 828, "time_per_iteration": 3.250134229660034 }, { "auxiliary_loss_clip": 0.01233639, "auxiliary_loss_mlp": 0.00766858, "balance_loss_clip": 1.06704879, "balance_loss_mlp": 0.99998927, "epoch": 0.0996813563398064, "flos": 17967581400960.0, "grad_norm": 1.9560624122765464, "language_loss": 0.90871429, "learning_rate": 3.949359367157739e-06, "loss": 0.92871928, "num_input_tokens_seen": 17714445, "step": 829, "time_per_iteration": 2.4974539279937744 }, { "auxiliary_loss_clip": 0.01239137, "auxiliary_loss_mlp": 0.01043372, "balance_loss_clip": 1.06791723, "balance_loss_mlp": 1.03074765, "epoch": 0.0998015992304455, "flos": 17457039440640.0, "grad_norm": 1.9942286437401784, "language_loss": 0.75674176, "learning_rate": 3.949185036794222e-06, "loss": 0.77956688, "num_input_tokens_seen": 17732455, "step": 830, "time_per_iteration": 2.5000433921813965 }, { "auxiliary_loss_clip": 0.01249127, "auxiliary_loss_mlp": 0.01044695, "balance_loss_clip": 1.06883395, "balance_loss_mlp": 1.03331614, "epoch": 0.0999218421210846, "flos": 25888757080320.0, "grad_norm": 1.6033214206592308, "language_loss": 0.78983754, "learning_rate": 3.949010410740884e-06, "loss": 0.81277573, "num_input_tokens_seen": 17755280, "step": 831, "time_per_iteration": 2.525277853012085 }, { "auxiliary_loss_clip": 0.01208779, "auxiliary_loss_mlp": 0.0076628, "balance_loss_clip": 1.06050897, "balance_loss_mlp": 1.00001538, "epoch": 0.10004208501172368, "flos": 21215916967680.0, "grad_norm": 1.62522100689321, "language_loss": 0.86340111, "learning_rate": 3.948835489024216e-06, "loss": 0.88315165, "num_input_tokens_seen": 17775015, "step": 832, "time_per_iteration": 2.541314125061035 }, { "auxiliary_loss_clip": 0.01236704, "auxiliary_loss_mlp": 0.01045288, "balance_loss_clip": 1.06627107, "balance_loss_mlp": 1.03300381, "epoch": 0.10016232790236278, "flos": 17348409734400.0, "grad_norm": 3.1618027524712726, "language_loss": 0.90499282, "learning_rate": 3.948660271670755e-06, "loss": 0.9278127, "num_input_tokens_seen": 17792165, "step": 833, "time_per_iteration": 2.4987680912017822 }, { "auxiliary_loss_clip": 0.01215297, "auxiliary_loss_mlp": 0.01044734, "balance_loss_clip": 1.06385088, "balance_loss_mlp": 1.03330183, "epoch": 0.10028257079300186, "flos": 25666541591040.0, "grad_norm": 2.5402864867469956, "language_loss": 0.84196162, "learning_rate": 3.948484758707079e-06, "loss": 0.86456198, "num_input_tokens_seen": 17811765, "step": 834, "time_per_iteration": 2.575428009033203 }, { "auxiliary_loss_clip": 0.01193654, "auxiliary_loss_mlp": 0.01041632, "balance_loss_clip": 1.05822778, "balance_loss_mlp": 1.02848887, "epoch": 0.10040281368364096, "flos": 25156035544320.0, "grad_norm": 2.205290033562829, "language_loss": 0.83342886, "learning_rate": 3.948308950159815e-06, "loss": 0.85578179, "num_input_tokens_seen": 17830445, "step": 835, "time_per_iteration": 2.605041980743408 }, { "auxiliary_loss_clip": 0.01198919, "auxiliary_loss_mlp": 0.01047501, "balance_loss_clip": 1.05968094, "balance_loss_mlp": 1.03349376, "epoch": 0.10052305657428004, "flos": 17603303621760.0, "grad_norm": 3.343470919223055, "language_loss": 0.75939095, "learning_rate": 3.9481328460556326e-06, "loss": 0.78185511, "num_input_tokens_seen": 17847665, "step": 836, "time_per_iteration": 2.577462911605835 }, { "auxiliary_loss_clip": 0.01209342, "auxiliary_loss_mlp": 0.01038248, "balance_loss_clip": 1.06079316, "balance_loss_mlp": 1.0261606, "epoch": 0.10064329946491914, "flos": 18660154510080.0, "grad_norm": 2.119490816381886, "language_loss": 0.89630353, "learning_rate": 3.9479564464212455e-06, "loss": 0.91877943, "num_input_tokens_seen": 17866825, "step": 837, "time_per_iteration": 2.529484987258911 }, { "auxiliary_loss_clip": 0.01255556, "auxiliary_loss_mlp": 0.01040527, "balance_loss_clip": 1.06921005, "balance_loss_mlp": 1.02846348, "epoch": 0.10076354235555823, "flos": 17199056983680.0, "grad_norm": 2.4916729210920843, "language_loss": 0.76543212, "learning_rate": 3.947779751283414e-06, "loss": 0.7883929, "num_input_tokens_seen": 17883995, "step": 838, "time_per_iteration": 2.4439988136291504 }, { "auxiliary_loss_clip": 0.01237269, "auxiliary_loss_mlp": 0.00767009, "balance_loss_clip": 1.07233191, "balance_loss_mlp": 0.99994469, "epoch": 0.10088378524619732, "flos": 22962252395520.0, "grad_norm": 1.7667242844680333, "language_loss": 0.76098895, "learning_rate": 3.947602760668944e-06, "loss": 0.78103173, "num_input_tokens_seen": 17903785, "step": 839, "time_per_iteration": 2.494786262512207 }, { "auxiliary_loss_clip": 0.012346, "auxiliary_loss_mlp": 0.01044215, "balance_loss_clip": 1.06970143, "balance_loss_mlp": 1.03175759, "epoch": 0.10100402813683641, "flos": 37885828746240.0, "grad_norm": 2.005843276947239, "language_loss": 0.71448809, "learning_rate": 3.947425474604684e-06, "loss": 0.73727626, "num_input_tokens_seen": 17927720, "step": 840, "time_per_iteration": 2.651487112045288 }, { "auxiliary_loss_clip": 0.01217405, "auxiliary_loss_mlp": 0.01048681, "balance_loss_clip": 1.06426644, "balance_loss_mlp": 1.03670037, "epoch": 0.1011242710274755, "flos": 21543458112000.0, "grad_norm": 2.5419988743685646, "language_loss": 0.92191446, "learning_rate": 3.947247893117528e-06, "loss": 0.94457537, "num_input_tokens_seen": 17946225, "step": 841, "time_per_iteration": 2.5118865966796875 }, { "auxiliary_loss_clip": 0.01230428, "auxiliary_loss_mlp": 0.01046582, "balance_loss_clip": 1.06437933, "balance_loss_mlp": 1.03419077, "epoch": 0.10124451391811459, "flos": 13621456419840.0, "grad_norm": 3.355128122581476, "language_loss": 0.69190425, "learning_rate": 3.947070016234413e-06, "loss": 0.71467435, "num_input_tokens_seen": 17962015, "step": 842, "time_per_iteration": 2.452122688293457 }, { "auxiliary_loss_clip": 0.01229044, "auxiliary_loss_mlp": 0.01042643, "balance_loss_clip": 1.06816494, "balance_loss_mlp": 1.02964902, "epoch": 0.10136475680875369, "flos": 16649228522880.0, "grad_norm": 2.8394286288835353, "language_loss": 0.74950635, "learning_rate": 3.946891843982326e-06, "loss": 0.77222323, "num_input_tokens_seen": 17979680, "step": 843, "time_per_iteration": 2.517972707748413 }, { "auxiliary_loss_clip": 0.0123458, "auxiliary_loss_mlp": 0.01040493, "balance_loss_clip": 1.06779242, "balance_loss_mlp": 1.02793479, "epoch": 0.10148499969939277, "flos": 19461034103040.0, "grad_norm": 2.3815770918193384, "language_loss": 0.74619412, "learning_rate": 3.9467133763882935e-06, "loss": 0.76894486, "num_input_tokens_seen": 17998145, "step": 844, "time_per_iteration": 2.463911533355713 }, { "auxiliary_loss_clip": 0.01223991, "auxiliary_loss_mlp": 0.01045653, "balance_loss_clip": 1.06532001, "balance_loss_mlp": 1.03283191, "epoch": 0.10160524259003187, "flos": 21104988791040.0, "grad_norm": 2.137850562545883, "language_loss": 0.8633101, "learning_rate": 3.9465346134793905e-06, "loss": 0.88600647, "num_input_tokens_seen": 18017955, "step": 845, "time_per_iteration": 2.5580990314483643 }, { "auxiliary_loss_clip": 0.01206699, "auxiliary_loss_mlp": 0.0104082, "balance_loss_clip": 1.0662179, "balance_loss_mlp": 1.02924466, "epoch": 0.10172548548067095, "flos": 17712687513600.0, "grad_norm": 1.9391952462209916, "language_loss": 0.79561269, "learning_rate": 3.9463555552827335e-06, "loss": 0.81808794, "num_input_tokens_seen": 18035125, "step": 846, "time_per_iteration": 2.539644718170166 }, { "auxiliary_loss_clip": 0.012232, "auxiliary_loss_mlp": 0.01048094, "balance_loss_clip": 1.06467104, "balance_loss_mlp": 1.03598249, "epoch": 0.10184572837131005, "flos": 21104845136640.0, "grad_norm": 2.3799335297335693, "language_loss": 0.86475122, "learning_rate": 3.946176201825487e-06, "loss": 0.88746417, "num_input_tokens_seen": 18053160, "step": 847, "time_per_iteration": 3.2794134616851807 }, { "auxiliary_loss_clip": 0.01220933, "auxiliary_loss_mlp": 0.01048109, "balance_loss_clip": 1.06788039, "balance_loss_mlp": 1.03592002, "epoch": 0.10196597126194913, "flos": 26067591918720.0, "grad_norm": 1.8757728826935736, "language_loss": 0.83539331, "learning_rate": 3.9459965531348575e-06, "loss": 0.85808378, "num_input_tokens_seen": 18072815, "step": 848, "time_per_iteration": 2.5506062507629395 }, { "auxiliary_loss_clip": 0.01220936, "auxiliary_loss_mlp": 0.00766942, "balance_loss_clip": 1.06723249, "balance_loss_mlp": 1.00012016, "epoch": 0.10208621415258823, "flos": 29314634595840.0, "grad_norm": 2.1663592208987756, "language_loss": 0.85718745, "learning_rate": 3.945816609238098e-06, "loss": 0.87706625, "num_input_tokens_seen": 18092225, "step": 849, "time_per_iteration": 2.592404842376709 }, { "auxiliary_loss_clip": 0.01180991, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.06252837, "balance_loss_mlp": 1.0308001, "epoch": 0.10220645704322733, "flos": 23805794367360.0, "grad_norm": 2.5953664706914634, "language_loss": 0.85040855, "learning_rate": 3.945636370162507e-06, "loss": 0.8726536, "num_input_tokens_seen": 18112335, "step": 850, "time_per_iteration": 2.6394407749176025 }, { "auxiliary_loss_clip": 0.01231576, "auxiliary_loss_mlp": 0.01047507, "balance_loss_clip": 1.0672524, "balance_loss_mlp": 1.03615785, "epoch": 0.10232669993386641, "flos": 23218546913280.0, "grad_norm": 2.0973420090993704, "language_loss": 0.79131997, "learning_rate": 3.945455835935425e-06, "loss": 0.81411076, "num_input_tokens_seen": 18131520, "step": 851, "time_per_iteration": 2.672485589981079 }, { "auxiliary_loss_clip": 0.01220252, "auxiliary_loss_mlp": 0.01046661, "balance_loss_clip": 1.06529629, "balance_loss_mlp": 1.03484726, "epoch": 0.1024469428245055, "flos": 22922929981440.0, "grad_norm": 2.166325375407431, "language_loss": 0.75056547, "learning_rate": 3.94527500658424e-06, "loss": 0.77323461, "num_input_tokens_seen": 18149185, "step": 852, "time_per_iteration": 3.268260955810547 }, { "auxiliary_loss_clip": 0.01188679, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.06291997, "balance_loss_mlp": 1.02733111, "epoch": 0.10256718571514459, "flos": 31359495957120.0, "grad_norm": 1.9073348344713514, "language_loss": 0.8100422, "learning_rate": 3.945093882136382e-06, "loss": 0.83231461, "num_input_tokens_seen": 18172960, "step": 853, "time_per_iteration": 3.671428918838501 }, { "auxiliary_loss_clip": 0.01216055, "auxiliary_loss_mlp": 0.00765419, "balance_loss_clip": 1.06689453, "balance_loss_mlp": 1.00006795, "epoch": 0.10268742860578368, "flos": 23474877344640.0, "grad_norm": 2.0369046488082856, "language_loss": 0.84509921, "learning_rate": 3.944912462619329e-06, "loss": 0.86491388, "num_input_tokens_seen": 18191925, "step": 854, "time_per_iteration": 2.6228551864624023 }, { "auxiliary_loss_clip": 0.01221666, "auxiliary_loss_mlp": 0.01049189, "balance_loss_clip": 1.06585908, "balance_loss_mlp": 1.03633189, "epoch": 0.10280767149642277, "flos": 25520313323520.0, "grad_norm": 3.404673092178469, "language_loss": 0.80233961, "learning_rate": 3.9447307480606025e-06, "loss": 0.82504815, "num_input_tokens_seen": 18212010, "step": 855, "time_per_iteration": 3.3405556678771973 }, { "auxiliary_loss_clip": 0.01214746, "auxiliary_loss_mlp": 0.01043223, "balance_loss_clip": 1.06535614, "balance_loss_mlp": 1.03015125, "epoch": 0.10292791438706186, "flos": 17347691462400.0, "grad_norm": 3.003890064888439, "language_loss": 0.89972758, "learning_rate": 3.944548738487767e-06, "loss": 0.92230725, "num_input_tokens_seen": 18229525, "step": 856, "time_per_iteration": 2.5274417400360107 }, { "auxiliary_loss_clip": 0.01257635, "auxiliary_loss_mlp": 0.01044011, "balance_loss_clip": 1.07388747, "balance_loss_mlp": 1.03259122, "epoch": 0.10304815727770096, "flos": 27052693390080.0, "grad_norm": 2.100832403580187, "language_loss": 0.90825057, "learning_rate": 3.944366433928434e-06, "loss": 0.93126702, "num_input_tokens_seen": 18249505, "step": 857, "time_per_iteration": 2.5308327674865723 }, { "auxiliary_loss_clip": 0.01213004, "auxiliary_loss_mlp": 0.01047817, "balance_loss_clip": 1.06232917, "balance_loss_mlp": 1.03597975, "epoch": 0.10316840016834004, "flos": 22782591544320.0, "grad_norm": 2.237102036843472, "language_loss": 0.83802682, "learning_rate": 3.9441838344102594e-06, "loss": 0.86063504, "num_input_tokens_seen": 18269230, "step": 858, "time_per_iteration": 2.627354860305786 }, { "auxiliary_loss_clip": 0.01226715, "auxiliary_loss_mlp": 0.01042823, "balance_loss_clip": 1.06885815, "balance_loss_mlp": 1.03116405, "epoch": 0.10328864305897914, "flos": 20704584908160.0, "grad_norm": 3.303488850851589, "language_loss": 0.67042291, "learning_rate": 3.944000939960943e-06, "loss": 0.69311827, "num_input_tokens_seen": 18287955, "step": 859, "time_per_iteration": 2.5827934741973877 }, { "auxiliary_loss_clip": 0.01237978, "auxiliary_loss_mlp": 0.01039218, "balance_loss_clip": 1.06616306, "balance_loss_mlp": 1.02867961, "epoch": 0.10340888594961822, "flos": 28478814048000.0, "grad_norm": 6.370256847344052, "language_loss": 0.80057919, "learning_rate": 3.943817750608229e-06, "loss": 0.8233512, "num_input_tokens_seen": 18310505, "step": 860, "time_per_iteration": 2.584465265274048 }, { "auxiliary_loss_clip": 0.01241111, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.07246232, "balance_loss_mlp": 1.0286541, "epoch": 0.10352912884025732, "flos": 13370333460480.0, "grad_norm": 2.175762921988747, "language_loss": 0.81847197, "learning_rate": 3.943634266379908e-06, "loss": 0.84128356, "num_input_tokens_seen": 18327400, "step": 861, "time_per_iteration": 2.47428822517395 }, { "auxiliary_loss_clip": 0.01237188, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.06687546, "balance_loss_mlp": 1.02873671, "epoch": 0.10364937173089642, "flos": 25558558329600.0, "grad_norm": 1.8234020500404189, "language_loss": 0.84948528, "learning_rate": 3.943450487303815e-06, "loss": 0.87225908, "num_input_tokens_seen": 18347895, "step": 862, "time_per_iteration": 2.5316033363342285 }, { "auxiliary_loss_clip": 0.01232499, "auxiliary_loss_mlp": 0.01039847, "balance_loss_clip": 1.06812179, "balance_loss_mlp": 1.02818191, "epoch": 0.1037696146215355, "flos": 21215486004480.0, "grad_norm": 1.7833454724004991, "language_loss": 0.85031354, "learning_rate": 3.943266413407827e-06, "loss": 0.87303698, "num_input_tokens_seen": 18367170, "step": 863, "time_per_iteration": 2.4865143299102783 }, { "auxiliary_loss_clip": 0.01237765, "auxiliary_loss_mlp": 0.01043358, "balance_loss_clip": 1.07003617, "balance_loss_mlp": 1.03169918, "epoch": 0.1038898575121746, "flos": 25807382818560.0, "grad_norm": 1.8825425416363033, "language_loss": 0.84739596, "learning_rate": 3.94308204471987e-06, "loss": 0.87020719, "num_input_tokens_seen": 18386185, "step": 864, "time_per_iteration": 2.539384365081787 }, { "auxiliary_loss_clip": 0.01205002, "auxiliary_loss_mlp": 0.01035991, "balance_loss_clip": 1.06376529, "balance_loss_mlp": 1.02414763, "epoch": 0.10401010040281368, "flos": 19062425900160.0, "grad_norm": 3.319257796739146, "language_loss": 0.7492885, "learning_rate": 3.942897381267912e-06, "loss": 0.77169847, "num_input_tokens_seen": 18402550, "step": 865, "time_per_iteration": 2.544003486633301 }, { "auxiliary_loss_clip": 0.01243042, "auxiliary_loss_mlp": 0.01034302, "balance_loss_clip": 1.07225144, "balance_loss_mlp": 1.02283382, "epoch": 0.10413034329345278, "flos": 16355119962240.0, "grad_norm": 2.7712562649215355, "language_loss": 0.65953261, "learning_rate": 3.942712423079965e-06, "loss": 0.68230605, "num_input_tokens_seen": 18418940, "step": 866, "time_per_iteration": 2.4790189266204834 }, { "auxiliary_loss_clip": 0.01184209, "auxiliary_loss_mlp": 0.01035459, "balance_loss_clip": 1.0545032, "balance_loss_mlp": 1.02528405, "epoch": 0.10425058618409186, "flos": 17236511890560.0, "grad_norm": 2.526350744957532, "language_loss": 0.90156156, "learning_rate": 3.942527170184088e-06, "loss": 0.92375827, "num_input_tokens_seen": 18435560, "step": 867, "time_per_iteration": 2.5399932861328125 }, { "auxiliary_loss_clip": 0.01253517, "auxiliary_loss_mlp": 0.01041219, "balance_loss_clip": 1.07183504, "balance_loss_mlp": 1.0294168, "epoch": 0.10437082907473096, "flos": 17967365919360.0, "grad_norm": 2.264853869802826, "language_loss": 0.77554595, "learning_rate": 3.942341622608385e-06, "loss": 0.79849333, "num_input_tokens_seen": 18452590, "step": 868, "time_per_iteration": 2.451336145401001 }, { "auxiliary_loss_clip": 0.01222189, "auxiliary_loss_mlp": 0.01043724, "balance_loss_clip": 1.0691812, "balance_loss_mlp": 1.03211904, "epoch": 0.10449107196537005, "flos": 36283315374720.0, "grad_norm": 2.001293168578348, "language_loss": 0.78016436, "learning_rate": 3.942155780381001e-06, "loss": 0.80282348, "num_input_tokens_seen": 18476325, "step": 869, "time_per_iteration": 2.6595215797424316 }, { "auxiliary_loss_clip": 0.01220711, "auxiliary_loss_mlp": 0.0104597, "balance_loss_clip": 1.06374049, "balance_loss_mlp": 1.03341126, "epoch": 0.10461131485600914, "flos": 23802095266560.0, "grad_norm": 1.9106085158732284, "language_loss": 0.75781316, "learning_rate": 3.94196964353013e-06, "loss": 0.78047997, "num_input_tokens_seen": 18495775, "step": 870, "time_per_iteration": 2.5559327602386475 }, { "auxiliary_loss_clip": 0.01216956, "auxiliary_loss_mlp": 0.00765485, "balance_loss_clip": 1.06374252, "balance_loss_mlp": 1.00012648, "epoch": 0.10473155774664823, "flos": 18405476104320.0, "grad_norm": 1.9737760200382524, "language_loss": 0.80670154, "learning_rate": 3.941783212084008e-06, "loss": 0.82652593, "num_input_tokens_seen": 18513530, "step": 871, "time_per_iteration": 2.5382020473480225 }, { "auxiliary_loss_clip": 0.01205355, "auxiliary_loss_mlp": 0.01044931, "balance_loss_clip": 1.06511593, "balance_loss_mlp": 1.03271246, "epoch": 0.10485180063728732, "flos": 25592637358080.0, "grad_norm": 2.4557297627007997, "language_loss": 0.79317546, "learning_rate": 3.941596486070916e-06, "loss": 0.81567836, "num_input_tokens_seen": 18531575, "step": 872, "time_per_iteration": 2.588190793991089 }, { "auxiliary_loss_clip": 0.01186879, "auxiliary_loss_mlp": 0.01036622, "balance_loss_clip": 1.06414151, "balance_loss_mlp": 1.02384257, "epoch": 0.10497204352792641, "flos": 27088747666560.0, "grad_norm": 2.1412634343711154, "language_loss": 0.58533347, "learning_rate": 3.941409465519182e-06, "loss": 0.60756838, "num_input_tokens_seen": 18552100, "step": 873, "time_per_iteration": 2.6472792625427246 }, { "auxiliary_loss_clip": 0.0122552, "auxiliary_loss_mlp": 0.01044806, "balance_loss_clip": 1.06407785, "balance_loss_mlp": 1.03165078, "epoch": 0.10509228641856551, "flos": 32858479353600.0, "grad_norm": 1.6447512575985188, "language_loss": 0.85198975, "learning_rate": 3.941222150457176e-06, "loss": 0.87469304, "num_input_tokens_seen": 18575355, "step": 874, "time_per_iteration": 3.423719882965088 }, { "auxiliary_loss_clip": 0.01240177, "auxiliary_loss_mlp": 0.01038778, "balance_loss_clip": 1.0676465, "balance_loss_mlp": 1.02667212, "epoch": 0.10521252930920459, "flos": 14319165173760.0, "grad_norm": 2.8463727202923286, "language_loss": 0.71149546, "learning_rate": 3.941034540913311e-06, "loss": 0.734285, "num_input_tokens_seen": 18592885, "step": 875, "time_per_iteration": 2.475145101547241 }, { "auxiliary_loss_clip": 0.01236973, "auxiliary_loss_mlp": 0.00766624, "balance_loss_clip": 1.07008505, "balance_loss_mlp": 1.00016022, "epoch": 0.10533277219984369, "flos": 21687028773120.0, "grad_norm": 1.5962355853127843, "language_loss": 0.82411563, "learning_rate": 3.940846636916051e-06, "loss": 0.84415162, "num_input_tokens_seen": 18612920, "step": 876, "time_per_iteration": 2.5428812503814697 }, { "auxiliary_loss_clip": 0.01220273, "auxiliary_loss_mlp": 0.01048191, "balance_loss_clip": 1.07053804, "balance_loss_mlp": 1.03508949, "epoch": 0.10545301509048277, "flos": 22269787027200.0, "grad_norm": 2.0829580387345055, "language_loss": 0.86610603, "learning_rate": 3.940658438493899e-06, "loss": 0.88879067, "num_input_tokens_seen": 18630765, "step": 877, "time_per_iteration": 2.545813798904419 }, { "auxiliary_loss_clip": 0.01254124, "auxiliary_loss_mlp": 0.01042168, "balance_loss_clip": 1.06617868, "balance_loss_mlp": 1.0295136, "epoch": 0.10557325798112187, "flos": 22199725549440.0, "grad_norm": 2.1651349502749775, "language_loss": 0.76163447, "learning_rate": 3.940469945675405e-06, "loss": 0.7845974, "num_input_tokens_seen": 18649150, "step": 878, "time_per_iteration": 3.246293067932129 }, { "auxiliary_loss_clip": 0.01166553, "auxiliary_loss_mlp": 0.01045867, "balance_loss_clip": 1.05816996, "balance_loss_mlp": 1.03444648, "epoch": 0.10569350087176095, "flos": 25775889569280.0, "grad_norm": 2.023790246003973, "language_loss": 0.91725224, "learning_rate": 3.940281158489163e-06, "loss": 0.93937647, "num_input_tokens_seen": 18668380, "step": 879, "time_per_iteration": 2.630768060684204 }, { "auxiliary_loss_clip": 0.01168387, "auxiliary_loss_mlp": 0.01044457, "balance_loss_clip": 1.0555619, "balance_loss_mlp": 1.03275084, "epoch": 0.10581374376240005, "flos": 17311385790720.0, "grad_norm": 7.253074076075061, "language_loss": 0.82890636, "learning_rate": 3.940092076963812e-06, "loss": 0.85103476, "num_input_tokens_seen": 18685875, "step": 880, "time_per_iteration": 3.491748809814453 }, { "auxiliary_loss_clip": 0.01216041, "auxiliary_loss_mlp": 0.01046981, "balance_loss_clip": 1.06327569, "balance_loss_mlp": 1.03451133, "epoch": 0.10593398665303914, "flos": 34349454017280.0, "grad_norm": 2.201171697155286, "language_loss": 0.78926682, "learning_rate": 3.9399027011280355e-06, "loss": 0.81189704, "num_input_tokens_seen": 18707970, "step": 881, "time_per_iteration": 2.6801040172576904 }, { "auxiliary_loss_clip": 0.0121879, "auxiliary_loss_mlp": 0.01040829, "balance_loss_clip": 1.06817174, "balance_loss_mlp": 1.02867508, "epoch": 0.10605422954367823, "flos": 23257977068160.0, "grad_norm": 2.1336259516275486, "language_loss": 0.77490532, "learning_rate": 3.939713031010561e-06, "loss": 0.7975015, "num_input_tokens_seen": 18726335, "step": 882, "time_per_iteration": 3.284637451171875 }, { "auxiliary_loss_clip": 0.0120143, "auxiliary_loss_mlp": 0.01042383, "balance_loss_clip": 1.06556129, "balance_loss_mlp": 1.02910328, "epoch": 0.10617447243431732, "flos": 22820118278400.0, "grad_norm": 2.3591702407021558, "language_loss": 0.77737093, "learning_rate": 3.939523066640163e-06, "loss": 0.7998091, "num_input_tokens_seen": 18745230, "step": 883, "time_per_iteration": 2.578439474105835 }, { "auxiliary_loss_clip": 0.01237568, "auxiliary_loss_mlp": 0.01041504, "balance_loss_clip": 1.06838441, "balance_loss_mlp": 1.02965486, "epoch": 0.10629471532495641, "flos": 24386577373440.0, "grad_norm": 1.8746076137111203, "language_loss": 0.81338352, "learning_rate": 3.939332808045657e-06, "loss": 0.83617419, "num_input_tokens_seen": 18764880, "step": 884, "time_per_iteration": 2.5560302734375 }, { "auxiliary_loss_clip": 0.0120491, "auxiliary_loss_mlp": 0.01043612, "balance_loss_clip": 1.0650779, "balance_loss_mlp": 1.03197157, "epoch": 0.1064149582155955, "flos": 21105491581440.0, "grad_norm": 1.6841616910548782, "language_loss": 0.84690976, "learning_rate": 3.939142255255906e-06, "loss": 0.86939502, "num_input_tokens_seen": 18785765, "step": 885, "time_per_iteration": 2.6163365840911865 }, { "auxiliary_loss_clip": 0.01236461, "auxiliary_loss_mlp": 0.01035274, "balance_loss_clip": 1.06873155, "balance_loss_mlp": 1.02271545, "epoch": 0.1065352011062346, "flos": 20702035042560.0, "grad_norm": 2.3705089487107935, "language_loss": 0.86455911, "learning_rate": 3.938951408299817e-06, "loss": 0.88727641, "num_input_tokens_seen": 18804605, "step": 886, "time_per_iteration": 2.5161798000335693 }, { "auxiliary_loss_clip": 0.01107957, "auxiliary_loss_mlp": 0.01022806, "balance_loss_clip": 1.05832243, "balance_loss_mlp": 1.01918221, "epoch": 0.10665544399687368, "flos": 62659632689280.0, "grad_norm": 0.8062078229451766, "language_loss": 0.5441612, "learning_rate": 3.938760267206342e-06, "loss": 0.56546885, "num_input_tokens_seen": 18866425, "step": 887, "time_per_iteration": 3.104126214981079 }, { "auxiliary_loss_clip": 0.01250889, "auxiliary_loss_mlp": 0.01039364, "balance_loss_clip": 1.0702492, "balance_loss_mlp": 1.02737117, "epoch": 0.10677568688751278, "flos": 26140382830080.0, "grad_norm": 2.2213090147609504, "language_loss": 0.79058969, "learning_rate": 3.938568832004475e-06, "loss": 0.81349224, "num_input_tokens_seen": 18885130, "step": 888, "time_per_iteration": 2.5349457263946533 }, { "auxiliary_loss_clip": 0.0121022, "auxiliary_loss_mlp": 0.01051649, "balance_loss_clip": 1.06313348, "balance_loss_mlp": 1.03900731, "epoch": 0.10689592977815186, "flos": 12786533712000.0, "grad_norm": 2.250505992968594, "language_loss": 0.75351048, "learning_rate": 3.938377102723257e-06, "loss": 0.77612913, "num_input_tokens_seen": 18902265, "step": 889, "time_per_iteration": 2.5150461196899414 }, { "auxiliary_loss_clip": 0.01171473, "auxiliary_loss_mlp": 0.01051904, "balance_loss_clip": 1.05809522, "balance_loss_mlp": 1.03889275, "epoch": 0.10701617266879096, "flos": 22126683242880.0, "grad_norm": 2.0272943104641965, "language_loss": 0.8326872, "learning_rate": 3.938185079391774e-06, "loss": 0.85492098, "num_input_tokens_seen": 18919310, "step": 890, "time_per_iteration": 2.6090900897979736 }, { "auxiliary_loss_clip": 0.01250107, "auxiliary_loss_mlp": 0.01035693, "balance_loss_clip": 1.06854534, "balance_loss_mlp": 1.02387333, "epoch": 0.10713641555943004, "flos": 19745625559680.0, "grad_norm": 2.4502619689593828, "language_loss": 1.05944252, "learning_rate": 3.937992762039157e-06, "loss": 1.08230042, "num_input_tokens_seen": 18932635, "step": 891, "time_per_iteration": 2.4560630321502686 }, { "auxiliary_loss_clip": 0.01232828, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.06847763, "balance_loss_mlp": 1.03724122, "epoch": 0.10725665845006914, "flos": 23952992302080.0, "grad_norm": 1.5648784372640385, "language_loss": 0.8026191, "learning_rate": 3.937800150694577e-06, "loss": 0.82543743, "num_input_tokens_seen": 18953810, "step": 892, "time_per_iteration": 2.628404378890991 }, { "auxiliary_loss_clip": 0.01188786, "auxiliary_loss_mlp": 0.01041942, "balance_loss_clip": 1.06502533, "balance_loss_mlp": 1.02937186, "epoch": 0.10737690134070824, "flos": 18551704371840.0, "grad_norm": 2.125893412374179, "language_loss": 0.75950187, "learning_rate": 3.937607245387255e-06, "loss": 0.78180915, "num_input_tokens_seen": 18973175, "step": 893, "time_per_iteration": 2.6575801372528076 }, { "auxiliary_loss_clip": 0.01226172, "auxiliary_loss_mlp": 0.01046969, "balance_loss_clip": 1.06583548, "balance_loss_mlp": 1.03556085, "epoch": 0.10749714423134732, "flos": 22707609903360.0, "grad_norm": 1.8954759059579405, "language_loss": 0.72281992, "learning_rate": 3.937414046146455e-06, "loss": 0.74555135, "num_input_tokens_seen": 18991130, "step": 894, "time_per_iteration": 2.5614848136901855 }, { "auxiliary_loss_clip": 0.01252668, "auxiliary_loss_mlp": 0.01051168, "balance_loss_clip": 1.07163644, "balance_loss_mlp": 1.03790641, "epoch": 0.10761738712198642, "flos": 21106066199040.0, "grad_norm": 2.4041544553142336, "language_loss": 0.75544989, "learning_rate": 3.9372205530014845e-06, "loss": 0.77848828, "num_input_tokens_seen": 19009610, "step": 895, "time_per_iteration": 2.501729965209961 }, { "auxiliary_loss_clip": 0.0124967, "auxiliary_loss_mlp": 0.01057419, "balance_loss_clip": 1.06933403, "balance_loss_mlp": 1.04577255, "epoch": 0.1077376300126255, "flos": 23766723348480.0, "grad_norm": 3.3340744936338242, "language_loss": 0.7168293, "learning_rate": 3.937026765981696e-06, "loss": 0.73990023, "num_input_tokens_seen": 19029680, "step": 896, "time_per_iteration": 2.5032219886779785 }, { "auxiliary_loss_clip": 0.01209457, "auxiliary_loss_mlp": 0.01048998, "balance_loss_clip": 1.06693029, "balance_loss_mlp": 1.03682637, "epoch": 0.1078578729032646, "flos": 20919581763840.0, "grad_norm": 1.944272017540537, "language_loss": 0.79521108, "learning_rate": 3.936832685116488e-06, "loss": 0.81779563, "num_input_tokens_seen": 19047775, "step": 897, "time_per_iteration": 2.5997555255889893 }, { "auxiliary_loss_clip": 0.01250602, "auxiliary_loss_mlp": 0.01050089, "balance_loss_clip": 1.0701344, "balance_loss_mlp": 1.03820968, "epoch": 0.10797811579390369, "flos": 14829886702080.0, "grad_norm": 2.2601872502838085, "language_loss": 0.90124583, "learning_rate": 3.936638310435301e-06, "loss": 0.92425281, "num_input_tokens_seen": 19065640, "step": 898, "time_per_iteration": 2.476712226867676 }, { "auxiliary_loss_clip": 0.01238517, "auxiliary_loss_mlp": 0.01041953, "balance_loss_clip": 1.06957984, "balance_loss_mlp": 1.02962112, "epoch": 0.10809835868454278, "flos": 19536985411200.0, "grad_norm": 1.8516766769738098, "language_loss": 0.81500447, "learning_rate": 3.936443641967623e-06, "loss": 0.83780915, "num_input_tokens_seen": 19084470, "step": 899, "time_per_iteration": 2.5221753120422363 }, { "auxiliary_loss_clip": 0.01220105, "auxiliary_loss_mlp": 0.01046999, "balance_loss_clip": 1.06801021, "balance_loss_mlp": 1.03463709, "epoch": 0.10821860157518187, "flos": 18442320480000.0, "grad_norm": 3.7749681671373114, "language_loss": 0.83078492, "learning_rate": 3.936248679742983e-06, "loss": 0.8534559, "num_input_tokens_seen": 19102965, "step": 900, "time_per_iteration": 3.4695215225219727 }, { "auxiliary_loss_clip": 0.01095446, "auxiliary_loss_mlp": 0.01046967, "balance_loss_clip": 1.03037715, "balance_loss_mlp": 1.0428896, "epoch": 0.10833884446582095, "flos": 49359468447360.0, "grad_norm": 1.059869429396583, "language_loss": 0.70163828, "learning_rate": 3.936053423790959e-06, "loss": 0.7230624, "num_input_tokens_seen": 19151285, "step": 901, "time_per_iteration": 2.9060444831848145 }, { "auxiliary_loss_clip": 0.01250976, "auxiliary_loss_mlp": 0.01053356, "balance_loss_clip": 1.07237124, "balance_loss_mlp": 1.04193604, "epoch": 0.10845908735646005, "flos": 20411912891520.0, "grad_norm": 1.9478997185895697, "language_loss": 0.77381372, "learning_rate": 3.935857874141168e-06, "loss": 0.79685712, "num_input_tokens_seen": 19170120, "step": 902, "time_per_iteration": 2.510826587677002 }, { "auxiliary_loss_clip": 0.01211105, "auxiliary_loss_mlp": 0.01037111, "balance_loss_clip": 1.06463456, "balance_loss_mlp": 1.02439713, "epoch": 0.10857933024709913, "flos": 14027750133120.0, "grad_norm": 2.0829838065274995, "language_loss": 0.83508182, "learning_rate": 3.935662030823279e-06, "loss": 0.85756397, "num_input_tokens_seen": 19186305, "step": 903, "time_per_iteration": 2.5632071495056152 }, { "auxiliary_loss_clip": 0.01232649, "auxiliary_loss_mlp": 0.01047161, "balance_loss_clip": 1.06551313, "balance_loss_mlp": 1.035604, "epoch": 0.10869957313773823, "flos": 13369004657280.0, "grad_norm": 2.153745957717676, "language_loss": 0.7304405, "learning_rate": 3.935465893866998e-06, "loss": 0.75323868, "num_input_tokens_seen": 19204530, "step": 904, "time_per_iteration": 2.532444953918457 }, { "auxiliary_loss_clip": 0.01221785, "auxiliary_loss_mlp": 0.01041092, "balance_loss_clip": 1.06830382, "balance_loss_mlp": 1.02881348, "epoch": 0.10881981602837733, "flos": 25807095509760.0, "grad_norm": 2.131759386209718, "language_loss": 0.80578297, "learning_rate": 3.935269463302079e-06, "loss": 0.8284117, "num_input_tokens_seen": 19222735, "step": 905, "time_per_iteration": 3.355724334716797 }, { "auxiliary_loss_clip": 0.0123828, "auxiliary_loss_mlp": 0.01044739, "balance_loss_clip": 1.06912005, "balance_loss_mlp": 1.03218603, "epoch": 0.10894005891901641, "flos": 20777555387520.0, "grad_norm": 1.9055713645301335, "language_loss": 0.76770175, "learning_rate": 3.935072739158322e-06, "loss": 0.79053187, "num_input_tokens_seen": 19242445, "step": 906, "time_per_iteration": 2.534024477005005 }, { "auxiliary_loss_clip": 0.01221895, "auxiliary_loss_mlp": 0.01046419, "balance_loss_clip": 1.0669359, "balance_loss_mlp": 1.03391957, "epoch": 0.10906030180965551, "flos": 26649883296000.0, "grad_norm": 1.666361891444486, "language_loss": 0.7970683, "learning_rate": 3.934875721465569e-06, "loss": 0.81975144, "num_input_tokens_seen": 19262865, "step": 907, "time_per_iteration": 3.5007591247558594 }, { "auxiliary_loss_clip": 0.0121226, "auxiliary_loss_mlp": 0.01039761, "balance_loss_clip": 1.06006896, "balance_loss_mlp": 1.0268625, "epoch": 0.10918054470029459, "flos": 36534402420480.0, "grad_norm": 2.2352780812368684, "language_loss": 0.72035009, "learning_rate": 3.9346784102537076e-06, "loss": 0.74287027, "num_input_tokens_seen": 19285000, "step": 908, "time_per_iteration": 3.4692440032958984 }, { "auxiliary_loss_clip": 0.01247544, "auxiliary_loss_mlp": 0.01036063, "balance_loss_clip": 1.06801498, "balance_loss_mlp": 1.02451789, "epoch": 0.10930078759093369, "flos": 21762549118080.0, "grad_norm": 2.152585757675597, "language_loss": 0.78707099, "learning_rate": 3.934480805552669e-06, "loss": 0.80990702, "num_input_tokens_seen": 19306010, "step": 909, "time_per_iteration": 2.4838972091674805 }, { "auxiliary_loss_clip": 0.01247914, "auxiliary_loss_mlp": 0.00766377, "balance_loss_clip": 1.06895864, "balance_loss_mlp": 1.00022078, "epoch": 0.10942103048157277, "flos": 22601781457920.0, "grad_norm": 2.239238319449174, "language_loss": 0.87944019, "learning_rate": 3.93428290739243e-06, "loss": 0.8995831, "num_input_tokens_seen": 19325380, "step": 910, "time_per_iteration": 2.55458927154541 }, { "auxiliary_loss_clip": 0.0121816, "auxiliary_loss_mlp": 0.01043011, "balance_loss_clip": 1.06523955, "balance_loss_mlp": 1.03058898, "epoch": 0.10954127337221187, "flos": 15045781397760.0, "grad_norm": 2.6231745476004176, "language_loss": 0.79772121, "learning_rate": 3.9340847158030125e-06, "loss": 0.82033288, "num_input_tokens_seen": 19338960, "step": 911, "time_per_iteration": 2.498889684677124 }, { "auxiliary_loss_clip": 0.01232069, "auxiliary_loss_mlp": 0.01049271, "balance_loss_clip": 1.06386662, "balance_loss_mlp": 1.03742158, "epoch": 0.10966151626285096, "flos": 21650974496640.0, "grad_norm": 1.792408265944816, "language_loss": 0.75346315, "learning_rate": 3.9338862308144814e-06, "loss": 0.77627653, "num_input_tokens_seen": 19357780, "step": 912, "time_per_iteration": 2.5177719593048096 }, { "auxiliary_loss_clip": 0.01246908, "auxiliary_loss_mlp": 0.01044201, "balance_loss_clip": 1.06724405, "balance_loss_mlp": 1.03180289, "epoch": 0.10978175915349005, "flos": 20121359777280.0, "grad_norm": 2.077854498788477, "language_loss": 0.84350652, "learning_rate": 3.933687452456946e-06, "loss": 0.86641765, "num_input_tokens_seen": 19377680, "step": 913, "time_per_iteration": 2.493380069732666 }, { "auxiliary_loss_clip": 0.01198801, "auxiliary_loss_mlp": 0.01042249, "balance_loss_clip": 1.05911553, "balance_loss_mlp": 1.02928543, "epoch": 0.10990200204412914, "flos": 20412667077120.0, "grad_norm": 5.516580276220256, "language_loss": 0.86608702, "learning_rate": 3.933488380760562e-06, "loss": 0.88849753, "num_input_tokens_seen": 19397040, "step": 914, "time_per_iteration": 2.588554859161377 }, { "auxiliary_loss_clip": 0.01246548, "auxiliary_loss_mlp": 0.00767264, "balance_loss_clip": 1.06695592, "balance_loss_mlp": 1.00021315, "epoch": 0.11002224493476823, "flos": 17530117660800.0, "grad_norm": 1.925260600445307, "language_loss": 0.87039983, "learning_rate": 3.9332890157555286e-06, "loss": 0.89053786, "num_input_tokens_seen": 19413975, "step": 915, "time_per_iteration": 2.531214952468872 }, { "auxiliary_loss_clip": 0.01222129, "auxiliary_loss_mlp": 0.0104416, "balance_loss_clip": 1.06563783, "balance_loss_mlp": 1.03213167, "epoch": 0.11014248782540732, "flos": 12203093099520.0, "grad_norm": 1.7937093139015061, "language_loss": 0.76440042, "learning_rate": 3.933089357472088e-06, "loss": 0.78706336, "num_input_tokens_seen": 19432005, "step": 916, "time_per_iteration": 2.5258312225341797 }, { "auxiliary_loss_clip": 0.01244837, "auxiliary_loss_mlp": 0.01040597, "balance_loss_clip": 1.06782269, "balance_loss_mlp": 1.02932549, "epoch": 0.11026273071604642, "flos": 22382977760640.0, "grad_norm": 2.740617401560644, "language_loss": 0.85856855, "learning_rate": 3.932889405940529e-06, "loss": 0.88142288, "num_input_tokens_seen": 19450100, "step": 917, "time_per_iteration": 2.4750916957855225 }, { "auxiliary_loss_clip": 0.01219534, "auxiliary_loss_mlp": 0.01045129, "balance_loss_clip": 1.0698576, "balance_loss_mlp": 1.03369641, "epoch": 0.1103829736066855, "flos": 19829046896640.0, "grad_norm": 2.430533758463773, "language_loss": 0.79831803, "learning_rate": 3.932689161191184e-06, "loss": 0.82096469, "num_input_tokens_seen": 19467805, "step": 918, "time_per_iteration": 2.524777889251709 }, { "auxiliary_loss_clip": 0.01229415, "auxiliary_loss_mlp": 0.01046226, "balance_loss_clip": 1.06445098, "balance_loss_mlp": 1.03371453, "epoch": 0.1105032164973246, "flos": 22669616292480.0, "grad_norm": 2.0472912761321327, "language_loss": 0.88172996, "learning_rate": 3.93248862325443e-06, "loss": 0.90448642, "num_input_tokens_seen": 19486710, "step": 919, "time_per_iteration": 2.501171112060547 }, { "auxiliary_loss_clip": 0.01133369, "auxiliary_loss_mlp": 0.01026136, "balance_loss_clip": 1.03769565, "balance_loss_mlp": 1.02213085, "epoch": 0.11062345938796368, "flos": 66483507876480.0, "grad_norm": 0.9466168740475717, "language_loss": 0.64495075, "learning_rate": 3.932287792160688e-06, "loss": 0.66654587, "num_input_tokens_seen": 19545170, "step": 920, "time_per_iteration": 2.9975528717041016 }, { "auxiliary_loss_clip": 0.01234142, "auxiliary_loss_mlp": 0.01043238, "balance_loss_clip": 1.06399441, "balance_loss_mlp": 1.02985084, "epoch": 0.11074370227860278, "flos": 21907771804800.0, "grad_norm": 2.3951888163336976, "language_loss": 0.80806577, "learning_rate": 3.932086667940424e-06, "loss": 0.83083957, "num_input_tokens_seen": 19561875, "step": 921, "time_per_iteration": 2.5301268100738525 }, { "auxiliary_loss_clip": 0.01230691, "auxiliary_loss_mlp": 0.00766549, "balance_loss_clip": 1.06736255, "balance_loss_mlp": 1.0002768, "epoch": 0.11086394516924186, "flos": 28658115763200.0, "grad_norm": 1.9402681870680323, "language_loss": 0.81970757, "learning_rate": 3.93188525062415e-06, "loss": 0.83967996, "num_input_tokens_seen": 19582340, "step": 922, "time_per_iteration": 2.5770580768585205 }, { "auxiliary_loss_clip": 0.01231217, "auxiliary_loss_mlp": 0.01052854, "balance_loss_clip": 1.06536674, "balance_loss_mlp": 1.04053402, "epoch": 0.11098418805988096, "flos": 24535247765760.0, "grad_norm": 1.997487286234216, "language_loss": 0.86130506, "learning_rate": 3.931683540242418e-06, "loss": 0.8841458, "num_input_tokens_seen": 19603405, "step": 923, "time_per_iteration": 2.62070369720459 }, { "auxiliary_loss_clip": 0.01225107, "auxiliary_loss_mlp": 0.01042776, "balance_loss_clip": 1.06392956, "balance_loss_mlp": 1.03000283, "epoch": 0.11110443095052006, "flos": 22960384888320.0, "grad_norm": 2.956341397776146, "language_loss": 0.9099561, "learning_rate": 3.9314815368258295e-06, "loss": 0.93263489, "num_input_tokens_seen": 19619885, "step": 924, "time_per_iteration": 2.5184521675109863 }, { "auxiliary_loss_clip": 0.01235169, "auxiliary_loss_mlp": 0.01036184, "balance_loss_clip": 1.07062948, "balance_loss_mlp": 1.02430487, "epoch": 0.11122467384115914, "flos": 18950025265920.0, "grad_norm": 1.7315162262529846, "language_loss": 0.78997731, "learning_rate": 3.9312792404050275e-06, "loss": 0.81269085, "num_input_tokens_seen": 19637940, "step": 925, "time_per_iteration": 2.5577445030212402 }, { "auxiliary_loss_clip": 0.01245269, "auxiliary_loss_mlp": 0.01042558, "balance_loss_clip": 1.0685674, "balance_loss_mlp": 1.03186452, "epoch": 0.11134491673179824, "flos": 25082957324160.0, "grad_norm": 2.9284512006801258, "language_loss": 0.77137518, "learning_rate": 3.9310766510107e-06, "loss": 0.79425347, "num_input_tokens_seen": 19657115, "step": 926, "time_per_iteration": 2.555898427963257 }, { "auxiliary_loss_clip": 0.01199394, "auxiliary_loss_mlp": 0.01046524, "balance_loss_clip": 1.05817652, "balance_loss_mlp": 1.03373289, "epoch": 0.11146515962243732, "flos": 24499121662080.0, "grad_norm": 1.7924464628446843, "language_loss": 0.92370033, "learning_rate": 3.9308737686735806e-06, "loss": 0.94615948, "num_input_tokens_seen": 19677075, "step": 927, "time_per_iteration": 2.6491332054138184 }, { "auxiliary_loss_clip": 0.0124858, "auxiliary_loss_mlp": 0.01048801, "balance_loss_clip": 1.0684545, "balance_loss_mlp": 1.03753579, "epoch": 0.11158540251307641, "flos": 22343763087360.0, "grad_norm": 2.0353876311859302, "language_loss": 0.82979387, "learning_rate": 3.9306705934244455e-06, "loss": 0.85276765, "num_input_tokens_seen": 19697155, "step": 928, "time_per_iteration": 3.3209948539733887 }, { "auxiliary_loss_clip": 0.01205064, "auxiliary_loss_mlp": 0.01036856, "balance_loss_clip": 1.06205869, "balance_loss_mlp": 1.02527523, "epoch": 0.11170564540371551, "flos": 19902304684800.0, "grad_norm": 1.7112590411728725, "language_loss": 0.88181931, "learning_rate": 3.930467125294116e-06, "loss": 0.90423858, "num_input_tokens_seen": 19716705, "step": 929, "time_per_iteration": 2.533008337020874 }, { "auxiliary_loss_clip": 0.01068866, "auxiliary_loss_mlp": 0.01006282, "balance_loss_clip": 1.0246582, "balance_loss_mlp": 1.00311112, "epoch": 0.1118258882943546, "flos": 64586239499520.0, "grad_norm": 0.9239433802578965, "language_loss": 0.6047616, "learning_rate": 3.930263364313458e-06, "loss": 0.62551308, "num_input_tokens_seen": 19767275, "step": 930, "time_per_iteration": 3.112867832183838 }, { "auxiliary_loss_clip": 0.0119898, "auxiliary_loss_mlp": 0.01052218, "balance_loss_clip": 1.06153238, "balance_loss_mlp": 1.03958821, "epoch": 0.11194613118499369, "flos": 17201965985280.0, "grad_norm": 2.0271495757930964, "language_loss": 0.82767642, "learning_rate": 3.930059310513384e-06, "loss": 0.85018837, "num_input_tokens_seen": 19786315, "step": 931, "time_per_iteration": 3.5619053840637207 }, { "auxiliary_loss_clip": 0.01183773, "auxiliary_loss_mlp": 0.00766056, "balance_loss_clip": 1.05905557, "balance_loss_mlp": 1.0001719, "epoch": 0.11206637407563277, "flos": 31863465728640.0, "grad_norm": 1.8536881824331894, "language_loss": 0.839719, "learning_rate": 3.929854963924846e-06, "loss": 0.85921729, "num_input_tokens_seen": 19806580, "step": 932, "time_per_iteration": 2.6600515842437744 }, { "auxiliary_loss_clip": 0.01201518, "auxiliary_loss_mlp": 0.01037229, "balance_loss_clip": 1.06075644, "balance_loss_mlp": 1.02571964, "epoch": 0.11218661696627187, "flos": 21945621761280.0, "grad_norm": 1.9242009350584048, "language_loss": 0.77293712, "learning_rate": 3.929650324578845e-06, "loss": 0.79532456, "num_input_tokens_seen": 19826045, "step": 933, "time_per_iteration": 2.5921401977539062 }, { "auxiliary_loss_clip": 0.01220124, "auxiliary_loss_mlp": 0.01043782, "balance_loss_clip": 1.06360102, "balance_loss_mlp": 1.03066325, "epoch": 0.11230685985691095, "flos": 25878198481920.0, "grad_norm": 2.682825909780638, "language_loss": 0.81794131, "learning_rate": 3.929445392506423e-06, "loss": 0.84058028, "num_input_tokens_seen": 19843985, "step": 934, "time_per_iteration": 3.467923641204834 }, { "auxiliary_loss_clip": 0.01231093, "auxiliary_loss_mlp": 0.01046508, "balance_loss_clip": 1.06905484, "balance_loss_mlp": 1.03536761, "epoch": 0.11242710274755005, "flos": 22231506107520.0, "grad_norm": 1.9855036795577263, "language_loss": 0.7621538, "learning_rate": 3.92924016773867e-06, "loss": 0.78492975, "num_input_tokens_seen": 19860480, "step": 935, "time_per_iteration": 3.287496328353882 }, { "auxiliary_loss_clip": 0.01216609, "auxiliary_loss_mlp": 0.00765709, "balance_loss_clip": 1.06162333, "balance_loss_mlp": 1.00018072, "epoch": 0.11254734563818915, "flos": 17712184723200.0, "grad_norm": 2.498503811494862, "language_loss": 0.73961407, "learning_rate": 3.9290346503067175e-06, "loss": 0.75943726, "num_input_tokens_seen": 19877145, "step": 936, "time_per_iteration": 2.5347349643707275 }, { "auxiliary_loss_clip": 0.0123198, "auxiliary_loss_mlp": 0.0104414, "balance_loss_clip": 1.0633018, "balance_loss_mlp": 1.03251135, "epoch": 0.11266758852882823, "flos": 54930397334400.0, "grad_norm": 1.879736759513515, "language_loss": 0.78577709, "learning_rate": 3.9288288402417415e-06, "loss": 0.80853832, "num_input_tokens_seen": 19903405, "step": 937, "time_per_iteration": 2.8518640995025635 }, { "auxiliary_loss_clip": 0.01235733, "auxiliary_loss_mlp": 0.01041264, "balance_loss_clip": 1.06848824, "balance_loss_mlp": 1.02865195, "epoch": 0.11278783141946733, "flos": 18878132194560.0, "grad_norm": 2.4776593630824952, "language_loss": 0.7051841, "learning_rate": 3.928622737574964e-06, "loss": 0.72795415, "num_input_tokens_seen": 19918740, "step": 938, "time_per_iteration": 2.540126085281372 }, { "auxiliary_loss_clip": 0.01213743, "auxiliary_loss_mlp": 0.01045296, "balance_loss_clip": 1.06124425, "balance_loss_mlp": 1.03332162, "epoch": 0.11290807431010641, "flos": 26469252777600.0, "grad_norm": 2.185617239783044, "language_loss": 0.90913594, "learning_rate": 3.928416342337652e-06, "loss": 0.93172634, "num_input_tokens_seen": 19938475, "step": 939, "time_per_iteration": 2.5731821060180664 }, { "auxiliary_loss_clip": 0.01217515, "auxiliary_loss_mlp": 0.01042492, "balance_loss_clip": 1.06504345, "balance_loss_mlp": 1.03062451, "epoch": 0.1130283172007455, "flos": 22710590732160.0, "grad_norm": 1.709423476168821, "language_loss": 0.82656705, "learning_rate": 3.928209654561113e-06, "loss": 0.84916717, "num_input_tokens_seen": 19959310, "step": 940, "time_per_iteration": 2.55037522315979 }, { "auxiliary_loss_clip": 0.01208729, "auxiliary_loss_mlp": 0.01042085, "balance_loss_clip": 1.06423962, "balance_loss_mlp": 1.0308553, "epoch": 0.1131485600913846, "flos": 23219911630080.0, "grad_norm": 2.2333480538657104, "language_loss": 0.81722045, "learning_rate": 3.928002674276703e-06, "loss": 0.83972859, "num_input_tokens_seen": 19978700, "step": 941, "time_per_iteration": 2.5602898597717285 }, { "auxiliary_loss_clip": 0.01165758, "auxiliary_loss_mlp": 0.01038979, "balance_loss_clip": 1.0541929, "balance_loss_mlp": 1.0266645, "epoch": 0.11326880298202369, "flos": 14064271286400.0, "grad_norm": 2.308935222181706, "language_loss": 0.75160742, "learning_rate": 3.92779540151582e-06, "loss": 0.77365482, "num_input_tokens_seen": 19995785, "step": 942, "time_per_iteration": 2.5651979446411133 }, { "auxiliary_loss_clip": 0.01213113, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.06258774, "balance_loss_mlp": 1.02242732, "epoch": 0.11338904587266278, "flos": 16325386479360.0, "grad_norm": 3.5205517173683547, "language_loss": 0.8551293, "learning_rate": 3.927587836309907e-06, "loss": 0.87759399, "num_input_tokens_seen": 20013615, "step": 943, "time_per_iteration": 2.5361077785491943 }, { "auxiliary_loss_clip": 0.01210224, "auxiliary_loss_mlp": 0.0104377, "balance_loss_clip": 1.06141591, "balance_loss_mlp": 1.03237987, "epoch": 0.11350928876330187, "flos": 24426258923520.0, "grad_norm": 2.2304199453775584, "language_loss": 0.78512174, "learning_rate": 3.927379978690452e-06, "loss": 0.80766165, "num_input_tokens_seen": 20032880, "step": 944, "time_per_iteration": 2.5932183265686035 }, { "auxiliary_loss_clip": 0.01185526, "auxiliary_loss_mlp": 0.01046516, "balance_loss_clip": 1.05306458, "balance_loss_mlp": 1.03479791, "epoch": 0.11362953165394096, "flos": 24497074586880.0, "grad_norm": 4.6681776614956965, "language_loss": 0.87398177, "learning_rate": 3.927171828688987e-06, "loss": 0.89630222, "num_input_tokens_seen": 20052405, "step": 945, "time_per_iteration": 2.6163206100463867 }, { "auxiliary_loss_clip": 0.01249499, "auxiliary_loss_mlp": 0.01037155, "balance_loss_clip": 1.07157063, "balance_loss_mlp": 1.02580023, "epoch": 0.11374977454458005, "flos": 24060831909120.0, "grad_norm": 4.964377312919228, "language_loss": 0.82049829, "learning_rate": 3.926963386337088e-06, "loss": 0.84336483, "num_input_tokens_seen": 20070635, "step": 946, "time_per_iteration": 2.511021614074707 }, { "auxiliary_loss_clip": 0.01251038, "auxiliary_loss_mlp": 0.01040158, "balance_loss_clip": 1.06869376, "balance_loss_mlp": 1.026896, "epoch": 0.11387001743521914, "flos": 39457638967680.0, "grad_norm": 2.477697303273926, "language_loss": 0.70211494, "learning_rate": 3.926754651666375e-06, "loss": 0.72502685, "num_input_tokens_seen": 20091195, "step": 947, "time_per_iteration": 2.6101086139678955 }, { "auxiliary_loss_clip": 0.01201567, "auxiliary_loss_mlp": 0.01045759, "balance_loss_clip": 1.06499004, "balance_loss_mlp": 1.03382611, "epoch": 0.11399026032585824, "flos": 25082454533760.0, "grad_norm": 2.5759321953506773, "language_loss": 0.78511375, "learning_rate": 3.926545624708513e-06, "loss": 0.80758703, "num_input_tokens_seen": 20110435, "step": 948, "time_per_iteration": 2.662794351577759 }, { "auxiliary_loss_clip": 0.01196644, "auxiliary_loss_mlp": 0.01048158, "balance_loss_clip": 1.06100798, "balance_loss_mlp": 1.0364933, "epoch": 0.11411050321649732, "flos": 17961835224960.0, "grad_norm": 2.2317643758693615, "language_loss": 0.8537426, "learning_rate": 3.926336305495213e-06, "loss": 0.87619054, "num_input_tokens_seen": 20128995, "step": 949, "time_per_iteration": 2.5853984355926514 }, { "auxiliary_loss_clip": 0.01187161, "auxiliary_loss_mlp": 0.01044806, "balance_loss_clip": 1.06098485, "balance_loss_mlp": 1.03126359, "epoch": 0.11423074610713642, "flos": 22455409536000.0, "grad_norm": 2.057630330366888, "language_loss": 0.88978767, "learning_rate": 3.926126694058226e-06, "loss": 0.91210735, "num_input_tokens_seen": 20148145, "step": 950, "time_per_iteration": 2.6092495918273926 }, { "auxiliary_loss_clip": 0.01183678, "auxiliary_loss_mlp": 0.01044353, "balance_loss_clip": 1.06493485, "balance_loss_mlp": 1.03386319, "epoch": 0.1143509889977755, "flos": 19717687756800.0, "grad_norm": 1.4908712306820662, "language_loss": 0.82143223, "learning_rate": 3.92591679042935e-06, "loss": 0.84371257, "num_input_tokens_seen": 20168035, "step": 951, "time_per_iteration": 2.6263771057128906 }, { "auxiliary_loss_clip": 0.012316, "auxiliary_loss_mlp": 0.01043318, "balance_loss_clip": 1.06818664, "balance_loss_mlp": 1.03068161, "epoch": 0.1144712318884146, "flos": 19822869757440.0, "grad_norm": 1.9576754472941424, "language_loss": 0.82164466, "learning_rate": 3.92570659464043e-06, "loss": 0.84439385, "num_input_tokens_seen": 20186095, "step": 952, "time_per_iteration": 2.5014209747314453 }, { "auxiliary_loss_clip": 0.01228435, "auxiliary_loss_mlp": 0.00766193, "balance_loss_clip": 1.06834412, "balance_loss_mlp": 1.00021887, "epoch": 0.1145914747790537, "flos": 14939198766720.0, "grad_norm": 2.068305216321503, "language_loss": 0.79911947, "learning_rate": 3.925496106723349e-06, "loss": 0.81906575, "num_input_tokens_seen": 20203535, "step": 953, "time_per_iteration": 2.4958319664001465 }, { "auxiliary_loss_clip": 0.01234318, "auxiliary_loss_mlp": 0.01040557, "balance_loss_clip": 1.06794047, "balance_loss_mlp": 1.02936339, "epoch": 0.11471171766969278, "flos": 19865029345920.0, "grad_norm": 2.1686564435671607, "language_loss": 0.83851522, "learning_rate": 3.9252853267100405e-06, "loss": 0.86126399, "num_input_tokens_seen": 20222780, "step": 954, "time_per_iteration": 2.5247936248779297 }, { "auxiliary_loss_clip": 0.01189794, "auxiliary_loss_mlp": 0.01044997, "balance_loss_clip": 1.06037807, "balance_loss_mlp": 1.03305256, "epoch": 0.11483196056033187, "flos": 22526476594560.0, "grad_norm": 4.389400191734664, "language_loss": 0.83695138, "learning_rate": 3.9250742546324786e-06, "loss": 0.8592993, "num_input_tokens_seen": 20243015, "step": 955, "time_per_iteration": 3.384770631790161 }, { "auxiliary_loss_clip": 0.0121383, "auxiliary_loss_mlp": 0.01043415, "balance_loss_clip": 1.0634445, "balance_loss_mlp": 1.03280544, "epoch": 0.11495220345097096, "flos": 28220292887040.0, "grad_norm": 5.98804414515056, "language_loss": 0.87021178, "learning_rate": 3.924862890522683e-06, "loss": 0.89278424, "num_input_tokens_seen": 20263025, "step": 956, "time_per_iteration": 2.6026947498321533 }, { "auxiliary_loss_clip": 0.0123029, "auxiliary_loss_mlp": 0.01041209, "balance_loss_clip": 1.06451845, "balance_loss_mlp": 1.02943146, "epoch": 0.11507244634161005, "flos": 17492267704320.0, "grad_norm": 2.104482533949446, "language_loss": 0.85904467, "learning_rate": 3.9246512344127174e-06, "loss": 0.88175964, "num_input_tokens_seen": 20280685, "step": 957, "time_per_iteration": 2.5276901721954346 }, { "auxiliary_loss_clip": 0.01149633, "auxiliary_loss_mlp": 0.01039141, "balance_loss_clip": 1.05571187, "balance_loss_mlp": 1.02764297, "epoch": 0.11519268923224914, "flos": 22564937082240.0, "grad_norm": 3.0797442543689293, "language_loss": 0.81831956, "learning_rate": 3.9244392863346895e-06, "loss": 0.84020722, "num_input_tokens_seen": 20300090, "step": 958, "time_per_iteration": 3.444572925567627 }, { "auxiliary_loss_clip": 0.01220195, "auxiliary_loss_mlp": 0.01046949, "balance_loss_clip": 1.06974697, "balance_loss_mlp": 1.03428292, "epoch": 0.11531293212288823, "flos": 16982839065600.0, "grad_norm": 1.9642476485491807, "language_loss": 0.92335945, "learning_rate": 3.9242270463207524e-06, "loss": 0.94603091, "num_input_tokens_seen": 20318480, "step": 959, "time_per_iteration": 2.530734062194824 }, { "auxiliary_loss_clip": 0.01170837, "auxiliary_loss_mlp": 0.01041607, "balance_loss_clip": 1.05797625, "balance_loss_mlp": 1.0292151, "epoch": 0.11543317501352733, "flos": 12422004537600.0, "grad_norm": 5.7963241261524105, "language_loss": 0.85763395, "learning_rate": 3.924014514403102e-06, "loss": 0.87975836, "num_input_tokens_seen": 20334635, "step": 960, "time_per_iteration": 2.582185745239258 }, { "auxiliary_loss_clip": 0.01173004, "auxiliary_loss_mlp": 0.01047938, "balance_loss_clip": 1.05811596, "balance_loss_mlp": 1.03498578, "epoch": 0.11555341790416641, "flos": 19821648695040.0, "grad_norm": 2.0216681726805295, "language_loss": 0.91281712, "learning_rate": 3.92380169061398e-06, "loss": 0.93502653, "num_input_tokens_seen": 20352415, "step": 961, "time_per_iteration": 3.510355234146118 }, { "auxiliary_loss_clip": 0.01190988, "auxiliary_loss_mlp": 0.00766267, "balance_loss_clip": 1.05787051, "balance_loss_mlp": 1.00016916, "epoch": 0.11567366079480551, "flos": 25738865625600.0, "grad_norm": 4.168283357649946, "language_loss": 0.83740705, "learning_rate": 3.9235885749856705e-06, "loss": 0.85697961, "num_input_tokens_seen": 20371095, "step": 962, "time_per_iteration": 3.366799831390381 }, { "auxiliary_loss_clip": 0.01221592, "auxiliary_loss_mlp": 0.0104391, "balance_loss_clip": 1.07287169, "balance_loss_mlp": 1.03172088, "epoch": 0.1157939036854446, "flos": 18223301301120.0, "grad_norm": 2.3843259083233894, "language_loss": 0.82769305, "learning_rate": 3.9233751675505035e-06, "loss": 0.85034811, "num_input_tokens_seen": 20389805, "step": 963, "time_per_iteration": 2.538644790649414 }, { "auxiliary_loss_clip": 0.01211749, "auxiliary_loss_mlp": 0.01041751, "balance_loss_clip": 1.0667789, "balance_loss_mlp": 1.02849495, "epoch": 0.11591414657608369, "flos": 23073755189760.0, "grad_norm": 1.8655173095163404, "language_loss": 0.84628808, "learning_rate": 3.923161468340853e-06, "loss": 0.86882311, "num_input_tokens_seen": 20409640, "step": 964, "time_per_iteration": 2.5701324939727783 }, { "auxiliary_loss_clip": 0.01170812, "auxiliary_loss_mlp": 0.01041327, "balance_loss_clip": 1.05627096, "balance_loss_mlp": 1.02952552, "epoch": 0.11603438946672277, "flos": 19461716461440.0, "grad_norm": 1.6981682620451493, "language_loss": 0.81574696, "learning_rate": 3.9229474773891374e-06, "loss": 0.83786833, "num_input_tokens_seen": 20428180, "step": 965, "time_per_iteration": 2.60032320022583 }, { "auxiliary_loss_clip": 0.01205725, "auxiliary_loss_mlp": 0.01050564, "balance_loss_clip": 1.05884755, "balance_loss_mlp": 1.03767192, "epoch": 0.11615463235736187, "flos": 26831986272000.0, "grad_norm": 3.0569375799848006, "language_loss": 0.83467007, "learning_rate": 3.922733194727818e-06, "loss": 0.85723293, "num_input_tokens_seen": 20447975, "step": 966, "time_per_iteration": 2.639197826385498 }, { "auxiliary_loss_clip": 0.01236906, "auxiliary_loss_mlp": 0.01042939, "balance_loss_clip": 1.06900549, "balance_loss_mlp": 1.03072011, "epoch": 0.11627487524800097, "flos": 18580324533120.0, "grad_norm": 2.3384779580136685, "language_loss": 0.8749249, "learning_rate": 3.922518620389402e-06, "loss": 0.89772332, "num_input_tokens_seen": 20464840, "step": 967, "time_per_iteration": 2.4678986072540283 }, { "auxiliary_loss_clip": 0.01123668, "auxiliary_loss_mlp": 0.01039579, "balance_loss_clip": 1.05272841, "balance_loss_mlp": 1.02700853, "epoch": 0.11639511813864005, "flos": 18150474476160.0, "grad_norm": 3.5046784383285283, "language_loss": 0.89498007, "learning_rate": 3.922303754406439e-06, "loss": 0.91661251, "num_input_tokens_seen": 20482680, "step": 968, "time_per_iteration": 2.648559331893921 }, { "auxiliary_loss_clip": 0.01181333, "auxiliary_loss_mlp": 0.01049982, "balance_loss_clip": 1.05794382, "balance_loss_mlp": 1.03638005, "epoch": 0.11651536102927915, "flos": 20922023888640.0, "grad_norm": 2.4029588315319392, "language_loss": 0.79096127, "learning_rate": 3.922088596811526e-06, "loss": 0.81327444, "num_input_tokens_seen": 20501810, "step": 969, "time_per_iteration": 2.5997016429901123 }, { "auxiliary_loss_clip": 0.01216686, "auxiliary_loss_mlp": 0.01041144, "balance_loss_clip": 1.06253982, "balance_loss_mlp": 1.02989638, "epoch": 0.11663560391991823, "flos": 16508602776960.0, "grad_norm": 2.2351264901267016, "language_loss": 0.86859179, "learning_rate": 3.9218731476373e-06, "loss": 0.89117014, "num_input_tokens_seen": 20517995, "step": 970, "time_per_iteration": 2.4892120361328125 }, { "auxiliary_loss_clip": 0.01239136, "auxiliary_loss_mlp": 0.01048607, "balance_loss_clip": 1.07091939, "balance_loss_mlp": 1.03542817, "epoch": 0.11675584681055733, "flos": 19865029345920.0, "grad_norm": 2.091045713161438, "language_loss": 0.84533858, "learning_rate": 3.9216574069164455e-06, "loss": 0.86821598, "num_input_tokens_seen": 20536970, "step": 971, "time_per_iteration": 2.4939916133880615 }, { "auxiliary_loss_clip": 0.01242973, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.06727505, "balance_loss_mlp": 1.03064299, "epoch": 0.11687608970119642, "flos": 21944364785280.0, "grad_norm": 1.5287384535308486, "language_loss": 0.79943645, "learning_rate": 3.921441374681691e-06, "loss": 0.82228267, "num_input_tokens_seen": 20557030, "step": 972, "time_per_iteration": 2.519932508468628 }, { "auxiliary_loss_clip": 0.0121033, "auxiliary_loss_mlp": 0.01038171, "balance_loss_clip": 1.06396246, "balance_loss_mlp": 1.0262444, "epoch": 0.1169963325918355, "flos": 24061155131520.0, "grad_norm": 2.1838902758807968, "language_loss": 0.65210819, "learning_rate": 3.921225050965808e-06, "loss": 0.67459321, "num_input_tokens_seen": 20576915, "step": 973, "time_per_iteration": 2.5642313957214355 }, { "auxiliary_loss_clip": 0.01196285, "auxiliary_loss_mlp": 0.01040274, "balance_loss_clip": 1.06056392, "balance_loss_mlp": 1.02817452, "epoch": 0.1171165754824746, "flos": 23368151059200.0, "grad_norm": 2.168247555973414, "language_loss": 0.74746329, "learning_rate": 3.921008435801612e-06, "loss": 0.76982886, "num_input_tokens_seen": 20596000, "step": 974, "time_per_iteration": 2.5717360973358154 }, { "auxiliary_loss_clip": 0.01217838, "auxiliary_loss_mlp": 0.01040911, "balance_loss_clip": 1.06437182, "balance_loss_mlp": 1.02840626, "epoch": 0.11723681837311369, "flos": 18552243075840.0, "grad_norm": 3.907495793045152, "language_loss": 0.75140554, "learning_rate": 3.920791529221963e-06, "loss": 0.77399302, "num_input_tokens_seen": 20614675, "step": 975, "time_per_iteration": 2.504629135131836 }, { "auxiliary_loss_clip": 0.01218052, "auxiliary_loss_mlp": 0.00766203, "balance_loss_clip": 1.06507802, "balance_loss_mlp": 1.00019956, "epoch": 0.11735706126375278, "flos": 23550541344000.0, "grad_norm": 2.22358341578932, "language_loss": 0.7644068, "learning_rate": 3.920574331259768e-06, "loss": 0.78424937, "num_input_tokens_seen": 20635875, "step": 976, "time_per_iteration": 2.600533962249756 }, { "auxiliary_loss_clip": 0.01203751, "auxiliary_loss_mlp": 0.01039368, "balance_loss_clip": 1.0616771, "balance_loss_mlp": 1.02849603, "epoch": 0.11747730415439187, "flos": 22381541216640.0, "grad_norm": 2.478980233029111, "language_loss": 0.79192472, "learning_rate": 3.9203568419479716e-06, "loss": 0.81435591, "num_input_tokens_seen": 20656430, "step": 977, "time_per_iteration": 2.5490169525146484 }, { "auxiliary_loss_clip": 0.01213339, "auxiliary_loss_mlp": 0.01031249, "balance_loss_clip": 1.06437027, "balance_loss_mlp": 1.02024031, "epoch": 0.11759754704503096, "flos": 22200731130240.0, "grad_norm": 1.95727999590609, "language_loss": 0.75444448, "learning_rate": 3.92013906131957e-06, "loss": 0.77689034, "num_input_tokens_seen": 20675360, "step": 978, "time_per_iteration": 2.5403544902801514 }, { "auxiliary_loss_clip": 0.01197648, "auxiliary_loss_mlp": 0.01051672, "balance_loss_clip": 1.06457043, "balance_loss_mlp": 1.04108584, "epoch": 0.11771778993567006, "flos": 22309755886080.0, "grad_norm": 1.6120511554181212, "language_loss": 0.8221814, "learning_rate": 3.9199209894076e-06, "loss": 0.84467459, "num_input_tokens_seen": 20695675, "step": 979, "time_per_iteration": 2.5954396724700928 }, { "auxiliary_loss_clip": 0.01246823, "auxiliary_loss_mlp": 0.01036814, "balance_loss_clip": 1.06715775, "balance_loss_mlp": 1.02373064, "epoch": 0.11783803282630914, "flos": 21288169175040.0, "grad_norm": 1.808185574917874, "language_loss": 0.89461297, "learning_rate": 3.919702626245142e-06, "loss": 0.91744941, "num_input_tokens_seen": 20715330, "step": 980, "time_per_iteration": 2.491637706756592 }, { "auxiliary_loss_clip": 0.01199504, "auxiliary_loss_mlp": 0.0103896, "balance_loss_clip": 1.06062829, "balance_loss_mlp": 1.02705121, "epoch": 0.11795827571694824, "flos": 25371535190400.0, "grad_norm": 2.1781573558250193, "language_loss": 0.6653322, "learning_rate": 3.919483971865322e-06, "loss": 0.68771684, "num_input_tokens_seen": 20735325, "step": 981, "time_per_iteration": 3.409590482711792 }, { "auxiliary_loss_clip": 0.01212901, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.06766129, "balance_loss_mlp": 1.02573252, "epoch": 0.11807851860758732, "flos": 23622218933760.0, "grad_norm": 2.1999078750208447, "language_loss": 0.88020611, "learning_rate": 3.91926502630131e-06, "loss": 0.9027018, "num_input_tokens_seen": 20755940, "step": 982, "time_per_iteration": 2.5646488666534424 }, { "auxiliary_loss_clip": 0.01234963, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.07178938, "balance_loss_mlp": 1.0300653, "epoch": 0.11819876149822642, "flos": 24972496024320.0, "grad_norm": 1.8677443841249768, "language_loss": 0.72238576, "learning_rate": 3.91904578958632e-06, "loss": 0.74514914, "num_input_tokens_seen": 20775355, "step": 983, "time_per_iteration": 2.537376642227173 }, { "auxiliary_loss_clip": 0.01248471, "auxiliary_loss_mlp": 0.01043772, "balance_loss_clip": 1.06952524, "balance_loss_mlp": 1.03206599, "epoch": 0.11831900438886551, "flos": 23003226835200.0, "grad_norm": 4.291269702588566, "language_loss": 0.83957946, "learning_rate": 3.918826261753608e-06, "loss": 0.86250186, "num_input_tokens_seen": 20794935, "step": 984, "time_per_iteration": 2.5083096027374268 }, { "auxiliary_loss_clip": 0.01212761, "auxiliary_loss_mlp": 0.01033618, "balance_loss_clip": 1.06468964, "balance_loss_mlp": 1.0236696, "epoch": 0.1184392472795046, "flos": 27965147604480.0, "grad_norm": 2.5894371408203622, "language_loss": 0.70987403, "learning_rate": 3.918606442836478e-06, "loss": 0.73233777, "num_input_tokens_seen": 20817155, "step": 985, "time_per_iteration": 3.4155235290527344 }, { "auxiliary_loss_clip": 0.01228825, "auxiliary_loss_mlp": 0.01037365, "balance_loss_clip": 1.07058084, "balance_loss_mlp": 1.02657628, "epoch": 0.1185594901701437, "flos": 19898497843200.0, "grad_norm": 1.841555434564848, "language_loss": 0.77150846, "learning_rate": 3.918386332868277e-06, "loss": 0.79417026, "num_input_tokens_seen": 20835125, "step": 986, "time_per_iteration": 2.5039713382720947 }, { "auxiliary_loss_clip": 0.01218375, "auxiliary_loss_mlp": 0.01045865, "balance_loss_clip": 1.06409431, "balance_loss_mlp": 1.03445721, "epoch": 0.11867973306078278, "flos": 18912354877440.0, "grad_norm": 1.845581159073504, "language_loss": 0.94383878, "learning_rate": 3.918165931882394e-06, "loss": 0.96648121, "num_input_tokens_seen": 20853525, "step": 987, "time_per_iteration": 2.5112380981445312 }, { "auxiliary_loss_clip": 0.0115319, "auxiliary_loss_mlp": 0.01038497, "balance_loss_clip": 1.05210078, "balance_loss_mlp": 1.02598, "epoch": 0.11879997595142187, "flos": 16982803152000.0, "grad_norm": 3.3857860573184153, "language_loss": 0.75145078, "learning_rate": 3.917945239912264e-06, "loss": 0.7733677, "num_input_tokens_seen": 20871000, "step": 988, "time_per_iteration": 4.2755584716796875 }, { "auxiliary_loss_clip": 0.01178952, "auxiliary_loss_mlp": 0.0104041, "balance_loss_clip": 1.05881488, "balance_loss_mlp": 1.03004456, "epoch": 0.11892021884206096, "flos": 17530369056000.0, "grad_norm": 1.9657056685116383, "language_loss": 0.75387728, "learning_rate": 3.917724256991367e-06, "loss": 0.77607089, "num_input_tokens_seen": 20889745, "step": 989, "time_per_iteration": 2.686502456665039 }, { "auxiliary_loss_clip": 0.01203132, "auxiliary_loss_mlp": 0.010496, "balance_loss_clip": 1.06265628, "balance_loss_mlp": 1.03812623, "epoch": 0.11904046173270005, "flos": 30955895763840.0, "grad_norm": 2.1633720294946435, "language_loss": 0.81432229, "learning_rate": 3.9175029831532245e-06, "loss": 0.83684963, "num_input_tokens_seen": 20909260, "step": 990, "time_per_iteration": 2.6568806171417236 }, { "auxiliary_loss_clip": 0.01202964, "auxiliary_loss_mlp": 0.01034942, "balance_loss_clip": 1.06832671, "balance_loss_mlp": 1.02449894, "epoch": 0.11916070462333915, "flos": 20157234485760.0, "grad_norm": 2.0790307712942884, "language_loss": 0.88281453, "learning_rate": 3.917281418431404e-06, "loss": 0.90519357, "num_input_tokens_seen": 20928305, "step": 991, "time_per_iteration": 2.6638309955596924 }, { "auxiliary_loss_clip": 0.01212428, "auxiliary_loss_mlp": 0.01042167, "balance_loss_clip": 1.06696117, "balance_loss_mlp": 1.0302701, "epoch": 0.11928094751397823, "flos": 23551115961600.0, "grad_norm": 1.9549558493065298, "language_loss": 0.76683331, "learning_rate": 3.917059562859516e-06, "loss": 0.78937924, "num_input_tokens_seen": 20947630, "step": 992, "time_per_iteration": 2.564865827560425 }, { "auxiliary_loss_clip": 0.01204012, "auxiliary_loss_mlp": 0.01048542, "balance_loss_clip": 1.0655551, "balance_loss_mlp": 1.03557777, "epoch": 0.11940119040461733, "flos": 23908426502400.0, "grad_norm": 2.5006763860702588, "language_loss": 0.88598096, "learning_rate": 3.916837416471218e-06, "loss": 0.90850651, "num_input_tokens_seen": 20964250, "step": 993, "time_per_iteration": 2.5598461627960205 }, { "auxiliary_loss_clip": 0.01221992, "auxiliary_loss_mlp": 0.0104173, "balance_loss_clip": 1.06321526, "balance_loss_mlp": 1.0302918, "epoch": 0.11952143329525641, "flos": 13844533835520.0, "grad_norm": 3.045862317496398, "language_loss": 0.71790588, "learning_rate": 3.916614979300207e-06, "loss": 0.74054313, "num_input_tokens_seen": 20979095, "step": 994, "time_per_iteration": 2.4886584281921387 }, { "auxiliary_loss_clip": 0.01170908, "auxiliary_loss_mlp": 0.01042568, "balance_loss_clip": 1.05977333, "balance_loss_mlp": 1.03173828, "epoch": 0.11964167618589551, "flos": 27015525792000.0, "grad_norm": 1.554600703162546, "language_loss": 0.78779697, "learning_rate": 3.9163922513802274e-06, "loss": 0.80993176, "num_input_tokens_seen": 21001430, "step": 995, "time_per_iteration": 2.7433764934539795 }, { "auxiliary_loss_clip": 0.01247442, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.06869578, "balance_loss_mlp": 1.02862787, "epoch": 0.1197619190765346, "flos": 12567622273920.0, "grad_norm": 2.7222687295562857, "language_loss": 0.82726228, "learning_rate": 3.916169232745067e-06, "loss": 0.85013628, "num_input_tokens_seen": 21019105, "step": 996, "time_per_iteration": 2.5554778575897217 }, { "auxiliary_loss_clip": 0.01201251, "auxiliary_loss_mlp": 0.01045157, "balance_loss_clip": 1.0626092, "balance_loss_mlp": 1.033355, "epoch": 0.11988216196717369, "flos": 16909437623040.0, "grad_norm": 2.586800204020113, "language_loss": 0.91545796, "learning_rate": 3.915945923428559e-06, "loss": 0.93792206, "num_input_tokens_seen": 21035630, "step": 997, "time_per_iteration": 2.5254056453704834 }, { "auxiliary_loss_clip": 0.01223692, "auxiliary_loss_mlp": 0.01038237, "balance_loss_clip": 1.06402433, "balance_loss_mlp": 1.02670395, "epoch": 0.12000240485781279, "flos": 16216577205120.0, "grad_norm": 2.3763455338426884, "language_loss": 0.82817006, "learning_rate": 3.915722323464577e-06, "loss": 0.85078937, "num_input_tokens_seen": 21054235, "step": 998, "time_per_iteration": 2.485231876373291 }, { "auxiliary_loss_clip": 0.01229292, "auxiliary_loss_mlp": 0.01042071, "balance_loss_clip": 1.06680942, "balance_loss_mlp": 1.03034723, "epoch": 0.12012264774845187, "flos": 49344887525760.0, "grad_norm": 2.3394692009951137, "language_loss": 0.70101988, "learning_rate": 3.91549843288704e-06, "loss": 0.72373348, "num_input_tokens_seen": 21077915, "step": 999, "time_per_iteration": 2.760650396347046 }, { "auxiliary_loss_clip": 0.01193444, "auxiliary_loss_mlp": 0.00765927, "balance_loss_clip": 1.05798531, "balance_loss_mlp": 1.00029325, "epoch": 0.12024289063909097, "flos": 26979435601920.0, "grad_norm": 1.9741332994676182, "language_loss": 0.79050684, "learning_rate": 3.915274251729916e-06, "loss": 0.81010056, "num_input_tokens_seen": 21099205, "step": 1000, "time_per_iteration": 2.697967290878296 }, { "auxiliary_loss_clip": 0.01199432, "auxiliary_loss_mlp": 0.01035334, "balance_loss_clip": 1.0646801, "balance_loss_mlp": 1.02346087, "epoch": 0.12036313352973005, "flos": 19537308633600.0, "grad_norm": 1.9642043382316006, "language_loss": 0.89977753, "learning_rate": 3.91504978002721e-06, "loss": 0.92212522, "num_input_tokens_seen": 21118260, "step": 1001, "time_per_iteration": 2.580869197845459 }, { "auxiliary_loss_clip": 0.01215672, "auxiliary_loss_mlp": 0.00765759, "balance_loss_clip": 1.06333685, "balance_loss_mlp": 1.00017905, "epoch": 0.12048337642036915, "flos": 17268256535040.0, "grad_norm": 1.956488779013999, "language_loss": 0.76307422, "learning_rate": 3.914825017812974e-06, "loss": 0.78288853, "num_input_tokens_seen": 21134910, "step": 1002, "time_per_iteration": 2.6101584434509277 }, { "auxiliary_loss_clip": 0.0121423, "auxiliary_loss_mlp": 0.01042335, "balance_loss_clip": 1.06611443, "balance_loss_mlp": 1.03052139, "epoch": 0.12060361931100824, "flos": 22856962654080.0, "grad_norm": 2.6868168685223743, "language_loss": 0.72553027, "learning_rate": 3.9145999651213065e-06, "loss": 0.74809587, "num_input_tokens_seen": 21154150, "step": 1003, "time_per_iteration": 2.6212010383605957 }, { "auxiliary_loss_clip": 0.01230815, "auxiliary_loss_mlp": 0.01043561, "balance_loss_clip": 1.06716847, "balance_loss_mlp": 1.03109193, "epoch": 0.12072386220164733, "flos": 16726795943040.0, "grad_norm": 3.28910021969422, "language_loss": 0.88465434, "learning_rate": 3.9143746219863465e-06, "loss": 0.9073981, "num_input_tokens_seen": 21171255, "step": 1004, "time_per_iteration": 2.501833915710449 }, { "auxiliary_loss_clip": 0.01119583, "auxiliary_loss_mlp": 0.01007689, "balance_loss_clip": 1.03047168, "balance_loss_mlp": 1.00439918, "epoch": 0.12084410509228642, "flos": 55144176105600.0, "grad_norm": 0.9718233589977261, "language_loss": 0.64785594, "learning_rate": 3.914148988442278e-06, "loss": 0.66912866, "num_input_tokens_seen": 21227045, "step": 1005, "time_per_iteration": 3.104682683944702 }, { "auxiliary_loss_clip": 0.01200345, "auxiliary_loss_mlp": 0.01036453, "balance_loss_clip": 1.06267571, "balance_loss_mlp": 1.02454448, "epoch": 0.1209643479829255, "flos": 26760236855040.0, "grad_norm": 2.7257032149404576, "language_loss": 0.94802999, "learning_rate": 3.91392306452333e-06, "loss": 0.97039801, "num_input_tokens_seen": 21244120, "step": 1006, "time_per_iteration": 2.603006601333618 }, { "auxiliary_loss_clip": 0.0124971, "auxiliary_loss_mlp": 0.0103622, "balance_loss_clip": 1.07035995, "balance_loss_mlp": 1.02459073, "epoch": 0.1210845908735646, "flos": 11035026725760.0, "grad_norm": 4.355028201786306, "language_loss": 0.66476691, "learning_rate": 3.913696850263774e-06, "loss": 0.68762624, "num_input_tokens_seen": 21258485, "step": 1007, "time_per_iteration": 2.5299501419067383 }, { "auxiliary_loss_clip": 0.01228557, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.06662476, "balance_loss_mlp": 1.02625632, "epoch": 0.1212048337642037, "flos": 20484631975680.0, "grad_norm": 2.345442799468082, "language_loss": 0.79027092, "learning_rate": 3.913470345697929e-06, "loss": 0.81293243, "num_input_tokens_seen": 21277115, "step": 1008, "time_per_iteration": 3.3289883136749268 }, { "auxiliary_loss_clip": 0.01184583, "auxiliary_loss_mlp": 0.01040448, "balance_loss_clip": 1.06186759, "balance_loss_mlp": 1.02905202, "epoch": 0.12132507665484278, "flos": 22346061557760.0, "grad_norm": 2.14806818769836, "language_loss": 0.85448384, "learning_rate": 3.913243550860153e-06, "loss": 0.87673414, "num_input_tokens_seen": 21294880, "step": 1009, "time_per_iteration": 2.643724203109741 }, { "auxiliary_loss_clip": 0.01235156, "auxiliary_loss_mlp": 0.01041644, "balance_loss_clip": 1.07236242, "balance_loss_mlp": 1.02952623, "epoch": 0.12144531954548188, "flos": 29314957818240.0, "grad_norm": 4.906364335392105, "language_loss": 0.76191723, "learning_rate": 3.913016465784852e-06, "loss": 0.78468519, "num_input_tokens_seen": 21315555, "step": 1010, "time_per_iteration": 2.5796854496002197 }, { "auxiliary_loss_clip": 0.01181669, "auxiliary_loss_mlp": 0.01040886, "balance_loss_clip": 1.05798078, "balance_loss_mlp": 1.02863169, "epoch": 0.12156556243612096, "flos": 20485242506880.0, "grad_norm": 2.396291420679566, "language_loss": 0.72145033, "learning_rate": 3.912789090506474e-06, "loss": 0.74367583, "num_input_tokens_seen": 21334815, "step": 1011, "time_per_iteration": 2.6079001426696777 }, { "auxiliary_loss_clip": 0.01204323, "auxiliary_loss_mlp": 0.01046784, "balance_loss_clip": 1.0612576, "balance_loss_mlp": 1.03437412, "epoch": 0.12168580532676006, "flos": 16472009796480.0, "grad_norm": 2.8260098849450337, "language_loss": 0.71941292, "learning_rate": 3.9125614250595114e-06, "loss": 0.74192393, "num_input_tokens_seen": 21351025, "step": 1012, "time_per_iteration": 3.292996406555176 }, { "auxiliary_loss_clip": 0.01228391, "auxiliary_loss_mlp": 0.01038409, "balance_loss_clip": 1.06464124, "balance_loss_mlp": 1.0263629, "epoch": 0.12180604821739914, "flos": 15341290588800.0, "grad_norm": 4.1629457312837275, "language_loss": 0.8891508, "learning_rate": 3.912333469478502e-06, "loss": 0.91181874, "num_input_tokens_seen": 21368990, "step": 1013, "time_per_iteration": 2.4810686111450195 }, { "auxiliary_loss_clip": 0.01211681, "auxiliary_loss_mlp": 0.01036762, "balance_loss_clip": 1.06220818, "balance_loss_mlp": 1.02556276, "epoch": 0.12192629110803824, "flos": 19318038059520.0, "grad_norm": 2.192660204255425, "language_loss": 0.7811994, "learning_rate": 3.912105223798025e-06, "loss": 0.80368388, "num_input_tokens_seen": 21388410, "step": 1014, "time_per_iteration": 4.213102102279663 }, { "auxiliary_loss_clip": 0.01104352, "auxiliary_loss_mlp": 0.01006984, "balance_loss_clip": 1.02628875, "balance_loss_mlp": 1.00400352, "epoch": 0.12204653399867733, "flos": 47725354085760.0, "grad_norm": 1.004827507167101, "language_loss": 0.67704082, "learning_rate": 3.9118766880527065e-06, "loss": 0.69815409, "num_input_tokens_seen": 21442845, "step": 1015, "time_per_iteration": 3.033341884613037 }, { "auxiliary_loss_clip": 0.01172378, "auxiliary_loss_mlp": 0.01033919, "balance_loss_clip": 1.05895948, "balance_loss_mlp": 1.02293396, "epoch": 0.12216677688931642, "flos": 18221936584320.0, "grad_norm": 1.7625180339866533, "language_loss": 0.73916751, "learning_rate": 3.9116478622772145e-06, "loss": 0.76123047, "num_input_tokens_seen": 21461420, "step": 1016, "time_per_iteration": 2.603590965270996 }, { "auxiliary_loss_clip": 0.01228523, "auxiliary_loss_mlp": 0.01047251, "balance_loss_clip": 1.06761563, "balance_loss_mlp": 1.03543186, "epoch": 0.12228701977995551, "flos": 27525636789120.0, "grad_norm": 1.641851045288504, "language_loss": 0.87946522, "learning_rate": 3.911418746506261e-06, "loss": 0.90222299, "num_input_tokens_seen": 21481550, "step": 1017, "time_per_iteration": 2.5945804119110107 }, { "auxiliary_loss_clip": 0.01236243, "auxiliary_loss_mlp": 0.0104591, "balance_loss_clip": 1.07309771, "balance_loss_mlp": 1.03404868, "epoch": 0.1224072626705946, "flos": 21798136517760.0, "grad_norm": 1.7461045162125741, "language_loss": 0.78283679, "learning_rate": 3.911189340774604e-06, "loss": 0.80565834, "num_input_tokens_seen": 21501680, "step": 1018, "time_per_iteration": 2.583975315093994 }, { "auxiliary_loss_clip": 0.01222008, "auxiliary_loss_mlp": 0.01040371, "balance_loss_clip": 1.06582403, "balance_loss_mlp": 1.0284853, "epoch": 0.1225275055612337, "flos": 20703758895360.0, "grad_norm": 1.8798613184521535, "language_loss": 0.79470217, "learning_rate": 3.910959645117043e-06, "loss": 0.81732595, "num_input_tokens_seen": 21521015, "step": 1019, "time_per_iteration": 2.5504541397094727 }, { "auxiliary_loss_clip": 0.01108783, "auxiliary_loss_mlp": 0.00755753, "balance_loss_clip": 1.02608109, "balance_loss_mlp": 0.99982893, "epoch": 0.12264774845187278, "flos": 57745294462080.0, "grad_norm": 0.8250716008429229, "language_loss": 0.5673728, "learning_rate": 3.910729659568423e-06, "loss": 0.58601809, "num_input_tokens_seen": 21578200, "step": 1020, "time_per_iteration": 3.0903711318969727 }, { "auxiliary_loss_clip": 0.01214141, "auxiliary_loss_mlp": 0.01040553, "balance_loss_clip": 1.06681371, "balance_loss_mlp": 1.02984226, "epoch": 0.12276799134251187, "flos": 26396282298240.0, "grad_norm": 1.8921659514973732, "language_loss": 0.82302976, "learning_rate": 3.9104993841636344e-06, "loss": 0.8455767, "num_input_tokens_seen": 21598770, "step": 1021, "time_per_iteration": 2.5958285331726074 }, { "auxiliary_loss_clip": 0.01213043, "auxiliary_loss_mlp": 0.00765046, "balance_loss_clip": 1.06837499, "balance_loss_mlp": 1.00014257, "epoch": 0.12288823423315097, "flos": 21064193919360.0, "grad_norm": 1.838728974577964, "language_loss": 0.80708408, "learning_rate": 3.910268818937608e-06, "loss": 0.82686502, "num_input_tokens_seen": 21616925, "step": 1022, "time_per_iteration": 2.6083619594573975 }, { "auxiliary_loss_clip": 0.01182559, "auxiliary_loss_mlp": 0.01040164, "balance_loss_clip": 1.06323099, "balance_loss_mlp": 1.02887464, "epoch": 0.12300847712379005, "flos": 12312441077760.0, "grad_norm": 2.79147987123607, "language_loss": 0.87200677, "learning_rate": 3.9100379639253196e-06, "loss": 0.89423394, "num_input_tokens_seen": 21633645, "step": 1023, "time_per_iteration": 2.563525676727295 }, { "auxiliary_loss_clip": 0.01209931, "auxiliary_loss_mlp": 0.01038633, "balance_loss_clip": 1.05983126, "balance_loss_mlp": 1.02677727, "epoch": 0.12312872001442915, "flos": 16762239688320.0, "grad_norm": 7.128808907151381, "language_loss": 0.86316663, "learning_rate": 3.909806819161791e-06, "loss": 0.8856523, "num_input_tokens_seen": 21649120, "step": 1024, "time_per_iteration": 2.5015146732330322 }, { "auxiliary_loss_clip": 0.01200876, "auxiliary_loss_mlp": 0.01035231, "balance_loss_clip": 1.06189966, "balance_loss_mlp": 1.02354836, "epoch": 0.12324896290506823, "flos": 18404937400320.0, "grad_norm": 3.367361678904254, "language_loss": 0.85992765, "learning_rate": 3.909575384682086e-06, "loss": 0.88228869, "num_input_tokens_seen": 21668000, "step": 1025, "time_per_iteration": 2.5635976791381836 }, { "auxiliary_loss_clip": 0.01230146, "auxiliary_loss_mlp": 0.01053171, "balance_loss_clip": 1.06511676, "balance_loss_mlp": 1.04119062, "epoch": 0.12336920579570733, "flos": 18915407533440.0, "grad_norm": 2.107270800292623, "language_loss": 0.69035459, "learning_rate": 3.9093436605213144e-06, "loss": 0.71318769, "num_input_tokens_seen": 21688500, "step": 1026, "time_per_iteration": 2.5208303928375244 }, { "auxiliary_loss_clip": 0.01213136, "auxiliary_loss_mlp": 0.01045114, "balance_loss_clip": 1.06448293, "balance_loss_mlp": 1.03399217, "epoch": 0.12348944868634643, "flos": 23878369797120.0, "grad_norm": 1.931677531205367, "language_loss": 0.79118574, "learning_rate": 3.909111646714627e-06, "loss": 0.81376833, "num_input_tokens_seen": 21709345, "step": 1027, "time_per_iteration": 2.563126802444458 }, { "auxiliary_loss_clip": 0.01239718, "auxiliary_loss_mlp": 0.01032906, "balance_loss_clip": 1.06591201, "balance_loss_mlp": 1.02241564, "epoch": 0.12360969157698551, "flos": 19026084314880.0, "grad_norm": 3.2425410989092485, "language_loss": 0.72293043, "learning_rate": 3.9088793432972206e-06, "loss": 0.74565667, "num_input_tokens_seen": 21728165, "step": 1028, "time_per_iteration": 2.495054006576538 }, { "auxiliary_loss_clip": 0.0118091, "auxiliary_loss_mlp": 0.01041615, "balance_loss_clip": 1.06134009, "balance_loss_mlp": 1.03015304, "epoch": 0.1237299344676246, "flos": 13224607983360.0, "grad_norm": 2.1911629470058616, "language_loss": 0.81957233, "learning_rate": 3.908646750304336e-06, "loss": 0.84179765, "num_input_tokens_seen": 21745850, "step": 1029, "time_per_iteration": 2.695385217666626 }, { "auxiliary_loss_clip": 0.01215696, "auxiliary_loss_mlp": 0.01038473, "balance_loss_clip": 1.06603193, "balance_loss_mlp": 1.02723098, "epoch": 0.12385017735826369, "flos": 20485673470080.0, "grad_norm": 1.736101877835852, "language_loss": 0.87471676, "learning_rate": 3.908413867771257e-06, "loss": 0.89725852, "num_input_tokens_seen": 21764760, "step": 1030, "time_per_iteration": 2.531994342803955 }, { "auxiliary_loss_clip": 0.01227987, "auxiliary_loss_mlp": 0.01044605, "balance_loss_clip": 1.06733656, "balance_loss_mlp": 1.03248167, "epoch": 0.12397042024890279, "flos": 17347835116800.0, "grad_norm": 1.783422361228728, "language_loss": 0.80791593, "learning_rate": 3.908180695733311e-06, "loss": 0.83064187, "num_input_tokens_seen": 21784250, "step": 1031, "time_per_iteration": 2.479642152786255 }, { "auxiliary_loss_clip": 0.01159212, "auxiliary_loss_mlp": 0.01046317, "balance_loss_clip": 1.05432057, "balance_loss_mlp": 1.03497982, "epoch": 0.12409066313954187, "flos": 20412343854720.0, "grad_norm": 1.8873261131681616, "language_loss": 0.82790679, "learning_rate": 3.907947234225871e-06, "loss": 0.84996206, "num_input_tokens_seen": 21803260, "step": 1032, "time_per_iteration": 2.625009775161743 }, { "auxiliary_loss_clip": 0.01160627, "auxiliary_loss_mlp": 0.01035329, "balance_loss_clip": 1.0585804, "balance_loss_mlp": 1.02444494, "epoch": 0.12421090603018096, "flos": 20736688688640.0, "grad_norm": 2.190576644275307, "language_loss": 0.87038624, "learning_rate": 3.907713483284352e-06, "loss": 0.89234579, "num_input_tokens_seen": 21822735, "step": 1033, "time_per_iteration": 2.6443607807159424 }, { "auxiliary_loss_clip": 0.01140039, "auxiliary_loss_mlp": 0.01044361, "balance_loss_clip": 1.05256724, "balance_loss_mlp": 1.03128386, "epoch": 0.12433114892082006, "flos": 24498834353280.0, "grad_norm": 2.2901131175290224, "language_loss": 0.97754049, "learning_rate": 3.907479442944216e-06, "loss": 0.99938446, "num_input_tokens_seen": 21841140, "step": 1034, "time_per_iteration": 3.532508134841919 }, { "auxiliary_loss_clip": 0.01225909, "auxiliary_loss_mlp": 0.01036581, "balance_loss_clip": 1.0662756, "balance_loss_mlp": 1.02637112, "epoch": 0.12445139181145914, "flos": 19682315838720.0, "grad_norm": 2.198523362472246, "language_loss": 0.9256736, "learning_rate": 3.907245113240963e-06, "loss": 0.94829845, "num_input_tokens_seen": 21859260, "step": 1035, "time_per_iteration": 2.4889605045318604 }, { "auxiliary_loss_clip": 0.01192853, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 1.05667853, "balance_loss_mlp": 1.0236001, "epoch": 0.12457163470209824, "flos": 46423087522560.0, "grad_norm": 1.6752296986508524, "language_loss": 0.73605829, "learning_rate": 3.907010494210144e-06, "loss": 0.75833815, "num_input_tokens_seen": 21881920, "step": 1036, "time_per_iteration": 2.764355421066284 }, { "auxiliary_loss_clip": 0.01230011, "auxiliary_loss_mlp": 0.01046237, "balance_loss_clip": 1.06778121, "balance_loss_mlp": 1.03355885, "epoch": 0.12469187759273732, "flos": 20376289578240.0, "grad_norm": 2.0552989901705367, "language_loss": 0.92072618, "learning_rate": 3.9067755858873495e-06, "loss": 0.9434886, "num_input_tokens_seen": 21898720, "step": 1037, "time_per_iteration": 2.509488344192505 }, { "auxiliary_loss_clip": 0.01088206, "auxiliary_loss_mlp": 0.01006039, "balance_loss_clip": 1.01946926, "balance_loss_mlp": 1.00302303, "epoch": 0.12481212048337642, "flos": 69224641447680.0, "grad_norm": 0.8631232454559965, "language_loss": 0.62778598, "learning_rate": 3.906540388308214e-06, "loss": 0.64872843, "num_input_tokens_seen": 21958305, "step": 1038, "time_per_iteration": 3.111362934112549 }, { "auxiliary_loss_clip": 0.01166583, "auxiliary_loss_mlp": 0.01047939, "balance_loss_clip": 1.05954885, "balance_loss_mlp": 1.03619719, "epoch": 0.12493236337401552, "flos": 18223696350720.0, "grad_norm": 1.7252077372252637, "language_loss": 0.81430256, "learning_rate": 3.906304901508417e-06, "loss": 0.83644783, "num_input_tokens_seen": 21977205, "step": 1039, "time_per_iteration": 3.4055750370025635 }, { "auxiliary_loss_clip": 0.01230046, "auxiliary_loss_mlp": 0.01043586, "balance_loss_clip": 1.07025886, "balance_loss_mlp": 1.03304839, "epoch": 0.12505260626465461, "flos": 30044375303040.0, "grad_norm": 2.030673336104599, "language_loss": 0.7545746, "learning_rate": 3.9060691255236835e-06, "loss": 0.77731097, "num_input_tokens_seen": 21997770, "step": 1040, "time_per_iteration": 2.5736887454986572 }, { "auxiliary_loss_clip": 0.01219124, "auxiliary_loss_mlp": 0.01041874, "balance_loss_clip": 1.05977941, "balance_loss_mlp": 1.02981019, "epoch": 0.1251728491552937, "flos": 24433980347520.0, "grad_norm": 1.7378624766192161, "language_loss": 0.80589688, "learning_rate": 3.905833060389778e-06, "loss": 0.82850689, "num_input_tokens_seen": 22021890, "step": 1041, "time_per_iteration": 4.35455584526062 }, { "auxiliary_loss_clip": 0.01242913, "auxiliary_loss_mlp": 0.00765922, "balance_loss_clip": 1.06769443, "balance_loss_mlp": 1.00015247, "epoch": 0.12529309204593278, "flos": 27119809952640.0, "grad_norm": 2.4006404917695154, "language_loss": 0.78460449, "learning_rate": 3.905596706142513e-06, "loss": 0.80469286, "num_input_tokens_seen": 22043300, "step": 1042, "time_per_iteration": 2.5455574989318848 }, { "auxiliary_loss_clip": 0.0119042, "auxiliary_loss_mlp": 0.01042626, "balance_loss_clip": 1.05873978, "balance_loss_mlp": 1.03075874, "epoch": 0.12541333493657186, "flos": 30774151923840.0, "grad_norm": 1.9265047748491917, "language_loss": 0.85775608, "learning_rate": 3.9053600628177435e-06, "loss": 0.88008654, "num_input_tokens_seen": 22062910, "step": 1043, "time_per_iteration": 2.6622276306152344 }, { "auxiliary_loss_clip": 0.01240352, "auxiliary_loss_mlp": 0.01037129, "balance_loss_clip": 1.06610239, "balance_loss_mlp": 1.02621567, "epoch": 0.12553357782721097, "flos": 23659566099840.0, "grad_norm": 2.189959295462693, "language_loss": 0.84603149, "learning_rate": 3.905123130451367e-06, "loss": 0.86880624, "num_input_tokens_seen": 22084010, "step": 1044, "time_per_iteration": 2.6156985759735107 }, { "auxiliary_loss_clip": 0.01243552, "auxiliary_loss_mlp": 0.01038057, "balance_loss_clip": 1.06874752, "balance_loss_mlp": 1.02632654, "epoch": 0.12565382071785006, "flos": 24863758577280.0, "grad_norm": 1.8574377599064194, "language_loss": 0.79525554, "learning_rate": 3.904885909079326e-06, "loss": 0.8180716, "num_input_tokens_seen": 22102795, "step": 1045, "time_per_iteration": 2.5214390754699707 }, { "auxiliary_loss_clip": 0.0122688, "auxiliary_loss_mlp": 0.01037631, "balance_loss_clip": 1.06402993, "balance_loss_mlp": 1.02630591, "epoch": 0.12577406360848914, "flos": 21360780518400.0, "grad_norm": 2.1491868964460044, "language_loss": 0.77576238, "learning_rate": 3.904648398737607e-06, "loss": 0.79840744, "num_input_tokens_seen": 22121360, "step": 1046, "time_per_iteration": 2.5263524055480957 }, { "auxiliary_loss_clip": 0.01241834, "auxiliary_loss_mlp": 0.01042549, "balance_loss_clip": 1.06716454, "balance_loss_mlp": 1.03160596, "epoch": 0.12589430649912825, "flos": 36138056774400.0, "grad_norm": 1.7367113541557497, "language_loss": 0.78241658, "learning_rate": 3.9044105994622406e-06, "loss": 0.80526048, "num_input_tokens_seen": 22142505, "step": 1047, "time_per_iteration": 2.6438097953796387 }, { "auxiliary_loss_clip": 0.01213877, "auxiliary_loss_mlp": 0.00766208, "balance_loss_clip": 1.06240201, "balance_loss_mlp": 1.00026429, "epoch": 0.12601454938976733, "flos": 25337671643520.0, "grad_norm": 1.9834209488074352, "language_loss": 0.81556469, "learning_rate": 3.9041725112893005e-06, "loss": 0.83536553, "num_input_tokens_seen": 22163730, "step": 1048, "time_per_iteration": 2.6039910316467285 }, { "auxiliary_loss_clip": 0.01191212, "auxiliary_loss_mlp": 0.0103965, "balance_loss_clip": 1.06269813, "balance_loss_mlp": 1.02858782, "epoch": 0.12613479228040642, "flos": 15560094286080.0, "grad_norm": 2.0108690803891704, "language_loss": 0.74804461, "learning_rate": 3.903934134254904e-06, "loss": 0.77035326, "num_input_tokens_seen": 22181520, "step": 1049, "time_per_iteration": 2.567673444747925 }, { "auxiliary_loss_clip": 0.01230172, "auxiliary_loss_mlp": 0.01043875, "balance_loss_clip": 1.06453228, "balance_loss_mlp": 1.03203142, "epoch": 0.1262550351710455, "flos": 21470595373440.0, "grad_norm": 2.921216138891059, "language_loss": 0.84975553, "learning_rate": 3.903695468395213e-06, "loss": 0.87249595, "num_input_tokens_seen": 22199390, "step": 1050, "time_per_iteration": 2.5238101482391357 }, { "auxiliary_loss_clip": 0.01213068, "auxiliary_loss_mlp": 0.01042308, "balance_loss_clip": 1.05948162, "balance_loss_mlp": 1.03168654, "epoch": 0.1263752780616846, "flos": 31576719456000.0, "grad_norm": 2.6318880292129165, "language_loss": 0.55679047, "learning_rate": 3.903456513746434e-06, "loss": 0.57934421, "num_input_tokens_seen": 22220365, "step": 1051, "time_per_iteration": 2.613668203353882 }, { "auxiliary_loss_clip": 0.01238207, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.06540203, "balance_loss_mlp": 1.02813554, "epoch": 0.1264955209523237, "flos": 28768217927040.0, "grad_norm": 1.7206193974943775, "language_loss": 0.87629473, "learning_rate": 3.903217270344815e-06, "loss": 0.89906412, "num_input_tokens_seen": 22240615, "step": 1052, "time_per_iteration": 2.537637710571289 }, { "auxiliary_loss_clip": 0.01184548, "auxiliary_loss_mlp": 0.01037958, "balance_loss_clip": 1.05635595, "balance_loss_mlp": 1.02667487, "epoch": 0.12661576384296278, "flos": 29241125412480.0, "grad_norm": 2.076397698030303, "language_loss": 0.82413191, "learning_rate": 3.902977738226648e-06, "loss": 0.84635699, "num_input_tokens_seen": 22261350, "step": 1053, "time_per_iteration": 2.6436336040496826 }, { "auxiliary_loss_clip": 0.01229076, "auxiliary_loss_mlp": 0.0104186, "balance_loss_clip": 1.06581616, "balance_loss_mlp": 1.02984357, "epoch": 0.12673600673360189, "flos": 20850346298880.0, "grad_norm": 2.012519176240002, "language_loss": 0.91379714, "learning_rate": 3.902737917428273e-06, "loss": 0.93650657, "num_input_tokens_seen": 22279515, "step": 1054, "time_per_iteration": 2.490696668624878 }, { "auxiliary_loss_clip": 0.01239093, "auxiliary_loss_mlp": 0.01039842, "balance_loss_clip": 1.06527889, "balance_loss_mlp": 1.02880883, "epoch": 0.12685624962424097, "flos": 25263695583360.0, "grad_norm": 1.6635908928488745, "language_loss": 0.83977073, "learning_rate": 3.902497807986068e-06, "loss": 0.86256003, "num_input_tokens_seen": 22299535, "step": 1055, "time_per_iteration": 2.502523422241211 }, { "auxiliary_loss_clip": 0.01194806, "auxiliary_loss_mlp": 0.01039005, "balance_loss_clip": 1.05784416, "balance_loss_mlp": 1.02744198, "epoch": 0.12697649251488005, "flos": 27527109246720.0, "grad_norm": 1.5842689481459875, "language_loss": 0.83632183, "learning_rate": 3.902257409936458e-06, "loss": 0.85865998, "num_input_tokens_seen": 22320300, "step": 1056, "time_per_iteration": 2.641488552093506 }, { "auxiliary_loss_clip": 0.01209253, "auxiliary_loss_mlp": 0.01039576, "balance_loss_clip": 1.06527722, "balance_loss_mlp": 1.0288713, "epoch": 0.12709673540551916, "flos": 21251863503360.0, "grad_norm": 1.8396098698391874, "language_loss": 0.83933568, "learning_rate": 3.902016723315912e-06, "loss": 0.86182398, "num_input_tokens_seen": 22338240, "step": 1057, "time_per_iteration": 2.5296690464019775 }, { "auxiliary_loss_clip": 0.01221255, "auxiliary_loss_mlp": 0.01040542, "balance_loss_clip": 1.06176949, "balance_loss_mlp": 1.02964067, "epoch": 0.12721697829615825, "flos": 25337707557120.0, "grad_norm": 7.034821121062021, "language_loss": 0.69578528, "learning_rate": 3.901775748160941e-06, "loss": 0.71840322, "num_input_tokens_seen": 22357420, "step": 1058, "time_per_iteration": 2.536893606185913 }, { "auxiliary_loss_clip": 0.01100003, "auxiliary_loss_mlp": 0.01008418, "balance_loss_clip": 1.02518487, "balance_loss_mlp": 1.00559258, "epoch": 0.12733722118679733, "flos": 61943287754880.0, "grad_norm": 0.7970972117639246, "language_loss": 0.60888958, "learning_rate": 3.901534484508101e-06, "loss": 0.62997377, "num_input_tokens_seen": 22420095, "step": 1059, "time_per_iteration": 3.098818302154541 }, { "auxiliary_loss_clip": 0.01200318, "auxiliary_loss_mlp": 0.01037541, "balance_loss_clip": 1.0596056, "balance_loss_mlp": 1.0263294, "epoch": 0.1274574640774364, "flos": 26976742081920.0, "grad_norm": 2.7969127552917716, "language_loss": 0.74671823, "learning_rate": 3.901292932393991e-06, "loss": 0.76909685, "num_input_tokens_seen": 22438975, "step": 1060, "time_per_iteration": 2.5649211406707764 }, { "auxiliary_loss_clip": 0.0123987, "auxiliary_loss_mlp": 0.01042676, "balance_loss_clip": 1.06724727, "balance_loss_mlp": 1.03141642, "epoch": 0.12757770696807552, "flos": 22236318529920.0, "grad_norm": 2.818224049382878, "language_loss": 0.85229158, "learning_rate": 3.9010510918552555e-06, "loss": 0.87511706, "num_input_tokens_seen": 22458050, "step": 1061, "time_per_iteration": 3.32651424407959 }, { "auxiliary_loss_clip": 0.01206446, "auxiliary_loss_mlp": 0.01045209, "balance_loss_clip": 1.06109846, "balance_loss_mlp": 1.03220916, "epoch": 0.1276979498587146, "flos": 28547905858560.0, "grad_norm": 3.845495566493531, "language_loss": 0.74482942, "learning_rate": 3.900808962928581e-06, "loss": 0.76734602, "num_input_tokens_seen": 22475665, "step": 1062, "time_per_iteration": 2.6039249897003174 }, { "auxiliary_loss_clip": 0.01241533, "auxiliary_loss_mlp": 0.01043355, "balance_loss_clip": 1.069332, "balance_loss_mlp": 1.03210807, "epoch": 0.1278181927493537, "flos": 17420338719360.0, "grad_norm": 2.3556815949442447, "language_loss": 0.89496851, "learning_rate": 3.900566545650698e-06, "loss": 0.91781747, "num_input_tokens_seen": 22493335, "step": 1063, "time_per_iteration": 2.5018365383148193 }, { "auxiliary_loss_clip": 0.01224384, "auxiliary_loss_mlp": 0.01038408, "balance_loss_clip": 1.06568766, "balance_loss_mlp": 1.02628446, "epoch": 0.1279384356399928, "flos": 21138636856320.0, "grad_norm": 2.4870396700110913, "language_loss": 0.81683242, "learning_rate": 3.900323840058381e-06, "loss": 0.83946037, "num_input_tokens_seen": 22511045, "step": 1064, "time_per_iteration": 2.5124664306640625 }, { "auxiliary_loss_clip": 0.01222741, "auxiliary_loss_mlp": 0.01039625, "balance_loss_clip": 1.06175506, "balance_loss_mlp": 1.02920628, "epoch": 0.12805867853063188, "flos": 26576733248640.0, "grad_norm": 3.1755074166154538, "language_loss": 0.81772017, "learning_rate": 3.900080846188449e-06, "loss": 0.84034383, "num_input_tokens_seen": 22529635, "step": 1065, "time_per_iteration": 2.5471057891845703 }, { "auxiliary_loss_clip": 0.01238892, "auxiliary_loss_mlp": 0.01036302, "balance_loss_clip": 1.06594896, "balance_loss_mlp": 1.02464962, "epoch": 0.12817892142127096, "flos": 16436206915200.0, "grad_norm": 2.080051520696307, "language_loss": 0.8166948, "learning_rate": 3.8998375640777625e-06, "loss": 0.83944678, "num_input_tokens_seen": 22547505, "step": 1066, "time_per_iteration": 3.271850347518921 }, { "auxiliary_loss_clip": 0.01106057, "auxiliary_loss_mlp": 0.01002177, "balance_loss_clip": 1.03350782, "balance_loss_mlp": 0.99933952, "epoch": 0.12829916431191005, "flos": 60757049099520.0, "grad_norm": 0.7070557646598391, "language_loss": 0.52648526, "learning_rate": 3.899593993763229e-06, "loss": 0.54756761, "num_input_tokens_seen": 22608465, "step": 1067, "time_per_iteration": 4.689392805099487 }, { "auxiliary_loss_clip": 0.01189222, "auxiliary_loss_mlp": 0.01041807, "balance_loss_clip": 1.06122744, "balance_loss_mlp": 1.02853906, "epoch": 0.12841940720254916, "flos": 29786895636480.0, "grad_norm": 4.176731565411882, "language_loss": 0.81389064, "learning_rate": 3.899350135281796e-06, "loss": 0.83620095, "num_input_tokens_seen": 22629465, "step": 1068, "time_per_iteration": 2.6616933345794678 }, { "auxiliary_loss_clip": 0.01196689, "auxiliary_loss_mlp": 0.01038846, "balance_loss_clip": 1.06291044, "balance_loss_mlp": 1.02799249, "epoch": 0.12853965009318824, "flos": 25951851319680.0, "grad_norm": 2.0782611993186997, "language_loss": 0.7963084, "learning_rate": 3.8991059886704585e-06, "loss": 0.81866372, "num_input_tokens_seen": 22648970, "step": 1069, "time_per_iteration": 2.618229866027832 }, { "auxiliary_loss_clip": 0.01186604, "auxiliary_loss_mlp": 0.01045415, "balance_loss_clip": 1.0594517, "balance_loss_mlp": 1.03403068, "epoch": 0.12865989298382732, "flos": 30846871008000.0, "grad_norm": 2.1171472038622725, "language_loss": 0.8299666, "learning_rate": 3.898861553966252e-06, "loss": 0.85228682, "num_input_tokens_seen": 22668620, "step": 1070, "time_per_iteration": 2.625675916671753 }, { "auxiliary_loss_clip": 0.01150602, "auxiliary_loss_mlp": 0.0104496, "balance_loss_clip": 1.05454755, "balance_loss_mlp": 1.03380799, "epoch": 0.12878013587446643, "flos": 25885776251520.0, "grad_norm": 1.767287465185013, "language_loss": 0.88117641, "learning_rate": 3.898616831206257e-06, "loss": 0.90313202, "num_input_tokens_seen": 22689045, "step": 1071, "time_per_iteration": 2.7152910232543945 }, { "auxiliary_loss_clip": 0.01190213, "auxiliary_loss_mlp": 0.01039473, "balance_loss_clip": 1.05717158, "balance_loss_mlp": 1.025841, "epoch": 0.12890037876510552, "flos": 23333138277120.0, "grad_norm": 2.7753628309298186, "language_loss": 0.76919007, "learning_rate": 3.8983718204276e-06, "loss": 0.79148692, "num_input_tokens_seen": 22711265, "step": 1072, "time_per_iteration": 2.6214137077331543 }, { "auxiliary_loss_clip": 0.01205405, "auxiliary_loss_mlp": 0.0104505, "balance_loss_clip": 1.06063318, "balance_loss_mlp": 1.03459585, "epoch": 0.1290206216557446, "flos": 23587242065280.0, "grad_norm": 1.801344787788328, "language_loss": 0.82660186, "learning_rate": 3.898126521667446e-06, "loss": 0.84910643, "num_input_tokens_seen": 22731420, "step": 1073, "time_per_iteration": 2.5689280033111572 }, { "auxiliary_loss_clip": 0.01220513, "auxiliary_loss_mlp": 0.01048484, "balance_loss_clip": 1.06045818, "balance_loss_mlp": 1.03642035, "epoch": 0.12914086454638368, "flos": 24170610850560.0, "grad_norm": 1.6126434237526102, "language_loss": 0.83289838, "learning_rate": 3.897880934963007e-06, "loss": 0.85558832, "num_input_tokens_seen": 22750970, "step": 1074, "time_per_iteration": 2.523643732070923 }, { "auxiliary_loss_clip": 0.01203398, "auxiliary_loss_mlp": 0.01038013, "balance_loss_clip": 1.05829477, "balance_loss_mlp": 1.02604461, "epoch": 0.1292611074370228, "flos": 20267157081600.0, "grad_norm": 2.45808645228896, "language_loss": 0.7873584, "learning_rate": 3.89763506035154e-06, "loss": 0.80977255, "num_input_tokens_seen": 22768820, "step": 1075, "time_per_iteration": 2.5363309383392334 }, { "auxiliary_loss_clip": 0.01210482, "auxiliary_loss_mlp": 0.01035575, "balance_loss_clip": 1.06172299, "balance_loss_mlp": 1.02454185, "epoch": 0.12938135032766188, "flos": 27377684668800.0, "grad_norm": 3.0214997472114447, "language_loss": 0.80906409, "learning_rate": 3.897388897870343e-06, "loss": 0.83152467, "num_input_tokens_seen": 22789460, "step": 1076, "time_per_iteration": 2.5580832958221436 }, { "auxiliary_loss_clip": 0.01220461, "auxiliary_loss_mlp": 0.01036571, "balance_loss_clip": 1.06149411, "balance_loss_mlp": 1.02417898, "epoch": 0.12950159321830096, "flos": 29277107861760.0, "grad_norm": 1.9033456538678497, "language_loss": 0.74950051, "learning_rate": 3.89714244755676e-06, "loss": 0.77207083, "num_input_tokens_seen": 22810820, "step": 1077, "time_per_iteration": 2.6054811477661133 }, { "auxiliary_loss_clip": 0.01163658, "auxiliary_loss_mlp": 0.0104029, "balance_loss_clip": 1.05325067, "balance_loss_mlp": 1.02897716, "epoch": 0.12962183610894007, "flos": 24534888629760.0, "grad_norm": 2.5521679309733747, "language_loss": 0.86465424, "learning_rate": 3.896895709448175e-06, "loss": 0.88669372, "num_input_tokens_seen": 22830570, "step": 1078, "time_per_iteration": 2.6051576137542725 }, { "auxiliary_loss_clip": 0.01154197, "auxiliary_loss_mlp": 0.01041021, "balance_loss_clip": 1.05152571, "balance_loss_mlp": 1.02999401, "epoch": 0.12974207899957915, "flos": 11215944552960.0, "grad_norm": 2.658614775245721, "language_loss": 0.76757759, "learning_rate": 3.896648683582019e-06, "loss": 0.78952974, "num_input_tokens_seen": 22845905, "step": 1079, "time_per_iteration": 2.603343963623047 }, { "auxiliary_loss_clip": 0.01175617, "auxiliary_loss_mlp": 0.01036716, "balance_loss_clip": 1.06057978, "balance_loss_mlp": 1.02589798, "epoch": 0.12986232189021824, "flos": 24717889445760.0, "grad_norm": 2.1070772137520026, "language_loss": 0.80655158, "learning_rate": 3.896401369995766e-06, "loss": 0.82867491, "num_input_tokens_seen": 22865710, "step": 1080, "time_per_iteration": 2.643333673477173 }, { "auxiliary_loss_clip": 0.01241174, "auxiliary_loss_mlp": 0.0104661, "balance_loss_clip": 1.06868219, "balance_loss_mlp": 1.03552365, "epoch": 0.12998256478085732, "flos": 23915357827200.0, "grad_norm": 1.8554135263392066, "language_loss": 0.79311562, "learning_rate": 3.896153768726932e-06, "loss": 0.81599343, "num_input_tokens_seen": 22886020, "step": 1081, "time_per_iteration": 2.5395965576171875 }, { "auxiliary_loss_clip": 0.0122403, "auxiliary_loss_mlp": 0.01040681, "balance_loss_clip": 1.06594539, "balance_loss_mlp": 1.02932608, "epoch": 0.13010280767149643, "flos": 18624207974400.0, "grad_norm": 1.9891574188386492, "language_loss": 0.88152111, "learning_rate": 3.8959058798130806e-06, "loss": 0.90416819, "num_input_tokens_seen": 22903995, "step": 1082, "time_per_iteration": 2.474423408508301 }, { "auxiliary_loss_clip": 0.01211475, "auxiliary_loss_mlp": 0.00766485, "balance_loss_clip": 1.06288767, "balance_loss_mlp": 1.00034475, "epoch": 0.1302230505621355, "flos": 22783992174720.0, "grad_norm": 1.93527029883986, "language_loss": 0.74766552, "learning_rate": 3.895657703291814e-06, "loss": 0.76744515, "num_input_tokens_seen": 22924100, "step": 1083, "time_per_iteration": 2.55025053024292 }, { "auxiliary_loss_clip": 0.01216616, "auxiliary_loss_mlp": 0.01035848, "balance_loss_clip": 1.06058097, "balance_loss_mlp": 1.02440393, "epoch": 0.1303432934527746, "flos": 21323612920320.0, "grad_norm": 2.8579568162587927, "language_loss": 0.7955972, "learning_rate": 3.895409239200781e-06, "loss": 0.81812179, "num_input_tokens_seen": 22939985, "step": 1084, "time_per_iteration": 2.5215818881988525 }, { "auxiliary_loss_clip": 0.0121806, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.0621314, "balance_loss_mlp": 1.02850354, "epoch": 0.1304635363434137, "flos": 20922490765440.0, "grad_norm": 2.390041997093012, "language_loss": 0.91477942, "learning_rate": 3.895160487577673e-06, "loss": 0.93737376, "num_input_tokens_seen": 22957555, "step": 1085, "time_per_iteration": 2.496581554412842 }, { "auxiliary_loss_clip": 0.01114649, "auxiliary_loss_mlp": 0.01003939, "balance_loss_clip": 1.02569234, "balance_loss_mlp": 1.00112534, "epoch": 0.1305837792340528, "flos": 63245659080960.0, "grad_norm": 0.7858939093723247, "language_loss": 0.60890037, "learning_rate": 3.894911448460226e-06, "loss": 0.63008624, "num_input_tokens_seen": 23016870, "step": 1086, "time_per_iteration": 2.9670746326446533 }, { "auxiliary_loss_clip": 0.01126529, "auxiliary_loss_mlp": 0.01046833, "balance_loss_clip": 1.05105567, "balance_loss_mlp": 1.03490007, "epoch": 0.13070402212469187, "flos": 26428852955520.0, "grad_norm": 1.9298341026671, "language_loss": 0.72878098, "learning_rate": 3.8946621218862195e-06, "loss": 0.75051457, "num_input_tokens_seen": 23037870, "step": 1087, "time_per_iteration": 3.5803439617156982 }, { "auxiliary_loss_clip": 0.01189059, "auxiliary_loss_mlp": 0.01043075, "balance_loss_clip": 1.06005859, "balance_loss_mlp": 1.03217936, "epoch": 0.13082426501533098, "flos": 27673409341440.0, "grad_norm": 3.9347373992496233, "language_loss": 0.89026272, "learning_rate": 3.894412507893475e-06, "loss": 0.91258407, "num_input_tokens_seen": 23058150, "step": 1088, "time_per_iteration": 2.6474409103393555 }, { "auxiliary_loss_clip": 0.01184951, "auxiliary_loss_mlp": 0.0104793, "balance_loss_clip": 1.05821359, "balance_loss_mlp": 1.03605092, "epoch": 0.13094450790597006, "flos": 24826770547200.0, "grad_norm": 2.040015778418802, "language_loss": 0.71958792, "learning_rate": 3.894162606519859e-06, "loss": 0.74191678, "num_input_tokens_seen": 23077100, "step": 1089, "time_per_iteration": 2.671130418777466 }, { "auxiliary_loss_clip": 0.01176615, "auxiliary_loss_mlp": 0.01041977, "balance_loss_clip": 1.05913639, "balance_loss_mlp": 1.03121877, "epoch": 0.13106475079660915, "flos": 19062605468160.0, "grad_norm": 1.9492241161256092, "language_loss": 0.76893365, "learning_rate": 3.893912417803282e-06, "loss": 0.79111958, "num_input_tokens_seen": 23096815, "step": 1090, "time_per_iteration": 2.6335387229919434 }, { "auxiliary_loss_clip": 0.01178566, "auxiliary_loss_mlp": 0.01042553, "balance_loss_clip": 1.05422139, "balance_loss_mlp": 1.03019667, "epoch": 0.13118499368724823, "flos": 28913189218560.0, "grad_norm": 2.079338919257438, "language_loss": 0.77189744, "learning_rate": 3.8936619417816975e-06, "loss": 0.79410869, "num_input_tokens_seen": 23117145, "step": 1091, "time_per_iteration": 2.689709424972534 }, { "auxiliary_loss_clip": 0.01192748, "auxiliary_loss_mlp": 0.01033427, "balance_loss_clip": 1.06201732, "balance_loss_mlp": 1.02257299, "epoch": 0.13130523657788734, "flos": 14283398206080.0, "grad_norm": 1.7588624226971892, "language_loss": 0.71889734, "learning_rate": 3.8934111784931015e-06, "loss": 0.74115902, "num_input_tokens_seen": 23134595, "step": 1092, "time_per_iteration": 2.6658742427825928 }, { "auxiliary_loss_clip": 0.01104425, "auxiliary_loss_mlp": 0.01002692, "balance_loss_clip": 1.0229907, "balance_loss_mlp": 0.99985522, "epoch": 0.13142547946852642, "flos": 70174155519360.0, "grad_norm": 0.9243659688392305, "language_loss": 0.59138483, "learning_rate": 3.893160127975535e-06, "loss": 0.61245602, "num_input_tokens_seen": 23195285, "step": 1093, "time_per_iteration": 3.997832775115967 }, { "auxiliary_loss_clip": 0.01180875, "auxiliary_loss_mlp": 0.01037954, "balance_loss_clip": 1.05656636, "balance_loss_mlp": 1.02686143, "epoch": 0.1315457223591655, "flos": 45805998844800.0, "grad_norm": 2.248025414806439, "language_loss": 0.81055689, "learning_rate": 3.8929087902670826e-06, "loss": 0.83274513, "num_input_tokens_seen": 23216915, "step": 1094, "time_per_iteration": 4.473284959793091 }, { "auxiliary_loss_clip": 0.01115156, "auxiliary_loss_mlp": 0.01002481, "balance_loss_clip": 1.02234173, "balance_loss_mlp": 0.9998461, "epoch": 0.13166596524980462, "flos": 62881165820160.0, "grad_norm": 0.9316042087994315, "language_loss": 0.60643041, "learning_rate": 3.8926571654058715e-06, "loss": 0.62760675, "num_input_tokens_seen": 23273560, "step": 1095, "time_per_iteration": 3.001385450363159 }, { "auxiliary_loss_clip": 0.01189067, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.06040812, "balance_loss_mlp": 1.0253458, "epoch": 0.1317862081404437, "flos": 23586523793280.0, "grad_norm": 2.3690148783628695, "language_loss": 0.77153385, "learning_rate": 3.892405253430074e-06, "loss": 0.79378986, "num_input_tokens_seen": 23291080, "step": 1096, "time_per_iteration": 2.585012435913086 }, { "auxiliary_loss_clip": 0.01211897, "auxiliary_loss_mlp": 0.00766704, "balance_loss_clip": 1.06427073, "balance_loss_mlp": 1.00037026, "epoch": 0.13190645103108278, "flos": 20260764460800.0, "grad_norm": 2.6380488360395065, "language_loss": 0.82375747, "learning_rate": 3.892153054377904e-06, "loss": 0.84354347, "num_input_tokens_seen": 23308485, "step": 1097, "time_per_iteration": 2.505908250808716 }, { "auxiliary_loss_clip": 0.01053572, "auxiliary_loss_mlp": 0.01006238, "balance_loss_clip": 1.02045012, "balance_loss_mlp": 1.00341308, "epoch": 0.13202669392172187, "flos": 53455440136320.0, "grad_norm": 0.9438268029637948, "language_loss": 0.59427595, "learning_rate": 3.891900568287619e-06, "loss": 0.61487406, "num_input_tokens_seen": 23360870, "step": 1098, "time_per_iteration": 3.0244534015655518 }, { "auxiliary_loss_clip": 0.01196647, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.06021881, "balance_loss_mlp": 1.02593517, "epoch": 0.13214693681236098, "flos": 15851293845120.0, "grad_norm": 2.7464925446688007, "language_loss": 0.72030115, "learning_rate": 3.891647795197523e-06, "loss": 0.74264568, "num_input_tokens_seen": 23376910, "step": 1099, "time_per_iteration": 2.5363311767578125 }, { "auxiliary_loss_clip": 0.01199107, "auxiliary_loss_mlp": 0.01046583, "balance_loss_clip": 1.05809522, "balance_loss_mlp": 1.03436375, "epoch": 0.13226717970300006, "flos": 19353840940800.0, "grad_norm": 1.9702353367692258, "language_loss": 0.68540627, "learning_rate": 3.8913947351459605e-06, "loss": 0.70786315, "num_input_tokens_seen": 23394450, "step": 1100, "time_per_iteration": 2.570089817047119 }, { "auxiliary_loss_clip": 0.01242258, "auxiliary_loss_mlp": 0.01038752, "balance_loss_clip": 1.06853461, "balance_loss_mlp": 1.02806497, "epoch": 0.13238742259363914, "flos": 20698084546560.0, "grad_norm": 1.992411835738522, "language_loss": 0.67721313, "learning_rate": 3.89114138817132e-06, "loss": 0.70002329, "num_input_tokens_seen": 23411115, "step": 1101, "time_per_iteration": 2.4617741107940674 }, { "auxiliary_loss_clip": 0.0122472, "auxiliary_loss_mlp": 0.01033473, "balance_loss_clip": 1.06689572, "balance_loss_mlp": 1.02222586, "epoch": 0.13250766548427825, "flos": 21032449274880.0, "grad_norm": 1.7244034211692043, "language_loss": 0.84244347, "learning_rate": 3.890887754312035e-06, "loss": 0.8650254, "num_input_tokens_seen": 23429360, "step": 1102, "time_per_iteration": 2.5234038829803467 }, { "auxiliary_loss_clip": 0.01198335, "auxiliary_loss_mlp": 0.01047433, "balance_loss_clip": 1.05613685, "balance_loss_mlp": 1.03599524, "epoch": 0.13262790837491734, "flos": 22637871648000.0, "grad_norm": 1.8425585182217774, "language_loss": 0.87634182, "learning_rate": 3.890633833606581e-06, "loss": 0.89879954, "num_input_tokens_seen": 23449050, "step": 1103, "time_per_iteration": 2.519550323486328 }, { "auxiliary_loss_clip": 0.01223589, "auxiliary_loss_mlp": 0.01033289, "balance_loss_clip": 1.06723905, "balance_loss_mlp": 1.0228045, "epoch": 0.13274815126555642, "flos": 19683141851520.0, "grad_norm": 1.8864708105263763, "language_loss": 0.69656742, "learning_rate": 3.890379626093477e-06, "loss": 0.71913624, "num_input_tokens_seen": 23468800, "step": 1104, "time_per_iteration": 2.5130162239074707 }, { "auxiliary_loss_clip": 0.01164821, "auxiliary_loss_mlp": 0.01037858, "balance_loss_clip": 1.0563271, "balance_loss_mlp": 1.02599669, "epoch": 0.1328683941561955, "flos": 21317687176320.0, "grad_norm": 3.0201325999247755, "language_loss": 0.92674553, "learning_rate": 3.890125131811287e-06, "loss": 0.94877231, "num_input_tokens_seen": 23486850, "step": 1105, "time_per_iteration": 2.5744683742523193 }, { "auxiliary_loss_clip": 0.01193684, "auxiliary_loss_mlp": 0.01037857, "balance_loss_clip": 1.055655, "balance_loss_mlp": 1.02708602, "epoch": 0.1329886370468346, "flos": 13699131580800.0, "grad_norm": 2.0178759255710434, "language_loss": 0.75340331, "learning_rate": 3.889870350798618e-06, "loss": 0.77571869, "num_input_tokens_seen": 23504195, "step": 1106, "time_per_iteration": 2.5424411296844482 }, { "auxiliary_loss_clip": 0.01241392, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.06606436, "balance_loss_mlp": 1.02779841, "epoch": 0.1331088799374737, "flos": 21032413361280.0, "grad_norm": 1.6954560955448363, "language_loss": 0.78544891, "learning_rate": 3.889615283094119e-06, "loss": 0.80824864, "num_input_tokens_seen": 23523385, "step": 1107, "time_per_iteration": 2.4959022998809814 }, { "auxiliary_loss_clip": 0.01244503, "auxiliary_loss_mlp": 0.01041223, "balance_loss_clip": 1.06660509, "balance_loss_mlp": 1.02921891, "epoch": 0.13322912282811278, "flos": 18260432985600.0, "grad_norm": 2.1217730320413266, "language_loss": 0.84691358, "learning_rate": 3.889359928736485e-06, "loss": 0.86977088, "num_input_tokens_seen": 23541330, "step": 1108, "time_per_iteration": 2.5023608207702637 }, { "auxiliary_loss_clip": 0.01201133, "auxiliary_loss_mlp": 0.00766386, "balance_loss_clip": 1.06272173, "balance_loss_mlp": 1.00047123, "epoch": 0.1333493657187519, "flos": 24460876656000.0, "grad_norm": 2.030399835573967, "language_loss": 0.90990919, "learning_rate": 3.889104287764451e-06, "loss": 0.92958438, "num_input_tokens_seen": 23561705, "step": 1109, "time_per_iteration": 2.5452005863189697 }, { "auxiliary_loss_clip": 0.01208617, "auxiliary_loss_mlp": 0.01042573, "balance_loss_clip": 1.0652591, "balance_loss_mlp": 1.03161192, "epoch": 0.13346960860939097, "flos": 22158930677760.0, "grad_norm": 2.0471571648656246, "language_loss": 0.90363342, "learning_rate": 3.888848360216798e-06, "loss": 0.92614532, "num_input_tokens_seen": 23579350, "step": 1110, "time_per_iteration": 2.536431312561035 }, { "auxiliary_loss_clip": 0.01103863, "auxiliary_loss_mlp": 0.01004289, "balance_loss_clip": 1.02094758, "balance_loss_mlp": 1.00163019, "epoch": 0.13358985150003005, "flos": 67931212608000.0, "grad_norm": 0.7998990758872707, "language_loss": 0.5653863, "learning_rate": 3.888592146132351e-06, "loss": 0.58646786, "num_input_tokens_seen": 23640620, "step": 1111, "time_per_iteration": 3.199981689453125 }, { "auxiliary_loss_clip": 0.01224742, "auxiliary_loss_mlp": 0.01043393, "balance_loss_clip": 1.06680393, "balance_loss_mlp": 1.03215134, "epoch": 0.13371009439066917, "flos": 26834284742400.0, "grad_norm": 2.09199863814541, "language_loss": 0.78491414, "learning_rate": 3.888335645549978e-06, "loss": 0.80759543, "num_input_tokens_seen": 23661040, "step": 1112, "time_per_iteration": 2.5374910831451416 }, { "auxiliary_loss_clip": 0.01242404, "auxiliary_loss_mlp": 0.0104559, "balance_loss_clip": 1.07057321, "balance_loss_mlp": 1.03442645, "epoch": 0.13383033728130825, "flos": 26322844942080.0, "grad_norm": 2.8271973731830724, "language_loss": 0.81292695, "learning_rate": 3.888078858508588e-06, "loss": 0.83580691, "num_input_tokens_seen": 23680900, "step": 1113, "time_per_iteration": 2.512892246246338 }, { "auxiliary_loss_clip": 0.01209126, "auxiliary_loss_mlp": 0.01037102, "balance_loss_clip": 1.06670332, "balance_loss_mlp": 1.02578866, "epoch": 0.13395058017194733, "flos": 22563931501440.0, "grad_norm": 1.9607594972561997, "language_loss": 0.84491694, "learning_rate": 3.8878217850471365e-06, "loss": 0.86737925, "num_input_tokens_seen": 23700815, "step": 1114, "time_per_iteration": 3.590632200241089 }, { "auxiliary_loss_clip": 0.01244709, "auxiliary_loss_mlp": 0.0104511, "balance_loss_clip": 1.07039833, "balance_loss_mlp": 1.03270042, "epoch": 0.13407082306258641, "flos": 25810938264960.0, "grad_norm": 1.7815688823615061, "language_loss": 0.74153411, "learning_rate": 3.887564425204621e-06, "loss": 0.76443231, "num_input_tokens_seen": 23722500, "step": 1115, "time_per_iteration": 2.5157196521759033 }, { "auxiliary_loss_clip": 0.01082572, "auxiliary_loss_mlp": 0.01002906, "balance_loss_clip": 1.02276957, "balance_loss_mlp": 1.00024784, "epoch": 0.13419106595322552, "flos": 68338365269760.0, "grad_norm": 0.8454408615764806, "language_loss": 0.54641867, "learning_rate": 3.887306779020083e-06, "loss": 0.56727344, "num_input_tokens_seen": 23777155, "step": 1116, "time_per_iteration": 3.055781602859497 }, { "auxiliary_loss_clip": 0.01229169, "auxiliary_loss_mlp": 0.01043548, "balance_loss_clip": 1.06806564, "balance_loss_mlp": 1.03185344, "epoch": 0.1343113088438646, "flos": 20449080489600.0, "grad_norm": 2.609570743827039, "language_loss": 0.7029599, "learning_rate": 3.887048846532608e-06, "loss": 0.72568709, "num_input_tokens_seen": 23794130, "step": 1117, "time_per_iteration": 2.49936842918396 }, { "auxiliary_loss_clip": 0.0108691, "auxiliary_loss_mlp": 0.01003492, "balance_loss_clip": 1.01915431, "balance_loss_mlp": 1.00084519, "epoch": 0.1344315517345037, "flos": 67389784951680.0, "grad_norm": 0.7589417508545027, "language_loss": 0.58111024, "learning_rate": 3.8867906277813224e-06, "loss": 0.6020143, "num_input_tokens_seen": 23852285, "step": 1118, "time_per_iteration": 3.0013554096221924 }, { "auxiliary_loss_clip": 0.01226343, "auxiliary_loss_mlp": 0.00766284, "balance_loss_clip": 1.06450415, "balance_loss_mlp": 1.00029206, "epoch": 0.1345517946251428, "flos": 40734442788480.0, "grad_norm": 3.41867014912935, "language_loss": 0.74377131, "learning_rate": 3.886532122805399e-06, "loss": 0.76369756, "num_input_tokens_seen": 23874765, "step": 1119, "time_per_iteration": 3.567920207977295 }, { "auxiliary_loss_clip": 0.0114868, "auxiliary_loss_mlp": 0.01045096, "balance_loss_clip": 1.05268717, "balance_loss_mlp": 1.03314567, "epoch": 0.13467203751578188, "flos": 22816850140800.0, "grad_norm": 1.6925202653526292, "language_loss": 0.89782441, "learning_rate": 3.886273331644053e-06, "loss": 0.91976219, "num_input_tokens_seen": 23893635, "step": 1120, "time_per_iteration": 3.566516637802124 }, { "auxiliary_loss_clip": 0.01174579, "auxiliary_loss_mlp": 0.01035851, "balance_loss_clip": 1.06019986, "balance_loss_mlp": 1.02468705, "epoch": 0.13479228040642097, "flos": 17091576512640.0, "grad_norm": 2.018650031964999, "language_loss": 0.82421106, "learning_rate": 3.886014254336542e-06, "loss": 0.84631538, "num_input_tokens_seen": 23910110, "step": 1121, "time_per_iteration": 2.569540500640869 }, { "auxiliary_loss_clip": 0.0122203, "auxiliary_loss_mlp": 0.01035701, "balance_loss_clip": 1.06409669, "balance_loss_mlp": 1.02470994, "epoch": 0.13491252329706005, "flos": 23730525417600.0, "grad_norm": 1.810207558557642, "language_loss": 0.92445028, "learning_rate": 3.885754890922168e-06, "loss": 0.94702762, "num_input_tokens_seen": 23930440, "step": 1122, "time_per_iteration": 2.5304903984069824 }, { "auxiliary_loss_clip": 0.01131994, "auxiliary_loss_mlp": 0.01047555, "balance_loss_clip": 1.05270624, "balance_loss_mlp": 1.03595567, "epoch": 0.13503276618769916, "flos": 34127058960000.0, "grad_norm": 1.9392186077251008, "language_loss": 0.78558898, "learning_rate": 3.885495241440277e-06, "loss": 0.80738449, "num_input_tokens_seen": 23954535, "step": 1123, "time_per_iteration": 2.7606773376464844 }, { "auxiliary_loss_clip": 0.01242893, "auxiliary_loss_mlp": 0.01043186, "balance_loss_clip": 1.06803179, "balance_loss_mlp": 1.03191471, "epoch": 0.13515300907833824, "flos": 17712328377600.0, "grad_norm": 1.8355905977515146, "language_loss": 0.74198043, "learning_rate": 3.885235305930257e-06, "loss": 0.7648412, "num_input_tokens_seen": 23972735, "step": 1124, "time_per_iteration": 2.4871087074279785 }, { "auxiliary_loss_clip": 0.01190885, "auxiliary_loss_mlp": 0.01048269, "balance_loss_clip": 1.06436086, "balance_loss_mlp": 1.0352211, "epoch": 0.13527325196897733, "flos": 20260872201600.0, "grad_norm": 1.847931973002981, "language_loss": 0.85501164, "learning_rate": 3.884975084431539e-06, "loss": 0.87740314, "num_input_tokens_seen": 23987685, "step": 1125, "time_per_iteration": 2.5507395267486572 }, { "auxiliary_loss_clip": 0.01215421, "auxiliary_loss_mlp": 0.00766433, "balance_loss_clip": 1.06335557, "balance_loss_mlp": 1.00042605, "epoch": 0.13539349485961644, "flos": 18186492839040.0, "grad_norm": 2.2416136256575228, "language_loss": 0.91655236, "learning_rate": 3.8847145769836e-06, "loss": 0.93637091, "num_input_tokens_seen": 24004105, "step": 1126, "time_per_iteration": 2.515225648880005 }, { "auxiliary_loss_clip": 0.01242819, "auxiliary_loss_mlp": 0.01041202, "balance_loss_clip": 1.06684661, "balance_loss_mlp": 1.02956116, "epoch": 0.13551373775025552, "flos": 19317463441920.0, "grad_norm": 2.386962710680505, "language_loss": 0.66415197, "learning_rate": 3.884453783625959e-06, "loss": 0.68699217, "num_input_tokens_seen": 24021715, "step": 1127, "time_per_iteration": 2.482673168182373 }, { "auxiliary_loss_clip": 0.01203213, "auxiliary_loss_mlp": 0.01035733, "balance_loss_clip": 1.06316209, "balance_loss_mlp": 1.02518249, "epoch": 0.1356339806408946, "flos": 20850813175680.0, "grad_norm": 2.1108926415072697, "language_loss": 0.84781194, "learning_rate": 3.884192704398176e-06, "loss": 0.87020141, "num_input_tokens_seen": 24038915, "step": 1128, "time_per_iteration": 2.531822443008423 }, { "auxiliary_loss_clip": 0.012242, "auxiliary_loss_mlp": 0.01051782, "balance_loss_clip": 1.06316972, "balance_loss_mlp": 1.04063559, "epoch": 0.13575422353153369, "flos": 50476037696640.0, "grad_norm": 1.6739670975124537, "language_loss": 0.74570715, "learning_rate": 3.883931339339858e-06, "loss": 0.76846701, "num_input_tokens_seen": 24063300, "step": 1129, "time_per_iteration": 2.761939287185669 }, { "auxiliary_loss_clip": 0.01227772, "auxiliary_loss_mlp": 0.01039168, "balance_loss_clip": 1.06450844, "balance_loss_mlp": 1.02742577, "epoch": 0.1358744664221728, "flos": 18150797698560.0, "grad_norm": 2.4110153977600044, "language_loss": 0.78667426, "learning_rate": 3.883669688490654e-06, "loss": 0.8093437, "num_input_tokens_seen": 24081070, "step": 1130, "time_per_iteration": 2.5068349838256836 }, { "auxiliary_loss_clip": 0.01195893, "auxiliary_loss_mlp": 0.00766058, "balance_loss_clip": 1.05866027, "balance_loss_mlp": 1.00041699, "epoch": 0.13599470931281188, "flos": 18442966924800.0, "grad_norm": 1.9424638081951011, "language_loss": 0.85507464, "learning_rate": 3.883407751890256e-06, "loss": 0.87469411, "num_input_tokens_seen": 24099675, "step": 1131, "time_per_iteration": 2.5076582431793213 }, { "auxiliary_loss_clip": 0.01191876, "auxiliary_loss_mlp": 0.01048915, "balance_loss_clip": 1.05910599, "balance_loss_mlp": 1.03619492, "epoch": 0.13611495220345096, "flos": 26680766014080.0, "grad_norm": 1.7181343736842447, "language_loss": 0.85642576, "learning_rate": 3.8831455295783994e-06, "loss": 0.87883371, "num_input_tokens_seen": 24118925, "step": 1132, "time_per_iteration": 2.627047061920166 }, { "auxiliary_loss_clip": 0.01203861, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.06280267, "balance_loss_mlp": 1.02939045, "epoch": 0.13623519509409007, "flos": 21686238673920.0, "grad_norm": 1.643416965574082, "language_loss": 0.73959291, "learning_rate": 3.882883021594864e-06, "loss": 0.76204079, "num_input_tokens_seen": 24137065, "step": 1133, "time_per_iteration": 2.579265594482422 }, { "auxiliary_loss_clip": 0.01181733, "auxiliary_loss_mlp": 0.01036871, "balance_loss_clip": 1.06068063, "balance_loss_mlp": 1.02583218, "epoch": 0.13635543798472916, "flos": 14830389492480.0, "grad_norm": 2.4077745151837826, "language_loss": 0.86901402, "learning_rate": 3.8826202279794705e-06, "loss": 0.89120007, "num_input_tokens_seen": 24154125, "step": 1134, "time_per_iteration": 2.5350449085235596 }, { "auxiliary_loss_clip": 0.0124275, "auxiliary_loss_mlp": 0.01039736, "balance_loss_clip": 1.06911612, "balance_loss_mlp": 1.0288341, "epoch": 0.13647568087536824, "flos": 22890323410560.0, "grad_norm": 1.9634094323757916, "language_loss": 0.70318204, "learning_rate": 3.882357148772085e-06, "loss": 0.72600693, "num_input_tokens_seen": 24171550, "step": 1135, "time_per_iteration": 2.4762027263641357 }, { "auxiliary_loss_clip": 0.01175443, "auxiliary_loss_mlp": 0.01045917, "balance_loss_clip": 1.05802798, "balance_loss_mlp": 1.03416872, "epoch": 0.13659592376600732, "flos": 19937927998080.0, "grad_norm": 2.7922594386765605, "language_loss": 0.84492689, "learning_rate": 3.882093784012617e-06, "loss": 0.86714047, "num_input_tokens_seen": 24190190, "step": 1136, "time_per_iteration": 2.534083127975464 }, { "auxiliary_loss_clip": 0.01205692, "auxiliary_loss_mlp": 0.0103658, "balance_loss_clip": 1.06324697, "balance_loss_mlp": 1.02511835, "epoch": 0.13671616665664643, "flos": 21428579439360.0, "grad_norm": 1.9171071513542501, "language_loss": 0.84301311, "learning_rate": 3.881830133741019e-06, "loss": 0.86543584, "num_input_tokens_seen": 24209055, "step": 1137, "time_per_iteration": 2.5401196479797363 }, { "auxiliary_loss_clip": 0.01190995, "auxiliary_loss_mlp": 0.010478, "balance_loss_clip": 1.06460023, "balance_loss_mlp": 1.03609347, "epoch": 0.13683640954728551, "flos": 22778138257920.0, "grad_norm": 1.9410813736814505, "language_loss": 0.76070166, "learning_rate": 3.881566197997285e-06, "loss": 0.78308958, "num_input_tokens_seen": 24225490, "step": 1138, "time_per_iteration": 2.585272789001465 }, { "auxiliary_loss_clip": 0.01204776, "auxiliary_loss_mlp": 0.01039517, "balance_loss_clip": 1.06676984, "balance_loss_mlp": 1.02851439, "epoch": 0.1369566524379246, "flos": 21725884310400.0, "grad_norm": 1.5339118104084037, "language_loss": 0.75060034, "learning_rate": 3.881301976821456e-06, "loss": 0.77304322, "num_input_tokens_seen": 24245520, "step": 1139, "time_per_iteration": 2.5421037673950195 }, { "auxiliary_loss_clip": 0.0122006, "auxiliary_loss_mlp": 0.0104191, "balance_loss_clip": 1.06502652, "balance_loss_mlp": 1.03129435, "epoch": 0.1370768953285637, "flos": 18624459369600.0, "grad_norm": 1.8970164879730396, "language_loss": 0.90650415, "learning_rate": 3.881037470253612e-06, "loss": 0.92912388, "num_input_tokens_seen": 24265035, "step": 1140, "time_per_iteration": 3.3628947734832764 }, { "auxiliary_loss_clip": 0.01175835, "auxiliary_loss_mlp": 0.01040442, "balance_loss_clip": 1.06009603, "balance_loss_mlp": 1.02960598, "epoch": 0.1371971382192028, "flos": 14939521989120.0, "grad_norm": 2.6303102786651906, "language_loss": 0.79046839, "learning_rate": 3.88077267833388e-06, "loss": 0.81263113, "num_input_tokens_seen": 24281550, "step": 1141, "time_per_iteration": 2.544905424118042 }, { "auxiliary_loss_clip": 0.0117028, "auxiliary_loss_mlp": 0.01044457, "balance_loss_clip": 1.05718565, "balance_loss_mlp": 1.0331862, "epoch": 0.13731738110984187, "flos": 19023785844480.0, "grad_norm": 2.096575567553329, "language_loss": 0.83701593, "learning_rate": 3.880507601102427e-06, "loss": 0.85916328, "num_input_tokens_seen": 24299485, "step": 1142, "time_per_iteration": 2.6105563640594482 }, { "auxiliary_loss_clip": 0.01239174, "auxiliary_loss_mlp": 0.01047938, "balance_loss_clip": 1.06891823, "balance_loss_mlp": 1.03702462, "epoch": 0.13743762400048098, "flos": 18187462506240.0, "grad_norm": 2.8469880353884602, "language_loss": 0.82199681, "learning_rate": 3.880242238599467e-06, "loss": 0.84486794, "num_input_tokens_seen": 24316010, "step": 1143, "time_per_iteration": 2.445014715194702 }, { "auxiliary_loss_clip": 0.01234912, "auxiliary_loss_mlp": 0.01047229, "balance_loss_clip": 1.06560016, "balance_loss_mlp": 1.03576708, "epoch": 0.13755786689112007, "flos": 21031982398080.0, "grad_norm": 1.8033794513004204, "language_loss": 0.83189905, "learning_rate": 3.879976590865254e-06, "loss": 0.85472047, "num_input_tokens_seen": 24335465, "step": 1144, "time_per_iteration": 2.5212836265563965 }, { "auxiliary_loss_clip": 0.01207388, "auxiliary_loss_mlp": 0.01045335, "balance_loss_clip": 1.06589866, "balance_loss_mlp": 1.03406358, "epoch": 0.13767810978175915, "flos": 21360636864000.0, "grad_norm": 1.8640830388365308, "language_loss": 0.87317169, "learning_rate": 3.879710657940087e-06, "loss": 0.8956989, "num_input_tokens_seen": 24354415, "step": 1145, "time_per_iteration": 2.547109842300415 }, { "auxiliary_loss_clip": 0.01226205, "auxiliary_loss_mlp": 0.01054492, "balance_loss_clip": 1.06520343, "balance_loss_mlp": 1.04227889, "epoch": 0.13779835267239823, "flos": 30592084861440.0, "grad_norm": 2.015525290913156, "language_loss": 0.70455736, "learning_rate": 3.879444439864308e-06, "loss": 0.7273643, "num_input_tokens_seen": 24373990, "step": 1146, "time_per_iteration": 3.3156747817993164 }, { "auxiliary_loss_clip": 0.01221469, "auxiliary_loss_mlp": 0.00766313, "balance_loss_clip": 1.06280899, "balance_loss_mlp": 1.00066495, "epoch": 0.13791859556303734, "flos": 22669867687680.0, "grad_norm": 1.9964562402872512, "language_loss": 0.86180943, "learning_rate": 3.879177936678301e-06, "loss": 0.88168716, "num_input_tokens_seen": 24392995, "step": 1147, "time_per_iteration": 4.178260564804077 }, { "auxiliary_loss_clip": 0.01211583, "auxiliary_loss_mlp": 0.01043046, "balance_loss_clip": 1.06459701, "balance_loss_mlp": 1.03132725, "epoch": 0.13803883845367643, "flos": 35224166016000.0, "grad_norm": 2.2988233421089848, "language_loss": 0.76987356, "learning_rate": 3.878911148422496e-06, "loss": 0.79241979, "num_input_tokens_seen": 24414470, "step": 1148, "time_per_iteration": 2.643392324447632 }, { "auxiliary_loss_clip": 0.01222322, "auxiliary_loss_mlp": 0.01041263, "balance_loss_clip": 1.06442523, "balance_loss_mlp": 1.02956891, "epoch": 0.1381590813443155, "flos": 32014542332160.0, "grad_norm": 2.189990400431333, "language_loss": 0.70677912, "learning_rate": 3.878644075137364e-06, "loss": 0.72941506, "num_input_tokens_seen": 24435120, "step": 1149, "time_per_iteration": 2.5886855125427246 }, { "auxiliary_loss_clip": 0.0116837, "auxiliary_loss_mlp": 0.01037692, "balance_loss_clip": 1.0549655, "balance_loss_mlp": 1.02666509, "epoch": 0.13827932423495462, "flos": 17821855923840.0, "grad_norm": 2.1550576330570514, "language_loss": 0.79334706, "learning_rate": 3.878376716863418e-06, "loss": 0.81540769, "num_input_tokens_seen": 24451420, "step": 1150, "time_per_iteration": 2.51723051071167 }, { "auxiliary_loss_clip": 0.01202777, "auxiliary_loss_mlp": 0.01045085, "balance_loss_clip": 1.05982995, "balance_loss_mlp": 1.03306866, "epoch": 0.1383995671255937, "flos": 19427098728960.0, "grad_norm": 5.871303816597254, "language_loss": 0.71790552, "learning_rate": 3.878109073641219e-06, "loss": 0.74038422, "num_input_tokens_seen": 24470450, "step": 1151, "time_per_iteration": 2.5488815307617188 }, { "auxiliary_loss_clip": 0.01172043, "auxiliary_loss_mlp": 0.010417, "balance_loss_clip": 1.05994558, "balance_loss_mlp": 1.03064334, "epoch": 0.13851981001623279, "flos": 28296603331200.0, "grad_norm": 1.5508358547550867, "language_loss": 0.81265759, "learning_rate": 3.877841145511366e-06, "loss": 0.834795, "num_input_tokens_seen": 24493190, "step": 1152, "time_per_iteration": 2.6709396839141846 }, { "auxiliary_loss_clip": 0.01226703, "auxiliary_loss_mlp": 0.01041941, "balance_loss_clip": 1.06567085, "balance_loss_mlp": 1.03045535, "epoch": 0.13864005290687187, "flos": 21213079793280.0, "grad_norm": 1.8762178696647758, "language_loss": 0.82642603, "learning_rate": 3.8775729325145035e-06, "loss": 0.84911251, "num_input_tokens_seen": 24512425, "step": 1153, "time_per_iteration": 2.5234885215759277 }, { "auxiliary_loss_clip": 0.01073429, "auxiliary_loss_mlp": 0.01012187, "balance_loss_clip": 1.01950455, "balance_loss_mlp": 1.00952899, "epoch": 0.13876029579751098, "flos": 71653389413760.0, "grad_norm": 0.7938055868828094, "language_loss": 0.6472044, "learning_rate": 3.877304434691321e-06, "loss": 0.66806054, "num_input_tokens_seen": 24579275, "step": 1154, "time_per_iteration": 3.2339136600494385 }, { "auxiliary_loss_clip": 0.01188929, "auxiliary_loss_mlp": 0.01032813, "balance_loss_clip": 1.06273544, "balance_loss_mlp": 1.02256095, "epoch": 0.13888053868815006, "flos": 21941348042880.0, "grad_norm": 1.6842712277662588, "language_loss": 0.79914737, "learning_rate": 3.877035652082548e-06, "loss": 0.82136476, "num_input_tokens_seen": 24598720, "step": 1155, "time_per_iteration": 2.562546491622925 }, { "auxiliary_loss_clip": 0.01196628, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.06213915, "balance_loss_mlp": 1.02471614, "epoch": 0.13900078157878915, "flos": 19608627087360.0, "grad_norm": 1.9114433231332668, "language_loss": 0.85481822, "learning_rate": 3.87676658472896e-06, "loss": 0.87714732, "num_input_tokens_seen": 24617530, "step": 1156, "time_per_iteration": 2.5305514335632324 }, { "auxiliary_loss_clip": 0.01220095, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.0616771, "balance_loss_mlp": 1.03439069, "epoch": 0.13912102446942826, "flos": 22638051216000.0, "grad_norm": 2.0059744219112257, "language_loss": 0.85106218, "learning_rate": 3.876497232671372e-06, "loss": 0.87371337, "num_input_tokens_seen": 24637485, "step": 1157, "time_per_iteration": 2.489564895629883 }, { "auxiliary_loss_clip": 0.0117955, "auxiliary_loss_mlp": 0.01039531, "balance_loss_clip": 1.05941677, "balance_loss_mlp": 1.02901089, "epoch": 0.13924126736006734, "flos": 29643324975360.0, "grad_norm": 1.9446276015636033, "language_loss": 0.83603609, "learning_rate": 3.876227595950647e-06, "loss": 0.8582269, "num_input_tokens_seen": 24656915, "step": 1158, "time_per_iteration": 2.6752874851226807 }, { "auxiliary_loss_clip": 0.01238211, "auxiliary_loss_mlp": 0.01041393, "balance_loss_clip": 1.06763482, "balance_loss_mlp": 1.02937031, "epoch": 0.13936151025070642, "flos": 27417653527680.0, "grad_norm": 1.6112974470249788, "language_loss": 0.79248452, "learning_rate": 3.875957674607686e-06, "loss": 0.81528056, "num_input_tokens_seen": 24679190, "step": 1159, "time_per_iteration": 2.5316503047943115 }, { "auxiliary_loss_clip": 0.01212441, "auxiliary_loss_mlp": 0.00766559, "balance_loss_clip": 1.05945969, "balance_loss_mlp": 1.00070596, "epoch": 0.1394817531413455, "flos": 16399326625920.0, "grad_norm": 2.745152521557462, "language_loss": 0.88152218, "learning_rate": 3.8756874686834386e-06, "loss": 0.90131223, "num_input_tokens_seen": 24697405, "step": 1160, "time_per_iteration": 2.4753894805908203 }, { "auxiliary_loss_clip": 0.01224718, "auxiliary_loss_mlp": 0.00766718, "balance_loss_clip": 1.06229985, "balance_loss_mlp": 1.00059319, "epoch": 0.13960199603198462, "flos": 30922319525760.0, "grad_norm": 1.5296851091031625, "language_loss": 0.80769718, "learning_rate": 3.875416978218893e-06, "loss": 0.82761157, "num_input_tokens_seen": 24720600, "step": 1161, "time_per_iteration": 2.580864191055298 }, { "auxiliary_loss_clip": 0.01198174, "auxiliary_loss_mlp": 0.01040919, "balance_loss_clip": 1.05661917, "balance_loss_mlp": 1.0294807, "epoch": 0.1397222389226237, "flos": 18113773754880.0, "grad_norm": 13.512801540782924, "language_loss": 0.82489759, "learning_rate": 3.8751462032550835e-06, "loss": 0.84728849, "num_input_tokens_seen": 24737605, "step": 1162, "time_per_iteration": 2.534541606903076 }, { "auxiliary_loss_clip": 0.01202995, "auxiliary_loss_mlp": 0.01027993, "balance_loss_clip": 1.06549644, "balance_loss_mlp": 1.01723433, "epoch": 0.13984248181326278, "flos": 16872772815360.0, "grad_norm": 2.5041874374000392, "language_loss": 0.82664227, "learning_rate": 3.874875143833085e-06, "loss": 0.84895217, "num_input_tokens_seen": 24755845, "step": 1163, "time_per_iteration": 2.5094411373138428 }, { "auxiliary_loss_clip": 0.0122095, "auxiliary_loss_mlp": 0.01047486, "balance_loss_clip": 1.06333816, "balance_loss_mlp": 1.03568423, "epoch": 0.1399627247039019, "flos": 54121401267840.0, "grad_norm": 1.809441658863343, "language_loss": 0.68863541, "learning_rate": 3.874603799994019e-06, "loss": 0.7113198, "num_input_tokens_seen": 24779380, "step": 1164, "time_per_iteration": 2.797175168991089 }, { "auxiliary_loss_clip": 0.01182786, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.05867195, "balance_loss_mlp": 1.02638566, "epoch": 0.14008296759454097, "flos": 11765521618560.0, "grad_norm": 2.0568116690227636, "language_loss": 0.87022305, "learning_rate": 3.874332171779046e-06, "loss": 0.89241767, "num_input_tokens_seen": 24794260, "step": 1165, "time_per_iteration": 2.568225145339966 }, { "auxiliary_loss_clip": 0.0118308, "auxiliary_loss_mlp": 0.0103346, "balance_loss_clip": 1.05651498, "balance_loss_mlp": 1.02255273, "epoch": 0.14020321048518006, "flos": 22017514832640.0, "grad_norm": 1.6720728256415833, "language_loss": 0.75627828, "learning_rate": 3.874060259229373e-06, "loss": 0.77844369, "num_input_tokens_seen": 24815835, "step": 1166, "time_per_iteration": 2.6282758712768555 }, { "auxiliary_loss_clip": 0.01224605, "auxiliary_loss_mlp": 0.0104494, "balance_loss_clip": 1.06599569, "balance_loss_mlp": 1.03320432, "epoch": 0.14032345337581917, "flos": 23404313076480.0, "grad_norm": 2.1005222844913387, "language_loss": 0.93716538, "learning_rate": 3.873788062386249e-06, "loss": 0.9598608, "num_input_tokens_seen": 24834095, "step": 1167, "time_per_iteration": 3.344567060470581 }, { "auxiliary_loss_clip": 0.01193847, "auxiliary_loss_mlp": 0.01042578, "balance_loss_clip": 1.06391454, "balance_loss_mlp": 1.03195691, "epoch": 0.14044369626645825, "flos": 29645767100160.0, "grad_norm": 1.9557510903710569, "language_loss": 0.82077014, "learning_rate": 3.873515581290965e-06, "loss": 0.8431344, "num_input_tokens_seen": 24858900, "step": 1168, "time_per_iteration": 2.672848701477051 }, { "auxiliary_loss_clip": 0.01191536, "auxiliary_loss_mlp": 0.01039288, "balance_loss_clip": 1.0639782, "balance_loss_mlp": 1.02794576, "epoch": 0.14056393915709733, "flos": 18332972501760.0, "grad_norm": 10.310937155097955, "language_loss": 0.75927639, "learning_rate": 3.8732428159848575e-06, "loss": 0.78158462, "num_input_tokens_seen": 24877875, "step": 1169, "time_per_iteration": 2.574352502822876 }, { "auxiliary_loss_clip": 0.01222151, "auxiliary_loss_mlp": 0.0103911, "balance_loss_clip": 1.0677588, "balance_loss_mlp": 1.02782106, "epoch": 0.14068418204773642, "flos": 26687517770880.0, "grad_norm": 1.914566701977521, "language_loss": 0.78181374, "learning_rate": 3.872969766509304e-06, "loss": 0.80442631, "num_input_tokens_seen": 24898430, "step": 1170, "time_per_iteration": 2.5370090007781982 }, { "auxiliary_loss_clip": 0.01078067, "auxiliary_loss_mlp": 0.01003821, "balance_loss_clip": 1.02103245, "balance_loss_mlp": 1.00079286, "epoch": 0.14080442493837553, "flos": 65259314501760.0, "grad_norm": 0.7658627073932307, "language_loss": 0.55671537, "learning_rate": 3.872696432905726e-06, "loss": 0.5775342, "num_input_tokens_seen": 24959250, "step": 1171, "time_per_iteration": 3.1267569065093994 }, { "auxiliary_loss_clip": 0.01222473, "auxiliary_loss_mlp": 0.01043889, "balance_loss_clip": 1.06142688, "balance_loss_mlp": 1.03257596, "epoch": 0.1409246678290146, "flos": 25776715582080.0, "grad_norm": 5.546998461052881, "language_loss": 0.71459758, "learning_rate": 3.872422815215589e-06, "loss": 0.73726118, "num_input_tokens_seen": 24978330, "step": 1172, "time_per_iteration": 2.5472028255462646 }, { "auxiliary_loss_clip": 0.01215745, "auxiliary_loss_mlp": 0.01044703, "balance_loss_clip": 1.05991089, "balance_loss_mlp": 1.03180456, "epoch": 0.1410449107196537, "flos": 21868521217920.0, "grad_norm": 1.7228580108787452, "language_loss": 0.74132156, "learning_rate": 3.8721489134803994e-06, "loss": 0.76392603, "num_input_tokens_seen": 24997120, "step": 1173, "time_per_iteration": 3.3893330097198486 }, { "auxiliary_loss_clip": 0.01218674, "auxiliary_loss_mlp": 0.0104568, "balance_loss_clip": 1.06441677, "balance_loss_mlp": 1.0335927, "epoch": 0.1411651536102928, "flos": 16684133564160.0, "grad_norm": 2.4506299516665164, "language_loss": 0.72398233, "learning_rate": 3.871874727741707e-06, "loss": 0.74662584, "num_input_tokens_seen": 25014350, "step": 1174, "time_per_iteration": 4.179301023483276 }, { "auxiliary_loss_clip": 0.0121732, "auxiliary_loss_mlp": 0.01039841, "balance_loss_clip": 1.06673551, "balance_loss_mlp": 1.02944636, "epoch": 0.1412853965009319, "flos": 20992264934400.0, "grad_norm": 2.0172332081482733, "language_loss": 0.96609676, "learning_rate": 3.871600258041108e-06, "loss": 0.98866832, "num_input_tokens_seen": 25033875, "step": 1175, "time_per_iteration": 2.601478099822998 }, { "auxiliary_loss_clip": 0.01201002, "auxiliary_loss_mlp": 0.01039342, "balance_loss_clip": 1.0600369, "balance_loss_mlp": 1.02734923, "epoch": 0.14140563939157097, "flos": 20335279224960.0, "grad_norm": 3.268061804043667, "language_loss": 0.85863793, "learning_rate": 3.871325504420238e-06, "loss": 0.88104135, "num_input_tokens_seen": 25052865, "step": 1176, "time_per_iteration": 2.562810182571411 }, { "auxiliary_loss_clip": 0.01236416, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.06784892, "balance_loss_mlp": 1.02533817, "epoch": 0.14152588228221005, "flos": 21068826773760.0, "grad_norm": 1.7145377081477498, "language_loss": 0.81782746, "learning_rate": 3.871050466920776e-06, "loss": 0.8405509, "num_input_tokens_seen": 25072770, "step": 1177, "time_per_iteration": 2.502150058746338 }, { "auxiliary_loss_clip": 0.01179289, "auxiliary_loss_mlp": 0.01031377, "balance_loss_clip": 1.05759549, "balance_loss_mlp": 1.02102399, "epoch": 0.14164612517284916, "flos": 18223157646720.0, "grad_norm": 1.8022537104679055, "language_loss": 0.79641569, "learning_rate": 3.870775145584447e-06, "loss": 0.81852233, "num_input_tokens_seen": 25090550, "step": 1178, "time_per_iteration": 2.632185459136963 }, { "auxiliary_loss_clip": 0.01212098, "auxiliary_loss_mlp": 0.01044947, "balance_loss_clip": 1.06302476, "balance_loss_mlp": 1.0333184, "epoch": 0.14176636806348825, "flos": 22744454279040.0, "grad_norm": 2.808521870278306, "language_loss": 0.64842236, "learning_rate": 3.8704995404530145e-06, "loss": 0.67099285, "num_input_tokens_seen": 25106175, "step": 1179, "time_per_iteration": 2.6408607959747314 }, { "auxiliary_loss_clip": 0.01233894, "auxiliary_loss_mlp": 0.01038018, "balance_loss_clip": 1.06740737, "balance_loss_mlp": 1.02773619, "epoch": 0.14188661095412733, "flos": 22091095843200.0, "grad_norm": 2.009320451255184, "language_loss": 0.84931207, "learning_rate": 3.87022365156829e-06, "loss": 0.87203121, "num_input_tokens_seen": 25126890, "step": 1180, "time_per_iteration": 2.526172161102295 }, { "auxiliary_loss_clip": 0.01142297, "auxiliary_loss_mlp": 0.01039185, "balance_loss_clip": 1.05483246, "balance_loss_mlp": 1.0281527, "epoch": 0.14200685384476644, "flos": 24352390604160.0, "grad_norm": 2.282580410868269, "language_loss": 0.81009901, "learning_rate": 3.869947478972123e-06, "loss": 0.83191383, "num_input_tokens_seen": 25147915, "step": 1181, "time_per_iteration": 2.83522629737854 }, { "auxiliary_loss_clip": 0.01214349, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.0625701, "balance_loss_mlp": 1.02524948, "epoch": 0.14212709673540552, "flos": 24022048199040.0, "grad_norm": 2.262742897864024, "language_loss": 0.82555723, "learning_rate": 3.869671022706412e-06, "loss": 0.84807116, "num_input_tokens_seen": 25166645, "step": 1182, "time_per_iteration": 2.696279525756836 }, { "auxiliary_loss_clip": 0.01158631, "auxiliary_loss_mlp": 0.0104602, "balance_loss_clip": 1.05308425, "balance_loss_mlp": 1.03517818, "epoch": 0.1422473396260446, "flos": 26431797870720.0, "grad_norm": 1.9557381545607537, "language_loss": 0.64627188, "learning_rate": 3.869394282813092e-06, "loss": 0.66831839, "num_input_tokens_seen": 25185845, "step": 1183, "time_per_iteration": 2.659381628036499 }, { "auxiliary_loss_clip": 0.01193628, "auxiliary_loss_mlp": 0.01043299, "balance_loss_clip": 1.05697608, "balance_loss_mlp": 1.03195632, "epoch": 0.1423675825166837, "flos": 17055306754560.0, "grad_norm": 3.1121010254052033, "language_loss": 0.89463937, "learning_rate": 3.869117259334147e-06, "loss": 0.91700864, "num_input_tokens_seen": 25203770, "step": 1184, "time_per_iteration": 2.5750486850738525 }, { "auxiliary_loss_clip": 0.01214635, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.0623312, "balance_loss_mlp": 1.03145552, "epoch": 0.1424878254073228, "flos": 17929480049280.0, "grad_norm": 1.853639622062382, "language_loss": 0.81930923, "learning_rate": 3.868839952311599e-06, "loss": 0.84187794, "num_input_tokens_seen": 25221725, "step": 1185, "time_per_iteration": 2.49806547164917 }, { "auxiliary_loss_clip": 0.01200315, "auxiliary_loss_mlp": 0.0103552, "balance_loss_clip": 1.06298769, "balance_loss_mlp": 1.02446318, "epoch": 0.14260806829796188, "flos": 20303606407680.0, "grad_norm": 2.323956828573697, "language_loss": 0.80340397, "learning_rate": 3.868562361787516e-06, "loss": 0.82576227, "num_input_tokens_seen": 25240855, "step": 1186, "time_per_iteration": 2.5370519161224365 }, { "auxiliary_loss_clip": 0.01136701, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.05215549, "balance_loss_mlp": 1.02290595, "epoch": 0.14272831118860096, "flos": 23185724860800.0, "grad_norm": 2.056503246063335, "language_loss": 0.691715, "learning_rate": 3.868284487804009e-06, "loss": 0.71342123, "num_input_tokens_seen": 25260085, "step": 1187, "time_per_iteration": 2.696873903274536 }, { "auxiliary_loss_clip": 0.01208476, "auxiliary_loss_mlp": 0.01046048, "balance_loss_clip": 1.06049669, "balance_loss_mlp": 1.03538513, "epoch": 0.14284855407924008, "flos": 27232210586880.0, "grad_norm": 1.6143287209150559, "language_loss": 0.78217274, "learning_rate": 3.86800633040323e-06, "loss": 0.80471802, "num_input_tokens_seen": 25280675, "step": 1188, "time_per_iteration": 2.6943676471710205 }, { "auxiliary_loss_clip": 0.01202893, "auxiliary_loss_mlp": 0.00765847, "balance_loss_clip": 1.06457949, "balance_loss_mlp": 1.00075531, "epoch": 0.14296879696987916, "flos": 28184202696960.0, "grad_norm": 2.001762600775245, "language_loss": 0.78061759, "learning_rate": 3.867727889627376e-06, "loss": 0.80030501, "num_input_tokens_seen": 25300290, "step": 1189, "time_per_iteration": 2.5867836475372314 }, { "auxiliary_loss_clip": 0.01178168, "auxiliary_loss_mlp": 0.01044245, "balance_loss_clip": 1.05849648, "balance_loss_mlp": 1.03204358, "epoch": 0.14308903986051824, "flos": 19390290266880.0, "grad_norm": 2.1207731494882895, "language_loss": 0.78223324, "learning_rate": 3.867449165518687e-06, "loss": 0.80445737, "num_input_tokens_seen": 25316760, "step": 1190, "time_per_iteration": 2.5264713764190674 }, { "auxiliary_loss_clip": 0.01238012, "auxiliary_loss_mlp": 0.00766565, "balance_loss_clip": 1.06639266, "balance_loss_mlp": 1.00079107, "epoch": 0.14320928275115732, "flos": 17457506317440.0, "grad_norm": 1.7759648261721543, "language_loss": 0.71031684, "learning_rate": 3.867170158119444e-06, "loss": 0.73036253, "num_input_tokens_seen": 25335760, "step": 1191, "time_per_iteration": 2.556504487991333 }, { "auxiliary_loss_clip": 0.01238046, "auxiliary_loss_mlp": 0.01037906, "balance_loss_clip": 1.06650412, "balance_loss_mlp": 1.02736795, "epoch": 0.14332952564179643, "flos": 21466070259840.0, "grad_norm": 2.214404937392677, "language_loss": 0.75551212, "learning_rate": 3.866890867471972e-06, "loss": 0.77827168, "num_input_tokens_seen": 25354230, "step": 1192, "time_per_iteration": 2.5354533195495605 }, { "auxiliary_loss_clip": 0.01198297, "auxiliary_loss_mlp": 0.01044425, "balance_loss_clip": 1.0549221, "balance_loss_mlp": 1.03257012, "epoch": 0.14344976853243552, "flos": 16396992241920.0, "grad_norm": 2.5540064589372715, "language_loss": 0.89672583, "learning_rate": 3.86661129361864e-06, "loss": 0.91915309, "num_input_tokens_seen": 25368720, "step": 1193, "time_per_iteration": 3.3329994678497314 }, { "auxiliary_loss_clip": 0.01202651, "auxiliary_loss_mlp": 0.01046793, "balance_loss_clip": 1.06281376, "balance_loss_mlp": 1.03503346, "epoch": 0.1435700114230746, "flos": 18916736336640.0, "grad_norm": 1.9344724869248264, "language_loss": 0.86119479, "learning_rate": 3.866331436601859e-06, "loss": 0.88368922, "num_input_tokens_seen": 25386715, "step": 1194, "time_per_iteration": 2.5208323001861572 }, { "auxiliary_loss_clip": 0.01235557, "auxiliary_loss_mlp": 0.01042975, "balance_loss_clip": 1.06596208, "balance_loss_mlp": 1.03156674, "epoch": 0.1436902543137137, "flos": 19755394058880.0, "grad_norm": 2.205488070809292, "language_loss": 0.73481584, "learning_rate": 3.866051296464083e-06, "loss": 0.75760114, "num_input_tokens_seen": 25405550, "step": 1195, "time_per_iteration": 2.485048770904541 }, { "auxiliary_loss_clip": 0.01234827, "auxiliary_loss_mlp": 0.00765843, "balance_loss_clip": 1.06353295, "balance_loss_mlp": 1.00076652, "epoch": 0.1438104972043528, "flos": 14684807669760.0, "grad_norm": 2.8025494673110067, "language_loss": 0.85218567, "learning_rate": 3.86577087324781e-06, "loss": 0.87219238, "num_input_tokens_seen": 25422040, "step": 1196, "time_per_iteration": 2.424133539199829 }, { "auxiliary_loss_clip": 0.01217736, "auxiliary_loss_mlp": 0.01037417, "balance_loss_clip": 1.06584454, "balance_loss_mlp": 1.02664042, "epoch": 0.14393074009499188, "flos": 17092330698240.0, "grad_norm": 1.9612024199306752, "language_loss": 0.77624083, "learning_rate": 3.865490166995578e-06, "loss": 0.79879236, "num_input_tokens_seen": 25440270, "step": 1197, "time_per_iteration": 2.529550313949585 }, { "auxiliary_loss_clip": 0.01219585, "auxiliary_loss_mlp": 0.01041657, "balance_loss_clip": 1.06455755, "balance_loss_mlp": 1.03010535, "epoch": 0.144050982985631, "flos": 30476200608000.0, "grad_norm": 4.971996642695616, "language_loss": 0.8433696, "learning_rate": 3.86520917774997e-06, "loss": 0.86598206, "num_input_tokens_seen": 25459705, "step": 1198, "time_per_iteration": 2.6028761863708496 }, { "auxiliary_loss_clip": 0.01213289, "auxiliary_loss_mlp": 0.01044164, "balance_loss_clip": 1.06324959, "balance_loss_mlp": 1.03384066, "epoch": 0.14417122587627007, "flos": 17858484817920.0, "grad_norm": 2.2202358873986574, "language_loss": 0.74863768, "learning_rate": 3.864927905553614e-06, "loss": 0.77121222, "num_input_tokens_seen": 25477615, "step": 1199, "time_per_iteration": 2.5557374954223633 }, { "auxiliary_loss_clip": 0.01181641, "auxiliary_loss_mlp": 0.01042246, "balance_loss_clip": 1.05734086, "balance_loss_mlp": 1.03181517, "epoch": 0.14429146876690915, "flos": 21613914639360.0, "grad_norm": 1.6120325061678757, "language_loss": 0.88999832, "learning_rate": 3.8646463504491765e-06, "loss": 0.91223723, "num_input_tokens_seen": 25497750, "step": 1200, "time_per_iteration": 4.302061557769775 }, { "auxiliary_loss_clip": 0.01222008, "auxiliary_loss_mlp": 0.0104002, "balance_loss_clip": 1.06677556, "balance_loss_mlp": 1.02803946, "epoch": 0.14441171165754824, "flos": 23258120722560.0, "grad_norm": 1.7643310954077014, "language_loss": 0.8278898, "learning_rate": 3.8643645124793705e-06, "loss": 0.85051, "num_input_tokens_seen": 25516650, "step": 1201, "time_per_iteration": 2.5275492668151855 }, { "auxiliary_loss_clip": 0.01215386, "auxiliary_loss_mlp": 0.01035574, "balance_loss_clip": 1.06172729, "balance_loss_mlp": 1.02488685, "epoch": 0.14453195454818735, "flos": 42854213963520.0, "grad_norm": 1.5969281970308902, "language_loss": 0.74687552, "learning_rate": 3.8640823916869515e-06, "loss": 0.7693851, "num_input_tokens_seen": 25540960, "step": 1202, "time_per_iteration": 2.702115297317505 }, { "auxiliary_loss_clip": 0.01233115, "auxiliary_loss_mlp": 0.01036422, "balance_loss_clip": 1.06433201, "balance_loss_mlp": 1.02570486, "epoch": 0.14465219743882643, "flos": 27235873774080.0, "grad_norm": 1.745997800890538, "language_loss": 0.78473842, "learning_rate": 3.863799988114714e-06, "loss": 0.80743378, "num_input_tokens_seen": 25562990, "step": 1203, "time_per_iteration": 2.5356040000915527 }, { "auxiliary_loss_clip": 0.01239593, "auxiliary_loss_mlp": 0.01034699, "balance_loss_clip": 1.06682611, "balance_loss_mlp": 1.02324939, "epoch": 0.1447724403294655, "flos": 16690705752960.0, "grad_norm": 5.995993969021272, "language_loss": 0.70618761, "learning_rate": 3.863517301805502e-06, "loss": 0.72893059, "num_input_tokens_seen": 25581380, "step": 1204, "time_per_iteration": 2.4148178100585938 }, { "auxiliary_loss_clip": 0.01190975, "auxiliary_loss_mlp": 0.01040444, "balance_loss_clip": 1.06332421, "balance_loss_mlp": 1.02926266, "epoch": 0.14489268322010462, "flos": 20073741321600.0, "grad_norm": 2.501096555932035, "language_loss": 0.9669441, "learning_rate": 3.863234332802196e-06, "loss": 0.98925823, "num_input_tokens_seen": 25593585, "step": 1205, "time_per_iteration": 2.5622525215148926 }, { "auxiliary_loss_clip": 0.01197562, "auxiliary_loss_mlp": 0.01043914, "balance_loss_clip": 1.05821252, "balance_loss_mlp": 1.0336864, "epoch": 0.1450129261107437, "flos": 27125627955840.0, "grad_norm": 2.1030886228043117, "language_loss": 0.74215102, "learning_rate": 3.862951081147723e-06, "loss": 0.76456577, "num_input_tokens_seen": 25613750, "step": 1206, "time_per_iteration": 2.5627236366271973 }, { "auxiliary_loss_clip": 0.01218229, "auxiliary_loss_mlp": 0.01040788, "balance_loss_clip": 1.06570017, "balance_loss_mlp": 1.03087544, "epoch": 0.1451331690013828, "flos": 25702344472320.0, "grad_norm": 2.3329062118639916, "language_loss": 0.77881825, "learning_rate": 3.862667546885053e-06, "loss": 0.80140841, "num_input_tokens_seen": 25632300, "step": 1207, "time_per_iteration": 2.5468392372131348 }, { "auxiliary_loss_clip": 0.01207141, "auxiliary_loss_mlp": 0.01042052, "balance_loss_clip": 1.06144905, "balance_loss_mlp": 1.03090012, "epoch": 0.14525341189202187, "flos": 25737393168000.0, "grad_norm": 2.0588635540689166, "language_loss": 0.73146099, "learning_rate": 3.8623837300571965e-06, "loss": 0.75395298, "num_input_tokens_seen": 25651285, "step": 1208, "time_per_iteration": 2.5630273818969727 }, { "auxiliary_loss_clip": 0.01236599, "auxiliary_loss_mlp": 0.01038061, "balance_loss_clip": 1.06619263, "balance_loss_mlp": 1.02670598, "epoch": 0.14537365478266098, "flos": 23073898844160.0, "grad_norm": 2.3296526383850877, "language_loss": 0.84067225, "learning_rate": 3.8620996307072085e-06, "loss": 0.86341882, "num_input_tokens_seen": 25671990, "step": 1209, "time_per_iteration": 2.474330425262451 }, { "auxiliary_loss_clip": 0.01188648, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.05639768, "balance_loss_mlp": 1.02694249, "epoch": 0.14549389767330007, "flos": 20595021448320.0, "grad_norm": 1.8315405527030273, "language_loss": 0.64536703, "learning_rate": 3.861815248878188e-06, "loss": 0.66763568, "num_input_tokens_seen": 25689475, "step": 1210, "time_per_iteration": 2.593482255935669 }, { "auxiliary_loss_clip": 0.0120048, "auxiliary_loss_mlp": 0.01043552, "balance_loss_clip": 1.06309366, "balance_loss_mlp": 1.03321075, "epoch": 0.14561414056393915, "flos": 15121804533120.0, "grad_norm": 5.521584986918201, "language_loss": 0.79655075, "learning_rate": 3.861530584613274e-06, "loss": 0.81899112, "num_input_tokens_seen": 25707475, "step": 1211, "time_per_iteration": 2.5235373973846436 }, { "auxiliary_loss_clip": 0.01222332, "auxiliary_loss_mlp": 0.00766027, "balance_loss_clip": 1.06636417, "balance_loss_mlp": 1.00107944, "epoch": 0.14573438345457826, "flos": 19427493778560.0, "grad_norm": 2.199432454665693, "language_loss": 0.82192487, "learning_rate": 3.86124563795565e-06, "loss": 0.84180844, "num_input_tokens_seen": 25726290, "step": 1212, "time_per_iteration": 2.523937702178955 }, { "auxiliary_loss_clip": 0.01233505, "auxiliary_loss_mlp": 0.0103872, "balance_loss_clip": 1.0668366, "balance_loss_mlp": 1.02824187, "epoch": 0.14585462634521734, "flos": 24828422572800.0, "grad_norm": 1.6804086105383762, "language_loss": 0.70204425, "learning_rate": 3.860960408948543e-06, "loss": 0.72476649, "num_input_tokens_seen": 25748040, "step": 1213, "time_per_iteration": 2.5231709480285645 }, { "auxiliary_loss_clip": 0.01207665, "auxiliary_loss_mlp": 0.01042277, "balance_loss_clip": 1.06314898, "balance_loss_mlp": 1.03201938, "epoch": 0.14597486923585642, "flos": 15448627405440.0, "grad_norm": 2.4291383192565514, "language_loss": 0.89876348, "learning_rate": 3.860674897635222e-06, "loss": 0.92126286, "num_input_tokens_seen": 25764525, "step": 1214, "time_per_iteration": 2.5019190311431885 }, { "auxiliary_loss_clip": 0.01218165, "auxiliary_loss_mlp": 0.01048291, "balance_loss_clip": 1.0659411, "balance_loss_mlp": 1.03723979, "epoch": 0.1460951121264955, "flos": 16655154266880.0, "grad_norm": 1.9940973093713716, "language_loss": 0.83366507, "learning_rate": 3.860389104058998e-06, "loss": 0.85632968, "num_input_tokens_seen": 25782755, "step": 1215, "time_per_iteration": 2.559870481491089 }, { "auxiliary_loss_clip": 0.01201474, "auxiliary_loss_mlp": 0.01034726, "balance_loss_clip": 1.06231809, "balance_loss_mlp": 1.02429473, "epoch": 0.14621535501713462, "flos": 24863291700480.0, "grad_norm": 1.8641895390783614, "language_loss": 0.72593713, "learning_rate": 3.860103028263227e-06, "loss": 0.74829912, "num_input_tokens_seen": 25805860, "step": 1216, "time_per_iteration": 2.6286466121673584 }, { "auxiliary_loss_clip": 0.0116329, "auxiliary_loss_mlp": 0.01035046, "balance_loss_clip": 1.05190015, "balance_loss_mlp": 1.02422214, "epoch": 0.1463355979077737, "flos": 25228000442880.0, "grad_norm": 2.333938901783611, "language_loss": 0.70038521, "learning_rate": 3.859816670291304e-06, "loss": 0.7223686, "num_input_tokens_seen": 25824955, "step": 1217, "time_per_iteration": 2.682464838027954 }, { "auxiliary_loss_clip": 0.01150706, "auxiliary_loss_mlp": 0.01032483, "balance_loss_clip": 1.05657268, "balance_loss_mlp": 1.02086031, "epoch": 0.14645584079841278, "flos": 22054143726720.0, "grad_norm": 2.3336884630007955, "language_loss": 0.8993752, "learning_rate": 3.859530030186672e-06, "loss": 0.92120707, "num_input_tokens_seen": 25841965, "step": 1218, "time_per_iteration": 2.666663408279419 }, { "auxiliary_loss_clip": 0.01207673, "auxiliary_loss_mlp": 0.01037527, "balance_loss_clip": 1.06523824, "balance_loss_mlp": 1.02678597, "epoch": 0.1465760836890519, "flos": 23623870959360.0, "grad_norm": 2.1699847963898073, "language_loss": 0.8243829, "learning_rate": 3.859243107992813e-06, "loss": 0.84683496, "num_input_tokens_seen": 25860770, "step": 1219, "time_per_iteration": 2.598076820373535 }, { "auxiliary_loss_clip": 0.01188952, "auxiliary_loss_mlp": 0.01038613, "balance_loss_clip": 1.05639017, "balance_loss_mlp": 1.02721632, "epoch": 0.14669632657969098, "flos": 37407893356800.0, "grad_norm": 2.087799042459627, "language_loss": 0.77893454, "learning_rate": 3.858955903753252e-06, "loss": 0.80121017, "num_input_tokens_seen": 25879410, "step": 1220, "time_per_iteration": 2.7478127479553223 }, { "auxiliary_loss_clip": 0.01216577, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.06097937, "balance_loss_mlp": 1.02846062, "epoch": 0.14681656947033006, "flos": 28365910623360.0, "grad_norm": 1.5556749032978585, "language_loss": 0.83639264, "learning_rate": 3.858668417511559e-06, "loss": 0.85893798, "num_input_tokens_seen": 25902160, "step": 1221, "time_per_iteration": 3.4299418926239014 }, { "auxiliary_loss_clip": 0.01206603, "auxiliary_loss_mlp": 0.01032313, "balance_loss_clip": 1.0648303, "balance_loss_mlp": 1.02134025, "epoch": 0.14693681236096917, "flos": 18479488078080.0, "grad_norm": 2.1766633783015403, "language_loss": 0.76281548, "learning_rate": 3.8583806493113445e-06, "loss": 0.78520465, "num_input_tokens_seen": 25920505, "step": 1222, "time_per_iteration": 2.6012563705444336 }, { "auxiliary_loss_clip": 0.01215196, "auxiliary_loss_mlp": 0.01041007, "balance_loss_clip": 1.06405354, "balance_loss_mlp": 1.03082013, "epoch": 0.14705705525160825, "flos": 20777806782720.0, "grad_norm": 2.2225291822672437, "language_loss": 0.82385731, "learning_rate": 3.858092599196263e-06, "loss": 0.84641933, "num_input_tokens_seen": 25938460, "step": 1223, "time_per_iteration": 2.4934475421905518 }, { "auxiliary_loss_clip": 0.01218014, "auxiliary_loss_mlp": 0.01032236, "balance_loss_clip": 1.06439936, "balance_loss_mlp": 1.02188921, "epoch": 0.14717729814224734, "flos": 29932944336000.0, "grad_norm": 2.5930044814038324, "language_loss": 0.82323581, "learning_rate": 3.857804267210012e-06, "loss": 0.84573829, "num_input_tokens_seen": 25957760, "step": 1224, "time_per_iteration": 2.541569232940674 }, { "auxiliary_loss_clip": 0.01170889, "auxiliary_loss_mlp": 0.01043631, "balance_loss_clip": 1.05348575, "balance_loss_mlp": 1.03323007, "epoch": 0.14729754103288642, "flos": 20047491457920.0, "grad_norm": 2.1280711188396375, "language_loss": 0.88104141, "learning_rate": 3.857515653396331e-06, "loss": 0.90318668, "num_input_tokens_seen": 25974970, "step": 1225, "time_per_iteration": 2.5456156730651855 }, { "auxiliary_loss_clip": 0.01171425, "auxiliary_loss_mlp": 0.01038743, "balance_loss_clip": 1.05644774, "balance_loss_mlp": 1.02866352, "epoch": 0.14741778392352553, "flos": 19281516906240.0, "grad_norm": 2.3431463258731995, "language_loss": 0.87043232, "learning_rate": 3.857226757799002e-06, "loss": 0.89253402, "num_input_tokens_seen": 25992525, "step": 1226, "time_per_iteration": 3.3609275817871094 }, { "auxiliary_loss_clip": 0.01199118, "auxiliary_loss_mlp": 0.01037997, "balance_loss_clip": 1.05969715, "balance_loss_mlp": 1.02741694, "epoch": 0.1475380268141646, "flos": 25411108999680.0, "grad_norm": 2.3536418791766964, "language_loss": 0.74155647, "learning_rate": 3.85693758046185e-06, "loss": 0.76392758, "num_input_tokens_seen": 26010815, "step": 1227, "time_per_iteration": 4.199660301208496 }, { "auxiliary_loss_clip": 0.01232908, "auxiliary_loss_mlp": 0.01041756, "balance_loss_clip": 1.06786013, "balance_loss_mlp": 1.03155172, "epoch": 0.1476582697048037, "flos": 20847652778880.0, "grad_norm": 1.926404603545611, "language_loss": 0.82754517, "learning_rate": 3.8566481214287435e-06, "loss": 0.85029173, "num_input_tokens_seen": 26028935, "step": 1228, "time_per_iteration": 2.489389181137085 }, { "auxiliary_loss_clip": 0.01177275, "auxiliary_loss_mlp": 0.01044358, "balance_loss_clip": 1.05549455, "balance_loss_mlp": 1.0336647, "epoch": 0.1477785125954428, "flos": 14028109269120.0, "grad_norm": 2.0631050130531796, "language_loss": 0.90664279, "learning_rate": 3.8563583807435935e-06, "loss": 0.92885911, "num_input_tokens_seen": 26045080, "step": 1229, "time_per_iteration": 2.5459182262420654 }, { "auxiliary_loss_clip": 0.01220487, "auxiliary_loss_mlp": 0.00765693, "balance_loss_clip": 1.06373549, "balance_loss_mlp": 1.00118995, "epoch": 0.1478987554860819, "flos": 20516699842560.0, "grad_norm": 2.151583407052703, "language_loss": 0.77519083, "learning_rate": 3.856068358450353e-06, "loss": 0.79505265, "num_input_tokens_seen": 26065030, "step": 1230, "time_per_iteration": 2.523956298828125 }, { "auxiliary_loss_clip": 0.01199544, "auxiliary_loss_mlp": 0.01041675, "balance_loss_clip": 1.06643867, "balance_loss_mlp": 1.031358, "epoch": 0.14801899837672097, "flos": 17857012360320.0, "grad_norm": 1.7727411085181164, "language_loss": 0.855335, "learning_rate": 3.8557780545930186e-06, "loss": 0.87774724, "num_input_tokens_seen": 26083445, "step": 1231, "time_per_iteration": 2.5099692344665527 }, { "auxiliary_loss_clip": 0.01201041, "auxiliary_loss_mlp": 0.0104045, "balance_loss_clip": 1.06306303, "balance_loss_mlp": 1.03039432, "epoch": 0.14813924126736006, "flos": 20881408584960.0, "grad_norm": 1.7840170347637485, "language_loss": 0.7953316, "learning_rate": 3.855487469215628e-06, "loss": 0.81774652, "num_input_tokens_seen": 26102375, "step": 1232, "time_per_iteration": 2.579627752304077 }, { "auxiliary_loss_clip": 0.01186958, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.06098998, "balance_loss_mlp": 1.02443576, "epoch": 0.14825948415799917, "flos": 37414070496000.0, "grad_norm": 2.1075343167426537, "language_loss": 0.72536087, "learning_rate": 3.855196602362264e-06, "loss": 0.74757934, "num_input_tokens_seen": 26125295, "step": 1233, "time_per_iteration": 2.8180525302886963 }, { "auxiliary_loss_clip": 0.01214793, "auxiliary_loss_mlp": 0.01031907, "balance_loss_clip": 1.06178856, "balance_loss_mlp": 1.02136278, "epoch": 0.14837972704863825, "flos": 22014641744640.0, "grad_norm": 1.9838775211896933, "language_loss": 0.94167888, "learning_rate": 3.854905454077051e-06, "loss": 0.9641459, "num_input_tokens_seen": 26142905, "step": 1234, "time_per_iteration": 2.5686495304107666 }, { "auxiliary_loss_clip": 0.01137481, "auxiliary_loss_mlp": 0.01037529, "balance_loss_clip": 1.05279207, "balance_loss_mlp": 1.02734292, "epoch": 0.14849996993927733, "flos": 20996323171200.0, "grad_norm": 2.2573491934859526, "language_loss": 0.88214552, "learning_rate": 3.854614024404155e-06, "loss": 0.90389562, "num_input_tokens_seen": 26161215, "step": 1235, "time_per_iteration": 2.7196803092956543 }, { "auxiliary_loss_clip": 0.01187531, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.05716324, "balance_loss_mlp": 1.02280831, "epoch": 0.14862021282991644, "flos": 20047994248320.0, "grad_norm": 1.9089457635837919, "language_loss": 0.89282167, "learning_rate": 3.8543223133877865e-06, "loss": 0.91502953, "num_input_tokens_seen": 26179810, "step": 1236, "time_per_iteration": 2.5940613746643066 }, { "auxiliary_loss_clip": 0.01183638, "auxiliary_loss_mlp": 0.01042046, "balance_loss_clip": 1.05688262, "balance_loss_mlp": 1.02977979, "epoch": 0.14874045572055553, "flos": 22712027276160.0, "grad_norm": 1.8445128142860934, "language_loss": 0.88202894, "learning_rate": 3.854030321072198e-06, "loss": 0.90428585, "num_input_tokens_seen": 26199715, "step": 1237, "time_per_iteration": 2.8678267002105713 }, { "auxiliary_loss_clip": 0.01191781, "auxiliary_loss_mlp": 0.01030139, "balance_loss_clip": 1.06005001, "balance_loss_mlp": 1.01998234, "epoch": 0.1488606986111946, "flos": 25411288567680.0, "grad_norm": 1.9483647369410164, "language_loss": 0.73584008, "learning_rate": 3.853738047501682e-06, "loss": 0.75805926, "num_input_tokens_seen": 26220275, "step": 1238, "time_per_iteration": 2.7049202919006348 }, { "auxiliary_loss_clip": 0.01218715, "auxiliary_loss_mlp": 0.01039142, "balance_loss_clip": 1.06586349, "balance_loss_mlp": 1.02797818, "epoch": 0.1489809415018337, "flos": 17018749687680.0, "grad_norm": 2.075972479156352, "language_loss": 0.77499306, "learning_rate": 3.85344549272058e-06, "loss": 0.7975716, "num_input_tokens_seen": 26238255, "step": 1239, "time_per_iteration": 2.5351147651672363 }, { "auxiliary_loss_clip": 0.01212706, "auxiliary_loss_mlp": 0.01039232, "balance_loss_clip": 1.06142783, "balance_loss_mlp": 1.02824092, "epoch": 0.1491011843924728, "flos": 33659394860160.0, "grad_norm": 1.7969874009261801, "language_loss": 0.82827985, "learning_rate": 3.853152656773269e-06, "loss": 0.8507992, "num_input_tokens_seen": 26259690, "step": 1240, "time_per_iteration": 2.667109966278076 }, { "auxiliary_loss_clip": 0.01198511, "auxiliary_loss_mlp": 0.01034156, "balance_loss_clip": 1.06150651, "balance_loss_mlp": 1.02347493, "epoch": 0.14922142728311188, "flos": 21179000764800.0, "grad_norm": 1.9606264955825021, "language_loss": 0.84800541, "learning_rate": 3.852859539704174e-06, "loss": 0.87033212, "num_input_tokens_seen": 26278990, "step": 1241, "time_per_iteration": 2.6013031005859375 }, { "auxiliary_loss_clip": 0.01166085, "auxiliary_loss_mlp": 0.01040563, "balance_loss_clip": 1.05418491, "balance_loss_mlp": 1.02991736, "epoch": 0.14934167017375097, "flos": 29860548474240.0, "grad_norm": 1.894130687108162, "language_loss": 0.76595157, "learning_rate": 3.85256614155776e-06, "loss": 0.78801805, "num_input_tokens_seen": 26299120, "step": 1242, "time_per_iteration": 2.697136878967285 }, { "auxiliary_loss_clip": 0.01212995, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.05970931, "balance_loss_mlp": 1.024912, "epoch": 0.14946191306439008, "flos": 17019216564480.0, "grad_norm": 1.9377910472175266, "language_loss": 0.74569196, "learning_rate": 3.852272462378535e-06, "loss": 0.76817536, "num_input_tokens_seen": 26316995, "step": 1243, "time_per_iteration": 2.5294992923736572 }, { "auxiliary_loss_clip": 0.0119959, "auxiliary_loss_mlp": 0.01043554, "balance_loss_clip": 1.06105196, "balance_loss_mlp": 1.03357601, "epoch": 0.14958215595502916, "flos": 15669047214720.0, "grad_norm": 1.9167691044361883, "language_loss": 0.77718198, "learning_rate": 3.85197850221105e-06, "loss": 0.79961348, "num_input_tokens_seen": 26333295, "step": 1244, "time_per_iteration": 2.5356898307800293 }, { "auxiliary_loss_clip": 0.01217267, "auxiliary_loss_mlp": 0.01038893, "balance_loss_clip": 1.06765032, "balance_loss_mlp": 1.02871215, "epoch": 0.14970239884566824, "flos": 33108560818560.0, "grad_norm": 1.846309031505951, "language_loss": 0.76009542, "learning_rate": 3.851684261099899e-06, "loss": 0.78265703, "num_input_tokens_seen": 26355035, "step": 1245, "time_per_iteration": 2.6334850788116455 }, { "auxiliary_loss_clip": 0.01195769, "auxiliary_loss_mlp": 0.01035537, "balance_loss_clip": 1.05859756, "balance_loss_mlp": 1.0236876, "epoch": 0.14982264173630733, "flos": 17821245392640.0, "grad_norm": 1.9784966353138738, "language_loss": 0.86618328, "learning_rate": 3.851389739089718e-06, "loss": 0.88849628, "num_input_tokens_seen": 26371655, "step": 1246, "time_per_iteration": 2.4992992877960205 }, { "auxiliary_loss_clip": 0.01222927, "auxiliary_loss_mlp": 0.01037831, "balance_loss_clip": 1.06952918, "balance_loss_mlp": 1.02729273, "epoch": 0.14994288462694644, "flos": 32409559175040.0, "grad_norm": 1.8646401595558197, "language_loss": 0.80390334, "learning_rate": 3.851094936225186e-06, "loss": 0.82651091, "num_input_tokens_seen": 26392540, "step": 1247, "time_per_iteration": 3.3245184421539307 }, { "auxiliary_loss_clip": 0.01198179, "auxiliary_loss_mlp": 0.01030544, "balance_loss_clip": 1.06495309, "balance_loss_mlp": 1.02032173, "epoch": 0.15006312751758552, "flos": 31794661226880.0, "grad_norm": 1.4361056906965086, "language_loss": 0.76639128, "learning_rate": 3.850799852551024e-06, "loss": 0.78867853, "num_input_tokens_seen": 26414960, "step": 1248, "time_per_iteration": 2.604133129119873 }, { "auxiliary_loss_clip": 0.01208518, "auxiliary_loss_mlp": 0.01042822, "balance_loss_clip": 1.06055403, "balance_loss_mlp": 1.03155136, "epoch": 0.1501833704082246, "flos": 16618022582400.0, "grad_norm": 2.3968315918848697, "language_loss": 0.86450148, "learning_rate": 3.850504488111995e-06, "loss": 0.88701487, "num_input_tokens_seen": 26431635, "step": 1249, "time_per_iteration": 2.4781830310821533 }, { "auxiliary_loss_clip": 0.01191947, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.05761659, "balance_loss_mlp": 1.02242303, "epoch": 0.15030361329886371, "flos": 23471178243840.0, "grad_norm": 2.1618187401357, "language_loss": 0.82751626, "learning_rate": 3.850208842952907e-06, "loss": 0.84975946, "num_input_tokens_seen": 26450440, "step": 1250, "time_per_iteration": 2.5932819843292236 }, { "auxiliary_loss_clip": 0.01177491, "auxiliary_loss_mlp": 0.01041265, "balance_loss_clip": 1.05656052, "balance_loss_mlp": 1.03056002, "epoch": 0.1504238561895028, "flos": 25629409906560.0, "grad_norm": 1.9155241691215383, "language_loss": 0.79397964, "learning_rate": 3.849912917118608e-06, "loss": 0.81616724, "num_input_tokens_seen": 26471480, "step": 1251, "time_per_iteration": 2.6426584720611572 }, { "auxiliary_loss_clip": 0.01128717, "auxiliary_loss_mlp": 0.01010275, "balance_loss_clip": 1.03730643, "balance_loss_mlp": 1.00724685, "epoch": 0.15054409908014188, "flos": 52095146129280.0, "grad_norm": 0.8852069280610807, "language_loss": 0.59294569, "learning_rate": 3.849616710653992e-06, "loss": 0.61433554, "num_input_tokens_seen": 26532950, "step": 1252, "time_per_iteration": 3.051146984100342 }, { "auxiliary_loss_clip": 0.01213775, "auxiliary_loss_mlp": 0.01039175, "balance_loss_clip": 1.06332695, "balance_loss_mlp": 1.02819586, "epoch": 0.150664341970781, "flos": 18880251096960.0, "grad_norm": 1.8306079649858018, "language_loss": 0.75011694, "learning_rate": 3.84932022360399e-06, "loss": 0.77264643, "num_input_tokens_seen": 26551615, "step": 1253, "time_per_iteration": 3.309061288833618 }, { "auxiliary_loss_clip": 0.0120186, "auxiliary_loss_mlp": 0.01047958, "balance_loss_clip": 1.06609726, "balance_loss_mlp": 1.03640103, "epoch": 0.15078458486142007, "flos": 22163240309760.0, "grad_norm": 3.1467694429227144, "language_loss": 0.84721869, "learning_rate": 3.849023456013581e-06, "loss": 0.86971688, "num_input_tokens_seen": 26569175, "step": 1254, "time_per_iteration": 3.4654369354248047 }, { "auxiliary_loss_clip": 0.01223025, "auxiliary_loss_mlp": 0.01040879, "balance_loss_clip": 1.06529975, "balance_loss_mlp": 1.02985799, "epoch": 0.15090482775205916, "flos": 26651894457600.0, "grad_norm": 2.206111092908473, "language_loss": 0.62324739, "learning_rate": 3.848726407927784e-06, "loss": 0.64588642, "num_input_tokens_seen": 26589560, "step": 1255, "time_per_iteration": 2.5752763748168945 }, { "auxiliary_loss_clip": 0.01204494, "auxiliary_loss_mlp": 0.01040541, "balance_loss_clip": 1.06490159, "balance_loss_mlp": 1.03045022, "epoch": 0.15102507064269824, "flos": 21798998444160.0, "grad_norm": 2.455917277957209, "language_loss": 0.86136806, "learning_rate": 3.84842907939166e-06, "loss": 0.88381845, "num_input_tokens_seen": 26608785, "step": 1256, "time_per_iteration": 2.579418182373047 }, { "auxiliary_loss_clip": 0.01179641, "auxiliary_loss_mlp": 0.01044904, "balance_loss_clip": 1.05955935, "balance_loss_mlp": 1.03450906, "epoch": 0.15114531353333735, "flos": 22820908377600.0, "grad_norm": 2.8060968145051794, "language_loss": 0.71663916, "learning_rate": 3.8481314704503146e-06, "loss": 0.73888457, "num_input_tokens_seen": 26628615, "step": 1257, "time_per_iteration": 2.5872514247894287 }, { "auxiliary_loss_clip": 0.01219811, "auxiliary_loss_mlp": 0.01040407, "balance_loss_clip": 1.0691011, "balance_loss_mlp": 1.03022075, "epoch": 0.15126555642397643, "flos": 19682674974720.0, "grad_norm": 2.0882302997137914, "language_loss": 0.87752086, "learning_rate": 3.847833581148895e-06, "loss": 0.90012306, "num_input_tokens_seen": 26647525, "step": 1258, "time_per_iteration": 2.5161356925964355 }, { "auxiliary_loss_clip": 0.01230845, "auxiliary_loss_mlp": 0.01035757, "balance_loss_clip": 1.06317234, "balance_loss_mlp": 1.02459311, "epoch": 0.15138579931461552, "flos": 28726022424960.0, "grad_norm": 5.407241101664347, "language_loss": 0.81182969, "learning_rate": 3.84753541153259e-06, "loss": 0.83449572, "num_input_tokens_seen": 26667095, "step": 1259, "time_per_iteration": 2.545649766921997 }, { "auxiliary_loss_clip": 0.01218021, "auxiliary_loss_mlp": 0.01037919, "balance_loss_clip": 1.06639695, "balance_loss_mlp": 1.02786338, "epoch": 0.15150604220525463, "flos": 22127006465280.0, "grad_norm": 1.5734462770400977, "language_loss": 0.83185923, "learning_rate": 3.847236961646633e-06, "loss": 0.85441858, "num_input_tokens_seen": 26686075, "step": 1260, "time_per_iteration": 2.5548789501190186 }, { "auxiliary_loss_clip": 0.01194536, "auxiliary_loss_mlp": 0.01042146, "balance_loss_clip": 1.05972373, "balance_loss_mlp": 1.03080893, "epoch": 0.1516262850958937, "flos": 12968708515200.0, "grad_norm": 2.144056087054954, "language_loss": 0.77974689, "learning_rate": 3.846938231536296e-06, "loss": 0.80211365, "num_input_tokens_seen": 26701695, "step": 1261, "time_per_iteration": 2.5173611640930176 }, { "auxiliary_loss_clip": 0.01222641, "auxiliary_loss_mlp": 0.01032574, "balance_loss_clip": 1.06780601, "balance_loss_mlp": 1.02248275, "epoch": 0.1517465279865328, "flos": 21797130936960.0, "grad_norm": 1.9857101710018616, "language_loss": 0.81057531, "learning_rate": 3.8466392212468995e-06, "loss": 0.8331275, "num_input_tokens_seen": 26721885, "step": 1262, "time_per_iteration": 2.545497179031372 }, { "auxiliary_loss_clip": 0.01102087, "auxiliary_loss_mlp": 0.01007892, "balance_loss_clip": 1.02883697, "balance_loss_mlp": 1.00476837, "epoch": 0.15186677087717187, "flos": 58174569901440.0, "grad_norm": 0.82025512844847, "language_loss": 0.61884898, "learning_rate": 3.8463399308238e-06, "loss": 0.63994879, "num_input_tokens_seen": 26780990, "step": 1263, "time_per_iteration": 3.1565325260162354 }, { "auxiliary_loss_clip": 0.01216945, "auxiliary_loss_mlp": 0.01040488, "balance_loss_clip": 1.06655836, "balance_loss_mlp": 1.02850199, "epoch": 0.15198701376781099, "flos": 32669696448000.0, "grad_norm": 1.950180375412627, "language_loss": 0.63794374, "learning_rate": 3.846040360312402e-06, "loss": 0.66051811, "num_input_tokens_seen": 26804250, "step": 1264, "time_per_iteration": 2.6542649269104004 }, { "auxiliary_loss_clip": 0.01232222, "auxiliary_loss_mlp": 0.01042431, "balance_loss_clip": 1.06537294, "balance_loss_mlp": 1.03154683, "epoch": 0.15210725665845007, "flos": 28402575431040.0, "grad_norm": 5.505775451890121, "language_loss": 0.81541437, "learning_rate": 3.8457405097581485e-06, "loss": 0.83816087, "num_input_tokens_seen": 26823240, "step": 1265, "time_per_iteration": 2.6206254959106445 }, { "auxiliary_loss_clip": 0.01172616, "auxiliary_loss_mlp": 0.01040508, "balance_loss_clip": 1.05505085, "balance_loss_mlp": 1.0293622, "epoch": 0.15222749954908915, "flos": 19938179393280.0, "grad_norm": 2.0813915623907375, "language_loss": 0.77849263, "learning_rate": 3.8454403792065275e-06, "loss": 0.80062389, "num_input_tokens_seen": 26842060, "step": 1266, "time_per_iteration": 2.6828672885894775 }, { "auxiliary_loss_clip": 0.01175289, "auxiliary_loss_mlp": 0.01048096, "balance_loss_clip": 1.05693722, "balance_loss_mlp": 1.03721261, "epoch": 0.15234774243972826, "flos": 21324223451520.0, "grad_norm": 1.9520953429284653, "language_loss": 0.85729051, "learning_rate": 3.845139968703068e-06, "loss": 0.87952435, "num_input_tokens_seen": 26859580, "step": 1267, "time_per_iteration": 2.5823848247528076 }, { "auxiliary_loss_clip": 0.01169603, "auxiliary_loss_mlp": 0.01044353, "balance_loss_clip": 1.0563761, "balance_loss_mlp": 1.03303993, "epoch": 0.15246798533036734, "flos": 25957812977280.0, "grad_norm": 1.9090533275148713, "language_loss": 0.82801747, "learning_rate": 3.844839278293342e-06, "loss": 0.85015702, "num_input_tokens_seen": 26880430, "step": 1268, "time_per_iteration": 2.7268917560577393 }, { "auxiliary_loss_clip": 0.01236778, "auxiliary_loss_mlp": 0.0103976, "balance_loss_clip": 1.0693264, "balance_loss_mlp": 1.02874529, "epoch": 0.15258822822100643, "flos": 25811907932160.0, "grad_norm": 2.8893714409464484, "language_loss": 0.7668348, "learning_rate": 3.8445383080229654e-06, "loss": 0.78960019, "num_input_tokens_seen": 26896445, "step": 1269, "time_per_iteration": 2.538482189178467 }, { "auxiliary_loss_clip": 0.01194403, "auxiliary_loss_mlp": 0.01034566, "balance_loss_clip": 1.05814433, "balance_loss_mlp": 1.02336049, "epoch": 0.1527084711116455, "flos": 25265455349760.0, "grad_norm": 2.1131678030585768, "language_loss": 0.7376042, "learning_rate": 3.844237057937593e-06, "loss": 0.75989389, "num_input_tokens_seen": 26915450, "step": 1270, "time_per_iteration": 2.585883378982544 }, { "auxiliary_loss_clip": 0.0122409, "auxiliary_loss_mlp": 0.0103638, "balance_loss_clip": 1.06435156, "balance_loss_mlp": 1.02497745, "epoch": 0.15282871400228462, "flos": 29240227572480.0, "grad_norm": 2.400238039857949, "language_loss": 0.77860111, "learning_rate": 3.843935528082926e-06, "loss": 0.80120587, "num_input_tokens_seen": 26936475, "step": 1271, "time_per_iteration": 2.593721628189087 }, { "auxiliary_loss_clip": 0.01218084, "auxiliary_loss_mlp": 0.01033306, "balance_loss_clip": 1.06450868, "balance_loss_mlp": 1.0230782, "epoch": 0.1529489568929237, "flos": 20882952869760.0, "grad_norm": 1.8662503511665753, "language_loss": 0.8498466, "learning_rate": 3.843633718504704e-06, "loss": 0.87236059, "num_input_tokens_seen": 26954920, "step": 1272, "time_per_iteration": 2.5233373641967773 }, { "auxiliary_loss_clip": 0.01184753, "auxiliary_loss_mlp": 0.01037267, "balance_loss_clip": 1.05964136, "balance_loss_mlp": 1.0268724, "epoch": 0.1530691997835628, "flos": 20083833043200.0, "grad_norm": 2.500035629287759, "language_loss": 0.90378654, "learning_rate": 3.843331629248715e-06, "loss": 0.92600679, "num_input_tokens_seen": 26972520, "step": 1273, "time_per_iteration": 2.6000401973724365 }, { "auxiliary_loss_clip": 0.01234526, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.06784344, "balance_loss_mlp": 1.02362239, "epoch": 0.1531894426742019, "flos": 28759814144640.0, "grad_norm": 2.3991455516462037, "language_loss": 0.76452708, "learning_rate": 3.843029260360782e-06, "loss": 0.78721005, "num_input_tokens_seen": 26990890, "step": 1274, "time_per_iteration": 3.3469324111938477 }, { "auxiliary_loss_clip": 0.01218224, "auxiliary_loss_mlp": 0.01045229, "balance_loss_clip": 1.06559873, "balance_loss_mlp": 1.03507292, "epoch": 0.15330968556484098, "flos": 22236282616320.0, "grad_norm": 2.5379867313463187, "language_loss": 0.79083681, "learning_rate": 3.8427266118867755e-06, "loss": 0.81347132, "num_input_tokens_seen": 27010640, "step": 1275, "time_per_iteration": 2.564117908477783 }, { "auxiliary_loss_clip": 0.01201257, "auxiliary_loss_mlp": 0.01036847, "balance_loss_clip": 1.06319606, "balance_loss_mlp": 1.02596307, "epoch": 0.15342992845548006, "flos": 27527504296320.0, "grad_norm": 2.1819681684979897, "language_loss": 0.82657218, "learning_rate": 3.842423683872608e-06, "loss": 0.84895325, "num_input_tokens_seen": 27031215, "step": 1276, "time_per_iteration": 2.5950303077697754 }, { "auxiliary_loss_clip": 0.01216163, "auxiliary_loss_mlp": 0.01043493, "balance_loss_clip": 1.0629673, "balance_loss_mlp": 1.03274584, "epoch": 0.15355017134611917, "flos": 19609596754560.0, "grad_norm": 2.445758739801298, "language_loss": 0.77784663, "learning_rate": 3.842120476364232e-06, "loss": 0.80044317, "num_input_tokens_seen": 27049665, "step": 1277, "time_per_iteration": 2.517185688018799 }, { "auxiliary_loss_clip": 0.01222064, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.06284201, "balance_loss_mlp": 1.02114904, "epoch": 0.15367041423675826, "flos": 18478590238080.0, "grad_norm": 2.086664832911439, "language_loss": 0.83740085, "learning_rate": 3.841816989407644e-06, "loss": 0.85993969, "num_input_tokens_seen": 27065155, "step": 1278, "time_per_iteration": 2.4948816299438477 }, { "auxiliary_loss_clip": 0.0118492, "auxiliary_loss_mlp": 0.01046029, "balance_loss_clip": 1.06155968, "balance_loss_mlp": 1.03506184, "epoch": 0.15379065712739734, "flos": 41427662342400.0, "grad_norm": 1.9984647715335626, "language_loss": 0.7709744, "learning_rate": 3.841513223048884e-06, "loss": 0.79328394, "num_input_tokens_seen": 27085840, "step": 1279, "time_per_iteration": 2.7593600749969482 }, { "auxiliary_loss_clip": 0.01182383, "auxiliary_loss_mlp": 0.0103826, "balance_loss_clip": 1.05660594, "balance_loss_mlp": 1.02737617, "epoch": 0.15391090001803642, "flos": 22054215553920.0, "grad_norm": 2.4294632891232535, "language_loss": 0.78472275, "learning_rate": 3.841209177334031e-06, "loss": 0.80692917, "num_input_tokens_seen": 27104200, "step": 1280, "time_per_iteration": 3.4639816284179688 }, { "auxiliary_loss_clip": 0.01213993, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.06417727, "balance_loss_mlp": 1.02682781, "epoch": 0.15403114290867553, "flos": 15450351258240.0, "grad_norm": 2.066657805289406, "language_loss": 0.74962389, "learning_rate": 3.84090485230921e-06, "loss": 0.7721377, "num_input_tokens_seen": 27122440, "step": 1281, "time_per_iteration": 3.363729238510132 }, { "auxiliary_loss_clip": 0.01233154, "auxiliary_loss_mlp": 0.0103607, "balance_loss_clip": 1.06654787, "balance_loss_mlp": 1.02524614, "epoch": 0.15415138579931462, "flos": 17929156826880.0, "grad_norm": 3.197919723147775, "language_loss": 0.76139849, "learning_rate": 3.840600248020588e-06, "loss": 0.78409076, "num_input_tokens_seen": 27139380, "step": 1282, "time_per_iteration": 2.443704843521118 }, { "auxiliary_loss_clip": 0.01206283, "auxiliary_loss_mlp": 0.01047181, "balance_loss_clip": 1.05950594, "balance_loss_mlp": 1.0359571, "epoch": 0.1542716286899537, "flos": 11429325296640.0, "grad_norm": 2.218352223811508, "language_loss": 0.79768419, "learning_rate": 3.840295364514371e-06, "loss": 0.82021886, "num_input_tokens_seen": 27156760, "step": 1283, "time_per_iteration": 2.545259952545166 }, { "auxiliary_loss_clip": 0.01202859, "auxiliary_loss_mlp": 0.01041116, "balance_loss_clip": 1.06284356, "balance_loss_mlp": 1.02999425, "epoch": 0.1543918715805928, "flos": 17420338719360.0, "grad_norm": 2.3094999921301653, "language_loss": 0.7854867, "learning_rate": 3.83999020183681e-06, "loss": 0.80792642, "num_input_tokens_seen": 27175455, "step": 1284, "time_per_iteration": 2.6204116344451904 }, { "auxiliary_loss_clip": 0.01146617, "auxiliary_loss_mlp": 0.01043022, "balance_loss_clip": 1.05383277, "balance_loss_mlp": 1.03269887, "epoch": 0.1545121144712319, "flos": 17786376264960.0, "grad_norm": 2.061416944553707, "language_loss": 0.78716886, "learning_rate": 3.839684760034199e-06, "loss": 0.80906522, "num_input_tokens_seen": 27193660, "step": 1285, "time_per_iteration": 2.7224276065826416 }, { "auxiliary_loss_clip": 0.01181168, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.05995119, "balance_loss_mlp": 1.02752066, "epoch": 0.15463235736187098, "flos": 28220185146240.0, "grad_norm": 2.1749821451854863, "language_loss": 0.65758193, "learning_rate": 3.8393790391528716e-06, "loss": 0.67978358, "num_input_tokens_seen": 27214355, "step": 1286, "time_per_iteration": 2.6035068035125732 }, { "auxiliary_loss_clip": 0.01198753, "auxiliary_loss_mlp": 0.01037027, "balance_loss_clip": 1.05927157, "balance_loss_mlp": 1.02650094, "epoch": 0.15475260025251006, "flos": 22856890826880.0, "grad_norm": 2.0392908280263033, "language_loss": 0.89314538, "learning_rate": 3.8390730392392075e-06, "loss": 0.9155032, "num_input_tokens_seen": 27234335, "step": 1287, "time_per_iteration": 2.540876626968384 }, { "auxiliary_loss_clip": 0.01234376, "auxiliary_loss_mlp": 0.01036589, "balance_loss_clip": 1.06759024, "balance_loss_mlp": 1.02630162, "epoch": 0.15487284314314917, "flos": 17602872658560.0, "grad_norm": 3.9021632639204054, "language_loss": 0.79294878, "learning_rate": 3.838766760339626e-06, "loss": 0.81565845, "num_input_tokens_seen": 27252860, "step": 1288, "time_per_iteration": 2.4512548446655273 }, { "auxiliary_loss_clip": 0.01166882, "auxiliary_loss_mlp": 0.01039393, "balance_loss_clip": 1.05561733, "balance_loss_mlp": 1.02815199, "epoch": 0.15499308603378825, "flos": 20082037363200.0, "grad_norm": 2.477251727838228, "language_loss": 0.79087478, "learning_rate": 3.838460202500587e-06, "loss": 0.8129375, "num_input_tokens_seen": 27268650, "step": 1289, "time_per_iteration": 2.575636863708496 }, { "auxiliary_loss_clip": 0.01181334, "auxiliary_loss_mlp": 0.01034357, "balance_loss_clip": 1.06307626, "balance_loss_mlp": 1.02257359, "epoch": 0.15511332892442733, "flos": 15918051271680.0, "grad_norm": 2.5256556364037914, "language_loss": 0.74148095, "learning_rate": 3.838153365768599e-06, "loss": 0.76363784, "num_input_tokens_seen": 27285160, "step": 1290, "time_per_iteration": 2.6385819911956787 }, { "auxiliary_loss_clip": 0.01185779, "auxiliary_loss_mlp": 0.01050897, "balance_loss_clip": 1.0661453, "balance_loss_mlp": 1.03948307, "epoch": 0.15523357181506645, "flos": 41282475569280.0, "grad_norm": 2.4787875437808387, "language_loss": 0.7563808, "learning_rate": 3.837846250190206e-06, "loss": 0.77874762, "num_input_tokens_seen": 27308025, "step": 1291, "time_per_iteration": 2.7535786628723145 }, { "auxiliary_loss_clip": 0.01164503, "auxiliary_loss_mlp": 0.0076618, "balance_loss_clip": 1.05694509, "balance_loss_mlp": 1.00158119, "epoch": 0.15535381470570553, "flos": 18478769806080.0, "grad_norm": 1.9988032328664185, "language_loss": 0.76965094, "learning_rate": 3.837538855811998e-06, "loss": 0.78895772, "num_input_tokens_seen": 27326200, "step": 1292, "time_per_iteration": 2.5844223499298096 }, { "auxiliary_loss_clip": 0.01209864, "auxiliary_loss_mlp": 0.01043264, "balance_loss_clip": 1.06541908, "balance_loss_mlp": 1.03286862, "epoch": 0.1554740575963446, "flos": 13918150759680.0, "grad_norm": 2.072007059468015, "language_loss": 0.71081936, "learning_rate": 3.837231182680606e-06, "loss": 0.73335069, "num_input_tokens_seen": 27344165, "step": 1293, "time_per_iteration": 2.5352165699005127 }, { "auxiliary_loss_clip": 0.01225103, "auxiliary_loss_mlp": 0.01037405, "balance_loss_clip": 1.06754553, "balance_loss_mlp": 1.02632999, "epoch": 0.1555943004869837, "flos": 20847078161280.0, "grad_norm": 1.6758742727645968, "language_loss": 0.75904238, "learning_rate": 3.836923230842706e-06, "loss": 0.78166747, "num_input_tokens_seen": 27363280, "step": 1294, "time_per_iteration": 2.5206072330474854 }, { "auxiliary_loss_clip": 0.01171255, "auxiliary_loss_mlp": 0.01040005, "balance_loss_clip": 1.05404019, "balance_loss_mlp": 1.02873349, "epoch": 0.1557145433776228, "flos": 22085888371200.0, "grad_norm": 1.8952306131757557, "language_loss": 0.81068122, "learning_rate": 3.836615000345011e-06, "loss": 0.83279383, "num_input_tokens_seen": 27381460, "step": 1295, "time_per_iteration": 2.6306960582733154 }, { "auxiliary_loss_clip": 0.01229092, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.06455302, "balance_loss_mlp": 1.02669811, "epoch": 0.1558347862682619, "flos": 19791987039360.0, "grad_norm": 1.9693740919876468, "language_loss": 0.77962816, "learning_rate": 3.836306491234282e-06, "loss": 0.8022846, "num_input_tokens_seen": 27399310, "step": 1296, "time_per_iteration": 2.4918596744537354 }, { "auxiliary_loss_clip": 0.01197237, "auxiliary_loss_mlp": 0.01040416, "balance_loss_clip": 1.06636786, "balance_loss_mlp": 1.03065872, "epoch": 0.15595502915890097, "flos": 17237086508160.0, "grad_norm": 2.256355953925597, "language_loss": 0.75914252, "learning_rate": 3.835997703557317e-06, "loss": 0.78151906, "num_input_tokens_seen": 27416050, "step": 1297, "time_per_iteration": 2.54742431640625 }, { "auxiliary_loss_clip": 0.01169121, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.05299473, "balance_loss_mlp": 1.02845311, "epoch": 0.15607527204954008, "flos": 19719519350400.0, "grad_norm": 1.645801302749267, "language_loss": 0.80161953, "learning_rate": 3.83568863736096e-06, "loss": 0.82369852, "num_input_tokens_seen": 27434920, "step": 1298, "time_per_iteration": 2.6482622623443604 }, { "auxiliary_loss_clip": 0.01187663, "auxiliary_loss_mlp": 0.01037645, "balance_loss_clip": 1.05865836, "balance_loss_mlp": 1.02741694, "epoch": 0.15619551494017916, "flos": 18515650095360.0, "grad_norm": 2.8277417804540192, "language_loss": 0.89143074, "learning_rate": 3.8353792926920975e-06, "loss": 0.91368371, "num_input_tokens_seen": 27453570, "step": 1299, "time_per_iteration": 2.560147285461426 }, { "auxiliary_loss_clip": 0.01225637, "auxiliary_loss_mlp": 0.01045142, "balance_loss_clip": 1.0686295, "balance_loss_mlp": 1.03401339, "epoch": 0.15631575783081825, "flos": 19902125116800.0, "grad_norm": 2.1531110795537747, "language_loss": 0.81672341, "learning_rate": 3.835069669597655e-06, "loss": 0.83943117, "num_input_tokens_seen": 27471960, "step": 1300, "time_per_iteration": 3.353548526763916 }, { "auxiliary_loss_clip": 0.01222735, "auxiliary_loss_mlp": 0.00766352, "balance_loss_clip": 1.0662626, "balance_loss_mlp": 1.0013653, "epoch": 0.15643600072145733, "flos": 20777663128320.0, "grad_norm": 2.1402735527529555, "language_loss": 0.79648507, "learning_rate": 3.834759768124603e-06, "loss": 0.81637597, "num_input_tokens_seen": 27490835, "step": 1301, "time_per_iteration": 2.532723903656006 }, { "auxiliary_loss_clip": 0.01192352, "auxiliary_loss_mlp": 0.01035551, "balance_loss_clip": 1.06492293, "balance_loss_mlp": 1.0251143, "epoch": 0.15655624361209644, "flos": 18546389159040.0, "grad_norm": 2.223335978249787, "language_loss": 0.76437336, "learning_rate": 3.834449588319953e-06, "loss": 0.78665245, "num_input_tokens_seen": 27508870, "step": 1302, "time_per_iteration": 2.549192190170288 }, { "auxiliary_loss_clip": 0.01215807, "auxiliary_loss_mlp": 0.01040587, "balance_loss_clip": 1.06791782, "balance_loss_mlp": 1.0304544, "epoch": 0.15667648650273552, "flos": 25229544727680.0, "grad_norm": 1.7577237529635878, "language_loss": 0.85125482, "learning_rate": 3.834139130230758e-06, "loss": 0.87381887, "num_input_tokens_seen": 27528175, "step": 1303, "time_per_iteration": 2.5474674701690674 }, { "auxiliary_loss_clip": 0.01204918, "auxiliary_loss_mlp": 0.01039047, "balance_loss_clip": 1.06090903, "balance_loss_mlp": 1.02859271, "epoch": 0.1567967293933746, "flos": 24827093769600.0, "grad_norm": 1.8089787601871836, "language_loss": 0.81391156, "learning_rate": 3.833828393904117e-06, "loss": 0.83635116, "num_input_tokens_seen": 27548455, "step": 1304, "time_per_iteration": 2.595574378967285 }, { "auxiliary_loss_clip": 0.01166488, "auxiliary_loss_mlp": 0.01033154, "balance_loss_clip": 1.05539382, "balance_loss_mlp": 1.02206767, "epoch": 0.15691697228401372, "flos": 19164555244800.0, "grad_norm": 2.3217233323079047, "language_loss": 0.77731085, "learning_rate": 3.833517379387165e-06, "loss": 0.79930729, "num_input_tokens_seen": 27564910, "step": 1305, "time_per_iteration": 2.5775556564331055 }, { "auxiliary_loss_clip": 0.01222565, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.06769645, "balance_loss_mlp": 1.03103673, "epoch": 0.1570372151746528, "flos": 24790931752320.0, "grad_norm": 3.092072864454218, "language_loss": 0.88825744, "learning_rate": 3.833206086727085e-06, "loss": 0.91090059, "num_input_tokens_seen": 27584260, "step": 1306, "time_per_iteration": 2.5487661361694336 }, { "auxiliary_loss_clip": 0.01190021, "auxiliary_loss_mlp": 0.01037477, "balance_loss_clip": 1.05850053, "balance_loss_mlp": 1.02743971, "epoch": 0.15715745806529188, "flos": 24863650836480.0, "grad_norm": 1.963992388615502, "language_loss": 0.70863497, "learning_rate": 3.8328945159710994e-06, "loss": 0.73090994, "num_input_tokens_seen": 27604440, "step": 1307, "time_per_iteration": 4.979633331298828 }, { "auxiliary_loss_clip": 0.01226948, "auxiliary_loss_mlp": 0.00765368, "balance_loss_clip": 1.06996155, "balance_loss_mlp": 1.00123274, "epoch": 0.157277700955931, "flos": 21872148491520.0, "grad_norm": 2.3597321218678435, "language_loss": 0.88840675, "learning_rate": 3.832582667166473e-06, "loss": 0.90832996, "num_input_tokens_seen": 27624250, "step": 1308, "time_per_iteration": 2.510880947113037 }, { "auxiliary_loss_clip": 0.01205869, "auxiliary_loss_mlp": 0.01038475, "balance_loss_clip": 1.06444168, "balance_loss_mlp": 1.02699482, "epoch": 0.15739794384657008, "flos": 24533344344960.0, "grad_norm": 1.741141653743624, "language_loss": 0.82020366, "learning_rate": 3.8322705403605125e-06, "loss": 0.84264708, "num_input_tokens_seen": 27644595, "step": 1309, "time_per_iteration": 2.5459489822387695 }, { "auxiliary_loss_clip": 0.01195501, "auxiliary_loss_mlp": 0.01037319, "balance_loss_clip": 1.06246519, "balance_loss_mlp": 1.02796674, "epoch": 0.15751818673720916, "flos": 17745329998080.0, "grad_norm": 2.433016627240624, "language_loss": 0.80914116, "learning_rate": 3.831958135600568e-06, "loss": 0.8314693, "num_input_tokens_seen": 27662145, "step": 1310, "time_per_iteration": 2.481736421585083 }, { "auxiliary_loss_clip": 0.01218928, "auxiliary_loss_mlp": 0.01034927, "balance_loss_clip": 1.06626499, "balance_loss_mlp": 1.02568829, "epoch": 0.15763842962784824, "flos": 17858520731520.0, "grad_norm": 1.9674667067269547, "language_loss": 0.79463673, "learning_rate": 3.831645452934032e-06, "loss": 0.81717527, "num_input_tokens_seen": 27680575, "step": 1311, "time_per_iteration": 2.461761474609375 }, { "auxiliary_loss_clip": 0.0123525, "auxiliary_loss_mlp": 0.01044459, "balance_loss_clip": 1.07023692, "balance_loss_mlp": 1.03440952, "epoch": 0.15775867251848735, "flos": 26980908059520.0, "grad_norm": 1.8567165983109104, "language_loss": 0.80104721, "learning_rate": 3.831332492408336e-06, "loss": 0.82384431, "num_input_tokens_seen": 27701985, "step": 1312, "time_per_iteration": 2.5449059009552 }, { "auxiliary_loss_clip": 0.01199619, "auxiliary_loss_mlp": 0.01030943, "balance_loss_clip": 1.06210828, "balance_loss_mlp": 1.0205543, "epoch": 0.15787891540912644, "flos": 19240398812160.0, "grad_norm": 2.0189939879799477, "language_loss": 0.69460416, "learning_rate": 3.831019254070957e-06, "loss": 0.71690983, "num_input_tokens_seen": 27719770, "step": 1313, "time_per_iteration": 2.5115060806274414 }, { "auxiliary_loss_clip": 0.01175294, "auxiliary_loss_mlp": 0.01033725, "balance_loss_clip": 1.05974579, "balance_loss_mlp": 1.02364564, "epoch": 0.15799915829976552, "flos": 27271102037760.0, "grad_norm": 2.720418961676346, "language_loss": 0.95244241, "learning_rate": 3.8307057379694135e-06, "loss": 0.9745326, "num_input_tokens_seen": 27739105, "step": 1314, "time_per_iteration": 2.6371397972106934 }, { "auxiliary_loss_clip": 0.01231751, "auxiliary_loss_mlp": 0.01041856, "balance_loss_clip": 1.06557298, "balance_loss_mlp": 1.03184867, "epoch": 0.15811940119040463, "flos": 20405520270720.0, "grad_norm": 2.0725347502638236, "language_loss": 0.82151711, "learning_rate": 3.830391944151264e-06, "loss": 0.84425312, "num_input_tokens_seen": 27754985, "step": 1315, "time_per_iteration": 2.4328408241271973 }, { "auxiliary_loss_clip": 0.01200777, "auxiliary_loss_mlp": 0.01043284, "balance_loss_clip": 1.06002641, "balance_loss_mlp": 1.03357458, "epoch": 0.1582396440810437, "flos": 32599347661440.0, "grad_norm": 1.932466968115719, "language_loss": 0.67339623, "learning_rate": 3.830077872664114e-06, "loss": 0.6958369, "num_input_tokens_seen": 27776110, "step": 1316, "time_per_iteration": 2.6023385524749756 }, { "auxiliary_loss_clip": 0.01154208, "auxiliary_loss_mlp": 0.01042564, "balance_loss_clip": 1.05398107, "balance_loss_mlp": 1.03291392, "epoch": 0.1583598869716828, "flos": 33800559310080.0, "grad_norm": 1.7667233206382023, "language_loss": 0.72862113, "learning_rate": 3.829763523555604e-06, "loss": 0.75058889, "num_input_tokens_seen": 27796510, "step": 1317, "time_per_iteration": 2.7155137062072754 }, { "auxiliary_loss_clip": 0.01211408, "auxiliary_loss_mlp": 0.01035761, "balance_loss_clip": 1.06838012, "balance_loss_mlp": 1.02619505, "epoch": 0.15848012986232188, "flos": 24681332378880.0, "grad_norm": 2.114874555651769, "language_loss": 0.77905083, "learning_rate": 3.829448896873423e-06, "loss": 0.80152249, "num_input_tokens_seen": 27815610, "step": 1318, "time_per_iteration": 2.543682813644409 }, { "auxiliary_loss_clip": 0.01159997, "auxiliary_loss_mlp": 0.00765157, "balance_loss_clip": 1.06019151, "balance_loss_mlp": 1.00129008, "epoch": 0.158600372752961, "flos": 22602068766720.0, "grad_norm": 1.7690122907861627, "language_loss": 0.79127955, "learning_rate": 3.829133992665299e-06, "loss": 0.81053114, "num_input_tokens_seen": 27834735, "step": 1319, "time_per_iteration": 2.610743761062622 }, { "auxiliary_loss_clip": 0.01205086, "auxiliary_loss_mlp": 0.01038033, "balance_loss_clip": 1.06355608, "balance_loss_mlp": 1.02825737, "epoch": 0.15872061564360007, "flos": 27927944092800.0, "grad_norm": 2.1452207925442734, "language_loss": 0.88678765, "learning_rate": 3.828818810979002e-06, "loss": 0.90921885, "num_input_tokens_seen": 27853065, "step": 1320, "time_per_iteration": 2.5379014015197754 }, { "auxiliary_loss_clip": 0.01232221, "auxiliary_loss_mlp": 0.01041007, "balance_loss_clip": 1.07075691, "balance_loss_mlp": 1.0312196, "epoch": 0.15884085853423915, "flos": 23696805525120.0, "grad_norm": 1.8274815272764542, "language_loss": 0.80584955, "learning_rate": 3.8285033518623454e-06, "loss": 0.82858193, "num_input_tokens_seen": 27873315, "step": 1321, "time_per_iteration": 2.4854278564453125 }, { "auxiliary_loss_clip": 0.01221472, "auxiliary_loss_mlp": 0.01040578, "balance_loss_clip": 1.06742227, "balance_loss_mlp": 1.02958679, "epoch": 0.15896110142487826, "flos": 23112359331840.0, "grad_norm": 2.692626492663524, "language_loss": 0.81532472, "learning_rate": 3.8281876153631845e-06, "loss": 0.83794522, "num_input_tokens_seen": 27890070, "step": 1322, "time_per_iteration": 2.507112979888916 }, { "auxiliary_loss_clip": 0.01166667, "auxiliary_loss_mlp": 0.0104213, "balance_loss_clip": 1.05711365, "balance_loss_mlp": 1.03118706, "epoch": 0.15908134431551735, "flos": 14685238632960.0, "grad_norm": 1.883383624501825, "language_loss": 0.64697874, "learning_rate": 3.827871601529416e-06, "loss": 0.66906679, "num_input_tokens_seen": 27908590, "step": 1323, "time_per_iteration": 2.5850954055786133 }, { "auxiliary_loss_clip": 0.01179731, "auxiliary_loss_mlp": 0.01040591, "balance_loss_clip": 1.05840039, "balance_loss_mlp": 1.0305593, "epoch": 0.15920158720615643, "flos": 20193611984640.0, "grad_norm": 1.7632585409258437, "language_loss": 0.80515623, "learning_rate": 3.827555310408979e-06, "loss": 0.82735944, "num_input_tokens_seen": 27927985, "step": 1324, "time_per_iteration": 2.5570547580718994 }, { "auxiliary_loss_clip": 0.01181059, "auxiliary_loss_mlp": 0.01038098, "balance_loss_clip": 1.06348395, "balance_loss_mlp": 1.02760148, "epoch": 0.1593218300967955, "flos": 24826626892800.0, "grad_norm": 1.7186704152473238, "language_loss": 0.82870293, "learning_rate": 3.827238742049854e-06, "loss": 0.85089451, "num_input_tokens_seen": 27948280, "step": 1325, "time_per_iteration": 2.654414415359497 }, { "auxiliary_loss_clip": 0.01229299, "auxiliary_loss_mlp": 0.01037082, "balance_loss_clip": 1.06540775, "balance_loss_mlp": 1.02651358, "epoch": 0.15944207298743462, "flos": 28328707111680.0, "grad_norm": 1.949110916574243, "language_loss": 0.52068645, "learning_rate": 3.826921896500066e-06, "loss": 0.54335022, "num_input_tokens_seen": 27969565, "step": 1326, "time_per_iteration": 2.5206069946289062 }, { "auxiliary_loss_clip": 0.01190421, "auxiliary_loss_mlp": 0.01036531, "balance_loss_clip": 1.06250894, "balance_loss_mlp": 1.02545023, "epoch": 0.1595623158780737, "flos": 22964838174720.0, "grad_norm": 2.819476890216086, "language_loss": 0.7808429, "learning_rate": 3.826604773807678e-06, "loss": 0.80311245, "num_input_tokens_seen": 27987540, "step": 1327, "time_per_iteration": 3.3958075046539307 }, { "auxiliary_loss_clip": 0.01196276, "auxiliary_loss_mlp": 0.01035758, "balance_loss_clip": 1.05713904, "balance_loss_mlp": 1.02485597, "epoch": 0.1596825587687128, "flos": 19710540950400.0, "grad_norm": 2.73269049048548, "language_loss": 0.73612595, "learning_rate": 3.826287374020798e-06, "loss": 0.75844634, "num_input_tokens_seen": 28002345, "step": 1328, "time_per_iteration": 2.5247321128845215 }, { "auxiliary_loss_clip": 0.01231704, "auxiliary_loss_mlp": 0.01036938, "balance_loss_clip": 1.06858444, "balance_loss_mlp": 1.02735353, "epoch": 0.1598028016593519, "flos": 22637727993600.0, "grad_norm": 2.068608124183978, "language_loss": 0.82837182, "learning_rate": 3.825969697187575e-06, "loss": 0.85105824, "num_input_tokens_seen": 28021675, "step": 1329, "time_per_iteration": 2.4613306522369385 }, { "auxiliary_loss_clip": 0.01181687, "auxiliary_loss_mlp": 0.01031509, "balance_loss_clip": 1.05808735, "balance_loss_mlp": 1.02122116, "epoch": 0.15992304454999098, "flos": 20482908122880.0, "grad_norm": 1.8308811204645985, "language_loss": 0.69524753, "learning_rate": 3.8256517433562015e-06, "loss": 0.71737945, "num_input_tokens_seen": 28039615, "step": 1330, "time_per_iteration": 2.5553975105285645 }, { "auxiliary_loss_clip": 0.01227855, "auxiliary_loss_mlp": 0.01036025, "balance_loss_clip": 1.06534481, "balance_loss_mlp": 1.02704883, "epoch": 0.16004328744063007, "flos": 17676094533120.0, "grad_norm": 2.7546571020653694, "language_loss": 0.91858697, "learning_rate": 3.82533351257491e-06, "loss": 0.94122577, "num_input_tokens_seen": 28057565, "step": 1331, "time_per_iteration": 2.4413905143737793 }, { "auxiliary_loss_clip": 0.0121539, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.06783652, "balance_loss_mlp": 1.02681303, "epoch": 0.16016353033126918, "flos": 24098717779200.0, "grad_norm": 1.7333625243184108, "language_loss": 0.88734305, "learning_rate": 3.825015004891975e-06, "loss": 0.90986061, "num_input_tokens_seen": 28076305, "step": 1332, "time_per_iteration": 2.5300896167755127 }, { "auxiliary_loss_clip": 0.01211026, "auxiliary_loss_mlp": 0.01030573, "balance_loss_clip": 1.06296003, "balance_loss_mlp": 1.02062535, "epoch": 0.16028377322190826, "flos": 27634841112960.0, "grad_norm": 1.8013514686793173, "language_loss": 0.7596792, "learning_rate": 3.824696220355716e-06, "loss": 0.78209519, "num_input_tokens_seen": 28097895, "step": 1333, "time_per_iteration": 2.5350000858306885 }, { "auxiliary_loss_clip": 0.01196898, "auxiliary_loss_mlp": 0.01044124, "balance_loss_clip": 1.06147027, "balance_loss_mlp": 1.03393137, "epoch": 0.16040401611254734, "flos": 20961202648320.0, "grad_norm": 1.5950696886137068, "language_loss": 0.78940523, "learning_rate": 3.824377159014491e-06, "loss": 0.81181544, "num_input_tokens_seen": 28118790, "step": 1334, "time_per_iteration": 3.5115861892700195 }, { "auxiliary_loss_clip": 0.01211728, "auxiliary_loss_mlp": 0.01036307, "balance_loss_clip": 1.06505656, "balance_loss_mlp": 1.02653742, "epoch": 0.16052425900318643, "flos": 21247051080960.0, "grad_norm": 2.7081115425188123, "language_loss": 0.85111851, "learning_rate": 3.824057820916702e-06, "loss": 0.87359881, "num_input_tokens_seen": 28135995, "step": 1335, "time_per_iteration": 2.549170970916748 }, { "auxiliary_loss_clip": 0.01201159, "auxiliary_loss_mlp": 0.01031602, "balance_loss_clip": 1.06250501, "balance_loss_mlp": 1.02070642, "epoch": 0.16064450189382554, "flos": 15524004096000.0, "grad_norm": 5.70837034613212, "language_loss": 0.71691126, "learning_rate": 3.8237382061107904e-06, "loss": 0.73923886, "num_input_tokens_seen": 28152715, "step": 1336, "time_per_iteration": 2.482983112335205 }, { "auxiliary_loss_clip": 0.01125025, "auxiliary_loss_mlp": 0.01045104, "balance_loss_clip": 1.04826617, "balance_loss_mlp": 1.03539991, "epoch": 0.16076474478446462, "flos": 21178497974400.0, "grad_norm": 1.9038784275590552, "language_loss": 0.78527892, "learning_rate": 3.823418314645243e-06, "loss": 0.80698025, "num_input_tokens_seen": 28171590, "step": 1337, "time_per_iteration": 2.6402690410614014 }, { "auxiliary_loss_clip": 0.01151947, "auxiliary_loss_mlp": 0.01040552, "balance_loss_clip": 1.05759811, "balance_loss_mlp": 1.03121805, "epoch": 0.1608849876751037, "flos": 18366476912640.0, "grad_norm": 2.2080051398346994, "language_loss": 0.75382489, "learning_rate": 3.823098146568588e-06, "loss": 0.77574986, "num_input_tokens_seen": 28191295, "step": 1338, "time_per_iteration": 2.5634398460388184 }, { "auxiliary_loss_clip": 0.01211793, "auxiliary_loss_mlp": 0.01037522, "balance_loss_clip": 1.06369948, "balance_loss_mlp": 1.02862263, "epoch": 0.1610052305657428, "flos": 29497024880640.0, "grad_norm": 1.6013405989881024, "language_loss": 0.71679163, "learning_rate": 3.822777701929394e-06, "loss": 0.73928481, "num_input_tokens_seen": 28213120, "step": 1339, "time_per_iteration": 2.5468263626098633 }, { "auxiliary_loss_clip": 0.01200914, "auxiliary_loss_mlp": 0.01041717, "balance_loss_clip": 1.05954421, "balance_loss_mlp": 1.03128624, "epoch": 0.1611254734563819, "flos": 26797871329920.0, "grad_norm": 1.8032768835029767, "language_loss": 0.73576581, "learning_rate": 3.8224569807762714e-06, "loss": 0.75819218, "num_input_tokens_seen": 28232440, "step": 1340, "time_per_iteration": 2.5195319652557373 }, { "auxiliary_loss_clip": 0.01147128, "auxiliary_loss_mlp": 0.01042173, "balance_loss_clip": 1.05072176, "balance_loss_mlp": 1.0315274, "epoch": 0.16124571634702098, "flos": 22419570741120.0, "grad_norm": 1.8950014598778249, "language_loss": 0.76562715, "learning_rate": 3.822135983157873e-06, "loss": 0.78752011, "num_input_tokens_seen": 28251715, "step": 1341, "time_per_iteration": 2.598649501800537 }, { "auxiliary_loss_clip": 0.01224813, "auxiliary_loss_mlp": 0.00765386, "balance_loss_clip": 1.06451225, "balance_loss_mlp": 1.00135827, "epoch": 0.16136595923766006, "flos": 10999116103680.0, "grad_norm": 2.945320386367759, "language_loss": 0.84666026, "learning_rate": 3.821814709122896e-06, "loss": 0.86656225, "num_input_tokens_seen": 28269765, "step": 1342, "time_per_iteration": 2.530895709991455 }, { "auxiliary_loss_clip": 0.01196426, "auxiliary_loss_mlp": 0.01036763, "balance_loss_clip": 1.06143856, "balance_loss_mlp": 1.02738142, "epoch": 0.16148620212829917, "flos": 21214983214080.0, "grad_norm": 2.1281212295278324, "language_loss": 0.84891212, "learning_rate": 3.821493158720076e-06, "loss": 0.87124407, "num_input_tokens_seen": 28288870, "step": 1343, "time_per_iteration": 2.579946517944336 }, { "auxiliary_loss_clip": 0.01181437, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.05649877, "balance_loss_mlp": 1.0229609, "epoch": 0.16160644501893826, "flos": 16758468760320.0, "grad_norm": 2.8835141669314845, "language_loss": 0.73465884, "learning_rate": 3.821171331998191e-06, "loss": 0.75681055, "num_input_tokens_seen": 28305400, "step": 1344, "time_per_iteration": 2.512822389602661 }, { "auxiliary_loss_clip": 0.01112276, "auxiliary_loss_mlp": 0.01009186, "balance_loss_clip": 1.04560614, "balance_loss_mlp": 1.0065279, "epoch": 0.16172668790957734, "flos": 64444967308800.0, "grad_norm": 0.9347021770854987, "language_loss": 0.5448283, "learning_rate": 3.820849229006064e-06, "loss": 0.5660429, "num_input_tokens_seen": 28373150, "step": 1345, "time_per_iteration": 3.2853260040283203 }, { "auxiliary_loss_clip": 0.01230609, "auxiliary_loss_mlp": 0.01033698, "balance_loss_clip": 1.0661428, "balance_loss_mlp": 1.0237323, "epoch": 0.16184693080021645, "flos": 23257689759360.0, "grad_norm": 2.5152504911007587, "language_loss": 0.70680386, "learning_rate": 3.8205268497925564e-06, "loss": 0.72944701, "num_input_tokens_seen": 28393620, "step": 1346, "time_per_iteration": 2.4712090492248535 }, { "auxiliary_loss_clip": 0.01229389, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.06678486, "balance_loss_mlp": 1.02845621, "epoch": 0.16196717369085553, "flos": 17451113696640.0, "grad_norm": 2.235659299754506, "language_loss": 0.78539979, "learning_rate": 3.8202041944065725e-06, "loss": 0.8080762, "num_input_tokens_seen": 28409440, "step": 1347, "time_per_iteration": 2.425034761428833 }, { "auxiliary_loss_clip": 0.01230157, "auxiliary_loss_mlp": 0.01036703, "balance_loss_clip": 1.06876183, "balance_loss_mlp": 1.02648044, "epoch": 0.16208741658149461, "flos": 23873377806720.0, "grad_norm": 1.7500583286342155, "language_loss": 0.73807669, "learning_rate": 3.819881262897061e-06, "loss": 0.76074529, "num_input_tokens_seen": 28427575, "step": 1348, "time_per_iteration": 2.4973952770233154 }, { "auxiliary_loss_clip": 0.01186739, "auxiliary_loss_mlp": 0.01034755, "balance_loss_clip": 1.06578946, "balance_loss_mlp": 1.02391839, "epoch": 0.1622076594721337, "flos": 25884806584320.0, "grad_norm": 1.884018408940885, "language_loss": 0.73562455, "learning_rate": 3.819558055313008e-06, "loss": 0.7578395, "num_input_tokens_seen": 28448260, "step": 1349, "time_per_iteration": 2.6526899337768555 }, { "auxiliary_loss_clip": 0.01218988, "auxiliary_loss_mlp": 0.01041079, "balance_loss_clip": 1.06480014, "balance_loss_mlp": 1.03117836, "epoch": 0.1623279023627728, "flos": 21539759011200.0, "grad_norm": 2.0984279190709736, "language_loss": 0.77566063, "learning_rate": 3.819234571703444e-06, "loss": 0.79826134, "num_input_tokens_seen": 28467085, "step": 1350, "time_per_iteration": 2.4966864585876465 }, { "auxiliary_loss_clip": 0.01204774, "auxiliary_loss_mlp": 0.01040655, "balance_loss_clip": 1.06003189, "balance_loss_mlp": 1.03030181, "epoch": 0.1624481452534119, "flos": 22085421494400.0, "grad_norm": 1.8600360865161258, "language_loss": 0.85695267, "learning_rate": 3.8189108121174435e-06, "loss": 0.87940693, "num_input_tokens_seen": 28486850, "step": 1351, "time_per_iteration": 2.5167431831359863 }, { "auxiliary_loss_clip": 0.01180566, "auxiliary_loss_mlp": 0.01039758, "balance_loss_clip": 1.06437624, "balance_loss_mlp": 1.0297029, "epoch": 0.16256838814405097, "flos": 27087490690560.0, "grad_norm": 1.8317684110019719, "language_loss": 0.83584058, "learning_rate": 3.818586776604118e-06, "loss": 0.85804385, "num_input_tokens_seen": 28507490, "step": 1352, "time_per_iteration": 2.5973947048187256 }, { "auxiliary_loss_clip": 0.01195985, "auxiliary_loss_mlp": 0.0104071, "balance_loss_clip": 1.06215477, "balance_loss_mlp": 1.03058279, "epoch": 0.16268863103469008, "flos": 20120354196480.0, "grad_norm": 1.8695855380481676, "language_loss": 0.61486769, "learning_rate": 3.818262465212625e-06, "loss": 0.63723469, "num_input_tokens_seen": 28527615, "step": 1353, "time_per_iteration": 2.514058828353882 }, { "auxiliary_loss_clip": 0.01205958, "auxiliary_loss_mlp": 0.01047808, "balance_loss_clip": 1.06521297, "balance_loss_mlp": 1.03678751, "epoch": 0.16280887392532917, "flos": 18332792933760.0, "grad_norm": 1.8389471092448109, "language_loss": 0.77340096, "learning_rate": 3.817937877992161e-06, "loss": 0.79593861, "num_input_tokens_seen": 28544910, "step": 1354, "time_per_iteration": 3.2517943382263184 }, { "auxiliary_loss_clip": 0.01183704, "auxiliary_loss_mlp": 0.00766408, "balance_loss_clip": 1.05734324, "balance_loss_mlp": 1.0012275, "epoch": 0.16292911681596825, "flos": 11874330892800.0, "grad_norm": 4.189629685814826, "language_loss": 0.85334384, "learning_rate": 3.817613014991967e-06, "loss": 0.87284499, "num_input_tokens_seen": 28561050, "step": 1355, "time_per_iteration": 2.5084424018859863 }, { "auxiliary_loss_clip": 0.01175051, "auxiliary_loss_mlp": 0.01035481, "balance_loss_clip": 1.05732155, "balance_loss_mlp": 1.02522874, "epoch": 0.16304935970660733, "flos": 26103466627200.0, "grad_norm": 2.079017450339228, "language_loss": 0.76704466, "learning_rate": 3.817287876261323e-06, "loss": 0.78915, "num_input_tokens_seen": 28581385, "step": 1356, "time_per_iteration": 2.577904462814331 }, { "auxiliary_loss_clip": 0.01194618, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.06390715, "balance_loss_mlp": 1.02274537, "epoch": 0.16316960259724644, "flos": 29351945848320.0, "grad_norm": 1.9203192633913142, "language_loss": 0.80038011, "learning_rate": 3.816962461849553e-06, "loss": 0.82266378, "num_input_tokens_seen": 28603255, "step": 1357, "time_per_iteration": 2.577784538269043 }, { "auxiliary_loss_clip": 0.01193545, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.06450117, "balance_loss_mlp": 1.02784228, "epoch": 0.16328984548788553, "flos": 20886759711360.0, "grad_norm": 1.9062865964148028, "language_loss": 0.84846306, "learning_rate": 3.8166367718060235e-06, "loss": 0.87078232, "num_input_tokens_seen": 28623145, "step": 1358, "time_per_iteration": 2.559354305267334 }, { "auxiliary_loss_clip": 0.01209237, "auxiliary_loss_mlp": 0.01034096, "balance_loss_clip": 1.06168008, "balance_loss_mlp": 1.02407002, "epoch": 0.1634100883785246, "flos": 18041090584320.0, "grad_norm": 2.463957080596287, "language_loss": 0.76440525, "learning_rate": 3.816310806180139e-06, "loss": 0.78683859, "num_input_tokens_seen": 28641555, "step": 1359, "time_per_iteration": 2.459226131439209 }, { "auxiliary_loss_clip": 0.01192513, "auxiliary_loss_mlp": 0.01042625, "balance_loss_clip": 1.0613699, "balance_loss_mlp": 1.03256989, "epoch": 0.16353033126916372, "flos": 24572128055040.0, "grad_norm": 1.6189070321330312, "language_loss": 0.8088994, "learning_rate": 3.81598456502135e-06, "loss": 0.83125079, "num_input_tokens_seen": 28661575, "step": 1360, "time_per_iteration": 3.468073844909668 }, { "auxiliary_loss_clip": 0.01196332, "auxiliary_loss_mlp": 0.01040531, "balance_loss_clip": 1.0652957, "balance_loss_mlp": 1.0299809, "epoch": 0.1636505741598028, "flos": 19892895321600.0, "grad_norm": 2.000039107262487, "language_loss": 0.87098432, "learning_rate": 3.8156580483791455e-06, "loss": 0.89335293, "num_input_tokens_seen": 28676765, "step": 1361, "time_per_iteration": 3.2954471111297607 }, { "auxiliary_loss_clip": 0.01230967, "auxiliary_loss_mlp": 0.01035631, "balance_loss_clip": 1.0666666, "balance_loss_mlp": 1.025653, "epoch": 0.16377081705044189, "flos": 28402611344640.0, "grad_norm": 2.5208244661310597, "language_loss": 0.7678265, "learning_rate": 3.815331256303059e-06, "loss": 0.79049248, "num_input_tokens_seen": 28696795, "step": 1362, "time_per_iteration": 2.5115602016448975 }, { "auxiliary_loss_clip": 0.01180133, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.06296825, "balance_loss_mlp": 1.02529657, "epoch": 0.163891059941081, "flos": 21908059113600.0, "grad_norm": 2.1770114400105514, "language_loss": 0.77557611, "learning_rate": 3.815004188842665e-06, "loss": 0.79773539, "num_input_tokens_seen": 28714835, "step": 1363, "time_per_iteration": 2.5333497524261475 }, { "auxiliary_loss_clip": 0.01193249, "auxiliary_loss_mlp": 0.01036788, "balance_loss_clip": 1.05873311, "balance_loss_mlp": 1.02604723, "epoch": 0.16401130283172008, "flos": 26797619934720.0, "grad_norm": 1.6382319331718556, "language_loss": 0.80046415, "learning_rate": 3.814676846047578e-06, "loss": 0.82276446, "num_input_tokens_seen": 28735710, "step": 1364, "time_per_iteration": 2.56845760345459 }, { "auxiliary_loss_clip": 0.01210519, "auxiliary_loss_mlp": 0.01043548, "balance_loss_clip": 1.06334269, "balance_loss_mlp": 1.03334308, "epoch": 0.16413154572235916, "flos": 32997417160320.0, "grad_norm": 1.6800424987292948, "language_loss": 0.70344496, "learning_rate": 3.8143492279674565e-06, "loss": 0.72598565, "num_input_tokens_seen": 28758405, "step": 1365, "time_per_iteration": 2.58172607421875 }, { "auxiliary_loss_clip": 0.01109381, "auxiliary_loss_mlp": 0.01006006, "balance_loss_clip": 1.04409742, "balance_loss_mlp": 1.0031327, "epoch": 0.16425178861299825, "flos": 40113622074240.0, "grad_norm": 0.8419780044993703, "language_loss": 0.58476913, "learning_rate": 3.8140213346519997e-06, "loss": 0.60592306, "num_input_tokens_seen": 28809000, "step": 1366, "time_per_iteration": 2.869281530380249 }, { "auxiliary_loss_clip": 0.01167376, "auxiliary_loss_mlp": 0.01036153, "balance_loss_clip": 1.05608976, "balance_loss_mlp": 1.02591872, "epoch": 0.16437203150363736, "flos": 25447486498560.0, "grad_norm": 1.733083969531301, "language_loss": 0.77403069, "learning_rate": 3.813693166150948e-06, "loss": 0.79606593, "num_input_tokens_seen": 28829210, "step": 1367, "time_per_iteration": 2.5883116722106934 }, { "auxiliary_loss_clip": 0.01174026, "auxiliary_loss_mlp": 0.01035768, "balance_loss_clip": 1.0583725, "balance_loss_mlp": 1.02519977, "epoch": 0.16449227439427644, "flos": 23476888506240.0, "grad_norm": 2.2346238575865276, "language_loss": 0.85482037, "learning_rate": 3.813364722514086e-06, "loss": 0.87691832, "num_input_tokens_seen": 28847545, "step": 1368, "time_per_iteration": 2.583721160888672 }, { "auxiliary_loss_clip": 0.01210375, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.06160176, "balance_loss_mlp": 1.02639985, "epoch": 0.16461251728491552, "flos": 13545217802880.0, "grad_norm": 2.0230408962172817, "language_loss": 0.80816555, "learning_rate": 3.8130360037912368e-06, "loss": 0.83063364, "num_input_tokens_seen": 28863990, "step": 1369, "time_per_iteration": 2.4576709270477295 }, { "auxiliary_loss_clip": 0.01210056, "auxiliary_loss_mlp": 0.01041436, "balance_loss_clip": 1.06145298, "balance_loss_mlp": 1.03005731, "epoch": 0.16473276017555463, "flos": 23003298662400.0, "grad_norm": 1.9511306278999785, "language_loss": 0.81501842, "learning_rate": 3.812707010032268e-06, "loss": 0.83753335, "num_input_tokens_seen": 28883045, "step": 1370, "time_per_iteration": 2.518853187561035 }, { "auxiliary_loss_clip": 0.01219745, "auxiliary_loss_mlp": 0.01040782, "balance_loss_clip": 1.06846142, "balance_loss_mlp": 1.03033924, "epoch": 0.16485300306619372, "flos": 24790680357120.0, "grad_norm": 1.7125036783675724, "language_loss": 0.79440933, "learning_rate": 3.8123777412870863e-06, "loss": 0.81701458, "num_input_tokens_seen": 28902545, "step": 1371, "time_per_iteration": 2.55684232711792 }, { "auxiliary_loss_clip": 0.01203217, "auxiliary_loss_mlp": 0.01041595, "balance_loss_clip": 1.06136775, "balance_loss_mlp": 1.03152823, "epoch": 0.1649732459568328, "flos": 21106497162240.0, "grad_norm": 2.1325425615849722, "language_loss": 0.78685129, "learning_rate": 3.812048197605643e-06, "loss": 0.80929935, "num_input_tokens_seen": 28921440, "step": 1372, "time_per_iteration": 2.525838613510132 }, { "auxiliary_loss_clip": 0.01212257, "auxiliary_loss_mlp": 0.01029226, "balance_loss_clip": 1.06244302, "balance_loss_mlp": 1.01880085, "epoch": 0.16509348884747188, "flos": 20266726118400.0, "grad_norm": 1.8115963360665694, "language_loss": 0.81618851, "learning_rate": 3.8117183790379277e-06, "loss": 0.83860332, "num_input_tokens_seen": 28939890, "step": 1373, "time_per_iteration": 2.5070483684539795 }, { "auxiliary_loss_clip": 0.0122795, "auxiliary_loss_mlp": 0.01034944, "balance_loss_clip": 1.06444144, "balance_loss_mlp": 1.02419734, "epoch": 0.165213731738111, "flos": 11035493602560.0, "grad_norm": 2.714810433806094, "language_loss": 0.93467635, "learning_rate": 3.811388285633976e-06, "loss": 0.95730531, "num_input_tokens_seen": 28955875, "step": 1374, "time_per_iteration": 2.4512100219726562 }, { "auxiliary_loss_clip": 0.01171078, "auxiliary_loss_mlp": 0.01046444, "balance_loss_clip": 1.0563519, "balance_loss_mlp": 1.03607297, "epoch": 0.16533397462875007, "flos": 29972051268480.0, "grad_norm": 1.8133684746255796, "language_loss": 0.61846185, "learning_rate": 3.811057917443861e-06, "loss": 0.6406371, "num_input_tokens_seen": 28975140, "step": 1375, "time_per_iteration": 2.6408092975616455 }, { "auxiliary_loss_clip": 0.01125922, "auxiliary_loss_mlp": 0.0100883, "balance_loss_clip": 1.04679835, "balance_loss_mlp": 1.00600457, "epoch": 0.16545421751938916, "flos": 65556763027200.0, "grad_norm": 0.8520100086531267, "language_loss": 0.68370509, "learning_rate": 3.8107272745177e-06, "loss": 0.70505261, "num_input_tokens_seen": 29047470, "step": 1376, "time_per_iteration": 3.237907648086548 }, { "auxiliary_loss_clip": 0.01183851, "auxiliary_loss_mlp": 0.01037385, "balance_loss_clip": 1.0614059, "balance_loss_mlp": 1.02759159, "epoch": 0.16557446041002827, "flos": 22492361652480.0, "grad_norm": 1.81254315151487, "language_loss": 0.78804159, "learning_rate": 3.8103963569056513e-06, "loss": 0.81025398, "num_input_tokens_seen": 29066605, "step": 1377, "time_per_iteration": 2.5636985301971436 }, { "auxiliary_loss_clip": 0.01190261, "auxiliary_loss_mlp": 0.01039603, "balance_loss_clip": 1.05754375, "balance_loss_mlp": 1.02918434, "epoch": 0.16569470330066735, "flos": 24602723464320.0, "grad_norm": 1.8081945780223116, "language_loss": 0.88220656, "learning_rate": 3.8100651646579146e-06, "loss": 0.90450519, "num_input_tokens_seen": 29085815, "step": 1378, "time_per_iteration": 2.5551843643188477 }, { "auxiliary_loss_clip": 0.0119106, "auxiliary_loss_mlp": 0.01040802, "balance_loss_clip": 1.0567534, "balance_loss_mlp": 1.03037679, "epoch": 0.16581494619130643, "flos": 15006207588480.0, "grad_norm": 2.2579932200894492, "language_loss": 0.92520642, "learning_rate": 3.8097336978247317e-06, "loss": 0.94752502, "num_input_tokens_seen": 29102520, "step": 1379, "time_per_iteration": 2.5022709369659424 }, { "auxiliary_loss_clip": 0.01180913, "auxiliary_loss_mlp": 0.01033551, "balance_loss_clip": 1.05739605, "balance_loss_mlp": 1.02250028, "epoch": 0.16593518908194552, "flos": 17420338719360.0, "grad_norm": 8.971629474181267, "language_loss": 0.89236295, "learning_rate": 3.8094019564563854e-06, "loss": 0.91450757, "num_input_tokens_seen": 29119450, "step": 1380, "time_per_iteration": 2.4834752082824707 }, { "auxiliary_loss_clip": 0.01224668, "auxiliary_loss_mlp": 0.00765699, "balance_loss_clip": 1.06271076, "balance_loss_mlp": 1.00107872, "epoch": 0.16605543197258463, "flos": 20412631163520.0, "grad_norm": 2.052760973060971, "language_loss": 0.75581467, "learning_rate": 3.809069940603201e-06, "loss": 0.77571833, "num_input_tokens_seen": 29137405, "step": 1381, "time_per_iteration": 3.274097442626953 }, { "auxiliary_loss_clip": 0.01183993, "auxiliary_loss_mlp": 0.01036738, "balance_loss_clip": 1.05729604, "balance_loss_mlp": 1.02640867, "epoch": 0.1661756748632237, "flos": 14209745368320.0, "grad_norm": 2.0132959896732205, "language_loss": 0.78133857, "learning_rate": 3.8087376503155452e-06, "loss": 0.80354583, "num_input_tokens_seen": 29154890, "step": 1382, "time_per_iteration": 2.5003859996795654 }, { "auxiliary_loss_clip": 0.01113596, "auxiliary_loss_mlp": 0.01005909, "balance_loss_clip": 1.0397017, "balance_loss_mlp": 1.00326288, "epoch": 0.1662959177538628, "flos": 66080877350400.0, "grad_norm": 0.8994570687113425, "language_loss": 0.56258345, "learning_rate": 3.808405085643826e-06, "loss": 0.5837785, "num_input_tokens_seen": 29219770, "step": 1383, "time_per_iteration": 3.1710710525512695 }, { "auxiliary_loss_clip": 0.01229323, "auxiliary_loss_mlp": 0.00764991, "balance_loss_clip": 1.06563878, "balance_loss_mlp": 1.00100219, "epoch": 0.1664161606445019, "flos": 20740567357440.0, "grad_norm": 1.982860553189857, "language_loss": 0.89035559, "learning_rate": 3.8080722466384925e-06, "loss": 0.91029871, "num_input_tokens_seen": 29237620, "step": 1384, "time_per_iteration": 2.564457654953003 }, { "auxiliary_loss_clip": 0.0122652, "auxiliary_loss_mlp": 0.0103542, "balance_loss_clip": 1.06080079, "balance_loss_mlp": 1.02411914, "epoch": 0.166536403535141, "flos": 25260930236160.0, "grad_norm": 2.4412553749917314, "language_loss": 0.70852494, "learning_rate": 3.8077391333500376e-06, "loss": 0.73114431, "num_input_tokens_seen": 29256760, "step": 1385, "time_per_iteration": 2.5124237537384033 }, { "auxiliary_loss_clip": 0.01197055, "auxiliary_loss_mlp": 0.01034695, "balance_loss_clip": 1.06316423, "balance_loss_mlp": 1.02525413, "epoch": 0.16665664642578007, "flos": 25447450584960.0, "grad_norm": 1.6219565142174277, "language_loss": 0.76914406, "learning_rate": 3.8074057458289934e-06, "loss": 0.79146153, "num_input_tokens_seen": 29277450, "step": 1386, "time_per_iteration": 2.571053981781006 }, { "auxiliary_loss_clip": 0.01197183, "auxiliary_loss_mlp": 0.01032856, "balance_loss_clip": 1.05859804, "balance_loss_mlp": 1.022717, "epoch": 0.16677688931641918, "flos": 22200767043840.0, "grad_norm": 2.1283436875111574, "language_loss": 0.82579291, "learning_rate": 3.807072084125934e-06, "loss": 0.84809327, "num_input_tokens_seen": 29299300, "step": 1387, "time_per_iteration": 4.2407801151275635 }, { "auxiliary_loss_clip": 0.0119133, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.06075597, "balance_loss_mlp": 1.02601099, "epoch": 0.16689713220705826, "flos": 16945958776320.0, "grad_norm": 2.206606544761664, "language_loss": 0.80379069, "learning_rate": 3.806738148291477e-06, "loss": 0.82607031, "num_input_tokens_seen": 29316125, "step": 1388, "time_per_iteration": 3.274813413619995 }, { "auxiliary_loss_clip": 0.01153644, "auxiliary_loss_mlp": 0.01036511, "balance_loss_clip": 1.05329514, "balance_loss_mlp": 1.02517402, "epoch": 0.16701737509769735, "flos": 36244423923840.0, "grad_norm": 2.0105182950254163, "language_loss": 0.7117635, "learning_rate": 3.8064039383762793e-06, "loss": 0.73366505, "num_input_tokens_seen": 29338490, "step": 1389, "time_per_iteration": 2.7404568195343018 }, { "auxiliary_loss_clip": 0.01211217, "auxiliary_loss_mlp": 0.01035724, "balance_loss_clip": 1.0650568, "balance_loss_mlp": 1.02581751, "epoch": 0.16713761798833643, "flos": 23258659426560.0, "grad_norm": 2.0744782597917077, "language_loss": 0.77264869, "learning_rate": 3.8060694544310396e-06, "loss": 0.79511809, "num_input_tokens_seen": 29357000, "step": 1390, "time_per_iteration": 2.5060887336730957 }, { "auxiliary_loss_clip": 0.01227837, "auxiliary_loss_mlp": 0.01046129, "balance_loss_clip": 1.0635972, "balance_loss_mlp": 1.03505468, "epoch": 0.16725786087897554, "flos": 25302515207040.0, "grad_norm": 2.8604823169462423, "language_loss": 0.78473896, "learning_rate": 3.8057346965065006e-06, "loss": 0.80747861, "num_input_tokens_seen": 29378230, "step": 1391, "time_per_iteration": 2.5625696182250977 }, { "auxiliary_loss_clip": 0.01194707, "auxiliary_loss_mlp": 0.01039949, "balance_loss_clip": 1.06337333, "balance_loss_mlp": 1.02995348, "epoch": 0.16737810376961462, "flos": 31831541516160.0, "grad_norm": 1.7428483930149752, "language_loss": 0.84235871, "learning_rate": 3.805399664653443e-06, "loss": 0.86470526, "num_input_tokens_seen": 29400370, "step": 1392, "time_per_iteration": 2.5940401554107666 }, { "auxiliary_loss_clip": 0.01228854, "auxiliary_loss_mlp": 0.01032973, "balance_loss_clip": 1.06441784, "balance_loss_mlp": 1.02242899, "epoch": 0.1674983466602537, "flos": 27961843553280.0, "grad_norm": 2.5359832136223766, "language_loss": 0.73990822, "learning_rate": 3.805064358922692e-06, "loss": 0.76252645, "num_input_tokens_seen": 29418660, "step": 1393, "time_per_iteration": 2.4924235343933105 }, { "auxiliary_loss_clip": 0.01215762, "auxiliary_loss_mlp": 0.01034291, "balance_loss_clip": 1.0626502, "balance_loss_mlp": 1.02334189, "epoch": 0.16761858955089282, "flos": 21762656858880.0, "grad_norm": 1.6605558146107995, "language_loss": 0.80892026, "learning_rate": 3.8047287793651136e-06, "loss": 0.83142078, "num_input_tokens_seen": 29440105, "step": 1394, "time_per_iteration": 2.5098814964294434 }, { "auxiliary_loss_clip": 0.01182484, "auxiliary_loss_mlp": 0.01040966, "balance_loss_clip": 1.05948305, "balance_loss_mlp": 1.03104222, "epoch": 0.1677388324415319, "flos": 23805507058560.0, "grad_norm": 1.9090367690346433, "language_loss": 0.89084631, "learning_rate": 3.8043929260316137e-06, "loss": 0.91308081, "num_input_tokens_seen": 29458260, "step": 1395, "time_per_iteration": 2.5944623947143555 }, { "auxiliary_loss_clip": 0.01200802, "auxiliary_loss_mlp": 0.01038303, "balance_loss_clip": 1.06691086, "balance_loss_mlp": 1.02762794, "epoch": 0.16785907533217098, "flos": 20558859431040.0, "grad_norm": 2.6490667294790984, "language_loss": 0.83648103, "learning_rate": 3.8040567989731417e-06, "loss": 0.85887212, "num_input_tokens_seen": 29476205, "step": 1396, "time_per_iteration": 2.5344557762145996 }, { "auxiliary_loss_clip": 0.01206737, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.06304944, "balance_loss_mlp": 1.02339196, "epoch": 0.16797931822281006, "flos": 15669657745920.0, "grad_norm": 2.0475327141272888, "language_loss": 0.79928887, "learning_rate": 3.8037203982406876e-06, "loss": 0.82168621, "num_input_tokens_seen": 29494370, "step": 1397, "time_per_iteration": 2.488276481628418 }, { "auxiliary_loss_clip": 0.01226939, "auxiliary_loss_mlp": 0.01035158, "balance_loss_clip": 1.06578791, "balance_loss_mlp": 1.02456009, "epoch": 0.16809956111344918, "flos": 16541101607040.0, "grad_norm": 1.8410107359287795, "language_loss": 0.73271787, "learning_rate": 3.8033837238852835e-06, "loss": 0.75533879, "num_input_tokens_seen": 29511070, "step": 1398, "time_per_iteration": 2.436167001724243 }, { "auxiliary_loss_clip": 0.01186933, "auxiliary_loss_mlp": 0.01036228, "balance_loss_clip": 1.05714464, "balance_loss_mlp": 1.02669716, "epoch": 0.16821980400408826, "flos": 23258084808960.0, "grad_norm": 1.7759487014664332, "language_loss": 0.69822156, "learning_rate": 3.8030467759580017e-06, "loss": 0.72045314, "num_input_tokens_seen": 29531990, "step": 1399, "time_per_iteration": 2.5321736335754395 }, { "auxiliary_loss_clip": 0.01215549, "auxiliary_loss_mlp": 0.01040832, "balance_loss_clip": 1.06260562, "balance_loss_mlp": 1.03009772, "epoch": 0.16834004689472734, "flos": 20774754126720.0, "grad_norm": 1.8760981690256469, "language_loss": 0.87315619, "learning_rate": 3.802709554509958e-06, "loss": 0.89572001, "num_input_tokens_seen": 29549790, "step": 1400, "time_per_iteration": 2.4656083583831787 }, { "auxiliary_loss_clip": 0.01194745, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.0597322, "balance_loss_mlp": 1.02144051, "epoch": 0.16846028978536645, "flos": 26687302289280.0, "grad_norm": 2.18740556275661, "language_loss": 0.79255962, "learning_rate": 3.8023720595923083e-06, "loss": 0.81481344, "num_input_tokens_seen": 29569045, "step": 1401, "time_per_iteration": 2.5857348442077637 }, { "auxiliary_loss_clip": 0.01161763, "auxiliary_loss_mlp": 0.01036317, "balance_loss_clip": 1.05569124, "balance_loss_mlp": 1.0259341, "epoch": 0.16858053267600553, "flos": 18843298980480.0, "grad_norm": 1.984343811345821, "language_loss": 0.87713242, "learning_rate": 3.80203429125625e-06, "loss": 0.89911318, "num_input_tokens_seen": 29587220, "step": 1402, "time_per_iteration": 2.5734236240386963 }, { "auxiliary_loss_clip": 0.01141583, "auxiliary_loss_mlp": 0.01034267, "balance_loss_clip": 1.05420411, "balance_loss_mlp": 1.02434897, "epoch": 0.16870077556664462, "flos": 27744548227200.0, "grad_norm": 1.7811897472461624, "language_loss": 0.70326531, "learning_rate": 3.8016962495530225e-06, "loss": 0.72502381, "num_input_tokens_seen": 29606410, "step": 1403, "time_per_iteration": 2.6648831367492676 }, { "auxiliary_loss_clip": 0.01229687, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.0662477, "balance_loss_mlp": 1.02994609, "epoch": 0.1688210184572837, "flos": 13730768484480.0, "grad_norm": 2.2473037856654776, "language_loss": 0.77412438, "learning_rate": 3.8013579345339063e-06, "loss": 0.79681587, "num_input_tokens_seen": 29621275, "step": 1404, "time_per_iteration": 2.428229570388794 }, { "auxiliary_loss_clip": 0.01185222, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 1.05883288, "balance_loss_mlp": 1.02195835, "epoch": 0.1689412613479228, "flos": 26468785900800.0, "grad_norm": 2.128282175240056, "language_loss": 0.69736111, "learning_rate": 3.801019346250224e-06, "loss": 0.7195363, "num_input_tokens_seen": 29641420, "step": 1405, "time_per_iteration": 2.6064810752868652 }, { "auxiliary_loss_clip": 0.01208163, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.0626018, "balance_loss_mlp": 1.02249122, "epoch": 0.1690615042385619, "flos": 21138852337920.0, "grad_norm": 4.432058064474053, "language_loss": 0.84003842, "learning_rate": 3.8006804847533395e-06, "loss": 0.86244857, "num_input_tokens_seen": 29660935, "step": 1406, "time_per_iteration": 2.5351383686065674 }, { "auxiliary_loss_clip": 0.01230538, "auxiliary_loss_mlp": 0.01039557, "balance_loss_clip": 1.06615186, "balance_loss_mlp": 1.03051496, "epoch": 0.16918174712920098, "flos": 20849340718080.0, "grad_norm": 2.1460214244826257, "language_loss": 0.8576861, "learning_rate": 3.8003413500946556e-06, "loss": 0.88038707, "num_input_tokens_seen": 29681045, "step": 1407, "time_per_iteration": 3.298356056213379 }, { "auxiliary_loss_clip": 0.01198509, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.06286693, "balance_loss_mlp": 1.03038335, "epoch": 0.1693019900198401, "flos": 16983270028800.0, "grad_norm": 2.373936961580943, "language_loss": 0.82466704, "learning_rate": 3.8000019423256216e-06, "loss": 0.84706426, "num_input_tokens_seen": 29698810, "step": 1408, "time_per_iteration": 2.5213544368743896 }, { "auxiliary_loss_clip": 0.01187343, "auxiliary_loss_mlp": 0.01045447, "balance_loss_clip": 1.06205058, "balance_loss_mlp": 1.03530288, "epoch": 0.16942223291047917, "flos": 26796901662720.0, "grad_norm": 1.5668950939483715, "language_loss": 0.87990034, "learning_rate": 3.7996622614977234e-06, "loss": 0.90222824, "num_input_tokens_seen": 29720000, "step": 1409, "time_per_iteration": 2.570082664489746 }, { "auxiliary_loss_clip": 0.01194845, "auxiliary_loss_mlp": 0.01035001, "balance_loss_clip": 1.06289458, "balance_loss_mlp": 1.02547002, "epoch": 0.16954247580111825, "flos": 18583700411520.0, "grad_norm": 1.7840986055052674, "language_loss": 0.79178345, "learning_rate": 3.799322307662492e-06, "loss": 0.81408191, "num_input_tokens_seen": 29737820, "step": 1410, "time_per_iteration": 2.5340983867645264 }, { "auxiliary_loss_clip": 0.01170895, "auxiliary_loss_mlp": 0.01031126, "balance_loss_clip": 1.05707288, "balance_loss_mlp": 1.02070141, "epoch": 0.16966271869175734, "flos": 13983651210240.0, "grad_norm": 2.1602495070158882, "language_loss": 0.83495837, "learning_rate": 3.798982080871496e-06, "loss": 0.8569786, "num_input_tokens_seen": 29752960, "step": 1411, "time_per_iteration": 2.6597843170166016 }, { "auxiliary_loss_clip": 0.01230922, "auxiliary_loss_mlp": 0.01039459, "balance_loss_clip": 1.06725669, "balance_loss_mlp": 1.0287838, "epoch": 0.16978296158239645, "flos": 37487328284160.0, "grad_norm": 2.186501420728542, "language_loss": 0.67638087, "learning_rate": 3.798641581176349e-06, "loss": 0.69908476, "num_input_tokens_seen": 29775240, "step": 1412, "time_per_iteration": 2.607224702835083 }, { "auxiliary_loss_clip": 0.0119717, "auxiliary_loss_mlp": 0.01043152, "balance_loss_clip": 1.06025338, "balance_loss_mlp": 1.03231621, "epoch": 0.16990320447303553, "flos": 28328958506880.0, "grad_norm": 1.8804893321233875, "language_loss": 0.74920118, "learning_rate": 3.7983008086287044e-06, "loss": 0.77160436, "num_input_tokens_seen": 29796560, "step": 1413, "time_per_iteration": 2.6074204444885254 }, { "auxiliary_loss_clip": 0.01192482, "auxiliary_loss_mlp": 0.01039117, "balance_loss_clip": 1.05804729, "balance_loss_mlp": 1.02831626, "epoch": 0.1700234473636746, "flos": 20188189031040.0, "grad_norm": 3.1100469396977504, "language_loss": 0.79611427, "learning_rate": 3.797959763280257e-06, "loss": 0.81843024, "num_input_tokens_seen": 29815245, "step": 1414, "time_per_iteration": 4.217747211456299 }, { "auxiliary_loss_clip": 0.01216257, "auxiliary_loss_mlp": 0.01046714, "balance_loss_clip": 1.06441629, "balance_loss_mlp": 1.03708172, "epoch": 0.17014369025431372, "flos": 24858658846080.0, "grad_norm": 1.8866636825495138, "language_loss": 0.79411358, "learning_rate": 3.797618445182743e-06, "loss": 0.81674325, "num_input_tokens_seen": 29836640, "step": 1415, "time_per_iteration": 3.3129775524139404 }, { "auxiliary_loss_clip": 0.01162699, "auxiliary_loss_mlp": 0.01035782, "balance_loss_clip": 1.05588186, "balance_loss_mlp": 1.02528572, "epoch": 0.1702639331449528, "flos": 16467233287680.0, "grad_norm": 1.943683991808472, "language_loss": 0.84756851, "learning_rate": 3.79727685438794e-06, "loss": 0.86955333, "num_input_tokens_seen": 29850830, "step": 1416, "time_per_iteration": 2.5465750694274902 }, { "auxiliary_loss_clip": 0.01134974, "auxiliary_loss_mlp": 0.01003247, "balance_loss_clip": 1.04604411, "balance_loss_mlp": 1.00050533, "epoch": 0.1703841760355919, "flos": 52508870979840.0, "grad_norm": 0.8388428360397058, "language_loss": 0.61706597, "learning_rate": 3.796934990947667e-06, "loss": 0.63844818, "num_input_tokens_seen": 29912515, "step": 1417, "time_per_iteration": 3.1363086700439453 }, { "auxiliary_loss_clip": 0.01133715, "auxiliary_loss_mlp": 0.0100463, "balance_loss_clip": 1.04555082, "balance_loss_mlp": 1.00185287, "epoch": 0.170504418926231, "flos": 49370637576960.0, "grad_norm": 0.878300632858872, "language_loss": 0.62479937, "learning_rate": 3.7965928549137854e-06, "loss": 0.64618289, "num_input_tokens_seen": 29969330, "step": 1418, "time_per_iteration": 2.9846107959747314 }, { "auxiliary_loss_clip": 0.01185632, "auxiliary_loss_mlp": 0.01040592, "balance_loss_clip": 1.0556097, "balance_loss_mlp": 1.02974439, "epoch": 0.17062466181687008, "flos": 25849219184640.0, "grad_norm": 2.0942533726877905, "language_loss": 0.77478254, "learning_rate": 3.7962504463381953e-06, "loss": 0.79704475, "num_input_tokens_seen": 29990820, "step": 1419, "time_per_iteration": 2.6042940616607666 }, { "auxiliary_loss_clip": 0.0119339, "auxiliary_loss_mlp": 0.00766508, "balance_loss_clip": 1.06383729, "balance_loss_mlp": 1.00090909, "epoch": 0.17074490470750917, "flos": 20960412549120.0, "grad_norm": 1.7503120276866715, "language_loss": 0.7898913, "learning_rate": 3.7959077652728412e-06, "loss": 0.80949026, "num_input_tokens_seen": 30009275, "step": 1420, "time_per_iteration": 2.5444934368133545 }, { "auxiliary_loss_clip": 0.01196036, "auxiliary_loss_mlp": 0.01039795, "balance_loss_clip": 1.05955565, "balance_loss_mlp": 1.02935195, "epoch": 0.17086514759814825, "flos": 20959766104320.0, "grad_norm": 2.6277076389224443, "language_loss": 0.77596462, "learning_rate": 3.795564811769707e-06, "loss": 0.79832292, "num_input_tokens_seen": 30027630, "step": 1421, "time_per_iteration": 2.5280802249908447 }, { "auxiliary_loss_clip": 0.0119714, "auxiliary_loss_mlp": 0.01037743, "balance_loss_clip": 1.06417918, "balance_loss_mlp": 1.02626896, "epoch": 0.17098539048878736, "flos": 28474073452800.0, "grad_norm": 1.9606844455074244, "language_loss": 0.78223407, "learning_rate": 3.795221585880818e-06, "loss": 0.80458289, "num_input_tokens_seen": 30048310, "step": 1422, "time_per_iteration": 2.607741117477417 }, { "auxiliary_loss_clip": 0.01182946, "auxiliary_loss_mlp": 0.01037483, "balance_loss_clip": 1.06351352, "balance_loss_mlp": 1.0280118, "epoch": 0.17110563337942644, "flos": 16290014561280.0, "grad_norm": 1.8635863668084915, "language_loss": 0.91161311, "learning_rate": 3.794878087658242e-06, "loss": 0.93381739, "num_input_tokens_seen": 30066080, "step": 1423, "time_per_iteration": 2.5263290405273438 }, { "auxiliary_loss_clip": 0.01212412, "auxiliary_loss_mlp": 0.01036035, "balance_loss_clip": 1.06093574, "balance_loss_mlp": 1.02628374, "epoch": 0.17122587627006552, "flos": 29674207693440.0, "grad_norm": 1.972740840965769, "language_loss": 0.78389752, "learning_rate": 3.7945343171540873e-06, "loss": 0.80638194, "num_input_tokens_seen": 30086955, "step": 1424, "time_per_iteration": 2.557713747024536 }, { "auxiliary_loss_clip": 0.01229543, "auxiliary_loss_mlp": 0.01038775, "balance_loss_clip": 1.06529009, "balance_loss_mlp": 1.02756333, "epoch": 0.17134611916070464, "flos": 25338389915520.0, "grad_norm": 1.7429233529747776, "language_loss": 0.7850129, "learning_rate": 3.7941902744205033e-06, "loss": 0.8076961, "num_input_tokens_seen": 30107990, "step": 1425, "time_per_iteration": 2.517632246017456 }, { "auxiliary_loss_clip": 0.01200463, "auxiliary_loss_mlp": 0.01035194, "balance_loss_clip": 1.06053579, "balance_loss_mlp": 1.02386379, "epoch": 0.17146636205134372, "flos": 13953845900160.0, "grad_norm": 1.8725603799353174, "language_loss": 0.83362359, "learning_rate": 3.7938459595096817e-06, "loss": 0.85598022, "num_input_tokens_seen": 30126535, "step": 1426, "time_per_iteration": 2.5453715324401855 }, { "auxiliary_loss_clip": 0.01218655, "auxiliary_loss_mlp": 0.01037703, "balance_loss_clip": 1.06246901, "balance_loss_mlp": 1.02697444, "epoch": 0.1715866049419828, "flos": 23915214172800.0, "grad_norm": 1.749569964461334, "language_loss": 0.86320484, "learning_rate": 3.7935013724738545e-06, "loss": 0.88576841, "num_input_tokens_seen": 30147035, "step": 1427, "time_per_iteration": 2.524198055267334 }, { "auxiliary_loss_clip": 0.01206981, "auxiliary_loss_mlp": 0.01040626, "balance_loss_clip": 1.06174147, "balance_loss_mlp": 1.03084469, "epoch": 0.17170684783262188, "flos": 22709369669760.0, "grad_norm": 1.7177835555974934, "language_loss": 0.77919137, "learning_rate": 3.7931565133652945e-06, "loss": 0.80166739, "num_input_tokens_seen": 30167110, "step": 1428, "time_per_iteration": 2.503652572631836 }, { "auxiliary_loss_clip": 0.01226606, "auxiliary_loss_mlp": 0.01036469, "balance_loss_clip": 1.06393242, "balance_loss_mlp": 1.02600861, "epoch": 0.171827090723261, "flos": 26613290315520.0, "grad_norm": 2.3727193250489607, "language_loss": 0.68187082, "learning_rate": 3.792811382236317e-06, "loss": 0.70450157, "num_input_tokens_seen": 30185620, "step": 1429, "time_per_iteration": 2.523554801940918 }, { "auxiliary_loss_clip": 0.01216202, "auxiliary_loss_mlp": 0.0103349, "balance_loss_clip": 1.06277788, "balance_loss_mlp": 1.02280283, "epoch": 0.17194733361390008, "flos": 28148507556480.0, "grad_norm": 2.075552834756765, "language_loss": 0.78314114, "learning_rate": 3.792465979139279e-06, "loss": 0.80563807, "num_input_tokens_seen": 30208225, "step": 1430, "time_per_iteration": 2.573277473449707 }, { "auxiliary_loss_clip": 0.01100972, "auxiliary_loss_mlp": 0.0102745, "balance_loss_clip": 1.03785551, "balance_loss_mlp": 1.02462447, "epoch": 0.17206757650453916, "flos": 65530689753600.0, "grad_norm": 0.9244882717881197, "language_loss": 0.65645969, "learning_rate": 3.792120304126576e-06, "loss": 0.67774385, "num_input_tokens_seen": 30271600, "step": 1431, "time_per_iteration": 3.1704819202423096 }, { "auxiliary_loss_clip": 0.01138726, "auxiliary_loss_mlp": 0.01027081, "balance_loss_clip": 1.05465043, "balance_loss_mlp": 1.01774073, "epoch": 0.17218781939517827, "flos": 22273486128000.0, "grad_norm": 1.878238450222419, "language_loss": 0.83875877, "learning_rate": 3.791774357250649e-06, "loss": 0.86041683, "num_input_tokens_seen": 30290430, "step": 1432, "time_per_iteration": 2.774661064147949 }, { "auxiliary_loss_clip": 0.01192632, "auxiliary_loss_mlp": 0.01048282, "balance_loss_clip": 1.05880046, "balance_loss_mlp": 1.03708231, "epoch": 0.17230806228581735, "flos": 14137313592960.0, "grad_norm": 2.074017535049121, "language_loss": 0.79137564, "learning_rate": 3.7914281385639757e-06, "loss": 0.81378484, "num_input_tokens_seen": 30308305, "step": 1433, "time_per_iteration": 2.5613033771514893 }, { "auxiliary_loss_clip": 0.01209972, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.05939877, "balance_loss_mlp": 1.02413106, "epoch": 0.17242830517645644, "flos": 20704836303360.0, "grad_norm": 2.020455169326723, "language_loss": 0.79531109, "learning_rate": 3.7910816481190784e-06, "loss": 0.81775796, "num_input_tokens_seen": 30328120, "step": 1434, "time_per_iteration": 3.3162264823913574 }, { "auxiliary_loss_clip": 0.01182037, "auxiliary_loss_mlp": 0.01037166, "balance_loss_clip": 1.05602324, "balance_loss_mlp": 1.02668166, "epoch": 0.17254854806709552, "flos": 30774582887040.0, "grad_norm": 2.0229952467878256, "language_loss": 0.74950767, "learning_rate": 3.7907348859685193e-06, "loss": 0.77169973, "num_input_tokens_seen": 30349825, "step": 1435, "time_per_iteration": 2.602051258087158 }, { "auxiliary_loss_clip": 0.01202795, "auxiliary_loss_mlp": 0.01034527, "balance_loss_clip": 1.06140232, "balance_loss_mlp": 1.02376199, "epoch": 0.17266879095773463, "flos": 26614726859520.0, "grad_norm": 1.92522314709842, "language_loss": 0.80151463, "learning_rate": 3.790387852164902e-06, "loss": 0.82388783, "num_input_tokens_seen": 30370555, "step": 1436, "time_per_iteration": 2.533043384552002 }, { "auxiliary_loss_clip": 0.01210455, "auxiliary_loss_mlp": 0.01037869, "balance_loss_clip": 1.06172287, "balance_loss_mlp": 1.02795076, "epoch": 0.1727890338483737, "flos": 20266295155200.0, "grad_norm": 4.618461891576178, "language_loss": 0.77113122, "learning_rate": 3.7900405467608707e-06, "loss": 0.79361439, "num_input_tokens_seen": 30390100, "step": 1437, "time_per_iteration": 2.4939024448394775 }, { "auxiliary_loss_clip": 0.01149515, "auxiliary_loss_mlp": 0.01033304, "balance_loss_clip": 1.05029559, "balance_loss_mlp": 1.02269423, "epoch": 0.1729092767390128, "flos": 18179812909440.0, "grad_norm": 3.2653280561745, "language_loss": 0.78959757, "learning_rate": 3.7896929698091114e-06, "loss": 0.81142581, "num_input_tokens_seen": 30402915, "step": 1438, "time_per_iteration": 2.5417556762695312 }, { "auxiliary_loss_clip": 0.01228201, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.06677377, "balance_loss_mlp": 1.03250384, "epoch": 0.1730295196296519, "flos": 26759518583040.0, "grad_norm": 3.5712285430426873, "language_loss": 0.68447709, "learning_rate": 3.7893451213623518e-06, "loss": 0.70718831, "num_input_tokens_seen": 30420145, "step": 1439, "time_per_iteration": 2.4793314933776855 }, { "auxiliary_loss_clip": 0.01210656, "auxiliary_loss_mlp": 0.00765794, "balance_loss_clip": 1.06375563, "balance_loss_mlp": 1.00084448, "epoch": 0.173149762520291, "flos": 23842531002240.0, "grad_norm": 2.2291654417466815, "language_loss": 0.82187635, "learning_rate": 3.7889970014733606e-06, "loss": 0.84164083, "num_input_tokens_seen": 30439250, "step": 1440, "time_per_iteration": 3.323448896408081 }, { "auxiliary_loss_clip": 0.01146114, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.05067933, "balance_loss_mlp": 1.02589035, "epoch": 0.17327000541093007, "flos": 23368186972800.0, "grad_norm": 1.7709030682981897, "language_loss": 0.78132027, "learning_rate": 3.7886486101949463e-06, "loss": 0.8031491, "num_input_tokens_seen": 30460430, "step": 1441, "time_per_iteration": 4.266057252883911 }, { "auxiliary_loss_clip": 0.01154186, "auxiliary_loss_mlp": 0.0104612, "balance_loss_clip": 1.05389822, "balance_loss_mlp": 1.0355165, "epoch": 0.17339024830156918, "flos": 18221290139520.0, "grad_norm": 1.9061362269322848, "language_loss": 0.88305372, "learning_rate": 3.7882999475799594e-06, "loss": 0.90505677, "num_input_tokens_seen": 30478465, "step": 1442, "time_per_iteration": 2.5881457328796387 }, { "auxiliary_loss_clip": 0.01148973, "auxiliary_loss_mlp": 0.01039488, "balance_loss_clip": 1.05625772, "balance_loss_mlp": 1.02933741, "epoch": 0.17351049119220827, "flos": 23332024955520.0, "grad_norm": 1.8436320015938201, "language_loss": 0.81345373, "learning_rate": 3.787951013681293e-06, "loss": 0.83533835, "num_input_tokens_seen": 30496510, "step": 1443, "time_per_iteration": 2.612109422683716 }, { "auxiliary_loss_clip": 0.0120618, "auxiliary_loss_mlp": 0.01043906, "balance_loss_clip": 1.05773818, "balance_loss_mlp": 1.03287363, "epoch": 0.17363073408284735, "flos": 23803495896960.0, "grad_norm": 1.9162495103262853, "language_loss": 0.77306521, "learning_rate": 3.787601808551879e-06, "loss": 0.79556608, "num_input_tokens_seen": 30516325, "step": 1444, "time_per_iteration": 2.516946792602539 }, { "auxiliary_loss_clip": 0.0118117, "auxiliary_loss_mlp": 0.01045223, "balance_loss_clip": 1.05812764, "balance_loss_mlp": 1.03485739, "epoch": 0.17375097697348643, "flos": 18515290959360.0, "grad_norm": 2.439937634319878, "language_loss": 0.83674777, "learning_rate": 3.7872523322446926e-06, "loss": 0.85901171, "num_input_tokens_seen": 30535210, "step": 1445, "time_per_iteration": 2.5380184650421143 }, { "auxiliary_loss_clip": 0.01169404, "auxiliary_loss_mlp": 0.01030467, "balance_loss_clip": 1.05203438, "balance_loss_mlp": 1.02079868, "epoch": 0.17387121986412554, "flos": 38877897456000.0, "grad_norm": 1.7687062258596444, "language_loss": 0.60204852, "learning_rate": 3.7869025848127478e-06, "loss": 0.62404728, "num_input_tokens_seen": 30559405, "step": 1446, "time_per_iteration": 2.8109099864959717 }, { "auxiliary_loss_clip": 0.01208658, "auxiliary_loss_mlp": 0.01038238, "balance_loss_clip": 1.05843425, "balance_loss_mlp": 1.02814138, "epoch": 0.17399146275476463, "flos": 20375714960640.0, "grad_norm": 4.609653296830032, "language_loss": 0.80659467, "learning_rate": 3.786552566309102e-06, "loss": 0.82906365, "num_input_tokens_seen": 30577615, "step": 1447, "time_per_iteration": 2.563925266265869 }, { "auxiliary_loss_clip": 0.01191322, "auxiliary_loss_mlp": 0.00765469, "balance_loss_clip": 1.06150126, "balance_loss_mlp": 1.00075054, "epoch": 0.1741117056454037, "flos": 19164339763200.0, "grad_norm": 2.155001230410101, "language_loss": 0.86027318, "learning_rate": 3.7862022767868517e-06, "loss": 0.87984109, "num_input_tokens_seen": 30595205, "step": 1448, "time_per_iteration": 2.5667619705200195 }, { "auxiliary_loss_clip": 0.01177304, "auxiliary_loss_mlp": 0.01042081, "balance_loss_clip": 1.06241715, "balance_loss_mlp": 1.0320673, "epoch": 0.17423194853604282, "flos": 25374300537600.0, "grad_norm": 2.703550903405167, "language_loss": 0.84045398, "learning_rate": 3.7858517162991367e-06, "loss": 0.86264777, "num_input_tokens_seen": 30615280, "step": 1449, "time_per_iteration": 2.5943922996520996 }, { "auxiliary_loss_clip": 0.01180372, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.05726457, "balance_loss_mlp": 1.02626264, "epoch": 0.1743521914266819, "flos": 25191874339200.0, "grad_norm": 2.35581001849865, "language_loss": 0.60471541, "learning_rate": 3.7855008848991363e-06, "loss": 0.62688971, "num_input_tokens_seen": 30633485, "step": 1450, "time_per_iteration": 2.662813901901245 }, { "auxiliary_loss_clip": 0.01190967, "auxiliary_loss_mlp": 0.0103882, "balance_loss_clip": 1.06117392, "balance_loss_mlp": 1.02910447, "epoch": 0.17447243431732098, "flos": 25666577504640.0, "grad_norm": 2.1520722266441887, "language_loss": 0.77669948, "learning_rate": 3.7851497826400714e-06, "loss": 0.7989974, "num_input_tokens_seen": 30653625, "step": 1451, "time_per_iteration": 2.6153042316436768 }, { "auxiliary_loss_clip": 0.01226882, "auxiliary_loss_mlp": 0.01039907, "balance_loss_clip": 1.06416225, "balance_loss_mlp": 1.02949405, "epoch": 0.17459267720796007, "flos": 36281950657920.0, "grad_norm": 2.3020210001543644, "language_loss": 0.76027685, "learning_rate": 3.7847984095752034e-06, "loss": 0.78294468, "num_input_tokens_seen": 30677080, "step": 1452, "time_per_iteration": 2.599299907684326 }, { "auxiliary_loss_clip": 0.01225495, "auxiliary_loss_mlp": 0.01031859, "balance_loss_clip": 1.06297827, "balance_loss_mlp": 1.02205372, "epoch": 0.17471292009859918, "flos": 20011113959040.0, "grad_norm": 1.9298031505748081, "language_loss": 0.80195272, "learning_rate": 3.784446765757836e-06, "loss": 0.82452631, "num_input_tokens_seen": 30695725, "step": 1453, "time_per_iteration": 2.4770569801330566 }, { "auxiliary_loss_clip": 0.01160946, "auxiliary_loss_mlp": 0.01034635, "balance_loss_clip": 1.05506372, "balance_loss_mlp": 1.02446628, "epoch": 0.17483316298923826, "flos": 27819242559360.0, "grad_norm": 2.3354274889735556, "language_loss": 0.776748, "learning_rate": 3.7840948512413133e-06, "loss": 0.79870385, "num_input_tokens_seen": 30713310, "step": 1454, "time_per_iteration": 2.60199236869812 }, { "auxiliary_loss_clip": 0.01177273, "auxiliary_loss_mlp": 0.01037327, "balance_loss_clip": 1.06002808, "balance_loss_mlp": 1.02616858, "epoch": 0.17495340587987734, "flos": 44017934791680.0, "grad_norm": 1.89224206983156, "language_loss": 0.78945887, "learning_rate": 3.7837426660790196e-06, "loss": 0.81160486, "num_input_tokens_seen": 30734725, "step": 1455, "time_per_iteration": 2.738637924194336 }, { "auxiliary_loss_clip": 0.01222324, "auxiliary_loss_mlp": 0.01042938, "balance_loss_clip": 1.0622673, "balance_loss_mlp": 1.03334212, "epoch": 0.17507364877051645, "flos": 20885825957760.0, "grad_norm": 2.4297032226263107, "language_loss": 0.81725085, "learning_rate": 3.783390210324382e-06, "loss": 0.83990347, "num_input_tokens_seen": 30754450, "step": 1456, "time_per_iteration": 2.4665238857269287 }, { "auxiliary_loss_clip": 0.01177982, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.05898142, "balance_loss_mlp": 1.02416682, "epoch": 0.17519389166115554, "flos": 24717602136960.0, "grad_norm": 1.973457794068408, "language_loss": 0.73072994, "learning_rate": 3.7830374840308676e-06, "loss": 0.75284994, "num_input_tokens_seen": 30774605, "step": 1457, "time_per_iteration": 2.5830042362213135 }, { "auxiliary_loss_clip": 0.01213608, "auxiliary_loss_mlp": 0.01038405, "balance_loss_clip": 1.06407142, "balance_loss_mlp": 1.02771223, "epoch": 0.17531413455179462, "flos": 23798144770560.0, "grad_norm": 2.555148700001538, "language_loss": 0.82423353, "learning_rate": 3.7826844872519842e-06, "loss": 0.84675366, "num_input_tokens_seen": 30792460, "step": 1458, "time_per_iteration": 2.5104057788848877 }, { "auxiliary_loss_clip": 0.0119335, "auxiliary_loss_mlp": 0.01035738, "balance_loss_clip": 1.06257606, "balance_loss_mlp": 1.0265826, "epoch": 0.1754343774424337, "flos": 24572379450240.0, "grad_norm": 1.8569162347459516, "language_loss": 0.72393346, "learning_rate": 3.782331220041282e-06, "loss": 0.74622434, "num_input_tokens_seen": 30812525, "step": 1459, "time_per_iteration": 2.5745351314544678 }, { "auxiliary_loss_clip": 0.01187546, "auxiliary_loss_mlp": 0.01035122, "balance_loss_clip": 1.05800128, "balance_loss_mlp": 1.02525711, "epoch": 0.17555462033307281, "flos": 18114599767680.0, "grad_norm": 2.042219647492752, "language_loss": 0.82882649, "learning_rate": 3.7819776824523504e-06, "loss": 0.85105312, "num_input_tokens_seen": 30830390, "step": 1460, "time_per_iteration": 3.3179755210876465 }, { "auxiliary_loss_clip": 0.01202059, "auxiliary_loss_mlp": 0.01039125, "balance_loss_clip": 1.06060779, "balance_loss_mlp": 1.02904034, "epoch": 0.1756748632237119, "flos": 28366018364160.0, "grad_norm": 1.8475730294686807, "language_loss": 0.84003615, "learning_rate": 3.7816238745388213e-06, "loss": 0.8624481, "num_input_tokens_seen": 30849935, "step": 1461, "time_per_iteration": 2.5845611095428467 }, { "auxiliary_loss_clip": 0.01200645, "auxiliary_loss_mlp": 0.0103381, "balance_loss_clip": 1.05853176, "balance_loss_mlp": 1.02432132, "epoch": 0.17579510611435098, "flos": 25732939881600.0, "grad_norm": 1.8750138804707794, "language_loss": 0.87164271, "learning_rate": 3.781269796354367e-06, "loss": 0.89398724, "num_input_tokens_seen": 30869555, "step": 1462, "time_per_iteration": 2.5806846618652344 }, { "auxiliary_loss_clip": 0.01195185, "auxiliary_loss_mlp": 0.01037909, "balance_loss_clip": 1.06106198, "balance_loss_mlp": 1.02833033, "epoch": 0.1759153490049901, "flos": 18588081870720.0, "grad_norm": 1.947386897657048, "language_loss": 0.86154115, "learning_rate": 3.7809154479527006e-06, "loss": 0.88387203, "num_input_tokens_seen": 30888760, "step": 1463, "time_per_iteration": 2.5226638317108154 }, { "auxiliary_loss_clip": 0.01169522, "auxiliary_loss_mlp": 0.01028578, "balance_loss_clip": 1.05673873, "balance_loss_mlp": 1.01911306, "epoch": 0.17603559189562917, "flos": 18619323724800.0, "grad_norm": 2.001769956131492, "language_loss": 0.84472024, "learning_rate": 3.780560829387577e-06, "loss": 0.86670125, "num_input_tokens_seen": 30907260, "step": 1464, "time_per_iteration": 2.554675579071045 }, { "auxiliary_loss_clip": 0.01128691, "auxiliary_loss_mlp": 0.01005733, "balance_loss_clip": 1.0426625, "balance_loss_mlp": 1.00286007, "epoch": 0.17615583478626826, "flos": 60530775373440.0, "grad_norm": 0.8495114524484088, "language_loss": 0.57915521, "learning_rate": 3.7802059407127915e-06, "loss": 0.60049939, "num_input_tokens_seen": 30965810, "step": 1465, "time_per_iteration": 3.0502445697784424 }, { "auxiliary_loss_clip": 0.01186942, "auxiliary_loss_mlp": 0.01040673, "balance_loss_clip": 1.05628407, "balance_loss_mlp": 1.03075504, "epoch": 0.17627607767690734, "flos": 23616221362560.0, "grad_norm": 2.2109287551385486, "language_loss": 0.85917604, "learning_rate": 3.7798507819821797e-06, "loss": 0.8814522, "num_input_tokens_seen": 30982935, "step": 1466, "time_per_iteration": 2.5486092567443848 }, { "auxiliary_loss_clip": 0.01173108, "auxiliary_loss_mlp": 0.0104459, "balance_loss_clip": 1.05823112, "balance_loss_mlp": 1.03389132, "epoch": 0.17639632056754645, "flos": 17639070589440.0, "grad_norm": 2.0832754892882273, "language_loss": 0.78466392, "learning_rate": 3.7794953532496197e-06, "loss": 0.8068409, "num_input_tokens_seen": 30998840, "step": 1467, "time_per_iteration": 4.221733093261719 }, { "auxiliary_loss_clip": 0.01071842, "auxiliary_loss_mlp": 0.00754799, "balance_loss_clip": 1.03470135, "balance_loss_mlp": 0.99977905, "epoch": 0.17651656345818553, "flos": 57932604910080.0, "grad_norm": 0.855341136007038, "language_loss": 0.57919025, "learning_rate": 3.7791396545690295e-06, "loss": 0.59745669, "num_input_tokens_seen": 31060075, "step": 1468, "time_per_iteration": 3.8507862091064453 }, { "auxiliary_loss_clip": 0.01210613, "auxiliary_loss_mlp": 0.01038874, "balance_loss_clip": 1.06518435, "balance_loss_mlp": 1.02934897, "epoch": 0.17663680634882462, "flos": 22929502170240.0, "grad_norm": 2.158932158853507, "language_loss": 0.81077164, "learning_rate": 3.7787836859943685e-06, "loss": 0.8332665, "num_input_tokens_seen": 31078800, "step": 1469, "time_per_iteration": 2.529515504837036 }, { "auxiliary_loss_clip": 0.01209728, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.06318355, "balance_loss_mlp": 1.02758694, "epoch": 0.17675704923946373, "flos": 22637979388800.0, "grad_norm": 2.278284758983468, "language_loss": 0.78876251, "learning_rate": 3.7784274475796363e-06, "loss": 0.81123972, "num_input_tokens_seen": 31097430, "step": 1470, "time_per_iteration": 2.5154569149017334 }, { "auxiliary_loss_clip": 0.01178776, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.05563188, "balance_loss_mlp": 1.02638745, "epoch": 0.1768772921301028, "flos": 27126525795840.0, "grad_norm": 1.9951574275723658, "language_loss": 0.7625432, "learning_rate": 3.7780709393788745e-06, "loss": 0.78469396, "num_input_tokens_seen": 31117905, "step": 1471, "time_per_iteration": 2.7140955924987793 }, { "auxiliary_loss_clip": 0.01222892, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 1.06321156, "balance_loss_mlp": 1.02297306, "epoch": 0.1769975350207419, "flos": 19172133014400.0, "grad_norm": 4.366009816233025, "language_loss": 0.75224197, "learning_rate": 3.777714161446165e-06, "loss": 0.77480567, "num_input_tokens_seen": 31137610, "step": 1472, "time_per_iteration": 2.4866795539855957 }, { "auxiliary_loss_clip": 0.01208347, "auxiliary_loss_mlp": 0.01030992, "balance_loss_clip": 1.06228113, "balance_loss_mlp": 1.02123463, "epoch": 0.177117777911381, "flos": 36134932291200.0, "grad_norm": 2.1389740925599288, "language_loss": 0.69545299, "learning_rate": 3.7773571138356304e-06, "loss": 0.71784639, "num_input_tokens_seen": 31157780, "step": 1473, "time_per_iteration": 2.6275858879089355 }, { "auxiliary_loss_clip": 0.01150146, "auxiliary_loss_mlp": 0.01028438, "balance_loss_clip": 1.0546205, "balance_loss_mlp": 1.01954544, "epoch": 0.17723802080202009, "flos": 22090593052800.0, "grad_norm": 2.5795402931985296, "language_loss": 0.89260077, "learning_rate": 3.776999796601435e-06, "loss": 0.91438657, "num_input_tokens_seen": 31176540, "step": 1474, "time_per_iteration": 2.667428493499756 }, { "auxiliary_loss_clip": 0.01214343, "auxiliary_loss_mlp": 0.01036738, "balance_loss_clip": 1.06284189, "balance_loss_mlp": 1.02686131, "epoch": 0.17735826369265917, "flos": 30222671437440.0, "grad_norm": 2.1946095154893928, "language_loss": 0.7291131, "learning_rate": 3.776642209797783e-06, "loss": 0.75162393, "num_input_tokens_seen": 31198370, "step": 1475, "time_per_iteration": 2.5917141437530518 }, { "auxiliary_loss_clip": 0.01204982, "auxiliary_loss_mlp": 0.01032309, "balance_loss_clip": 1.05961418, "balance_loss_mlp": 1.02152681, "epoch": 0.17747850658329825, "flos": 21397588980480.0, "grad_norm": 2.429136957104276, "language_loss": 0.78360379, "learning_rate": 3.7762843534789205e-06, "loss": 0.80597675, "num_input_tokens_seen": 31217120, "step": 1476, "time_per_iteration": 2.5454788208007812 }, { "auxiliary_loss_clip": 0.01198183, "auxiliary_loss_mlp": 0.01035271, "balance_loss_clip": 1.05950713, "balance_loss_mlp": 1.02564454, "epoch": 0.17759874947393736, "flos": 16983341856000.0, "grad_norm": 2.0581964809595625, "language_loss": 0.88305044, "learning_rate": 3.7759262276991343e-06, "loss": 0.90538502, "num_input_tokens_seen": 31234730, "step": 1477, "time_per_iteration": 2.535046339035034 }, { "auxiliary_loss_clip": 0.01200286, "auxiliary_loss_mlp": 0.01032532, "balance_loss_clip": 1.06151652, "balance_loss_mlp": 1.02220833, "epoch": 0.17771899236457644, "flos": 11546107390080.0, "grad_norm": 2.3361819428161628, "language_loss": 0.80585402, "learning_rate": 3.7755678325127506e-06, "loss": 0.82818222, "num_input_tokens_seen": 31252410, "step": 1478, "time_per_iteration": 2.5008137226104736 }, { "auxiliary_loss_clip": 0.011603, "auxiliary_loss_mlp": 0.01032603, "balance_loss_clip": 1.05909979, "balance_loss_mlp": 1.02286983, "epoch": 0.17783923525521553, "flos": 18807747494400.0, "grad_norm": 1.713092054486288, "language_loss": 0.75895846, "learning_rate": 3.7752091679741393e-06, "loss": 0.78088742, "num_input_tokens_seen": 31270200, "step": 1479, "time_per_iteration": 2.5827689170837402 }, { "auxiliary_loss_clip": 0.01207346, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 1.06191838, "balance_loss_mlp": 1.02300429, "epoch": 0.17795947814585464, "flos": 30408365773440.0, "grad_norm": 4.302458204812137, "language_loss": 0.77533615, "learning_rate": 3.774850234137708e-06, "loss": 0.7977432, "num_input_tokens_seen": 31287495, "step": 1480, "time_per_iteration": 2.7176499366760254 }, { "auxiliary_loss_clip": 0.01206287, "auxiliary_loss_mlp": 0.01037276, "balance_loss_clip": 1.06163311, "balance_loss_mlp": 1.02706623, "epoch": 0.17807972103649372, "flos": 24389055411840.0, "grad_norm": 2.197802986589133, "language_loss": 0.82569242, "learning_rate": 3.7744910310579076e-06, "loss": 0.84812808, "num_input_tokens_seen": 31306420, "step": 1481, "time_per_iteration": 2.55305552482605 }, { "auxiliary_loss_clip": 0.01225942, "auxiliary_loss_mlp": 0.01031003, "balance_loss_clip": 1.06754196, "balance_loss_mlp": 1.02240181, "epoch": 0.1781999639271328, "flos": 20301559332480.0, "grad_norm": 2.1169771261957395, "language_loss": 0.85378468, "learning_rate": 3.774131558789229e-06, "loss": 0.87635422, "num_input_tokens_seen": 31325750, "step": 1482, "time_per_iteration": 2.4669079780578613 }, { "auxiliary_loss_clip": 0.01224661, "auxiliary_loss_mlp": 0.00764944, "balance_loss_clip": 1.06543303, "balance_loss_mlp": 1.00091124, "epoch": 0.1783202068177719, "flos": 15924479806080.0, "grad_norm": 2.5119747375358408, "language_loss": 0.69803882, "learning_rate": 3.773771817386203e-06, "loss": 0.71793485, "num_input_tokens_seen": 31343080, "step": 1483, "time_per_iteration": 2.470374822616577 }, { "auxiliary_loss_clip": 0.01192895, "auxiliary_loss_mlp": 0.01032248, "balance_loss_clip": 1.06067371, "balance_loss_mlp": 1.02302694, "epoch": 0.178440449708411, "flos": 20631758083200.0, "grad_norm": 1.4890868257939718, "language_loss": 0.79691088, "learning_rate": 3.773411806903403e-06, "loss": 0.81916225, "num_input_tokens_seen": 31362160, "step": 1484, "time_per_iteration": 2.5353639125823975 }, { "auxiliary_loss_clip": 0.01152099, "auxiliary_loss_mlp": 0.01036826, "balance_loss_clip": 1.05368328, "balance_loss_mlp": 1.02650261, "epoch": 0.17856069259905008, "flos": 21686059105920.0, "grad_norm": 1.6847964208177657, "language_loss": 0.94846934, "learning_rate": 3.7730515273954415e-06, "loss": 0.97035861, "num_input_tokens_seen": 31380770, "step": 1485, "time_per_iteration": 2.637413263320923 }, { "auxiliary_loss_clip": 0.01224606, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.06681061, "balance_loss_mlp": 1.0253253, "epoch": 0.17868093548968916, "flos": 26572962320640.0, "grad_norm": 1.8637740361500366, "language_loss": 0.8491075, "learning_rate": 3.772690978916973e-06, "loss": 0.87169778, "num_input_tokens_seen": 31400525, "step": 1486, "time_per_iteration": 2.5730369091033936 }, { "auxiliary_loss_clip": 0.01209227, "auxiliary_loss_mlp": 0.01038043, "balance_loss_clip": 1.06364071, "balance_loss_mlp": 1.02794051, "epoch": 0.17880117838032827, "flos": 18581006891520.0, "grad_norm": 1.9737619373313253, "language_loss": 0.86346591, "learning_rate": 3.772330161522693e-06, "loss": 0.88593858, "num_input_tokens_seen": 31418435, "step": 1487, "time_per_iteration": 3.3004579544067383 }, { "auxiliary_loss_clip": 0.01193917, "auxiliary_loss_mlp": 0.01037744, "balance_loss_clip": 1.06570292, "balance_loss_mlp": 1.0276525, "epoch": 0.17892142127096736, "flos": 26541217676160.0, "grad_norm": 1.9298624089719283, "language_loss": 0.80043507, "learning_rate": 3.7719690752673365e-06, "loss": 0.82275164, "num_input_tokens_seen": 31439230, "step": 1488, "time_per_iteration": 2.592899799346924 }, { "auxiliary_loss_clip": 0.01183723, "auxiliary_loss_mlp": 0.01036004, "balance_loss_clip": 1.06366706, "balance_loss_mlp": 1.02635431, "epoch": 0.17904166416160644, "flos": 23872623621120.0, "grad_norm": 2.047057719401131, "language_loss": 0.7816599, "learning_rate": 3.7716077202056796e-06, "loss": 0.80385721, "num_input_tokens_seen": 31457705, "step": 1489, "time_per_iteration": 2.610059976577759 }, { "auxiliary_loss_clip": 0.01179867, "auxiliary_loss_mlp": 0.01033833, "balance_loss_clip": 1.05894876, "balance_loss_mlp": 1.02436733, "epoch": 0.17916190705224552, "flos": 19134426712320.0, "grad_norm": 2.740640488183298, "language_loss": 0.93789661, "learning_rate": 3.7712460963925404e-06, "loss": 0.9600336, "num_input_tokens_seen": 31473645, "step": 1490, "time_per_iteration": 2.505293846130371 }, { "auxiliary_loss_clip": 0.01185243, "auxiliary_loss_mlp": 0.01031464, "balance_loss_clip": 1.05834007, "balance_loss_mlp": 1.02176023, "epoch": 0.17928214994288463, "flos": 25152120961920.0, "grad_norm": 2.267990210001713, "language_loss": 0.75221699, "learning_rate": 3.7708842038827775e-06, "loss": 0.77438414, "num_input_tokens_seen": 31492605, "step": 1491, "time_per_iteration": 2.5933423042297363 }, { "auxiliary_loss_clip": 0.01207974, "auxiliary_loss_mlp": 0.01035046, "balance_loss_clip": 1.06115985, "balance_loss_mlp": 1.02586687, "epoch": 0.17940239283352372, "flos": 22384629786240.0, "grad_norm": 1.8536466907740017, "language_loss": 0.85825455, "learning_rate": 3.770522042731288e-06, "loss": 0.88068473, "num_input_tokens_seen": 31514500, "step": 1492, "time_per_iteration": 2.542797803878784 }, { "auxiliary_loss_clip": 0.01156358, "auxiliary_loss_mlp": 0.01046, "balance_loss_clip": 1.05834937, "balance_loss_mlp": 1.03580189, "epoch": 0.1795226357241628, "flos": 23178685795200.0, "grad_norm": 1.7988507727588825, "language_loss": 0.87881899, "learning_rate": 3.7701596129930122e-06, "loss": 0.90084255, "num_input_tokens_seen": 31533225, "step": 1493, "time_per_iteration": 4.280670404434204 }, { "auxiliary_loss_clip": 0.01187977, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.06107438, "balance_loss_mlp": 1.02035737, "epoch": 0.1796428786148019, "flos": 22090413484800.0, "grad_norm": 2.064594396199253, "language_loss": 0.73572993, "learning_rate": 3.7697969147229315e-06, "loss": 0.75792074, "num_input_tokens_seen": 31551385, "step": 1494, "time_per_iteration": 2.570464611053467 }, { "auxiliary_loss_clip": 0.01205212, "auxiliary_loss_mlp": 0.01035446, "balance_loss_clip": 1.06118536, "balance_loss_mlp": 1.02566493, "epoch": 0.179763121505441, "flos": 21324618501120.0, "grad_norm": 2.103074518000646, "language_loss": 0.85422015, "learning_rate": 3.7694339479760647e-06, "loss": 0.87662673, "num_input_tokens_seen": 31570415, "step": 1495, "time_per_iteration": 3.258523464202881 }, { "auxiliary_loss_clip": 0.01113256, "auxiliary_loss_mlp": 0.01003051, "balance_loss_clip": 1.03997636, "balance_loss_mlp": 1.00044, "epoch": 0.17988336439608008, "flos": 68161864815360.0, "grad_norm": 4.514920265727813, "language_loss": 0.57378691, "learning_rate": 3.769070712807476e-06, "loss": 0.5949499, "num_input_tokens_seen": 31632445, "step": 1496, "time_per_iteration": 3.197551965713501 }, { "auxiliary_loss_clip": 0.01137732, "auxiliary_loss_mlp": 0.01038205, "balance_loss_clip": 1.05715156, "balance_loss_mlp": 1.02822709, "epoch": 0.18000360728671919, "flos": 21945047143680.0, "grad_norm": 1.764423469370373, "language_loss": 0.7895304, "learning_rate": 3.768707209272266e-06, "loss": 0.81128979, "num_input_tokens_seen": 31652575, "step": 1497, "time_per_iteration": 2.6191747188568115 }, { "auxiliary_loss_clip": 0.01190722, "auxiliary_loss_mlp": 0.01037318, "balance_loss_clip": 1.06033218, "balance_loss_mlp": 1.02738142, "epoch": 0.18012385017735827, "flos": 18986330937600.0, "grad_norm": 2.0468522117321974, "language_loss": 0.76846707, "learning_rate": 3.768343437425579e-06, "loss": 0.79074752, "num_input_tokens_seen": 31671145, "step": 1498, "time_per_iteration": 2.5120108127593994 }, { "auxiliary_loss_clip": 0.01126844, "auxiliary_loss_mlp": 0.010336, "balance_loss_clip": 1.05371094, "balance_loss_mlp": 1.0236342, "epoch": 0.18024409306799735, "flos": 19748103598080.0, "grad_norm": 2.3269925917180316, "language_loss": 0.8611154, "learning_rate": 3.7679793973225987e-06, "loss": 0.88271976, "num_input_tokens_seen": 31686955, "step": 1499, "time_per_iteration": 2.649259090423584 }, { "auxiliary_loss_clip": 0.0107695, "auxiliary_loss_mlp": 0.01003823, "balance_loss_clip": 1.03430009, "balance_loss_mlp": 1.00114083, "epoch": 0.18036433595863643, "flos": 67227183060480.0, "grad_norm": 0.8480529418827248, "language_loss": 0.61628759, "learning_rate": 3.767615089018549e-06, "loss": 0.63709533, "num_input_tokens_seen": 31749300, "step": 1500, "time_per_iteration": 3.1692075729370117 }, { "auxiliary_loss_clip": 0.01188465, "auxiliary_loss_mlp": 0.01036555, "balance_loss_clip": 1.05985451, "balance_loss_mlp": 1.02629113, "epoch": 0.18048457884927555, "flos": 18181464935040.0, "grad_norm": 2.2498618680409166, "language_loss": 0.86419702, "learning_rate": 3.7672505125686966e-06, "loss": 0.88644719, "num_input_tokens_seen": 31765665, "step": 1501, "time_per_iteration": 2.505552053451538 }, { "auxiliary_loss_clip": 0.01164955, "auxiliary_loss_mlp": 0.01037045, "balance_loss_clip": 1.05516553, "balance_loss_mlp": 1.02716851, "epoch": 0.18060482173991463, "flos": 15813767111040.0, "grad_norm": 3.0057506041281044, "language_loss": 0.84138823, "learning_rate": 3.7668856680283455e-06, "loss": 0.86340821, "num_input_tokens_seen": 31782690, "step": 1502, "time_per_iteration": 2.561558246612549 }, { "auxiliary_loss_clip": 0.01199503, "auxiliary_loss_mlp": 0.01037237, "balance_loss_clip": 1.06205773, "balance_loss_mlp": 1.02741385, "epoch": 0.1807250646305537, "flos": 18587399512320.0, "grad_norm": 1.7969620063242147, "language_loss": 0.82497483, "learning_rate": 3.7665205554528437e-06, "loss": 0.84734225, "num_input_tokens_seen": 31802045, "step": 1503, "time_per_iteration": 2.5025320053100586 }, { "auxiliary_loss_clip": 0.01199605, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.06707644, "balance_loss_mlp": 1.02281022, "epoch": 0.18084530752119282, "flos": 23149131880320.0, "grad_norm": 1.7578252908519156, "language_loss": 0.74376667, "learning_rate": 3.7661551748975782e-06, "loss": 0.76608825, "num_input_tokens_seen": 31820220, "step": 1504, "time_per_iteration": 2.5450692176818848 }, { "auxiliary_loss_clip": 0.01104328, "auxiliary_loss_mlp": 0.01003541, "balance_loss_clip": 1.03164744, "balance_loss_mlp": 1.00079918, "epoch": 0.1809655504118319, "flos": 59803153568640.0, "grad_norm": 0.9104568629095467, "language_loss": 0.60551387, "learning_rate": 3.7657895264179772e-06, "loss": 0.62659252, "num_input_tokens_seen": 31876195, "step": 1505, "time_per_iteration": 3.0797336101531982 }, { "auxiliary_loss_clip": 0.01184072, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.05677164, "balance_loss_mlp": 1.02409005, "epoch": 0.181085793302471, "flos": 44201941188480.0, "grad_norm": 2.9132779773083035, "language_loss": 0.74252313, "learning_rate": 3.765423610069509e-06, "loss": 0.76469862, "num_input_tokens_seen": 31901585, "step": 1506, "time_per_iteration": 2.7813596725463867 }, { "auxiliary_loss_clip": 0.01196176, "auxiliary_loss_mlp": 0.01037243, "balance_loss_clip": 1.06472647, "balance_loss_mlp": 1.02733707, "epoch": 0.18120603619311007, "flos": 34898384638080.0, "grad_norm": 1.830509107320071, "language_loss": 0.72516954, "learning_rate": 3.765057425907683e-06, "loss": 0.74750376, "num_input_tokens_seen": 31923045, "step": 1507, "time_per_iteration": 2.6557724475860596 }, { "auxiliary_loss_clip": 0.01210801, "auxiliary_loss_mlp": 0.01038067, "balance_loss_clip": 1.0616945, "balance_loss_mlp": 1.02778554, "epoch": 0.18132627908374918, "flos": 21506757390720.0, "grad_norm": 1.8188248935205904, "language_loss": 0.78347516, "learning_rate": 3.764690973988048e-06, "loss": 0.80596387, "num_input_tokens_seen": 31943385, "step": 1508, "time_per_iteration": 2.527029037475586 }, { "auxiliary_loss_clip": 0.01181735, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.05971861, "balance_loss_mlp": 1.02176225, "epoch": 0.18144652197438826, "flos": 29057693633280.0, "grad_norm": 2.188100403863643, "language_loss": 0.74156475, "learning_rate": 3.7643242543661967e-06, "loss": 0.76369631, "num_input_tokens_seen": 31966045, "step": 1509, "time_per_iteration": 2.610428810119629 }, { "auxiliary_loss_clip": 0.0109427, "auxiliary_loss_mlp": 0.01010746, "balance_loss_clip": 1.02890992, "balance_loss_mlp": 1.00825465, "epoch": 0.18156676486502735, "flos": 68675064382080.0, "grad_norm": 0.8151927676223004, "language_loss": 0.60522521, "learning_rate": 3.7639572670977573e-06, "loss": 0.6262753, "num_input_tokens_seen": 32021540, "step": 1510, "time_per_iteration": 3.000666618347168 }, { "auxiliary_loss_clip": 0.01179659, "auxiliary_loss_mlp": 0.01038989, "balance_loss_clip": 1.05718935, "balance_loss_mlp": 1.02893996, "epoch": 0.18168700775566646, "flos": 26471515334400.0, "grad_norm": 1.5609653937695833, "language_loss": 0.76758742, "learning_rate": 3.7635900122384042e-06, "loss": 0.78977394, "num_input_tokens_seen": 32044535, "step": 1511, "time_per_iteration": 2.663465738296509 }, { "auxiliary_loss_clip": 0.01195938, "auxiliary_loss_mlp": 0.0104001, "balance_loss_clip": 1.05869639, "balance_loss_mlp": 1.0290246, "epoch": 0.18180725064630554, "flos": 15005668884480.0, "grad_norm": 3.4339188296995973, "language_loss": 0.8700766, "learning_rate": 3.7632224898438477e-06, "loss": 0.89243615, "num_input_tokens_seen": 32061010, "step": 1512, "time_per_iteration": 2.4977450370788574 }, { "auxiliary_loss_clip": 0.0118461, "auxiliary_loss_mlp": 0.0103333, "balance_loss_clip": 1.05840421, "balance_loss_mlp": 1.02357912, "epoch": 0.18192749353694462, "flos": 19682387665920.0, "grad_norm": 1.4877012113122992, "language_loss": 0.79553455, "learning_rate": 3.762854699969842e-06, "loss": 0.81771398, "num_input_tokens_seen": 32081520, "step": 1513, "time_per_iteration": 3.386384963989258 }, { "auxiliary_loss_clip": 0.01206675, "auxiliary_loss_mlp": 0.01041821, "balance_loss_clip": 1.0639956, "balance_loss_mlp": 1.0308243, "epoch": 0.1820477364275837, "flos": 20702717400960.0, "grad_norm": 1.8971641765236287, "language_loss": 0.73319352, "learning_rate": 3.762486642672179e-06, "loss": 0.75567847, "num_input_tokens_seen": 32098460, "step": 1514, "time_per_iteration": 2.507464647293091 }, { "auxiliary_loss_clip": 0.01190944, "auxiliary_loss_mlp": 0.01038044, "balance_loss_clip": 1.05835521, "balance_loss_mlp": 1.0280962, "epoch": 0.18216797931822282, "flos": 17128708197120.0, "grad_norm": 2.0553824890871986, "language_loss": 0.87007093, "learning_rate": 3.7621183180066946e-06, "loss": 0.89236081, "num_input_tokens_seen": 32116420, "step": 1515, "time_per_iteration": 2.5627338886260986 }, { "auxiliary_loss_clip": 0.01191499, "auxiliary_loss_mlp": 0.01035312, "balance_loss_clip": 1.05834126, "balance_loss_mlp": 1.02504253, "epoch": 0.1822882222088619, "flos": 29242561956480.0, "grad_norm": 1.5215141883411794, "language_loss": 0.7374227, "learning_rate": 3.7617497260292625e-06, "loss": 0.75969082, "num_input_tokens_seen": 32138475, "step": 1516, "time_per_iteration": 2.6125621795654297 }, { "auxiliary_loss_clip": 0.01187473, "auxiliary_loss_mlp": 0.01034012, "balance_loss_clip": 1.06143785, "balance_loss_mlp": 1.02330732, "epoch": 0.18240846509950098, "flos": 17702739446400.0, "grad_norm": 2.605499572940373, "language_loss": 0.78701055, "learning_rate": 3.7613808667957967e-06, "loss": 0.80922544, "num_input_tokens_seen": 32151165, "step": 1517, "time_per_iteration": 2.476233720779419 }, { "auxiliary_loss_clip": 0.01194993, "auxiliary_loss_mlp": 0.01044304, "balance_loss_clip": 1.06038189, "balance_loss_mlp": 1.03433812, "epoch": 0.1825287079901401, "flos": 14790025584000.0, "grad_norm": 2.1357697738897183, "language_loss": 0.90924817, "learning_rate": 3.7610117403622547e-06, "loss": 0.93164116, "num_input_tokens_seen": 32167725, "step": 1518, "time_per_iteration": 2.5070536136627197 }, { "auxiliary_loss_clip": 0.01169235, "auxiliary_loss_mlp": 0.01040558, "balance_loss_clip": 1.05352306, "balance_loss_mlp": 1.0300144, "epoch": 0.18264895088077918, "flos": 21946232292480.0, "grad_norm": 1.681642278010146, "language_loss": 0.89922565, "learning_rate": 3.7606423467846313e-06, "loss": 0.92132354, "num_input_tokens_seen": 32187330, "step": 1519, "time_per_iteration": 3.410222291946411 }, { "auxiliary_loss_clip": 0.01184287, "auxiliary_loss_mlp": 0.01040141, "balance_loss_clip": 1.06102538, "balance_loss_mlp": 1.03022909, "epoch": 0.18276919377141826, "flos": 20886759711360.0, "grad_norm": 1.434769108150616, "language_loss": 0.7946586, "learning_rate": 3.760272686118964e-06, "loss": 0.81690288, "num_input_tokens_seen": 32205550, "step": 1520, "time_per_iteration": 3.417752981185913 }, { "auxiliary_loss_clip": 0.01193656, "auxiliary_loss_mlp": 0.01038381, "balance_loss_clip": 1.05914736, "balance_loss_mlp": 1.02868962, "epoch": 0.18288943666205737, "flos": 21469877101440.0, "grad_norm": 2.016186871273352, "language_loss": 0.92624295, "learning_rate": 3.7599027584213297e-06, "loss": 0.94856334, "num_input_tokens_seen": 32224430, "step": 1521, "time_per_iteration": 3.2623822689056396 }, { "auxiliary_loss_clip": 0.01211489, "auxiliary_loss_mlp": 0.01038563, "balance_loss_clip": 1.06056035, "balance_loss_mlp": 1.02835262, "epoch": 0.18300967955269645, "flos": 21539363961600.0, "grad_norm": 1.9658953811751863, "language_loss": 0.7793622, "learning_rate": 3.7595325637478465e-06, "loss": 0.80186266, "num_input_tokens_seen": 32242455, "step": 1522, "time_per_iteration": 2.48207950592041 }, { "auxiliary_loss_clip": 0.01183828, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.06006742, "balance_loss_mlp": 1.03318357, "epoch": 0.18312992244333554, "flos": 28876237102080.0, "grad_norm": 2.0363307076798165, "language_loss": 0.81768596, "learning_rate": 3.7591621021546723e-06, "loss": 0.83997154, "num_input_tokens_seen": 32264450, "step": 1523, "time_per_iteration": 2.598876714706421 }, { "auxiliary_loss_clip": 0.01198881, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.05824828, "balance_loss_mlp": 1.02474356, "epoch": 0.18325016533397462, "flos": 20120102801280.0, "grad_norm": 1.7008505788703276, "language_loss": 0.81537443, "learning_rate": 3.7587913736980062e-06, "loss": 0.83772361, "num_input_tokens_seen": 32284090, "step": 1524, "time_per_iteration": 2.50065016746521 }, { "auxiliary_loss_clip": 0.01130617, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.04904747, "balance_loss_mlp": 1.02687693, "epoch": 0.18337040822461373, "flos": 23329187781120.0, "grad_norm": 1.6063378258604248, "language_loss": 0.84336019, "learning_rate": 3.7584203784340865e-06, "loss": 0.86503613, "num_input_tokens_seen": 32303260, "step": 1525, "time_per_iteration": 2.6171212196350098 }, { "auxiliary_loss_clip": 0.01187262, "auxiliary_loss_mlp": 0.01037775, "balance_loss_clip": 1.05623412, "balance_loss_mlp": 1.02760017, "epoch": 0.1834906511152528, "flos": 25009555881600.0, "grad_norm": 2.1812027389513156, "language_loss": 0.85733509, "learning_rate": 3.7580491164191938e-06, "loss": 0.8795855, "num_input_tokens_seen": 32321570, "step": 1526, "time_per_iteration": 2.5753986835479736 }, { "auxiliary_loss_clip": 0.01109682, "auxiliary_loss_mlp": 0.01004203, "balance_loss_clip": 1.02729988, "balance_loss_mlp": 1.00154424, "epoch": 0.1836108940058919, "flos": 67251493589760.0, "grad_norm": 0.8076315705857933, "language_loss": 0.61306846, "learning_rate": 3.757677587709648e-06, "loss": 0.63420737, "num_input_tokens_seen": 32384835, "step": 1527, "time_per_iteration": 3.209385871887207 }, { "auxiliary_loss_clip": 0.01172461, "auxiliary_loss_mlp": 0.01036698, "balance_loss_clip": 1.05861878, "balance_loss_mlp": 1.02675009, "epoch": 0.183731136896531, "flos": 25738721971200.0, "grad_norm": 2.1933410267974964, "language_loss": 0.75526786, "learning_rate": 3.7573057923618095e-06, "loss": 0.77735949, "num_input_tokens_seen": 32404930, "step": 1528, "time_per_iteration": 2.6907362937927246 }, { "auxiliary_loss_clip": 0.01158788, "auxiliary_loss_mlp": 0.01034745, "balance_loss_clip": 1.05195332, "balance_loss_mlp": 1.02396226, "epoch": 0.1838513797871701, "flos": 20449403712000.0, "grad_norm": 2.6770094243595057, "language_loss": 0.74216211, "learning_rate": 3.7569337304320793e-06, "loss": 0.76409739, "num_input_tokens_seen": 32424515, "step": 1529, "time_per_iteration": 2.6035349369049072 }, { "auxiliary_loss_clip": 0.01091968, "auxiliary_loss_mlp": 0.01002657, "balance_loss_clip": 1.02373588, "balance_loss_mlp": 1.00010586, "epoch": 0.18397162267780917, "flos": 68565141786240.0, "grad_norm": 2.002893437752971, "language_loss": 0.64455068, "learning_rate": 3.756561401976899e-06, "loss": 0.66549695, "num_input_tokens_seen": 32484220, "step": 1530, "time_per_iteration": 2.99403715133667 }, { "auxiliary_loss_clip": 0.01224167, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.06396341, "balance_loss_mlp": 1.02453113, "epoch": 0.18409186556844825, "flos": 31941104976000.0, "grad_norm": 2.902315012913277, "language_loss": 0.82708448, "learning_rate": 3.7561888070527514e-06, "loss": 0.84967256, "num_input_tokens_seen": 32506260, "step": 1531, "time_per_iteration": 2.581179618835449 }, { "auxiliary_loss_clip": 0.01159704, "auxiliary_loss_mlp": 0.00764749, "balance_loss_clip": 1.05558276, "balance_loss_mlp": 1.00165319, "epoch": 0.18421210845908736, "flos": 20120533764480.0, "grad_norm": 2.304512789812378, "language_loss": 0.79688239, "learning_rate": 3.7558159457161577e-06, "loss": 0.81612694, "num_input_tokens_seen": 32524225, "step": 1532, "time_per_iteration": 2.5616469383239746 }, { "auxiliary_loss_clip": 0.01195696, "auxiliary_loss_mlp": 0.00765474, "balance_loss_clip": 1.06339979, "balance_loss_mlp": 1.00163865, "epoch": 0.18433235134972645, "flos": 23110491824640.0, "grad_norm": 2.171485483059511, "language_loss": 0.78154618, "learning_rate": 3.755442818023681e-06, "loss": 0.80115783, "num_input_tokens_seen": 32543850, "step": 1533, "time_per_iteration": 2.5394325256347656 }, { "auxiliary_loss_clip": 0.01180838, "auxiliary_loss_mlp": 0.01032199, "balance_loss_clip": 1.0600965, "balance_loss_mlp": 1.02269816, "epoch": 0.18445259424036553, "flos": 18291351617280.0, "grad_norm": 2.746214856149023, "language_loss": 0.75996596, "learning_rate": 3.7550694240319246e-06, "loss": 0.78209633, "num_input_tokens_seen": 32561725, "step": 1534, "time_per_iteration": 2.5386745929718018 }, { "auxiliary_loss_clip": 0.01209616, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.06069005, "balance_loss_mlp": 1.02284217, "epoch": 0.18457283713100464, "flos": 21324079797120.0, "grad_norm": 2.347459196984291, "language_loss": 0.76074064, "learning_rate": 3.7546957637975326e-06, "loss": 0.78316116, "num_input_tokens_seen": 32579135, "step": 1535, "time_per_iteration": 2.4987564086914062 }, { "auxiliary_loss_clip": 0.01133956, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.04652584, "balance_loss_mlp": 1.02533722, "epoch": 0.18469308002164372, "flos": 20375679047040.0, "grad_norm": 1.5648573358390365, "language_loss": 0.73890042, "learning_rate": 3.7543218373771873e-06, "loss": 0.76059109, "num_input_tokens_seen": 32598460, "step": 1536, "time_per_iteration": 2.6410598754882812 }, { "auxiliary_loss_clip": 0.01138834, "auxiliary_loss_mlp": 0.00764842, "balance_loss_clip": 1.05341995, "balance_loss_mlp": 1.00161314, "epoch": 0.1848133229122828, "flos": 26435892021120.0, "grad_norm": 1.4305178161697758, "language_loss": 0.78245389, "learning_rate": 3.753947644827615e-06, "loss": 0.8014906, "num_input_tokens_seen": 32621920, "step": 1537, "time_per_iteration": 2.7086331844329834 }, { "auxiliary_loss_clip": 0.01097863, "auxiliary_loss_mlp": 0.01007098, "balance_loss_clip": 1.02693629, "balance_loss_mlp": 1.00420141, "epoch": 0.1849335658029219, "flos": 70547447612160.0, "grad_norm": 0.9466289929534218, "language_loss": 0.57233226, "learning_rate": 3.753573186205579e-06, "loss": 0.59338188, "num_input_tokens_seen": 32690040, "step": 1538, "time_per_iteration": 3.2292301654815674 }, { "auxiliary_loss_clip": 0.01178675, "auxiliary_loss_mlp": 0.00764978, "balance_loss_clip": 1.05461168, "balance_loss_mlp": 1.00152361, "epoch": 0.185053808693561, "flos": 17384140788480.0, "grad_norm": 2.3997511599309775, "language_loss": 0.77776992, "learning_rate": 3.753198461567885e-06, "loss": 0.79720652, "num_input_tokens_seen": 32707285, "step": 1539, "time_per_iteration": 2.5214316844940186 }, { "auxiliary_loss_clip": 0.01172123, "auxiliary_loss_mlp": 0.01039535, "balance_loss_clip": 1.06048667, "balance_loss_mlp": 1.0299505, "epoch": 0.18517405158420008, "flos": 28986159697920.0, "grad_norm": 1.7380681267525369, "language_loss": 0.91797101, "learning_rate": 3.7528234709713783e-06, "loss": 0.94008756, "num_input_tokens_seen": 32730030, "step": 1540, "time_per_iteration": 3.4087469577789307 }, { "auxiliary_loss_clip": 0.01207114, "auxiliary_loss_mlp": 0.01037142, "balance_loss_clip": 1.06225419, "balance_loss_mlp": 1.02755165, "epoch": 0.18529429447483917, "flos": 26794962328320.0, "grad_norm": 2.1009670548790442, "language_loss": 0.84451687, "learning_rate": 3.7524482144729447e-06, "loss": 0.86695945, "num_input_tokens_seen": 32749485, "step": 1541, "time_per_iteration": 2.5261754989624023 }, { "auxiliary_loss_clip": 0.01169729, "auxiliary_loss_mlp": 0.01043812, "balance_loss_clip": 1.05321252, "balance_loss_mlp": 1.03370941, "epoch": 0.18541453736547828, "flos": 13581595301760.0, "grad_norm": 2.0002675676265924, "language_loss": 0.83764476, "learning_rate": 3.7520726921295106e-06, "loss": 0.85978013, "num_input_tokens_seen": 32766205, "step": 1542, "time_per_iteration": 2.537696123123169 }, { "auxiliary_loss_clip": 0.01200029, "auxiliary_loss_mlp": 0.01037473, "balance_loss_clip": 1.05551124, "balance_loss_mlp": 1.02770996, "epoch": 0.18553478025611736, "flos": 24025424077440.0, "grad_norm": 2.1540854720962472, "language_loss": 0.72559702, "learning_rate": 3.751696903998042e-06, "loss": 0.74797213, "num_input_tokens_seen": 32784840, "step": 1543, "time_per_iteration": 2.530993938446045 }, { "auxiliary_loss_clip": 0.01202769, "auxiliary_loss_mlp": 0.01035386, "balance_loss_clip": 1.06119204, "balance_loss_mlp": 1.02571845, "epoch": 0.18565502314675644, "flos": 25885165720320.0, "grad_norm": 1.8648186820523522, "language_loss": 0.70149606, "learning_rate": 3.7513208501355456e-06, "loss": 0.72387761, "num_input_tokens_seen": 32805945, "step": 1544, "time_per_iteration": 2.5216071605682373 }, { "auxiliary_loss_clip": 0.01184486, "auxiliary_loss_mlp": 0.01036935, "balance_loss_clip": 1.05612588, "balance_loss_mlp": 1.027601, "epoch": 0.18577526603739553, "flos": 19610063631360.0, "grad_norm": 1.9026032211099624, "language_loss": 0.83359313, "learning_rate": 3.750944530599069e-06, "loss": 0.85580736, "num_input_tokens_seen": 32825515, "step": 1545, "time_per_iteration": 2.5397531986236572 }, { "auxiliary_loss_clip": 0.01212013, "auxiliary_loss_mlp": 0.00765541, "balance_loss_clip": 1.06248677, "balance_loss_mlp": 1.00149095, "epoch": 0.18589550892803464, "flos": 18474891137280.0, "grad_norm": 2.4093407187983225, "language_loss": 0.80668128, "learning_rate": 3.7505679454456992e-06, "loss": 0.82645679, "num_input_tokens_seen": 32842125, "step": 1546, "time_per_iteration": 3.226943254470825 }, { "auxiliary_loss_clip": 0.01123606, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.04883432, "balance_loss_mlp": 1.02495944, "epoch": 0.18601575181867372, "flos": 23549966726400.0, "grad_norm": 1.9983446963873355, "language_loss": 0.69847667, "learning_rate": 3.750191094732564e-06, "loss": 0.72006321, "num_input_tokens_seen": 32862990, "step": 1547, "time_per_iteration": 3.718635320663452 }, { "auxiliary_loss_clip": 0.01124866, "auxiliary_loss_mlp": 0.0103661, "balance_loss_clip": 1.04918528, "balance_loss_mlp": 1.02711463, "epoch": 0.1861359947093128, "flos": 26360192108160.0, "grad_norm": 1.8191058941031168, "language_loss": 0.75083649, "learning_rate": 3.7498139785168313e-06, "loss": 0.77245122, "num_input_tokens_seen": 32883595, "step": 1548, "time_per_iteration": 3.481144428253174 }, { "auxiliary_loss_clip": 0.01202968, "auxiliary_loss_mlp": 0.01040031, "balance_loss_clip": 1.06250334, "balance_loss_mlp": 1.02975488, "epoch": 0.1862562375999519, "flos": 23331198942720.0, "grad_norm": 1.8334972983197446, "language_loss": 0.77460778, "learning_rate": 3.749436596855709e-06, "loss": 0.79703778, "num_input_tokens_seen": 32902895, "step": 1549, "time_per_iteration": 2.5085291862487793 }, { "auxiliary_loss_clip": 0.0119827, "auxiliary_loss_mlp": 0.01032316, "balance_loss_clip": 1.05699182, "balance_loss_mlp": 1.0219748, "epoch": 0.186376480490591, "flos": 16648222942080.0, "grad_norm": 2.4494272208097, "language_loss": 0.90518767, "learning_rate": 3.749058949806446e-06, "loss": 0.92749357, "num_input_tokens_seen": 32919620, "step": 1550, "time_per_iteration": 2.480605363845825 }, { "auxiliary_loss_clip": 0.0120421, "auxiliary_loss_mlp": 0.01031887, "balance_loss_clip": 1.05871773, "balance_loss_mlp": 1.02238619, "epoch": 0.18649672338123008, "flos": 21468656039040.0, "grad_norm": 1.570947577975064, "language_loss": 0.84367442, "learning_rate": 3.748681037426331e-06, "loss": 0.8660354, "num_input_tokens_seen": 32938830, "step": 1551, "time_per_iteration": 2.526944398880005 }, { "auxiliary_loss_clip": 0.01221474, "auxiliary_loss_mlp": 0.01039448, "balance_loss_clip": 1.06297278, "balance_loss_mlp": 1.03006029, "epoch": 0.1866169662718692, "flos": 12312728386560.0, "grad_norm": 2.2582317905945386, "language_loss": 0.91802633, "learning_rate": 3.7483028597726936e-06, "loss": 0.94063556, "num_input_tokens_seen": 32955600, "step": 1552, "time_per_iteration": 2.4482104778289795 }, { "auxiliary_loss_clip": 0.01172441, "auxiliary_loss_mlp": 0.01039889, "balance_loss_clip": 1.05642521, "balance_loss_mlp": 1.02983928, "epoch": 0.18673720916250827, "flos": 23581280407680.0, "grad_norm": 2.5686354009574552, "language_loss": 0.62850809, "learning_rate": 3.7479244169029017e-06, "loss": 0.65063143, "num_input_tokens_seen": 32975390, "step": 1553, "time_per_iteration": 2.60425066947937 }, { "auxiliary_loss_clip": 0.01205993, "auxiliary_loss_mlp": 0.01028614, "balance_loss_clip": 1.05682731, "balance_loss_mlp": 1.01936948, "epoch": 0.18685745205314735, "flos": 19718370115200.0, "grad_norm": 3.81416848843719, "language_loss": 0.73412538, "learning_rate": 3.7475457088743658e-06, "loss": 0.75647146, "num_input_tokens_seen": 32992640, "step": 1554, "time_per_iteration": 2.513031244277954 }, { "auxiliary_loss_clip": 0.01181296, "auxiliary_loss_mlp": 0.01039496, "balance_loss_clip": 1.05780029, "balance_loss_mlp": 1.02874887, "epoch": 0.18697769494378644, "flos": 34204123589760.0, "grad_norm": 2.08819388970875, "language_loss": 0.74445719, "learning_rate": 3.7471667357445348e-06, "loss": 0.76666504, "num_input_tokens_seen": 33012470, "step": 1555, "time_per_iteration": 2.6156959533691406 }, { "auxiliary_loss_clip": 0.01147034, "auxiliary_loss_mlp": 0.0102712, "balance_loss_clip": 1.05498934, "balance_loss_mlp": 1.01796448, "epoch": 0.18709793783442555, "flos": 34241327101440.0, "grad_norm": 1.8715351199422372, "language_loss": 0.72317046, "learning_rate": 3.7467874975709e-06, "loss": 0.74491203, "num_input_tokens_seen": 33033275, "step": 1556, "time_per_iteration": 2.774231433868408 }, { "auxiliary_loss_clip": 0.01210674, "auxiliary_loss_mlp": 0.01044213, "balance_loss_clip": 1.06219125, "balance_loss_mlp": 1.03426504, "epoch": 0.18721818072506463, "flos": 40734550529280.0, "grad_norm": 1.9946060261181777, "language_loss": 0.77806765, "learning_rate": 3.7464079944109904e-06, "loss": 0.8006165, "num_input_tokens_seen": 33055135, "step": 1557, "time_per_iteration": 2.656710624694824 }, { "auxiliary_loss_clip": 0.01176867, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.05627906, "balance_loss_mlp": 1.02304721, "epoch": 0.18733842361570371, "flos": 22157386392960.0, "grad_norm": 1.8940492322244606, "language_loss": 0.77750742, "learning_rate": 3.746028226322376e-06, "loss": 0.79959977, "num_input_tokens_seen": 33071015, "step": 1558, "time_per_iteration": 2.5436840057373047 }, { "auxiliary_loss_clip": 0.01186655, "auxiliary_loss_mlp": 0.01032227, "balance_loss_clip": 1.05809438, "balance_loss_mlp": 1.02285719, "epoch": 0.18745866650634282, "flos": 18914940656640.0, "grad_norm": 1.691101638614279, "language_loss": 0.75481844, "learning_rate": 3.745648193362669e-06, "loss": 0.77700734, "num_input_tokens_seen": 33090370, "step": 1559, "time_per_iteration": 2.5269317626953125 }, { "auxiliary_loss_clip": 0.01191393, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.05902386, "balance_loss_mlp": 1.02544856, "epoch": 0.1875789093969819, "flos": 19314626267520.0, "grad_norm": 1.990582355898355, "language_loss": 0.72553068, "learning_rate": 3.745267895589518e-06, "loss": 0.74779111, "num_input_tokens_seen": 33108910, "step": 1560, "time_per_iteration": 2.5093231201171875 }, { "auxiliary_loss_clip": 0.0119095, "auxiliary_loss_mlp": 0.01035706, "balance_loss_clip": 1.06006432, "balance_loss_mlp": 1.02597272, "epoch": 0.187699152287621, "flos": 17018965169280.0, "grad_norm": 1.9053093804656958, "language_loss": 0.82285178, "learning_rate": 3.7448873330606154e-06, "loss": 0.84511834, "num_input_tokens_seen": 33126680, "step": 1561, "time_per_iteration": 2.5056815147399902 }, { "auxiliary_loss_clip": 0.01169884, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.05939007, "balance_loss_mlp": 1.02529597, "epoch": 0.18781939517826007, "flos": 22346384780160.0, "grad_norm": 2.055608567219313, "language_loss": 0.87481058, "learning_rate": 3.7445065058336914e-06, "loss": 0.89686364, "num_input_tokens_seen": 33145550, "step": 1562, "time_per_iteration": 2.571474313735962 }, { "auxiliary_loss_clip": 0.01146731, "auxiliary_loss_mlp": 0.01030448, "balance_loss_clip": 1.05012631, "balance_loss_mlp": 1.02114391, "epoch": 0.18793963806889918, "flos": 14611478054400.0, "grad_norm": 1.7780072356699181, "language_loss": 0.86268795, "learning_rate": 3.7441254139665176e-06, "loss": 0.88445973, "num_input_tokens_seen": 33161735, "step": 1563, "time_per_iteration": 2.5689122676849365 }, { "auxiliary_loss_clip": 0.01220112, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.06428432, "balance_loss_mlp": 1.02958822, "epoch": 0.18805988095953827, "flos": 17457075354240.0, "grad_norm": 1.889981719556016, "language_loss": 0.82513249, "learning_rate": 3.743744057516905e-06, "loss": 0.84771991, "num_input_tokens_seen": 33179795, "step": 1564, "time_per_iteration": 2.4595906734466553 }, { "auxiliary_loss_clip": 0.01160012, "auxiliary_loss_mlp": 0.01040223, "balance_loss_clip": 1.05430913, "balance_loss_mlp": 1.02976871, "epoch": 0.18818012385017735, "flos": 15043877976960.0, "grad_norm": 2.6625675410773963, "language_loss": 0.87648696, "learning_rate": 3.743362436542706e-06, "loss": 0.89848924, "num_input_tokens_seen": 33194485, "step": 1565, "time_per_iteration": 2.594287633895874 }, { "auxiliary_loss_clip": 0.01216345, "auxiliary_loss_mlp": 0.01031719, "balance_loss_clip": 1.06055367, "balance_loss_mlp": 1.02247405, "epoch": 0.18830036674081646, "flos": 47551975136640.0, "grad_norm": 1.816273244239361, "language_loss": 0.76638985, "learning_rate": 3.7429805511018115e-06, "loss": 0.78887045, "num_input_tokens_seen": 33216145, "step": 1566, "time_per_iteration": 2.6811671257019043 }, { "auxiliary_loss_clip": 0.01170264, "auxiliary_loss_mlp": 0.00765555, "balance_loss_clip": 1.05768275, "balance_loss_mlp": 1.00140524, "epoch": 0.18842060963145554, "flos": 30044626698240.0, "grad_norm": 1.7407027727474496, "language_loss": 0.77963156, "learning_rate": 3.7425984012521524e-06, "loss": 0.79898977, "num_input_tokens_seen": 33236345, "step": 1567, "time_per_iteration": 3.412440061569214 }, { "auxiliary_loss_clip": 0.01082143, "auxiliary_loss_mlp": 0.00755266, "balance_loss_clip": 1.02781129, "balance_loss_mlp": 0.99990445, "epoch": 0.18854085252209463, "flos": 70318372625280.0, "grad_norm": 0.7348696941141475, "language_loss": 0.60379803, "learning_rate": 3.7422159870517025e-06, "loss": 0.62217212, "num_input_tokens_seen": 33301600, "step": 1568, "time_per_iteration": 3.1371662616729736 }, { "auxiliary_loss_clip": 0.01185823, "auxiliary_loss_mlp": 0.01033086, "balance_loss_clip": 1.05699897, "balance_loss_mlp": 1.02343059, "epoch": 0.1886610954127337, "flos": 21289318410240.0, "grad_norm": 1.5667916484034317, "language_loss": 0.7874403, "learning_rate": 3.7418333085584717e-06, "loss": 0.80962938, "num_input_tokens_seen": 33322785, "step": 1569, "time_per_iteration": 2.583754539489746 }, { "auxiliary_loss_clip": 0.01176572, "auxiliary_loss_mlp": 0.01034576, "balance_loss_clip": 1.05920696, "balance_loss_mlp": 1.02469969, "epoch": 0.18878133830337282, "flos": 17266819991040.0, "grad_norm": 2.183867635388492, "language_loss": 0.90675151, "learning_rate": 3.7414503658305128e-06, "loss": 0.92886305, "num_input_tokens_seen": 33340020, "step": 1570, "time_per_iteration": 2.550875186920166 }, { "auxiliary_loss_clip": 0.01162559, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.05030751, "balance_loss_mlp": 1.02482903, "epoch": 0.1889015811940119, "flos": 25775207210880.0, "grad_norm": 2.513363555119363, "language_loss": 0.77377975, "learning_rate": 3.7410671589259185e-06, "loss": 0.7957477, "num_input_tokens_seen": 33358620, "step": 1571, "time_per_iteration": 2.6687686443328857 }, { "auxiliary_loss_clip": 0.01221091, "auxiliary_loss_mlp": 0.01035802, "balance_loss_clip": 1.0634793, "balance_loss_mlp": 1.02565145, "epoch": 0.18902182408465099, "flos": 21032197879680.0, "grad_norm": 2.0974020198640164, "language_loss": 0.79677409, "learning_rate": 3.7406836879028205e-06, "loss": 0.81934303, "num_input_tokens_seen": 33378845, "step": 1572, "time_per_iteration": 3.241908073425293 }, { "auxiliary_loss_clip": 0.01204745, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.06179833, "balance_loss_mlp": 1.02382195, "epoch": 0.1891420669752901, "flos": 22272121411200.0, "grad_norm": 2.877257024225958, "language_loss": 0.76615435, "learning_rate": 3.7402999528193907e-06, "loss": 0.78853822, "num_input_tokens_seen": 33398345, "step": 1573, "time_per_iteration": 2.522653818130493 }, { "auxiliary_loss_clip": 0.01159945, "auxiliary_loss_mlp": 0.00765082, "balance_loss_clip": 1.05522072, "balance_loss_mlp": 1.00125611, "epoch": 0.18926230986592918, "flos": 22017802141440.0, "grad_norm": 2.5088768662122134, "language_loss": 0.85467756, "learning_rate": 3.739915953733842e-06, "loss": 0.87392783, "num_input_tokens_seen": 33416390, "step": 1574, "time_per_iteration": 4.194178819656372 }, { "auxiliary_loss_clip": 0.01218979, "auxiliary_loss_mlp": 0.01032959, "balance_loss_clip": 1.06255531, "balance_loss_mlp": 1.0234108, "epoch": 0.18938255275656826, "flos": 24462672336000.0, "grad_norm": 1.5902777594996012, "language_loss": 0.82046533, "learning_rate": 3.7395316907044264e-06, "loss": 0.84298474, "num_input_tokens_seen": 33437175, "step": 1575, "time_per_iteration": 2.5510268211364746 }, { "auxiliary_loss_clip": 0.01203771, "auxiliary_loss_mlp": 0.01036116, "balance_loss_clip": 1.05937088, "balance_loss_mlp": 1.0265733, "epoch": 0.18950279564720737, "flos": 24427049022720.0, "grad_norm": 1.5638724528012768, "language_loss": 0.79413855, "learning_rate": 3.7391471637894364e-06, "loss": 0.81653738, "num_input_tokens_seen": 33459440, "step": 1576, "time_per_iteration": 2.5726404190063477 }, { "auxiliary_loss_clip": 0.01177521, "auxiliary_loss_mlp": 0.01036276, "balance_loss_clip": 1.05484235, "balance_loss_mlp": 1.02683473, "epoch": 0.18962303853784646, "flos": 19756291898880.0, "grad_norm": 1.8238264378690268, "language_loss": 0.85006297, "learning_rate": 3.738762373047205e-06, "loss": 0.87220097, "num_input_tokens_seen": 33479360, "step": 1577, "time_per_iteration": 2.580717086791992 }, { "auxiliary_loss_clip": 0.01176726, "auxiliary_loss_mlp": 0.01036154, "balance_loss_clip": 1.05898416, "balance_loss_mlp": 1.02662933, "epoch": 0.18974328142848554, "flos": 21032054225280.0, "grad_norm": 1.6618809774010064, "language_loss": 0.83225596, "learning_rate": 3.738377318536103e-06, "loss": 0.85438478, "num_input_tokens_seen": 33499245, "step": 1578, "time_per_iteration": 2.584155559539795 }, { "auxiliary_loss_clip": 0.01216222, "auxiliary_loss_mlp": 0.01035089, "balance_loss_clip": 1.06367838, "balance_loss_mlp": 1.02635121, "epoch": 0.18986352431912462, "flos": 12966122736000.0, "grad_norm": 1.9758848388073804, "language_loss": 0.70866024, "learning_rate": 3.7379920003145447e-06, "loss": 0.73117328, "num_input_tokens_seen": 33513520, "step": 1579, "time_per_iteration": 2.4178686141967773 }, { "auxiliary_loss_clip": 0.01182925, "auxiliary_loss_mlp": 0.01037389, "balance_loss_clip": 1.05954337, "balance_loss_mlp": 1.02701235, "epoch": 0.18998376720976373, "flos": 23767908497280.0, "grad_norm": 2.258022130184419, "language_loss": 0.837309, "learning_rate": 3.7376064184409817e-06, "loss": 0.85951215, "num_input_tokens_seen": 33533100, "step": 1580, "time_per_iteration": 2.534109592437744 }, { "auxiliary_loss_clip": 0.01188387, "auxiliary_loss_mlp": 0.01033981, "balance_loss_clip": 1.05996442, "balance_loss_mlp": 1.02377629, "epoch": 0.19010401010040281, "flos": 22966023323520.0, "grad_norm": 1.4164268897440808, "language_loss": 0.87074792, "learning_rate": 3.7372205729739063e-06, "loss": 0.89297158, "num_input_tokens_seen": 33554915, "step": 1581, "time_per_iteration": 2.535048723220825 }, { "auxiliary_loss_clip": 0.01206932, "auxiliary_loss_mlp": 0.01030462, "balance_loss_clip": 1.06021559, "balance_loss_mlp": 1.02000725, "epoch": 0.1902242529910419, "flos": 19135647774720.0, "grad_norm": 2.822840903334306, "language_loss": 0.72001266, "learning_rate": 3.7368344639718514e-06, "loss": 0.74238664, "num_input_tokens_seen": 33572850, "step": 1582, "time_per_iteration": 2.4754326343536377 }, { "auxiliary_loss_clip": 0.01205246, "auxiliary_loss_mlp": 0.01041084, "balance_loss_clip": 1.06047106, "balance_loss_mlp": 1.03226805, "epoch": 0.190344495881681, "flos": 25483935824640.0, "grad_norm": 1.6753643999543122, "language_loss": 0.80642939, "learning_rate": 3.7364480914933895e-06, "loss": 0.82889271, "num_input_tokens_seen": 33593090, "step": 1583, "time_per_iteration": 2.531172513961792 }, { "auxiliary_loss_clip": 0.01156561, "auxiliary_loss_mlp": 0.00765206, "balance_loss_clip": 1.05529881, "balance_loss_mlp": 1.00116158, "epoch": 0.1904647387723201, "flos": 26792843425920.0, "grad_norm": 1.7764484770584368, "language_loss": 0.80815411, "learning_rate": 3.7360614555971325e-06, "loss": 0.82737184, "num_input_tokens_seen": 33612745, "step": 1584, "time_per_iteration": 2.633847236633301 }, { "auxiliary_loss_clip": 0.01202965, "auxiliary_loss_mlp": 0.00764723, "balance_loss_clip": 1.06017041, "balance_loss_mlp": 1.00108933, "epoch": 0.19058498166295917, "flos": 23987753688960.0, "grad_norm": 1.9150613475059173, "language_loss": 0.84991652, "learning_rate": 3.735674556341733e-06, "loss": 0.86959338, "num_input_tokens_seen": 33632360, "step": 1585, "time_per_iteration": 2.5255250930786133 }, { "auxiliary_loss_clip": 0.01188029, "auxiliary_loss_mlp": 0.01038093, "balance_loss_clip": 1.06202781, "balance_loss_mlp": 1.02861571, "epoch": 0.19070522455359826, "flos": 28293299280000.0, "grad_norm": 2.195488658542415, "language_loss": 0.82918012, "learning_rate": 3.7352873937858835e-06, "loss": 0.85144138, "num_input_tokens_seen": 33653895, "step": 1586, "time_per_iteration": 2.5863850116729736 }, { "auxiliary_loss_clip": 0.01168593, "auxiliary_loss_mlp": 0.00765267, "balance_loss_clip": 1.05731654, "balance_loss_mlp": 1.00098729, "epoch": 0.19082546744423737, "flos": 25660220797440.0, "grad_norm": 1.8382884467899543, "language_loss": 0.71705234, "learning_rate": 3.734899967988316e-06, "loss": 0.73639095, "num_input_tokens_seen": 33672075, "step": 1587, "time_per_iteration": 2.5889408588409424 }, { "auxiliary_loss_clip": 0.0116527, "auxiliary_loss_mlp": 0.01031333, "balance_loss_clip": 1.05372965, "balance_loss_mlp": 1.02186811, "epoch": 0.19094571033487645, "flos": 19719483436800.0, "grad_norm": 1.7617970397877933, "language_loss": 0.84209192, "learning_rate": 3.7345122790078026e-06, "loss": 0.86405796, "num_input_tokens_seen": 33689640, "step": 1588, "time_per_iteration": 2.548868417739868 }, { "auxiliary_loss_clip": 0.0120359, "auxiliary_loss_mlp": 0.01034373, "balance_loss_clip": 1.06087148, "balance_loss_mlp": 1.0238055, "epoch": 0.19106595322551553, "flos": 21616320850560.0, "grad_norm": 2.9013520651165643, "language_loss": 0.93263906, "learning_rate": 3.7341243269031556e-06, "loss": 0.95501864, "num_input_tokens_seen": 33708630, "step": 1589, "time_per_iteration": 2.487123966217041 }, { "auxiliary_loss_clip": 0.01179851, "auxiliary_loss_mlp": 0.01033735, "balance_loss_clip": 1.05791235, "balance_loss_mlp": 1.02462196, "epoch": 0.19118619611615464, "flos": 29896890059520.0, "grad_norm": 1.742265582404153, "language_loss": 0.77464372, "learning_rate": 3.7337361117332275e-06, "loss": 0.79677957, "num_input_tokens_seen": 33730370, "step": 1590, "time_per_iteration": 2.59507155418396 }, { "auxiliary_loss_clip": 0.01172771, "auxiliary_loss_mlp": 0.01029884, "balance_loss_clip": 1.05422902, "balance_loss_mlp": 1.02107489, "epoch": 0.19130643900679373, "flos": 17273428093440.0, "grad_norm": 1.987391806197037, "language_loss": 0.77362764, "learning_rate": 3.7333476335569087e-06, "loss": 0.79565418, "num_input_tokens_seen": 33748370, "step": 1591, "time_per_iteration": 2.5393762588500977 }, { "auxiliary_loss_clip": 0.01188525, "auxiliary_loss_mlp": 0.01035647, "balance_loss_clip": 1.05983901, "balance_loss_mlp": 1.02528167, "epoch": 0.1914266818974328, "flos": 24826339584000.0, "grad_norm": 2.221837379579082, "language_loss": 0.67002207, "learning_rate": 3.7329588924331325e-06, "loss": 0.69226372, "num_input_tokens_seen": 33769575, "step": 1592, "time_per_iteration": 2.57541561126709 }, { "auxiliary_loss_clip": 0.01164584, "auxiliary_loss_mlp": 0.01035212, "balance_loss_clip": 1.05271208, "balance_loss_mlp": 1.02580631, "epoch": 0.1915469247880719, "flos": 18952467390720.0, "grad_norm": 1.6975999779184505, "language_loss": 0.82252264, "learning_rate": 3.732569888420871e-06, "loss": 0.84452057, "num_input_tokens_seen": 33789110, "step": 1593, "time_per_iteration": 3.3779399394989014 }, { "auxiliary_loss_clip": 0.01219186, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.06030118, "balance_loss_mlp": 1.02360141, "epoch": 0.191667167678711, "flos": 21032952065280.0, "grad_norm": 3.3382731143799758, "language_loss": 0.82523894, "learning_rate": 3.732180621579134e-06, "loss": 0.84777123, "num_input_tokens_seen": 33808325, "step": 1594, "time_per_iteration": 2.4570164680480957 }, { "auxiliary_loss_clip": 0.01184581, "auxiliary_loss_mlp": 0.01035344, "balance_loss_clip": 1.05961871, "balance_loss_mlp": 1.02532482, "epoch": 0.1917874105693501, "flos": 34237663914240.0, "grad_norm": 4.589277036258129, "language_loss": 0.81210053, "learning_rate": 3.7317910919669745e-06, "loss": 0.8342998, "num_input_tokens_seen": 33829520, "step": 1595, "time_per_iteration": 2.6728694438934326 }, { "auxiliary_loss_clip": 0.01202828, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.06096947, "balance_loss_mlp": 1.03000343, "epoch": 0.19190765345998917, "flos": 23550613171200.0, "grad_norm": 2.118972675721372, "language_loss": 0.76637793, "learning_rate": 3.7314012996434826e-06, "loss": 0.78880751, "num_input_tokens_seen": 33848250, "step": 1596, "time_per_iteration": 2.498216390609741 }, { "auxiliary_loss_clip": 0.01190154, "auxiliary_loss_mlp": 0.01030894, "balance_loss_clip": 1.05953336, "balance_loss_mlp": 1.02100563, "epoch": 0.19202789635062828, "flos": 19861330245120.0, "grad_norm": 1.9762185883592378, "language_loss": 0.81549913, "learning_rate": 3.7310112446677907e-06, "loss": 0.83770967, "num_input_tokens_seen": 33866160, "step": 1597, "time_per_iteration": 2.504812479019165 }, { "auxiliary_loss_clip": 0.01222459, "auxiliary_loss_mlp": 0.01030574, "balance_loss_clip": 1.06431508, "balance_loss_mlp": 1.020805, "epoch": 0.19214813924126736, "flos": 20922957642240.0, "grad_norm": 3.6050880450292677, "language_loss": 0.69361144, "learning_rate": 3.7306209270990695e-06, "loss": 0.71614176, "num_input_tokens_seen": 33884165, "step": 1598, "time_per_iteration": 2.488412857055664 }, { "auxiliary_loss_clip": 0.01188485, "auxiliary_loss_mlp": 0.01040707, "balance_loss_clip": 1.05908775, "balance_loss_mlp": 1.03145051, "epoch": 0.19226838213190645, "flos": 26359725231360.0, "grad_norm": 1.9539046762315475, "language_loss": 0.86761081, "learning_rate": 3.7302303469965292e-06, "loss": 0.88990283, "num_input_tokens_seen": 33903705, "step": 1599, "time_per_iteration": 3.3601696491241455 }, { "auxiliary_loss_clip": 0.0120324, "auxiliary_loss_mlp": 0.01042113, "balance_loss_clip": 1.06108558, "balance_loss_mlp": 1.0325942, "epoch": 0.19238862502254553, "flos": 20850525866880.0, "grad_norm": 5.035027889308814, "language_loss": 0.709975, "learning_rate": 3.7298395044194206e-06, "loss": 0.73242855, "num_input_tokens_seen": 33922515, "step": 1600, "time_per_iteration": 3.328348159790039 }, { "auxiliary_loss_clip": 0.01222932, "auxiliary_loss_mlp": 0.01034773, "balance_loss_clip": 1.0658021, "balance_loss_mlp": 1.0252068, "epoch": 0.19250886791318464, "flos": 21726063878400.0, "grad_norm": 2.5307715227204817, "language_loss": 0.94272757, "learning_rate": 3.7294483994270356e-06, "loss": 0.96530461, "num_input_tokens_seen": 33940840, "step": 1601, "time_per_iteration": 3.2153122425079346 }, { "auxiliary_loss_clip": 0.01146556, "auxiliary_loss_mlp": 0.0103168, "balance_loss_clip": 1.05223823, "balance_loss_mlp": 1.02334726, "epoch": 0.19262911080382372, "flos": 23367827836800.0, "grad_norm": 2.1602791193047346, "language_loss": 0.78748274, "learning_rate": 3.7290570320787033e-06, "loss": 0.80926508, "num_input_tokens_seen": 33960420, "step": 1602, "time_per_iteration": 2.5850577354431152 }, { "auxiliary_loss_clip": 0.01203137, "auxiliary_loss_mlp": 0.01031909, "balance_loss_clip": 1.06198585, "balance_loss_mlp": 1.02238417, "epoch": 0.1927493536944628, "flos": 21943502858880.0, "grad_norm": 2.260678450619943, "language_loss": 0.71356058, "learning_rate": 3.728665402433793e-06, "loss": 0.73591101, "num_input_tokens_seen": 33978990, "step": 1603, "time_per_iteration": 2.5086870193481445 }, { "auxiliary_loss_clip": 0.01192163, "auxiliary_loss_mlp": 0.01034638, "balance_loss_clip": 1.06267428, "balance_loss_mlp": 1.02552414, "epoch": 0.19286959658510192, "flos": 16545590807040.0, "grad_norm": 2.3352052621092008, "language_loss": 0.86060452, "learning_rate": 3.7282735105517164e-06, "loss": 0.88287258, "num_input_tokens_seen": 33997115, "step": 1604, "time_per_iteration": 2.497164249420166 }, { "auxiliary_loss_clip": 0.01165361, "auxiliary_loss_mlp": 0.01036685, "balance_loss_clip": 1.05348873, "balance_loss_mlp": 1.02702296, "epoch": 0.192989839475741, "flos": 21616967295360.0, "grad_norm": 1.9861077245212593, "language_loss": 0.67265725, "learning_rate": 3.727881356491922e-06, "loss": 0.69467771, "num_input_tokens_seen": 34015525, "step": 1605, "time_per_iteration": 2.606354236602783 }, { "auxiliary_loss_clip": 0.01219973, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.06457198, "balance_loss_mlp": 1.02789354, "epoch": 0.19311008236638008, "flos": 19281516906240.0, "grad_norm": 1.8079207738179592, "language_loss": 0.75772434, "learning_rate": 3.7274889403139002e-06, "loss": 0.78028947, "num_input_tokens_seen": 34033150, "step": 1606, "time_per_iteration": 2.4447648525238037 }, { "auxiliary_loss_clip": 0.01157561, "auxiliary_loss_mlp": 0.01032947, "balance_loss_clip": 1.05782604, "balance_loss_mlp": 1.0236131, "epoch": 0.1932303252570192, "flos": 28652369587200.0, "grad_norm": 2.4854466326576645, "language_loss": 0.78287339, "learning_rate": 3.727096262077179e-06, "loss": 0.80477846, "num_input_tokens_seen": 34052145, "step": 1607, "time_per_iteration": 2.6555755138397217 }, { "auxiliary_loss_clip": 0.01205283, "auxiliary_loss_mlp": 0.01031442, "balance_loss_clip": 1.06171179, "balance_loss_mlp": 1.02213192, "epoch": 0.19335056814765827, "flos": 18368990864640.0, "grad_norm": 1.712899263160362, "language_loss": 0.85260642, "learning_rate": 3.7267033218413285e-06, "loss": 0.87497365, "num_input_tokens_seen": 34069940, "step": 1608, "time_per_iteration": 2.485337734222412 }, { "auxiliary_loss_clip": 0.01144435, "auxiliary_loss_mlp": 0.01037978, "balance_loss_clip": 1.04949069, "balance_loss_mlp": 1.02715993, "epoch": 0.19347081103829736, "flos": 13260877741440.0, "grad_norm": 2.3274551957348333, "language_loss": 0.81559098, "learning_rate": 3.726310119665957e-06, "loss": 0.83741516, "num_input_tokens_seen": 34086275, "step": 1609, "time_per_iteration": 2.5958425998687744 }, { "auxiliary_loss_clip": 0.01203492, "auxiliary_loss_mlp": 0.01032228, "balance_loss_clip": 1.05945301, "balance_loss_mlp": 1.02264357, "epoch": 0.19359105392893644, "flos": 20300122788480.0, "grad_norm": 6.797871910514251, "language_loss": 0.8529979, "learning_rate": 3.725916655610713e-06, "loss": 0.87535512, "num_input_tokens_seen": 34105605, "step": 1610, "time_per_iteration": 2.50962233543396 }, { "auxiliary_loss_clip": 0.01179336, "auxiliary_loss_mlp": 0.0103085, "balance_loss_clip": 1.05387163, "balance_loss_mlp": 1.02072346, "epoch": 0.19371129681957555, "flos": 20484596062080.0, "grad_norm": 2.271893400566545, "language_loss": 0.756423, "learning_rate": 3.725522929735284e-06, "loss": 0.77852488, "num_input_tokens_seen": 34122540, "step": 1611, "time_per_iteration": 2.4995856285095215 }, { "auxiliary_loss_clip": 0.01195033, "auxiliary_loss_mlp": 0.0103125, "balance_loss_clip": 1.05754519, "balance_loss_mlp": 1.02136147, "epoch": 0.19383153971021463, "flos": 30445497457920.0, "grad_norm": 3.9646823815996592, "language_loss": 0.74336874, "learning_rate": 3.725128942099399e-06, "loss": 0.76563156, "num_input_tokens_seen": 34142940, "step": 1612, "time_per_iteration": 2.5952932834625244 }, { "auxiliary_loss_clip": 0.01176847, "auxiliary_loss_mlp": 0.01033384, "balance_loss_clip": 1.0547688, "balance_loss_mlp": 1.02373433, "epoch": 0.19395178260085372, "flos": 24569937325440.0, "grad_norm": 3.5108690578685433, "language_loss": 0.79512978, "learning_rate": 3.7247346927628245e-06, "loss": 0.81723213, "num_input_tokens_seen": 34162875, "step": 1613, "time_per_iteration": 2.5632119178771973 }, { "auxiliary_loss_clip": 0.01183374, "auxiliary_loss_mlp": 0.00765144, "balance_loss_clip": 1.05575013, "balance_loss_mlp": 1.00093484, "epoch": 0.19407202549149283, "flos": 28950608211840.0, "grad_norm": 1.8605008041969282, "language_loss": 0.79384148, "learning_rate": 3.7243401817853694e-06, "loss": 0.81332666, "num_input_tokens_seen": 34183565, "step": 1614, "time_per_iteration": 2.5685200691223145 }, { "auxiliary_loss_clip": 0.01195809, "auxiliary_loss_mlp": 0.01033267, "balance_loss_clip": 1.05703545, "balance_loss_mlp": 1.02399826, "epoch": 0.1941922683821319, "flos": 18004497603840.0, "grad_norm": 1.9572675795824275, "language_loss": 0.71726632, "learning_rate": 3.723945409226879e-06, "loss": 0.73955715, "num_input_tokens_seen": 34202055, "step": 1615, "time_per_iteration": 2.445333242416382 }, { "auxiliary_loss_clip": 0.01202986, "auxiliary_loss_mlp": 0.01042162, "balance_loss_clip": 1.05983257, "balance_loss_mlp": 1.03220236, "epoch": 0.194312511272771, "flos": 9720337034880.0, "grad_norm": 2.569418678549781, "language_loss": 0.80127764, "learning_rate": 3.723550375147241e-06, "loss": 0.82372916, "num_input_tokens_seen": 34216830, "step": 1616, "time_per_iteration": 2.473280668258667 }, { "auxiliary_loss_clip": 0.01159501, "auxiliary_loss_mlp": 0.0103576, "balance_loss_clip": 1.05108607, "balance_loss_mlp": 1.02555561, "epoch": 0.19443275416341008, "flos": 27016208150400.0, "grad_norm": 1.6555038259686654, "language_loss": 0.79878938, "learning_rate": 3.7231550796063816e-06, "loss": 0.82074201, "num_input_tokens_seen": 34236840, "step": 1617, "time_per_iteration": 2.594310760498047 }, { "auxiliary_loss_clip": 0.01195926, "auxiliary_loss_mlp": 0.01038931, "balance_loss_clip": 1.06091058, "balance_loss_mlp": 1.02860737, "epoch": 0.1945529970540492, "flos": 15846625077120.0, "grad_norm": 2.2155233497930267, "language_loss": 0.64881492, "learning_rate": 3.722759522664266e-06, "loss": 0.6711635, "num_input_tokens_seen": 34254140, "step": 1618, "time_per_iteration": 2.5026543140411377 }, { "auxiliary_loss_clip": 0.01159288, "auxiliary_loss_mlp": 0.01029697, "balance_loss_clip": 1.05464172, "balance_loss_mlp": 1.01980817, "epoch": 0.19467323994468827, "flos": 19314985403520.0, "grad_norm": 2.094113840125367, "language_loss": 0.81364161, "learning_rate": 3.7223637043809016e-06, "loss": 0.83553147, "num_input_tokens_seen": 34273120, "step": 1619, "time_per_iteration": 3.3568055629730225 }, { "auxiliary_loss_clip": 0.01177042, "auxiliary_loss_mlp": 0.01039793, "balance_loss_clip": 1.05788827, "balance_loss_mlp": 1.03059053, "epoch": 0.19479348283532735, "flos": 24133227770880.0, "grad_norm": 1.7939700519857602, "language_loss": 0.86837673, "learning_rate": 3.7219676248163322e-06, "loss": 0.89054513, "num_input_tokens_seen": 34290285, "step": 1620, "time_per_iteration": 2.550503730773926 }, { "auxiliary_loss_clip": 0.01209746, "auxiliary_loss_mlp": 0.01033836, "balance_loss_clip": 1.06290007, "balance_loss_mlp": 1.02362609, "epoch": 0.19491372572596646, "flos": 25775638174080.0, "grad_norm": 1.9266301918596251, "language_loss": 0.93155599, "learning_rate": 3.721571284030643e-06, "loss": 0.95399183, "num_input_tokens_seen": 34310095, "step": 1621, "time_per_iteration": 2.527374744415283 }, { "auxiliary_loss_clip": 0.01207716, "auxiliary_loss_mlp": 0.01026182, "balance_loss_clip": 1.06077635, "balance_loss_mlp": 1.01655555, "epoch": 0.19503396861660555, "flos": 19645220067840.0, "grad_norm": 2.611281306459111, "language_loss": 0.79294729, "learning_rate": 3.7211746820839587e-06, "loss": 0.81528628, "num_input_tokens_seen": 34327190, "step": 1622, "time_per_iteration": 2.482006788253784 }, { "auxiliary_loss_clip": 0.01109937, "auxiliary_loss_mlp": 0.01030952, "balance_loss_clip": 1.04737592, "balance_loss_mlp": 1.02103972, "epoch": 0.19515421150724463, "flos": 21033023892480.0, "grad_norm": 1.6229701700855272, "language_loss": 0.80526686, "learning_rate": 3.7207778190364437e-06, "loss": 0.82667577, "num_input_tokens_seen": 34345615, "step": 1623, "time_per_iteration": 2.632573366165161 }, { "auxiliary_loss_clip": 0.01130734, "auxiliary_loss_mlp": 0.01033525, "balance_loss_clip": 1.04932904, "balance_loss_mlp": 1.02382112, "epoch": 0.1952744543978837, "flos": 32961255143040.0, "grad_norm": 1.6046977402444162, "language_loss": 0.73752093, "learning_rate": 3.720380694948302e-06, "loss": 0.7591635, "num_input_tokens_seen": 34368500, "step": 1624, "time_per_iteration": 2.6935505867004395 }, { "auxiliary_loss_clip": 0.01083639, "auxiliary_loss_mlp": 0.01007573, "balance_loss_clip": 1.03001809, "balance_loss_mlp": 1.00430667, "epoch": 0.19539469728852282, "flos": 64044312030720.0, "grad_norm": 1.0340483389483592, "language_loss": 0.71240377, "learning_rate": 3.719983309879777e-06, "loss": 0.73331594, "num_input_tokens_seen": 34428280, "step": 1625, "time_per_iteration": 3.9505069255828857 }, { "auxiliary_loss_clip": 0.01164246, "auxiliary_loss_mlp": 0.0104103, "balance_loss_clip": 1.05362272, "balance_loss_mlp": 1.03160608, "epoch": 0.1955149401791619, "flos": 13370908078080.0, "grad_norm": 1.9448949063990069, "language_loss": 0.7744208, "learning_rate": 3.719585663891151e-06, "loss": 0.7964735, "num_input_tokens_seen": 34445815, "step": 1626, "time_per_iteration": 3.36405873298645 }, { "auxiliary_loss_clip": 0.0115292, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.05643702, "balance_loss_mlp": 1.03162766, "epoch": 0.195635183069801, "flos": 18728887184640.0, "grad_norm": 2.104964249852706, "language_loss": 0.78847951, "learning_rate": 3.719187757042747e-06, "loss": 0.81042689, "num_input_tokens_seen": 34463635, "step": 1627, "time_per_iteration": 3.3201677799224854 }, { "auxiliary_loss_clip": 0.01104896, "auxiliary_loss_mlp": 0.01003333, "balance_loss_clip": 1.0356195, "balance_loss_mlp": 1.00038815, "epoch": 0.1957554259604401, "flos": 69313952615040.0, "grad_norm": 0.7295797716334137, "language_loss": 0.54937702, "learning_rate": 3.7187895893949275e-06, "loss": 0.57045931, "num_input_tokens_seen": 34530105, "step": 1628, "time_per_iteration": 3.1895103454589844 }, { "auxiliary_loss_clip": 0.01145832, "auxiliary_loss_mlp": 0.01025827, "balance_loss_clip": 1.05101264, "balance_loss_mlp": 1.01543212, "epoch": 0.19587566885107918, "flos": 21069257736960.0, "grad_norm": 2.2061824573171993, "language_loss": 0.76364571, "learning_rate": 3.7183911610080937e-06, "loss": 0.78536224, "num_input_tokens_seen": 34546970, "step": 1629, "time_per_iteration": 2.5896711349487305 }, { "auxiliary_loss_clip": 0.01178498, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.05810952, "balance_loss_mlp": 1.03204989, "epoch": 0.19599591174171827, "flos": 22194661731840.0, "grad_norm": 2.9276473469108653, "language_loss": 0.74557209, "learning_rate": 3.7179924719426872e-06, "loss": 0.76778662, "num_input_tokens_seen": 34564865, "step": 1630, "time_per_iteration": 2.5846400260925293 }, { "auxiliary_loss_clip": 0.01208447, "auxiliary_loss_mlp": 0.01040735, "balance_loss_clip": 1.06222546, "balance_loss_mlp": 1.03079903, "epoch": 0.19611615463235738, "flos": 23768375374080.0, "grad_norm": 2.511109352186563, "language_loss": 0.76104778, "learning_rate": 3.7175935222591885e-06, "loss": 0.78353965, "num_input_tokens_seen": 34584165, "step": 1631, "time_per_iteration": 2.536773204803467 }, { "auxiliary_loss_clip": 0.0119398, "auxiliary_loss_mlp": 0.01034552, "balance_loss_clip": 1.06454563, "balance_loss_mlp": 1.02465725, "epoch": 0.19623639752299646, "flos": 28618218731520.0, "grad_norm": 1.8385144209183029, "language_loss": 0.74802679, "learning_rate": 3.717194312018118e-06, "loss": 0.77031207, "num_input_tokens_seen": 34603150, "step": 1632, "time_per_iteration": 2.6672165393829346 }, { "auxiliary_loss_clip": 0.01203381, "auxiliary_loss_mlp": 0.01037981, "balance_loss_clip": 1.0590229, "balance_loss_mlp": 1.02780676, "epoch": 0.19635664041363554, "flos": 21032700670080.0, "grad_norm": 2.1660042650376092, "language_loss": 0.76431143, "learning_rate": 3.716794841280036e-06, "loss": 0.78672504, "num_input_tokens_seen": 34621855, "step": 1633, "time_per_iteration": 2.50923752784729 }, { "auxiliary_loss_clip": 0.0121039, "auxiliary_loss_mlp": 0.01038689, "balance_loss_clip": 1.06045151, "balance_loss_mlp": 1.02867508, "epoch": 0.19647688330427462, "flos": 18879748306560.0, "grad_norm": 2.0475445430633057, "language_loss": 0.77557826, "learning_rate": 3.7163951101055407e-06, "loss": 0.798069, "num_input_tokens_seen": 34639915, "step": 1634, "time_per_iteration": 2.4987168312072754 }, { "auxiliary_loss_clip": 0.01188349, "auxiliary_loss_mlp": 0.01036922, "balance_loss_clip": 1.06042349, "balance_loss_mlp": 1.02673519, "epoch": 0.19659712619491373, "flos": 24242503921920.0, "grad_norm": 1.9019399445160583, "language_loss": 0.78896469, "learning_rate": 3.715995118555273e-06, "loss": 0.81121737, "num_input_tokens_seen": 34659890, "step": 1635, "time_per_iteration": 2.5771420001983643 }, { "auxiliary_loss_clip": 0.01154421, "auxiliary_loss_mlp": 0.01039155, "balance_loss_clip": 1.05317211, "balance_loss_mlp": 1.02812254, "epoch": 0.19671736908555282, "flos": 24717422568960.0, "grad_norm": 2.4529292534742893, "language_loss": 0.85550624, "learning_rate": 3.71559486668991e-06, "loss": 0.877442, "num_input_tokens_seen": 34678750, "step": 1636, "time_per_iteration": 2.63238263130188 }, { "auxiliary_loss_clip": 0.01211982, "auxiliary_loss_mlp": 0.00765143, "balance_loss_clip": 1.06431162, "balance_loss_mlp": 1.00102079, "epoch": 0.1968376119761919, "flos": 23842279607040.0, "grad_norm": 1.6884853463991911, "language_loss": 0.7729736, "learning_rate": 3.715194354570169e-06, "loss": 0.79274487, "num_input_tokens_seen": 34698755, "step": 1637, "time_per_iteration": 2.5613176822662354 }, { "auxiliary_loss_clip": 0.01206988, "auxiliary_loss_mlp": 0.01042607, "balance_loss_clip": 1.06504273, "balance_loss_mlp": 1.03295088, "epoch": 0.196957854866831, "flos": 18113917409280.0, "grad_norm": 1.9691690305393585, "language_loss": 0.83363259, "learning_rate": 3.714793582256809e-06, "loss": 0.85612857, "num_input_tokens_seen": 34715820, "step": 1638, "time_per_iteration": 2.583327054977417 }, { "auxiliary_loss_clip": 0.01218352, "auxiliary_loss_mlp": 0.01036213, "balance_loss_clip": 1.06208348, "balance_loss_mlp": 1.02621758, "epoch": 0.1970780977574701, "flos": 21653129312640.0, "grad_norm": 2.6975099761931673, "language_loss": 0.85075057, "learning_rate": 3.7143925498106253e-06, "loss": 0.87329626, "num_input_tokens_seen": 34734360, "step": 1639, "time_per_iteration": 2.4852707386016846 }, { "auxiliary_loss_clip": 0.01189501, "auxiliary_loss_mlp": 0.01035805, "balance_loss_clip": 1.05576229, "balance_loss_mlp": 1.02508259, "epoch": 0.19719834064810918, "flos": 20811813984000.0, "grad_norm": 1.9507284402425875, "language_loss": 0.79450566, "learning_rate": 3.7139912572924558e-06, "loss": 0.81675875, "num_input_tokens_seen": 34753390, "step": 1640, "time_per_iteration": 2.5805928707122803 }, { "auxiliary_loss_clip": 0.01201007, "auxiliary_loss_mlp": 0.01038118, "balance_loss_clip": 1.05786896, "balance_loss_mlp": 1.02830696, "epoch": 0.19731858353874826, "flos": 23434800744960.0, "grad_norm": 4.183664415689264, "language_loss": 0.79915917, "learning_rate": 3.7135897047631744e-06, "loss": 0.82155037, "num_input_tokens_seen": 34771275, "step": 1641, "time_per_iteration": 2.5684657096862793 }, { "auxiliary_loss_clip": 0.01192026, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.06010377, "balance_loss_mlp": 1.02636778, "epoch": 0.19743882642938737, "flos": 23988184652160.0, "grad_norm": 2.4634363500286907, "language_loss": 0.76119924, "learning_rate": 3.713187892283698e-06, "loss": 0.7834866, "num_input_tokens_seen": 34790885, "step": 1642, "time_per_iteration": 2.5714285373687744 }, { "auxiliary_loss_clip": 0.01158354, "auxiliary_loss_mlp": 0.01039186, "balance_loss_clip": 1.05249333, "balance_loss_mlp": 1.02901709, "epoch": 0.19755906932002645, "flos": 15004340081280.0, "grad_norm": 2.459635447959844, "language_loss": 0.87307298, "learning_rate": 3.71278581991498e-06, "loss": 0.89504838, "num_input_tokens_seen": 34806745, "step": 1643, "time_per_iteration": 2.5991883277893066 }, { "auxiliary_loss_clip": 0.0118069, "auxiliary_loss_mlp": 0.00766174, "balance_loss_clip": 1.06415129, "balance_loss_mlp": 1.00091124, "epoch": 0.19767931221066554, "flos": 19494466686720.0, "grad_norm": 1.7661277831995152, "language_loss": 0.79050279, "learning_rate": 3.712383487718015e-06, "loss": 0.80997145, "num_input_tokens_seen": 34824985, "step": 1644, "time_per_iteration": 2.565838098526001 }, { "auxiliary_loss_clip": 0.01141436, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.05368018, "balance_loss_mlp": 1.02492642, "epoch": 0.19779955510130465, "flos": 25737895958400.0, "grad_norm": 1.9384558475600515, "language_loss": 0.86899418, "learning_rate": 3.7119808957538365e-06, "loss": 0.89075506, "num_input_tokens_seen": 34843980, "step": 1645, "time_per_iteration": 2.6641485691070557 }, { "auxiliary_loss_clip": 0.01185913, "auxiliary_loss_mlp": 0.01035776, "balance_loss_clip": 1.05651164, "balance_loss_mlp": 1.02508879, "epoch": 0.19791979799194373, "flos": 20777699041920.0, "grad_norm": 8.451098249241808, "language_loss": 0.80159289, "learning_rate": 3.711578044083517e-06, "loss": 0.8238098, "num_input_tokens_seen": 34860780, "step": 1646, "time_per_iteration": 3.3448476791381836 }, { "auxiliary_loss_clip": 0.01192336, "auxiliary_loss_mlp": 0.01041315, "balance_loss_clip": 1.06008196, "balance_loss_mlp": 1.03087234, "epoch": 0.1980400408825828, "flos": 25589010084480.0, "grad_norm": 1.9040656801999716, "language_loss": 0.74541759, "learning_rate": 3.7111749327681698e-06, "loss": 0.76775408, "num_input_tokens_seen": 34880815, "step": 1647, "time_per_iteration": 2.5679314136505127 }, { "auxiliary_loss_clip": 0.0121259, "auxiliary_loss_mlp": 0.01030518, "balance_loss_clip": 1.06656599, "balance_loss_mlp": 1.02118969, "epoch": 0.1981602837732219, "flos": 23513840622720.0, "grad_norm": 3.0242349250946368, "language_loss": 0.86099494, "learning_rate": 3.7107715618689455e-06, "loss": 0.88342601, "num_input_tokens_seen": 34899790, "step": 1648, "time_per_iteration": 2.5200653076171875 }, { "auxiliary_loss_clip": 0.01202204, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.0612092, "balance_loss_mlp": 1.02311718, "epoch": 0.198280526663861, "flos": 23185365724800.0, "grad_norm": 1.5420652078259156, "language_loss": 0.83605528, "learning_rate": 3.710367931447035e-06, "loss": 0.85841078, "num_input_tokens_seen": 34921570, "step": 1649, "time_per_iteration": 2.5283823013305664 }, { "auxiliary_loss_clip": 0.01214658, "auxiliary_loss_mlp": 0.01041979, "balance_loss_clip": 1.0640049, "balance_loss_mlp": 1.03121471, "epoch": 0.1984007695545001, "flos": 21689470897920.0, "grad_norm": 2.1688839824402373, "language_loss": 0.86143672, "learning_rate": 3.70996404156367e-06, "loss": 0.88400304, "num_input_tokens_seen": 34941205, "step": 1650, "time_per_iteration": 2.5186758041381836 }, { "auxiliary_loss_clip": 0.01152221, "auxiliary_loss_mlp": 0.01038833, "balance_loss_clip": 1.05473626, "balance_loss_mlp": 1.02929068, "epoch": 0.19852101244513917, "flos": 36064008887040.0, "grad_norm": 1.6877518690527404, "language_loss": 0.7288053, "learning_rate": 3.7095598922801187e-06, "loss": 0.75071585, "num_input_tokens_seen": 34963280, "step": 1651, "time_per_iteration": 2.7213032245635986 }, { "auxiliary_loss_clip": 0.01221892, "auxiliary_loss_mlp": 0.01036918, "balance_loss_clip": 1.06495404, "balance_loss_mlp": 1.02660072, "epoch": 0.19864125533577828, "flos": 23105894883840.0, "grad_norm": 2.998001063498881, "language_loss": 0.7636205, "learning_rate": 3.7091554836576914e-06, "loss": 0.78620857, "num_input_tokens_seen": 34979955, "step": 1652, "time_per_iteration": 3.3028557300567627 }, { "auxiliary_loss_clip": 0.01205783, "auxiliary_loss_mlp": 0.00765017, "balance_loss_clip": 1.0644052, "balance_loss_mlp": 1.00104976, "epoch": 0.19876149822641737, "flos": 24608505553920.0, "grad_norm": 1.7078364518663895, "language_loss": 0.82894939, "learning_rate": 3.708750815757736e-06, "loss": 0.84865743, "num_input_tokens_seen": 35000725, "step": 1653, "time_per_iteration": 4.1630539894104 }, { "auxiliary_loss_clip": 0.01208338, "auxiliary_loss_mlp": 0.01040195, "balance_loss_clip": 1.06394792, "balance_loss_mlp": 1.03007448, "epoch": 0.19888174111705645, "flos": 32196645308160.0, "grad_norm": 2.4355750053133414, "language_loss": 0.73480725, "learning_rate": 3.7083458886416407e-06, "loss": 0.75729263, "num_input_tokens_seen": 35019920, "step": 1654, "time_per_iteration": 2.5967397689819336 }, { "auxiliary_loss_clip": 0.01152681, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.05905902, "balance_loss_mlp": 1.02455306, "epoch": 0.19900198400769553, "flos": 24608469640320.0, "grad_norm": 2.2523856304074594, "language_loss": 0.87917298, "learning_rate": 3.707940702370832e-06, "loss": 0.90104574, "num_input_tokens_seen": 35040765, "step": 1655, "time_per_iteration": 2.7132151126861572 }, { "auxiliary_loss_clip": 0.01111372, "auxiliary_loss_mlp": 0.01003066, "balance_loss_clip": 1.03698659, "balance_loss_mlp": 1.00069416, "epoch": 0.19912222689833464, "flos": 67915805673600.0, "grad_norm": 0.7581153224962618, "language_loss": 0.58303392, "learning_rate": 3.707535257006777e-06, "loss": 0.60417831, "num_input_tokens_seen": 35106390, "step": 1656, "time_per_iteration": 3.175356149673462 }, { "auxiliary_loss_clip": 0.01193514, "auxiliary_loss_mlp": 0.01039223, "balance_loss_clip": 1.06086111, "balance_loss_mlp": 1.02853, "epoch": 0.19924246978897373, "flos": 15742340916480.0, "grad_norm": 2.4797533285055735, "language_loss": 0.88435018, "learning_rate": 3.707129552610981e-06, "loss": 0.90667754, "num_input_tokens_seen": 35125040, "step": 1657, "time_per_iteration": 2.524670362472534 }, { "auxiliary_loss_clip": 0.01187576, "auxiliary_loss_mlp": 0.01031936, "balance_loss_clip": 1.06300759, "balance_loss_mlp": 1.02218461, "epoch": 0.1993627126796128, "flos": 17566566986880.0, "grad_norm": 2.0764022080352387, "language_loss": 0.73549104, "learning_rate": 3.70672358924499e-06, "loss": 0.75768614, "num_input_tokens_seen": 35144280, "step": 1658, "time_per_iteration": 2.53963303565979 }, { "auxiliary_loss_clip": 0.01176565, "auxiliary_loss_mlp": 0.01037206, "balance_loss_clip": 1.06365061, "balance_loss_mlp": 1.0269897, "epoch": 0.19948295557025192, "flos": 40843826680320.0, "grad_norm": 2.197653805299495, "language_loss": 0.78508693, "learning_rate": 3.706317366970386e-06, "loss": 0.80722463, "num_input_tokens_seen": 35165280, "step": 1659, "time_per_iteration": 2.7581794261932373 }, { "auxiliary_loss_clip": 0.01223309, "auxiliary_loss_mlp": 0.00765974, "balance_loss_clip": 1.06246698, "balance_loss_mlp": 1.00098264, "epoch": 0.199603198460891, "flos": 25082418620160.0, "grad_norm": 6.106228511336119, "language_loss": 0.83624744, "learning_rate": 3.705910885848795e-06, "loss": 0.85614032, "num_input_tokens_seen": 35183655, "step": 1660, "time_per_iteration": 2.5417072772979736 }, { "auxiliary_loss_clip": 0.01207101, "auxiliary_loss_mlp": 0.01031039, "balance_loss_clip": 1.06376636, "balance_loss_mlp": 1.02113295, "epoch": 0.19972344135153008, "flos": 20084120352000.0, "grad_norm": 2.050175655351116, "language_loss": 0.8430661, "learning_rate": 3.705504145941879e-06, "loss": 0.86544752, "num_input_tokens_seen": 35201825, "step": 1661, "time_per_iteration": 2.493687868118286 }, { "auxiliary_loss_clip": 0.01220762, "auxiliary_loss_mlp": 0.01031462, "balance_loss_clip": 1.06453264, "balance_loss_mlp": 1.02156162, "epoch": 0.1998436842421692, "flos": 23727472761600.0, "grad_norm": 1.8797041062696531, "language_loss": 0.78811651, "learning_rate": 3.7050971473113403e-06, "loss": 0.81063873, "num_input_tokens_seen": 35221600, "step": 1662, "time_per_iteration": 2.49241304397583 }, { "auxiliary_loss_clip": 0.01201331, "auxiliary_loss_mlp": 0.00764981, "balance_loss_clip": 1.06013894, "balance_loss_mlp": 1.00094926, "epoch": 0.19996392713280828, "flos": 36102361633920.0, "grad_norm": 1.6599697666414834, "language_loss": 0.80064654, "learning_rate": 3.7046898900189196e-06, "loss": 0.82030964, "num_input_tokens_seen": 35245935, "step": 1663, "time_per_iteration": 2.6219301223754883 }, { "auxiliary_loss_clip": 0.01181284, "auxiliary_loss_mlp": 0.01040233, "balance_loss_clip": 1.06198764, "balance_loss_mlp": 1.03007054, "epoch": 0.20008417002344736, "flos": 23657662679040.0, "grad_norm": 1.6288642803221676, "language_loss": 0.82824743, "learning_rate": 3.704282374126398e-06, "loss": 0.85046256, "num_input_tokens_seen": 35265615, "step": 1664, "time_per_iteration": 2.5889265537261963 }, { "auxiliary_loss_clip": 0.0117494, "auxiliary_loss_mlp": 0.01032182, "balance_loss_clip": 1.05887675, "balance_loss_mlp": 1.02223396, "epoch": 0.20020441291408644, "flos": 21872076664320.0, "grad_norm": 1.6582016618941289, "language_loss": 0.87228185, "learning_rate": 3.7038745996955954e-06, "loss": 0.89435303, "num_input_tokens_seen": 35284960, "step": 1665, "time_per_iteration": 2.676917791366577 }, { "auxiliary_loss_clip": 0.01181586, "auxiliary_loss_mlp": 0.01035357, "balance_loss_clip": 1.05892849, "balance_loss_mlp": 1.02605915, "epoch": 0.20032465580472555, "flos": 23179691376000.0, "grad_norm": 8.364605105437795, "language_loss": 0.71882677, "learning_rate": 3.703466566788371e-06, "loss": 0.74099624, "num_input_tokens_seen": 35304090, "step": 1666, "time_per_iteration": 2.6527786254882812 }, { "auxiliary_loss_clip": 0.01185144, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.06267095, "balance_loss_mlp": 1.02418709, "epoch": 0.20044489869536464, "flos": 23873521461120.0, "grad_norm": 1.7802398091804892, "language_loss": 0.74326873, "learning_rate": 3.703058275466622e-06, "loss": 0.76546741, "num_input_tokens_seen": 35323325, "step": 1667, "time_per_iteration": 2.6319797039031982 }, { "auxiliary_loss_clip": 0.01190139, "auxiliary_loss_mlp": 0.01037229, "balance_loss_clip": 1.06007433, "balance_loss_mlp": 1.02771592, "epoch": 0.20056514158600372, "flos": 21945226711680.0, "grad_norm": 1.6500551770118452, "language_loss": 0.77622211, "learning_rate": 3.7026497257922877e-06, "loss": 0.79849571, "num_input_tokens_seen": 35343635, "step": 1668, "time_per_iteration": 2.636244535446167 }, { "auxiliary_loss_clip": 0.01156455, "auxiliary_loss_mlp": 0.01047609, "balance_loss_clip": 1.0545131, "balance_loss_mlp": 1.03779793, "epoch": 0.20068538447664283, "flos": 23879159896320.0, "grad_norm": 1.7287193903547278, "language_loss": 0.8523429, "learning_rate": 3.7022409178273436e-06, "loss": 0.87438351, "num_input_tokens_seen": 35364615, "step": 1669, "time_per_iteration": 2.7075541019439697 }, { "auxiliary_loss_clip": 0.0120001, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 1.06105685, "balance_loss_mlp": 1.02031028, "epoch": 0.2008056273672819, "flos": 18442823270400.0, "grad_norm": 1.744699902509725, "language_loss": 0.78678107, "learning_rate": 3.7018318516338054e-06, "loss": 0.80907547, "num_input_tokens_seen": 35383775, "step": 1670, "time_per_iteration": 2.5497567653656006 }, { "auxiliary_loss_clip": 0.01208669, "auxiliary_loss_mlp": 0.01028947, "balance_loss_clip": 1.062778, "balance_loss_mlp": 1.01976764, "epoch": 0.200925870257921, "flos": 23659530186240.0, "grad_norm": 2.87900845657148, "language_loss": 0.81513917, "learning_rate": 3.7014225272737284e-06, "loss": 0.83751535, "num_input_tokens_seen": 35403000, "step": 1671, "time_per_iteration": 2.586712121963501 }, { "auxiliary_loss_clip": 0.01198212, "auxiliary_loss_mlp": 0.01034641, "balance_loss_clip": 1.06055844, "balance_loss_mlp": 1.02484179, "epoch": 0.20104611314856008, "flos": 16217115909120.0, "grad_norm": 2.2723517604150048, "language_loss": 0.7415908, "learning_rate": 3.701012944809207e-06, "loss": 0.76391935, "num_input_tokens_seen": 35420115, "step": 1672, "time_per_iteration": 3.333911418914795 }, { "auxiliary_loss_clip": 0.01188707, "auxiliary_loss_mlp": 0.00764452, "balance_loss_clip": 1.06163788, "balance_loss_mlp": 1.00105047, "epoch": 0.2011663560391992, "flos": 21397373498880.0, "grad_norm": 2.3451975569181904, "language_loss": 0.79037344, "learning_rate": 3.700603104302374e-06, "loss": 0.80990499, "num_input_tokens_seen": 35439925, "step": 1673, "time_per_iteration": 2.618485689163208 }, { "auxiliary_loss_clip": 0.01070271, "auxiliary_loss_mlp": 0.01003165, "balance_loss_clip": 1.02871227, "balance_loss_mlp": 1.00064969, "epoch": 0.20128659892983827, "flos": 62229459409920.0, "grad_norm": 0.8881138279454771, "language_loss": 0.55909812, "learning_rate": 3.7001930058154027e-06, "loss": 0.57983243, "num_input_tokens_seen": 35504885, "step": 1674, "time_per_iteration": 3.2131967544555664 }, { "auxiliary_loss_clip": 0.01173404, "auxiliary_loss_mlp": 0.0103804, "balance_loss_clip": 1.05735135, "balance_loss_mlp": 1.02799642, "epoch": 0.20140684182047736, "flos": 28438737448320.0, "grad_norm": 2.282931835369254, "language_loss": 0.80218577, "learning_rate": 3.6997826494105037e-06, "loss": 0.82430023, "num_input_tokens_seen": 35525330, "step": 1675, "time_per_iteration": 2.6722702980041504 }, { "auxiliary_loss_clip": 0.01189185, "auxiliary_loss_mlp": 0.01029578, "balance_loss_clip": 1.06025648, "balance_loss_mlp": 1.02014899, "epoch": 0.20152708471111647, "flos": 28074064619520.0, "grad_norm": 2.0277036115078615, "language_loss": 0.69524276, "learning_rate": 3.6993720351499286e-06, "loss": 0.71743041, "num_input_tokens_seen": 35546455, "step": 1676, "time_per_iteration": 2.6323435306549072 }, { "auxiliary_loss_clip": 0.01183767, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.06196809, "balance_loss_mlp": 1.02442551, "epoch": 0.20164732760175555, "flos": 23549751244800.0, "grad_norm": 1.7119437772356263, "language_loss": 0.77048182, "learning_rate": 3.6989611630959666e-06, "loss": 0.79265696, "num_input_tokens_seen": 35565010, "step": 1677, "time_per_iteration": 2.6108388900756836 }, { "auxiliary_loss_clip": 0.01103065, "auxiliary_loss_mlp": 0.01000476, "balance_loss_clip": 1.02454388, "balance_loss_mlp": 0.99802065, "epoch": 0.20176757049239463, "flos": 71100616037760.0, "grad_norm": 0.6932006752150882, "language_loss": 0.58356351, "learning_rate": 3.6985500333109474e-06, "loss": 0.60459894, "num_input_tokens_seen": 35633340, "step": 1678, "time_per_iteration": 3.211085081100464 }, { "auxiliary_loss_clip": 0.01165351, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.0536654, "balance_loss_mlp": 1.0252645, "epoch": 0.20188781338303372, "flos": 21430159637760.0, "grad_norm": 2.292766218170489, "language_loss": 0.7631526, "learning_rate": 3.6981386458572385e-06, "loss": 0.78515017, "num_input_tokens_seen": 35651315, "step": 1679, "time_per_iteration": 5.008856773376465 }, { "auxiliary_loss_clip": 0.01169122, "auxiliary_loss_mlp": 0.0103906, "balance_loss_clip": 1.05624056, "balance_loss_mlp": 1.02888584, "epoch": 0.20200805627367283, "flos": 11546215130880.0, "grad_norm": 2.6081241469431906, "language_loss": 0.7658906, "learning_rate": 3.6977270007972468e-06, "loss": 0.78797245, "num_input_tokens_seen": 35668850, "step": 1680, "time_per_iteration": 3.4652955532073975 }, { "auxiliary_loss_clip": 0.01191109, "auxiliary_loss_mlp": 0.01033774, "balance_loss_clip": 1.06098557, "balance_loss_mlp": 1.02419519, "epoch": 0.2021282991643119, "flos": 28545391906560.0, "grad_norm": 3.541670296225184, "language_loss": 0.7194438, "learning_rate": 3.6973150981934196e-06, "loss": 0.7416926, "num_input_tokens_seen": 35690080, "step": 1681, "time_per_iteration": 2.653881549835205 }, { "auxiliary_loss_clip": 0.01221046, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.06283867, "balance_loss_mlp": 1.025846, "epoch": 0.202248542054951, "flos": 17923446564480.0, "grad_norm": 2.4605282363226184, "language_loss": 0.84155953, "learning_rate": 3.6969029381082415e-06, "loss": 0.86412597, "num_input_tokens_seen": 35706075, "step": 1682, "time_per_iteration": 2.495000123977661 }, { "auxiliary_loss_clip": 0.01184506, "auxiliary_loss_mlp": 0.01031277, "balance_loss_clip": 1.05899298, "balance_loss_mlp": 1.02191353, "epoch": 0.2023687849455901, "flos": 19864634296320.0, "grad_norm": 3.9453847076596618, "language_loss": 0.79655206, "learning_rate": 3.696490520604237e-06, "loss": 0.81870985, "num_input_tokens_seen": 35724765, "step": 1683, "time_per_iteration": 2.578317403793335 }, { "auxiliary_loss_clip": 0.01198847, "auxiliary_loss_mlp": 0.01025917, "balance_loss_clip": 1.0611608, "balance_loss_mlp": 1.01688099, "epoch": 0.20248902783622919, "flos": 22564721600640.0, "grad_norm": 1.7052939037775134, "language_loss": 0.80656451, "learning_rate": 3.696077845743968e-06, "loss": 0.82881212, "num_input_tokens_seen": 35744355, "step": 1684, "time_per_iteration": 2.5487425327301025 }, { "auxiliary_loss_clip": 0.01221345, "auxiliary_loss_mlp": 0.01033491, "balance_loss_clip": 1.06353426, "balance_loss_mlp": 1.02342391, "epoch": 0.20260927072686827, "flos": 22709728805760.0, "grad_norm": 2.3815747828739964, "language_loss": 0.73337722, "learning_rate": 3.69566491359004e-06, "loss": 0.75592554, "num_input_tokens_seen": 35761000, "step": 1685, "time_per_iteration": 2.528510093688965 }, { "auxiliary_loss_clip": 0.0118516, "auxiliary_loss_mlp": 0.01034886, "balance_loss_clip": 1.05817342, "balance_loss_mlp": 1.02492011, "epoch": 0.20272951361750738, "flos": 51023998650240.0, "grad_norm": 3.933181495170971, "language_loss": 0.69612551, "learning_rate": 3.695251724205092e-06, "loss": 0.71832597, "num_input_tokens_seen": 35785360, "step": 1686, "time_per_iteration": 2.8178625106811523 }, { "auxiliary_loss_clip": 0.01216492, "auxiliary_loss_mlp": 0.01033008, "balance_loss_clip": 1.06177151, "balance_loss_mlp": 1.02323294, "epoch": 0.20284975650814646, "flos": 26578133879040.0, "grad_norm": 1.6286031159580288, "language_loss": 0.86638433, "learning_rate": 3.6948382776518054e-06, "loss": 0.8888793, "num_input_tokens_seen": 35806065, "step": 1687, "time_per_iteration": 2.5669915676116943 }, { "auxiliary_loss_clip": 0.01181965, "auxiliary_loss_mlp": 0.01039167, "balance_loss_clip": 1.05671179, "balance_loss_mlp": 1.02941513, "epoch": 0.20296999939878554, "flos": 16034222833920.0, "grad_norm": 4.514600189293757, "language_loss": 0.79371703, "learning_rate": 3.6944245739929e-06, "loss": 0.81592834, "num_input_tokens_seen": 35822225, "step": 1688, "time_per_iteration": 2.549323797225952 }, { "auxiliary_loss_clip": 0.01201791, "auxiliary_loss_mlp": 0.01040372, "balance_loss_clip": 1.06029034, "balance_loss_mlp": 1.03046536, "epoch": 0.20309024228942463, "flos": 19203374868480.0, "grad_norm": 6.2101087334099425, "language_loss": 0.71964628, "learning_rate": 3.6940106132911332e-06, "loss": 0.74206793, "num_input_tokens_seen": 35839410, "step": 1689, "time_per_iteration": 2.533395767211914 }, { "auxiliary_loss_clip": 0.01207532, "auxiliary_loss_mlp": 0.01034316, "balance_loss_clip": 1.06277561, "balance_loss_mlp": 1.02512527, "epoch": 0.20321048518006374, "flos": 22821087945600.0, "grad_norm": 1.7585589232562562, "language_loss": 0.88923246, "learning_rate": 3.6935963956093037e-06, "loss": 0.91165096, "num_input_tokens_seen": 35859495, "step": 1690, "time_per_iteration": 2.545464038848877 }, { "auxiliary_loss_clip": 0.01194583, "auxiliary_loss_mlp": 0.01032331, "balance_loss_clip": 1.05826819, "balance_loss_mlp": 1.02330697, "epoch": 0.20333072807070282, "flos": 19096397187840.0, "grad_norm": 1.734803479224798, "language_loss": 0.69162923, "learning_rate": 3.6931819210102474e-06, "loss": 0.71389842, "num_input_tokens_seen": 35878890, "step": 1691, "time_per_iteration": 2.5537819862365723 }, { "auxiliary_loss_clip": 0.0122165, "auxiliary_loss_mlp": 0.0103516, "balance_loss_clip": 1.06419921, "balance_loss_mlp": 1.02537251, "epoch": 0.2034509709613419, "flos": 18180962144640.0, "grad_norm": 2.022279199970276, "language_loss": 0.8474915, "learning_rate": 3.6927671895568402e-06, "loss": 0.87005955, "num_input_tokens_seen": 35897950, "step": 1692, "time_per_iteration": 2.5202362537384033 }, { "auxiliary_loss_clip": 0.01218279, "auxiliary_loss_mlp": 0.01035747, "balance_loss_clip": 1.0636524, "balance_loss_mlp": 1.02619874, "epoch": 0.20357121385198101, "flos": 22923899648640.0, "grad_norm": 1.8241207289876609, "language_loss": 0.86901534, "learning_rate": 3.692352201311996e-06, "loss": 0.89155555, "num_input_tokens_seen": 35916800, "step": 1693, "time_per_iteration": 2.512848138809204 }, { "auxiliary_loss_clip": 0.01169803, "auxiliary_loss_mlp": 0.01029335, "balance_loss_clip": 1.05633664, "balance_loss_mlp": 1.02000093, "epoch": 0.2036914567426201, "flos": 20922131629440.0, "grad_norm": 1.7842334458437972, "language_loss": 0.76321661, "learning_rate": 3.6919369563386687e-06, "loss": 0.78520793, "num_input_tokens_seen": 35936600, "step": 1694, "time_per_iteration": 2.620120048522949 }, { "auxiliary_loss_clip": 0.01187043, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.06135082, "balance_loss_mlp": 1.02387381, "epoch": 0.20381169963325918, "flos": 15519155760000.0, "grad_norm": 2.6611281919122014, "language_loss": 0.78831178, "learning_rate": 3.69152145469985e-06, "loss": 0.81050956, "num_input_tokens_seen": 35953645, "step": 1695, "time_per_iteration": 2.553910255432129 }, { "auxiliary_loss_clip": 0.01162611, "auxiliary_loss_mlp": 0.01044852, "balance_loss_clip": 1.05558705, "balance_loss_mlp": 1.03397417, "epoch": 0.20393194252389826, "flos": 28833143760000.0, "grad_norm": 2.0520999050451434, "language_loss": 0.81947726, "learning_rate": 3.691105696458572e-06, "loss": 0.8415519, "num_input_tokens_seen": 35970940, "step": 1696, "time_per_iteration": 2.6742448806762695 }, { "auxiliary_loss_clip": 0.01220367, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.06654644, "balance_loss_mlp": 1.02135515, "epoch": 0.20405218541453737, "flos": 22488554810880.0, "grad_norm": 4.328252715624741, "language_loss": 0.67752624, "learning_rate": 3.690689681677904e-06, "loss": 0.70003736, "num_input_tokens_seen": 35989410, "step": 1697, "time_per_iteration": 2.527660846710205 }, { "auxiliary_loss_clip": 0.01188608, "auxiliary_loss_mlp": 0.01031746, "balance_loss_clip": 1.05847466, "balance_loss_mlp": 1.02272201, "epoch": 0.20417242830517646, "flos": 25374408278400.0, "grad_norm": 1.7069664040136512, "language_loss": 0.88590586, "learning_rate": 3.690273410420956e-06, "loss": 0.90810943, "num_input_tokens_seen": 36009175, "step": 1698, "time_per_iteration": 2.61661434173584 }, { "auxiliary_loss_clip": 0.01202719, "auxiliary_loss_mlp": 0.0103629, "balance_loss_clip": 1.06076145, "balance_loss_mlp": 1.02700412, "epoch": 0.20429267119581554, "flos": 14793078240000.0, "grad_norm": 4.383743480557219, "language_loss": 0.76848876, "learning_rate": 3.689856882750875e-06, "loss": 0.79087889, "num_input_tokens_seen": 36024375, "step": 1699, "time_per_iteration": 3.3093175888061523 }, { "auxiliary_loss_clip": 0.01200932, "auxiliary_loss_mlp": 0.010327, "balance_loss_clip": 1.06264687, "balance_loss_mlp": 1.02391386, "epoch": 0.20441291408645465, "flos": 17781851151360.0, "grad_norm": 1.8709501683404441, "language_loss": 0.78793871, "learning_rate": 3.6894400987308486e-06, "loss": 0.81027502, "num_input_tokens_seen": 36041895, "step": 1700, "time_per_iteration": 2.4629805088043213 }, { "auxiliary_loss_clip": 0.01207364, "auxiliary_loss_mlp": 0.01038483, "balance_loss_clip": 1.06224144, "balance_loss_mlp": 1.02873802, "epoch": 0.20453315697709373, "flos": 16435668211200.0, "grad_norm": 1.8963789630471082, "language_loss": 0.84851086, "learning_rate": 3.6890230584241024e-06, "loss": 0.87096936, "num_input_tokens_seen": 36058825, "step": 1701, "time_per_iteration": 2.473630905151367 }, { "auxiliary_loss_clip": 0.01117082, "auxiliary_loss_mlp": 0.01017989, "balance_loss_clip": 1.02653635, "balance_loss_mlp": 1.01595056, "epoch": 0.20465339986773282, "flos": 66713085653760.0, "grad_norm": 1.079628427469279, "language_loss": 0.66417402, "learning_rate": 3.6886057618939016e-06, "loss": 0.6855247, "num_input_tokens_seen": 36121645, "step": 1702, "time_per_iteration": 3.0877153873443604 }, { "auxiliary_loss_clip": 0.01166953, "auxiliary_loss_mlp": 0.01043209, "balance_loss_clip": 1.0561502, "balance_loss_mlp": 1.03309953, "epoch": 0.2047736427583719, "flos": 41974114924800.0, "grad_norm": 1.9040943911125248, "language_loss": 0.6910609, "learning_rate": 3.6881882092035492e-06, "loss": 0.71316254, "num_input_tokens_seen": 36143030, "step": 1703, "time_per_iteration": 2.7542049884796143 }, { "auxiliary_loss_clip": 0.01088371, "auxiliary_loss_mlp": 0.00755769, "balance_loss_clip": 1.02766347, "balance_loss_mlp": 1.00053215, "epoch": 0.204893885649011, "flos": 69940878641280.0, "grad_norm": 0.9258463559652654, "language_loss": 0.61184669, "learning_rate": 3.6877704004163873e-06, "loss": 0.630288, "num_input_tokens_seen": 36203435, "step": 1704, "time_per_iteration": 3.2626121044158936 }, { "auxiliary_loss_clip": 0.01220419, "auxiliary_loss_mlp": 0.01033827, "balance_loss_clip": 1.06428337, "balance_loss_mlp": 1.02348018, "epoch": 0.2050141285396501, "flos": 22200012858240.0, "grad_norm": 2.175912443947148, "language_loss": 0.77588618, "learning_rate": 3.6873523355957984e-06, "loss": 0.79842865, "num_input_tokens_seen": 36222435, "step": 1705, "time_per_iteration": 3.327622413635254 }, { "auxiliary_loss_clip": 0.01115365, "auxiliary_loss_mlp": 0.01006685, "balance_loss_clip": 1.02520096, "balance_loss_mlp": 1.0046463, "epoch": 0.20513437143028918, "flos": 46283721730560.0, "grad_norm": 0.9799203017007736, "language_loss": 0.6413393, "learning_rate": 3.686934014805201e-06, "loss": 0.66255981, "num_input_tokens_seen": 36273065, "step": 1706, "time_per_iteration": 3.7156033515930176 }, { "auxiliary_loss_clip": 0.01202934, "auxiliary_loss_mlp": 0.01038778, "balance_loss_clip": 1.06356168, "balance_loss_mlp": 1.02921748, "epoch": 0.20525461432092829, "flos": 21904324099200.0, "grad_norm": 1.8423043649697464, "language_loss": 0.81273705, "learning_rate": 3.6865154381080552e-06, "loss": 0.83515418, "num_input_tokens_seen": 36293750, "step": 1707, "time_per_iteration": 3.3514809608459473 }, { "auxiliary_loss_clip": 0.01128011, "auxiliary_loss_mlp": 0.010299, "balance_loss_clip": 1.05236316, "balance_loss_mlp": 1.02069092, "epoch": 0.20537485721156737, "flos": 21214264942080.0, "grad_norm": 1.94272526529882, "language_loss": 0.82480317, "learning_rate": 3.6860966055678585e-06, "loss": 0.84638226, "num_input_tokens_seen": 36310105, "step": 1708, "time_per_iteration": 2.6890857219696045 }, { "auxiliary_loss_clip": 0.01205871, "auxiliary_loss_mlp": 0.01041815, "balance_loss_clip": 1.06422508, "balance_loss_mlp": 1.0317837, "epoch": 0.20549510010220645, "flos": 20191205773440.0, "grad_norm": 1.8327083175605325, "language_loss": 0.86357588, "learning_rate": 3.685677517248147e-06, "loss": 0.88605273, "num_input_tokens_seen": 36328995, "step": 1709, "time_per_iteration": 2.5011420249938965 }, { "auxiliary_loss_clip": 0.01189419, "auxiliary_loss_mlp": 0.00764406, "balance_loss_clip": 1.06558514, "balance_loss_mlp": 1.00087571, "epoch": 0.20561534299284553, "flos": 17016702612480.0, "grad_norm": 1.8940528465644986, "language_loss": 0.80435836, "learning_rate": 3.6852581732124967e-06, "loss": 0.82389659, "num_input_tokens_seen": 36346340, "step": 1710, "time_per_iteration": 2.5553958415985107 }, { "auxiliary_loss_clip": 0.01206969, "auxiliary_loss_mlp": 0.01033567, "balance_loss_clip": 1.06480241, "balance_loss_mlp": 1.02351213, "epoch": 0.20573558588348465, "flos": 22890467064960.0, "grad_norm": 2.9550287906302084, "language_loss": 0.76077873, "learning_rate": 3.6848385735245213e-06, "loss": 0.78318411, "num_input_tokens_seen": 36365430, "step": 1711, "time_per_iteration": 2.545814275741577 }, { "auxiliary_loss_clip": 0.01186558, "auxiliary_loss_mlp": 0.01032695, "balance_loss_clip": 1.05517054, "balance_loss_mlp": 1.02336097, "epoch": 0.20585582877412373, "flos": 24643123286400.0, "grad_norm": 1.816164872469563, "language_loss": 0.86060917, "learning_rate": 3.6844187182478734e-06, "loss": 0.88280165, "num_input_tokens_seen": 36386285, "step": 1712, "time_per_iteration": 2.5832128524780273 }, { "auxiliary_loss_clip": 0.01178493, "auxiliary_loss_mlp": 0.01030458, "balance_loss_clip": 1.05583549, "balance_loss_mlp": 1.02099848, "epoch": 0.2059760716647628, "flos": 24206952435840.0, "grad_norm": 1.656136867252933, "language_loss": 0.74730504, "learning_rate": 3.683998607446246e-06, "loss": 0.76939458, "num_input_tokens_seen": 36404935, "step": 1713, "time_per_iteration": 2.5760860443115234 }, { "auxiliary_loss_clip": 0.01204658, "auxiliary_loss_mlp": 0.01045292, "balance_loss_clip": 1.06433654, "balance_loss_mlp": 1.03669727, "epoch": 0.20609631455540192, "flos": 20229522606720.0, "grad_norm": 1.8677170672975212, "language_loss": 0.75019348, "learning_rate": 3.6835782411833686e-06, "loss": 0.77269304, "num_input_tokens_seen": 36424455, "step": 1714, "time_per_iteration": 2.557457447052002 }, { "auxiliary_loss_clip": 0.01163789, "auxiliary_loss_mlp": 0.01034639, "balance_loss_clip": 1.05579591, "balance_loss_mlp": 1.02487016, "epoch": 0.206216557446041, "flos": 19864957518720.0, "grad_norm": 1.9216832351580975, "language_loss": 0.74329787, "learning_rate": 3.68315761952301e-06, "loss": 0.76528215, "num_input_tokens_seen": 36441685, "step": 1715, "time_per_iteration": 2.557408094406128 }, { "auxiliary_loss_clip": 0.01220315, "auxiliary_loss_mlp": 0.01037572, "balance_loss_clip": 1.06523728, "balance_loss_mlp": 1.02794552, "epoch": 0.2063368003366801, "flos": 24096311568000.0, "grad_norm": 2.007141782840156, "language_loss": 0.83047593, "learning_rate": 3.6827367425289797e-06, "loss": 0.85305476, "num_input_tokens_seen": 36461460, "step": 1716, "time_per_iteration": 2.5541040897369385 }, { "auxiliary_loss_clip": 0.01190372, "auxiliary_loss_mlp": 0.01030896, "balance_loss_clip": 1.06114793, "balance_loss_mlp": 1.02061439, "epoch": 0.2064570432273192, "flos": 20340163474560.0, "grad_norm": 2.7823186274286016, "language_loss": 0.72970492, "learning_rate": 3.6823156102651225e-06, "loss": 0.7519176, "num_input_tokens_seen": 36479615, "step": 1717, "time_per_iteration": 2.5223388671875 }, { "auxiliary_loss_clip": 0.01134096, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.05721724, "balance_loss_mlp": 1.02146244, "epoch": 0.20657728611795828, "flos": 20520363029760.0, "grad_norm": 1.8698432655575277, "language_loss": 0.70648134, "learning_rate": 3.6818942227953257e-06, "loss": 0.7281332, "num_input_tokens_seen": 36500160, "step": 1718, "time_per_iteration": 2.614090919494629 }, { "auxiliary_loss_clip": 0.01175611, "auxiliary_loss_mlp": 0.01031033, "balance_loss_clip": 1.0603174, "balance_loss_mlp": 1.02132297, "epoch": 0.20669752900859736, "flos": 21799285752960.0, "grad_norm": 1.9106012471330371, "language_loss": 0.69166601, "learning_rate": 3.681472580183512e-06, "loss": 0.71373242, "num_input_tokens_seen": 36518810, "step": 1719, "time_per_iteration": 2.5682260990142822 }, { "auxiliary_loss_clip": 0.0120147, "auxiliary_loss_mlp": 0.01032327, "balance_loss_clip": 1.06394529, "balance_loss_mlp": 1.02333891, "epoch": 0.20681777189923645, "flos": 15122020014720.0, "grad_norm": 1.7915004781149155, "language_loss": 0.86325067, "learning_rate": 3.6810506824936455e-06, "loss": 0.88558865, "num_input_tokens_seen": 36536890, "step": 1720, "time_per_iteration": 2.47111177444458 }, { "auxiliary_loss_clip": 0.01087482, "auxiliary_loss_mlp": 0.010051, "balance_loss_clip": 1.02257824, "balance_loss_mlp": 1.00282288, "epoch": 0.20693801478987556, "flos": 56481021509760.0, "grad_norm": 1.1317507893838084, "language_loss": 0.62530267, "learning_rate": 3.680628529789726e-06, "loss": 0.64622855, "num_input_tokens_seen": 36589300, "step": 1721, "time_per_iteration": 2.94596266746521 }, { "auxiliary_loss_clip": 0.012262, "auxiliary_loss_mlp": 0.01039448, "balance_loss_clip": 1.0663687, "balance_loss_mlp": 1.02882624, "epoch": 0.20705825768051464, "flos": 21614201948160.0, "grad_norm": 1.8765563320858571, "language_loss": 0.85612869, "learning_rate": 3.680206122135796e-06, "loss": 0.87878513, "num_input_tokens_seen": 36609905, "step": 1722, "time_per_iteration": 2.4761531352996826 }, { "auxiliary_loss_clip": 0.0116922, "auxiliary_loss_mlp": 0.010387, "balance_loss_clip": 1.06507683, "balance_loss_mlp": 1.02997386, "epoch": 0.20717850057115372, "flos": 25848895962240.0, "grad_norm": 1.803188406557687, "language_loss": 0.78218549, "learning_rate": 3.6797834595959323e-06, "loss": 0.80426466, "num_input_tokens_seen": 36629805, "step": 1723, "time_per_iteration": 2.61983323097229 }, { "auxiliary_loss_clip": 0.01147727, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.05313325, "balance_loss_mlp": 1.03179991, "epoch": 0.20729874346179283, "flos": 29130807767040.0, "grad_norm": 2.4710442945737277, "language_loss": 0.78367615, "learning_rate": 3.679360542234254e-06, "loss": 0.80557466, "num_input_tokens_seen": 36649150, "step": 1724, "time_per_iteration": 2.654662847518921 }, { "auxiliary_loss_clip": 0.01182316, "auxiliary_loss_mlp": 0.00764963, "balance_loss_clip": 1.05590117, "balance_loss_mlp": 1.00089419, "epoch": 0.20741898635243192, "flos": 29023363209600.0, "grad_norm": 1.6746600180368396, "language_loss": 0.72186583, "learning_rate": 3.678937370114916e-06, "loss": 0.74133861, "num_input_tokens_seen": 36668955, "step": 1725, "time_per_iteration": 3.48898983001709 }, { "auxiliary_loss_clip": 0.01184175, "auxiliary_loss_mlp": 0.0102942, "balance_loss_clip": 1.06155932, "balance_loss_mlp": 1.02088475, "epoch": 0.207539229243071, "flos": 15559447841280.0, "grad_norm": 1.9454122270823844, "language_loss": 0.78826487, "learning_rate": 3.678513943302114e-06, "loss": 0.81040084, "num_input_tokens_seen": 36685730, "step": 1726, "time_per_iteration": 2.545203924179077 }, { "auxiliary_loss_clip": 0.0121827, "auxiliary_loss_mlp": 0.01035791, "balance_loss_clip": 1.06433117, "balance_loss_mlp": 1.02687383, "epoch": 0.20765947213371008, "flos": 20521081301760.0, "grad_norm": 1.6933242353386795, "language_loss": 0.8538264, "learning_rate": 3.678090261860082e-06, "loss": 0.87636709, "num_input_tokens_seen": 36705460, "step": 1727, "time_per_iteration": 2.5430078506469727 }, { "auxiliary_loss_clip": 0.01172423, "auxiliary_loss_mlp": 0.01035899, "balance_loss_clip": 1.05392444, "balance_loss_mlp": 1.02663016, "epoch": 0.2077797150243492, "flos": 19354415558400.0, "grad_norm": 1.8073339637475292, "language_loss": 0.77326918, "learning_rate": 3.6776663258530906e-06, "loss": 0.7953524, "num_input_tokens_seen": 36724110, "step": 1728, "time_per_iteration": 2.6801679134368896 }, { "auxiliary_loss_clip": 0.01206503, "auxiliary_loss_mlp": 0.01032931, "balance_loss_clip": 1.06291127, "balance_loss_mlp": 1.0240382, "epoch": 0.20789995791498828, "flos": 21829952989440.0, "grad_norm": 1.8307166220351654, "language_loss": 0.71375144, "learning_rate": 3.6772421353454516e-06, "loss": 0.73614573, "num_input_tokens_seen": 36742705, "step": 1729, "time_per_iteration": 2.6156044006347656 }, { "auxiliary_loss_clip": 0.01202153, "auxiliary_loss_mlp": 0.01032222, "balance_loss_clip": 1.06386757, "balance_loss_mlp": 1.02293587, "epoch": 0.20802020080562736, "flos": 23148844571520.0, "grad_norm": 8.882434085461293, "language_loss": 0.8839817, "learning_rate": 3.6768176904015153e-06, "loss": 0.90632546, "num_input_tokens_seen": 36762510, "step": 1730, "time_per_iteration": 2.529400587081909 }, { "auxiliary_loss_clip": 0.01203104, "auxiliary_loss_mlp": 0.01038505, "balance_loss_clip": 1.06127071, "balance_loss_mlp": 1.02921844, "epoch": 0.20814044369626647, "flos": 23072677781760.0, "grad_norm": 2.6368028659940492, "language_loss": 0.60422206, "learning_rate": 3.6763929910856674e-06, "loss": 0.62663811, "num_input_tokens_seen": 36780960, "step": 1731, "time_per_iteration": 3.3543660640716553 }, { "auxiliary_loss_clip": 0.01203673, "auxiliary_loss_mlp": 0.01041385, "balance_loss_clip": 1.06389642, "balance_loss_mlp": 1.03196144, "epoch": 0.20826068658690555, "flos": 19608016556160.0, "grad_norm": 2.414765034132618, "language_loss": 0.77754235, "learning_rate": 3.6759680374623365e-06, "loss": 0.79999292, "num_input_tokens_seen": 36798875, "step": 1732, "time_per_iteration": 3.3706953525543213 }, { "auxiliary_loss_clip": 0.01216353, "auxiliary_loss_mlp": 0.01030399, "balance_loss_clip": 1.06389928, "balance_loss_mlp": 1.02096367, "epoch": 0.20838092947754464, "flos": 25374049142400.0, "grad_norm": 2.383951389535304, "language_loss": 0.75453377, "learning_rate": 3.675542829595986e-06, "loss": 0.7770012, "num_input_tokens_seen": 36818540, "step": 1733, "time_per_iteration": 3.331346035003662 }, { "auxiliary_loss_clip": 0.01187756, "auxiliary_loss_mlp": 0.01034533, "balance_loss_clip": 1.05992317, "balance_loss_mlp": 1.02521074, "epoch": 0.20850117236818372, "flos": 24061729749120.0, "grad_norm": 1.437901828774705, "language_loss": 0.79396731, "learning_rate": 3.6751173675511213e-06, "loss": 0.81619018, "num_input_tokens_seen": 36840585, "step": 1734, "time_per_iteration": 2.581486463546753 }, { "auxiliary_loss_clip": 0.01182469, "auxiliary_loss_mlp": 0.01036183, "balance_loss_clip": 1.05398667, "balance_loss_mlp": 1.02736163, "epoch": 0.20862141525882283, "flos": 20077799558400.0, "grad_norm": 2.1602936115016993, "language_loss": 0.87514347, "learning_rate": 3.674691651392283e-06, "loss": 0.89733005, "num_input_tokens_seen": 36858255, "step": 1735, "time_per_iteration": 2.5433480739593506 }, { "auxiliary_loss_clip": 0.01194135, "auxiliary_loss_mlp": 0.01045702, "balance_loss_clip": 1.0634383, "balance_loss_mlp": 1.03649926, "epoch": 0.2087416581494619, "flos": 39015183237120.0, "grad_norm": 2.4436272545325513, "language_loss": 0.76214147, "learning_rate": 3.674265681184053e-06, "loss": 0.78453982, "num_input_tokens_seen": 36881515, "step": 1736, "time_per_iteration": 2.6752898693084717 }, { "auxiliary_loss_clip": 0.01188291, "auxiliary_loss_mlp": 0.01031949, "balance_loss_clip": 1.05806065, "balance_loss_mlp": 1.02281713, "epoch": 0.208861901040101, "flos": 26101994169600.0, "grad_norm": 1.6540068303848314, "language_loss": 0.86296463, "learning_rate": 3.6738394569910504e-06, "loss": 0.885167, "num_input_tokens_seen": 36902055, "step": 1737, "time_per_iteration": 2.56394362449646 }, { "auxiliary_loss_clip": 0.01202918, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.06375539, "balance_loss_mlp": 1.02349091, "epoch": 0.2089821439307401, "flos": 28398732675840.0, "grad_norm": 2.2776346492230375, "language_loss": 0.82504511, "learning_rate": 3.6734129788779333e-06, "loss": 0.8473981, "num_input_tokens_seen": 36921230, "step": 1738, "time_per_iteration": 2.5582215785980225 }, { "auxiliary_loss_clip": 0.01170748, "auxiliary_loss_mlp": 0.01031851, "balance_loss_clip": 1.06071949, "balance_loss_mlp": 1.02264166, "epoch": 0.2091023868213792, "flos": 21069616872960.0, "grad_norm": 1.6406682291254882, "language_loss": 0.90560412, "learning_rate": 3.6729862469093976e-06, "loss": 0.92763019, "num_input_tokens_seen": 36940325, "step": 1739, "time_per_iteration": 2.5706984996795654 }, { "auxiliary_loss_clip": 0.01172463, "auxiliary_loss_mlp": 0.0103843, "balance_loss_clip": 1.05536759, "balance_loss_mlp": 1.02911961, "epoch": 0.20922262971201827, "flos": 22455481363200.0, "grad_norm": 2.4190153924251176, "language_loss": 0.82254064, "learning_rate": 3.6725592611501782e-06, "loss": 0.84464955, "num_input_tokens_seen": 36959000, "step": 1740, "time_per_iteration": 2.5368447303771973 }, { "auxiliary_loss_clip": 0.01200981, "auxiliary_loss_mlp": 0.01039083, "balance_loss_clip": 1.06038165, "balance_loss_mlp": 1.02958798, "epoch": 0.20934287260265738, "flos": 27852244179840.0, "grad_norm": 3.252599703194495, "language_loss": 0.7698943, "learning_rate": 3.6721320216650496e-06, "loss": 0.79229492, "num_input_tokens_seen": 36979615, "step": 1741, "time_per_iteration": 2.5715086460113525 }, { "auxiliary_loss_clip": 0.01185693, "auxiliary_loss_mlp": 0.0104033, "balance_loss_clip": 1.0593679, "balance_loss_mlp": 1.03079963, "epoch": 0.20946311549329646, "flos": 16435309075200.0, "grad_norm": 1.6848133030444423, "language_loss": 0.83857214, "learning_rate": 3.6717045285188215e-06, "loss": 0.86083245, "num_input_tokens_seen": 36997310, "step": 1742, "time_per_iteration": 2.538525342941284 }, { "auxiliary_loss_clip": 0.01141515, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.05084956, "balance_loss_mlp": 1.02732861, "epoch": 0.20958335838393555, "flos": 22492720788480.0, "grad_norm": 2.1099686823320654, "language_loss": 0.87020957, "learning_rate": 3.671276781776346e-06, "loss": 0.89199394, "num_input_tokens_seen": 37015965, "step": 1743, "time_per_iteration": 2.6134045124053955 }, { "auxiliary_loss_clip": 0.01176265, "auxiliary_loss_mlp": 0.01033339, "balance_loss_clip": 1.05311918, "balance_loss_mlp": 1.02415419, "epoch": 0.20970360127457463, "flos": 25224768218880.0, "grad_norm": 1.906691330481457, "language_loss": 0.67145264, "learning_rate": 3.6708487815025128e-06, "loss": 0.69354868, "num_input_tokens_seen": 37036545, "step": 1744, "time_per_iteration": 2.58872652053833 }, { "auxiliary_loss_clip": 0.01172732, "auxiliary_loss_mlp": 0.01030618, "balance_loss_clip": 1.05657196, "balance_loss_mlp": 1.02116442, "epoch": 0.20982384416521374, "flos": 18479164855680.0, "grad_norm": 2.292926979330607, "language_loss": 0.74404144, "learning_rate": 3.6704205277622463e-06, "loss": 0.7660749, "num_input_tokens_seen": 37054985, "step": 1745, "time_per_iteration": 2.6021785736083984 }, { "auxiliary_loss_clip": 0.01187591, "auxiliary_loss_mlp": 0.01033413, "balance_loss_clip": 1.05626488, "balance_loss_mlp": 1.0242219, "epoch": 0.20994408705585282, "flos": 25373546352000.0, "grad_norm": 1.6762954296293981, "language_loss": 0.80512577, "learning_rate": 3.6699920206205146e-06, "loss": 0.82733583, "num_input_tokens_seen": 37075725, "step": 1746, "time_per_iteration": 2.5860376358032227 }, { "auxiliary_loss_clip": 0.01201971, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.05970824, "balance_loss_mlp": 1.02510393, "epoch": 0.2100643299464919, "flos": 21320955313920.0, "grad_norm": 2.0023847954338843, "language_loss": 0.81598699, "learning_rate": 3.669563260142321e-06, "loss": 0.83834743, "num_input_tokens_seen": 37094615, "step": 1747, "time_per_iteration": 2.5021493434906006 }, { "auxiliary_loss_clip": 0.01182758, "auxiliary_loss_mlp": 0.01038151, "balance_loss_clip": 1.06117511, "balance_loss_mlp": 1.02884698, "epoch": 0.21018457283713102, "flos": 19354379644800.0, "grad_norm": 1.9329789875094434, "language_loss": 0.84209371, "learning_rate": 3.6691342463927083e-06, "loss": 0.86430287, "num_input_tokens_seen": 37113610, "step": 1748, "time_per_iteration": 2.5173652172088623 }, { "auxiliary_loss_clip": 0.01174863, "auxiliary_loss_mlp": 0.01038985, "balance_loss_clip": 1.05768466, "balance_loss_mlp": 1.02957892, "epoch": 0.2103048157277701, "flos": 28330035914880.0, "grad_norm": 1.619241627535905, "language_loss": 0.81551063, "learning_rate": 3.668704979436758e-06, "loss": 0.83764911, "num_input_tokens_seen": 37133705, "step": 1749, "time_per_iteration": 2.626819610595703 }, { "auxiliary_loss_clip": 0.01176972, "auxiliary_loss_mlp": 0.01035693, "balance_loss_clip": 1.05405307, "balance_loss_mlp": 1.02638865, "epoch": 0.21042505861840918, "flos": 17457290835840.0, "grad_norm": 1.9597630624351245, "language_loss": 0.78709507, "learning_rate": 3.668275459339588e-06, "loss": 0.80922174, "num_input_tokens_seen": 37152185, "step": 1750, "time_per_iteration": 2.5133371353149414 }, { "auxiliary_loss_clip": 0.01216895, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.0639286, "balance_loss_mlp": 1.02714109, "epoch": 0.21054530150904827, "flos": 14209817195520.0, "grad_norm": 2.112450244642857, "language_loss": 0.80269277, "learning_rate": 3.667845686166358e-06, "loss": 0.82523543, "num_input_tokens_seen": 37169110, "step": 1751, "time_per_iteration": 2.430022716522217 }, { "auxiliary_loss_clip": 0.01153772, "auxiliary_loss_mlp": 0.01030117, "balance_loss_clip": 1.05169058, "balance_loss_mlp": 1.02035403, "epoch": 0.21066554439968738, "flos": 18618210403200.0, "grad_norm": 1.7866486114255284, "language_loss": 0.85896385, "learning_rate": 3.6674156599822634e-06, "loss": 0.88080275, "num_input_tokens_seen": 37184905, "step": 1752, "time_per_iteration": 3.3723833560943604 }, { "auxiliary_loss_clip": 0.01157478, "auxiliary_loss_mlp": 0.0103964, "balance_loss_clip": 1.0509603, "balance_loss_mlp": 1.02996576, "epoch": 0.21078578729032646, "flos": 23658883741440.0, "grad_norm": 2.2056655646680534, "language_loss": 0.81837642, "learning_rate": 3.666985380852539e-06, "loss": 0.84034765, "num_input_tokens_seen": 37203910, "step": 1753, "time_per_iteration": 2.600553512573242 }, { "auxiliary_loss_clip": 0.01185862, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.0602572, "balance_loss_mlp": 1.02281022, "epoch": 0.21090603018096554, "flos": 29346379240320.0, "grad_norm": 3.260311458547385, "language_loss": 0.74748254, "learning_rate": 3.6665548488424576e-06, "loss": 0.76966667, "num_input_tokens_seen": 37222670, "step": 1754, "time_per_iteration": 2.575789451599121 }, { "auxiliary_loss_clip": 0.01217189, "auxiliary_loss_mlp": 0.01037331, "balance_loss_clip": 1.06208384, "balance_loss_mlp": 1.02701926, "epoch": 0.21102627307160465, "flos": 23261245205760.0, "grad_norm": 2.0270978935915496, "language_loss": 0.88008004, "learning_rate": 3.6661240640173307e-06, "loss": 0.90262526, "num_input_tokens_seen": 37244140, "step": 1755, "time_per_iteration": 2.5638747215270996 }, { "auxiliary_loss_clip": 0.01083785, "auxiliary_loss_mlp": 0.01010325, "balance_loss_clip": 1.02759123, "balance_loss_mlp": 1.00809598, "epoch": 0.21114651596224374, "flos": 54633454577280.0, "grad_norm": 0.8889622053571035, "language_loss": 0.57912737, "learning_rate": 3.6656930264425085e-06, "loss": 0.60006845, "num_input_tokens_seen": 37308185, "step": 1756, "time_per_iteration": 3.1736364364624023 }, { "auxiliary_loss_clip": 0.01217146, "auxiliary_loss_mlp": 0.01037821, "balance_loss_clip": 1.06263375, "balance_loss_mlp": 1.02786684, "epoch": 0.21126675885288282, "flos": 21543314457600.0, "grad_norm": 1.7376468322884182, "language_loss": 0.75861591, "learning_rate": 3.665261736183378e-06, "loss": 0.7811656, "num_input_tokens_seen": 37328220, "step": 1757, "time_per_iteration": 2.507418394088745 }, { "auxiliary_loss_clip": 0.01172478, "auxiliary_loss_mlp": 0.01031661, "balance_loss_clip": 1.05848813, "balance_loss_mlp": 1.02159965, "epoch": 0.2113870017435219, "flos": 10961876678400.0, "grad_norm": 2.39046261252039, "language_loss": 0.89167386, "learning_rate": 3.664830193305366e-06, "loss": 0.91371524, "num_input_tokens_seen": 37345995, "step": 1758, "time_per_iteration": 4.308273792266846 }, { "auxiliary_loss_clip": 0.01166234, "auxiliary_loss_mlp": 0.01036465, "balance_loss_clip": 1.05290079, "balance_loss_mlp": 1.02668428, "epoch": 0.211507244634161, "flos": 16653825463680.0, "grad_norm": 6.170319221508419, "language_loss": 0.76876199, "learning_rate": 3.6643983978739373e-06, "loss": 0.79078901, "num_input_tokens_seen": 37362610, "step": 1759, "time_per_iteration": 3.3425939083099365 }, { "auxiliary_loss_clip": 0.01180341, "auxiliary_loss_mlp": 0.01033525, "balance_loss_clip": 1.05904484, "balance_loss_mlp": 1.02352321, "epoch": 0.2116274875248001, "flos": 20954091755520.0, "grad_norm": 1.9004249772441613, "language_loss": 0.8203029, "learning_rate": 3.663966349954596e-06, "loss": 0.84244156, "num_input_tokens_seen": 37382790, "step": 1760, "time_per_iteration": 2.5548155307769775 }, { "auxiliary_loss_clip": 0.01105109, "auxiliary_loss_mlp": 0.01001855, "balance_loss_clip": 1.02659369, "balance_loss_mlp": 0.99964952, "epoch": 0.21174773041543918, "flos": 68196949424640.0, "grad_norm": 0.7922156201813727, "language_loss": 0.59808755, "learning_rate": 3.6635340496128816e-06, "loss": 0.6191572, "num_input_tokens_seen": 37439720, "step": 1761, "time_per_iteration": 2.9759645462036133 }, { "auxiliary_loss_clip": 0.01154979, "auxiliary_loss_mlp": 0.01034496, "balance_loss_clip": 1.05480206, "balance_loss_mlp": 1.02534103, "epoch": 0.2118679733060783, "flos": 20668315150080.0, "grad_norm": 1.6493272588694963, "language_loss": 0.92545718, "learning_rate": 3.6631014969143747e-06, "loss": 0.94735187, "num_input_tokens_seen": 37459410, "step": 1762, "time_per_iteration": 2.632343292236328 }, { "auxiliary_loss_clip": 0.01206052, "auxiliary_loss_mlp": 0.01043486, "balance_loss_clip": 1.06574392, "balance_loss_mlp": 1.03408027, "epoch": 0.21198821619671737, "flos": 23223431162880.0, "grad_norm": 1.7015162526646155, "language_loss": 0.89152634, "learning_rate": 3.662668691924693e-06, "loss": 0.91402173, "num_input_tokens_seen": 37480460, "step": 1763, "time_per_iteration": 2.518904685974121 }, { "auxiliary_loss_clip": 0.01170215, "auxiliary_loss_mlp": 0.01039916, "balance_loss_clip": 1.05585933, "balance_loss_mlp": 1.02950943, "epoch": 0.21210845908735645, "flos": 24498547044480.0, "grad_norm": 1.944626822653477, "language_loss": 0.72100049, "learning_rate": 3.6622356347094927e-06, "loss": 0.74310178, "num_input_tokens_seen": 37502025, "step": 1764, "time_per_iteration": 2.589101791381836 }, { "auxiliary_loss_clip": 0.01171676, "auxiliary_loss_mlp": 0.01037009, "balance_loss_clip": 1.05284286, "balance_loss_mlp": 1.02623296, "epoch": 0.21222870197799554, "flos": 27089789160960.0, "grad_norm": 1.945570693918676, "language_loss": 0.7870971, "learning_rate": 3.6618023253344684e-06, "loss": 0.80918401, "num_input_tokens_seen": 37520885, "step": 1765, "time_per_iteration": 2.5923502445220947 }, { "auxiliary_loss_clip": 0.01200865, "auxiliary_loss_mlp": 0.01041109, "balance_loss_clip": 1.0592438, "balance_loss_mlp": 1.030792, "epoch": 0.21234894486863465, "flos": 16873850223360.0, "grad_norm": 1.4454107411192525, "language_loss": 0.83385426, "learning_rate": 3.6613687638653527e-06, "loss": 0.85627401, "num_input_tokens_seen": 37539055, "step": 1766, "time_per_iteration": 2.4639527797698975 }, { "auxiliary_loss_clip": 0.01181866, "auxiliary_loss_mlp": 0.01035126, "balance_loss_clip": 1.05760121, "balance_loss_mlp": 1.02518988, "epoch": 0.21246918775927373, "flos": 23474949171840.0, "grad_norm": 1.8126336215905998, "language_loss": 0.7757827, "learning_rate": 3.660934950367916e-06, "loss": 0.79795271, "num_input_tokens_seen": 37558300, "step": 1767, "time_per_iteration": 2.5510809421539307 }, { "auxiliary_loss_clip": 0.01204323, "auxiliary_loss_mlp": 0.01036614, "balance_loss_clip": 1.06220198, "balance_loss_mlp": 1.02671957, "epoch": 0.21258943064991281, "flos": 22382295402240.0, "grad_norm": 1.699834246609623, "language_loss": 0.83455169, "learning_rate": 3.660500884907968e-06, "loss": 0.85696107, "num_input_tokens_seen": 37579040, "step": 1768, "time_per_iteration": 2.5239593982696533 }, { "auxiliary_loss_clip": 0.01069117, "auxiliary_loss_mlp": 0.01003329, "balance_loss_clip": 1.02535892, "balance_loss_mlp": 1.00082552, "epoch": 0.21270967354055192, "flos": 59440168679040.0, "grad_norm": 0.8216218626058457, "language_loss": 0.60000062, "learning_rate": 3.660066567551356e-06, "loss": 0.6207251, "num_input_tokens_seen": 37639185, "step": 1769, "time_per_iteration": 3.0549306869506836 }, { "auxiliary_loss_clip": 0.01200479, "auxiliary_loss_mlp": 0.00765161, "balance_loss_clip": 1.06048334, "balance_loss_mlp": 1.00098419, "epoch": 0.212829916431191, "flos": 21544032729600.0, "grad_norm": 2.228823779654595, "language_loss": 0.84556282, "learning_rate": 3.6596319983639657e-06, "loss": 0.86521918, "num_input_tokens_seen": 37657765, "step": 1770, "time_per_iteration": 2.515789747238159 }, { "auxiliary_loss_clip": 0.0117351, "auxiliary_loss_mlp": 0.00765912, "balance_loss_clip": 1.05885124, "balance_loss_mlp": 1.00096118, "epoch": 0.2129501593218301, "flos": 28987739896320.0, "grad_norm": 1.550211201985683, "language_loss": 0.86196232, "learning_rate": 3.6591971774117214e-06, "loss": 0.88135654, "num_input_tokens_seen": 37680740, "step": 1771, "time_per_iteration": 2.6453745365142822 }, { "auxiliary_loss_clip": 0.01207496, "auxiliary_loss_mlp": 0.01040615, "balance_loss_clip": 1.0632937, "balance_loss_mlp": 1.03048182, "epoch": 0.2130704022124692, "flos": 18806993308800.0, "grad_norm": 2.059605367534018, "language_loss": 0.80518997, "learning_rate": 3.6587621047605833e-06, "loss": 0.82767105, "num_input_tokens_seen": 37697910, "step": 1772, "time_per_iteration": 2.4984118938446045 }, { "auxiliary_loss_clip": 0.0120295, "auxiliary_loss_mlp": 0.01039117, "balance_loss_clip": 1.06244361, "balance_loss_mlp": 1.02975905, "epoch": 0.21319064510310828, "flos": 13918150759680.0, "grad_norm": 1.9476953152537653, "language_loss": 0.86659396, "learning_rate": 3.6583267804765542e-06, "loss": 0.8890146, "num_input_tokens_seen": 37712245, "step": 1773, "time_per_iteration": 2.5339155197143555 }, { "auxiliary_loss_clip": 0.01198622, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.05881619, "balance_loss_mlp": 1.02641821, "epoch": 0.21331088799374737, "flos": 20959694277120.0, "grad_norm": 1.685265008435033, "language_loss": 0.85512364, "learning_rate": 3.6578912046256702e-06, "loss": 0.87748063, "num_input_tokens_seen": 37730765, "step": 1774, "time_per_iteration": 2.5187294483184814 }, { "auxiliary_loss_clip": 0.0116821, "auxiliary_loss_mlp": 0.01033041, "balance_loss_clip": 1.05329263, "balance_loss_mlp": 1.02243781, "epoch": 0.21343113088438645, "flos": 18624638937600.0, "grad_norm": 1.95408503356112, "language_loss": 0.75892329, "learning_rate": 3.6574553772740083e-06, "loss": 0.78093576, "num_input_tokens_seen": 37748695, "step": 1775, "time_per_iteration": 2.541170597076416 }, { "auxiliary_loss_clip": 0.01094954, "auxiliary_loss_mlp": 0.01002763, "balance_loss_clip": 1.0269798, "balance_loss_mlp": 1.00051045, "epoch": 0.21355137377502556, "flos": 67413128791680.0, "grad_norm": 0.8493705330258402, "language_loss": 0.61886555, "learning_rate": 3.657019298487684e-06, "loss": 0.63984269, "num_input_tokens_seen": 37813705, "step": 1776, "time_per_iteration": 3.1203174591064453 }, { "auxiliary_loss_clip": 0.01210159, "auxiliary_loss_mlp": 0.0076584, "balance_loss_clip": 1.06136084, "balance_loss_mlp": 1.00101912, "epoch": 0.21367161666566464, "flos": 34532095697280.0, "grad_norm": 2.6303326203727138, "language_loss": 0.83458847, "learning_rate": 3.6565829683328495e-06, "loss": 0.85434848, "num_input_tokens_seen": 37836330, "step": 1777, "time_per_iteration": 2.6514554023742676 }, { "auxiliary_loss_clip": 0.01196245, "auxiliary_loss_mlp": 0.0103759, "balance_loss_clip": 1.05905044, "balance_loss_mlp": 1.02734363, "epoch": 0.21379185955630373, "flos": 18989347680000.0, "grad_norm": 1.6819733295980346, "language_loss": 0.85998476, "learning_rate": 3.6561463868756965e-06, "loss": 0.88232303, "num_input_tokens_seen": 37855030, "step": 1778, "time_per_iteration": 2.454549551010132 }, { "auxiliary_loss_clip": 0.01202991, "auxiliary_loss_mlp": 0.01036865, "balance_loss_clip": 1.0637095, "balance_loss_mlp": 1.0266782, "epoch": 0.21391210244694284, "flos": 28218497207040.0, "grad_norm": 1.7194331359769421, "language_loss": 0.78008235, "learning_rate": 3.655709554182452e-06, "loss": 0.80248088, "num_input_tokens_seen": 37875370, "step": 1779, "time_per_iteration": 3.3246679306030273 }, { "auxiliary_loss_clip": 0.01204383, "auxiliary_loss_mlp": 0.01033702, "balance_loss_clip": 1.05911446, "balance_loss_mlp": 1.02421927, "epoch": 0.21403234533758192, "flos": 17455064192640.0, "grad_norm": 2.5177189526327997, "language_loss": 0.8467648, "learning_rate": 3.6552724703193855e-06, "loss": 0.86914563, "num_input_tokens_seen": 37892560, "step": 1780, "time_per_iteration": 2.4566287994384766 }, { "auxiliary_loss_clip": 0.01063446, "auxiliary_loss_mlp": 0.01008573, "balance_loss_clip": 1.02203, "balance_loss_mlp": 1.00595033, "epoch": 0.214152588228221, "flos": 51637606686720.0, "grad_norm": 0.7924905630335375, "language_loss": 0.55932772, "learning_rate": 3.654835135352801e-06, "loss": 0.58004791, "num_input_tokens_seen": 37947370, "step": 1781, "time_per_iteration": 3.0200095176696777 }, { "auxiliary_loss_clip": 0.01154921, "auxiliary_loss_mlp": 0.01032253, "balance_loss_clip": 1.05036926, "balance_loss_mlp": 1.02206063, "epoch": 0.21427283111886009, "flos": 19496154625920.0, "grad_norm": 1.7512825860459846, "language_loss": 0.87445223, "learning_rate": 3.654397549349043e-06, "loss": 0.89632404, "num_input_tokens_seen": 37964745, "step": 1782, "time_per_iteration": 2.579237699508667 }, { "auxiliary_loss_clip": 0.01185015, "auxiliary_loss_mlp": 0.01034363, "balance_loss_clip": 1.06101263, "balance_loss_mlp": 1.0240159, "epoch": 0.2143930740094992, "flos": 20084802710400.0, "grad_norm": 2.2447810221344535, "language_loss": 0.75453168, "learning_rate": 3.653959712374491e-06, "loss": 0.77672553, "num_input_tokens_seen": 37982850, "step": 1783, "time_per_iteration": 2.521242141723633 }, { "auxiliary_loss_clip": 0.01166413, "auxiliary_loss_mlp": 0.01026669, "balance_loss_clip": 1.05910063, "balance_loss_mlp": 1.01706624, "epoch": 0.21451331690013828, "flos": 21798603394560.0, "grad_norm": 1.5610972534755763, "language_loss": 0.82655156, "learning_rate": 3.6535216244955663e-06, "loss": 0.84848237, "num_input_tokens_seen": 38002745, "step": 1784, "time_per_iteration": 3.4302234649658203 }, { "auxiliary_loss_clip": 0.01185738, "auxiliary_loss_mlp": 0.01037906, "balance_loss_clip": 1.05997849, "balance_loss_mlp": 1.02786827, "epoch": 0.21463355979077736, "flos": 32853882412800.0, "grad_norm": 1.6081607400772948, "language_loss": 0.70927966, "learning_rate": 3.653083285778726e-06, "loss": 0.73151606, "num_input_tokens_seen": 38024115, "step": 1785, "time_per_iteration": 3.5211477279663086 }, { "auxiliary_loss_clip": 0.01206595, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.06041694, "balance_loss_mlp": 1.02140558, "epoch": 0.21475380268141647, "flos": 21543817248000.0, "grad_norm": 1.9384667437333827, "language_loss": 0.811643, "learning_rate": 3.6526446962904653e-06, "loss": 0.83403093, "num_input_tokens_seen": 38042830, "step": 1786, "time_per_iteration": 3.3201074600219727 }, { "auxiliary_loss_clip": 0.01197391, "auxiliary_loss_mlp": 0.0104066, "balance_loss_clip": 1.06145024, "balance_loss_mlp": 1.03102171, "epoch": 0.21487404557205556, "flos": 32159082660480.0, "grad_norm": 1.513096369347035, "language_loss": 0.74209732, "learning_rate": 3.652205856097318e-06, "loss": 0.76447779, "num_input_tokens_seen": 38066015, "step": 1787, "time_per_iteration": 2.612100124359131 }, { "auxiliary_loss_clip": 0.01180835, "auxiliary_loss_mlp": 0.00765009, "balance_loss_clip": 1.05796349, "balance_loss_mlp": 1.00116944, "epoch": 0.21499428846269464, "flos": 12673091583360.0, "grad_norm": 1.8831345578204703, "language_loss": 0.79402864, "learning_rate": 3.651766765265856e-06, "loss": 0.81348705, "num_input_tokens_seen": 38083025, "step": 1788, "time_per_iteration": 2.5232691764831543 }, { "auxiliary_loss_clip": 0.01180869, "auxiliary_loss_mlp": 0.01026411, "balance_loss_clip": 1.05589414, "balance_loss_mlp": 1.01684487, "epoch": 0.21511453135333372, "flos": 23471573293440.0, "grad_norm": 2.041857349656563, "language_loss": 0.81488442, "learning_rate": 3.65132742386269e-06, "loss": 0.83695722, "num_input_tokens_seen": 38098245, "step": 1789, "time_per_iteration": 2.5078086853027344 }, { "auxiliary_loss_clip": 0.01214362, "auxiliary_loss_mlp": 0.0103262, "balance_loss_clip": 1.05942988, "balance_loss_mlp": 1.02215934, "epoch": 0.21523477424397283, "flos": 26943560893440.0, "grad_norm": 1.662498619104773, "language_loss": 0.84748101, "learning_rate": 3.6508878319544656e-06, "loss": 0.86995089, "num_input_tokens_seen": 38118460, "step": 1790, "time_per_iteration": 2.4993860721588135 }, { "auxiliary_loss_clip": 0.0117633, "auxiliary_loss_mlp": 0.01045007, "balance_loss_clip": 1.05865979, "balance_loss_mlp": 1.03489256, "epoch": 0.21535501713461191, "flos": 18916161719040.0, "grad_norm": 2.563312368127213, "language_loss": 0.81922615, "learning_rate": 3.65044798960787e-06, "loss": 0.84143949, "num_input_tokens_seen": 38136800, "step": 1791, "time_per_iteration": 2.5290913581848145 }, { "auxiliary_loss_clip": 0.01164793, "auxiliary_loss_mlp": 0.01031974, "balance_loss_clip": 1.05520153, "balance_loss_mlp": 1.02253819, "epoch": 0.215475260025251, "flos": 17895113712000.0, "grad_norm": 1.6737757317399367, "language_loss": 0.78327453, "learning_rate": 3.650007896889627e-06, "loss": 0.80524218, "num_input_tokens_seen": 38155380, "step": 1792, "time_per_iteration": 2.5355114936828613 }, { "auxiliary_loss_clip": 0.01214479, "auxiliary_loss_mlp": 0.01039045, "balance_loss_clip": 1.06281829, "balance_loss_mlp": 1.02914512, "epoch": 0.2155955029158901, "flos": 16654292340480.0, "grad_norm": 2.1883727892734024, "language_loss": 0.80680132, "learning_rate": 3.6495675538664974e-06, "loss": 0.82933658, "num_input_tokens_seen": 38174395, "step": 1793, "time_per_iteration": 2.457888126373291 }, { "auxiliary_loss_clip": 0.01185336, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.05595183, "balance_loss_mlp": 1.02495694, "epoch": 0.2157157458065292, "flos": 23621213352960.0, "grad_norm": 1.7575446136644943, "language_loss": 0.82871157, "learning_rate": 3.649126960605282e-06, "loss": 0.85091251, "num_input_tokens_seen": 38195380, "step": 1794, "time_per_iteration": 2.544405221939087 }, { "auxiliary_loss_clip": 0.01182188, "auxiliary_loss_mlp": 0.01033776, "balance_loss_clip": 1.05798399, "balance_loss_mlp": 1.02397752, "epoch": 0.21583598869716827, "flos": 22127078292480.0, "grad_norm": 2.3241727635455027, "language_loss": 0.83531022, "learning_rate": 3.6486861171728174e-06, "loss": 0.85746992, "num_input_tokens_seen": 38213775, "step": 1795, "time_per_iteration": 2.516519546508789 }, { "auxiliary_loss_clip": 0.01167489, "auxiliary_loss_mlp": 0.01029691, "balance_loss_clip": 1.05154955, "balance_loss_mlp": 1.01995194, "epoch": 0.21595623158780738, "flos": 23441229279360.0, "grad_norm": 1.6675853419057018, "language_loss": 0.7864151, "learning_rate": 3.6482450236359803e-06, "loss": 0.80838692, "num_input_tokens_seen": 38235630, "step": 1796, "time_per_iteration": 2.6114914417266846 }, { "auxiliary_loss_clip": 0.01198251, "auxiliary_loss_mlp": 0.01039043, "balance_loss_clip": 1.06090307, "balance_loss_mlp": 1.02982223, "epoch": 0.21607647447844647, "flos": 26906501036160.0, "grad_norm": 2.642848341706185, "language_loss": 0.77897894, "learning_rate": 3.647803680061683e-06, "loss": 0.80135179, "num_input_tokens_seen": 38256045, "step": 1797, "time_per_iteration": 2.5336523056030273 }, { "auxiliary_loss_clip": 0.01186294, "auxiliary_loss_mlp": 0.01036353, "balance_loss_clip": 1.05849743, "balance_loss_mlp": 1.02542746, "epoch": 0.21619671736908555, "flos": 14495378319360.0, "grad_norm": 2.4854600263610647, "language_loss": 0.74615562, "learning_rate": 3.6473620865168776e-06, "loss": 0.76838207, "num_input_tokens_seen": 38272915, "step": 1798, "time_per_iteration": 2.494514226913452 }, { "auxiliary_loss_clip": 0.01184594, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.0608511, "balance_loss_mlp": 1.02644014, "epoch": 0.21631696025972463, "flos": 17931096161280.0, "grad_norm": 1.9309158295459796, "language_loss": 0.81498837, "learning_rate": 3.646920243068554e-06, "loss": 0.83718991, "num_input_tokens_seen": 38290810, "step": 1799, "time_per_iteration": 2.5032858848571777 }, { "auxiliary_loss_clip": 0.01167867, "auxiliary_loss_mlp": 0.01034563, "balance_loss_clip": 1.05375743, "balance_loss_mlp": 1.02534819, "epoch": 0.21643720315036374, "flos": 24462385027200.0, "grad_norm": 1.5824022339318022, "language_loss": 0.74510413, "learning_rate": 3.6464781497837384e-06, "loss": 0.76712847, "num_input_tokens_seen": 38312785, "step": 1800, "time_per_iteration": 2.5370969772338867 }, { "auxiliary_loss_clip": 0.0118437, "auxiliary_loss_mlp": 0.01045752, "balance_loss_clip": 1.05401719, "balance_loss_mlp": 1.03638232, "epoch": 0.21655744604100283, "flos": 28474432588800.0, "grad_norm": 1.8600297324915298, "language_loss": 0.72813314, "learning_rate": 3.6460358067294965e-06, "loss": 0.75043434, "num_input_tokens_seen": 38334015, "step": 1801, "time_per_iteration": 2.576519727706909 }, { "auxiliary_loss_clip": 0.01216647, "auxiliary_loss_mlp": 0.01030826, "balance_loss_clip": 1.06006932, "balance_loss_mlp": 1.02083635, "epoch": 0.2166776889316419, "flos": 20152960767360.0, "grad_norm": 1.955194711043629, "language_loss": 0.78240514, "learning_rate": 3.645593213972932e-06, "loss": 0.80487984, "num_input_tokens_seen": 38352920, "step": 1802, "time_per_iteration": 2.4722437858581543 }, { "auxiliary_loss_clip": 0.01194934, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.05827296, "balance_loss_mlp": 1.02185571, "epoch": 0.21679793182228102, "flos": 15193482122880.0, "grad_norm": 2.605012971886413, "language_loss": 0.79298192, "learning_rate": 3.6451503715811852e-06, "loss": 0.81525314, "num_input_tokens_seen": 38371230, "step": 1803, "time_per_iteration": 2.4734320640563965 }, { "auxiliary_loss_clip": 0.01182008, "auxiliary_loss_mlp": 0.01033363, "balance_loss_clip": 1.05973446, "balance_loss_mlp": 1.0253222, "epoch": 0.2169181747129201, "flos": 17384464010880.0, "grad_norm": 1.9668271057411717, "language_loss": 0.80384636, "learning_rate": 3.6447072796214345e-06, "loss": 0.82600009, "num_input_tokens_seen": 38389795, "step": 1804, "time_per_iteration": 2.4981882572174072 }, { "auxiliary_loss_clip": 0.01065626, "auxiliary_loss_mlp": 0.01005924, "balance_loss_clip": 1.02624571, "balance_loss_mlp": 1.00354028, "epoch": 0.21703841760355919, "flos": 58760955429120.0, "grad_norm": 0.9554364486366573, "language_loss": 0.63303971, "learning_rate": 3.644263938160898e-06, "loss": 0.65375525, "num_input_tokens_seen": 38445760, "step": 1805, "time_per_iteration": 3.0463926792144775 }, { "auxiliary_loss_clip": 0.01166202, "auxiliary_loss_mlp": 0.01031654, "balance_loss_clip": 1.05590272, "balance_loss_mlp": 1.02158642, "epoch": 0.21715866049419827, "flos": 22418457419520.0, "grad_norm": 1.8374340320280655, "language_loss": 0.72122657, "learning_rate": 3.6438203472668293e-06, "loss": 0.74320513, "num_input_tokens_seen": 38465405, "step": 1806, "time_per_iteration": 3.380117177963257 }, { "auxiliary_loss_clip": 0.01186358, "auxiliary_loss_mlp": 0.01034154, "balance_loss_clip": 1.05735385, "balance_loss_mlp": 1.0243547, "epoch": 0.21727890338483738, "flos": 17237732952960.0, "grad_norm": 1.9070799122138213, "language_loss": 0.82056355, "learning_rate": 3.6433765070065206e-06, "loss": 0.84276867, "num_input_tokens_seen": 38483195, "step": 1807, "time_per_iteration": 2.499148368835449 }, { "auxiliary_loss_clip": 0.01213145, "auxiliary_loss_mlp": 0.01031619, "balance_loss_clip": 1.05952454, "balance_loss_mlp": 1.02148652, "epoch": 0.21739914627547646, "flos": 13434792416640.0, "grad_norm": 2.5394212463227146, "language_loss": 0.87596506, "learning_rate": 3.6429324174473025e-06, "loss": 0.8984127, "num_input_tokens_seen": 38496735, "step": 1808, "time_per_iteration": 2.4230000972747803 }, { "auxiliary_loss_clip": 0.01199368, "auxiliary_loss_mlp": 0.01035993, "balance_loss_clip": 1.05679536, "balance_loss_mlp": 1.02656329, "epoch": 0.21751938916611555, "flos": 20959514709120.0, "grad_norm": 1.9673286826152745, "language_loss": 0.84808159, "learning_rate": 3.6424880786565425e-06, "loss": 0.87043512, "num_input_tokens_seen": 38512880, "step": 1809, "time_per_iteration": 2.478933095932007 }, { "auxiliary_loss_clip": 0.01152118, "auxiliary_loss_mlp": 0.01037819, "balance_loss_clip": 1.05597627, "balance_loss_mlp": 1.0272212, "epoch": 0.21763963205675466, "flos": 27599936071680.0, "grad_norm": 2.2724983028377452, "language_loss": 0.79833269, "learning_rate": 3.6420434907016482e-06, "loss": 0.82023209, "num_input_tokens_seen": 38532570, "step": 1810, "time_per_iteration": 3.4623312950134277 }, { "auxiliary_loss_clip": 0.01200193, "auxiliary_loss_mlp": 0.01033967, "balance_loss_clip": 1.06220508, "balance_loss_mlp": 1.02466893, "epoch": 0.21775987494739374, "flos": 21430411032960.0, "grad_norm": 1.5040179017112, "language_loss": 0.8115114, "learning_rate": 3.6415986536500606e-06, "loss": 0.83385295, "num_input_tokens_seen": 38550900, "step": 1811, "time_per_iteration": 2.485686779022217 }, { "auxiliary_loss_clip": 0.01147597, "auxiliary_loss_mlp": 0.01038427, "balance_loss_clip": 1.05770361, "balance_loss_mlp": 1.02853298, "epoch": 0.21788011783803282, "flos": 18332972501760.0, "grad_norm": 6.0255268028168105, "language_loss": 0.80824983, "learning_rate": 3.641153567569263e-06, "loss": 0.83011007, "num_input_tokens_seen": 38569215, "step": 1812, "time_per_iteration": 3.4030940532684326 }, { "auxiliary_loss_clip": 0.01194622, "auxiliary_loss_mlp": 0.0102929, "balance_loss_clip": 1.05840492, "balance_loss_mlp": 1.01991379, "epoch": 0.2180003607286719, "flos": 30262748037120.0, "grad_norm": 2.333828781563822, "language_loss": 0.95938945, "learning_rate": 3.640708232526774e-06, "loss": 0.9816286, "num_input_tokens_seen": 38587870, "step": 1813, "time_per_iteration": 3.4120779037475586 }, { "auxiliary_loss_clip": 0.01132549, "auxiliary_loss_mlp": 0.0103344, "balance_loss_clip": 1.04485655, "balance_loss_mlp": 1.02314019, "epoch": 0.21812060361931102, "flos": 25480272637440.0, "grad_norm": 4.50554017108526, "language_loss": 0.78354979, "learning_rate": 3.6402626485901504e-06, "loss": 0.80520976, "num_input_tokens_seen": 38606965, "step": 1814, "time_per_iteration": 2.6621203422546387 }, { "auxiliary_loss_clip": 0.01194347, "auxiliary_loss_mlp": 0.0103815, "balance_loss_clip": 1.06085503, "balance_loss_mlp": 1.02885103, "epoch": 0.2182408465099501, "flos": 21908166854400.0, "grad_norm": 1.935069136220931, "language_loss": 0.77921939, "learning_rate": 3.639816815826988e-06, "loss": 0.80154437, "num_input_tokens_seen": 38626290, "step": 1815, "time_per_iteration": 2.5036466121673584 }, { "auxiliary_loss_clip": 0.01181054, "auxiliary_loss_mlp": 0.01028961, "balance_loss_clip": 1.0582943, "balance_loss_mlp": 1.01971006, "epoch": 0.21836108940058918, "flos": 23657339456640.0, "grad_norm": 1.7389176893223532, "language_loss": 0.77971947, "learning_rate": 3.6393707343049176e-06, "loss": 0.80181956, "num_input_tokens_seen": 38646620, "step": 1816, "time_per_iteration": 2.55485463142395 }, { "auxiliary_loss_clip": 0.01199444, "auxiliary_loss_mlp": 0.01029475, "balance_loss_clip": 1.0571363, "balance_loss_mlp": 1.02026582, "epoch": 0.2184813322912283, "flos": 24681009156480.0, "grad_norm": 2.750800225248675, "language_loss": 0.73751456, "learning_rate": 3.6389244040916104e-06, "loss": 0.75980377, "num_input_tokens_seen": 38665695, "step": 1817, "time_per_iteration": 2.543424606323242 }, { "auxiliary_loss_clip": 0.01175243, "auxiliary_loss_mlp": 0.00765681, "balance_loss_clip": 1.0555681, "balance_loss_mlp": 1.00089371, "epoch": 0.21860157518186737, "flos": 26574650259840.0, "grad_norm": 1.9858860385453883, "language_loss": 0.79217559, "learning_rate": 3.6384778252547747e-06, "loss": 0.81158483, "num_input_tokens_seen": 38681575, "step": 1818, "time_per_iteration": 2.546870231628418 }, { "auxiliary_loss_clip": 0.01179903, "auxiliary_loss_mlp": 0.00764651, "balance_loss_clip": 1.06010067, "balance_loss_mlp": 1.0009644, "epoch": 0.21872181807250646, "flos": 20886292834560.0, "grad_norm": 2.633498003300965, "language_loss": 0.78427494, "learning_rate": 3.638030997862155e-06, "loss": 0.80372047, "num_input_tokens_seen": 38700510, "step": 1819, "time_per_iteration": 2.5029172897338867 }, { "auxiliary_loss_clip": 0.01089882, "auxiliary_loss_mlp": 0.01008806, "balance_loss_clip": 1.02916503, "balance_loss_mlp": 1.00705338, "epoch": 0.21884206096314554, "flos": 61209452897280.0, "grad_norm": 0.7692780539040929, "language_loss": 0.59501928, "learning_rate": 3.6375839219815356e-06, "loss": 0.61600614, "num_input_tokens_seen": 38758310, "step": 1820, "time_per_iteration": 3.018749475479126 }, { "auxiliary_loss_clip": 0.0121308, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.06106544, "balance_loss_mlp": 1.02185249, "epoch": 0.21896230385378465, "flos": 23473835850240.0, "grad_norm": 1.9937184411250723, "language_loss": 0.82753086, "learning_rate": 3.6371365976807375e-06, "loss": 0.8499794, "num_input_tokens_seen": 38778705, "step": 1821, "time_per_iteration": 2.4649085998535156 }, { "auxiliary_loss_clip": 0.0114489, "auxiliary_loss_mlp": 0.01029548, "balance_loss_clip": 1.05391288, "balance_loss_mlp": 1.02007675, "epoch": 0.21908254674442373, "flos": 25081915829760.0, "grad_norm": 1.700915811783782, "language_loss": 0.84071362, "learning_rate": 3.6366890250276185e-06, "loss": 0.86245799, "num_input_tokens_seen": 38799660, "step": 1822, "time_per_iteration": 2.6104955673217773 }, { "auxiliary_loss_clip": 0.01212311, "auxiliary_loss_mlp": 0.01030653, "balance_loss_clip": 1.06049335, "balance_loss_mlp": 1.02105069, "epoch": 0.21920278963506282, "flos": 23513768795520.0, "grad_norm": 1.8820216773477807, "language_loss": 0.89986455, "learning_rate": 3.6362412040900764e-06, "loss": 0.9222942, "num_input_tokens_seen": 38819450, "step": 1823, "time_per_iteration": 2.475553274154663 }, { "auxiliary_loss_clip": 0.01199665, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.05791688, "balance_loss_mlp": 1.02274978, "epoch": 0.21932303252570193, "flos": 29242238734080.0, "grad_norm": 2.0366168150927515, "language_loss": 0.80926651, "learning_rate": 3.635793134936044e-06, "loss": 0.83158791, "num_input_tokens_seen": 38840460, "step": 1824, "time_per_iteration": 2.543588161468506 }, { "auxiliary_loss_clip": 0.01196323, "auxiliary_loss_mlp": 0.01036387, "balance_loss_clip": 1.05972886, "balance_loss_mlp": 1.02686214, "epoch": 0.219443275416341, "flos": 20806857907200.0, "grad_norm": 2.055728139162839, "language_loss": 0.73196483, "learning_rate": 3.635344817633494e-06, "loss": 0.75429189, "num_input_tokens_seen": 38859775, "step": 1825, "time_per_iteration": 2.595139265060425 }, { "auxiliary_loss_clip": 0.01192514, "auxiliary_loss_mlp": 0.01032715, "balance_loss_clip": 1.05707002, "balance_loss_mlp": 1.02314222, "epoch": 0.2195635183069801, "flos": 14501555458560.0, "grad_norm": 2.067874691579874, "language_loss": 0.75541836, "learning_rate": 3.634896252250436e-06, "loss": 0.77767074, "num_input_tokens_seen": 38876540, "step": 1826, "time_per_iteration": 2.4944369792938232 }, { "auxiliary_loss_clip": 0.01217211, "auxiliary_loss_mlp": 0.01042603, "balance_loss_clip": 1.06392622, "balance_loss_mlp": 1.03305995, "epoch": 0.2196837611976192, "flos": 24243473589120.0, "grad_norm": 1.9677866078938475, "language_loss": 0.82004106, "learning_rate": 3.6344474388549157e-06, "loss": 0.84263921, "num_input_tokens_seen": 38896195, "step": 1827, "time_per_iteration": 2.4724934101104736 }, { "auxiliary_loss_clip": 0.01203914, "auxiliary_loss_mlp": 0.01040499, "balance_loss_clip": 1.06372714, "balance_loss_mlp": 1.0298661, "epoch": 0.2198040040882583, "flos": 18074523168000.0, "grad_norm": 2.095655207348312, "language_loss": 0.80106449, "learning_rate": 3.6339983775150183e-06, "loss": 0.82350862, "num_input_tokens_seen": 38912755, "step": 1828, "time_per_iteration": 2.479191780090332 }, { "auxiliary_loss_clip": 0.01195874, "auxiliary_loss_mlp": 0.01030235, "balance_loss_clip": 1.06006789, "balance_loss_mlp": 1.02009618, "epoch": 0.21992424697889737, "flos": 17784185535360.0, "grad_norm": 2.5605731210981166, "language_loss": 0.83987868, "learning_rate": 3.6335490682988664e-06, "loss": 0.86213982, "num_input_tokens_seen": 38928365, "step": 1829, "time_per_iteration": 2.4454495906829834 }, { "auxiliary_loss_clip": 0.01130761, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.04988146, "balance_loss_mlp": 1.02451229, "epoch": 0.22004448986953645, "flos": 17638495971840.0, "grad_norm": 2.016747118862944, "language_loss": 0.82983798, "learning_rate": 3.63309951127462e-06, "loss": 0.85148519, "num_input_tokens_seen": 38945275, "step": 1830, "time_per_iteration": 2.604417324066162 }, { "auxiliary_loss_clip": 0.0116918, "auxiliary_loss_mlp": 0.01041439, "balance_loss_clip": 1.05855632, "balance_loss_mlp": 1.03133583, "epoch": 0.22016473276017556, "flos": 22275533203200.0, "grad_norm": 2.07099195142222, "language_loss": 0.75454307, "learning_rate": 3.6326497065104757e-06, "loss": 0.7766493, "num_input_tokens_seen": 38965740, "step": 1831, "time_per_iteration": 2.576331377029419 }, { "auxiliary_loss_clip": 0.01203776, "auxiliary_loss_mlp": 0.01032998, "balance_loss_clip": 1.06062341, "balance_loss_mlp": 1.0234791, "epoch": 0.22028497565081465, "flos": 25556259859200.0, "grad_norm": 1.962192022577612, "language_loss": 0.77963573, "learning_rate": 3.6321996540746697e-06, "loss": 0.8020035, "num_input_tokens_seen": 38984815, "step": 1832, "time_per_iteration": 3.343738555908203 }, { "auxiliary_loss_clip": 0.01168782, "auxiliary_loss_mlp": 0.01033061, "balance_loss_clip": 1.05751157, "balance_loss_mlp": 1.02288043, "epoch": 0.22040521854145373, "flos": 36247332925440.0, "grad_norm": 1.7843214336478292, "language_loss": 0.80798113, "learning_rate": 3.6317493540354733e-06, "loss": 0.82999957, "num_input_tokens_seen": 39008230, "step": 1833, "time_per_iteration": 2.691223621368408 }, { "auxiliary_loss_clip": 0.01196108, "auxiliary_loss_mlp": 0.01038222, "balance_loss_clip": 1.05848503, "balance_loss_mlp": 1.02853072, "epoch": 0.22052546143209284, "flos": 11838420270720.0, "grad_norm": 2.1075454989375526, "language_loss": 0.76603472, "learning_rate": 3.6312988064611976e-06, "loss": 0.788378, "num_input_tokens_seen": 39026540, "step": 1834, "time_per_iteration": 2.503697395324707 }, { "auxiliary_loss_clip": 0.01170335, "auxiliary_loss_mlp": 0.01037744, "balance_loss_clip": 1.05327177, "balance_loss_mlp": 1.02775407, "epoch": 0.22064570432273192, "flos": 24209250906240.0, "grad_norm": 1.7980106022524054, "language_loss": 0.81579375, "learning_rate": 3.6308480114201896e-06, "loss": 0.83787453, "num_input_tokens_seen": 39048460, "step": 1835, "time_per_iteration": 2.586886167526245 }, { "auxiliary_loss_clip": 0.01218161, "auxiliary_loss_mlp": 0.01039778, "balance_loss_clip": 1.06628919, "balance_loss_mlp": 1.02968121, "epoch": 0.220765947213371, "flos": 17931347556480.0, "grad_norm": 1.9866853062646816, "language_loss": 0.76503837, "learning_rate": 3.630396968980835e-06, "loss": 0.78761774, "num_input_tokens_seen": 39066335, "step": 1836, "time_per_iteration": 2.4501953125 }, { "auxiliary_loss_clip": 0.0118653, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.05829692, "balance_loss_mlp": 1.02668285, "epoch": 0.2208861901040101, "flos": 26757040544640.0, "grad_norm": 2.4257323987347736, "language_loss": 0.83769405, "learning_rate": 3.6299456792115575e-06, "loss": 0.85992903, "num_input_tokens_seen": 39087590, "step": 1837, "time_per_iteration": 3.375005006790161 }, { "auxiliary_loss_clip": 0.01110082, "auxiliary_loss_mlp": 0.01031275, "balance_loss_clip": 1.0461539, "balance_loss_mlp": 1.02144611, "epoch": 0.2210064329946492, "flos": 17817977255040.0, "grad_norm": 1.867138953688265, "language_loss": 0.81080282, "learning_rate": 3.629494142180815e-06, "loss": 0.83221638, "num_input_tokens_seen": 39106335, "step": 1838, "time_per_iteration": 3.5625452995300293 }, { "auxiliary_loss_clip": 0.01214356, "auxiliary_loss_mlp": 0.01033185, "balance_loss_clip": 1.06248856, "balance_loss_mlp": 1.02335024, "epoch": 0.22112667588528828, "flos": 17967401832960.0, "grad_norm": 2.4137200429991235, "language_loss": 0.8487736, "learning_rate": 3.6290423579571075e-06, "loss": 0.87124902, "num_input_tokens_seen": 39122875, "step": 1839, "time_per_iteration": 2.429676055908203 }, { "auxiliary_loss_clip": 0.0119767, "auxiliary_loss_mlp": 0.01033447, "balance_loss_clip": 1.06198418, "balance_loss_mlp": 1.02320123, "epoch": 0.22124691877592736, "flos": 18369206346240.0, "grad_norm": 1.5888362504762383, "language_loss": 0.80414015, "learning_rate": 3.6285903266089694e-06, "loss": 0.8264513, "num_input_tokens_seen": 39142150, "step": 1840, "time_per_iteration": 3.2666099071502686 }, { "auxiliary_loss_clip": 0.01188929, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.06119275, "balance_loss_mlp": 1.02119398, "epoch": 0.22136716166656648, "flos": 20813286441600.0, "grad_norm": 2.277933352628288, "language_loss": 0.77550155, "learning_rate": 3.628138048204974e-06, "loss": 0.79770124, "num_input_tokens_seen": 39162835, "step": 1841, "time_per_iteration": 2.5117993354797363 }, { "auxiliary_loss_clip": 0.01147157, "auxiliary_loss_mlp": 0.01035456, "balance_loss_clip": 1.05471373, "balance_loss_mlp": 1.02466178, "epoch": 0.22148740455720556, "flos": 17675699483520.0, "grad_norm": 1.7526688298215332, "language_loss": 0.75941789, "learning_rate": 3.6276855228137304e-06, "loss": 0.78124404, "num_input_tokens_seen": 39181040, "step": 1842, "time_per_iteration": 2.5626485347747803 }, { "auxiliary_loss_clip": 0.01215864, "auxiliary_loss_mlp": 0.00765363, "balance_loss_clip": 1.06377769, "balance_loss_mlp": 1.0009048, "epoch": 0.22160764744784464, "flos": 21726710323200.0, "grad_norm": 2.1723645940308165, "language_loss": 0.81869686, "learning_rate": 3.6272327505038874e-06, "loss": 0.83850914, "num_input_tokens_seen": 39197505, "step": 1843, "time_per_iteration": 2.474837303161621 }, { "auxiliary_loss_clip": 0.01158895, "auxiliary_loss_mlp": 0.01036233, "balance_loss_clip": 1.0536046, "balance_loss_mlp": 1.02705932, "epoch": 0.22172789033848372, "flos": 23764712186880.0, "grad_norm": 2.0657982927518783, "language_loss": 0.78608894, "learning_rate": 3.626779731344131e-06, "loss": 0.80804026, "num_input_tokens_seen": 39217295, "step": 1844, "time_per_iteration": 2.614419460296631 }, { "auxiliary_loss_clip": 0.01209412, "auxiliary_loss_mlp": 0.01039411, "balance_loss_clip": 1.06075335, "balance_loss_mlp": 1.03011298, "epoch": 0.22184813322912283, "flos": 16982300361600.0, "grad_norm": 1.8979379168343127, "language_loss": 0.85162294, "learning_rate": 3.6263264654031814e-06, "loss": 0.87411118, "num_input_tokens_seen": 39234195, "step": 1845, "time_per_iteration": 2.429943323135376 }, { "auxiliary_loss_clip": 0.0107815, "auxiliary_loss_mlp": 0.01002226, "balance_loss_clip": 1.02922654, "balance_loss_mlp": 1.00043821, "epoch": 0.22196837611976192, "flos": 61823740314240.0, "grad_norm": 0.7080897769965405, "language_loss": 0.59222591, "learning_rate": 3.6258729527498008e-06, "loss": 0.61302972, "num_input_tokens_seen": 39295040, "step": 1846, "time_per_iteration": 3.091163396835327 }, { "auxiliary_loss_clip": 0.01191191, "auxiliary_loss_mlp": 0.01031038, "balance_loss_clip": 1.06260633, "balance_loss_mlp": 1.02148342, "epoch": 0.222088619010401, "flos": 25558019625600.0, "grad_norm": 2.234913187081375, "language_loss": 0.64814746, "learning_rate": 3.6254191934527854e-06, "loss": 0.6703698, "num_input_tokens_seen": 39314395, "step": 1847, "time_per_iteration": 2.6546897888183594 }, { "auxiliary_loss_clip": 0.0116668, "auxiliary_loss_mlp": 0.0103203, "balance_loss_clip": 1.05999601, "balance_loss_mlp": 1.02168822, "epoch": 0.2222088619010401, "flos": 19318612677120.0, "grad_norm": 1.9880423692090115, "language_loss": 0.65021938, "learning_rate": 3.6249651875809715e-06, "loss": 0.67220646, "num_input_tokens_seen": 39334275, "step": 1848, "time_per_iteration": 2.595595598220825 }, { "auxiliary_loss_clip": 0.01179336, "auxiliary_loss_mlp": 0.01031689, "balance_loss_clip": 1.06053615, "balance_loss_mlp": 1.02221775, "epoch": 0.2223291047916792, "flos": 19099342103040.0, "grad_norm": 1.8900897185309564, "language_loss": 0.89523858, "learning_rate": 3.62451093520323e-06, "loss": 0.91734886, "num_input_tokens_seen": 39352180, "step": 1849, "time_per_iteration": 2.5012753009796143 }, { "auxiliary_loss_clip": 0.01151103, "auxiliary_loss_mlp": 0.01043861, "balance_loss_clip": 1.0532589, "balance_loss_mlp": 1.0345273, "epoch": 0.22244934768231828, "flos": 20850418126080.0, "grad_norm": 2.192864299218851, "language_loss": 0.90520245, "learning_rate": 3.6240564363884714e-06, "loss": 0.9271521, "num_input_tokens_seen": 39372125, "step": 1850, "time_per_iteration": 2.60660982131958 }, { "auxiliary_loss_clip": 0.01201212, "auxiliary_loss_mlp": 0.01038678, "balance_loss_clip": 1.05873311, "balance_loss_mlp": 1.02897382, "epoch": 0.2225695905729574, "flos": 15632921111040.0, "grad_norm": 2.1501431983204053, "language_loss": 0.70448303, "learning_rate": 3.623601691205643e-06, "loss": 0.72688192, "num_input_tokens_seen": 39391200, "step": 1851, "time_per_iteration": 2.502091646194458 }, { "auxiliary_loss_clip": 0.01196985, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 1.05827117, "balance_loss_mlp": 1.02013314, "epoch": 0.22268983346359647, "flos": 25373582265600.0, "grad_norm": 2.0433813144031587, "language_loss": 0.81309092, "learning_rate": 3.623146699723729e-06, "loss": 0.83535391, "num_input_tokens_seen": 39410660, "step": 1852, "time_per_iteration": 2.553537368774414 }, { "auxiliary_loss_clip": 0.01186827, "auxiliary_loss_mlp": 0.01037897, "balance_loss_clip": 1.06475699, "balance_loss_mlp": 1.02832437, "epoch": 0.22281007635423555, "flos": 13261452359040.0, "grad_norm": 1.7434932059917718, "language_loss": 0.77765834, "learning_rate": 3.6226914620117507e-06, "loss": 0.7999056, "num_input_tokens_seen": 39429280, "step": 1853, "time_per_iteration": 2.5166845321655273 }, { "auxiliary_loss_clip": 0.01171565, "auxiliary_loss_mlp": 0.01031864, "balance_loss_clip": 1.05404162, "balance_loss_mlp": 1.02259493, "epoch": 0.22293031924487464, "flos": 15340536403200.0, "grad_norm": 2.448947588991763, "language_loss": 0.81192482, "learning_rate": 3.622235978138768e-06, "loss": 0.8339591, "num_input_tokens_seen": 39446905, "step": 1854, "time_per_iteration": 2.531911849975586 }, { "auxiliary_loss_clip": 0.01200147, "auxiliary_loss_mlp": 0.01030829, "balance_loss_clip": 1.06383705, "balance_loss_mlp": 1.02135217, "epoch": 0.22305056213551375, "flos": 22564649773440.0, "grad_norm": 1.8666837020135476, "language_loss": 0.81472456, "learning_rate": 3.621780248173877e-06, "loss": 0.83703429, "num_input_tokens_seen": 39465105, "step": 1855, "time_per_iteration": 2.485137701034546 }, { "auxiliary_loss_clip": 0.01105582, "auxiliary_loss_mlp": 0.01004052, "balance_loss_clip": 1.028512, "balance_loss_mlp": 1.00253248, "epoch": 0.22317080502615283, "flos": 64880419887360.0, "grad_norm": 0.8282101929430018, "language_loss": 0.61029661, "learning_rate": 3.6213242721862125e-06, "loss": 0.63139296, "num_input_tokens_seen": 39523560, "step": 1856, "time_per_iteration": 3.068829298019409 }, { "auxiliary_loss_clip": 0.01173778, "auxiliary_loss_mlp": 0.01035931, "balance_loss_clip": 1.0582335, "balance_loss_mlp": 1.02625108, "epoch": 0.2232910479167919, "flos": 25775997310080.0, "grad_norm": 1.588392971160899, "language_loss": 0.75256741, "learning_rate": 3.620868050244945e-06, "loss": 0.77466452, "num_input_tokens_seen": 39544040, "step": 1857, "time_per_iteration": 2.5508782863616943 }, { "auxiliary_loss_clip": 0.01178659, "auxiliary_loss_mlp": 0.01030533, "balance_loss_clip": 1.05777502, "balance_loss_mlp": 1.02074552, "epoch": 0.22341129080743102, "flos": 23251799928960.0, "grad_norm": 1.7773498125523781, "language_loss": 0.7767967, "learning_rate": 3.6204115824192817e-06, "loss": 0.79888856, "num_input_tokens_seen": 39561515, "step": 1858, "time_per_iteration": 2.5358359813690186 }, { "auxiliary_loss_clip": 0.01173929, "auxiliary_loss_mlp": 0.01032516, "balance_loss_clip": 1.05437028, "balance_loss_mlp": 1.02241302, "epoch": 0.2235315336980701, "flos": 21214552250880.0, "grad_norm": 2.57898043621557, "language_loss": 0.76530206, "learning_rate": 3.619954868778471e-06, "loss": 0.78736651, "num_input_tokens_seen": 39578210, "step": 1859, "time_per_iteration": 3.29205584526062 }, { "auxiliary_loss_clip": 0.01183414, "auxiliary_loss_mlp": 0.01034268, "balance_loss_clip": 1.05726385, "balance_loss_mlp": 1.02543473, "epoch": 0.2236517765887092, "flos": 19901945548800.0, "grad_norm": 1.922907339410487, "language_loss": 0.82973999, "learning_rate": 3.6194979093917944e-06, "loss": 0.85191673, "num_input_tokens_seen": 39597625, "step": 1860, "time_per_iteration": 2.516542673110962 }, { "auxiliary_loss_clip": 0.01179317, "auxiliary_loss_mlp": 0.01037151, "balance_loss_clip": 1.05759752, "balance_loss_mlp": 1.02783465, "epoch": 0.22377201947934827, "flos": 23214847812480.0, "grad_norm": 1.8880782064336612, "language_loss": 0.87007153, "learning_rate": 3.6190407043285724e-06, "loss": 0.89223617, "num_input_tokens_seen": 39615360, "step": 1861, "time_per_iteration": 2.525892734527588 }, { "auxiliary_loss_clip": 0.01216472, "auxiliary_loss_mlp": 0.01034519, "balance_loss_clip": 1.06313014, "balance_loss_mlp": 1.02469587, "epoch": 0.22389226236998738, "flos": 26794244056320.0, "grad_norm": 1.8270058990913984, "language_loss": 0.75750405, "learning_rate": 3.618583253658163e-06, "loss": 0.78001392, "num_input_tokens_seen": 39635460, "step": 1862, "time_per_iteration": 2.4903433322906494 }, { "auxiliary_loss_clip": 0.0115348, "auxiliary_loss_mlp": 0.0076543, "balance_loss_clip": 1.05478442, "balance_loss_mlp": 1.00098884, "epoch": 0.22401250526062647, "flos": 24170359455360.0, "grad_norm": 2.0531138651682936, "language_loss": 0.86383408, "learning_rate": 3.618125557449961e-06, "loss": 0.88302314, "num_input_tokens_seen": 39653515, "step": 1863, "time_per_iteration": 2.628727436065674 }, { "auxiliary_loss_clip": 0.01195902, "auxiliary_loss_mlp": 0.01032405, "balance_loss_clip": 1.06027055, "balance_loss_mlp": 1.02274323, "epoch": 0.22413274815126555, "flos": 16759761649920.0, "grad_norm": 1.9826821231827214, "language_loss": 0.83092076, "learning_rate": 3.6176676157733983e-06, "loss": 0.85320383, "num_input_tokens_seen": 39668525, "step": 1864, "time_per_iteration": 3.395414113998413 }, { "auxiliary_loss_clip": 0.01163012, "auxiliary_loss_mlp": 0.0103444, "balance_loss_clip": 1.0552299, "balance_loss_mlp": 1.02475476, "epoch": 0.22425299104190466, "flos": 21360205900800.0, "grad_norm": 1.9861608042584011, "language_loss": 0.75771934, "learning_rate": 3.6172094286979443e-06, "loss": 0.77969384, "num_input_tokens_seen": 39685895, "step": 1865, "time_per_iteration": 2.538931131362915 }, { "auxiliary_loss_clip": 0.01180527, "auxiliary_loss_mlp": 0.01033329, "balance_loss_clip": 1.05447745, "balance_loss_mlp": 1.02385747, "epoch": 0.22437323393254374, "flos": 32165547108480.0, "grad_norm": 1.4542573198902833, "language_loss": 0.81347823, "learning_rate": 3.6167509962931064e-06, "loss": 0.83561677, "num_input_tokens_seen": 39711595, "step": 1866, "time_per_iteration": 3.361229181289673 }, { "auxiliary_loss_clip": 0.011636, "auxiliary_loss_mlp": 0.01033082, "balance_loss_clip": 1.05907941, "balance_loss_mlp": 1.02331305, "epoch": 0.22449347682318282, "flos": 18002809664640.0, "grad_norm": 2.425195390624701, "language_loss": 0.77168375, "learning_rate": 3.6162923186284276e-06, "loss": 0.79365057, "num_input_tokens_seen": 39727555, "step": 1867, "time_per_iteration": 2.542787790298462 }, { "auxiliary_loss_clip": 0.01184134, "auxiliary_loss_mlp": 0.01038992, "balance_loss_clip": 1.05755353, "balance_loss_mlp": 1.02940118, "epoch": 0.2246137197138219, "flos": 18697286194560.0, "grad_norm": 1.871388549069376, "language_loss": 0.85816681, "learning_rate": 3.6158333957734888e-06, "loss": 0.88039809, "num_input_tokens_seen": 39746145, "step": 1868, "time_per_iteration": 2.494091272354126 }, { "auxiliary_loss_clip": 0.01169593, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 1.0536294, "balance_loss_mlp": 1.02390838, "epoch": 0.22473396260446102, "flos": 15590653781760.0, "grad_norm": 2.5066638802733663, "language_loss": 0.82652211, "learning_rate": 3.6153742277979088e-06, "loss": 0.84855193, "num_input_tokens_seen": 39763575, "step": 1869, "time_per_iteration": 2.527780055999756 }, { "auxiliary_loss_clip": 0.01186785, "auxiliary_loss_mlp": 0.01038545, "balance_loss_clip": 1.05839658, "balance_loss_mlp": 1.02910352, "epoch": 0.2248542054951001, "flos": 14465501182080.0, "grad_norm": 1.8966546830711624, "language_loss": 0.78301626, "learning_rate": 3.6149148147713434e-06, "loss": 0.8052696, "num_input_tokens_seen": 39781810, "step": 1870, "time_per_iteration": 2.492715358734131 }, { "auxiliary_loss_clip": 0.01205259, "auxiliary_loss_mlp": 0.01036571, "balance_loss_clip": 1.06516886, "balance_loss_mlp": 1.02749372, "epoch": 0.22497444838573918, "flos": 19243882431360.0, "grad_norm": 1.948123388163964, "language_loss": 0.86649078, "learning_rate": 3.614455156763484e-06, "loss": 0.8889091, "num_input_tokens_seen": 39800115, "step": 1871, "time_per_iteration": 2.4802510738372803 }, { "auxiliary_loss_clip": 0.01147589, "auxiliary_loss_mlp": 0.01034369, "balance_loss_clip": 1.0484823, "balance_loss_mlp": 1.02514207, "epoch": 0.2250946912763783, "flos": 16910299549440.0, "grad_norm": 1.9783770824194185, "language_loss": 0.71753198, "learning_rate": 3.613995253844061e-06, "loss": 0.73935157, "num_input_tokens_seen": 39817795, "step": 1872, "time_per_iteration": 2.5490310192108154 }, { "auxiliary_loss_clip": 0.011987, "auxiliary_loss_mlp": 0.01037336, "balance_loss_clip": 1.06279373, "balance_loss_mlp": 1.02771521, "epoch": 0.22521493416701738, "flos": 24681368292480.0, "grad_norm": 1.8430744053092842, "language_loss": 0.81225878, "learning_rate": 3.6135351060828414e-06, "loss": 0.83461916, "num_input_tokens_seen": 39838270, "step": 1873, "time_per_iteration": 2.5368502140045166 }, { "auxiliary_loss_clip": 0.01220939, "auxiliary_loss_mlp": 0.01041575, "balance_loss_clip": 1.06565356, "balance_loss_mlp": 1.0313648, "epoch": 0.22533517705765646, "flos": 17821963664640.0, "grad_norm": 2.4485399220804536, "language_loss": 0.69037896, "learning_rate": 3.6130747135496285e-06, "loss": 0.71300411, "num_input_tokens_seen": 39857270, "step": 1874, "time_per_iteration": 2.5364906787872314 }, { "auxiliary_loss_clip": 0.0121332, "auxiliary_loss_mlp": 0.01033476, "balance_loss_clip": 1.06186152, "balance_loss_mlp": 1.02355158, "epoch": 0.22545541994829554, "flos": 33691390899840.0, "grad_norm": 1.912430356470123, "language_loss": 0.65819371, "learning_rate": 3.6126140763142646e-06, "loss": 0.68066162, "num_input_tokens_seen": 39882300, "step": 1875, "time_per_iteration": 2.5601909160614014 }, { "auxiliary_loss_clip": 0.01215015, "auxiliary_loss_mlp": 0.01039081, "balance_loss_clip": 1.06363487, "balance_loss_mlp": 1.02875769, "epoch": 0.22557566283893465, "flos": 19171594310400.0, "grad_norm": 2.4741014409101187, "language_loss": 0.85806561, "learning_rate": 3.6121531944466275e-06, "loss": 0.88060653, "num_input_tokens_seen": 39899625, "step": 1876, "time_per_iteration": 2.439235210418701 }, { "auxiliary_loss_clip": 0.01195966, "auxiliary_loss_mlp": 0.01037557, "balance_loss_clip": 1.06039047, "balance_loss_mlp": 1.02843761, "epoch": 0.22569590572957374, "flos": 20773281669120.0, "grad_norm": 2.0766529801959623, "language_loss": 0.78550196, "learning_rate": 3.611692068016633e-06, "loss": 0.80783719, "num_input_tokens_seen": 39915955, "step": 1877, "time_per_iteration": 2.5173697471618652 }, { "auxiliary_loss_clip": 0.01160749, "auxiliary_loss_mlp": 0.01040672, "balance_loss_clip": 1.05057383, "balance_loss_mlp": 1.03011644, "epoch": 0.22581614862021282, "flos": 18442715529600.0, "grad_norm": 2.0774131567164624, "language_loss": 0.74898779, "learning_rate": 3.611230697094233e-06, "loss": 0.77100205, "num_input_tokens_seen": 39932655, "step": 1878, "time_per_iteration": 2.56384015083313 }, { "auxiliary_loss_clip": 0.01185986, "auxiliary_loss_mlp": 0.01037462, "balance_loss_clip": 1.05694747, "balance_loss_mlp": 1.02830684, "epoch": 0.22593639151085193, "flos": 20048389297920.0, "grad_norm": 1.7982894101729847, "language_loss": 0.86887002, "learning_rate": 3.6107690817494173e-06, "loss": 0.89110446, "num_input_tokens_seen": 39952875, "step": 1879, "time_per_iteration": 2.598787307739258 }, { "auxiliary_loss_clip": 0.01148995, "auxiliary_loss_mlp": 0.01032976, "balance_loss_clip": 1.05222559, "balance_loss_mlp": 1.02327812, "epoch": 0.226056634401491, "flos": 13115116350720.0, "grad_norm": 2.589123386647954, "language_loss": 0.70996881, "learning_rate": 3.6103072220522117e-06, "loss": 0.73178852, "num_input_tokens_seen": 39968405, "step": 1880, "time_per_iteration": 2.5793960094451904 }, { "auxiliary_loss_clip": 0.01172021, "auxiliary_loss_mlp": 0.010378, "balance_loss_clip": 1.05532455, "balance_loss_mlp": 1.02825689, "epoch": 0.2261768772921301, "flos": 18988378012800.0, "grad_norm": 1.8618163450802068, "language_loss": 0.91884744, "learning_rate": 3.609845118072682e-06, "loss": 0.94094574, "num_input_tokens_seen": 39987075, "step": 1881, "time_per_iteration": 2.562549352645874 }, { "auxiliary_loss_clip": 0.01204044, "auxiliary_loss_mlp": 0.00765513, "balance_loss_clip": 1.06053567, "balance_loss_mlp": 1.00106442, "epoch": 0.2262971201827692, "flos": 19974054101760.0, "grad_norm": 1.8250034757698619, "language_loss": 0.79947442, "learning_rate": 3.6093827698809276e-06, "loss": 0.81917, "num_input_tokens_seen": 40006175, "step": 1882, "time_per_iteration": 2.489743947982788 }, { "auxiliary_loss_clip": 0.0119492, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.0564754, "balance_loss_mlp": 1.02767777, "epoch": 0.2264173630734083, "flos": 16654543735680.0, "grad_norm": 2.136312271798255, "language_loss": 0.84983349, "learning_rate": 3.6089201775470864e-06, "loss": 0.87215436, "num_input_tokens_seen": 40021630, "step": 1883, "time_per_iteration": 2.4470057487487793 }, { "auxiliary_loss_clip": 0.01158182, "auxiliary_loss_mlp": 0.01036651, "balance_loss_clip": 1.05583787, "balance_loss_mlp": 1.02687633, "epoch": 0.22653760596404737, "flos": 24389809597440.0, "grad_norm": 1.3768009123846234, "language_loss": 0.77395582, "learning_rate": 3.6084573411413334e-06, "loss": 0.79590416, "num_input_tokens_seen": 40041025, "step": 1884, "time_per_iteration": 2.5785439014434814 }, { "auxiliary_loss_clip": 0.01169279, "auxiliary_loss_mlp": 0.01040842, "balance_loss_clip": 1.05641735, "balance_loss_mlp": 1.03001213, "epoch": 0.22665784885468646, "flos": 18332541538560.0, "grad_norm": 1.9760189842238627, "language_loss": 0.80994856, "learning_rate": 3.607994260733881e-06, "loss": 0.83204985, "num_input_tokens_seen": 40060265, "step": 1885, "time_per_iteration": 3.3323817253112793 }, { "auxiliary_loss_clip": 0.01187462, "auxiliary_loss_mlp": 0.01031074, "balance_loss_clip": 1.05670381, "balance_loss_mlp": 1.02208531, "epoch": 0.22677809174532557, "flos": 24058102475520.0, "grad_norm": 1.634145050080432, "language_loss": 0.74713755, "learning_rate": 3.6075309363949776e-06, "loss": 0.76932299, "num_input_tokens_seen": 40079435, "step": 1886, "time_per_iteration": 2.514387845993042 }, { "auxiliary_loss_clip": 0.01213582, "auxiliary_loss_mlp": 0.0103372, "balance_loss_clip": 1.06082606, "balance_loss_mlp": 1.02393341, "epoch": 0.22689833463596465, "flos": 20374242503040.0, "grad_norm": 2.282287208111359, "language_loss": 0.81096125, "learning_rate": 3.6070673681949094e-06, "loss": 0.83343434, "num_input_tokens_seen": 40097800, "step": 1887, "time_per_iteration": 2.4556827545166016 }, { "auxiliary_loss_clip": 0.01185518, "auxiliary_loss_mlp": 0.00765134, "balance_loss_clip": 1.0596745, "balance_loss_mlp": 1.00110686, "epoch": 0.22701857752660373, "flos": 30120398438400.0, "grad_norm": 1.8463247542872159, "language_loss": 0.8146311, "learning_rate": 3.606603556203999e-06, "loss": 0.83413768, "num_input_tokens_seen": 40122745, "step": 1888, "time_per_iteration": 2.601454973220825 }, { "auxiliary_loss_clip": 0.01197506, "auxiliary_loss_mlp": 0.0103553, "balance_loss_clip": 1.05647182, "balance_loss_mlp": 1.02602863, "epoch": 0.22713882041724284, "flos": 22492182084480.0, "grad_norm": 1.811119368682901, "language_loss": 0.83463424, "learning_rate": 3.6061395004926066e-06, "loss": 0.85696459, "num_input_tokens_seen": 40141680, "step": 1889, "time_per_iteration": 2.480898141860962 }, { "auxiliary_loss_clip": 0.01213199, "auxiliary_loss_mlp": 0.01034164, "balance_loss_clip": 1.0600574, "balance_loss_mlp": 1.024544, "epoch": 0.22725906330788193, "flos": 20521548178560.0, "grad_norm": 1.9479741659128826, "language_loss": 0.84772599, "learning_rate": 3.605675201131129e-06, "loss": 0.87019956, "num_input_tokens_seen": 40160140, "step": 1890, "time_per_iteration": 2.4434306621551514 }, { "auxiliary_loss_clip": 0.01206992, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.06376541, "balance_loss_mlp": 1.02487016, "epoch": 0.227379306198521, "flos": 18989922297600.0, "grad_norm": 2.244446726712968, "language_loss": 0.79414457, "learning_rate": 3.60521065819e-06, "loss": 0.81655288, "num_input_tokens_seen": 40177450, "step": 1891, "time_per_iteration": 3.3893744945526123 }, { "auxiliary_loss_clip": 0.01186334, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.05716181, "balance_loss_mlp": 1.02424002, "epoch": 0.2274995490891601, "flos": 21798351999360.0, "grad_norm": 1.7231169556552874, "language_loss": 0.87688828, "learning_rate": 3.60474587173969e-06, "loss": 0.89908016, "num_input_tokens_seen": 40195935, "step": 1892, "time_per_iteration": 3.260382890701294 }, { "auxiliary_loss_clip": 0.01197128, "auxiliary_loss_mlp": 0.01035994, "balance_loss_clip": 1.0621841, "balance_loss_mlp": 1.02658868, "epoch": 0.2276197919797992, "flos": 19058654972160.0, "grad_norm": 1.9786237873562795, "language_loss": 0.84173048, "learning_rate": 3.6042808418507084e-06, "loss": 0.86406171, "num_input_tokens_seen": 40213620, "step": 1893, "time_per_iteration": 2.4510369300842285 }, { "auxiliary_loss_clip": 0.01200645, "auxiliary_loss_mlp": 0.01037345, "balance_loss_clip": 1.062029, "balance_loss_mlp": 1.02753949, "epoch": 0.22774003487043828, "flos": 18806777827200.0, "grad_norm": 3.832560721133269, "language_loss": 0.76831335, "learning_rate": 3.6038155685935976e-06, "loss": 0.79069322, "num_input_tokens_seen": 40230190, "step": 1894, "time_per_iteration": 2.454312562942505 }, { "auxiliary_loss_clip": 0.01195333, "auxiliary_loss_mlp": 0.01033483, "balance_loss_clip": 1.05904293, "balance_loss_mlp": 1.02455401, "epoch": 0.22786027776107737, "flos": 23002544476800.0, "grad_norm": 1.8478021553382742, "language_loss": 0.70841503, "learning_rate": 3.6033500520389404e-06, "loss": 0.73070323, "num_input_tokens_seen": 40246860, "step": 1895, "time_per_iteration": 2.4760398864746094 }, { "auxiliary_loss_clip": 0.01070673, "auxiliary_loss_mlp": 0.01006922, "balance_loss_clip": 1.02587962, "balance_loss_mlp": 1.00509787, "epoch": 0.22798052065171648, "flos": 66706872600960.0, "grad_norm": 0.8040780334635551, "language_loss": 0.64823902, "learning_rate": 3.6028842922573553e-06, "loss": 0.66901493, "num_input_tokens_seen": 40311005, "step": 1896, "time_per_iteration": 3.206712245941162 }, { "auxiliary_loss_clip": 0.01090266, "auxiliary_loss_mlp": 0.00754752, "balance_loss_clip": 1.03051019, "balance_loss_mlp": 1.00066173, "epoch": 0.22810076354235556, "flos": 62080896758400.0, "grad_norm": 0.8509710157333448, "language_loss": 0.62913644, "learning_rate": 3.602418289319497e-06, "loss": 0.64758658, "num_input_tokens_seen": 40369560, "step": 1897, "time_per_iteration": 3.1072418689727783 }, { "auxiliary_loss_clip": 0.01149405, "auxiliary_loss_mlp": 0.01034233, "balance_loss_clip": 1.05266404, "balance_loss_mlp": 1.02444577, "epoch": 0.22822100643299464, "flos": 23876358635520.0, "grad_norm": 1.9594700156480302, "language_loss": 0.73504794, "learning_rate": 3.601952043296059e-06, "loss": 0.75688434, "num_input_tokens_seen": 40389555, "step": 1898, "time_per_iteration": 2.6265969276428223 }, { "auxiliary_loss_clip": 0.01187299, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.05607986, "balance_loss_mlp": 1.02270961, "epoch": 0.22834124932363373, "flos": 20991331180800.0, "grad_norm": 2.209433595094319, "language_loss": 0.80496663, "learning_rate": 3.6014855542577696e-06, "loss": 0.82716238, "num_input_tokens_seen": 40406765, "step": 1899, "time_per_iteration": 2.5008046627044678 }, { "auxiliary_loss_clip": 0.01182195, "auxiliary_loss_mlp": 0.01028833, "balance_loss_clip": 1.05792546, "balance_loss_mlp": 1.01908135, "epoch": 0.22846149221427284, "flos": 24901572620160.0, "grad_norm": 2.22064301455328, "language_loss": 0.84333611, "learning_rate": 3.6010188222753943e-06, "loss": 0.86544639, "num_input_tokens_seen": 40427535, "step": 1900, "time_per_iteration": 2.5809316635131836 }, { "auxiliary_loss_clip": 0.01091189, "auxiliary_loss_mlp": 0.01011563, "balance_loss_clip": 1.02984834, "balance_loss_mlp": 1.00984609, "epoch": 0.22858173510491192, "flos": 56132294319360.0, "grad_norm": 0.9047666995725748, "language_loss": 0.64161056, "learning_rate": 3.6005518474197372e-06, "loss": 0.66263807, "num_input_tokens_seen": 40479580, "step": 1901, "time_per_iteration": 2.9828567504882812 }, { "auxiliary_loss_clip": 0.0119795, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.06150997, "balance_loss_mlp": 1.02176785, "epoch": 0.228701977995551, "flos": 24170826332160.0, "grad_norm": 2.096582661508518, "language_loss": 0.78212988, "learning_rate": 3.6000846297616373e-06, "loss": 0.80442476, "num_input_tokens_seen": 40497880, "step": 1902, "time_per_iteration": 2.5138919353485107 }, { "auxiliary_loss_clip": 0.01217016, "auxiliary_loss_mlp": 0.01035119, "balance_loss_clip": 1.06371641, "balance_loss_mlp": 1.02481353, "epoch": 0.22882222088619011, "flos": 21387892308480.0, "grad_norm": 2.208548764161951, "language_loss": 0.72412646, "learning_rate": 3.5996171693719717e-06, "loss": 0.74664783, "num_input_tokens_seen": 40513975, "step": 1903, "time_per_iteration": 2.4436137676239014 }, { "auxiliary_loss_clip": 0.01109584, "auxiliary_loss_mlp": 0.01005274, "balance_loss_clip": 1.03206694, "balance_loss_mlp": 1.00360513, "epoch": 0.2289424637768292, "flos": 64589615377920.0, "grad_norm": 0.8396527258145293, "language_loss": 0.64781499, "learning_rate": 3.5991494663216528e-06, "loss": 0.66896355, "num_input_tokens_seen": 40576960, "step": 1904, "time_per_iteration": 3.1305947303771973 }, { "auxiliary_loss_clip": 0.01213654, "auxiliary_loss_mlp": 0.01029727, "balance_loss_clip": 1.06340075, "balance_loss_mlp": 1.02005911, "epoch": 0.22906270666746828, "flos": 22163419877760.0, "grad_norm": 3.14637454918225, "language_loss": 0.87503016, "learning_rate": 3.5986815206816314e-06, "loss": 0.89746398, "num_input_tokens_seen": 40595780, "step": 1905, "time_per_iteration": 2.4825522899627686 }, { "auxiliary_loss_clip": 0.01212803, "auxiliary_loss_mlp": 0.01036475, "balance_loss_clip": 1.06230724, "balance_loss_mlp": 1.02745128, "epoch": 0.2291829495581074, "flos": 25772334122880.0, "grad_norm": 1.974033283095527, "language_loss": 0.74598277, "learning_rate": 3.598213332522895e-06, "loss": 0.76847559, "num_input_tokens_seen": 40615810, "step": 1906, "time_per_iteration": 2.4877636432647705 }, { "auxiliary_loss_clip": 0.01195163, "auxiliary_loss_mlp": 0.01033071, "balance_loss_clip": 1.05830097, "balance_loss_mlp": 1.0232898, "epoch": 0.22930319244874647, "flos": 31172760126720.0, "grad_norm": 1.821359731588867, "language_loss": 0.77216184, "learning_rate": 3.597744901916466e-06, "loss": 0.7944442, "num_input_tokens_seen": 40637095, "step": 1907, "time_per_iteration": 2.561657190322876 }, { "auxiliary_loss_clip": 0.01215777, "auxiliary_loss_mlp": 0.01035309, "balance_loss_clip": 1.06053555, "balance_loss_mlp": 1.02530122, "epoch": 0.22942343533938556, "flos": 23254098399360.0, "grad_norm": 3.3067372795388024, "language_loss": 0.77003586, "learning_rate": 3.5972762289334058e-06, "loss": 0.79254675, "num_input_tokens_seen": 40656725, "step": 1908, "time_per_iteration": 2.472668409347534 }, { "auxiliary_loss_clip": 0.01135152, "auxiliary_loss_mlp": 0.01029602, "balance_loss_clip": 1.05400538, "balance_loss_mlp": 1.01947498, "epoch": 0.22954367823002464, "flos": 14610903436800.0, "grad_norm": 2.24948213431362, "language_loss": 0.84914607, "learning_rate": 3.5968073136448116e-06, "loss": 0.87079358, "num_input_tokens_seen": 40674745, "step": 1909, "time_per_iteration": 2.600472927093506 }, { "auxiliary_loss_clip": 0.01202628, "auxiliary_loss_mlp": 0.01037943, "balance_loss_clip": 1.06018329, "balance_loss_mlp": 1.02745259, "epoch": 0.22966392112066375, "flos": 16763604405120.0, "grad_norm": 1.7186533335638219, "language_loss": 0.91463274, "learning_rate": 3.596338156121818e-06, "loss": 0.93703842, "num_input_tokens_seen": 40693630, "step": 1910, "time_per_iteration": 2.455132007598877 }, { "auxiliary_loss_clip": 0.0109297, "auxiliary_loss_mlp": 0.01001664, "balance_loss_clip": 1.02800202, "balance_loss_mlp": 0.99997121, "epoch": 0.22978416401130283, "flos": 67474247783040.0, "grad_norm": 0.7406317435003689, "language_loss": 0.59394455, "learning_rate": 3.595868756435595e-06, "loss": 0.61489081, "num_input_tokens_seen": 40761310, "step": 1911, "time_per_iteration": 4.0456647872924805 }, { "auxiliary_loss_clip": 0.01173517, "auxiliary_loss_mlp": 0.01034111, "balance_loss_clip": 1.06183112, "balance_loss_mlp": 1.02435338, "epoch": 0.22990440690194192, "flos": 19865137086720.0, "grad_norm": 2.649085475940073, "language_loss": 0.80906856, "learning_rate": 3.5953991146573504e-06, "loss": 0.83114481, "num_input_tokens_seen": 40779955, "step": 1912, "time_per_iteration": 2.576812505722046 }, { "auxiliary_loss_clip": 0.01199708, "auxiliary_loss_mlp": 0.01032644, "balance_loss_clip": 1.05798328, "balance_loss_mlp": 1.02240396, "epoch": 0.23002464979258103, "flos": 13289246507520.0, "grad_norm": 2.4439647535748903, "language_loss": 0.83580017, "learning_rate": 3.5949292308583294e-06, "loss": 0.85812366, "num_input_tokens_seen": 40793200, "step": 1913, "time_per_iteration": 2.4744374752044678 }, { "auxiliary_loss_clip": 0.01214594, "auxiliary_loss_mlp": 0.01034855, "balance_loss_clip": 1.06406713, "balance_loss_mlp": 1.0239476, "epoch": 0.2301448926832201, "flos": 22163779013760.0, "grad_norm": 2.0029949690347464, "language_loss": 0.80833125, "learning_rate": 3.594459105109811e-06, "loss": 0.83082575, "num_input_tokens_seen": 40812380, "step": 1914, "time_per_iteration": 2.452862501144409 }, { "auxiliary_loss_clip": 0.01202086, "auxiliary_loss_mlp": 0.01033229, "balance_loss_clip": 1.06376553, "balance_loss_mlp": 1.02444339, "epoch": 0.2302651355738592, "flos": 20704477167360.0, "grad_norm": 1.7716538625401077, "language_loss": 0.81096971, "learning_rate": 3.593988737483115e-06, "loss": 0.83332288, "num_input_tokens_seen": 40832320, "step": 1915, "time_per_iteration": 2.4938321113586426 }, { "auxiliary_loss_clip": 0.01182161, "auxiliary_loss_mlp": 0.01031473, "balance_loss_clip": 1.0580945, "balance_loss_mlp": 1.02159071, "epoch": 0.23038537846449827, "flos": 18588943797120.0, "grad_norm": 1.9160218149132646, "language_loss": 0.78065801, "learning_rate": 3.5935181280495947e-06, "loss": 0.8027944, "num_input_tokens_seen": 40850900, "step": 1916, "time_per_iteration": 2.5129659175872803 }, { "auxiliary_loss_clip": 0.01087867, "auxiliary_loss_mlp": 0.01001513, "balance_loss_clip": 1.02836192, "balance_loss_mlp": 0.99984437, "epoch": 0.23050562135513739, "flos": 64224260190720.0, "grad_norm": 0.9080719464070544, "language_loss": 0.54271203, "learning_rate": 3.5930472768806412e-06, "loss": 0.56360584, "num_input_tokens_seen": 40909570, "step": 1917, "time_per_iteration": 3.8663241863250732 }, { "auxiliary_loss_clip": 0.01211672, "auxiliary_loss_mlp": 0.01036062, "balance_loss_clip": 1.06318474, "balance_loss_mlp": 1.02603054, "epoch": 0.23062586424577647, "flos": 17313396952320.0, "grad_norm": 2.302850048551058, "language_loss": 0.77138782, "learning_rate": 3.5925761840476826e-06, "loss": 0.7938652, "num_input_tokens_seen": 40928180, "step": 1918, "time_per_iteration": 3.3247687816619873 }, { "auxiliary_loss_clip": 0.01178787, "auxiliary_loss_mlp": 0.01031142, "balance_loss_clip": 1.05813265, "balance_loss_mlp": 1.02196872, "epoch": 0.23074610713641555, "flos": 27855979194240.0, "grad_norm": 7.456738816837622, "language_loss": 0.81363225, "learning_rate": 3.592104849622183e-06, "loss": 0.83573157, "num_input_tokens_seen": 40950435, "step": 1919, "time_per_iteration": 3.3578546047210693 }, { "auxiliary_loss_clip": 0.01143675, "auxiliary_loss_mlp": 0.01032357, "balance_loss_clip": 1.05322242, "balance_loss_mlp": 1.02280807, "epoch": 0.23086635002705466, "flos": 28841798937600.0, "grad_norm": 1.5063473308876887, "language_loss": 0.73119915, "learning_rate": 3.591633273675644e-06, "loss": 0.75295943, "num_input_tokens_seen": 40972670, "step": 1920, "time_per_iteration": 2.6541922092437744 }, { "auxiliary_loss_clip": 0.01064178, "auxiliary_loss_mlp": 0.01008834, "balance_loss_clip": 1.02913833, "balance_loss_mlp": 1.00650966, "epoch": 0.23098659291769374, "flos": 62923681566720.0, "grad_norm": 0.9073545390728627, "language_loss": 0.58245361, "learning_rate": 3.591161456279602e-06, "loss": 0.60318375, "num_input_tokens_seen": 41018215, "step": 1921, "time_per_iteration": 2.92101788520813 }, { "auxiliary_loss_clip": 0.01186544, "auxiliary_loss_mlp": 0.01033526, "balance_loss_clip": 1.0567441, "balance_loss_mlp": 1.02385259, "epoch": 0.23110683580833283, "flos": 23476816679040.0, "grad_norm": 1.5056857106502466, "language_loss": 0.80467165, "learning_rate": 3.590689397505633e-06, "loss": 0.82687235, "num_input_tokens_seen": 41039125, "step": 1922, "time_per_iteration": 2.712777853012085 }, { "auxiliary_loss_clip": 0.01210273, "auxiliary_loss_mlp": 0.010294, "balance_loss_clip": 1.06129718, "balance_loss_mlp": 1.01991057, "epoch": 0.2312270786989719, "flos": 27271066124160.0, "grad_norm": 1.6353858722836059, "language_loss": 0.86771405, "learning_rate": 3.590217097425347e-06, "loss": 0.89011079, "num_input_tokens_seen": 41059025, "step": 1923, "time_per_iteration": 2.529050827026367 }, { "auxiliary_loss_clip": 0.01215289, "auxiliary_loss_mlp": 0.01032136, "balance_loss_clip": 1.06352818, "balance_loss_mlp": 1.02207446, "epoch": 0.23134732158961102, "flos": 13261344618240.0, "grad_norm": 1.7843459673148958, "language_loss": 0.71058226, "learning_rate": 3.589744556110391e-06, "loss": 0.73305655, "num_input_tokens_seen": 41077015, "step": 1924, "time_per_iteration": 2.439056158065796 }, { "auxiliary_loss_clip": 0.01179935, "auxiliary_loss_mlp": 0.01034317, "balance_loss_clip": 1.05626798, "balance_loss_mlp": 1.02564418, "epoch": 0.2314675644802501, "flos": 36977648250240.0, "grad_norm": 1.6877289085880027, "language_loss": 0.84338379, "learning_rate": 3.58927177363245e-06, "loss": 0.86552632, "num_input_tokens_seen": 41099840, "step": 1925, "time_per_iteration": 2.697558641433716 }, { "auxiliary_loss_clip": 0.01163788, "auxiliary_loss_mlp": 0.01035418, "balance_loss_clip": 1.05415154, "balance_loss_mlp": 1.02501678, "epoch": 0.2315878073708892, "flos": 23842207779840.0, "grad_norm": 2.3634779859918296, "language_loss": 0.72541994, "learning_rate": 3.5887987500632447e-06, "loss": 0.74741197, "num_input_tokens_seen": 41117845, "step": 1926, "time_per_iteration": 2.5524322986602783 }, { "auxiliary_loss_clip": 0.01169859, "auxiliary_loss_mlp": 0.010361, "balance_loss_clip": 1.05530918, "balance_loss_mlp": 1.02742767, "epoch": 0.2317080502615283, "flos": 23039424766080.0, "grad_norm": 1.8556639998048141, "language_loss": 0.84464228, "learning_rate": 3.5883254854745325e-06, "loss": 0.86670184, "num_input_tokens_seen": 41136235, "step": 1927, "time_per_iteration": 2.6146647930145264 }, { "auxiliary_loss_clip": 0.0119896, "auxiliary_loss_mlp": 0.01031803, "balance_loss_clip": 1.05650103, "balance_loss_mlp": 1.0221529, "epoch": 0.23182829315216738, "flos": 11254656435840.0, "grad_norm": 1.9891985581114355, "language_loss": 0.75201017, "learning_rate": 3.587851979938107e-06, "loss": 0.7743178, "num_input_tokens_seen": 41153125, "step": 1928, "time_per_iteration": 2.4627838134765625 }, { "auxiliary_loss_clip": 0.0119501, "auxiliary_loss_mlp": 0.01032014, "balance_loss_clip": 1.05958784, "balance_loss_mlp": 1.02259636, "epoch": 0.23194853604280646, "flos": 19828939155840.0, "grad_norm": 2.727395033535805, "language_loss": 0.77979016, "learning_rate": 3.5873782335257985e-06, "loss": 0.80206043, "num_input_tokens_seen": 41171290, "step": 1929, "time_per_iteration": 2.4804580211639404 }, { "auxiliary_loss_clip": 0.01163595, "auxiliary_loss_mlp": 0.01036443, "balance_loss_clip": 1.05667007, "balance_loss_mlp": 1.02674484, "epoch": 0.23206877893344555, "flos": 15305020830720.0, "grad_norm": 2.197442347653758, "language_loss": 0.78496575, "learning_rate": 3.5869042463094744e-06, "loss": 0.80696607, "num_input_tokens_seen": 41189005, "step": 1930, "time_per_iteration": 2.513066053390503 }, { "auxiliary_loss_clip": 0.01131983, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.04848385, "balance_loss_mlp": 1.02873111, "epoch": 0.23218902182408466, "flos": 22711488572160.0, "grad_norm": 1.9071195582850453, "language_loss": 0.76829469, "learning_rate": 3.586430018361038e-06, "loss": 0.79000306, "num_input_tokens_seen": 41208775, "step": 1931, "time_per_iteration": 2.6066949367523193 }, { "auxiliary_loss_clip": 0.01166604, "auxiliary_loss_mlp": 0.01033146, "balance_loss_clip": 1.05240917, "balance_loss_mlp": 1.02277541, "epoch": 0.23230926471472374, "flos": 22710734386560.0, "grad_norm": 2.443180941755307, "language_loss": 0.76346928, "learning_rate": 3.5859555497524283e-06, "loss": 0.78546685, "num_input_tokens_seen": 41226010, "step": 1932, "time_per_iteration": 2.5034677982330322 }, { "auxiliary_loss_clip": 0.01195427, "auxiliary_loss_mlp": 0.01036002, "balance_loss_clip": 1.05957794, "balance_loss_mlp": 1.02704334, "epoch": 0.23242950760536282, "flos": 20375499479040.0, "grad_norm": 1.9287028496982914, "language_loss": 0.91858327, "learning_rate": 3.5854808405556237e-06, "loss": 0.94089758, "num_input_tokens_seen": 41245245, "step": 1933, "time_per_iteration": 2.479234457015991 }, { "auxiliary_loss_clip": 0.01164682, "auxiliary_loss_mlp": 0.01035285, "balance_loss_clip": 1.05380058, "balance_loss_mlp": 1.02699971, "epoch": 0.23254975049600193, "flos": 16908324301440.0, "grad_norm": 2.2676536556648554, "language_loss": 0.74906296, "learning_rate": 3.5850058908426355e-06, "loss": 0.77106261, "num_input_tokens_seen": 41263795, "step": 1934, "time_per_iteration": 2.5206565856933594 }, { "auxiliary_loss_clip": 0.01182763, "auxiliary_loss_mlp": 0.01035244, "balance_loss_clip": 1.05431151, "balance_loss_mlp": 1.02611232, "epoch": 0.23266999338664102, "flos": 23294821443840.0, "grad_norm": 1.7005426418367304, "language_loss": 0.85302925, "learning_rate": 3.584530700685514e-06, "loss": 0.87520927, "num_input_tokens_seen": 41284055, "step": 1935, "time_per_iteration": 2.5419108867645264 }, { "auxiliary_loss_clip": 0.01181153, "auxiliary_loss_mlp": 0.01029347, "balance_loss_clip": 1.06176329, "balance_loss_mlp": 1.02011478, "epoch": 0.2327902362772801, "flos": 19569987031680.0, "grad_norm": 1.9727706416318, "language_loss": 0.88636446, "learning_rate": 3.5840552701563448e-06, "loss": 0.90846944, "num_input_tokens_seen": 41300255, "step": 1936, "time_per_iteration": 2.4959185123443604 }, { "auxiliary_loss_clip": 0.01184918, "auxiliary_loss_mlp": 0.01032039, "balance_loss_clip": 1.05404174, "balance_loss_mlp": 1.02241278, "epoch": 0.2329104791679192, "flos": 16727514215040.0, "grad_norm": 2.3995711887507736, "language_loss": 0.82066882, "learning_rate": 3.5835795993272513e-06, "loss": 0.84283841, "num_input_tokens_seen": 41318540, "step": 1937, "time_per_iteration": 2.5202860832214355 }, { "auxiliary_loss_clip": 0.01124891, "auxiliary_loss_mlp": 0.01043073, "balance_loss_clip": 1.05209231, "balance_loss_mlp": 1.03323257, "epoch": 0.2330307220585583, "flos": 22163743100160.0, "grad_norm": 4.447855821462046, "language_loss": 0.71046537, "learning_rate": 3.583103688270391e-06, "loss": 0.73214501, "num_input_tokens_seen": 41338320, "step": 1938, "time_per_iteration": 3.511669874191284 }, { "auxiliary_loss_clip": 0.01167642, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.05252004, "balance_loss_mlp": 1.02556562, "epoch": 0.23315096494919738, "flos": 19317319787520.0, "grad_norm": 2.0946630408222084, "language_loss": 0.89410293, "learning_rate": 3.58262753705796e-06, "loss": 0.91614395, "num_input_tokens_seen": 41353210, "step": 1939, "time_per_iteration": 2.519331693649292 }, { "auxiliary_loss_clip": 0.01094488, "auxiliary_loss_mlp": 0.01009485, "balance_loss_clip": 1.03496528, "balance_loss_mlp": 1.00788724, "epoch": 0.23327120783983646, "flos": 53031048946560.0, "grad_norm": 0.762076360197271, "language_loss": 0.55555022, "learning_rate": 3.5821511457621902e-06, "loss": 0.57658994, "num_input_tokens_seen": 41410510, "step": 1940, "time_per_iteration": 3.0885448455810547 }, { "auxiliary_loss_clip": 0.01178312, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.05692327, "balance_loss_mlp": 1.02673125, "epoch": 0.23339145073047557, "flos": 17126984344320.0, "grad_norm": 3.0561073140637784, "language_loss": 0.81302071, "learning_rate": 3.5816745144553497e-06, "loss": 0.83517313, "num_input_tokens_seen": 41425830, "step": 1941, "time_per_iteration": 2.4839398860931396 }, { "auxiliary_loss_clip": 0.01146039, "auxiliary_loss_mlp": 0.01027794, "balance_loss_clip": 1.05399966, "balance_loss_mlp": 1.01841247, "epoch": 0.23351169362111465, "flos": 13078918419840.0, "grad_norm": 2.814779265914372, "language_loss": 0.75493437, "learning_rate": 3.5811976432097424e-06, "loss": 0.77667266, "num_input_tokens_seen": 41443500, "step": 1942, "time_per_iteration": 2.5535335540771484 }, { "auxiliary_loss_clip": 0.01195001, "auxiliary_loss_mlp": 0.00764066, "balance_loss_clip": 1.06108856, "balance_loss_mlp": 1.00104523, "epoch": 0.23363193651175373, "flos": 15851257931520.0, "grad_norm": 2.2281377174652794, "language_loss": 0.84510922, "learning_rate": 3.58072053209771e-06, "loss": 0.86469984, "num_input_tokens_seen": 41460055, "step": 1943, "time_per_iteration": 2.499406099319458 }, { "auxiliary_loss_clip": 0.01173637, "auxiliary_loss_mlp": 0.01033634, "balance_loss_clip": 1.05330825, "balance_loss_mlp": 1.0238533, "epoch": 0.23375217940239285, "flos": 21025769345280.0, "grad_norm": 2.0944955210956797, "language_loss": 0.7908181, "learning_rate": 3.5802431811916296e-06, "loss": 0.81289077, "num_input_tokens_seen": 41476665, "step": 1944, "time_per_iteration": 4.196576356887817 }, { "auxiliary_loss_clip": 0.0117512, "auxiliary_loss_mlp": 0.01028221, "balance_loss_clip": 1.05613697, "balance_loss_mlp": 1.01920271, "epoch": 0.23387242229303193, "flos": 20594698225920.0, "grad_norm": 1.9070219817968121, "language_loss": 0.80690712, "learning_rate": 3.579765590563916e-06, "loss": 0.82894051, "num_input_tokens_seen": 41496065, "step": 1945, "time_per_iteration": 3.342613458633423 }, { "auxiliary_loss_clip": 0.01183395, "auxiliary_loss_mlp": 0.01032511, "balance_loss_clip": 1.05560887, "balance_loss_mlp": 1.02302241, "epoch": 0.233992665183671, "flos": 24279491952000.0, "grad_norm": 1.8303253083666764, "language_loss": 0.81812763, "learning_rate": 3.579287760287017e-06, "loss": 0.84028673, "num_input_tokens_seen": 41516815, "step": 1946, "time_per_iteration": 2.544931173324585 }, { "auxiliary_loss_clip": 0.0119233, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.05827081, "balance_loss_mlp": 1.02507567, "epoch": 0.2341129080743101, "flos": 30154621121280.0, "grad_norm": 1.8392033690714538, "language_loss": 0.72793335, "learning_rate": 3.578809690433421e-06, "loss": 0.75019777, "num_input_tokens_seen": 41538525, "step": 1947, "time_per_iteration": 2.5669567584991455 }, { "auxiliary_loss_clip": 0.01215407, "auxiliary_loss_mlp": 0.01036612, "balance_loss_clip": 1.06262004, "balance_loss_mlp": 1.02676547, "epoch": 0.2342331509649492, "flos": 22784135829120.0, "grad_norm": 4.005893996273609, "language_loss": 0.81113994, "learning_rate": 3.578331381075651e-06, "loss": 0.83366013, "num_input_tokens_seen": 41559025, "step": 1948, "time_per_iteration": 2.4530582427978516 }, { "auxiliary_loss_clip": 0.01196052, "auxiliary_loss_mlp": 0.01030069, "balance_loss_clip": 1.05800653, "balance_loss_mlp": 1.02032363, "epoch": 0.2343533938555883, "flos": 23623152687360.0, "grad_norm": 2.29998884192404, "language_loss": 0.69959247, "learning_rate": 3.5778528322862646e-06, "loss": 0.72185367, "num_input_tokens_seen": 41577845, "step": 1949, "time_per_iteration": 2.4965648651123047 }, { "auxiliary_loss_clip": 0.01196498, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.05714893, "balance_loss_mlp": 1.02231812, "epoch": 0.23447363674622737, "flos": 24570332375040.0, "grad_norm": 1.4333363379706736, "language_loss": 0.8650347, "learning_rate": 3.5773740441378585e-06, "loss": 0.88731283, "num_input_tokens_seen": 41598600, "step": 1950, "time_per_iteration": 2.4947404861450195 }, { "auxiliary_loss_clip": 0.01191019, "auxiliary_loss_mlp": 0.01036777, "balance_loss_clip": 1.05705655, "balance_loss_mlp": 1.0277946, "epoch": 0.23459387963686648, "flos": 53140322119680.0, "grad_norm": 1.7019286599252261, "language_loss": 0.73543864, "learning_rate": 3.5768950167030633e-06, "loss": 0.75771654, "num_input_tokens_seen": 41623300, "step": 1951, "time_per_iteration": 2.832526445388794 }, { "auxiliary_loss_clip": 0.01169855, "auxiliary_loss_mlp": 0.01038853, "balance_loss_clip": 1.05426359, "balance_loss_mlp": 1.02863121, "epoch": 0.23471412252750556, "flos": 23951412103680.0, "grad_norm": 1.8080582810706136, "language_loss": 0.78592414, "learning_rate": 3.576415750054548e-06, "loss": 0.80801123, "num_input_tokens_seen": 41643420, "step": 1952, "time_per_iteration": 2.5353944301605225 }, { "auxiliary_loss_clip": 0.01171031, "auxiliary_loss_mlp": 0.01027405, "balance_loss_clip": 1.05397022, "balance_loss_mlp": 1.01798141, "epoch": 0.23483436541814465, "flos": 15706573948800.0, "grad_norm": 2.0703214103623306, "language_loss": 0.85655081, "learning_rate": 3.5759362442650172e-06, "loss": 0.87853515, "num_input_tokens_seen": 41660170, "step": 1953, "time_per_iteration": 2.4940755367279053 }, { "auxiliary_loss_clip": 0.01197657, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.06208384, "balance_loss_mlp": 1.02786446, "epoch": 0.23495460830878373, "flos": 24936262179840.0, "grad_norm": 2.2820312120406805, "language_loss": 0.85677314, "learning_rate": 3.5754564994072113e-06, "loss": 0.87912667, "num_input_tokens_seen": 41679010, "step": 1954, "time_per_iteration": 2.507598876953125 }, { "auxiliary_loss_clip": 0.01177887, "auxiliary_loss_mlp": 0.01028535, "balance_loss_clip": 1.05579591, "balance_loss_mlp": 1.01818132, "epoch": 0.23507485119942284, "flos": 30482665056000.0, "grad_norm": 3.449529449489836, "language_loss": 0.59643817, "learning_rate": 3.5749765155539067e-06, "loss": 0.61850238, "num_input_tokens_seen": 41699495, "step": 1955, "time_per_iteration": 2.6668214797973633 }, { "auxiliary_loss_clip": 0.01162357, "auxiliary_loss_mlp": 0.01032017, "balance_loss_clip": 1.05296242, "balance_loss_mlp": 1.02195013, "epoch": 0.23519509409006192, "flos": 18329129746560.0, "grad_norm": 2.0889383424788512, "language_loss": 0.92178923, "learning_rate": 3.574496292777917e-06, "loss": 0.94373298, "num_input_tokens_seen": 41717705, "step": 1956, "time_per_iteration": 2.519028663635254 }, { "auxiliary_loss_clip": 0.01188259, "auxiliary_loss_mlp": 0.01039365, "balance_loss_clip": 1.05840158, "balance_loss_mlp": 1.02860069, "epoch": 0.235315336980701, "flos": 29643217234560.0, "grad_norm": 1.9944505516913964, "language_loss": 0.71760261, "learning_rate": 3.574015831152092e-06, "loss": 0.73987889, "num_input_tokens_seen": 41738120, "step": 1957, "time_per_iteration": 2.5755717754364014 }, { "auxiliary_loss_clip": 0.01171126, "auxiliary_loss_mlp": 0.0102877, "balance_loss_clip": 1.05480516, "balance_loss_mlp": 1.01953769, "epoch": 0.23543557987134012, "flos": 18551704371840.0, "grad_norm": 2.1748782295751448, "language_loss": 0.83523822, "learning_rate": 3.573535130749316e-06, "loss": 0.8572371, "num_input_tokens_seen": 41756070, "step": 1958, "time_per_iteration": 2.4874367713928223 }, { "auxiliary_loss_clip": 0.01171534, "auxiliary_loss_mlp": 0.0103606, "balance_loss_clip": 1.05612242, "balance_loss_mlp": 1.0266242, "epoch": 0.2355558227619792, "flos": 24679033908480.0, "grad_norm": 1.6505177091237302, "language_loss": 0.7380923, "learning_rate": 3.5730541916425127e-06, "loss": 0.76016819, "num_input_tokens_seen": 41777550, "step": 1959, "time_per_iteration": 2.5593602657318115 }, { "auxiliary_loss_clip": 0.0116974, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 1.05687869, "balance_loss_mlp": 1.02144718, "epoch": 0.23567606565261828, "flos": 21944795748480.0, "grad_norm": 2.191747657827774, "language_loss": 0.86217797, "learning_rate": 3.572573013904639e-06, "loss": 0.88418388, "num_input_tokens_seen": 41797460, "step": 1960, "time_per_iteration": 2.5693535804748535 }, { "auxiliary_loss_clip": 0.01209579, "auxiliary_loss_mlp": 0.0102695, "balance_loss_clip": 1.05912602, "balance_loss_mlp": 1.01752019, "epoch": 0.2357963085432574, "flos": 13589352639360.0, "grad_norm": 2.1483254493480497, "language_loss": 0.92030525, "learning_rate": 3.572091597608689e-06, "loss": 0.94267046, "num_input_tokens_seen": 41815585, "step": 1961, "time_per_iteration": 2.4453775882720947 }, { "auxiliary_loss_clip": 0.01187965, "auxiliary_loss_mlp": 0.01031834, "balance_loss_clip": 1.0585885, "balance_loss_mlp": 1.0214448, "epoch": 0.23591655143389648, "flos": 22088689632000.0, "grad_norm": 2.082642869489746, "language_loss": 0.73624814, "learning_rate": 3.571609942827694e-06, "loss": 0.75844616, "num_input_tokens_seen": 41834700, "step": 1962, "time_per_iteration": 2.519336700439453 }, { "auxiliary_loss_clip": 0.01180052, "auxiliary_loss_mlp": 0.0103066, "balance_loss_clip": 1.05754554, "balance_loss_mlp": 1.02139783, "epoch": 0.23603679432453556, "flos": 17017349057280.0, "grad_norm": 2.0141837876752233, "language_loss": 0.88374054, "learning_rate": 3.57112804963472e-06, "loss": 0.90584767, "num_input_tokens_seen": 41852915, "step": 1963, "time_per_iteration": 2.499077796936035 }, { "auxiliary_loss_clip": 0.01163569, "auxiliary_loss_mlp": 0.01032965, "balance_loss_clip": 1.05818379, "balance_loss_mlp": 1.02410722, "epoch": 0.23615703721517464, "flos": 19171307001600.0, "grad_norm": 1.8335817382633506, "language_loss": 0.76499212, "learning_rate": 3.57064591810287e-06, "loss": 0.7869575, "num_input_tokens_seen": 41870415, "step": 1964, "time_per_iteration": 2.5408272743225098 }, { "auxiliary_loss_clip": 0.01209766, "auxiliary_loss_mlp": 0.00764064, "balance_loss_clip": 1.06146097, "balance_loss_mlp": 1.00107098, "epoch": 0.23627728010581375, "flos": 19098803399040.0, "grad_norm": 2.091247943689354, "language_loss": 0.81079847, "learning_rate": 3.570163548305284e-06, "loss": 0.83053684, "num_input_tokens_seen": 41889345, "step": 1965, "time_per_iteration": 3.2580833435058594 }, { "auxiliary_loss_clip": 0.01179277, "auxiliary_loss_mlp": 0.01036764, "balance_loss_clip": 1.05619395, "balance_loss_mlp": 1.02648187, "epoch": 0.23639752299645284, "flos": 14282213057280.0, "grad_norm": 2.225712014217286, "language_loss": 0.69493705, "learning_rate": 3.569680940315135e-06, "loss": 0.7170974, "num_input_tokens_seen": 41905745, "step": 1966, "time_per_iteration": 2.5123424530029297 }, { "auxiliary_loss_clip": 0.01171246, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.05665433, "balance_loss_mlp": 1.02585959, "epoch": 0.23651776588709192, "flos": 22893411980160.0, "grad_norm": 1.8032376027295576, "language_loss": 0.82164556, "learning_rate": 3.5691980942056356e-06, "loss": 0.84372449, "num_input_tokens_seen": 41925115, "step": 1967, "time_per_iteration": 2.562812328338623 }, { "auxiliary_loss_clip": 0.01200229, "auxiliary_loss_mlp": 0.01029254, "balance_loss_clip": 1.05865979, "balance_loss_mlp": 1.01964009, "epoch": 0.23663800877773103, "flos": 18624531196800.0, "grad_norm": 1.8431660051754692, "language_loss": 0.79439557, "learning_rate": 3.5687150100500332e-06, "loss": 0.81669039, "num_input_tokens_seen": 41944815, "step": 1968, "time_per_iteration": 2.5219881534576416 }, { "auxiliary_loss_clip": 0.01194978, "auxiliary_loss_mlp": 0.01028519, "balance_loss_clip": 1.05712223, "balance_loss_mlp": 1.01887512, "epoch": 0.2367582516683701, "flos": 25555828896000.0, "grad_norm": 1.5966483987149835, "language_loss": 0.74536216, "learning_rate": 3.568231687921611e-06, "loss": 0.76759714, "num_input_tokens_seen": 41964990, "step": 1969, "time_per_iteration": 2.592735528945923 }, { "auxiliary_loss_clip": 0.01208422, "auxiliary_loss_mlp": 0.01032555, "balance_loss_clip": 1.06056547, "balance_loss_mlp": 1.02361417, "epoch": 0.2368784945590092, "flos": 23295072839040.0, "grad_norm": 1.4908503387488736, "language_loss": 0.80382669, "learning_rate": 3.5677481278936883e-06, "loss": 0.82623649, "num_input_tokens_seen": 41984570, "step": 1970, "time_per_iteration": 3.3571486473083496 }, { "auxiliary_loss_clip": 0.01106091, "auxiliary_loss_mlp": 0.01003246, "balance_loss_clip": 1.04770589, "balance_loss_mlp": 1.00119519, "epoch": 0.23699873744964828, "flos": 69859291875840.0, "grad_norm": 0.8305623939719402, "language_loss": 0.57858795, "learning_rate": 3.5672643300396214e-06, "loss": 0.59968132, "num_input_tokens_seen": 42053715, "step": 1971, "time_per_iteration": 4.0693888664245605 }, { "auxiliary_loss_clip": 0.01166607, "auxiliary_loss_mlp": 0.01032146, "balance_loss_clip": 1.0561471, "balance_loss_mlp": 1.02358079, "epoch": 0.2371189803402874, "flos": 21835052720640.0, "grad_norm": 2.1581035283230445, "language_loss": 0.67821872, "learning_rate": 3.566780294432802e-06, "loss": 0.70020628, "num_input_tokens_seen": 42070890, "step": 1972, "time_per_iteration": 3.3236582279205322 }, { "auxiliary_loss_clip": 0.012112, "auxiliary_loss_mlp": 0.01033971, "balance_loss_clip": 1.06122279, "balance_loss_mlp": 1.0253408, "epoch": 0.23723922323092647, "flos": 21908490076800.0, "grad_norm": 3.020341069181952, "language_loss": 0.74790835, "learning_rate": 3.566296021146657e-06, "loss": 0.77036005, "num_input_tokens_seen": 42090270, "step": 1973, "time_per_iteration": 2.476217746734619 }, { "auxiliary_loss_clip": 0.01215522, "auxiliary_loss_mlp": 0.01030141, "balance_loss_clip": 1.06330132, "balance_loss_mlp": 1.01981199, "epoch": 0.23735946612156555, "flos": 32708803380480.0, "grad_norm": 1.714424796244191, "language_loss": 0.73202336, "learning_rate": 3.565811510254652e-06, "loss": 0.75448, "num_input_tokens_seen": 42111150, "step": 1974, "time_per_iteration": 2.55385684967041 }, { "auxiliary_loss_clip": 0.01104221, "auxiliary_loss_mlp": 0.01022096, "balance_loss_clip": 1.0378381, "balance_loss_mlp": 1.01942599, "epoch": 0.23747970901220466, "flos": 70546944821760.0, "grad_norm": 0.8250801290332915, "language_loss": 0.58260167, "learning_rate": 3.5653267618302845e-06, "loss": 0.60386491, "num_input_tokens_seen": 42178730, "step": 1975, "time_per_iteration": 3.116600275039673 }, { "auxiliary_loss_clip": 0.01208879, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 1.05930519, "balance_loss_mlp": 1.02248716, "epoch": 0.23759995190284375, "flos": 20849807594880.0, "grad_norm": 2.0319083326706395, "language_loss": 0.85615057, "learning_rate": 3.564841775947093e-06, "loss": 0.87856209, "num_input_tokens_seen": 42199620, "step": 1976, "time_per_iteration": 2.463066577911377 }, { "auxiliary_loss_clip": 0.01164922, "auxiliary_loss_mlp": 0.01033955, "balance_loss_clip": 1.05365968, "balance_loss_mlp": 1.0239172, "epoch": 0.23772019479348283, "flos": 32921645420160.0, "grad_norm": 2.3155159880116485, "language_loss": 0.76085317, "learning_rate": 3.5643565526786475e-06, "loss": 0.78284192, "num_input_tokens_seen": 42219560, "step": 1977, "time_per_iteration": 2.707880735397339 }, { "auxiliary_loss_clip": 0.01212583, "auxiliary_loss_mlp": 0.01031647, "balance_loss_clip": 1.06224513, "balance_loss_mlp": 1.02216387, "epoch": 0.2378404376841219, "flos": 32342765834880.0, "grad_norm": 1.578334388010235, "language_loss": 0.77168584, "learning_rate": 3.5638710920985574e-06, "loss": 0.79412812, "num_input_tokens_seen": 42241020, "step": 1978, "time_per_iteration": 2.5489962100982666 }, { "auxiliary_loss_clip": 0.01202672, "auxiliary_loss_mlp": 0.00765138, "balance_loss_clip": 1.0582931, "balance_loss_mlp": 1.0010426, "epoch": 0.23796068057476102, "flos": 22997624313600.0, "grad_norm": 2.1064573105054776, "language_loss": 0.81533271, "learning_rate": 3.5633853942804655e-06, "loss": 0.83501077, "num_input_tokens_seen": 42259345, "step": 1979, "time_per_iteration": 2.4966232776641846 }, { "auxiliary_loss_clip": 0.01167636, "auxiliary_loss_mlp": 0.01033803, "balance_loss_clip": 1.05255985, "balance_loss_mlp": 1.02381372, "epoch": 0.2380809234654001, "flos": 13480938414720.0, "grad_norm": 3.389874887004538, "language_loss": 0.76879197, "learning_rate": 3.5628994592980527e-06, "loss": 0.79080635, "num_input_tokens_seen": 42277250, "step": 1980, "time_per_iteration": 2.5508618354797363 }, { "auxiliary_loss_clip": 0.0121223, "auxiliary_loss_mlp": 0.01031529, "balance_loss_clip": 1.06083322, "balance_loss_mlp": 1.02267718, "epoch": 0.2382011663560392, "flos": 16871803148160.0, "grad_norm": 1.9230295745879118, "language_loss": 0.70369422, "learning_rate": 3.562413287225034e-06, "loss": 0.7261318, "num_input_tokens_seen": 42295360, "step": 1981, "time_per_iteration": 2.4266204833984375 }, { "auxiliary_loss_clip": 0.01192312, "auxiliary_loss_mlp": 0.01029125, "balance_loss_clip": 1.05920398, "balance_loss_mlp": 1.01951706, "epoch": 0.2383214092466783, "flos": 18441135331200.0, "grad_norm": 2.189259462930049, "language_loss": 0.89515364, "learning_rate": 3.5619268781351623e-06, "loss": 0.91736799, "num_input_tokens_seen": 42313430, "step": 1982, "time_per_iteration": 2.4984307289123535 }, { "auxiliary_loss_clip": 0.01176238, "auxiliary_loss_mlp": 0.01031011, "balance_loss_clip": 1.05834305, "balance_loss_mlp": 1.02224338, "epoch": 0.23844165213731738, "flos": 19755717281280.0, "grad_norm": 2.076334799203889, "language_loss": 0.76870787, "learning_rate": 3.5614402321022256e-06, "loss": 0.79078031, "num_input_tokens_seen": 42331260, "step": 1983, "time_per_iteration": 2.522869110107422 }, { "auxiliary_loss_clip": 0.01143687, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.0510968, "balance_loss_mlp": 1.02254772, "epoch": 0.23856189502795647, "flos": 23367360960000.0, "grad_norm": 1.7014832108313558, "language_loss": 0.87191939, "learning_rate": 3.5609533492000463e-06, "loss": 0.89367479, "num_input_tokens_seen": 42350150, "step": 1984, "time_per_iteration": 2.61458158493042 }, { "auxiliary_loss_clip": 0.01177556, "auxiliary_loss_mlp": 0.01026957, "balance_loss_clip": 1.05840349, "balance_loss_mlp": 1.01724744, "epoch": 0.23868213791859555, "flos": 23475056912640.0, "grad_norm": 2.0577482234676463, "language_loss": 0.78511691, "learning_rate": 3.560466229502485e-06, "loss": 0.80716199, "num_input_tokens_seen": 42369495, "step": 1985, "time_per_iteration": 2.5246098041534424 }, { "auxiliary_loss_clip": 0.0117998, "auxiliary_loss_mlp": 0.00763875, "balance_loss_clip": 1.06073093, "balance_loss_mlp": 1.00109267, "epoch": 0.23880238080923466, "flos": 16617340224000.0, "grad_norm": 2.2642586534869737, "language_loss": 0.8974998, "learning_rate": 3.5599788730834384e-06, "loss": 0.91693836, "num_input_tokens_seen": 42387455, "step": 1986, "time_per_iteration": 2.4817707538604736 }, { "auxiliary_loss_clip": 0.01200265, "auxiliary_loss_mlp": 0.01029176, "balance_loss_clip": 1.0603857, "balance_loss_mlp": 1.01988411, "epoch": 0.23892262369987374, "flos": 17348409734400.0, "grad_norm": 2.670884152669051, "language_loss": 0.78372413, "learning_rate": 3.559491280016836e-06, "loss": 0.80601859, "num_input_tokens_seen": 42405400, "step": 1987, "time_per_iteration": 2.4722206592559814 }, { "auxiliary_loss_clip": 0.01182919, "auxiliary_loss_mlp": 0.01039264, "balance_loss_clip": 1.05933201, "balance_loss_mlp": 1.02882707, "epoch": 0.23904286659051283, "flos": 22309899540480.0, "grad_norm": 2.4691006407956957, "language_loss": 0.70876753, "learning_rate": 3.5590034503766465e-06, "loss": 0.73098934, "num_input_tokens_seen": 42425065, "step": 1988, "time_per_iteration": 2.526493549346924 }, { "auxiliary_loss_clip": 0.01210234, "auxiliary_loss_mlp": 0.01035053, "balance_loss_clip": 1.06063533, "balance_loss_mlp": 1.02621365, "epoch": 0.23916310948115194, "flos": 21178246579200.0, "grad_norm": 4.9314699812013405, "language_loss": 0.81078196, "learning_rate": 3.558515384236874e-06, "loss": 0.83323479, "num_input_tokens_seen": 42442495, "step": 1989, "time_per_iteration": 2.4772684574127197 }, { "auxiliary_loss_clip": 0.01157176, "auxiliary_loss_mlp": 0.00764762, "balance_loss_clip": 1.05485356, "balance_loss_mlp": 1.00118685, "epoch": 0.23928335237179102, "flos": 14137349506560.0, "grad_norm": 2.0335352429936515, "language_loss": 0.84043908, "learning_rate": 3.558027081671556e-06, "loss": 0.85965842, "num_input_tokens_seen": 42459480, "step": 1990, "time_per_iteration": 2.5206592082977295 }, { "auxiliary_loss_clip": 0.01200441, "auxiliary_loss_mlp": 0.01038769, "balance_loss_clip": 1.05949664, "balance_loss_mlp": 1.0281533, "epoch": 0.2394035952624301, "flos": 23769596436480.0, "grad_norm": 1.708095185118585, "language_loss": 0.69002587, "learning_rate": 3.557538542754769e-06, "loss": 0.71241796, "num_input_tokens_seen": 42479175, "step": 1991, "time_per_iteration": 3.338768243789673 }, { "auxiliary_loss_clip": 0.01212787, "auxiliary_loss_mlp": 0.01036353, "balance_loss_clip": 1.06277037, "balance_loss_mlp": 1.0266912, "epoch": 0.2395238381530692, "flos": 24206198250240.0, "grad_norm": 3.227478221003939, "language_loss": 0.66544425, "learning_rate": 3.557049767560623e-06, "loss": 0.68793571, "num_input_tokens_seen": 42498090, "step": 1992, "time_per_iteration": 2.4876692295074463 }, { "auxiliary_loss_clip": 0.01155904, "auxiliary_loss_mlp": 0.01032077, "balance_loss_clip": 1.05894113, "balance_loss_mlp": 1.02250493, "epoch": 0.2396440810437083, "flos": 25295763450240.0, "grad_norm": 2.274049790923728, "language_loss": 0.85965216, "learning_rate": 3.5565607561632655e-06, "loss": 0.88153195, "num_input_tokens_seen": 42516930, "step": 1993, "time_per_iteration": 2.623361825942993 }, { "auxiliary_loss_clip": 0.01177635, "auxiliary_loss_mlp": 0.01029115, "balance_loss_clip": 1.0570519, "balance_loss_mlp": 1.01871979, "epoch": 0.23976432393434738, "flos": 28543093436160.0, "grad_norm": 2.644441143435662, "language_loss": 0.79429114, "learning_rate": 3.5560715086368787e-06, "loss": 0.81635857, "num_input_tokens_seen": 42534800, "step": 1994, "time_per_iteration": 2.5808451175689697 }, { "auxiliary_loss_clip": 0.01175713, "auxiliary_loss_mlp": 0.01033302, "balance_loss_clip": 1.0576731, "balance_loss_mlp": 1.02399766, "epoch": 0.23988456682498646, "flos": 19494358945920.0, "grad_norm": 2.2746090547770277, "language_loss": 0.82297271, "learning_rate": 3.5555820250556816e-06, "loss": 0.84506285, "num_input_tokens_seen": 42552000, "step": 1995, "time_per_iteration": 2.533167839050293 }, { "auxiliary_loss_clip": 0.011888, "auxiliary_loss_mlp": 0.01035532, "balance_loss_clip": 1.06117821, "balance_loss_mlp": 1.02594709, "epoch": 0.24000480971562557, "flos": 20266331068800.0, "grad_norm": 2.381183328399539, "language_loss": 0.69149089, "learning_rate": 3.5550923054939278e-06, "loss": 0.71373427, "num_input_tokens_seen": 42571455, "step": 1996, "time_per_iteration": 2.5521271228790283 }, { "auxiliary_loss_clip": 0.01146469, "auxiliary_loss_mlp": 0.0103399, "balance_loss_clip": 1.05402184, "balance_loss_mlp": 1.02445889, "epoch": 0.24012505260626466, "flos": 25443176866560.0, "grad_norm": 2.0017799039157533, "language_loss": 0.74773115, "learning_rate": 3.5546023500259083e-06, "loss": 0.76953578, "num_input_tokens_seen": 42592550, "step": 1997, "time_per_iteration": 4.325376987457275 }, { "auxiliary_loss_clip": 0.0115644, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.05416584, "balance_loss_mlp": 1.02372873, "epoch": 0.24024529549690374, "flos": 15553342529280.0, "grad_norm": 4.136079711503388, "language_loss": 0.80801117, "learning_rate": 3.5541121587259477e-06, "loss": 0.82991016, "num_input_tokens_seen": 42610385, "step": 1998, "time_per_iteration": 3.39522123336792 }, { "auxiliary_loss_clip": 0.01120171, "auxiliary_loss_mlp": 0.01027313, "balance_loss_clip": 1.04854107, "balance_loss_mlp": 1.02526307, "epoch": 0.24036553838754285, "flos": 57122351867520.0, "grad_norm": 0.8263216891795258, "language_loss": 0.57880324, "learning_rate": 3.553621731668408e-06, "loss": 0.60027814, "num_input_tokens_seen": 42673595, "step": 1999, "time_per_iteration": 3.084599256515503 }, { "auxiliary_loss_clip": 0.01190031, "auxiliary_loss_mlp": 0.0103552, "balance_loss_clip": 1.05644226, "balance_loss_mlp": 1.02538168, "epoch": 0.24048578127818193, "flos": 24969946158720.0, "grad_norm": 2.173614219858502, "language_loss": 0.83147275, "learning_rate": 3.553131068927688e-06, "loss": 0.85372829, "num_input_tokens_seen": 42692000, "step": 2000, "time_per_iteration": 2.5484702587127686 }, { "auxiliary_loss_clip": 0.01168126, "auxiliary_loss_mlp": 0.01033877, "balance_loss_clip": 1.0582186, "balance_loss_mlp": 1.02546728, "epoch": 0.24060602416882101, "flos": 23330947547520.0, "grad_norm": 1.7355793923444756, "language_loss": 0.80381954, "learning_rate": 3.552640170578219e-06, "loss": 0.82583958, "num_input_tokens_seen": 42712250, "step": 2001, "time_per_iteration": 2.5578243732452393 }, { "auxiliary_loss_clip": 0.01184208, "auxiliary_loss_mlp": 0.01042265, "balance_loss_clip": 1.0605073, "balance_loss_mlp": 1.03318691, "epoch": 0.2407262670594601, "flos": 14173260128640.0, "grad_norm": 2.11599837347217, "language_loss": 0.77785522, "learning_rate": 3.5521490366944703e-06, "loss": 0.80012, "num_input_tokens_seen": 42729900, "step": 2002, "time_per_iteration": 2.518450975418091 }, { "auxiliary_loss_clip": 0.01169127, "auxiliary_loss_mlp": 0.0103176, "balance_loss_clip": 1.05627108, "balance_loss_mlp": 1.02252769, "epoch": 0.2408465099500992, "flos": 13663113217920.0, "grad_norm": 2.6119038656090794, "language_loss": 0.79861313, "learning_rate": 3.5516576673509474e-06, "loss": 0.82062197, "num_input_tokens_seen": 42747900, "step": 2003, "time_per_iteration": 2.53810977935791 }, { "auxiliary_loss_clip": 0.01215718, "auxiliary_loss_mlp": 0.01038508, "balance_loss_clip": 1.06410551, "balance_loss_mlp": 1.0290246, "epoch": 0.2409667528407383, "flos": 31248029076480.0, "grad_norm": 2.0195210994399795, "language_loss": 0.86382061, "learning_rate": 3.5511660626221896e-06, "loss": 0.88636291, "num_input_tokens_seen": 42768540, "step": 2004, "time_per_iteration": 2.553990364074707 }, { "auxiliary_loss_clip": 0.01182337, "auxiliary_loss_mlp": 0.00764808, "balance_loss_clip": 1.06022203, "balance_loss_mlp": 1.00137043, "epoch": 0.24108699573137737, "flos": 22199941031040.0, "grad_norm": 2.370699556953951, "language_loss": 0.89387584, "learning_rate": 3.5506742225827744e-06, "loss": 0.9133473, "num_input_tokens_seen": 42785395, "step": 2005, "time_per_iteration": 2.5323922634124756 }, { "auxiliary_loss_clip": 0.01170277, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.05847931, "balance_loss_mlp": 1.02695262, "epoch": 0.24120723862201648, "flos": 26103035664000.0, "grad_norm": 2.1499744919736767, "language_loss": 0.9029125, "learning_rate": 3.5501821473073116e-06, "loss": 0.92498481, "num_input_tokens_seen": 42801980, "step": 2006, "time_per_iteration": 2.579784870147705 }, { "auxiliary_loss_clip": 0.01162718, "auxiliary_loss_mlp": 0.01042249, "balance_loss_clip": 1.05691576, "balance_loss_mlp": 1.03118014, "epoch": 0.24132748151265557, "flos": 18624926246400.0, "grad_norm": 2.021978399084634, "language_loss": 0.86715221, "learning_rate": 3.54968983687045e-06, "loss": 0.88920188, "num_input_tokens_seen": 42818850, "step": 2007, "time_per_iteration": 2.555131196975708 }, { "auxiliary_loss_clip": 0.0118702, "auxiliary_loss_mlp": 0.0104759, "balance_loss_clip": 1.06143117, "balance_loss_mlp": 1.03646731, "epoch": 0.24144772440329465, "flos": 15267673664640.0, "grad_norm": 2.45496738679731, "language_loss": 0.88954461, "learning_rate": 3.549197291346872e-06, "loss": 0.91189069, "num_input_tokens_seen": 42835375, "step": 2008, "time_per_iteration": 2.5052084922790527 }, { "auxiliary_loss_clip": 0.01200798, "auxiliary_loss_mlp": 0.01038247, "balance_loss_clip": 1.06058848, "balance_loss_mlp": 1.02877605, "epoch": 0.24156796729393373, "flos": 24024274842240.0, "grad_norm": 2.2877681148079785, "language_loss": 0.79204863, "learning_rate": 3.548704510811297e-06, "loss": 0.81443906, "num_input_tokens_seen": 42854570, "step": 2009, "time_per_iteration": 2.513949155807495 }, { "auxiliary_loss_clip": 0.01159791, "auxiliary_loss_mlp": 0.0104702, "balance_loss_clip": 1.05521989, "balance_loss_mlp": 1.03644013, "epoch": 0.24168821018457284, "flos": 26286790665600.0, "grad_norm": 2.1958659209856344, "language_loss": 0.75249618, "learning_rate": 3.5482114953384787e-06, "loss": 0.77456427, "num_input_tokens_seen": 42873800, "step": 2010, "time_per_iteration": 2.5988929271698 }, { "auxiliary_loss_clip": 0.01203421, "auxiliary_loss_mlp": 0.01041529, "balance_loss_clip": 1.06248856, "balance_loss_mlp": 1.03088951, "epoch": 0.24180845307521193, "flos": 18223193560320.0, "grad_norm": 3.09569395858439, "language_loss": 0.84602809, "learning_rate": 3.5477182450032077e-06, "loss": 0.86847758, "num_input_tokens_seen": 42892400, "step": 2011, "time_per_iteration": 2.4890429973602295 }, { "auxiliary_loss_clip": 0.01196911, "auxiliary_loss_mlp": 0.01042508, "balance_loss_clip": 1.06060362, "balance_loss_mlp": 1.03245819, "epoch": 0.241928695965851, "flos": 20449260057600.0, "grad_norm": 2.4190500559330257, "language_loss": 0.83505142, "learning_rate": 3.5472247598803097e-06, "loss": 0.85744566, "num_input_tokens_seen": 42911745, "step": 2012, "time_per_iteration": 2.4939379692077637 }, { "auxiliary_loss_clip": 0.01216312, "auxiliary_loss_mlp": 0.01041645, "balance_loss_clip": 1.06358266, "balance_loss_mlp": 1.03129733, "epoch": 0.24204893885649012, "flos": 25556475340800.0, "grad_norm": 2.5065725840685147, "language_loss": 0.85111445, "learning_rate": 3.546731040044645e-06, "loss": 0.873694, "num_input_tokens_seen": 42926915, "step": 2013, "time_per_iteration": 2.5154919624328613 }, { "auxiliary_loss_clip": 0.01214434, "auxiliary_loss_mlp": 0.01029225, "balance_loss_clip": 1.06319857, "balance_loss_mlp": 1.01926494, "epoch": 0.2421691817471292, "flos": 30660207004800.0, "grad_norm": 1.8008223679215087, "language_loss": 0.75196111, "learning_rate": 3.546237085571112e-06, "loss": 0.77439767, "num_input_tokens_seen": 42945350, "step": 2014, "time_per_iteration": 2.5209712982177734 }, { "auxiliary_loss_clip": 0.01201106, "auxiliary_loss_mlp": 0.01034831, "balance_loss_clip": 1.06409693, "balance_loss_mlp": 1.02533054, "epoch": 0.24228942463776829, "flos": 21945011230080.0, "grad_norm": 6.148319701296482, "language_loss": 0.72269219, "learning_rate": 3.5457428965346425e-06, "loss": 0.74505156, "num_input_tokens_seen": 42964290, "step": 2015, "time_per_iteration": 2.5096564292907715 }, { "auxiliary_loss_clip": 0.01137212, "auxiliary_loss_mlp": 0.01034901, "balance_loss_clip": 1.05390656, "balance_loss_mlp": 1.02496481, "epoch": 0.2424096675284074, "flos": 33984493879680.0, "grad_norm": 1.5657157676524969, "language_loss": 0.74686772, "learning_rate": 3.545248473010205e-06, "loss": 0.7685889, "num_input_tokens_seen": 42987095, "step": 2016, "time_per_iteration": 2.7369117736816406 }, { "auxiliary_loss_clip": 0.01218534, "auxiliary_loss_mlp": 0.00765327, "balance_loss_clip": 1.06256843, "balance_loss_mlp": 1.00142455, "epoch": 0.24252991041904648, "flos": 21653416621440.0, "grad_norm": 1.6743571227131462, "language_loss": 0.87753975, "learning_rate": 3.544753815072802e-06, "loss": 0.89737833, "num_input_tokens_seen": 43005750, "step": 2017, "time_per_iteration": 2.492324113845825 }, { "auxiliary_loss_clip": 0.01114044, "auxiliary_loss_mlp": 0.01033153, "balance_loss_clip": 1.04609799, "balance_loss_mlp": 1.02315176, "epoch": 0.24265015330968556, "flos": 21870065502720.0, "grad_norm": 1.9727245398722877, "language_loss": 0.88129169, "learning_rate": 3.544258922797474e-06, "loss": 0.90276372, "num_input_tokens_seen": 43023870, "step": 2018, "time_per_iteration": 3.4754931926727295 }, { "auxiliary_loss_clip": 0.01213558, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.06346726, "balance_loss_mlp": 1.02721834, "epoch": 0.24277039620032465, "flos": 25628260671360.0, "grad_norm": 1.5793490770632446, "language_loss": 0.77898896, "learning_rate": 3.543763796259295e-06, "loss": 0.80148989, "num_input_tokens_seen": 43043825, "step": 2019, "time_per_iteration": 2.499797821044922 }, { "auxiliary_loss_clip": 0.0119975, "auxiliary_loss_mlp": 0.01038526, "balance_loss_clip": 1.06118834, "balance_loss_mlp": 1.02820837, "epoch": 0.24289063909096376, "flos": 26286575184000.0, "grad_norm": 1.7661350007665524, "language_loss": 0.90859336, "learning_rate": 3.5432684355333754e-06, "loss": 0.93097615, "num_input_tokens_seen": 43062480, "step": 2020, "time_per_iteration": 2.6600663661956787 }, { "auxiliary_loss_clip": 0.01199005, "auxiliary_loss_mlp": 0.0103918, "balance_loss_clip": 1.05935287, "balance_loss_mlp": 1.02935088, "epoch": 0.24301088198160284, "flos": 25075056332160.0, "grad_norm": 4.423105025546754, "language_loss": 0.76537555, "learning_rate": 3.5427728406948613e-06, "loss": 0.7877574, "num_input_tokens_seen": 43081595, "step": 2021, "time_per_iteration": 2.519862651824951 }, { "auxiliary_loss_clip": 0.01106161, "auxiliary_loss_mlp": 0.01016923, "balance_loss_clip": 1.04079425, "balance_loss_mlp": 1.01457453, "epoch": 0.24313112487224192, "flos": 69900948673920.0, "grad_norm": 0.7546676576833723, "language_loss": 0.57935798, "learning_rate": 3.542277011818934e-06, "loss": 0.60058886, "num_input_tokens_seen": 43145430, "step": 2022, "time_per_iteration": 3.404602289199829 }, { "auxiliary_loss_clip": 0.01186284, "auxiliary_loss_mlp": 0.01035743, "balance_loss_clip": 1.06215978, "balance_loss_mlp": 1.02656412, "epoch": 0.24325136776288103, "flos": 40662334235520.0, "grad_norm": 2.093529315317973, "language_loss": 0.74164367, "learning_rate": 3.5417809489808104e-06, "loss": 0.76386392, "num_input_tokens_seen": 43167040, "step": 2023, "time_per_iteration": 3.550360679626465 }, { "auxiliary_loss_clip": 0.01201141, "auxiliary_loss_mlp": 0.01035041, "balance_loss_clip": 1.06249368, "balance_loss_mlp": 1.02625513, "epoch": 0.24337161065352012, "flos": 25046400257280.0, "grad_norm": 2.049317021195419, "language_loss": 0.72529352, "learning_rate": 3.5412846522557422e-06, "loss": 0.74765539, "num_input_tokens_seen": 43187930, "step": 2024, "time_per_iteration": 3.3872809410095215 }, { "auxiliary_loss_clip": 0.01214038, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.06300068, "balance_loss_mlp": 1.02782702, "epoch": 0.2434918535441592, "flos": 18661160090880.0, "grad_norm": 2.0911263302162486, "language_loss": 0.73977381, "learning_rate": 3.540788121719018e-06, "loss": 0.7622906, "num_input_tokens_seen": 43206350, "step": 2025, "time_per_iteration": 3.2318644523620605 }, { "auxiliary_loss_clip": 0.01161845, "auxiliary_loss_mlp": 0.01038611, "balance_loss_clip": 1.05774343, "balance_loss_mlp": 1.02838254, "epoch": 0.24361209643479828, "flos": 23915142345600.0, "grad_norm": 1.96263399866518, "language_loss": 0.81964219, "learning_rate": 3.5402913574459604e-06, "loss": 0.84164679, "num_input_tokens_seen": 43226255, "step": 2026, "time_per_iteration": 2.556619882583618 }, { "auxiliary_loss_clip": 0.01131489, "auxiliary_loss_mlp": 0.01037101, "balance_loss_clip": 1.0484376, "balance_loss_mlp": 1.02795196, "epoch": 0.2437323393254374, "flos": 28657505232000.0, "grad_norm": 1.5884391313971205, "language_loss": 0.86278164, "learning_rate": 3.5397943595119297e-06, "loss": 0.8844676, "num_input_tokens_seen": 43247675, "step": 2027, "time_per_iteration": 2.677842378616333 }, { "auxiliary_loss_clip": 0.01177597, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.05822158, "balance_loss_mlp": 1.02702475, "epoch": 0.24385258221607647, "flos": 23550325862400.0, "grad_norm": 2.9476236891476213, "language_loss": 0.77395499, "learning_rate": 3.5392971279923177e-06, "loss": 0.79610747, "num_input_tokens_seen": 43265895, "step": 2028, "time_per_iteration": 2.5468316078186035 }, { "auxiliary_loss_clip": 0.01161203, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.05382395, "balance_loss_mlp": 1.02217269, "epoch": 0.24397282510671556, "flos": 25336091445120.0, "grad_norm": 2.155010711190702, "language_loss": 0.82711738, "learning_rate": 3.5387996629625557e-06, "loss": 0.84906363, "num_input_tokens_seen": 43283485, "step": 2029, "time_per_iteration": 2.5875539779663086 }, { "auxiliary_loss_clip": 0.01136193, "auxiliary_loss_mlp": 0.01001549, "balance_loss_clip": 1.04746079, "balance_loss_mlp": 0.99948645, "epoch": 0.24409306799735467, "flos": 65187421430400.0, "grad_norm": 0.810109331667588, "language_loss": 0.55020881, "learning_rate": 3.5383019644981083e-06, "loss": 0.57158619, "num_input_tokens_seen": 43347180, "step": 2030, "time_per_iteration": 3.0763237476348877 }, { "auxiliary_loss_clip": 0.01182364, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.05807924, "balance_loss_mlp": 1.01784921, "epoch": 0.24421331088799375, "flos": 19537093152000.0, "grad_norm": 2.361511047805532, "language_loss": 0.72730899, "learning_rate": 3.5378040326744763e-06, "loss": 0.74941087, "num_input_tokens_seen": 43366665, "step": 2031, "time_per_iteration": 2.498952865600586 }, { "auxiliary_loss_clip": 0.01170289, "auxiliary_loss_mlp": 0.01033252, "balance_loss_clip": 1.05861604, "balance_loss_mlp": 1.02385211, "epoch": 0.24433355377863283, "flos": 21068575378560.0, "grad_norm": 2.4674421774485764, "language_loss": 0.85472143, "learning_rate": 3.5373058675671946e-06, "loss": 0.87675679, "num_input_tokens_seen": 43384670, "step": 2032, "time_per_iteration": 2.543724536895752 }, { "auxiliary_loss_clip": 0.0114541, "auxiliary_loss_mlp": 0.01033524, "balance_loss_clip": 1.05329907, "balance_loss_mlp": 1.02300346, "epoch": 0.24445379666927192, "flos": 22637189289600.0, "grad_norm": 2.7095208164151057, "language_loss": 0.72379386, "learning_rate": 3.536807469251836e-06, "loss": 0.74558318, "num_input_tokens_seen": 43403825, "step": 2033, "time_per_iteration": 2.586548328399658 }, { "auxiliary_loss_clip": 0.01172252, "auxiliary_loss_mlp": 0.01031929, "balance_loss_clip": 1.05458188, "balance_loss_mlp": 1.0223093, "epoch": 0.24457403955991103, "flos": 21251612108160.0, "grad_norm": 3.4754204057747735, "language_loss": 0.82476127, "learning_rate": 3.5363088378040055e-06, "loss": 0.84680307, "num_input_tokens_seen": 43422715, "step": 2034, "time_per_iteration": 2.580615997314453 }, { "auxiliary_loss_clip": 0.01133405, "auxiliary_loss_mlp": 0.00754292, "balance_loss_clip": 1.04519033, "balance_loss_mlp": 1.0009644, "epoch": 0.2446942824505501, "flos": 66997820764800.0, "grad_norm": 0.7546160432939819, "language_loss": 0.64419895, "learning_rate": 3.5358099732993463e-06, "loss": 0.66307592, "num_input_tokens_seen": 43481825, "step": 2035, "time_per_iteration": 2.9549691677093506 }, { "auxiliary_loss_clip": 0.01188764, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.05916369, "balance_loss_mlp": 1.02451491, "epoch": 0.2448145253411892, "flos": 20411122792320.0, "grad_norm": 4.0376159961893725, "language_loss": 0.89357877, "learning_rate": 3.535310875813535e-06, "loss": 0.91580617, "num_input_tokens_seen": 43500220, "step": 2036, "time_per_iteration": 2.54103422164917 }, { "auxiliary_loss_clip": 0.01195911, "auxiliary_loss_mlp": 0.01032595, "balance_loss_clip": 1.05919862, "balance_loss_mlp": 1.02315331, "epoch": 0.2449347682318283, "flos": 28804739080320.0, "grad_norm": 2.136723515496185, "language_loss": 0.81929231, "learning_rate": 3.5348115454222843e-06, "loss": 0.84157741, "num_input_tokens_seen": 43522805, "step": 2037, "time_per_iteration": 2.5451924800872803 }, { "auxiliary_loss_clip": 0.01178184, "auxiliary_loss_mlp": 0.01046084, "balance_loss_clip": 1.05475402, "balance_loss_mlp": 1.03624344, "epoch": 0.2450550111224674, "flos": 22528990546560.0, "grad_norm": 2.1410221616501515, "language_loss": 0.85668814, "learning_rate": 3.5343119822013425e-06, "loss": 0.87893081, "num_input_tokens_seen": 43541915, "step": 2038, "time_per_iteration": 2.524336338043213 }, { "auxiliary_loss_clip": 0.01204193, "auxiliary_loss_mlp": 0.01043001, "balance_loss_clip": 1.060987, "balance_loss_mlp": 1.03256416, "epoch": 0.24517525401310647, "flos": 21759137326080.0, "grad_norm": 3.1350782773695625, "language_loss": 0.77415615, "learning_rate": 3.533812186226493e-06, "loss": 0.79662812, "num_input_tokens_seen": 43562625, "step": 2039, "time_per_iteration": 2.4927306175231934 }, { "auxiliary_loss_clip": 0.01207834, "auxiliary_loss_mlp": 0.01031357, "balance_loss_clip": 1.05925858, "balance_loss_mlp": 1.0221417, "epoch": 0.24529549690374555, "flos": 25043311687680.0, "grad_norm": 1.976807956764818, "language_loss": 0.7574625, "learning_rate": 3.5333121575735545e-06, "loss": 0.77985442, "num_input_tokens_seen": 43582265, "step": 2040, "time_per_iteration": 2.467381715774536 }, { "auxiliary_loss_clip": 0.01179493, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.05795443, "balance_loss_mlp": 1.02340174, "epoch": 0.24541573979438466, "flos": 32123638915200.0, "grad_norm": 2.876660177603598, "language_loss": 0.75867587, "learning_rate": 3.532811896318381e-06, "loss": 0.78080618, "num_input_tokens_seen": 43604335, "step": 2041, "time_per_iteration": 2.621657609939575 }, { "auxiliary_loss_clip": 0.01170441, "auxiliary_loss_mlp": 0.01029322, "balance_loss_clip": 1.05528152, "balance_loss_mlp": 1.01926112, "epoch": 0.24553598268502375, "flos": 31357556622720.0, "grad_norm": 2.0825100759679556, "language_loss": 0.81932783, "learning_rate": 3.5323114025368615e-06, "loss": 0.84132552, "num_input_tokens_seen": 43619400, "step": 2042, "time_per_iteration": 2.6108646392822266 }, { "auxiliary_loss_clip": 0.01189871, "auxiliary_loss_mlp": 0.01030265, "balance_loss_clip": 1.05483294, "balance_loss_mlp": 1.02075195, "epoch": 0.24565622557566283, "flos": 14027462824320.0, "grad_norm": 2.184648304099261, "language_loss": 0.8168236, "learning_rate": 3.53181067630492e-06, "loss": 0.8390249, "num_input_tokens_seen": 43636870, "step": 2043, "time_per_iteration": 2.488929510116577 }, { "auxiliary_loss_clip": 0.01172107, "auxiliary_loss_mlp": 0.01039677, "balance_loss_clip": 1.0547843, "balance_loss_mlp": 1.03005052, "epoch": 0.24577646846630194, "flos": 16581465515520.0, "grad_norm": 1.8214350373296109, "language_loss": 0.76064312, "learning_rate": 3.5313097176985175e-06, "loss": 0.78276092, "num_input_tokens_seen": 43655180, "step": 2044, "time_per_iteration": 2.494130849838257 }, { "auxiliary_loss_clip": 0.01195588, "auxiliary_loss_mlp": 0.0103033, "balance_loss_clip": 1.05939984, "balance_loss_mlp": 1.02100754, "epoch": 0.24589671135694102, "flos": 18807424272000.0, "grad_norm": 1.8690621047120717, "language_loss": 0.8122102, "learning_rate": 3.5308085267936482e-06, "loss": 0.83446932, "num_input_tokens_seen": 43672895, "step": 2045, "time_per_iteration": 3.2762341499328613 }, { "auxiliary_loss_clip": 0.01135645, "auxiliary_loss_mlp": 0.00763297, "balance_loss_clip": 1.05259132, "balance_loss_mlp": 1.00127506, "epoch": 0.2460169542475801, "flos": 19938538529280.0, "grad_norm": 1.7637586934780578, "language_loss": 0.89747202, "learning_rate": 3.530307103666342e-06, "loss": 0.91646147, "num_input_tokens_seen": 43691975, "step": 2046, "time_per_iteration": 2.6238620281219482 }, { "auxiliary_loss_clip": 0.01172407, "auxiliary_loss_mlp": 0.01029838, "balance_loss_clip": 1.05668688, "balance_loss_mlp": 1.02040827, "epoch": 0.24613719713821922, "flos": 24171221381760.0, "grad_norm": 1.918258230136605, "language_loss": 0.80333829, "learning_rate": 3.5298054483926658e-06, "loss": 0.82536077, "num_input_tokens_seen": 43712670, "step": 2047, "time_per_iteration": 2.558009386062622 }, { "auxiliary_loss_clip": 0.01204154, "auxiliary_loss_mlp": 0.01034855, "balance_loss_clip": 1.06022596, "balance_loss_mlp": 1.02528846, "epoch": 0.2462574400288583, "flos": 30221055325440.0, "grad_norm": 1.946700773247387, "language_loss": 0.83041006, "learning_rate": 3.5293035610487187e-06, "loss": 0.85280013, "num_input_tokens_seen": 43732035, "step": 2048, "time_per_iteration": 2.5432701110839844 }, { "auxiliary_loss_clip": 0.01094238, "auxiliary_loss_mlp": 0.0100003, "balance_loss_clip": 1.03473091, "balance_loss_mlp": 0.99782497, "epoch": 0.24637768291949738, "flos": 68943030819840.0, "grad_norm": 0.7355315456194561, "language_loss": 0.62013125, "learning_rate": 3.5288014417106374e-06, "loss": 0.64107394, "num_input_tokens_seen": 43798055, "step": 2049, "time_per_iteration": 3.1210954189300537 }, { "auxiliary_loss_clip": 0.01164488, "auxiliary_loss_mlp": 0.01034967, "balance_loss_clip": 1.05463457, "balance_loss_mlp": 1.02562737, "epoch": 0.24649792581013646, "flos": 34383999922560.0, "grad_norm": 1.8033945803821139, "language_loss": 0.75461221, "learning_rate": 3.528299090454593e-06, "loss": 0.77660674, "num_input_tokens_seen": 43818590, "step": 2050, "time_per_iteration": 3.580585241317749 }, { "auxiliary_loss_clip": 0.01198203, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.05714953, "balance_loss_mlp": 1.02179265, "epoch": 0.24661816870077558, "flos": 19680448331520.0, "grad_norm": 2.4760690033290906, "language_loss": 0.8288486, "learning_rate": 3.527796507356792e-06, "loss": 0.85114563, "num_input_tokens_seen": 43832480, "step": 2051, "time_per_iteration": 3.2736921310424805 }, { "auxiliary_loss_clip": 0.01198782, "auxiliary_loss_mlp": 0.01035215, "balance_loss_clip": 1.05769539, "balance_loss_mlp": 1.02580333, "epoch": 0.24673841159141466, "flos": 20002279213440.0, "grad_norm": 2.8511891206930087, "language_loss": 0.90640426, "learning_rate": 3.527293692493475e-06, "loss": 0.9287442, "num_input_tokens_seen": 43848345, "step": 2052, "time_per_iteration": 3.175246000289917 }, { "auxiliary_loss_clip": 0.01197318, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.05705571, "balance_loss_mlp": 1.02398872, "epoch": 0.24685865448205374, "flos": 21646593037440.0, "grad_norm": 4.698295329890442, "language_loss": 0.73035491, "learning_rate": 3.52679064594092e-06, "loss": 0.75266886, "num_input_tokens_seen": 43865685, "step": 2053, "time_per_iteration": 2.487556219100952 }, { "auxiliary_loss_clip": 0.01135895, "auxiliary_loss_mlp": 0.01034459, "balance_loss_clip": 1.04321694, "balance_loss_mlp": 1.02548802, "epoch": 0.24697889737269285, "flos": 17960470508160.0, "grad_norm": 2.366456713382833, "language_loss": 0.74841893, "learning_rate": 3.5262873677754375e-06, "loss": 0.77012247, "num_input_tokens_seen": 43883690, "step": 2054, "time_per_iteration": 2.579397439956665 }, { "auxiliary_loss_clip": 0.01205296, "auxiliary_loss_mlp": 0.01033834, "balance_loss_clip": 1.05769897, "balance_loss_mlp": 1.02456522, "epoch": 0.24709914026333193, "flos": 27344611221120.0, "grad_norm": 1.5794420879481634, "language_loss": 0.80695426, "learning_rate": 3.5257838580733745e-06, "loss": 0.82934558, "num_input_tokens_seen": 43903295, "step": 2055, "time_per_iteration": 2.513293743133545 }, { "auxiliary_loss_clip": 0.01198354, "auxiliary_loss_mlp": 0.01034304, "balance_loss_clip": 1.05797529, "balance_loss_mlp": 1.02474952, "epoch": 0.24721938315397102, "flos": 19275519335040.0, "grad_norm": 2.2452749283039934, "language_loss": 0.87138367, "learning_rate": 3.5252801169111138e-06, "loss": 0.89371026, "num_input_tokens_seen": 43920960, "step": 2056, "time_per_iteration": 2.4872045516967773 }, { "auxiliary_loss_clip": 0.01176834, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.05707264, "balance_loss_mlp": 1.02405167, "epoch": 0.2473396260446101, "flos": 23185796688000.0, "grad_norm": 1.7779788533585361, "language_loss": 0.79825628, "learning_rate": 3.524776144365072e-06, "loss": 0.82035679, "num_input_tokens_seen": 43939415, "step": 2057, "time_per_iteration": 2.544431209564209 }, { "auxiliary_loss_clip": 0.01172434, "auxiliary_loss_mlp": 0.01037819, "balance_loss_clip": 1.05627775, "balance_loss_mlp": 1.02811563, "epoch": 0.2474598689352492, "flos": 21142443697920.0, "grad_norm": 1.747216171799763, "language_loss": 0.79203486, "learning_rate": 3.5242719405117016e-06, "loss": 0.8141374, "num_input_tokens_seen": 43959220, "step": 2058, "time_per_iteration": 2.553508996963501 }, { "auxiliary_loss_clip": 0.01183398, "auxiliary_loss_mlp": 0.00764941, "balance_loss_clip": 1.05795598, "balance_loss_mlp": 1.00136673, "epoch": 0.2475801118258883, "flos": 21648352803840.0, "grad_norm": 2.4071583400354455, "language_loss": 0.75201744, "learning_rate": 3.5237675054274893e-06, "loss": 0.77150083, "num_input_tokens_seen": 43978420, "step": 2059, "time_per_iteration": 2.564394235610962 }, { "auxiliary_loss_clip": 0.01194892, "auxiliary_loss_mlp": 0.01036708, "balance_loss_clip": 1.05726445, "balance_loss_mlp": 1.02686167, "epoch": 0.24770035471652738, "flos": 22674500542080.0, "grad_norm": 1.9041185073414162, "language_loss": 0.80239081, "learning_rate": 3.5232628391889584e-06, "loss": 0.82470685, "num_input_tokens_seen": 43996710, "step": 2060, "time_per_iteration": 2.5405423641204834 }, { "auxiliary_loss_clip": 0.01147297, "auxiliary_loss_mlp": 0.01029758, "balance_loss_clip": 1.05293083, "balance_loss_mlp": 1.02075791, "epoch": 0.2478205976071665, "flos": 22163814927360.0, "grad_norm": 2.552710114290554, "language_loss": 0.64309239, "learning_rate": 3.522757941872666e-06, "loss": 0.66486293, "num_input_tokens_seen": 44014865, "step": 2061, "time_per_iteration": 2.6115903854370117 }, { "auxiliary_loss_clip": 0.0121193, "auxiliary_loss_mlp": 0.00764735, "balance_loss_clip": 1.06299806, "balance_loss_mlp": 1.00151336, "epoch": 0.24794084049780557, "flos": 24973106555520.0, "grad_norm": 1.9459748099389635, "language_loss": 0.82584316, "learning_rate": 3.5222528135552042e-06, "loss": 0.84560978, "num_input_tokens_seen": 44036325, "step": 2062, "time_per_iteration": 2.4948155879974365 }, { "auxiliary_loss_clip": 0.01193669, "auxiliary_loss_mlp": 0.01038304, "balance_loss_clip": 1.05974042, "balance_loss_mlp": 1.02858233, "epoch": 0.24806108338844465, "flos": 18296379521280.0, "grad_norm": 2.0349655137334497, "language_loss": 0.80289996, "learning_rate": 3.521747454313201e-06, "loss": 0.82521963, "num_input_tokens_seen": 44055005, "step": 2063, "time_per_iteration": 2.5140719413757324 }, { "auxiliary_loss_clip": 0.01154758, "auxiliary_loss_mlp": 0.01030696, "balance_loss_clip": 1.04835868, "balance_loss_mlp": 1.02133822, "epoch": 0.24818132627908374, "flos": 19282163351040.0, "grad_norm": 1.8442731566255053, "language_loss": 0.66645384, "learning_rate": 3.521241864223319e-06, "loss": 0.68830836, "num_input_tokens_seen": 44073965, "step": 2064, "time_per_iteration": 2.591148853302002 }, { "auxiliary_loss_clip": 0.01099617, "auxiliary_loss_mlp": 0.01012281, "balance_loss_clip": 1.03159118, "balance_loss_mlp": 1.00978971, "epoch": 0.24830156916972285, "flos": 70285837881600.0, "grad_norm": 0.7902384694965982, "language_loss": 0.62013566, "learning_rate": 3.5207360433622552e-06, "loss": 0.64125466, "num_input_tokens_seen": 44135965, "step": 2065, "time_per_iteration": 3.106696128845215 }, { "auxiliary_loss_clip": 0.0117789, "auxiliary_loss_mlp": 0.01036297, "balance_loss_clip": 1.05839241, "balance_loss_mlp": 1.02717733, "epoch": 0.24842181206036193, "flos": 40409128287360.0, "grad_norm": 2.7841112066361813, "language_loss": 0.74404866, "learning_rate": 3.5202299918067437e-06, "loss": 0.76619053, "num_input_tokens_seen": 44159560, "step": 2066, "time_per_iteration": 2.7093608379364014 }, { "auxiliary_loss_clip": 0.01191923, "auxiliary_loss_mlp": 0.01029238, "balance_loss_clip": 1.0574069, "balance_loss_mlp": 1.02025592, "epoch": 0.248542054951001, "flos": 20082432412800.0, "grad_norm": 2.520681619220884, "language_loss": 0.69171578, "learning_rate": 3.519723709633551e-06, "loss": 0.71392739, "num_input_tokens_seen": 44178320, "step": 2067, "time_per_iteration": 2.506783962249756 }, { "auxiliary_loss_clip": 0.01176716, "auxiliary_loss_mlp": 0.01034366, "balance_loss_clip": 1.05890632, "balance_loss_mlp": 1.02412593, "epoch": 0.24866229784164012, "flos": 23513948363520.0, "grad_norm": 2.2137245662377234, "language_loss": 0.83609009, "learning_rate": 3.519217196919479e-06, "loss": 0.85820091, "num_input_tokens_seen": 44197305, "step": 2068, "time_per_iteration": 2.6375486850738525 }, { "auxiliary_loss_clip": 0.01187801, "auxiliary_loss_mlp": 0.01036683, "balance_loss_clip": 1.06230497, "balance_loss_mlp": 1.0276525, "epoch": 0.2487825407322792, "flos": 19865101173120.0, "grad_norm": 1.7368374066876753, "language_loss": 0.73603821, "learning_rate": 3.518710453741367e-06, "loss": 0.75828302, "num_input_tokens_seen": 44216505, "step": 2069, "time_per_iteration": 2.523452043533325 }, { "auxiliary_loss_clip": 0.01171708, "auxiliary_loss_mlp": 0.0076475, "balance_loss_clip": 1.0539875, "balance_loss_mlp": 1.00146341, "epoch": 0.2489027836229183, "flos": 22017622573440.0, "grad_norm": 2.8041739847238407, "language_loss": 0.67833257, "learning_rate": 3.518203480176086e-06, "loss": 0.69769704, "num_input_tokens_seen": 44235435, "step": 2070, "time_per_iteration": 2.554170846939087 }, { "auxiliary_loss_clip": 0.01118645, "auxiliary_loss_mlp": 0.01041243, "balance_loss_clip": 1.04649854, "balance_loss_mlp": 1.03166485, "epoch": 0.2490230265135574, "flos": 23294354567040.0, "grad_norm": 1.7876822009766662, "language_loss": 0.80530941, "learning_rate": 3.517696276300545e-06, "loss": 0.82690835, "num_input_tokens_seen": 44256975, "step": 2071, "time_per_iteration": 2.6718549728393555 }, { "auxiliary_loss_clip": 0.01201026, "auxiliary_loss_mlp": 0.0103944, "balance_loss_clip": 1.06533027, "balance_loss_mlp": 1.02928329, "epoch": 0.24914326940419648, "flos": 19826784339840.0, "grad_norm": 2.690801197211992, "language_loss": 0.6918785, "learning_rate": 3.517188842191685e-06, "loss": 0.71428317, "num_input_tokens_seen": 44275125, "step": 2072, "time_per_iteration": 3.311549425125122 }, { "auxiliary_loss_clip": 0.01193792, "auxiliary_loss_mlp": 0.01030596, "balance_loss_clip": 1.05785728, "balance_loss_mlp": 1.02098203, "epoch": 0.24926351229483557, "flos": 20229271211520.0, "grad_norm": 1.6511159642335216, "language_loss": 0.73940581, "learning_rate": 3.5166811779264837e-06, "loss": 0.76164967, "num_input_tokens_seen": 44295445, "step": 2073, "time_per_iteration": 2.5805375576019287 }, { "auxiliary_loss_clip": 0.01209578, "auxiliary_loss_mlp": 0.01032976, "balance_loss_clip": 1.0592593, "balance_loss_mlp": 1.02302814, "epoch": 0.24938375518547465, "flos": 23294570048640.0, "grad_norm": 1.7831994857452753, "language_loss": 0.77649891, "learning_rate": 3.5161732835819545e-06, "loss": 0.79892445, "num_input_tokens_seen": 44314755, "step": 2074, "time_per_iteration": 2.480890989303589 }, { "auxiliary_loss_clip": 0.01213042, "auxiliary_loss_mlp": 0.01029485, "balance_loss_clip": 1.06322157, "balance_loss_mlp": 1.02025867, "epoch": 0.24950399807611376, "flos": 17311673099520.0, "grad_norm": 1.7685037465490576, "language_loss": 0.83107197, "learning_rate": 3.515665159235143e-06, "loss": 0.85349727, "num_input_tokens_seen": 44333640, "step": 2075, "time_per_iteration": 2.4774997234344482 }, { "auxiliary_loss_clip": 0.01172965, "auxiliary_loss_mlp": 0.01026145, "balance_loss_clip": 1.05004966, "balance_loss_mlp": 1.01793146, "epoch": 0.24962424096675284, "flos": 19024863252480.0, "grad_norm": 1.5841661672194396, "language_loss": 0.74937916, "learning_rate": 3.5151568049631318e-06, "loss": 0.77137029, "num_input_tokens_seen": 44352355, "step": 2076, "time_per_iteration": 2.553252696990967 }, { "auxiliary_loss_clip": 0.01209839, "auxiliary_loss_mlp": 0.01031618, "balance_loss_clip": 1.05929852, "balance_loss_mlp": 1.02175391, "epoch": 0.24974448385739192, "flos": 33398790710400.0, "grad_norm": 1.7465785249407617, "language_loss": 0.80122817, "learning_rate": 3.5146482208430385e-06, "loss": 0.82364279, "num_input_tokens_seen": 44374185, "step": 2077, "time_per_iteration": 3.498643159866333 }, { "auxiliary_loss_clip": 0.01127088, "auxiliary_loss_mlp": 0.0103391, "balance_loss_clip": 1.04837871, "balance_loss_mlp": 1.02331853, "epoch": 0.24986472674803104, "flos": 30007279532160.0, "grad_norm": 1.8325132885915663, "language_loss": 0.67639148, "learning_rate": 3.514139406952014e-06, "loss": 0.69800144, "num_input_tokens_seen": 44396210, "step": 2078, "time_per_iteration": 3.4114110469818115 }, { "auxiliary_loss_clip": 0.01195756, "auxiliary_loss_mlp": 0.01031752, "balance_loss_clip": 1.06046069, "balance_loss_mlp": 1.02257836, "epoch": 0.24998496963867012, "flos": 26613074833920.0, "grad_norm": 1.767743304935125, "language_loss": 0.83352041, "learning_rate": 3.5136303633672454e-06, "loss": 0.85579538, "num_input_tokens_seen": 44416340, "step": 2079, "time_per_iteration": 2.549199342727661 }, { "auxiliary_loss_clip": 0.01174866, "auxiliary_loss_mlp": 0.00764589, "balance_loss_clip": 1.05822933, "balance_loss_mlp": 1.00126028, "epoch": 0.25010521252930923, "flos": 23553989049600.0, "grad_norm": 1.763635891382621, "language_loss": 0.74358666, "learning_rate": 3.5131210901659544e-06, "loss": 0.76298118, "num_input_tokens_seen": 44438095, "step": 2080, "time_per_iteration": 2.6323556900024414 }, { "auxiliary_loss_clip": 0.01156598, "auxiliary_loss_mlp": 0.01032041, "balance_loss_clip": 1.0512861, "balance_loss_mlp": 1.02250409, "epoch": 0.2502254554199483, "flos": 23441193365760.0, "grad_norm": 2.245331530783253, "language_loss": 0.81737947, "learning_rate": 3.5126115874253967e-06, "loss": 0.83926582, "num_input_tokens_seen": 44457650, "step": 2081, "time_per_iteration": 2.5791807174682617 }, { "auxiliary_loss_clip": 0.01166006, "auxiliary_loss_mlp": 0.0103492, "balance_loss_clip": 1.05755448, "balance_loss_mlp": 1.02504325, "epoch": 0.2503456983105874, "flos": 28761681651840.0, "grad_norm": 2.027573687189531, "language_loss": 0.80711162, "learning_rate": 3.5121018552228644e-06, "loss": 0.82912087, "num_input_tokens_seen": 44476155, "step": 2082, "time_per_iteration": 2.6169235706329346 }, { "auxiliary_loss_clip": 0.01166833, "auxiliary_loss_mlp": 0.01030079, "balance_loss_clip": 1.05506468, "balance_loss_mlp": 1.02041078, "epoch": 0.2504659412012265, "flos": 18770256673920.0, "grad_norm": 2.029972118376554, "language_loss": 0.76470101, "learning_rate": 3.5115918936356827e-06, "loss": 0.78667009, "num_input_tokens_seen": 44492910, "step": 2083, "time_per_iteration": 2.552333354949951 }, { "auxiliary_loss_clip": 0.01147884, "auxiliary_loss_mlp": 0.01036163, "balance_loss_clip": 1.0536449, "balance_loss_mlp": 1.02694845, "epoch": 0.25058618409186556, "flos": 16873383346560.0, "grad_norm": 2.370308756378618, "language_loss": 0.78755164, "learning_rate": 3.5110817027412123e-06, "loss": 0.80939209, "num_input_tokens_seen": 44512000, "step": 2084, "time_per_iteration": 2.5458931922912598 }, { "auxiliary_loss_clip": 0.01157185, "auxiliary_loss_mlp": 0.01030422, "balance_loss_clip": 1.05020928, "balance_loss_mlp": 1.02140415, "epoch": 0.25070642698250467, "flos": 24425540651520.0, "grad_norm": 2.8180417453343094, "language_loss": 0.68780911, "learning_rate": 3.5105712826168493e-06, "loss": 0.70968521, "num_input_tokens_seen": 44531650, "step": 2085, "time_per_iteration": 2.596689224243164 }, { "auxiliary_loss_clip": 0.01192986, "auxiliary_loss_mlp": 0.00763298, "balance_loss_clip": 1.05675495, "balance_loss_mlp": 1.00116539, "epoch": 0.2508266698731437, "flos": 20260944028800.0, "grad_norm": 1.9214801655449527, "language_loss": 0.7067641, "learning_rate": 3.5100606333400235e-06, "loss": 0.72632694, "num_input_tokens_seen": 44548785, "step": 2086, "time_per_iteration": 2.499660015106201 }, { "auxiliary_loss_clip": 0.01191625, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.05773067, "balance_loss_mlp": 1.02131391, "epoch": 0.25094691276378284, "flos": 19245318975360.0, "grad_norm": 2.9429969861002228, "language_loss": 0.77205259, "learning_rate": 3.5095497549882006e-06, "loss": 0.7942903, "num_input_tokens_seen": 44567230, "step": 2087, "time_per_iteration": 2.5072739124298096 }, { "auxiliary_loss_clip": 0.01200347, "auxiliary_loss_mlp": 0.01029978, "balance_loss_clip": 1.06318438, "balance_loss_mlp": 1.02018487, "epoch": 0.25106715565442195, "flos": 26943237671040.0, "grad_norm": 2.52724869028484, "language_loss": 0.7207886, "learning_rate": 3.50903864763888e-06, "loss": 0.74309188, "num_input_tokens_seen": 44588020, "step": 2088, "time_per_iteration": 2.5900368690490723 }, { "auxiliary_loss_clip": 0.01200584, "auxiliary_loss_mlp": 0.01031707, "balance_loss_clip": 1.05949688, "balance_loss_mlp": 1.02236116, "epoch": 0.251187398545061, "flos": 48359570572800.0, "grad_norm": 1.9646511295915956, "language_loss": 0.75727779, "learning_rate": 3.5085273113695965e-06, "loss": 0.77960068, "num_input_tokens_seen": 44612590, "step": 2089, "time_per_iteration": 2.723231077194214 }, { "auxiliary_loss_clip": 0.01210739, "auxiliary_loss_mlp": 0.01034052, "balance_loss_clip": 1.06029487, "balance_loss_mlp": 1.02427125, "epoch": 0.2513076414357001, "flos": 27016100409600.0, "grad_norm": 1.789578014643017, "language_loss": 0.78525037, "learning_rate": 3.508015746257919e-06, "loss": 0.80769825, "num_input_tokens_seen": 44631630, "step": 2090, "time_per_iteration": 2.5166873931884766 }, { "auxiliary_loss_clip": 0.0116848, "auxiliary_loss_mlp": 0.01034886, "balance_loss_clip": 1.05636525, "balance_loss_mlp": 1.02509272, "epoch": 0.2514278843263392, "flos": 19463619882240.0, "grad_norm": 2.0204404147046375, "language_loss": 0.83061755, "learning_rate": 3.5075039523814518e-06, "loss": 0.85265118, "num_input_tokens_seen": 44650820, "step": 2091, "time_per_iteration": 2.564723491668701 }, { "auxiliary_loss_clip": 0.01200617, "auxiliary_loss_mlp": 0.01031737, "balance_loss_clip": 1.05775988, "balance_loss_mlp": 1.0213418, "epoch": 0.2515481272169783, "flos": 16866092885760.0, "grad_norm": 2.188227869645875, "language_loss": 0.81560993, "learning_rate": 3.506991929817834e-06, "loss": 0.83793342, "num_input_tokens_seen": 44667540, "step": 2092, "time_per_iteration": 2.5234711170196533 }, { "auxiliary_loss_clip": 0.01206147, "auxiliary_loss_mlp": 0.0102999, "balance_loss_clip": 1.06064868, "balance_loss_mlp": 1.02123988, "epoch": 0.2516683701076174, "flos": 23732464752000.0, "grad_norm": 1.777410296667334, "language_loss": 0.82636571, "learning_rate": 3.506479678644738e-06, "loss": 0.84872711, "num_input_tokens_seen": 44687935, "step": 2093, "time_per_iteration": 2.471876621246338 }, { "auxiliary_loss_clip": 0.0114097, "auxiliary_loss_mlp": 0.01029315, "balance_loss_clip": 1.05137801, "balance_loss_mlp": 1.02036881, "epoch": 0.2517886129982565, "flos": 27635954434560.0, "grad_norm": 2.463330777144152, "language_loss": 0.73889148, "learning_rate": 3.505967198939873e-06, "loss": 0.76059437, "num_input_tokens_seen": 44704975, "step": 2094, "time_per_iteration": 2.62829852104187 }, { "auxiliary_loss_clip": 0.01174546, "auxiliary_loss_mlp": 0.01028558, "balance_loss_clip": 1.05231702, "balance_loss_mlp": 1.01917052, "epoch": 0.25190885588889556, "flos": 38104596529920.0, "grad_norm": 1.9723883693400794, "language_loss": 0.77958047, "learning_rate": 3.5054544907809813e-06, "loss": 0.80161142, "num_input_tokens_seen": 44725475, "step": 2095, "time_per_iteration": 2.6456923484802246 }, { "auxiliary_loss_clip": 0.01177222, "auxiliary_loss_mlp": 0.00764623, "balance_loss_clip": 1.05789363, "balance_loss_mlp": 1.00136232, "epoch": 0.25202909877953467, "flos": 22269894768000.0, "grad_norm": 2.785930989601767, "language_loss": 0.80698156, "learning_rate": 3.50494155424584e-06, "loss": 0.82640004, "num_input_tokens_seen": 44744380, "step": 2096, "time_per_iteration": 2.55389404296875 }, { "auxiliary_loss_clip": 0.01198467, "auxiliary_loss_mlp": 0.01032096, "balance_loss_clip": 1.05908585, "balance_loss_mlp": 1.02257681, "epoch": 0.2521493416701738, "flos": 21761759018880.0, "grad_norm": 1.623382079299628, "language_loss": 0.83419585, "learning_rate": 3.504428389412262e-06, "loss": 0.85650146, "num_input_tokens_seen": 44765190, "step": 2097, "time_per_iteration": 2.52713942527771 }, { "auxiliary_loss_clip": 0.01190967, "auxiliary_loss_mlp": 0.01033106, "balance_loss_clip": 1.05650091, "balance_loss_mlp": 1.02393866, "epoch": 0.25226958456081283, "flos": 27746738956800.0, "grad_norm": 2.2312354328634574, "language_loss": 0.72665489, "learning_rate": 3.5039149963580927e-06, "loss": 0.74889559, "num_input_tokens_seen": 44785210, "step": 2098, "time_per_iteration": 3.3202974796295166 }, { "auxiliary_loss_clip": 0.01174854, "auxiliary_loss_mlp": 0.01033435, "balance_loss_clip": 1.05871892, "balance_loss_mlp": 1.02432728, "epoch": 0.25238982745145194, "flos": 30732171903360.0, "grad_norm": 2.281896727527845, "language_loss": 0.70450383, "learning_rate": 3.503401375161215e-06, "loss": 0.72658676, "num_input_tokens_seen": 44804955, "step": 2099, "time_per_iteration": 2.578850269317627 }, { "auxiliary_loss_clip": 0.0120508, "auxiliary_loss_mlp": 0.01030092, "balance_loss_clip": 1.05811071, "balance_loss_mlp": 1.0211755, "epoch": 0.252510070342091, "flos": 20266331068800.0, "grad_norm": 1.7796924308867659, "language_loss": 0.83597136, "learning_rate": 3.502887525899544e-06, "loss": 0.85832304, "num_input_tokens_seen": 44823935, "step": 2100, "time_per_iteration": 2.46354341506958 }, { "auxiliary_loss_clip": 0.01180265, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 1.05734277, "balance_loss_mlp": 1.01944029, "epoch": 0.2526303132327301, "flos": 22747399194240.0, "grad_norm": 1.681258631645701, "language_loss": 0.82519603, "learning_rate": 3.50237344865103e-06, "loss": 0.84729064, "num_input_tokens_seen": 44844935, "step": 2101, "time_per_iteration": 2.5621910095214844 }, { "auxiliary_loss_clip": 0.01210722, "auxiliary_loss_mlp": 0.0103736, "balance_loss_clip": 1.06085944, "balance_loss_mlp": 1.02814507, "epoch": 0.2527505561233692, "flos": 30263466309120.0, "grad_norm": 3.0287617960629722, "language_loss": 0.76394808, "learning_rate": 3.501859143493658e-06, "loss": 0.78642887, "num_input_tokens_seen": 44865565, "step": 2102, "time_per_iteration": 2.5238852500915527 }, { "auxiliary_loss_clip": 0.01127548, "auxiliary_loss_mlp": 0.01004128, "balance_loss_clip": 1.04237247, "balance_loss_mlp": 1.00212514, "epoch": 0.2528707990140083, "flos": 58492917164160.0, "grad_norm": 0.9198774542697352, "language_loss": 0.60590291, "learning_rate": 3.5013446105054488e-06, "loss": 0.62721968, "num_input_tokens_seen": 44918485, "step": 2103, "time_per_iteration": 2.8064167499542236 }, { "auxiliary_loss_clip": 0.01149734, "auxiliary_loss_mlp": 0.01036195, "balance_loss_clip": 1.053478, "balance_loss_mlp": 1.02671242, "epoch": 0.2529910419046474, "flos": 24645134448000.0, "grad_norm": 2.689947959380527, "language_loss": 0.74874812, "learning_rate": 3.5008298497644555e-06, "loss": 0.77060741, "num_input_tokens_seen": 44937530, "step": 2104, "time_per_iteration": 3.510887384414673 }, { "auxiliary_loss_clip": 0.01168663, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.05809307, "balance_loss_mlp": 1.02478135, "epoch": 0.2531112847952865, "flos": 23842135952640.0, "grad_norm": 1.620817927311642, "language_loss": 0.88023871, "learning_rate": 3.500314861348767e-06, "loss": 0.90227336, "num_input_tokens_seen": 44958165, "step": 2105, "time_per_iteration": 3.3672237396240234 }, { "auxiliary_loss_clip": 0.01157978, "auxiliary_loss_mlp": 0.01036351, "balance_loss_clip": 1.05668521, "balance_loss_mlp": 1.02749956, "epoch": 0.25323152768592555, "flos": 16143822207360.0, "grad_norm": 1.7870348900076634, "language_loss": 0.76794493, "learning_rate": 3.499799645336507e-06, "loss": 0.78988814, "num_input_tokens_seen": 44975060, "step": 2106, "time_per_iteration": 2.5176327228546143 }, { "auxiliary_loss_clip": 0.01198163, "auxiliary_loss_mlp": 0.01029664, "balance_loss_clip": 1.06277585, "balance_loss_mlp": 1.02108669, "epoch": 0.25335177057656466, "flos": 28405161210240.0, "grad_norm": 1.560940088177471, "language_loss": 0.87272751, "learning_rate": 3.4992842018058336e-06, "loss": 0.89500576, "num_input_tokens_seen": 44997960, "step": 2107, "time_per_iteration": 2.5629451274871826 }, { "auxiliary_loss_clip": 0.01170272, "auxiliary_loss_mlp": 0.01029399, "balance_loss_clip": 1.05606508, "balance_loss_mlp": 1.02040446, "epoch": 0.25347201346720377, "flos": 18799666934400.0, "grad_norm": 2.247012906264391, "language_loss": 0.88612056, "learning_rate": 3.4987685308349384e-06, "loss": 0.90811729, "num_input_tokens_seen": 45015690, "step": 2108, "time_per_iteration": 2.551663875579834 }, { "auxiliary_loss_clip": 0.01162662, "auxiliary_loss_mlp": 0.01036231, "balance_loss_clip": 1.05203533, "balance_loss_mlp": 1.02670062, "epoch": 0.2535922563578428, "flos": 15815490963840.0, "grad_norm": 2.1902859430760744, "language_loss": 0.61364943, "learning_rate": 3.4982526325020497e-06, "loss": 0.63563836, "num_input_tokens_seen": 45032660, "step": 2109, "time_per_iteration": 2.5311169624328613 }, { "auxiliary_loss_clip": 0.01184227, "auxiliary_loss_mlp": 0.01031697, "balance_loss_clip": 1.06005657, "balance_loss_mlp": 1.02185011, "epoch": 0.25371249924848194, "flos": 16318922031360.0, "grad_norm": 2.5398507535148274, "language_loss": 0.8234089, "learning_rate": 3.4977365068854273e-06, "loss": 0.84556818, "num_input_tokens_seen": 45048280, "step": 2110, "time_per_iteration": 2.4695026874542236 }, { "auxiliary_loss_clip": 0.01174229, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.05703449, "balance_loss_mlp": 1.02419376, "epoch": 0.25383274213912105, "flos": 21761615364480.0, "grad_norm": 1.7644066351984797, "language_loss": 0.73649764, "learning_rate": 3.4972201540633676e-06, "loss": 0.75857961, "num_input_tokens_seen": 45067635, "step": 2111, "time_per_iteration": 2.526698350906372 }, { "auxiliary_loss_clip": 0.01169871, "auxiliary_loss_mlp": 0.01033634, "balance_loss_clip": 1.05546629, "balance_loss_mlp": 1.02351904, "epoch": 0.2539529850297601, "flos": 21396870708480.0, "grad_norm": 1.7543738315816078, "language_loss": 0.85379744, "learning_rate": 3.4967035741142008e-06, "loss": 0.87583244, "num_input_tokens_seen": 45086455, "step": 2112, "time_per_iteration": 2.527505874633789 }, { "auxiliary_loss_clip": 0.01171342, "auxiliary_loss_mlp": 0.01033705, "balance_loss_clip": 1.06256962, "balance_loss_mlp": 1.02473426, "epoch": 0.2540732279203992, "flos": 25228467319680.0, "grad_norm": 1.823329256966325, "language_loss": 0.81955135, "learning_rate": 3.4961867671162917e-06, "loss": 0.84160179, "num_input_tokens_seen": 45106385, "step": 2113, "time_per_iteration": 2.5422539710998535 }, { "auxiliary_loss_clip": 0.0121323, "auxiliary_loss_mlp": 0.01032302, "balance_loss_clip": 1.06257617, "balance_loss_mlp": 1.02240777, "epoch": 0.2541934708110383, "flos": 19427386037760.0, "grad_norm": 2.452121420849565, "language_loss": 0.77036297, "learning_rate": 3.4956697331480402e-06, "loss": 0.79281831, "num_input_tokens_seen": 45124955, "step": 2114, "time_per_iteration": 2.428863525390625 }, { "auxiliary_loss_clip": 0.01171696, "auxiliary_loss_mlp": 0.01032802, "balance_loss_clip": 1.05643868, "balance_loss_mlp": 1.02305675, "epoch": 0.2543137137016774, "flos": 23949436855680.0, "grad_norm": 1.7019061444726813, "language_loss": 0.79976726, "learning_rate": 3.495152472287879e-06, "loss": 0.82181227, "num_input_tokens_seen": 45145665, "step": 2115, "time_per_iteration": 2.6080031394958496 }, { "auxiliary_loss_clip": 0.01165409, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.05810928, "balance_loss_mlp": 1.02141404, "epoch": 0.2544339565923165, "flos": 25593283802880.0, "grad_norm": 1.8134489290409792, "language_loss": 0.73761773, "learning_rate": 3.4946349846142766e-06, "loss": 0.75957233, "num_input_tokens_seen": 45164805, "step": 2116, "time_per_iteration": 2.5908701419830322 }, { "auxiliary_loss_clip": 0.01210647, "auxiliary_loss_mlp": 0.01039099, "balance_loss_clip": 1.0630033, "balance_loss_mlp": 1.02992618, "epoch": 0.25455419948295555, "flos": 21689470897920.0, "grad_norm": 2.714984083260144, "language_loss": 0.7545352, "learning_rate": 3.4941172702057353e-06, "loss": 0.77703261, "num_input_tokens_seen": 45184865, "step": 2117, "time_per_iteration": 2.5367674827575684 }, { "auxiliary_loss_clip": 0.01179643, "auxiliary_loss_mlp": 0.01033511, "balance_loss_clip": 1.06030905, "balance_loss_mlp": 1.02409899, "epoch": 0.25467444237359466, "flos": 26250341339520.0, "grad_norm": 1.8224707180842552, "language_loss": 0.80697495, "learning_rate": 3.4935993291407924e-06, "loss": 0.82910645, "num_input_tokens_seen": 45203690, "step": 2118, "time_per_iteration": 2.5747146606445312 }, { "auxiliary_loss_clip": 0.01177731, "auxiliary_loss_mlp": 0.01033375, "balance_loss_clip": 1.05787659, "balance_loss_mlp": 1.02348089, "epoch": 0.25479468526423377, "flos": 26979686997120.0, "grad_norm": 2.3822660401443474, "language_loss": 0.70912182, "learning_rate": 3.4930811614980183e-06, "loss": 0.73123294, "num_input_tokens_seen": 45225385, "step": 2119, "time_per_iteration": 2.5754342079162598 }, { "auxiliary_loss_clip": 0.01188256, "auxiliary_loss_mlp": 0.01035929, "balance_loss_clip": 1.05858421, "balance_loss_mlp": 1.02682686, "epoch": 0.2549149281548728, "flos": 23475811098240.0, "grad_norm": 1.8970964193664563, "language_loss": 0.79338145, "learning_rate": 3.4925627673560198e-06, "loss": 0.81562328, "num_input_tokens_seen": 45246045, "step": 2120, "time_per_iteration": 2.522830009460449 }, { "auxiliary_loss_clip": 0.01165668, "auxiliary_loss_mlp": 0.01038438, "balance_loss_clip": 1.05677044, "balance_loss_mlp": 1.02977741, "epoch": 0.25503517104551193, "flos": 25812302981760.0, "grad_norm": 2.661072957673536, "language_loss": 0.88428557, "learning_rate": 3.4920441467934357e-06, "loss": 0.90632659, "num_input_tokens_seen": 45266560, "step": 2121, "time_per_iteration": 2.587238311767578 }, { "auxiliary_loss_clip": 0.01157619, "auxiliary_loss_mlp": 0.01036816, "balance_loss_clip": 1.05562901, "balance_loss_mlp": 1.02777994, "epoch": 0.25515541393615104, "flos": 26645106787200.0, "grad_norm": 1.966157032080573, "language_loss": 0.82839191, "learning_rate": 3.491525299888941e-06, "loss": 0.85033631, "num_input_tokens_seen": 45285405, "step": 2122, "time_per_iteration": 2.6034395694732666 }, { "auxiliary_loss_clip": 0.01094853, "auxiliary_loss_mlp": 0.00754878, "balance_loss_clip": 1.04400086, "balance_loss_mlp": 1.00095141, "epoch": 0.2552756568267901, "flos": 65955945847680.0, "grad_norm": 0.8849975689910463, "language_loss": 0.62691009, "learning_rate": 3.491006226721244e-06, "loss": 0.64540744, "num_input_tokens_seen": 45349615, "step": 2123, "time_per_iteration": 3.1456305980682373 }, { "auxiliary_loss_clip": 0.01186754, "auxiliary_loss_mlp": 0.007643, "balance_loss_clip": 1.06344199, "balance_loss_mlp": 1.0012691, "epoch": 0.2553958997174292, "flos": 17931096161280.0, "grad_norm": 1.8908626252248844, "language_loss": 0.77408051, "learning_rate": 3.4904869273690882e-06, "loss": 0.79359102, "num_input_tokens_seen": 45367505, "step": 2124, "time_per_iteration": 2.534048318862915 }, { "auxiliary_loss_clip": 0.01198786, "auxiliary_loss_mlp": 0.01026167, "balance_loss_clip": 1.06181324, "balance_loss_mlp": 1.01729155, "epoch": 0.2555161426080683, "flos": 23367791923200.0, "grad_norm": 2.7644258258860352, "language_loss": 0.88696647, "learning_rate": 3.489967401911251e-06, "loss": 0.90921593, "num_input_tokens_seen": 45386805, "step": 2125, "time_per_iteration": 3.295128583908081 }, { "auxiliary_loss_clip": 0.0121717, "auxiliary_loss_mlp": 0.01033972, "balance_loss_clip": 1.06608415, "balance_loss_mlp": 1.02309442, "epoch": 0.2556363854987074, "flos": 40625130723840.0, "grad_norm": 1.8508022066010383, "language_loss": 0.6953373, "learning_rate": 3.4894476504265428e-06, "loss": 0.71784872, "num_input_tokens_seen": 45411045, "step": 2126, "time_per_iteration": 2.6433393955230713 }, { "auxiliary_loss_clip": 0.01115047, "auxiliary_loss_mlp": 0.01004231, "balance_loss_clip": 1.04365349, "balance_loss_mlp": 1.00169134, "epoch": 0.2557566283893465, "flos": 68019443389440.0, "grad_norm": 0.7405179474258614, "language_loss": 0.5443002, "learning_rate": 3.4889276729938104e-06, "loss": 0.56549299, "num_input_tokens_seen": 45469575, "step": 2127, "time_per_iteration": 2.9664998054504395 }, { "auxiliary_loss_clip": 0.01174716, "auxiliary_loss_mlp": 0.01027755, "balance_loss_clip": 1.05700588, "balance_loss_mlp": 1.01780152, "epoch": 0.2558768712799856, "flos": 22635645004800.0, "grad_norm": 1.9136250464817475, "language_loss": 0.80596977, "learning_rate": 3.488407469691934e-06, "loss": 0.82799447, "num_input_tokens_seen": 45490270, "step": 2128, "time_per_iteration": 2.522911787033081 }, { "auxiliary_loss_clip": 0.01179273, "auxiliary_loss_mlp": 0.01032516, "balance_loss_clip": 1.05745125, "balance_loss_mlp": 1.02284241, "epoch": 0.25599711417062465, "flos": 26396354125440.0, "grad_norm": 2.349254643827396, "language_loss": 0.80623996, "learning_rate": 3.487887040599828e-06, "loss": 0.82835782, "num_input_tokens_seen": 45510070, "step": 2129, "time_per_iteration": 2.5704989433288574 }, { "auxiliary_loss_clip": 0.01213838, "auxiliary_loss_mlp": 0.0103564, "balance_loss_clip": 1.06481028, "balance_loss_mlp": 1.02570391, "epoch": 0.25611735706126376, "flos": 22852042490880.0, "grad_norm": 2.2105955077988324, "language_loss": 0.7597664, "learning_rate": 3.4873663857964407e-06, "loss": 0.78226113, "num_input_tokens_seen": 45527285, "step": 2130, "time_per_iteration": 3.4133925437927246 }, { "auxiliary_loss_clip": 0.011489, "auxiliary_loss_mlp": 0.01035092, "balance_loss_clip": 1.05491734, "balance_loss_mlp": 1.02495933, "epoch": 0.2562375999519028, "flos": 23367863750400.0, "grad_norm": 1.7716620352230272, "language_loss": 0.66543591, "learning_rate": 3.4868455053607556e-06, "loss": 0.68727589, "num_input_tokens_seen": 45546900, "step": 2131, "time_per_iteration": 3.3764991760253906 }, { "auxiliary_loss_clip": 0.01199258, "auxiliary_loss_mlp": 0.0103568, "balance_loss_clip": 1.05922198, "balance_loss_mlp": 1.02603614, "epoch": 0.2563578428425419, "flos": 22856962654080.0, "grad_norm": 3.2073863762838473, "language_loss": 0.72177035, "learning_rate": 3.486324399371789e-06, "loss": 0.74411976, "num_input_tokens_seen": 45566200, "step": 2132, "time_per_iteration": 2.498868942260742 }, { "auxiliary_loss_clip": 0.01161479, "auxiliary_loss_mlp": 0.01038006, "balance_loss_clip": 1.05672216, "balance_loss_mlp": 1.02896965, "epoch": 0.25647808573318104, "flos": 21653883498240.0, "grad_norm": 1.9541045187352488, "language_loss": 0.78494954, "learning_rate": 3.485803067908593e-06, "loss": 0.80694437, "num_input_tokens_seen": 45585710, "step": 2133, "time_per_iteration": 2.546391487121582 }, { "auxiliary_loss_clip": 0.01110771, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.04474068, "balance_loss_mlp": 1.02492917, "epoch": 0.2565983286238201, "flos": 33730569659520.0, "grad_norm": 1.8062645234911745, "language_loss": 0.79570651, "learning_rate": 3.485281511050253e-06, "loss": 0.81716096, "num_input_tokens_seen": 45607845, "step": 2134, "time_per_iteration": 2.7235312461853027 }, { "auxiliary_loss_clip": 0.01199729, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.06069183, "balance_loss_mlp": 1.02504539, "epoch": 0.2567185715144592, "flos": 16216002587520.0, "grad_norm": 3.6119305721229447, "language_loss": 0.89920598, "learning_rate": 3.484759728875889e-06, "loss": 0.92155039, "num_input_tokens_seen": 45623210, "step": 2135, "time_per_iteration": 2.4774370193481445 }, { "auxiliary_loss_clip": 0.01136779, "auxiliary_loss_mlp": 0.0103751, "balance_loss_clip": 1.05229926, "balance_loss_mlp": 1.0283016, "epoch": 0.2568388144050983, "flos": 17458475984640.0, "grad_norm": 1.7588320643640283, "language_loss": 0.80961126, "learning_rate": 3.4842377214646543e-06, "loss": 0.83135414, "num_input_tokens_seen": 45641505, "step": 2136, "time_per_iteration": 2.571418046951294 }, { "auxiliary_loss_clip": 0.01209165, "auxiliary_loss_mlp": 0.01035252, "balance_loss_clip": 1.0621078, "balance_loss_mlp": 1.02619827, "epoch": 0.25695905729573737, "flos": 20887442069760.0, "grad_norm": 1.7296693757860617, "language_loss": 0.66624212, "learning_rate": 3.483715488895737e-06, "loss": 0.68868637, "num_input_tokens_seen": 45661835, "step": 2137, "time_per_iteration": 2.4665029048919678 }, { "auxiliary_loss_clip": 0.01147459, "auxiliary_loss_mlp": 0.01029968, "balance_loss_clip": 1.05019176, "balance_loss_mlp": 1.02044892, "epoch": 0.2570793001863765, "flos": 24717278914560.0, "grad_norm": 2.1795694889333523, "language_loss": 0.78334129, "learning_rate": 3.48319303124836e-06, "loss": 0.80511558, "num_input_tokens_seen": 45682215, "step": 2138, "time_per_iteration": 2.625080108642578 }, { "auxiliary_loss_clip": 0.0117557, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.05925131, "balance_loss_mlp": 1.0218612, "epoch": 0.2571995430770156, "flos": 26906896085760.0, "grad_norm": 2.085022932892146, "language_loss": 0.66648912, "learning_rate": 3.4826703486017798e-06, "loss": 0.68855721, "num_input_tokens_seen": 45701840, "step": 2139, "time_per_iteration": 2.5612897872924805 }, { "auxiliary_loss_clip": 0.01192519, "auxiliary_loss_mlp": 0.01030485, "balance_loss_clip": 1.06067789, "balance_loss_mlp": 1.02156854, "epoch": 0.25731978596765465, "flos": 19792561656960.0, "grad_norm": 1.6318146761712238, "language_loss": 0.76885098, "learning_rate": 3.4821474410352867e-06, "loss": 0.79108107, "num_input_tokens_seen": 45720500, "step": 2140, "time_per_iteration": 2.556918144226074 }, { "auxiliary_loss_clip": 0.01084958, "auxiliary_loss_mlp": 0.01005569, "balance_loss_clip": 1.03921795, "balance_loss_mlp": 1.00239778, "epoch": 0.25744002885829376, "flos": 70564970471040.0, "grad_norm": 0.8999623557472874, "language_loss": 0.62621933, "learning_rate": 3.481624308628205e-06, "loss": 0.64712459, "num_input_tokens_seen": 45781870, "step": 2141, "time_per_iteration": 3.2933452129364014 }, { "auxiliary_loss_clip": 0.01177322, "auxiliary_loss_mlp": 0.01033661, "balance_loss_clip": 1.05670762, "balance_loss_mlp": 1.02414823, "epoch": 0.25756027174893287, "flos": 18038181582720.0, "grad_norm": 3.212182069857203, "language_loss": 1.00315654, "learning_rate": 3.481100951459893e-06, "loss": 1.02526641, "num_input_tokens_seen": 45794890, "step": 2142, "time_per_iteration": 2.534937620162964 }, { "auxiliary_loss_clip": 0.01191474, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.058568, "balance_loss_mlp": 1.0217663, "epoch": 0.2576805146395719, "flos": 22674069578880.0, "grad_norm": 1.5899786862721246, "language_loss": 0.78834361, "learning_rate": 3.4805773696097453e-06, "loss": 0.81057012, "num_input_tokens_seen": 45815780, "step": 2143, "time_per_iteration": 2.525907039642334 }, { "auxiliary_loss_clip": 0.01174866, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.05991483, "balance_loss_mlp": 1.01961243, "epoch": 0.25780075753021103, "flos": 16472225278080.0, "grad_norm": 1.9527030131637773, "language_loss": 0.87907928, "learning_rate": 3.4800535631571874e-06, "loss": 0.90112025, "num_input_tokens_seen": 45831310, "step": 2144, "time_per_iteration": 2.5082976818084717 }, { "auxiliary_loss_clip": 0.01183989, "auxiliary_loss_mlp": 0.01037565, "balance_loss_clip": 1.05801511, "balance_loss_mlp": 1.0278194, "epoch": 0.25792100042085014, "flos": 22820297846400.0, "grad_norm": 2.493707492825981, "language_loss": 0.76478982, "learning_rate": 3.4795295321816804e-06, "loss": 0.78700531, "num_input_tokens_seen": 45850135, "step": 2145, "time_per_iteration": 2.529543876647949 }, { "auxiliary_loss_clip": 0.01168256, "auxiliary_loss_mlp": 0.01037876, "balance_loss_clip": 1.05690563, "balance_loss_mlp": 1.02841663, "epoch": 0.2580412433114892, "flos": 18697286194560.0, "grad_norm": 3.393185209914697, "language_loss": 0.91164839, "learning_rate": 3.47900527676272e-06, "loss": 0.93370974, "num_input_tokens_seen": 45868470, "step": 2146, "time_per_iteration": 2.5117640495300293 }, { "auxiliary_loss_clip": 0.01211428, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.06449318, "balance_loss_mlp": 1.02572691, "epoch": 0.2581614862021283, "flos": 14283146810880.0, "grad_norm": 1.9667345444962852, "language_loss": 0.8832534, "learning_rate": 3.478480796979835e-06, "loss": 0.90571964, "num_input_tokens_seen": 45886355, "step": 2147, "time_per_iteration": 2.446657657623291 }, { "auxiliary_loss_clip": 0.01174209, "auxiliary_loss_mlp": 0.01028127, "balance_loss_clip": 1.05717802, "balance_loss_mlp": 1.01895952, "epoch": 0.25828172909276736, "flos": 29498281856640.0, "grad_norm": 1.4975642819999468, "language_loss": 0.7757293, "learning_rate": 3.4779560929125894e-06, "loss": 0.79775262, "num_input_tokens_seen": 45907900, "step": 2148, "time_per_iteration": 2.6038734912872314 }, { "auxiliary_loss_clip": 0.01089575, "auxiliary_loss_mlp": 0.01005444, "balance_loss_clip": 1.04155564, "balance_loss_mlp": 1.00286889, "epoch": 0.2584019719834065, "flos": 67114387376640.0, "grad_norm": 0.6799228745901635, "language_loss": 0.56979376, "learning_rate": 3.4774311646405783e-06, "loss": 0.5907439, "num_input_tokens_seen": 45977805, "step": 2149, "time_per_iteration": 3.242149591445923 }, { "auxiliary_loss_clip": 0.0115538, "auxiliary_loss_mlp": 0.01031984, "balance_loss_clip": 1.05454135, "balance_loss_mlp": 1.02326965, "epoch": 0.2585222148740456, "flos": 22893555634560.0, "grad_norm": 2.2430496661563573, "language_loss": 0.83480787, "learning_rate": 3.476906012243435e-06, "loss": 0.85668153, "num_input_tokens_seen": 45996715, "step": 2150, "time_per_iteration": 2.583371877670288 }, { "auxiliary_loss_clip": 0.01184199, "auxiliary_loss_mlp": 0.0102884, "balance_loss_clip": 1.06057811, "balance_loss_mlp": 1.01930368, "epoch": 0.25864245776468464, "flos": 28909202808960.0, "grad_norm": 1.6246665045476154, "language_loss": 0.81386536, "learning_rate": 3.476380635800824e-06, "loss": 0.83599567, "num_input_tokens_seen": 46017915, "step": 2151, "time_per_iteration": 2.6141316890716553 }, { "auxiliary_loss_clip": 0.011777, "auxiliary_loss_mlp": 0.01031166, "balance_loss_clip": 1.05898774, "balance_loss_mlp": 1.02232063, "epoch": 0.25876270065532375, "flos": 14793185980800.0, "grad_norm": 2.0830984784151703, "language_loss": 0.86036265, "learning_rate": 3.475855035392444e-06, "loss": 0.88245124, "num_input_tokens_seen": 46033235, "step": 2152, "time_per_iteration": 3.8390533924102783 }, { "auxiliary_loss_clip": 0.01131826, "auxiliary_loss_mlp": 0.01029817, "balance_loss_clip": 1.05437434, "balance_loss_mlp": 1.02089381, "epoch": 0.25888294354596286, "flos": 60467821810560.0, "grad_norm": 2.08306049660786, "language_loss": 0.71569443, "learning_rate": 3.475329211098029e-06, "loss": 0.73731083, "num_input_tokens_seen": 46056390, "step": 2153, "time_per_iteration": 2.9688358306884766 }, { "auxiliary_loss_clip": 0.01152508, "auxiliary_loss_mlp": 0.01028557, "balance_loss_clip": 1.05670452, "balance_loss_mlp": 1.01941359, "epoch": 0.2590031864366019, "flos": 27851166771840.0, "grad_norm": 1.5582507213354646, "language_loss": 0.82287574, "learning_rate": 3.4748031629973453e-06, "loss": 0.84468639, "num_input_tokens_seen": 46077120, "step": 2154, "time_per_iteration": 2.6567227840423584 }, { "auxiliary_loss_clip": 0.01073611, "auxiliary_loss_mlp": 0.01003855, "balance_loss_clip": 1.04009473, "balance_loss_mlp": 1.00209069, "epoch": 0.25912342932724103, "flos": 62422444206720.0, "grad_norm": 0.9153383801395144, "language_loss": 0.56557369, "learning_rate": 3.4742768911701944e-06, "loss": 0.58634841, "num_input_tokens_seen": 46139815, "step": 2155, "time_per_iteration": 3.23642897605896 }, { "auxiliary_loss_clip": 0.0120126, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.06386042, "balance_loss_mlp": 1.0291332, "epoch": 0.25924367221788014, "flos": 12378839368320.0, "grad_norm": 3.071590148759398, "language_loss": 0.69918042, "learning_rate": 3.4737503956964113e-06, "loss": 0.72159344, "num_input_tokens_seen": 46152120, "step": 2156, "time_per_iteration": 4.235851764678955 }, { "auxiliary_loss_clip": 0.01172114, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.05689108, "balance_loss_mlp": 1.02848077, "epoch": 0.2593639151085192, "flos": 14575208296320.0, "grad_norm": 3.177376680075476, "language_loss": 0.67230749, "learning_rate": 3.473223676655865e-06, "loss": 0.69441658, "num_input_tokens_seen": 46170120, "step": 2157, "time_per_iteration": 3.321575164794922 }, { "auxiliary_loss_clip": 0.01171035, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.05484676, "balance_loss_mlp": 1.02372575, "epoch": 0.2594841579991583, "flos": 15230937029760.0, "grad_norm": 1.986048264940477, "language_loss": 0.79678667, "learning_rate": 3.472696734128459e-06, "loss": 0.8188386, "num_input_tokens_seen": 46187985, "step": 2158, "time_per_iteration": 2.5242321491241455 }, { "auxiliary_loss_clip": 0.0119552, "auxiliary_loss_mlp": 0.01031364, "balance_loss_clip": 1.06124496, "balance_loss_mlp": 1.02183914, "epoch": 0.2596044008897974, "flos": 23623583650560.0, "grad_norm": 2.572027662561668, "language_loss": 0.75841039, "learning_rate": 3.4721695681941286e-06, "loss": 0.78067923, "num_input_tokens_seen": 46207025, "step": 2159, "time_per_iteration": 2.5904955863952637 }, { "auxiliary_loss_clip": 0.01175774, "auxiliary_loss_mlp": 0.00764758, "balance_loss_clip": 1.05667567, "balance_loss_mlp": 1.00088191, "epoch": 0.25972464378043647, "flos": 13772281628160.0, "grad_norm": 3.556448341155961, "language_loss": 0.82570046, "learning_rate": 3.471642178932845e-06, "loss": 0.84510577, "num_input_tokens_seen": 46225670, "step": 2160, "time_per_iteration": 2.5687167644500732 }, { "auxiliary_loss_clip": 0.01180046, "auxiliary_loss_mlp": 0.01030961, "balance_loss_clip": 1.05669641, "balance_loss_mlp": 1.02163255, "epoch": 0.2598448866710756, "flos": 19573578391680.0, "grad_norm": 2.3503553334861516, "language_loss": 0.89278805, "learning_rate": 3.471114566424613e-06, "loss": 0.91489816, "num_input_tokens_seen": 46244130, "step": 2161, "time_per_iteration": 2.524306297302246 }, { "auxiliary_loss_clip": 0.01178501, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.06000483, "balance_loss_mlp": 1.02081084, "epoch": 0.25996512956171464, "flos": 21653237053440.0, "grad_norm": 1.868592188369467, "language_loss": 0.76046926, "learning_rate": 3.4705867307494715e-06, "loss": 0.78256261, "num_input_tokens_seen": 46263200, "step": 2162, "time_per_iteration": 2.550487756729126 }, { "auxiliary_loss_clip": 0.01197357, "auxiliary_loss_mlp": 0.01030685, "balance_loss_clip": 1.06076562, "balance_loss_mlp": 1.02163744, "epoch": 0.26008537245235375, "flos": 18223480869120.0, "grad_norm": 2.1920378979209243, "language_loss": 0.84468186, "learning_rate": 3.470058671987492e-06, "loss": 0.86696231, "num_input_tokens_seen": 46281465, "step": 2163, "time_per_iteration": 2.4827804565429688 }, { "auxiliary_loss_clip": 0.01198144, "auxiliary_loss_mlp": 0.01036858, "balance_loss_clip": 1.05964136, "balance_loss_mlp": 1.02676725, "epoch": 0.26020561534299286, "flos": 24645385843200.0, "grad_norm": 1.9838805671554751, "language_loss": 0.84219944, "learning_rate": 3.4695303902187805e-06, "loss": 0.86454946, "num_input_tokens_seen": 46301020, "step": 2164, "time_per_iteration": 2.576317310333252 }, { "auxiliary_loss_clip": 0.01159739, "auxiliary_loss_mlp": 0.01037567, "balance_loss_clip": 1.05413914, "balance_loss_mlp": 1.02752924, "epoch": 0.2603258582336319, "flos": 25773662926080.0, "grad_norm": 2.0365316758181566, "language_loss": 0.78617871, "learning_rate": 3.469001885523478e-06, "loss": 0.80815184, "num_input_tokens_seen": 46321740, "step": 2165, "time_per_iteration": 2.676100254058838 }, { "auxiliary_loss_clip": 0.01206667, "auxiliary_loss_mlp": 0.01041361, "balance_loss_clip": 1.06028962, "balance_loss_mlp": 1.03142488, "epoch": 0.260446101124271, "flos": 28766314506240.0, "grad_norm": 1.6983852959484727, "language_loss": 0.81188893, "learning_rate": 3.4684731579817568e-06, "loss": 0.83436918, "num_input_tokens_seen": 46342730, "step": 2166, "time_per_iteration": 2.53603458404541 }, { "auxiliary_loss_clip": 0.01132057, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.0555687, "balance_loss_mlp": 1.02784848, "epoch": 0.26056634401491013, "flos": 25666757072640.0, "grad_norm": 1.5626522140039851, "language_loss": 0.76280355, "learning_rate": 3.4679442076738247e-06, "loss": 0.78449392, "num_input_tokens_seen": 46362445, "step": 2167, "time_per_iteration": 2.657926082611084 }, { "auxiliary_loss_clip": 0.01212013, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.06358683, "balance_loss_mlp": 1.02368653, "epoch": 0.2606865869055492, "flos": 27052765217280.0, "grad_norm": 2.0148815023432887, "language_loss": 0.83762741, "learning_rate": 3.4674150346799245e-06, "loss": 0.86008584, "num_input_tokens_seen": 46382145, "step": 2168, "time_per_iteration": 2.4923644065856934 }, { "auxiliary_loss_clip": 0.01177443, "auxiliary_loss_mlp": 0.01030746, "balance_loss_clip": 1.05903304, "balance_loss_mlp": 1.0215435, "epoch": 0.2608068297961883, "flos": 17712615686400.0, "grad_norm": 2.297595910295783, "language_loss": 0.79767686, "learning_rate": 3.4668856390803295e-06, "loss": 0.81975877, "num_input_tokens_seen": 46400025, "step": 2169, "time_per_iteration": 2.5130422115325928 }, { "auxiliary_loss_clip": 0.01183175, "auxiliary_loss_mlp": 0.01030862, "balance_loss_clip": 1.060076, "balance_loss_mlp": 1.02166498, "epoch": 0.2609270726868274, "flos": 18551632544640.0, "grad_norm": 1.8567906303704276, "language_loss": 0.89915407, "learning_rate": 3.4663560209553495e-06, "loss": 0.92129445, "num_input_tokens_seen": 46418090, "step": 2170, "time_per_iteration": 2.4837265014648438 }, { "auxiliary_loss_clip": 0.01168994, "auxiliary_loss_mlp": 0.01032734, "balance_loss_clip": 1.05658078, "balance_loss_mlp": 1.023579, "epoch": 0.26104731557746647, "flos": 21835699165440.0, "grad_norm": 1.6486232169764274, "language_loss": 0.79310489, "learning_rate": 3.4658261803853267e-06, "loss": 0.81512219, "num_input_tokens_seen": 46436015, "step": 2171, "time_per_iteration": 2.5603272914886475 }, { "auxiliary_loss_clip": 0.01175017, "auxiliary_loss_mlp": 0.01033053, "balance_loss_clip": 1.05923963, "balance_loss_mlp": 1.02346206, "epoch": 0.2611675584681056, "flos": 21689650465920.0, "grad_norm": 7.403395318207266, "language_loss": 0.80895674, "learning_rate": 3.4652961174506383e-06, "loss": 0.83103752, "num_input_tokens_seen": 46455885, "step": 2172, "time_per_iteration": 2.526472330093384 }, { "auxiliary_loss_clip": 0.01120973, "auxiliary_loss_mlp": 0.01001811, "balance_loss_clip": 1.05513811, "balance_loss_mlp": 1.00016582, "epoch": 0.2612878013587447, "flos": 71862101389440.0, "grad_norm": 0.9712397895893216, "language_loss": 0.58154279, "learning_rate": 3.464765832231694e-06, "loss": 0.60277063, "num_input_tokens_seen": 46510050, "step": 2173, "time_per_iteration": 3.0684995651245117 }, { "auxiliary_loss_clip": 0.01196592, "auxiliary_loss_mlp": 0.01028922, "balance_loss_clip": 1.06342673, "balance_loss_mlp": 1.01979637, "epoch": 0.26140804424938374, "flos": 20227511445120.0, "grad_norm": 1.870311859634478, "language_loss": 0.7064085, "learning_rate": 3.4642353248089373e-06, "loss": 0.72866356, "num_input_tokens_seen": 46528810, "step": 2174, "time_per_iteration": 2.4820303916931152 }, { "auxiliary_loss_clip": 0.01173217, "auxiliary_loss_mlp": 0.0102929, "balance_loss_clip": 1.05699348, "balance_loss_mlp": 1.01954424, "epoch": 0.26152828714002285, "flos": 25557085872000.0, "grad_norm": 1.629799740219285, "language_loss": 0.80035865, "learning_rate": 3.463704595262846e-06, "loss": 0.82238376, "num_input_tokens_seen": 46549690, "step": 2175, "time_per_iteration": 2.5791454315185547 }, { "auxiliary_loss_clip": 0.01160216, "auxiliary_loss_mlp": 0.0103619, "balance_loss_clip": 1.05637741, "balance_loss_mlp": 1.02717173, "epoch": 0.26164853003066196, "flos": 25446516831360.0, "grad_norm": 2.072702741122209, "language_loss": 0.70460141, "learning_rate": 3.463173643673931e-06, "loss": 0.72656536, "num_input_tokens_seen": 46572215, "step": 2176, "time_per_iteration": 2.6131815910339355 }, { "auxiliary_loss_clip": 0.01128957, "auxiliary_loss_mlp": 0.0100988, "balance_loss_clip": 1.05613852, "balance_loss_mlp": 1.00804412, "epoch": 0.261768772921301, "flos": 53944580568960.0, "grad_norm": 0.9016404712133905, "language_loss": 0.63482016, "learning_rate": 3.4626424701227387e-06, "loss": 0.65620857, "num_input_tokens_seen": 46627275, "step": 2177, "time_per_iteration": 3.012408971786499 }, { "auxiliary_loss_clip": 0.01138466, "auxiliary_loss_mlp": 0.0100753, "balance_loss_clip": 1.05523539, "balance_loss_mlp": 1.00567079, "epoch": 0.26188901581194013, "flos": 70687606481280.0, "grad_norm": 0.8200781995043724, "language_loss": 0.55837572, "learning_rate": 3.4621110746898452e-06, "loss": 0.57983571, "num_input_tokens_seen": 46695135, "step": 2178, "time_per_iteration": 4.358171701431274 }, { "auxiliary_loss_clip": 0.01197398, "auxiliary_loss_mlp": 0.01032189, "balance_loss_clip": 1.06295514, "balance_loss_mlp": 1.02277136, "epoch": 0.2620092587025792, "flos": 21069580959360.0, "grad_norm": 1.6563492262859696, "language_loss": 0.74328589, "learning_rate": 3.4615794574558654e-06, "loss": 0.76558173, "num_input_tokens_seen": 46714145, "step": 2179, "time_per_iteration": 2.561577796936035 }, { "auxiliary_loss_clip": 0.01178204, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 1.05941629, "balance_loss_mlp": 1.01761293, "epoch": 0.2621295015932183, "flos": 18369601395840.0, "grad_norm": 3.124222071143197, "language_loss": 0.83916706, "learning_rate": 3.4610476185014436e-06, "loss": 0.86120659, "num_input_tokens_seen": 46731405, "step": 2180, "time_per_iteration": 2.5642709732055664 }, { "auxiliary_loss_clip": 0.01208678, "auxiliary_loss_mlp": 0.01031623, "balance_loss_clip": 1.06085014, "balance_loss_mlp": 1.02179396, "epoch": 0.2622497444838574, "flos": 23659997063040.0, "grad_norm": 1.8866342374167722, "language_loss": 0.79257935, "learning_rate": 3.4605155579072597e-06, "loss": 0.81498241, "num_input_tokens_seen": 46751260, "step": 2181, "time_per_iteration": 2.545687437057495 }, { "auxiliary_loss_clip": 0.01138863, "auxiliary_loss_mlp": 0.0102935, "balance_loss_clip": 1.05171895, "balance_loss_mlp": 1.02037311, "epoch": 0.26236998737449646, "flos": 22123810154880.0, "grad_norm": 1.7193284635237824, "language_loss": 0.71557009, "learning_rate": 3.459983275754027e-06, "loss": 0.73725224, "num_input_tokens_seen": 46770155, "step": 2182, "time_per_iteration": 2.701566219329834 }, { "auxiliary_loss_clip": 0.01208017, "auxiliary_loss_mlp": 0.01029567, "balance_loss_clip": 1.06258309, "balance_loss_mlp": 1.02051926, "epoch": 0.26249023026513557, "flos": 17895185539200.0, "grad_norm": 2.709506957094928, "language_loss": 0.80074733, "learning_rate": 3.4594507721224918e-06, "loss": 0.82312316, "num_input_tokens_seen": 46788805, "step": 2183, "time_per_iteration": 3.352287530899048 }, { "auxiliary_loss_clip": 0.01178987, "auxiliary_loss_mlp": 0.01041921, "balance_loss_clip": 1.05708432, "balance_loss_mlp": 1.03295064, "epoch": 0.2626104731557747, "flos": 18332936588160.0, "grad_norm": 1.636558240169881, "language_loss": 0.82141447, "learning_rate": 3.4589180470934353e-06, "loss": 0.84362358, "num_input_tokens_seen": 46808670, "step": 2184, "time_per_iteration": 4.191047191619873 }, { "auxiliary_loss_clip": 0.01199236, "auxiliary_loss_mlp": 0.0103517, "balance_loss_clip": 1.05877709, "balance_loss_mlp": 1.0253948, "epoch": 0.26273071604641374, "flos": 19317714837120.0, "grad_norm": 1.7760122200612447, "language_loss": 0.76692963, "learning_rate": 3.4583851007476713e-06, "loss": 0.78927362, "num_input_tokens_seen": 46827140, "step": 2185, "time_per_iteration": 2.5613741874694824 }, { "auxiliary_loss_clip": 0.01165308, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 1.05647099, "balance_loss_mlp": 1.02132988, "epoch": 0.26285095893705285, "flos": 18327477720960.0, "grad_norm": 2.0776210310879475, "language_loss": 0.6891613, "learning_rate": 3.4578519331660464e-06, "loss": 0.71112555, "num_input_tokens_seen": 46844135, "step": 2186, "time_per_iteration": 2.7397818565368652 }, { "auxiliary_loss_clip": 0.01190579, "auxiliary_loss_mlp": 0.01032679, "balance_loss_clip": 1.06217432, "balance_loss_mlp": 1.02412593, "epoch": 0.26297120182769196, "flos": 20193827466240.0, "grad_norm": 2.096911355778989, "language_loss": 0.81944418, "learning_rate": 3.4573185444294426e-06, "loss": 0.84167671, "num_input_tokens_seen": 46862500, "step": 2187, "time_per_iteration": 2.6239640712738037 }, { "auxiliary_loss_clip": 0.01175456, "auxiliary_loss_mlp": 0.00763908, "balance_loss_clip": 1.05786455, "balance_loss_mlp": 1.00085139, "epoch": 0.263091444718331, "flos": 22418421505920.0, "grad_norm": 1.7986511251009665, "language_loss": 0.79012066, "learning_rate": 3.456784934618774e-06, "loss": 0.80951428, "num_input_tokens_seen": 46883665, "step": 2188, "time_per_iteration": 2.6499650478363037 }, { "auxiliary_loss_clip": 0.01177985, "auxiliary_loss_mlp": 0.01031499, "balance_loss_clip": 1.05916893, "balance_loss_mlp": 1.02302361, "epoch": 0.2632116876089701, "flos": 19024827338880.0, "grad_norm": 1.9286659506983903, "language_loss": 0.80325353, "learning_rate": 3.4562511038149897e-06, "loss": 0.82534832, "num_input_tokens_seen": 46899160, "step": 2189, "time_per_iteration": 2.610973834991455 }, { "auxiliary_loss_clip": 0.0107272, "auxiliary_loss_mlp": 0.0100414, "balance_loss_clip": 1.04186225, "balance_loss_mlp": 1.00206554, "epoch": 0.26333193049960923, "flos": 67308054531840.0, "grad_norm": 0.8596719810699128, "language_loss": 0.57779741, "learning_rate": 3.4557170520990705e-06, "loss": 0.59856594, "num_input_tokens_seen": 46959835, "step": 2190, "time_per_iteration": 3.254857301712036 }, { "auxiliary_loss_clip": 0.01184648, "auxiliary_loss_mlp": 0.01033131, "balance_loss_clip": 1.05729985, "balance_loss_mlp": 1.02413034, "epoch": 0.2634521733902483, "flos": 25048806468480.0, "grad_norm": 2.596168456299673, "language_loss": 0.86207569, "learning_rate": 3.4551827795520324e-06, "loss": 0.88425344, "num_input_tokens_seen": 46982720, "step": 2191, "time_per_iteration": 2.6148581504821777 }, { "auxiliary_loss_clip": 0.01192949, "auxiliary_loss_mlp": 0.010263, "balance_loss_clip": 1.05866671, "balance_loss_mlp": 1.01775312, "epoch": 0.2635724162808874, "flos": 20594985534720.0, "grad_norm": 1.6603366512539401, "language_loss": 0.85139942, "learning_rate": 3.4546482862549226e-06, "loss": 0.8735919, "num_input_tokens_seen": 47003035, "step": 2192, "time_per_iteration": 2.573833703994751 }, { "auxiliary_loss_clip": 0.0115699, "auxiliary_loss_mlp": 0.01038709, "balance_loss_clip": 1.05532324, "balance_loss_mlp": 1.02886784, "epoch": 0.2636926591715265, "flos": 19244636616960.0, "grad_norm": 2.106359142695926, "language_loss": 0.78791392, "learning_rate": 3.4541135722888253e-06, "loss": 0.80987096, "num_input_tokens_seen": 47019625, "step": 2193, "time_per_iteration": 2.6386377811431885 }, { "auxiliary_loss_clip": 0.01203023, "auxiliary_loss_mlp": 0.01029189, "balance_loss_clip": 1.05889523, "balance_loss_mlp": 1.01966405, "epoch": 0.26381290206216557, "flos": 28804882734720.0, "grad_norm": 1.9076585989765682, "language_loss": 0.80558097, "learning_rate": 3.453578637734854e-06, "loss": 0.82790315, "num_input_tokens_seen": 47040815, "step": 2194, "time_per_iteration": 2.6013710498809814 }, { "auxiliary_loss_clip": 0.01209885, "auxiliary_loss_mlp": 0.01034202, "balance_loss_clip": 1.06535769, "balance_loss_mlp": 1.02486193, "epoch": 0.2639331449528047, "flos": 25008909436800.0, "grad_norm": 1.6391315992143196, "language_loss": 0.78201735, "learning_rate": 3.4530434826741605e-06, "loss": 0.80445826, "num_input_tokens_seen": 47061755, "step": 2195, "time_per_iteration": 2.553014039993286 }, { "auxiliary_loss_clip": 0.01173584, "auxiliary_loss_mlp": 0.01033158, "balance_loss_clip": 1.05922675, "balance_loss_mlp": 1.02456856, "epoch": 0.26405338784344373, "flos": 46535775465600.0, "grad_norm": 2.0830058878795175, "language_loss": 0.69017345, "learning_rate": 3.452508107187926e-06, "loss": 0.71224082, "num_input_tokens_seen": 47085130, "step": 2196, "time_per_iteration": 2.774874687194824 }, { "auxiliary_loss_clip": 0.01130644, "auxiliary_loss_mlp": 0.0102902, "balance_loss_clip": 1.04896259, "balance_loss_mlp": 1.01932192, "epoch": 0.26417363073408284, "flos": 21179467641600.0, "grad_norm": 2.514524947463671, "language_loss": 0.77544224, "learning_rate": 3.451972511357366e-06, "loss": 0.79703885, "num_input_tokens_seen": 47104675, "step": 2197, "time_per_iteration": 2.6837775707244873 }, { "auxiliary_loss_clip": 0.01189841, "auxiliary_loss_mlp": 0.01030978, "balance_loss_clip": 1.06157112, "balance_loss_mlp": 1.02299118, "epoch": 0.26429387362472195, "flos": 22674751937280.0, "grad_norm": 1.6631402666991062, "language_loss": 0.85201919, "learning_rate": 3.45143669526373e-06, "loss": 0.87422729, "num_input_tokens_seen": 47124435, "step": 2198, "time_per_iteration": 2.550856590270996 }, { "auxiliary_loss_clip": 0.01112672, "auxiliary_loss_mlp": 0.0100712, "balance_loss_clip": 1.05072165, "balance_loss_mlp": 1.00540304, "epoch": 0.264414116515361, "flos": 67180534272000.0, "grad_norm": 0.7883361354841523, "language_loss": 0.63230938, "learning_rate": 3.450900658988302e-06, "loss": 0.65350729, "num_input_tokens_seen": 47185985, "step": 2199, "time_per_iteration": 3.0846142768859863 }, { "auxiliary_loss_clip": 0.01167213, "auxiliary_loss_mlp": 0.01036646, "balance_loss_clip": 1.05756617, "balance_loss_mlp": 1.02717495, "epoch": 0.2645343594060001, "flos": 25664709997440.0, "grad_norm": 1.8785795321841914, "language_loss": 0.77414012, "learning_rate": 3.450364402612397e-06, "loss": 0.79617876, "num_input_tokens_seen": 47203140, "step": 2200, "time_per_iteration": 2.60007905960083 }, { "auxiliary_loss_clip": 0.0117046, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.05552101, "balance_loss_mlp": 1.02308655, "epoch": 0.26465460229663923, "flos": 22491822948480.0, "grad_norm": 1.9171533015382658, "language_loss": 0.83797586, "learning_rate": 3.449827926217366e-06, "loss": 0.86000788, "num_input_tokens_seen": 47222575, "step": 2201, "time_per_iteration": 2.5830206871032715 }, { "auxiliary_loss_clip": 0.01180158, "auxiliary_loss_mlp": 0.01031717, "balance_loss_clip": 1.05565095, "balance_loss_mlp": 1.0224961, "epoch": 0.2647748451872783, "flos": 29388036038400.0, "grad_norm": 1.8039305930145464, "language_loss": 0.80450892, "learning_rate": 3.449291229884591e-06, "loss": 0.82662767, "num_input_tokens_seen": 47243815, "step": 2202, "time_per_iteration": 2.6505749225616455 }, { "auxiliary_loss_clip": 0.01167585, "auxiliary_loss_mlp": 0.01029513, "balance_loss_clip": 1.05624235, "balance_loss_mlp": 1.02018523, "epoch": 0.2648950880779174, "flos": 26797799502720.0, "grad_norm": 3.3983243971699366, "language_loss": 0.86773252, "learning_rate": 3.4487543136954887e-06, "loss": 0.88970351, "num_input_tokens_seen": 47263435, "step": 2203, "time_per_iteration": 2.6584179401397705 }, { "auxiliary_loss_clip": 0.0116103, "auxiliary_loss_mlp": 0.01034696, "balance_loss_clip": 1.05634904, "balance_loss_mlp": 1.02539182, "epoch": 0.2650153309685565, "flos": 28841008838400.0, "grad_norm": 1.659183504880564, "language_loss": 0.91029167, "learning_rate": 3.448217177731509e-06, "loss": 0.93224883, "num_input_tokens_seen": 47283920, "step": 2204, "time_per_iteration": 2.6511173248291016 }, { "auxiliary_loss_clip": 0.01171713, "auxiliary_loss_mlp": 0.01032123, "balance_loss_clip": 1.05940711, "balance_loss_mlp": 1.02384424, "epoch": 0.26513557385919556, "flos": 20303247271680.0, "grad_norm": 2.069605312070416, "language_loss": 0.78080666, "learning_rate": 3.4476798220741348e-06, "loss": 0.80284506, "num_input_tokens_seen": 47302800, "step": 2205, "time_per_iteration": 3.443758249282837 }, { "auxiliary_loss_clip": 0.01207371, "auxiliary_loss_mlp": 0.01032254, "balance_loss_clip": 1.06392574, "balance_loss_mlp": 1.02402902, "epoch": 0.26525581674983467, "flos": 17676274101120.0, "grad_norm": 1.571396884156926, "language_loss": 0.78391492, "learning_rate": 3.4471422468048826e-06, "loss": 0.80631113, "num_input_tokens_seen": 47321525, "step": 2206, "time_per_iteration": 2.517871856689453 }, { "auxiliary_loss_clip": 0.01183666, "auxiliary_loss_mlp": 0.01028851, "balance_loss_clip": 1.06074798, "balance_loss_mlp": 1.0202744, "epoch": 0.2653760596404738, "flos": 26833746038400.0, "grad_norm": 3.022226023368953, "language_loss": 0.73499399, "learning_rate": 3.4466044520053022e-06, "loss": 0.75711918, "num_input_tokens_seen": 47340530, "step": 2207, "time_per_iteration": 2.5882511138916016 }, { "auxiliary_loss_clip": 0.01163886, "auxiliary_loss_mlp": 0.01032197, "balance_loss_clip": 1.05616009, "balance_loss_mlp": 1.02327967, "epoch": 0.26549630253111284, "flos": 22782160581120.0, "grad_norm": 1.9279362087523995, "language_loss": 0.60457009, "learning_rate": 3.446066437756977e-06, "loss": 0.62653089, "num_input_tokens_seen": 47359735, "step": 2208, "time_per_iteration": 2.584620714187622 }, { "auxiliary_loss_clip": 0.01174298, "auxiliary_loss_mlp": 0.01024962, "balance_loss_clip": 1.05758798, "balance_loss_mlp": 1.01624823, "epoch": 0.26561654542175195, "flos": 23550002640000.0, "grad_norm": 2.2992960028362135, "language_loss": 0.75146151, "learning_rate": 3.4455282041415224e-06, "loss": 0.77345413, "num_input_tokens_seen": 47378945, "step": 2209, "time_per_iteration": 2.6133382320404053 }, { "auxiliary_loss_clip": 0.01166358, "auxiliary_loss_mlp": 0.01028491, "balance_loss_clip": 1.0576725, "balance_loss_mlp": 1.01975298, "epoch": 0.265736788312391, "flos": 26906680604160.0, "grad_norm": 2.506035992973833, "language_loss": 0.86997288, "learning_rate": 3.4449897512405894e-06, "loss": 0.8919214, "num_input_tokens_seen": 47398095, "step": 2210, "time_per_iteration": 4.433354139328003 }, { "auxiliary_loss_clip": 0.01124917, "auxiliary_loss_mlp": 0.00763939, "balance_loss_clip": 1.05033374, "balance_loss_mlp": 1.00095773, "epoch": 0.2658570312030301, "flos": 23477139901440.0, "grad_norm": 2.5691505657724, "language_loss": 0.7501049, "learning_rate": 3.444451079135859e-06, "loss": 0.76899344, "num_input_tokens_seen": 47417605, "step": 2211, "time_per_iteration": 2.7216885089874268 }, { "auxiliary_loss_clip": 0.01135794, "auxiliary_loss_mlp": 0.00764254, "balance_loss_clip": 1.05027723, "balance_loss_mlp": 1.00095475, "epoch": 0.2659772740936692, "flos": 21866402315520.0, "grad_norm": 4.164591014971873, "language_loss": 0.74083984, "learning_rate": 3.4439121879090493e-06, "loss": 0.75984025, "num_input_tokens_seen": 47435385, "step": 2212, "time_per_iteration": 2.6728148460388184 }, { "auxiliary_loss_clip": 0.01180487, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.05928302, "balance_loss_mlp": 1.02285492, "epoch": 0.2660975169843083, "flos": 19793100360960.0, "grad_norm": 1.8156408081871496, "language_loss": 0.83416331, "learning_rate": 3.4433730776419082e-06, "loss": 0.85628855, "num_input_tokens_seen": 47454310, "step": 2213, "time_per_iteration": 2.5893635749816895 }, { "auxiliary_loss_clip": 0.01194827, "auxiliary_loss_mlp": 0.00764103, "balance_loss_clip": 1.05907667, "balance_loss_mlp": 1.00104117, "epoch": 0.2662177598749474, "flos": 29018981750400.0, "grad_norm": 2.88617253080466, "language_loss": 0.8063392, "learning_rate": 3.4428337484162183e-06, "loss": 0.82592845, "num_input_tokens_seen": 47475120, "step": 2214, "time_per_iteration": 2.6042234897613525 }, { "auxiliary_loss_clip": 0.0117323, "auxiliary_loss_mlp": 0.0103082, "balance_loss_clip": 1.05683851, "balance_loss_mlp": 1.02192712, "epoch": 0.2663380027655865, "flos": 21762549118080.0, "grad_norm": 1.8702691314145343, "language_loss": 0.84157419, "learning_rate": 3.442294200313797e-06, "loss": 0.86361468, "num_input_tokens_seen": 47493150, "step": 2215, "time_per_iteration": 2.5609002113342285 }, { "auxiliary_loss_clip": 0.01129059, "auxiliary_loss_mlp": 0.0100152, "balance_loss_clip": 1.04827094, "balance_loss_mlp": 1.00000596, "epoch": 0.26645824565622556, "flos": 66980333819520.0, "grad_norm": 0.766375761484475, "language_loss": 0.52656794, "learning_rate": 3.4417544334164916e-06, "loss": 0.54787374, "num_input_tokens_seen": 47557295, "step": 2216, "time_per_iteration": 3.1085009574890137 }, { "auxiliary_loss_clip": 0.0113574, "auxiliary_loss_mlp": 0.01031027, "balance_loss_clip": 1.05086994, "balance_loss_mlp": 1.02155602, "epoch": 0.26657848854686467, "flos": 25264198373760.0, "grad_norm": 1.7237241397883767, "language_loss": 0.77284902, "learning_rate": 3.4412144478061854e-06, "loss": 0.79451668, "num_input_tokens_seen": 47579705, "step": 2217, "time_per_iteration": 2.688756227493286 }, { "auxiliary_loss_clip": 0.01117476, "auxiliary_loss_mlp": 0.01030548, "balance_loss_clip": 1.0491854, "balance_loss_mlp": 1.0210824, "epoch": 0.2666987314375038, "flos": 23696769611520.0, "grad_norm": 1.8510502611519608, "language_loss": 0.75081944, "learning_rate": 3.4406742435647925e-06, "loss": 0.77229965, "num_input_tokens_seen": 47599770, "step": 2218, "time_per_iteration": 2.677963972091675 }, { "auxiliary_loss_clip": 0.01187372, "auxiliary_loss_mlp": 0.01032668, "balance_loss_clip": 1.06072783, "balance_loss_mlp": 1.02397799, "epoch": 0.26681897432814283, "flos": 27048958375680.0, "grad_norm": 2.0988809630433156, "language_loss": 0.78762227, "learning_rate": 3.440133820774263e-06, "loss": 0.80982268, "num_input_tokens_seen": 47619580, "step": 2219, "time_per_iteration": 2.612597703933716 }, { "auxiliary_loss_clip": 0.01177929, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.05692625, "balance_loss_mlp": 1.02911866, "epoch": 0.26693921721878194, "flos": 28985944216320.0, "grad_norm": 1.9719463049090495, "language_loss": 0.8199681, "learning_rate": 3.439593179516578e-06, "loss": 0.84213591, "num_input_tokens_seen": 47639490, "step": 2220, "time_per_iteration": 2.597015142440796 }, { "auxiliary_loss_clip": 0.01180935, "auxiliary_loss_mlp": 0.0103203, "balance_loss_clip": 1.05918884, "balance_loss_mlp": 1.02271366, "epoch": 0.26705946010942105, "flos": 21507834798720.0, "grad_norm": 1.7801340039728282, "language_loss": 0.81058663, "learning_rate": 3.4390523198737524e-06, "loss": 0.83271623, "num_input_tokens_seen": 47658650, "step": 2221, "time_per_iteration": 2.5597517490386963 }, { "auxiliary_loss_clip": 0.01205964, "auxiliary_loss_mlp": 0.00764025, "balance_loss_clip": 1.06115246, "balance_loss_mlp": 1.00104702, "epoch": 0.2671797030000601, "flos": 21471277731840.0, "grad_norm": 1.5964224359378898, "language_loss": 0.73500913, "learning_rate": 3.4385112419278333e-06, "loss": 0.75470906, "num_input_tokens_seen": 47679875, "step": 2222, "time_per_iteration": 2.51297664642334 }, { "auxiliary_loss_clip": 0.01119354, "auxiliary_loss_mlp": 0.0100569, "balance_loss_clip": 1.04763269, "balance_loss_mlp": 1.00414073, "epoch": 0.2672999458906992, "flos": 64189929767040.0, "grad_norm": 0.7976912376205909, "language_loss": 0.64793193, "learning_rate": 3.4379699457609033e-06, "loss": 0.66918242, "num_input_tokens_seen": 47737700, "step": 2223, "time_per_iteration": 2.986011028289795 }, { "auxiliary_loss_clip": 0.01166102, "auxiliary_loss_mlp": 0.01026669, "balance_loss_clip": 1.0538398, "balance_loss_mlp": 1.01759088, "epoch": 0.26742018878133833, "flos": 16909042573440.0, "grad_norm": 1.9473485888576572, "language_loss": 0.90078104, "learning_rate": 3.4374284314550755e-06, "loss": 0.92270875, "num_input_tokens_seen": 47756740, "step": 2224, "time_per_iteration": 2.568167209625244 }, { "auxiliary_loss_clip": 0.01203695, "auxiliary_loss_mlp": 0.01026819, "balance_loss_clip": 1.06096828, "balance_loss_mlp": 1.01820922, "epoch": 0.2675404316719774, "flos": 20667560964480.0, "grad_norm": 2.000341276099548, "language_loss": 0.80752969, "learning_rate": 3.436886699092498e-06, "loss": 0.82983488, "num_input_tokens_seen": 47775255, "step": 2225, "time_per_iteration": 2.687025308609009 }, { "auxiliary_loss_clip": 0.01206604, "auxiliary_loss_mlp": 0.01033894, "balance_loss_clip": 1.06055307, "balance_loss_mlp": 1.02459025, "epoch": 0.2676606745626165, "flos": 17485013157120.0, "grad_norm": 2.7264236370420205, "language_loss": 0.71757275, "learning_rate": 3.4363447487553502e-06, "loss": 0.73997772, "num_input_tokens_seen": 47788570, "step": 2226, "time_per_iteration": 2.472900152206421 }, { "auxiliary_loss_clip": 0.01170698, "auxiliary_loss_mlp": 0.01032031, "balance_loss_clip": 1.05710268, "balance_loss_mlp": 1.02229166, "epoch": 0.26778091745325555, "flos": 27852675143040.0, "grad_norm": 1.8891480480563503, "language_loss": 0.77584475, "learning_rate": 3.4358025805258455e-06, "loss": 0.79787207, "num_input_tokens_seen": 47808275, "step": 2227, "time_per_iteration": 2.6246864795684814 }, { "auxiliary_loss_clip": 0.01151102, "auxiliary_loss_mlp": 0.01024657, "balance_loss_clip": 1.05271149, "balance_loss_mlp": 1.01568055, "epoch": 0.26790116034389466, "flos": 20955995176320.0, "grad_norm": 1.6981420800543432, "language_loss": 0.83317292, "learning_rate": 3.435260194486232e-06, "loss": 0.85493058, "num_input_tokens_seen": 47826245, "step": 2228, "time_per_iteration": 2.6458258628845215 }, { "auxiliary_loss_clip": 0.01174902, "auxiliary_loss_mlp": 0.01029675, "balance_loss_clip": 1.05742013, "balance_loss_mlp": 1.02044272, "epoch": 0.2680214032345338, "flos": 18040659621120.0, "grad_norm": 3.3687843102912742, "language_loss": 0.82139403, "learning_rate": 3.4347175907187875e-06, "loss": 0.84343982, "num_input_tokens_seen": 47843235, "step": 2229, "time_per_iteration": 2.531506061553955 }, { "auxiliary_loss_clip": 0.01178841, "auxiliary_loss_mlp": 0.01036322, "balance_loss_clip": 1.05836511, "balance_loss_mlp": 1.02801943, "epoch": 0.26814164612517283, "flos": 22419427086720.0, "grad_norm": 1.7159714206015537, "language_loss": 0.879282, "learning_rate": 3.4341747693058254e-06, "loss": 0.90143371, "num_input_tokens_seen": 47861710, "step": 2230, "time_per_iteration": 2.546628952026367 }, { "auxiliary_loss_clip": 0.01106508, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.05124855, "balance_loss_mlp": 1.02104545, "epoch": 0.26826188901581194, "flos": 35627371159680.0, "grad_norm": 1.9615332249089135, "language_loss": 0.77418643, "learning_rate": 3.4336317303296916e-06, "loss": 0.79554534, "num_input_tokens_seen": 47882685, "step": 2231, "time_per_iteration": 3.6975531578063965 }, { "auxiliary_loss_clip": 0.01184785, "auxiliary_loss_mlp": 0.01026561, "balance_loss_clip": 1.05721736, "balance_loss_mlp": 1.01787663, "epoch": 0.26838213190645105, "flos": 17639788861440.0, "grad_norm": 2.0245650612079857, "language_loss": 0.75145757, "learning_rate": 3.4330884738727635e-06, "loss": 0.77357101, "num_input_tokens_seen": 47900860, "step": 2232, "time_per_iteration": 2.540956974029541 }, { "auxiliary_loss_clip": 0.01136939, "auxiliary_loss_mlp": 0.01028713, "balance_loss_clip": 1.05197644, "balance_loss_mlp": 1.01986146, "epoch": 0.2685023747970901, "flos": 22674823764480.0, "grad_norm": 1.850396606133055, "language_loss": 0.70930791, "learning_rate": 3.4325450000174535e-06, "loss": 0.73096442, "num_input_tokens_seen": 47917500, "step": 2233, "time_per_iteration": 2.6546518802642822 }, { "auxiliary_loss_clip": 0.0113734, "auxiliary_loss_mlp": 0.01034148, "balance_loss_clip": 1.05256629, "balance_loss_mlp": 1.02443218, "epoch": 0.2686226176877292, "flos": 20120533764480.0, "grad_norm": 1.7274205688248698, "language_loss": 0.74145687, "learning_rate": 3.4320013088462067e-06, "loss": 0.76317173, "num_input_tokens_seen": 47934860, "step": 2234, "time_per_iteration": 2.616955280303955 }, { "auxiliary_loss_clip": 0.01163611, "auxiliary_loss_mlp": 0.01029775, "balance_loss_clip": 1.05412567, "balance_loss_mlp": 1.02103734, "epoch": 0.2687428605783683, "flos": 21872040750720.0, "grad_norm": 1.4676258164373803, "language_loss": 0.8179189, "learning_rate": 3.431457400441499e-06, "loss": 0.83985281, "num_input_tokens_seen": 47955255, "step": 2235, "time_per_iteration": 2.6273481845855713 }, { "auxiliary_loss_clip": 0.01050567, "auxiliary_loss_mlp": 0.01007846, "balance_loss_clip": 1.03402674, "balance_loss_mlp": 1.00623703, "epoch": 0.2688631034690074, "flos": 69943320766080.0, "grad_norm": 0.9279174900054146, "language_loss": 0.60790086, "learning_rate": 3.4309132748858424e-06, "loss": 0.62848496, "num_input_tokens_seen": 48016245, "step": 2236, "time_per_iteration": 4.947690963745117 }, { "auxiliary_loss_clip": 0.01184347, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.0592854, "balance_loss_mlp": 1.02332115, "epoch": 0.2689833463596465, "flos": 22856639431680.0, "grad_norm": 2.0357490634703126, "language_loss": 0.83803993, "learning_rate": 3.430368932261779e-06, "loss": 0.86020511, "num_input_tokens_seen": 48036600, "step": 2237, "time_per_iteration": 3.3660030364990234 }, { "auxiliary_loss_clip": 0.0117048, "auxiliary_loss_mlp": 0.01029827, "balance_loss_clip": 1.05655789, "balance_loss_mlp": 1.02069557, "epoch": 0.2691035892502856, "flos": 17200242132480.0, "grad_norm": 1.9003080193379176, "language_loss": 0.75151074, "learning_rate": 3.429824372651886e-06, "loss": 0.77351379, "num_input_tokens_seen": 48054750, "step": 2238, "time_per_iteration": 2.5235631465911865 }, { "auxiliary_loss_clip": 0.01150177, "auxiliary_loss_mlp": 0.01035213, "balance_loss_clip": 1.05424917, "balance_loss_mlp": 1.02627242, "epoch": 0.26922383214092466, "flos": 17747484814080.0, "grad_norm": 1.9934701546489355, "language_loss": 0.8366797, "learning_rate": 3.4292795961387732e-06, "loss": 0.85853362, "num_input_tokens_seen": 48072650, "step": 2239, "time_per_iteration": 2.696112871170044 }, { "auxiliary_loss_clip": 0.01202859, "auxiliary_loss_mlp": 0.01032386, "balance_loss_clip": 1.05973649, "balance_loss_mlp": 1.02382112, "epoch": 0.26934407503156377, "flos": 16173376122240.0, "grad_norm": 2.1702667712905153, "language_loss": 0.87873799, "learning_rate": 3.4287346028050818e-06, "loss": 0.90109044, "num_input_tokens_seen": 48088720, "step": 2240, "time_per_iteration": 2.4811339378356934 }, { "auxiliary_loss_clip": 0.01169416, "auxiliary_loss_mlp": 0.01027507, "balance_loss_clip": 1.05507183, "balance_loss_mlp": 1.01900136, "epoch": 0.2694643179222028, "flos": 23732895715200.0, "grad_norm": 1.7180874399626538, "language_loss": 0.7955277, "learning_rate": 3.4281893927334866e-06, "loss": 0.81749696, "num_input_tokens_seen": 48108630, "step": 2241, "time_per_iteration": 2.637162923812866 }, { "auxiliary_loss_clip": 0.01188866, "auxiliary_loss_mlp": 0.01028182, "balance_loss_clip": 1.06009424, "balance_loss_mlp": 1.01996827, "epoch": 0.26958456081284193, "flos": 24718140840960.0, "grad_norm": 2.030111149495139, "language_loss": 0.75194442, "learning_rate": 3.4276439660066963e-06, "loss": 0.77411485, "num_input_tokens_seen": 48128330, "step": 2242, "time_per_iteration": 2.587635040283203 }, { "auxiliary_loss_clip": 0.01199403, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.05970287, "balance_loss_mlp": 1.02209735, "epoch": 0.26970480370348104, "flos": 18112588606080.0, "grad_norm": 2.0028487577953835, "language_loss": 0.84297961, "learning_rate": 3.427098322707452e-06, "loss": 0.86528409, "num_input_tokens_seen": 48144295, "step": 2243, "time_per_iteration": 2.4801080226898193 }, { "auxiliary_loss_clip": 0.0118927, "auxiliary_loss_mlp": 0.0103394, "balance_loss_clip": 1.06410575, "balance_loss_mlp": 1.02439713, "epoch": 0.2698250465941201, "flos": 10816546250880.0, "grad_norm": 1.9671372307385728, "language_loss": 0.89197153, "learning_rate": 3.426552462918526e-06, "loss": 0.91420358, "num_input_tokens_seen": 48162230, "step": 2244, "time_per_iteration": 2.593916654586792 }, { "auxiliary_loss_clip": 0.01202744, "auxiliary_loss_mlp": 0.0102988, "balance_loss_clip": 1.06268883, "balance_loss_mlp": 1.02182698, "epoch": 0.2699452894847592, "flos": 17308117653120.0, "grad_norm": 2.292324163488663, "language_loss": 0.73412573, "learning_rate": 3.426006386722726e-06, "loss": 0.75645196, "num_input_tokens_seen": 48180290, "step": 2245, "time_per_iteration": 2.4957385063171387 }, { "auxiliary_loss_clip": 0.01160077, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.05859447, "balance_loss_mlp": 1.02848828, "epoch": 0.2700655323753983, "flos": 18078150441600.0, "grad_norm": 1.9090827293427153, "language_loss": 0.92414212, "learning_rate": 3.4254600942028914e-06, "loss": 0.94611514, "num_input_tokens_seen": 48198165, "step": 2246, "time_per_iteration": 2.623569965362549 }, { "auxiliary_loss_clip": 0.01168978, "auxiliary_loss_mlp": 0.01031802, "balance_loss_clip": 1.05709541, "balance_loss_mlp": 1.02337432, "epoch": 0.2701857752660374, "flos": 18186636493440.0, "grad_norm": 10.830067433011967, "language_loss": 0.82037365, "learning_rate": 3.424913585441893e-06, "loss": 0.84238148, "num_input_tokens_seen": 48216000, "step": 2247, "time_per_iteration": 2.5540707111358643 }, { "auxiliary_loss_clip": 0.01183803, "auxiliary_loss_mlp": 0.01030401, "balance_loss_clip": 1.0598104, "balance_loss_mlp": 1.02165675, "epoch": 0.2703060181566765, "flos": 16319496648960.0, "grad_norm": 1.9830612677353634, "language_loss": 0.87202406, "learning_rate": 3.4243668605226374e-06, "loss": 0.89416611, "num_input_tokens_seen": 48233025, "step": 2248, "time_per_iteration": 2.516472101211548 }, { "auxiliary_loss_clip": 0.01157501, "auxiliary_loss_mlp": 0.01033893, "balance_loss_clip": 1.05503964, "balance_loss_mlp": 1.02526188, "epoch": 0.2704262610473156, "flos": 19572357329280.0, "grad_norm": 2.227013617892131, "language_loss": 0.82716298, "learning_rate": 3.423819919528061e-06, "loss": 0.84907687, "num_input_tokens_seen": 48251110, "step": 2249, "time_per_iteration": 2.5871667861938477 }, { "auxiliary_loss_clip": 0.01149035, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.05148458, "balance_loss_mlp": 1.02018166, "epoch": 0.27054650393795465, "flos": 20740746925440.0, "grad_norm": 1.7484871819685195, "language_loss": 0.77721608, "learning_rate": 3.4232727625411355e-06, "loss": 0.79900265, "num_input_tokens_seen": 48270215, "step": 2250, "time_per_iteration": 2.6701366901397705 }, { "auxiliary_loss_clip": 0.01118308, "auxiliary_loss_mlp": 0.01024662, "balance_loss_clip": 1.04822075, "balance_loss_mlp": 1.0167346, "epoch": 0.27066674682859376, "flos": 18658322916480.0, "grad_norm": 1.7222932146497025, "language_loss": 0.86081409, "learning_rate": 3.4227253896448626e-06, "loss": 0.88224375, "num_input_tokens_seen": 48288075, "step": 2251, "time_per_iteration": 2.643840789794922 }, { "auxiliary_loss_clip": 0.01202671, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.06053793, "balance_loss_mlp": 1.02314711, "epoch": 0.2707869897192329, "flos": 23002759958400.0, "grad_norm": 3.7543008295539306, "language_loss": 0.82018793, "learning_rate": 3.42217780092228e-06, "loss": 0.84253162, "num_input_tokens_seen": 48306415, "step": 2252, "time_per_iteration": 2.5433032512664795 }, { "auxiliary_loss_clip": 0.01091971, "auxiliary_loss_mlp": 0.01003335, "balance_loss_clip": 1.04173684, "balance_loss_mlp": 1.00177336, "epoch": 0.27090723260987193, "flos": 58323240293760.0, "grad_norm": 0.7879095360478465, "language_loss": 0.60360694, "learning_rate": 3.421629996456456e-06, "loss": 0.62456, "num_input_tokens_seen": 48365035, "step": 2253, "time_per_iteration": 3.0753674507141113 }, { "auxiliary_loss_clip": 0.01185307, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.05666614, "balance_loss_mlp": 1.02256703, "epoch": 0.27102747550051104, "flos": 11984540797440.0, "grad_norm": 2.0454470776182334, "language_loss": 0.82128537, "learning_rate": 3.421081976330491e-06, "loss": 0.84346157, "num_input_tokens_seen": 48383550, "step": 2254, "time_per_iteration": 2.5213615894317627 }, { "auxiliary_loss_clip": 0.01166618, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.05304694, "balance_loss_mlp": 1.02470458, "epoch": 0.27114771839115015, "flos": 19900401264000.0, "grad_norm": 1.8520133018526586, "language_loss": 0.87950951, "learning_rate": 3.4205337406275207e-06, "loss": 0.9015137, "num_input_tokens_seen": 48403670, "step": 2255, "time_per_iteration": 2.5624430179595947 }, { "auxiliary_loss_clip": 0.01199932, "auxiliary_loss_mlp": 0.01027433, "balance_loss_clip": 1.05889618, "balance_loss_mlp": 1.01900458, "epoch": 0.2712679612817892, "flos": 18331966920960.0, "grad_norm": 2.2961845989349463, "language_loss": 0.75398445, "learning_rate": 3.4199852894307114e-06, "loss": 0.77625811, "num_input_tokens_seen": 48420420, "step": 2256, "time_per_iteration": 2.5139122009277344 }, { "auxiliary_loss_clip": 0.01131478, "auxiliary_loss_mlp": 0.01033429, "balance_loss_clip": 1.05274034, "balance_loss_mlp": 1.02495885, "epoch": 0.2713882041724283, "flos": 24460302038400.0, "grad_norm": 28.465531865149284, "language_loss": 0.78757352, "learning_rate": 3.419436622823262e-06, "loss": 0.80922258, "num_input_tokens_seen": 48441140, "step": 2257, "time_per_iteration": 2.691471815109253 }, { "auxiliary_loss_clip": 0.01172077, "auxiliary_loss_mlp": 0.01030746, "balance_loss_clip": 1.05842304, "balance_loss_mlp": 1.02228808, "epoch": 0.27150844706306737, "flos": 23039317025280.0, "grad_norm": 1.7342160261801571, "language_loss": 0.74526668, "learning_rate": 3.4188877408884063e-06, "loss": 0.76729488, "num_input_tokens_seen": 48461845, "step": 2258, "time_per_iteration": 3.4508912563323975 }, { "auxiliary_loss_clip": 0.01170132, "auxiliary_loss_mlp": 0.01038157, "balance_loss_clip": 1.05688334, "balance_loss_mlp": 1.0286262, "epoch": 0.2716286899537065, "flos": 22563644192640.0, "grad_norm": 2.5550507144329897, "language_loss": 0.65720934, "learning_rate": 3.4183386437094088e-06, "loss": 0.6792922, "num_input_tokens_seen": 48478510, "step": 2259, "time_per_iteration": 2.5705761909484863 }, { "auxiliary_loss_clip": 0.01173578, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 1.05497336, "balance_loss_mlp": 1.01840472, "epoch": 0.2717489328443456, "flos": 13115044523520.0, "grad_norm": 2.746135128200953, "language_loss": 0.81819636, "learning_rate": 3.417789331369565e-06, "loss": 0.84020001, "num_input_tokens_seen": 48494300, "step": 2260, "time_per_iteration": 2.6228742599487305 }, { "auxiliary_loss_clip": 0.01205123, "auxiliary_loss_mlp": 0.01032751, "balance_loss_clip": 1.06133878, "balance_loss_mlp": 1.02354169, "epoch": 0.27186917573498465, "flos": 29278688060160.0, "grad_norm": 2.565695001007483, "language_loss": 0.90917522, "learning_rate": 3.4172398039522088e-06, "loss": 0.93155396, "num_input_tokens_seen": 48515585, "step": 2261, "time_per_iteration": 2.5726802349090576 }, { "auxiliary_loss_clip": 0.01186335, "auxiliary_loss_mlp": 0.01025465, "balance_loss_clip": 1.05688477, "balance_loss_mlp": 1.01679265, "epoch": 0.27198941862562376, "flos": 26032220000640.0, "grad_norm": 1.7100227764244185, "language_loss": 0.80055898, "learning_rate": 3.4166900615407e-06, "loss": 0.82267702, "num_input_tokens_seen": 48533500, "step": 2262, "time_per_iteration": 2.561304807662964 }, { "auxiliary_loss_clip": 0.01185469, "auxiliary_loss_mlp": 0.0102862, "balance_loss_clip": 1.05736971, "balance_loss_mlp": 1.01979208, "epoch": 0.27210966151626287, "flos": 32780983760640.0, "grad_norm": 2.514317314131098, "language_loss": 0.75424999, "learning_rate": 3.416140104218436e-06, "loss": 0.77639091, "num_input_tokens_seen": 48552865, "step": 2263, "time_per_iteration": 5.107882499694824 }, { "auxiliary_loss_clip": 0.01086179, "auxiliary_loss_mlp": 0.00754045, "balance_loss_clip": 1.03053486, "balance_loss_mlp": 1.00092733, "epoch": 0.2722299044069019, "flos": 65471043219840.0, "grad_norm": 0.842769029650522, "language_loss": 0.69652462, "learning_rate": 3.4155899320688437e-06, "loss": 0.71492684, "num_input_tokens_seen": 48618940, "step": 2264, "time_per_iteration": 3.1810214519500732 }, { "auxiliary_loss_clip": 0.0113024, "auxiliary_loss_mlp": 0.01027132, "balance_loss_clip": 1.05238247, "balance_loss_mlp": 1.01750028, "epoch": 0.27235014729754103, "flos": 15334143782400.0, "grad_norm": 2.05843568013521, "language_loss": 0.73922384, "learning_rate": 3.415039545175384e-06, "loss": 0.76079756, "num_input_tokens_seen": 48634665, "step": 2265, "time_per_iteration": 2.638352870941162 }, { "auxiliary_loss_clip": 0.01187859, "auxiliary_loss_mlp": 0.01029153, "balance_loss_clip": 1.05834961, "balance_loss_mlp": 1.02055252, "epoch": 0.27247039018818014, "flos": 21872363973120.0, "grad_norm": 2.0995664252394306, "language_loss": 0.65288234, "learning_rate": 3.414488943621551e-06, "loss": 0.67505252, "num_input_tokens_seen": 48653330, "step": 2266, "time_per_iteration": 2.574570894241333 }, { "auxiliary_loss_clip": 0.01182684, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 1.05724061, "balance_loss_mlp": 1.02276421, "epoch": 0.2725906330788192, "flos": 18695490514560.0, "grad_norm": 1.7483211306663822, "language_loss": 0.73786974, "learning_rate": 3.41393812749087e-06, "loss": 0.76001251, "num_input_tokens_seen": 48671375, "step": 2267, "time_per_iteration": 2.517695903778076 }, { "auxiliary_loss_clip": 0.01168562, "auxiliary_loss_mlp": 0.01030763, "balance_loss_clip": 1.05664778, "balance_loss_mlp": 1.02169752, "epoch": 0.2727108759694583, "flos": 17886099398400.0, "grad_norm": 2.3245715058488194, "language_loss": 0.71956003, "learning_rate": 3.4133870968668984e-06, "loss": 0.74155325, "num_input_tokens_seen": 48686175, "step": 2268, "time_per_iteration": 2.5397541522979736 }, { "auxiliary_loss_clip": 0.01174338, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 1.05736113, "balance_loss_mlp": 1.02110076, "epoch": 0.2728311188600974, "flos": 24461666755200.0, "grad_norm": 1.6418234099784983, "language_loss": 0.78672612, "learning_rate": 3.412835851833229e-06, "loss": 0.80876768, "num_input_tokens_seen": 48708370, "step": 2269, "time_per_iteration": 2.7194840908050537 }, { "auxiliary_loss_clip": 0.01184135, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.06002295, "balance_loss_mlp": 1.02146947, "epoch": 0.2729513617507365, "flos": 30993314757120.0, "grad_norm": 1.7233135631284064, "language_loss": 0.78083861, "learning_rate": 3.4122843924734834e-06, "loss": 0.8029846, "num_input_tokens_seen": 48730670, "step": 2270, "time_per_iteration": 2.667440891265869 }, { "auxiliary_loss_clip": 0.01167166, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.05530655, "balance_loss_mlp": 1.02380538, "epoch": 0.2730716046413756, "flos": 19094637421440.0, "grad_norm": 1.8046767893824014, "language_loss": 0.87522066, "learning_rate": 3.411732718871319e-06, "loss": 0.89722133, "num_input_tokens_seen": 48746510, "step": 2271, "time_per_iteration": 2.5938398838043213 }, { "auxiliary_loss_clip": 0.01198006, "auxiliary_loss_mlp": 0.01030446, "balance_loss_clip": 1.06020331, "balance_loss_mlp": 1.02235723, "epoch": 0.27319184753201464, "flos": 26944566474240.0, "grad_norm": 1.6100913658973959, "language_loss": 0.78764844, "learning_rate": 3.4111808311104227e-06, "loss": 0.80993295, "num_input_tokens_seen": 48768825, "step": 2272, "time_per_iteration": 2.6117727756500244 }, { "auxiliary_loss_clip": 0.01176248, "auxiliary_loss_mlp": 0.01030141, "balance_loss_clip": 1.05379152, "balance_loss_mlp": 1.02077699, "epoch": 0.27331209042265375, "flos": 31759828012800.0, "grad_norm": 2.075599805796371, "language_loss": 0.69473076, "learning_rate": 3.410628729274517e-06, "loss": 0.71679461, "num_input_tokens_seen": 48790345, "step": 2273, "time_per_iteration": 2.641658306121826 }, { "auxiliary_loss_clip": 0.01165904, "auxiliary_loss_mlp": 0.00763933, "balance_loss_clip": 1.05490375, "balance_loss_mlp": 1.00111079, "epoch": 0.27343233331329286, "flos": 25739081107200.0, "grad_norm": 1.8633737130588401, "language_loss": 0.8258971, "learning_rate": 3.4100764134473546e-06, "loss": 0.84519553, "num_input_tokens_seen": 48809630, "step": 2274, "time_per_iteration": 2.6776325702667236 }, { "auxiliary_loss_clip": 0.01200349, "auxiliary_loss_mlp": 0.01031464, "balance_loss_clip": 1.06106305, "balance_loss_mlp": 1.02310181, "epoch": 0.2735525762039319, "flos": 24389414547840.0, "grad_norm": 2.1252814747568247, "language_loss": 0.84735936, "learning_rate": 3.4095238837127215e-06, "loss": 0.86967742, "num_input_tokens_seen": 48828770, "step": 2275, "time_per_iteration": 2.5401053428649902 }, { "auxiliary_loss_clip": 0.01152907, "auxiliary_loss_mlp": 0.01024967, "balance_loss_clip": 1.05246913, "balance_loss_mlp": 1.0162468, "epoch": 0.27367281909457103, "flos": 14465357527680.0, "grad_norm": 1.8405068988020015, "language_loss": 0.7926175, "learning_rate": 3.4089711401544355e-06, "loss": 0.81439626, "num_input_tokens_seen": 48846365, "step": 2276, "time_per_iteration": 2.635746717453003 }, { "auxiliary_loss_clip": 0.01182115, "auxiliary_loss_mlp": 0.01024031, "balance_loss_clip": 1.05336463, "balance_loss_mlp": 1.01560259, "epoch": 0.27379306198521014, "flos": 23476996247040.0, "grad_norm": 2.5395313500270675, "language_loss": 0.67487431, "learning_rate": 3.4084181828563486e-06, "loss": 0.69693577, "num_input_tokens_seen": 48863085, "step": 2277, "time_per_iteration": 2.5924232006073 }, { "auxiliary_loss_clip": 0.01142037, "auxiliary_loss_mlp": 0.01026488, "balance_loss_clip": 1.05180907, "balance_loss_mlp": 1.01777411, "epoch": 0.2739133048758492, "flos": 17458152762240.0, "grad_norm": 1.63872546872183, "language_loss": 0.70435667, "learning_rate": 3.4078650119023428e-06, "loss": 0.72604191, "num_input_tokens_seen": 48881400, "step": 2278, "time_per_iteration": 2.6369099617004395 }, { "auxiliary_loss_clip": 0.01129631, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.04789996, "balance_loss_mlp": 1.02339005, "epoch": 0.2740335477664883, "flos": 19273113123840.0, "grad_norm": 2.3591697923356945, "language_loss": 0.74473888, "learning_rate": 3.4073116273763337e-06, "loss": 0.76636648, "num_input_tokens_seen": 48895845, "step": 2279, "time_per_iteration": 2.6410837173461914 }, { "auxiliary_loss_clip": 0.01175502, "auxiliary_loss_mlp": 0.01033138, "balance_loss_clip": 1.05449617, "balance_loss_mlp": 1.02364254, "epoch": 0.2741537906571274, "flos": 26104723603200.0, "grad_norm": 1.7409045269684267, "language_loss": 0.8121416, "learning_rate": 3.40675802936227e-06, "loss": 0.83422792, "num_input_tokens_seen": 48916630, "step": 2280, "time_per_iteration": 2.618314743041992 }, { "auxiliary_loss_clip": 0.01166766, "auxiliary_loss_mlp": 0.01037257, "balance_loss_clip": 1.0579977, "balance_loss_mlp": 1.02746391, "epoch": 0.27427403354776647, "flos": 34164190644480.0, "grad_norm": 2.076216025342256, "language_loss": 0.7152608, "learning_rate": 3.4062042179441318e-06, "loss": 0.73730099, "num_input_tokens_seen": 48937100, "step": 2281, "time_per_iteration": 2.716334581375122 }, { "auxiliary_loss_clip": 0.01182867, "auxiliary_loss_mlp": 0.0102598, "balance_loss_clip": 1.0586555, "balance_loss_mlp": 1.01770687, "epoch": 0.2743942764384056, "flos": 18766988536320.0, "grad_norm": 2.4691139178697217, "language_loss": 0.80563724, "learning_rate": 3.4056501932059314e-06, "loss": 0.82772565, "num_input_tokens_seen": 48955175, "step": 2282, "time_per_iteration": 2.580759048461914 }, { "auxiliary_loss_clip": 0.01110148, "auxiliary_loss_mlp": 0.01003243, "balance_loss_clip": 1.03339624, "balance_loss_mlp": 1.00190163, "epoch": 0.2745145193290447, "flos": 64904048058240.0, "grad_norm": 0.7666347693249996, "language_loss": 0.58153772, "learning_rate": 3.405095955231715e-06, "loss": 0.60267162, "num_input_tokens_seen": 49006830, "step": 2283, "time_per_iteration": 3.0404679775238037 }, { "auxiliary_loss_clip": 0.01189729, "auxiliary_loss_mlp": 0.01027791, "balance_loss_clip": 1.05763578, "balance_loss_mlp": 1.01886785, "epoch": 0.27463476221968375, "flos": 16136926796160.0, "grad_norm": 2.523924182004298, "language_loss": 0.94218481, "learning_rate": 3.4045415041055585e-06, "loss": 0.96436006, "num_input_tokens_seen": 49022470, "step": 2284, "time_per_iteration": 2.5761613845825195 }, { "auxiliary_loss_clip": 0.01176194, "auxiliary_loss_mlp": 0.010301, "balance_loss_clip": 1.05835891, "balance_loss_mlp": 1.02065849, "epoch": 0.27475500511032286, "flos": 10376712213120.0, "grad_norm": 2.1045137554009656, "language_loss": 0.78368366, "learning_rate": 3.4039868399115728e-06, "loss": 0.80574656, "num_input_tokens_seen": 49037110, "step": 2285, "time_per_iteration": 3.416395425796509 }, { "auxiliary_loss_clip": 0.01136074, "auxiliary_loss_mlp": 0.01029949, "balance_loss_clip": 1.05695748, "balance_loss_mlp": 1.02121651, "epoch": 0.27487524800096197, "flos": 17311062568320.0, "grad_norm": 1.7549044070174171, "language_loss": 0.80335116, "learning_rate": 3.4034319627339003e-06, "loss": 0.82501137, "num_input_tokens_seen": 49053975, "step": 2286, "time_per_iteration": 2.652266025543213 }, { "auxiliary_loss_clip": 0.01175356, "auxiliary_loss_mlp": 0.01035137, "balance_loss_clip": 1.0593164, "balance_loss_mlp": 1.02606475, "epoch": 0.274995490891601, "flos": 27120205002240.0, "grad_norm": 2.7455388767119713, "language_loss": 0.69592637, "learning_rate": 3.402876872656715e-06, "loss": 0.71803129, "num_input_tokens_seen": 49072295, "step": 2287, "time_per_iteration": 2.626197099685669 }, { "auxiliary_loss_clip": 0.01171219, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.05758548, "balance_loss_mlp": 1.02590561, "epoch": 0.27511573378224013, "flos": 23436093634560.0, "grad_norm": 7.9369149199794355, "language_loss": 0.89517951, "learning_rate": 3.402321569764223e-06, "loss": 0.91723728, "num_input_tokens_seen": 49091600, "step": 2288, "time_per_iteration": 2.643428325653076 }, { "auxiliary_loss_clip": 0.01150496, "auxiliary_loss_mlp": 0.00764772, "balance_loss_clip": 1.05458713, "balance_loss_mlp": 1.00131893, "epoch": 0.2752359766728792, "flos": 16722019434240.0, "grad_norm": 3.2711832734459536, "language_loss": 0.83556664, "learning_rate": 3.4017660541406635e-06, "loss": 0.8547194, "num_input_tokens_seen": 49107665, "step": 2289, "time_per_iteration": 3.530979871749878 }, { "auxiliary_loss_clip": 0.0118377, "auxiliary_loss_mlp": 0.01028378, "balance_loss_clip": 1.05886519, "balance_loss_mlp": 1.01952124, "epoch": 0.2753562195635183, "flos": 25297738698240.0, "grad_norm": 1.7170772357239443, "language_loss": 0.74407005, "learning_rate": 3.4012103258703092e-06, "loss": 0.7661916, "num_input_tokens_seen": 49126420, "step": 2290, "time_per_iteration": 4.260874509811401 }, { "auxiliary_loss_clip": 0.01157504, "auxiliary_loss_mlp": 0.01023131, "balance_loss_clip": 1.05454397, "balance_loss_mlp": 1.01430392, "epoch": 0.2754764624541574, "flos": 27338972785920.0, "grad_norm": 2.0528263521778687, "language_loss": 0.82945174, "learning_rate": 3.4006543850374616e-06, "loss": 0.85125804, "num_input_tokens_seen": 49141470, "step": 2291, "time_per_iteration": 2.617689371109009 }, { "auxiliary_loss_clip": 0.01189104, "auxiliary_loss_mlp": 0.01033238, "balance_loss_clip": 1.05748844, "balance_loss_mlp": 1.02490497, "epoch": 0.27559670534479647, "flos": 17238379397760.0, "grad_norm": 2.0068094901254887, "language_loss": 0.74810296, "learning_rate": 3.400098231726458e-06, "loss": 0.77032638, "num_input_tokens_seen": 49158570, "step": 2292, "time_per_iteration": 2.4912991523742676 }, { "auxiliary_loss_clip": 0.01162393, "auxiliary_loss_mlp": 0.01036679, "balance_loss_clip": 1.0534339, "balance_loss_mlp": 1.02729678, "epoch": 0.2757169482354356, "flos": 21939085486080.0, "grad_norm": 1.8171196006707633, "language_loss": 0.87356663, "learning_rate": 3.3995418660216657e-06, "loss": 0.89555728, "num_input_tokens_seen": 49176025, "step": 2293, "time_per_iteration": 2.6027116775512695 }, { "auxiliary_loss_clip": 0.01208866, "auxiliary_loss_mlp": 0.01034035, "balance_loss_clip": 1.06288445, "balance_loss_mlp": 1.02504635, "epoch": 0.2758371911260747, "flos": 20850669521280.0, "grad_norm": 2.4482372769351284, "language_loss": 0.80623233, "learning_rate": 3.3989852880074848e-06, "loss": 0.82866138, "num_input_tokens_seen": 49197455, "step": 2294, "time_per_iteration": 2.487154245376587 }, { "auxiliary_loss_clip": 0.01089437, "auxiliary_loss_mlp": 0.01003293, "balance_loss_clip": 1.0398109, "balance_loss_mlp": 1.00113487, "epoch": 0.27595743401671374, "flos": 69269063592960.0, "grad_norm": 0.7411011988427074, "language_loss": 0.60670304, "learning_rate": 3.398428497768348e-06, "loss": 0.62763035, "num_input_tokens_seen": 49262625, "step": 2295, "time_per_iteration": 3.242403268814087 }, { "auxiliary_loss_clip": 0.01168106, "auxiliary_loss_mlp": 0.01028707, "balance_loss_clip": 1.05554807, "balance_loss_mlp": 1.01948631, "epoch": 0.27607767690735285, "flos": 21215019127680.0, "grad_norm": 1.7996103019163994, "language_loss": 0.71940064, "learning_rate": 3.3978714953887205e-06, "loss": 0.74136877, "num_input_tokens_seen": 49282380, "step": 2296, "time_per_iteration": 2.6048669815063477 }, { "auxiliary_loss_clip": 0.01131485, "auxiliary_loss_mlp": 0.01029429, "balance_loss_clip": 1.04829657, "balance_loss_mlp": 1.0204345, "epoch": 0.27619791979799196, "flos": 24825334003200.0, "grad_norm": 1.9130129589227036, "language_loss": 0.85990274, "learning_rate": 3.397314280953098e-06, "loss": 0.88151181, "num_input_tokens_seen": 49303205, "step": 2297, "time_per_iteration": 2.638890027999878 }, { "auxiliary_loss_clip": 0.01164224, "auxiliary_loss_mlp": 0.0102636, "balance_loss_clip": 1.05412889, "balance_loss_mlp": 1.01782489, "epoch": 0.276318162688631, "flos": 24753548672640.0, "grad_norm": 3.319790641089275, "language_loss": 0.8011657, "learning_rate": 3.3967568545460108e-06, "loss": 0.8230716, "num_input_tokens_seen": 49322745, "step": 2298, "time_per_iteration": 2.556432008743286 }, { "auxiliary_loss_clip": 0.01187012, "auxiliary_loss_mlp": 0.01032504, "balance_loss_clip": 1.06077337, "balance_loss_mlp": 1.02340794, "epoch": 0.27643840557927013, "flos": 18150007599360.0, "grad_norm": 2.2343643399360884, "language_loss": 0.80108303, "learning_rate": 3.3961992162520185e-06, "loss": 0.82327819, "num_input_tokens_seen": 49341370, "step": 2299, "time_per_iteration": 2.5123610496520996 }, { "auxiliary_loss_clip": 0.01188966, "auxiliary_loss_mlp": 0.01031562, "balance_loss_clip": 1.05956244, "balance_loss_mlp": 1.02217984, "epoch": 0.27655864846990924, "flos": 24823933372800.0, "grad_norm": 2.319248446740071, "language_loss": 0.71698606, "learning_rate": 3.3956413661557156e-06, "loss": 0.73919129, "num_input_tokens_seen": 49361545, "step": 2300, "time_per_iteration": 2.5811564922332764 }, { "auxiliary_loss_clip": 0.01165889, "auxiliary_loss_mlp": 0.0103373, "balance_loss_clip": 1.05479372, "balance_loss_mlp": 1.02438974, "epoch": 0.2766788913605483, "flos": 20266582464000.0, "grad_norm": 2.245048520138294, "language_loss": 0.66343451, "learning_rate": 3.3950833043417273e-06, "loss": 0.68543071, "num_input_tokens_seen": 49379690, "step": 2301, "time_per_iteration": 2.6058032512664795 }, { "auxiliary_loss_clip": 0.01192258, "auxiliary_loss_mlp": 0.01028703, "balance_loss_clip": 1.06261659, "balance_loss_mlp": 1.0188148, "epoch": 0.2767991342511874, "flos": 21470272151040.0, "grad_norm": 2.1459483031713105, "language_loss": 0.72972083, "learning_rate": 3.3945250308947105e-06, "loss": 0.75193042, "num_input_tokens_seen": 49395995, "step": 2302, "time_per_iteration": 2.5117101669311523 }, { "auxiliary_loss_clip": 0.01100616, "auxiliary_loss_mlp": 0.01010573, "balance_loss_clip": 1.03200841, "balance_loss_mlp": 1.00903499, "epoch": 0.2769193771418265, "flos": 66002627571840.0, "grad_norm": 1.2516432673174016, "language_loss": 0.68342334, "learning_rate": 3.3939665458993556e-06, "loss": 0.70453525, "num_input_tokens_seen": 49450415, "step": 2303, "time_per_iteration": 3.0422446727752686 }, { "auxiliary_loss_clip": 0.01163331, "auxiliary_loss_mlp": 0.00764806, "balance_loss_clip": 1.05333459, "balance_loss_mlp": 1.00124931, "epoch": 0.27703962003246557, "flos": 20704441253760.0, "grad_norm": 1.8284182175453665, "language_loss": 0.76726037, "learning_rate": 3.3934078494403843e-06, "loss": 0.7865417, "num_input_tokens_seen": 49469990, "step": 2304, "time_per_iteration": 2.6103415489196777 }, { "auxiliary_loss_clip": 0.01109891, "auxiliary_loss_mlp": 0.01043208, "balance_loss_clip": 1.04834855, "balance_loss_mlp": 1.03272331, "epoch": 0.2771598629231047, "flos": 22929897219840.0, "grad_norm": 2.4626805714158357, "language_loss": 0.81398809, "learning_rate": 3.3928489416025495e-06, "loss": 0.83551908, "num_input_tokens_seen": 49490835, "step": 2305, "time_per_iteration": 2.688297748565674 }, { "auxiliary_loss_clip": 0.01171536, "auxiliary_loss_mlp": 0.0104041, "balance_loss_clip": 1.05652678, "balance_loss_mlp": 1.03034282, "epoch": 0.27728010581374374, "flos": 18369457741440.0, "grad_norm": 2.490454316898515, "language_loss": 0.78673679, "learning_rate": 3.392289822470638e-06, "loss": 0.80885625, "num_input_tokens_seen": 49508815, "step": 2306, "time_per_iteration": 2.5390889644622803 }, { "auxiliary_loss_clip": 0.01169412, "auxiliary_loss_mlp": 0.0102802, "balance_loss_clip": 1.05469537, "balance_loss_mlp": 1.01853752, "epoch": 0.27740034870438285, "flos": 19427637432960.0, "grad_norm": 1.9863388544497083, "language_loss": 0.75672752, "learning_rate": 3.3917304921294674e-06, "loss": 0.77870184, "num_input_tokens_seen": 49526980, "step": 2307, "time_per_iteration": 2.6404366493225098 }, { "auxiliary_loss_clip": 0.0118897, "auxiliary_loss_mlp": 0.01033904, "balance_loss_clip": 1.05825448, "balance_loss_mlp": 1.02432537, "epoch": 0.27752059159502196, "flos": 21614776565760.0, "grad_norm": 1.6643399870341387, "language_loss": 0.8058055, "learning_rate": 3.3911709506638876e-06, "loss": 0.82803428, "num_input_tokens_seen": 49546290, "step": 2308, "time_per_iteration": 2.532336711883545 }, { "auxiliary_loss_clip": 0.01147685, "auxiliary_loss_mlp": 0.00764965, "balance_loss_clip": 1.0495584, "balance_loss_mlp": 1.00126362, "epoch": 0.277640834485661, "flos": 26608011016320.0, "grad_norm": 2.2848239249049964, "language_loss": 0.81060874, "learning_rate": 3.390611198158781e-06, "loss": 0.82973528, "num_input_tokens_seen": 49564165, "step": 2309, "time_per_iteration": 2.6282243728637695 }, { "auxiliary_loss_clip": 0.01207549, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.06244457, "balance_loss_mlp": 1.02489567, "epoch": 0.2777610773763001, "flos": 19492814661120.0, "grad_norm": 2.154266483990461, "language_loss": 0.89846921, "learning_rate": 3.3900512346990612e-06, "loss": 0.92088485, "num_input_tokens_seen": 49580155, "step": 2310, "time_per_iteration": 2.4862213134765625 }, { "auxiliary_loss_clip": 0.01145768, "auxiliary_loss_mlp": 0.01036141, "balance_loss_clip": 1.04983544, "balance_loss_mlp": 1.02585864, "epoch": 0.27788132026693924, "flos": 38290650001920.0, "grad_norm": 1.9653351240014445, "language_loss": 0.65747881, "learning_rate": 3.389491060369674e-06, "loss": 0.67929792, "num_input_tokens_seen": 49605830, "step": 2311, "time_per_iteration": 3.580845832824707 }, { "auxiliary_loss_clip": 0.01135299, "auxiliary_loss_mlp": 0.01026795, "balance_loss_clip": 1.05042803, "balance_loss_mlp": 1.01775265, "epoch": 0.2780015631575783, "flos": 22382546797440.0, "grad_norm": 1.975646644909417, "language_loss": 0.8890394, "learning_rate": 3.388930675255598e-06, "loss": 0.91066039, "num_input_tokens_seen": 49625680, "step": 2312, "time_per_iteration": 2.6129791736602783 }, { "auxiliary_loss_clip": 0.01180119, "auxiliary_loss_mlp": 0.01033888, "balance_loss_clip": 1.05840492, "balance_loss_mlp": 1.02352226, "epoch": 0.2781218060482174, "flos": 12203200840320.0, "grad_norm": 2.493920053750425, "language_loss": 0.79525542, "learning_rate": 3.388370079441843e-06, "loss": 0.81739551, "num_input_tokens_seen": 49641195, "step": 2313, "time_per_iteration": 2.551487684249878 }, { "auxiliary_loss_clip": 0.01162172, "auxiliary_loss_mlp": 0.0103543, "balance_loss_clip": 1.05983114, "balance_loss_mlp": 1.02633989, "epoch": 0.2782420489388565, "flos": 18107632529280.0, "grad_norm": 2.2083455780340984, "language_loss": 0.92839342, "learning_rate": 3.3878092730134505e-06, "loss": 0.95036948, "num_input_tokens_seen": 49659180, "step": 2314, "time_per_iteration": 2.5407063961029053 }, { "auxiliary_loss_clip": 0.01180894, "auxiliary_loss_mlp": 0.01035914, "balance_loss_clip": 1.05730247, "balance_loss_mlp": 1.02655625, "epoch": 0.27836229182949557, "flos": 18514752255360.0, "grad_norm": 2.309529374740936, "language_loss": 0.80695623, "learning_rate": 3.3872482560554947e-06, "loss": 0.82912433, "num_input_tokens_seen": 49677955, "step": 2315, "time_per_iteration": 2.4814672470092773 }, { "auxiliary_loss_clip": 0.0109702, "auxiliary_loss_mlp": 0.01001625, "balance_loss_clip": 1.02963758, "balance_loss_mlp": 1.0001471, "epoch": 0.2784825347201347, "flos": 67079230940160.0, "grad_norm": 0.7948957031337394, "language_loss": 0.57035553, "learning_rate": 3.386687028653082e-06, "loss": 0.59134197, "num_input_tokens_seen": 49740800, "step": 2316, "time_per_iteration": 4.028663396835327 }, { "auxiliary_loss_clip": 0.01146956, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.05576205, "balance_loss_mlp": 1.02076316, "epoch": 0.2786027776107738, "flos": 22631119891200.0, "grad_norm": 1.9196492220412786, "language_loss": 0.84998566, "learning_rate": 3.386125590891349e-06, "loss": 0.8717587, "num_input_tokens_seen": 49757675, "step": 2317, "time_per_iteration": 3.359004259109497 }, { "auxiliary_loss_clip": 0.01161774, "auxiliary_loss_mlp": 0.01028775, "balance_loss_clip": 1.05310535, "balance_loss_mlp": 1.01991177, "epoch": 0.27872302050141284, "flos": 15778826156160.0, "grad_norm": 2.513360774218176, "language_loss": 0.82911432, "learning_rate": 3.3855639428554657e-06, "loss": 0.85101986, "num_input_tokens_seen": 49775205, "step": 2318, "time_per_iteration": 2.5261754989624023 }, { "auxiliary_loss_clip": 0.01148919, "auxiliary_loss_mlp": 0.01028438, "balance_loss_clip": 1.05549622, "balance_loss_mlp": 1.01964068, "epoch": 0.27884326339205195, "flos": 22126970551680.0, "grad_norm": 1.8332929501310893, "language_loss": 0.80668819, "learning_rate": 3.385002084630635e-06, "loss": 0.82846177, "num_input_tokens_seen": 49794175, "step": 2319, "time_per_iteration": 2.569084405899048 }, { "auxiliary_loss_clip": 0.01196523, "auxiliary_loss_mlp": 0.01034708, "balance_loss_clip": 1.06220078, "balance_loss_mlp": 1.02451563, "epoch": 0.278963506282691, "flos": 20558715776640.0, "grad_norm": 2.149297710017262, "language_loss": 0.85028422, "learning_rate": 3.384440016302088e-06, "loss": 0.8725965, "num_input_tokens_seen": 49812850, "step": 2320, "time_per_iteration": 2.513162136077881 }, { "auxiliary_loss_clip": 0.01185125, "auxiliary_loss_mlp": 0.01034679, "balance_loss_clip": 1.05859601, "balance_loss_mlp": 1.0251838, "epoch": 0.2790837491733301, "flos": 21942928241280.0, "grad_norm": 2.5064229707667804, "language_loss": 0.62424409, "learning_rate": 3.3838777379550923e-06, "loss": 0.64644217, "num_input_tokens_seen": 49832295, "step": 2321, "time_per_iteration": 2.5065596103668213 }, { "auxiliary_loss_clip": 0.01179955, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 1.05970931, "balance_loss_mlp": 1.02632487, "epoch": 0.27920399206396923, "flos": 26286790665600.0, "grad_norm": 2.065703189611674, "language_loss": 0.78120005, "learning_rate": 3.383315249674944e-06, "loss": 0.80335611, "num_input_tokens_seen": 49850860, "step": 2322, "time_per_iteration": 2.5916476249694824 }, { "auxiliary_loss_clip": 0.01162897, "auxiliary_loss_mlp": 0.0103328, "balance_loss_clip": 1.05666399, "balance_loss_mlp": 1.0240953, "epoch": 0.2793242349546083, "flos": 25400981364480.0, "grad_norm": 2.454702743351978, "language_loss": 0.86042655, "learning_rate": 3.3827525515469715e-06, "loss": 0.88238835, "num_input_tokens_seen": 49865765, "step": 2323, "time_per_iteration": 2.6412618160247803 }, { "auxiliary_loss_clip": 0.01149503, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.05057168, "balance_loss_mlp": 1.03035247, "epoch": 0.2794444778452474, "flos": 20850346298880.0, "grad_norm": 2.6376881586176335, "language_loss": 0.71126264, "learning_rate": 3.3821896436565367e-06, "loss": 0.73316348, "num_input_tokens_seen": 49885425, "step": 2324, "time_per_iteration": 2.594083547592163 }, { "auxiliary_loss_clip": 0.01193476, "auxiliary_loss_mlp": 0.0103658, "balance_loss_clip": 1.06444263, "balance_loss_mlp": 1.0272342, "epoch": 0.2795647207358865, "flos": 21576244250880.0, "grad_norm": 1.7008364518968293, "language_loss": 0.7003355, "learning_rate": 3.381626526089032e-06, "loss": 0.72263604, "num_input_tokens_seen": 49904990, "step": 2325, "time_per_iteration": 2.5412697792053223 }, { "auxiliary_loss_clip": 0.01172224, "auxiliary_loss_mlp": 0.01029762, "balance_loss_clip": 1.0559392, "balance_loss_mlp": 1.020064, "epoch": 0.27968496362652556, "flos": 21471744608640.0, "grad_norm": 1.9893451231395027, "language_loss": 0.78937197, "learning_rate": 3.3810631989298815e-06, "loss": 0.81139183, "num_input_tokens_seen": 49924600, "step": 2326, "time_per_iteration": 2.608919858932495 }, { "auxiliary_loss_clip": 0.01156335, "auxiliary_loss_mlp": 0.0103312, "balance_loss_clip": 1.06092358, "balance_loss_mlp": 1.02234364, "epoch": 0.2798052065171647, "flos": 23258695340160.0, "grad_norm": 2.3114567806177178, "language_loss": 0.84328163, "learning_rate": 3.3804996622645423e-06, "loss": 0.86517608, "num_input_tokens_seen": 49942600, "step": 2327, "time_per_iteration": 2.665694236755371 }, { "auxiliary_loss_clip": 0.01205958, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.06268311, "balance_loss_mlp": 1.02151835, "epoch": 0.2799254494078038, "flos": 21539328048000.0, "grad_norm": 2.898145915437923, "language_loss": 0.89373565, "learning_rate": 3.3799359161785015e-06, "loss": 0.91610086, "num_input_tokens_seen": 49962250, "step": 2328, "time_per_iteration": 2.510000467300415 }, { "auxiliary_loss_clip": 0.0118616, "auxiliary_loss_mlp": 0.01034802, "balance_loss_clip": 1.05907881, "balance_loss_mlp": 1.02512813, "epoch": 0.28004569229844284, "flos": 26393912000640.0, "grad_norm": 1.6260675827277233, "language_loss": 0.85655361, "learning_rate": 3.3793719607572798e-06, "loss": 0.8787632, "num_input_tokens_seen": 49983215, "step": 2329, "time_per_iteration": 2.5816659927368164 }, { "auxiliary_loss_clip": 0.01157664, "auxiliary_loss_mlp": 0.0102981, "balance_loss_clip": 1.05366766, "balance_loss_mlp": 1.02076852, "epoch": 0.28016593518908195, "flos": 33547676584320.0, "grad_norm": 2.100938306879155, "language_loss": 0.77496648, "learning_rate": 3.378807796086428e-06, "loss": 0.79684114, "num_input_tokens_seen": 50006075, "step": 2330, "time_per_iteration": 2.6494498252868652 }, { "auxiliary_loss_clip": 0.01209438, "auxiliary_loss_mlp": 0.01031569, "balance_loss_clip": 1.06612802, "balance_loss_mlp": 1.02177012, "epoch": 0.28028617807972106, "flos": 15340823712000.0, "grad_norm": 1.9293112768998422, "language_loss": 0.77006054, "learning_rate": 3.37824342225153e-06, "loss": 0.79247063, "num_input_tokens_seen": 50022495, "step": 2331, "time_per_iteration": 2.436617136001587 }, { "auxiliary_loss_clip": 0.01148122, "auxiliary_loss_mlp": 0.01033854, "balance_loss_clip": 1.05807436, "balance_loss_mlp": 1.02456784, "epoch": 0.2804064209703601, "flos": 25520277409920.0, "grad_norm": 1.7922196358332234, "language_loss": 0.77682823, "learning_rate": 3.3776788393382006e-06, "loss": 0.798648, "num_input_tokens_seen": 50041975, "step": 2332, "time_per_iteration": 2.6401240825653076 }, { "auxiliary_loss_clip": 0.01206985, "auxiliary_loss_mlp": 0.0103003, "balance_loss_clip": 1.06424546, "balance_loss_mlp": 1.0206244, "epoch": 0.2805266638609992, "flos": 29351766280320.0, "grad_norm": 2.061127767207515, "language_loss": 0.76400775, "learning_rate": 3.3771140474320872e-06, "loss": 0.78637791, "num_input_tokens_seen": 50061925, "step": 2333, "time_per_iteration": 2.5431337356567383 }, { "auxiliary_loss_clip": 0.01169427, "auxiliary_loss_mlp": 0.01036684, "balance_loss_clip": 1.0593071, "balance_loss_mlp": 1.02760649, "epoch": 0.28064690675163834, "flos": 21463735875840.0, "grad_norm": 2.161381305127918, "language_loss": 0.79593223, "learning_rate": 3.3765490466188664e-06, "loss": 0.81799334, "num_input_tokens_seen": 50079325, "step": 2334, "time_per_iteration": 2.5762736797332764 }, { "auxiliary_loss_clip": 0.01159924, "auxiliary_loss_mlp": 0.01029951, "balance_loss_clip": 1.05718207, "balance_loss_mlp": 1.02011597, "epoch": 0.2807671496422774, "flos": 20995640812800.0, "grad_norm": 2.473230499343083, "language_loss": 0.7369501, "learning_rate": 3.3759838369842508e-06, "loss": 0.75884885, "num_input_tokens_seen": 50097400, "step": 2335, "time_per_iteration": 2.566033363342285 }, { "auxiliary_loss_clip": 0.0116503, "auxiliary_loss_mlp": 0.01031918, "balance_loss_clip": 1.06084752, "balance_loss_mlp": 1.02295387, "epoch": 0.2808873925329165, "flos": 21506577822720.0, "grad_norm": 2.2176781950601927, "language_loss": 0.72732151, "learning_rate": 3.375418418613981e-06, "loss": 0.749291, "num_input_tokens_seen": 50116425, "step": 2336, "time_per_iteration": 2.559135675430298 }, { "auxiliary_loss_clip": 0.01176915, "auxiliary_loss_mlp": 0.01034773, "balance_loss_clip": 1.06053233, "balance_loss_mlp": 1.02447891, "epoch": 0.28100763542355556, "flos": 16070815814400.0, "grad_norm": 3.5260090584103305, "language_loss": 0.83013093, "learning_rate": 3.374852791593831e-06, "loss": 0.85224783, "num_input_tokens_seen": 50132625, "step": 2337, "time_per_iteration": 3.3774821758270264 }, { "auxiliary_loss_clip": 0.01156944, "auxiliary_loss_mlp": 0.01033555, "balance_loss_clip": 1.05524826, "balance_loss_mlp": 1.02388132, "epoch": 0.28112787831419467, "flos": 19062605468160.0, "grad_norm": 3.0706785236155483, "language_loss": 0.53758192, "learning_rate": 3.374286956009605e-06, "loss": 0.55948687, "num_input_tokens_seen": 50151190, "step": 2338, "time_per_iteration": 2.5810139179229736 }, { "auxiliary_loss_clip": 0.01192565, "auxiliary_loss_mlp": 0.01030757, "balance_loss_clip": 1.06559896, "balance_loss_mlp": 1.02130938, "epoch": 0.2812481212048338, "flos": 12823629482880.0, "grad_norm": 2.024522059228888, "language_loss": 0.75487411, "learning_rate": 3.3737209119471405e-06, "loss": 0.77710736, "num_input_tokens_seen": 50167700, "step": 2339, "time_per_iteration": 2.5333142280578613 }, { "auxiliary_loss_clip": 0.01198482, "auxiliary_loss_mlp": 0.0103058, "balance_loss_clip": 1.06378555, "balance_loss_mlp": 1.02078724, "epoch": 0.28136836409547283, "flos": 15633064765440.0, "grad_norm": 3.3468805786252953, "language_loss": 0.63466692, "learning_rate": 3.373154659492306e-06, "loss": 0.65695763, "num_input_tokens_seen": 50185840, "step": 2340, "time_per_iteration": 2.509500741958618 }, { "auxiliary_loss_clip": 0.01178868, "auxiliary_loss_mlp": 0.01045446, "balance_loss_clip": 1.060274, "balance_loss_mlp": 1.03621376, "epoch": 0.28148860698611194, "flos": 19933726106880.0, "grad_norm": 1.8242314159819109, "language_loss": 0.85186571, "learning_rate": 3.3725881987310016e-06, "loss": 0.87410885, "num_input_tokens_seen": 50203375, "step": 2341, "time_per_iteration": 2.53665828704834 }, { "auxiliary_loss_clip": 0.01173563, "auxiliary_loss_mlp": 0.01033675, "balance_loss_clip": 1.05899489, "balance_loss_mlp": 1.02503204, "epoch": 0.28160884987675106, "flos": 17457219008640.0, "grad_norm": 1.958764281288809, "language_loss": 0.87675655, "learning_rate": 3.372021529749159e-06, "loss": 0.89882898, "num_input_tokens_seen": 50222435, "step": 2342, "time_per_iteration": 3.409428358078003 }, { "auxiliary_loss_clip": 0.01133496, "auxiliary_loss_mlp": 0.01033349, "balance_loss_clip": 1.05624151, "balance_loss_mlp": 1.02449799, "epoch": 0.2817290927673901, "flos": 16834743290880.0, "grad_norm": 1.856778383664567, "language_loss": 0.92201102, "learning_rate": 3.3714546526327405e-06, "loss": 0.94367945, "num_input_tokens_seen": 50240435, "step": 2343, "time_per_iteration": 3.4450154304504395 }, { "auxiliary_loss_clip": 0.01166647, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.05773675, "balance_loss_mlp": 1.02154708, "epoch": 0.2818493356580292, "flos": 15414081500160.0, "grad_norm": 1.9954686708102507, "language_loss": 0.87517333, "learning_rate": 3.3708875674677423e-06, "loss": 0.89715403, "num_input_tokens_seen": 50258410, "step": 2344, "time_per_iteration": 3.3381736278533936 }, { "auxiliary_loss_clip": 0.01186885, "auxiliary_loss_mlp": 0.0103178, "balance_loss_clip": 1.06462586, "balance_loss_mlp": 1.02197528, "epoch": 0.28196957854866833, "flos": 20412451595520.0, "grad_norm": 1.9092639907027233, "language_loss": 0.83694327, "learning_rate": 3.37032027434019e-06, "loss": 0.85912991, "num_input_tokens_seen": 50277930, "step": 2345, "time_per_iteration": 2.535391330718994 }, { "auxiliary_loss_clip": 0.01202432, "auxiliary_loss_mlp": 0.01031873, "balance_loss_clip": 1.06387603, "balance_loss_mlp": 1.02106702, "epoch": 0.2820898214393074, "flos": 19973120348160.0, "grad_norm": 5.171048350433416, "language_loss": 0.82612407, "learning_rate": 3.369752773336141e-06, "loss": 0.84846711, "num_input_tokens_seen": 50297410, "step": 2346, "time_per_iteration": 2.5055010318756104 }, { "auxiliary_loss_clip": 0.01173886, "auxiliary_loss_mlp": 0.01034297, "balance_loss_clip": 1.05696893, "balance_loss_mlp": 1.02402127, "epoch": 0.2822100643299465, "flos": 22528308188160.0, "grad_norm": 1.6579476078028912, "language_loss": 0.77877158, "learning_rate": 3.3691850645416864e-06, "loss": 0.80085343, "num_input_tokens_seen": 50317120, "step": 2347, "time_per_iteration": 2.5489501953125 }, { "auxiliary_loss_clip": 0.01197893, "auxiliary_loss_mlp": 0.01039699, "balance_loss_clip": 1.06306696, "balance_loss_mlp": 1.02986479, "epoch": 0.2823303072205856, "flos": 11546682007680.0, "grad_norm": 1.882846620029653, "language_loss": 0.8282944, "learning_rate": 3.368617148042945e-06, "loss": 0.85067034, "num_input_tokens_seen": 50334790, "step": 2348, "time_per_iteration": 2.505981683731079 }, { "auxiliary_loss_clip": 0.01170172, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.05508637, "balance_loss_mlp": 1.0265466, "epoch": 0.28245055011122466, "flos": 18259894281600.0, "grad_norm": 1.8691618871655928, "language_loss": 0.8425113, "learning_rate": 3.368049023926071e-06, "loss": 0.86458147, "num_input_tokens_seen": 50353785, "step": 2349, "time_per_iteration": 2.526679039001465 }, { "auxiliary_loss_clip": 0.011914, "auxiliary_loss_mlp": 0.0103685, "balance_loss_clip": 1.06355131, "balance_loss_mlp": 1.0282495, "epoch": 0.2825707930018638, "flos": 24608110504320.0, "grad_norm": 1.595431489534568, "language_loss": 0.83579248, "learning_rate": 3.3674806922772476e-06, "loss": 0.85807496, "num_input_tokens_seen": 50374670, "step": 2350, "time_per_iteration": 2.624443531036377 }, { "auxiliary_loss_clip": 0.01170053, "auxiliary_loss_mlp": 0.01040415, "balance_loss_clip": 1.05922985, "balance_loss_mlp": 1.03089666, "epoch": 0.28269103589250283, "flos": 25226994862080.0, "grad_norm": 2.136161510632417, "language_loss": 0.74909902, "learning_rate": 3.3669121531826904e-06, "loss": 0.77120376, "num_input_tokens_seen": 50395650, "step": 2351, "time_per_iteration": 2.659956216812134 }, { "auxiliary_loss_clip": 0.01160098, "auxiliary_loss_mlp": 0.01029089, "balance_loss_clip": 1.06141984, "balance_loss_mlp": 1.02030897, "epoch": 0.28281127878314194, "flos": 19281552819840.0, "grad_norm": 2.173373283419757, "language_loss": 0.83295619, "learning_rate": 3.366343406728647e-06, "loss": 0.85484803, "num_input_tokens_seen": 50415100, "step": 2352, "time_per_iteration": 2.5878121852874756 }, { "auxiliary_loss_clip": 0.01183629, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.05737889, "balance_loss_mlp": 1.02094913, "epoch": 0.28293152167378105, "flos": 23878405710720.0, "grad_norm": 1.707425677985109, "language_loss": 0.68638068, "learning_rate": 3.3657744530013946e-06, "loss": 0.70851934, "num_input_tokens_seen": 50434335, "step": 2353, "time_per_iteration": 2.5580530166625977 }, { "auxiliary_loss_clip": 0.01195874, "auxiliary_loss_mlp": 0.01030115, "balance_loss_clip": 1.06339359, "balance_loss_mlp": 1.02051306, "epoch": 0.2830517645644201, "flos": 43866965928960.0, "grad_norm": 2.0762422535772638, "language_loss": 0.71071017, "learning_rate": 3.3652052920872437e-06, "loss": 0.73297, "num_input_tokens_seen": 50457200, "step": 2354, "time_per_iteration": 2.7134580612182617 }, { "auxiliary_loss_clip": 0.01179273, "auxiliary_loss_mlp": 0.01037555, "balance_loss_clip": 1.05931056, "balance_loss_mlp": 1.02764273, "epoch": 0.2831720074550592, "flos": 26651750803200.0, "grad_norm": 1.8624680207898485, "language_loss": 0.85815382, "learning_rate": 3.3646359240725355e-06, "loss": 0.8803221, "num_input_tokens_seen": 50476390, "step": 2355, "time_per_iteration": 2.614729642868042 }, { "auxiliary_loss_clip": 0.01185177, "auxiliary_loss_mlp": 0.00764724, "balance_loss_clip": 1.06031001, "balance_loss_mlp": 1.00082004, "epoch": 0.2832922503456983, "flos": 31029979564800.0, "grad_norm": 2.694359359149424, "language_loss": 0.67484367, "learning_rate": 3.364066349043643e-06, "loss": 0.69434267, "num_input_tokens_seen": 50497595, "step": 2356, "time_per_iteration": 2.6167221069335938 }, { "auxiliary_loss_clip": 0.01175585, "auxiliary_loss_mlp": 0.01032289, "balance_loss_clip": 1.06018806, "balance_loss_mlp": 1.02372348, "epoch": 0.2834124932363374, "flos": 20405699838720.0, "grad_norm": 1.9004247575826367, "language_loss": 0.82089382, "learning_rate": 3.363496567086969e-06, "loss": 0.84297258, "num_input_tokens_seen": 50514690, "step": 2357, "time_per_iteration": 2.53696870803833 }, { "auxiliary_loss_clip": 0.01206444, "auxiliary_loss_mlp": 0.01030636, "balance_loss_clip": 1.06446064, "balance_loss_mlp": 1.02153432, "epoch": 0.2835327361269765, "flos": 39384848056320.0, "grad_norm": 1.9190401425507195, "language_loss": 0.75401032, "learning_rate": 3.3629265782889506e-06, "loss": 0.77638113, "num_input_tokens_seen": 50536515, "step": 2358, "time_per_iteration": 2.6603875160217285 }, { "auxiliary_loss_clip": 0.01157554, "auxiliary_loss_mlp": 0.01033714, "balance_loss_clip": 1.055071, "balance_loss_mlp": 1.02403402, "epoch": 0.2836529790176156, "flos": 30261598801920.0, "grad_norm": 2.042476570994474, "language_loss": 0.72166002, "learning_rate": 3.362356382736054e-06, "loss": 0.74357271, "num_input_tokens_seen": 50557120, "step": 2359, "time_per_iteration": 2.6420087814331055 }, { "auxiliary_loss_clip": 0.0116007, "auxiliary_loss_mlp": 0.01023801, "balance_loss_clip": 1.05396056, "balance_loss_mlp": 1.01566529, "epoch": 0.28377322190825466, "flos": 12677796264960.0, "grad_norm": 1.9568675531477338, "language_loss": 0.90907407, "learning_rate": 3.361785980514777e-06, "loss": 0.93091279, "num_input_tokens_seen": 50573320, "step": 2360, "time_per_iteration": 2.6443564891815186 }, { "auxiliary_loss_clip": 0.01128404, "auxiliary_loss_mlp": 0.01035024, "balance_loss_clip": 1.05643725, "balance_loss_mlp": 1.02615535, "epoch": 0.28389346479889377, "flos": 18296666830080.0, "grad_norm": 1.9874025011676186, "language_loss": 0.76633704, "learning_rate": 3.361215371711649e-06, "loss": 0.78797132, "num_input_tokens_seen": 50592415, "step": 2361, "time_per_iteration": 2.6237235069274902 }, { "auxiliary_loss_clip": 0.01155492, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.05587792, "balance_loss_mlp": 1.02279377, "epoch": 0.2840137076895329, "flos": 20406992728320.0, "grad_norm": 1.9902492788309643, "language_loss": 0.83466303, "learning_rate": 3.3606445564132326e-06, "loss": 0.8565309, "num_input_tokens_seen": 50609710, "step": 2362, "time_per_iteration": 2.636603832244873 }, { "auxiliary_loss_clip": 0.01208778, "auxiliary_loss_mlp": 0.00764205, "balance_loss_clip": 1.06684673, "balance_loss_mlp": 1.00074553, "epoch": 0.28413395058017193, "flos": 20048030161920.0, "grad_norm": 1.960051778617603, "language_loss": 0.82106245, "learning_rate": 3.360073534706118e-06, "loss": 0.8407923, "num_input_tokens_seen": 50626865, "step": 2363, "time_per_iteration": 2.5006539821624756 }, { "auxiliary_loss_clip": 0.0117748, "auxiliary_loss_mlp": 0.01027523, "balance_loss_clip": 1.05975437, "balance_loss_mlp": 1.01862371, "epoch": 0.28425419347081105, "flos": 37663613256960.0, "grad_norm": 2.5619835828188657, "language_loss": 0.75949162, "learning_rate": 3.35950230667693e-06, "loss": 0.78154165, "num_input_tokens_seen": 50648560, "step": 2364, "time_per_iteration": 3.575005292892456 }, { "auxiliary_loss_clip": 0.01193141, "auxiliary_loss_mlp": 0.01026669, "balance_loss_clip": 1.06227612, "balance_loss_mlp": 1.01805031, "epoch": 0.28437443636145016, "flos": 13845072539520.0, "grad_norm": 2.300519316214218, "language_loss": 0.85227096, "learning_rate": 3.358930872412323e-06, "loss": 0.87446904, "num_input_tokens_seen": 50665725, "step": 2365, "time_per_iteration": 2.4773616790771484 }, { "auxiliary_loss_clip": 0.0118766, "auxiliary_loss_mlp": 0.01033034, "balance_loss_clip": 1.06158233, "balance_loss_mlp": 1.02386665, "epoch": 0.2844946792520892, "flos": 22747794243840.0, "grad_norm": 1.7623387457713378, "language_loss": 0.80856657, "learning_rate": 3.3583592319989825e-06, "loss": 0.83077353, "num_input_tokens_seen": 50685095, "step": 2366, "time_per_iteration": 2.582291603088379 }, { "auxiliary_loss_clip": 0.01198795, "auxiliary_loss_mlp": 0.01037993, "balance_loss_clip": 1.06252277, "balance_loss_mlp": 1.02849197, "epoch": 0.2846149221427283, "flos": 32415987709440.0, "grad_norm": 2.1967308208184906, "language_loss": 0.68716174, "learning_rate": 3.357787385523627e-06, "loss": 0.70952964, "num_input_tokens_seen": 50706500, "step": 2367, "time_per_iteration": 2.585573196411133 }, { "auxiliary_loss_clip": 0.01141871, "auxiliary_loss_mlp": 0.01031395, "balance_loss_clip": 1.05591512, "balance_loss_mlp": 1.02290118, "epoch": 0.2847351650333674, "flos": 28475976873600.0, "grad_norm": 2.025690913299581, "language_loss": 0.82715809, "learning_rate": 3.3572153330730048e-06, "loss": 0.84889078, "num_input_tokens_seen": 50727595, "step": 2368, "time_per_iteration": 2.7367939949035645 }, { "auxiliary_loss_clip": 0.01094788, "auxiliary_loss_mlp": 0.01001877, "balance_loss_clip": 1.04459929, "balance_loss_mlp": 1.00039852, "epoch": 0.2848554079240065, "flos": 55753399704960.0, "grad_norm": 0.8242773002180156, "language_loss": 0.64665413, "learning_rate": 3.3566430747338956e-06, "loss": 0.66762078, "num_input_tokens_seen": 50782800, "step": 2369, "time_per_iteration": 3.959866762161255 }, { "auxiliary_loss_clip": 0.01192984, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.06040013, "balance_loss_mlp": 1.0227232, "epoch": 0.2849756508146456, "flos": 11836875985920.0, "grad_norm": 4.972296229661133, "language_loss": 0.86422235, "learning_rate": 3.35607061059311e-06, "loss": 0.88647127, "num_input_tokens_seen": 50797730, "step": 2370, "time_per_iteration": 3.390984296798706 }, { "auxiliary_loss_clip": 0.01201395, "auxiliary_loss_mlp": 0.01033095, "balance_loss_clip": 1.06159592, "balance_loss_mlp": 1.02454221, "epoch": 0.28509589370528465, "flos": 25155209531520.0, "grad_norm": 1.8411600341994314, "language_loss": 0.75162351, "learning_rate": 3.3554979407374917e-06, "loss": 0.77396846, "num_input_tokens_seen": 50819840, "step": 2371, "time_per_iteration": 2.599081039428711 }, { "auxiliary_loss_clip": 0.01192201, "auxiliary_loss_mlp": 0.01033268, "balance_loss_clip": 1.06195831, "balance_loss_mlp": 1.02445197, "epoch": 0.28521613659592376, "flos": 19974808287360.0, "grad_norm": 2.168944280064147, "language_loss": 0.73397607, "learning_rate": 3.3549250652539134e-06, "loss": 0.75623071, "num_input_tokens_seen": 50838935, "step": 2372, "time_per_iteration": 2.517530679702759 }, { "auxiliary_loss_clip": 0.01175333, "auxiliary_loss_mlp": 0.01032283, "balance_loss_clip": 1.05793881, "balance_loss_mlp": 1.02278793, "epoch": 0.2853363794865629, "flos": 23367971491200.0, "grad_norm": 1.8746988481138647, "language_loss": 0.81731111, "learning_rate": 3.3543519842292794e-06, "loss": 0.8393873, "num_input_tokens_seen": 50858590, "step": 2373, "time_per_iteration": 2.567857265472412 }, { "auxiliary_loss_clip": 0.01206605, "auxiliary_loss_mlp": 0.00763809, "balance_loss_clip": 1.06479335, "balance_loss_mlp": 1.0007211, "epoch": 0.28545662237720193, "flos": 19861940776320.0, "grad_norm": 1.744301002779992, "language_loss": 0.83457518, "learning_rate": 3.353778697750527e-06, "loss": 0.85427928, "num_input_tokens_seen": 50876995, "step": 2374, "time_per_iteration": 2.4623165130615234 }, { "auxiliary_loss_clip": 0.011677, "auxiliary_loss_mlp": 0.01027711, "balance_loss_clip": 1.05745029, "balance_loss_mlp": 1.01866889, "epoch": 0.28557686526784104, "flos": 23879016241920.0, "grad_norm": 1.9550237495053546, "language_loss": 0.89293051, "learning_rate": 3.353205205904622e-06, "loss": 0.91488457, "num_input_tokens_seen": 50896105, "step": 2375, "time_per_iteration": 2.658015727996826 }, { "auxiliary_loss_clip": 0.01176211, "auxiliary_loss_mlp": 0.0103181, "balance_loss_clip": 1.059587, "balance_loss_mlp": 1.02300668, "epoch": 0.28569710815848015, "flos": 44890384233600.0, "grad_norm": 2.1578884171303163, "language_loss": 0.72087818, "learning_rate": 3.3526315087785637e-06, "loss": 0.74295843, "num_input_tokens_seen": 50917220, "step": 2376, "time_per_iteration": 2.7428653240203857 }, { "auxiliary_loss_clip": 0.01125875, "auxiliary_loss_mlp": 0.01031991, "balance_loss_clip": 1.05305409, "balance_loss_mlp": 1.02302682, "epoch": 0.2858173510491192, "flos": 26829759628800.0, "grad_norm": 1.6070539904570438, "language_loss": 0.80692399, "learning_rate": 3.3520576064593805e-06, "loss": 0.82850266, "num_input_tokens_seen": 50937175, "step": 2377, "time_per_iteration": 2.633937358856201 }, { "auxiliary_loss_clip": 0.01194351, "auxiliary_loss_mlp": 0.01027429, "balance_loss_clip": 1.06182778, "balance_loss_mlp": 1.01830363, "epoch": 0.2859375939397583, "flos": 23148916398720.0, "grad_norm": 1.7961619675202265, "language_loss": 0.81772435, "learning_rate": 3.3514834990341337e-06, "loss": 0.8399421, "num_input_tokens_seen": 50957500, "step": 2378, "time_per_iteration": 2.606462240219116 }, { "auxiliary_loss_clip": 0.01184098, "auxiliary_loss_mlp": 0.0103082, "balance_loss_clip": 1.0625329, "balance_loss_mlp": 1.0225054, "epoch": 0.2860578368303974, "flos": 12129799397760.0, "grad_norm": 3.7731482053835363, "language_loss": 0.92979473, "learning_rate": 3.3509091865899144e-06, "loss": 0.95194387, "num_input_tokens_seen": 50972690, "step": 2379, "time_per_iteration": 2.6021652221679688 }, { "auxiliary_loss_clip": 0.01205216, "auxiliary_loss_mlp": 0.01032062, "balance_loss_clip": 1.06268048, "balance_loss_mlp": 1.02290678, "epoch": 0.2861780797210365, "flos": 19938035738880.0, "grad_norm": 2.329486631853123, "language_loss": 0.7042048, "learning_rate": 3.350334669213846e-06, "loss": 0.72657758, "num_input_tokens_seen": 50990095, "step": 2380, "time_per_iteration": 2.533942937850952 }, { "auxiliary_loss_clip": 0.01187849, "auxiliary_loss_mlp": 0.01033934, "balance_loss_clip": 1.06122661, "balance_loss_mlp": 1.02572083, "epoch": 0.2862983226116756, "flos": 27563127609600.0, "grad_norm": 1.9372811296714165, "language_loss": 0.75630999, "learning_rate": 3.3497599469930816e-06, "loss": 0.77852774, "num_input_tokens_seen": 51008305, "step": 2381, "time_per_iteration": 2.552480936050415 }, { "auxiliary_loss_clip": 0.0120487, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.0609355, "balance_loss_mlp": 1.02201271, "epoch": 0.28641856550231465, "flos": 22053964158720.0, "grad_norm": 2.169885444424467, "language_loss": 0.83164561, "learning_rate": 3.349185020014807e-06, "loss": 0.85400754, "num_input_tokens_seen": 51025570, "step": 2382, "time_per_iteration": 2.498779535293579 }, { "auxiliary_loss_clip": 0.01193425, "auxiliary_loss_mlp": 0.01029867, "balance_loss_clip": 1.06109154, "balance_loss_mlp": 1.02130163, "epoch": 0.28653880839295376, "flos": 22378775869440.0, "grad_norm": 2.104845382266116, "language_loss": 0.74216336, "learning_rate": 3.348609888366237e-06, "loss": 0.76439625, "num_input_tokens_seen": 51044585, "step": 2383, "time_per_iteration": 2.539055824279785 }, { "auxiliary_loss_clip": 0.01127379, "auxiliary_loss_mlp": 0.0102561, "balance_loss_clip": 1.05329919, "balance_loss_mlp": 1.01644325, "epoch": 0.28665905128359287, "flos": 23367971491200.0, "grad_norm": 2.32982882217772, "language_loss": 0.62526208, "learning_rate": 3.348034552134619e-06, "loss": 0.64679199, "num_input_tokens_seen": 51063990, "step": 2384, "time_per_iteration": 2.6518490314483643 }, { "auxiliary_loss_clip": 0.01140508, "auxiliary_loss_mlp": 0.01032827, "balance_loss_clip": 1.05743515, "balance_loss_mlp": 1.02445817, "epoch": 0.2867792941742319, "flos": 20881695893760.0, "grad_norm": 1.9825600712093907, "language_loss": 0.83897316, "learning_rate": 3.3474590114072316e-06, "loss": 0.86070657, "num_input_tokens_seen": 51081990, "step": 2385, "time_per_iteration": 2.6022582054138184 }, { "auxiliary_loss_clip": 0.0116075, "auxiliary_loss_mlp": 0.0103603, "balance_loss_clip": 1.06148148, "balance_loss_mlp": 1.02677917, "epoch": 0.28689953706487104, "flos": 20664005518080.0, "grad_norm": 2.4936825176014845, "language_loss": 0.82716608, "learning_rate": 3.3468832662713836e-06, "loss": 0.84913391, "num_input_tokens_seen": 51100235, "step": 2386, "time_per_iteration": 2.5869791507720947 }, { "auxiliary_loss_clip": 0.01155358, "auxiliary_loss_mlp": 0.01037734, "balance_loss_clip": 1.05651855, "balance_loss_mlp": 1.02873945, "epoch": 0.28701977995551015, "flos": 12675533708160.0, "grad_norm": 2.1766105322883322, "language_loss": 0.83761138, "learning_rate": 3.346307316814415e-06, "loss": 0.85954225, "num_input_tokens_seen": 51115405, "step": 2387, "time_per_iteration": 2.56998348236084 }, { "auxiliary_loss_clip": 0.0119017, "auxiliary_loss_mlp": 0.01030491, "balance_loss_clip": 1.06199944, "balance_loss_mlp": 1.02121699, "epoch": 0.2871400228461492, "flos": 21252366293760.0, "grad_norm": 2.1630717103264665, "language_loss": 0.75839806, "learning_rate": 3.3457311631236965e-06, "loss": 0.78060466, "num_input_tokens_seen": 51136390, "step": 2388, "time_per_iteration": 2.5333447456359863 }, { "auxiliary_loss_clip": 0.01163842, "auxiliary_loss_mlp": 0.01030202, "balance_loss_clip": 1.05806637, "balance_loss_mlp": 1.02139854, "epoch": 0.2872602657367883, "flos": 25119262995840.0, "grad_norm": 1.755746825594979, "language_loss": 0.84420305, "learning_rate": 3.345154805286631e-06, "loss": 0.86614358, "num_input_tokens_seen": 51156650, "step": 2389, "time_per_iteration": 2.6632096767425537 }, { "auxiliary_loss_clip": 0.01186223, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.06000936, "balance_loss_mlp": 1.02463937, "epoch": 0.2873805086274274, "flos": 16646606830080.0, "grad_norm": 2.4479971218267007, "language_loss": 0.76206875, "learning_rate": 3.344578243390651e-06, "loss": 0.78426957, "num_input_tokens_seen": 51172210, "step": 2390, "time_per_iteration": 2.51564884185791 }, { "auxiliary_loss_clip": 0.0117361, "auxiliary_loss_mlp": 0.01032658, "balance_loss_clip": 1.0606041, "balance_loss_mlp": 1.02313328, "epoch": 0.2875007515180665, "flos": 17420123237760.0, "grad_norm": 3.1989744779227776, "language_loss": 0.78559852, "learning_rate": 3.3440014775232206e-06, "loss": 0.80766124, "num_input_tokens_seen": 51190265, "step": 2391, "time_per_iteration": 3.3988842964172363 }, { "auxiliary_loss_clip": 0.01164769, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.05933332, "balance_loss_mlp": 1.02253604, "epoch": 0.2876209944087056, "flos": 23434190213760.0, "grad_norm": 2.9111860552634603, "language_loss": 0.70960832, "learning_rate": 3.343424507771834e-06, "loss": 0.73156524, "num_input_tokens_seen": 51208475, "step": 2392, "time_per_iteration": 2.6180832386016846 }, { "auxiliary_loss_clip": 0.01160356, "auxiliary_loss_mlp": 0.01028745, "balance_loss_clip": 1.05859554, "balance_loss_mlp": 1.020329, "epoch": 0.2877412372993447, "flos": 13735509079680.0, "grad_norm": 1.7787883190164946, "language_loss": 0.8678295, "learning_rate": 3.342847334224018e-06, "loss": 0.8897205, "num_input_tokens_seen": 51225875, "step": 2393, "time_per_iteration": 2.6140360832214355 }, { "auxiliary_loss_clip": 0.01110196, "auxiliary_loss_mlp": 0.01006328, "balance_loss_clip": 1.04217052, "balance_loss_mlp": 1.0049628, "epoch": 0.28786148018998375, "flos": 58079695104000.0, "grad_norm": 0.9416801896126653, "language_loss": 0.62427354, "learning_rate": 3.342269956967329e-06, "loss": 0.64543879, "num_input_tokens_seen": 51287780, "step": 2394, "time_per_iteration": 3.1465885639190674 }, { "auxiliary_loss_clip": 0.01196602, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.06365514, "balance_loss_mlp": 1.02614737, "epoch": 0.28798172308062286, "flos": 23435052140160.0, "grad_norm": 2.5055395850598643, "language_loss": 0.71856058, "learning_rate": 3.341692376089355e-06, "loss": 0.74089205, "num_input_tokens_seen": 51303335, "step": 2395, "time_per_iteration": 3.4253551959991455 }, { "auxiliary_loss_clip": 0.01189538, "auxiliary_loss_mlp": 0.01033825, "balance_loss_clip": 1.06328869, "balance_loss_mlp": 1.02490187, "epoch": 0.288101965971262, "flos": 25110033200640.0, "grad_norm": 4.208070991132828, "language_loss": 0.84362888, "learning_rate": 3.3411145916777146e-06, "loss": 0.86586249, "num_input_tokens_seen": 51317495, "step": 2396, "time_per_iteration": 3.314282178878784 }, { "auxiliary_loss_clip": 0.01168151, "auxiliary_loss_mlp": 0.01031198, "balance_loss_clip": 1.05717874, "balance_loss_mlp": 1.02188182, "epoch": 0.28822220886190103, "flos": 16252559654400.0, "grad_norm": 7.363153001333871, "language_loss": 0.90462404, "learning_rate": 3.3405366038200566e-06, "loss": 0.92661756, "num_input_tokens_seen": 51336430, "step": 2397, "time_per_iteration": 3.2972218990325928 }, { "auxiliary_loss_clip": 0.01180515, "auxiliary_loss_mlp": 0.01039798, "balance_loss_clip": 1.06512213, "balance_loss_mlp": 1.03036833, "epoch": 0.28834245175254014, "flos": 24535642815360.0, "grad_norm": 2.316113287464621, "language_loss": 0.8474431, "learning_rate": 3.3399584126040617e-06, "loss": 0.86964619, "num_input_tokens_seen": 51355930, "step": 2398, "time_per_iteration": 2.5883123874664307 }, { "auxiliary_loss_clip": 0.01205295, "auxiliary_loss_mlp": 0.00763223, "balance_loss_clip": 1.06315017, "balance_loss_mlp": 1.00055063, "epoch": 0.2884626946431792, "flos": 24571445696640.0, "grad_norm": 1.895950222794461, "language_loss": 0.90676975, "learning_rate": 3.339380018117441e-06, "loss": 0.92645496, "num_input_tokens_seen": 51376765, "step": 2399, "time_per_iteration": 2.533411979675293 }, { "auxiliary_loss_clip": 0.01188791, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 1.0625298, "balance_loss_mlp": 1.02110028, "epoch": 0.2885829375338183, "flos": 16544657053440.0, "grad_norm": 2.5877069647403452, "language_loss": 0.78251559, "learning_rate": 3.3388014204479366e-06, "loss": 0.80470169, "num_input_tokens_seen": 51394570, "step": 2400, "time_per_iteration": 2.5092945098876953 }, { "auxiliary_loss_clip": 0.01210963, "auxiliary_loss_mlp": 0.01029922, "balance_loss_clip": 1.06550336, "balance_loss_mlp": 1.02107632, "epoch": 0.2887031804244574, "flos": 24061226958720.0, "grad_norm": 2.6541472970759554, "language_loss": 0.91612792, "learning_rate": 3.338222619683321e-06, "loss": 0.9385367, "num_input_tokens_seen": 51414535, "step": 2401, "time_per_iteration": 2.5581748485565186 }, { "auxiliary_loss_clip": 0.011797, "auxiliary_loss_mlp": 0.01028987, "balance_loss_clip": 1.06140471, "balance_loss_mlp": 1.01964736, "epoch": 0.2888234233150965, "flos": 23330696152320.0, "grad_norm": 2.4906168921043914, "language_loss": 0.73581588, "learning_rate": 3.337643615911398e-06, "loss": 0.75790274, "num_input_tokens_seen": 51434160, "step": 2402, "time_per_iteration": 2.5718328952789307 }, { "auxiliary_loss_clip": 0.01192058, "auxiliary_loss_mlp": 0.01028843, "balance_loss_clip": 1.06080925, "balance_loss_mlp": 1.01890087, "epoch": 0.2889436662057356, "flos": 22272767856000.0, "grad_norm": 2.117520059593653, "language_loss": 0.7878207, "learning_rate": 3.3370644092200026e-06, "loss": 0.81002975, "num_input_tokens_seen": 51451435, "step": 2403, "time_per_iteration": 2.558042526245117 }, { "auxiliary_loss_clip": 0.01145025, "auxiliary_loss_mlp": 0.01032651, "balance_loss_clip": 1.05169439, "balance_loss_mlp": 1.02333522, "epoch": 0.2890639090963747, "flos": 21616931381760.0, "grad_norm": 1.7973179924388445, "language_loss": 0.78104806, "learning_rate": 3.3364849996969985e-06, "loss": 0.80282485, "num_input_tokens_seen": 51471455, "step": 2404, "time_per_iteration": 2.6071979999542236 }, { "auxiliary_loss_clip": 0.01188986, "auxiliary_loss_mlp": 0.01030196, "balance_loss_clip": 1.06104934, "balance_loss_mlp": 1.02102888, "epoch": 0.28918415198701375, "flos": 28585540333440.0, "grad_norm": 1.806707488056981, "language_loss": 0.8532275, "learning_rate": 3.335905387430283e-06, "loss": 0.87541926, "num_input_tokens_seen": 51492890, "step": 2405, "time_per_iteration": 2.5849053859710693 }, { "auxiliary_loss_clip": 0.01178508, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.05829358, "balance_loss_mlp": 1.02399671, "epoch": 0.28930439487765286, "flos": 21944688007680.0, "grad_norm": 1.7859010656047312, "language_loss": 0.83107346, "learning_rate": 3.335325572507782e-06, "loss": 0.85319066, "num_input_tokens_seen": 51513390, "step": 2406, "time_per_iteration": 2.5744731426239014 }, { "auxiliary_loss_clip": 0.01210634, "auxiliary_loss_mlp": 0.00764099, "balance_loss_clip": 1.06816113, "balance_loss_mlp": 1.00064349, "epoch": 0.28942463776829197, "flos": 19281911955840.0, "grad_norm": 1.6883569413838524, "language_loss": 0.73805565, "learning_rate": 3.3347455550174537e-06, "loss": 0.75780296, "num_input_tokens_seen": 51532730, "step": 2407, "time_per_iteration": 2.475522518157959 }, { "auxiliary_loss_clip": 0.01157693, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.05476999, "balance_loss_mlp": 1.024405, "epoch": 0.289544880658931, "flos": 14645700737280.0, "grad_norm": 2.0141154729951216, "language_loss": 0.68109119, "learning_rate": 3.3341653350472864e-06, "loss": 0.70301205, "num_input_tokens_seen": 51549560, "step": 2408, "time_per_iteration": 2.5330286026000977 }, { "auxiliary_loss_clip": 0.01214259, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.06336641, "balance_loss_mlp": 1.02283239, "epoch": 0.28966512354957014, "flos": 28621881918720.0, "grad_norm": 2.3235009440326757, "language_loss": 0.69177634, "learning_rate": 3.333584912685298e-06, "loss": 0.71425337, "num_input_tokens_seen": 51568180, "step": 2409, "time_per_iteration": 2.5282223224639893 }, { "auxiliary_loss_clip": 0.01079926, "auxiliary_loss_mlp": 0.01009002, "balance_loss_clip": 1.03628111, "balance_loss_mlp": 1.00772655, "epoch": 0.28978536644020925, "flos": 64711784511360.0, "grad_norm": 0.9681170968111037, "language_loss": 0.55545115, "learning_rate": 3.3330042880195385e-06, "loss": 0.57634044, "num_input_tokens_seen": 51622530, "step": 2410, "time_per_iteration": 3.09198260307312 }, { "auxiliary_loss_clip": 0.01176145, "auxiliary_loss_mlp": 0.01028157, "balance_loss_clip": 1.05848563, "balance_loss_mlp": 1.01870334, "epoch": 0.2899056093308483, "flos": 18624638937600.0, "grad_norm": 1.93755332741465, "language_loss": 0.78646594, "learning_rate": 3.3324234611380888e-06, "loss": 0.80850899, "num_input_tokens_seen": 51641260, "step": 2411, "time_per_iteration": 2.5475380420684814 }, { "auxiliary_loss_clip": 0.01151313, "auxiliary_loss_mlp": 0.01032779, "balance_loss_clip": 1.05500996, "balance_loss_mlp": 1.02390361, "epoch": 0.2900258522214874, "flos": 22893735202560.0, "grad_norm": 2.657662894469066, "language_loss": 0.81773156, "learning_rate": 3.3318424321290596e-06, "loss": 0.83957249, "num_input_tokens_seen": 51660975, "step": 2412, "time_per_iteration": 2.63137149810791 }, { "auxiliary_loss_clip": 0.01076642, "auxiliary_loss_mlp": 0.0100363, "balance_loss_clip": 1.03442025, "balance_loss_mlp": 1.00226521, "epoch": 0.2901460951121265, "flos": 71106036013440.0, "grad_norm": 0.8528914580462031, "language_loss": 0.59982955, "learning_rate": 3.3312612010805917e-06, "loss": 0.62063229, "num_input_tokens_seen": 51720550, "step": 2413, "time_per_iteration": 3.1731765270233154 }, { "auxiliary_loss_clip": 0.01162893, "auxiliary_loss_mlp": 0.01038415, "balance_loss_clip": 1.05737257, "balance_loss_mlp": 1.02894998, "epoch": 0.2902663380027656, "flos": 32160986081280.0, "grad_norm": 1.713133232171617, "language_loss": 0.70072764, "learning_rate": 3.330679768080858e-06, "loss": 0.72274065, "num_input_tokens_seen": 51744435, "step": 2414, "time_per_iteration": 2.6292827129364014 }, { "auxiliary_loss_clip": 0.01190576, "auxiliary_loss_mlp": 0.01034585, "balance_loss_clip": 1.06367862, "balance_loss_mlp": 1.02460098, "epoch": 0.2903865808934047, "flos": 29351658539520.0, "grad_norm": 2.024128948640041, "language_loss": 0.83503389, "learning_rate": 3.3300981332180627e-06, "loss": 0.8572855, "num_input_tokens_seen": 51763640, "step": 2415, "time_per_iteration": 2.5794622898101807 }, { "auxiliary_loss_clip": 0.01167465, "auxiliary_loss_mlp": 0.01028051, "balance_loss_clip": 1.05756402, "balance_loss_mlp": 1.01902103, "epoch": 0.29050682378404374, "flos": 17089026647040.0, "grad_norm": 2.137085589400161, "language_loss": 0.80126452, "learning_rate": 3.3295162965804373e-06, "loss": 0.82321966, "num_input_tokens_seen": 51782135, "step": 2416, "time_per_iteration": 2.5965566635131836 }, { "auxiliary_loss_clip": 0.01156201, "auxiliary_loss_mlp": 0.01027398, "balance_loss_clip": 1.05722833, "balance_loss_mlp": 1.01815963, "epoch": 0.29062706667468285, "flos": 17858233422720.0, "grad_norm": 2.0182079626067235, "language_loss": 0.78811514, "learning_rate": 3.328934258256247e-06, "loss": 0.80995113, "num_input_tokens_seen": 51800200, "step": 2417, "time_per_iteration": 3.513564348220825 }, { "auxiliary_loss_clip": 0.0119104, "auxiliary_loss_mlp": 0.01030553, "balance_loss_clip": 1.05934966, "balance_loss_mlp": 1.02039635, "epoch": 0.29074730956532197, "flos": 24279815174400.0, "grad_norm": 2.024376027443427, "language_loss": 0.67472589, "learning_rate": 3.3283520183337856e-06, "loss": 0.69694179, "num_input_tokens_seen": 51819905, "step": 2418, "time_per_iteration": 2.5593769550323486 }, { "auxiliary_loss_clip": 0.01172277, "auxiliary_loss_mlp": 0.01031434, "balance_loss_clip": 1.05761409, "balance_loss_mlp": 1.02201653, "epoch": 0.290867552455961, "flos": 22340961826560.0, "grad_norm": 1.5750965800784684, "language_loss": 0.68862599, "learning_rate": 3.3277695769013797e-06, "loss": 0.71066308, "num_input_tokens_seen": 51839350, "step": 2419, "time_per_iteration": 2.5420775413513184 }, { "auxiliary_loss_clip": 0.01194084, "auxiliary_loss_mlp": 0.01031481, "balance_loss_clip": 1.06259513, "balance_loss_mlp": 1.02207494, "epoch": 0.29098779534660013, "flos": 23186155824000.0, "grad_norm": 2.1426285722053944, "language_loss": 0.77466422, "learning_rate": 3.327186934047385e-06, "loss": 0.79691982, "num_input_tokens_seen": 51858045, "step": 2420, "time_per_iteration": 2.5408473014831543 }, { "auxiliary_loss_clip": 0.01163673, "auxiliary_loss_mlp": 0.01033997, "balance_loss_clip": 1.05411804, "balance_loss_mlp": 1.02470434, "epoch": 0.29110803823723924, "flos": 15304194817920.0, "grad_norm": 1.7916491406087527, "language_loss": 0.65689802, "learning_rate": 3.3266040898601877e-06, "loss": 0.67887473, "num_input_tokens_seen": 51875880, "step": 2421, "time_per_iteration": 2.517857313156128 }, { "auxiliary_loss_clip": 0.01140338, "auxiliary_loss_mlp": 0.01030631, "balance_loss_clip": 1.05222094, "balance_loss_mlp": 1.0214932, "epoch": 0.2912282811278783, "flos": 22595352923520.0, "grad_norm": 2.053182994260173, "language_loss": 0.77941227, "learning_rate": 3.3260210444282045e-06, "loss": 0.80112201, "num_input_tokens_seen": 51893835, "step": 2422, "time_per_iteration": 3.5465195178985596 }, { "auxiliary_loss_clip": 0.01186018, "auxiliary_loss_mlp": 0.01035057, "balance_loss_clip": 1.06199372, "balance_loss_mlp": 1.02566361, "epoch": 0.2913485240185174, "flos": 24497900599680.0, "grad_norm": 2.136328364839051, "language_loss": 0.73118502, "learning_rate": 3.325437797839883e-06, "loss": 0.75339574, "num_input_tokens_seen": 51912205, "step": 2423, "time_per_iteration": 3.264315128326416 }, { "auxiliary_loss_clip": 0.01204405, "auxiliary_loss_mlp": 0.01034084, "balance_loss_clip": 1.06104994, "balance_loss_mlp": 1.0247376, "epoch": 0.2914687669091565, "flos": 17931024334080.0, "grad_norm": 2.105803453504304, "language_loss": 0.74570858, "learning_rate": 3.3248543501837015e-06, "loss": 0.76809347, "num_input_tokens_seen": 51929410, "step": 2424, "time_per_iteration": 2.444683790206909 }, { "auxiliary_loss_clip": 0.0115476, "auxiliary_loss_mlp": 0.01033075, "balance_loss_clip": 1.05951738, "balance_loss_mlp": 1.0239495, "epoch": 0.2915890097997956, "flos": 22529313768960.0, "grad_norm": 1.7549213412504703, "language_loss": 0.77552181, "learning_rate": 3.3242707015481684e-06, "loss": 0.79740012, "num_input_tokens_seen": 51949345, "step": 2425, "time_per_iteration": 2.625549077987671 }, { "auxiliary_loss_clip": 0.01171152, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.05472851, "balance_loss_mlp": 1.02218342, "epoch": 0.2917092526904347, "flos": 13845216193920.0, "grad_norm": 1.637159052522053, "language_loss": 0.80268264, "learning_rate": 3.323686852021823e-06, "loss": 0.82470036, "num_input_tokens_seen": 51966855, "step": 2426, "time_per_iteration": 2.5675816535949707 }, { "auxiliary_loss_clip": 0.0116416, "auxiliary_loss_mlp": 0.01031326, "balance_loss_clip": 1.05450225, "balance_loss_mlp": 1.02187288, "epoch": 0.2918294955810738, "flos": 22674859678080.0, "grad_norm": 2.3121532436044405, "language_loss": 0.79587591, "learning_rate": 3.323102801693235e-06, "loss": 0.8178308, "num_input_tokens_seen": 51985620, "step": 2427, "time_per_iteration": 2.65246319770813 }, { "auxiliary_loss_clip": 0.01182783, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.05762768, "balance_loss_mlp": 1.02089453, "epoch": 0.29194973847171285, "flos": 23438284364160.0, "grad_norm": 2.4546048710733985, "language_loss": 0.80223823, "learning_rate": 3.322518550651003e-06, "loss": 0.82436955, "num_input_tokens_seen": 52004930, "step": 2428, "time_per_iteration": 2.5681416988372803 }, { "auxiliary_loss_clip": 0.01180822, "auxiliary_loss_mlp": 0.01035829, "balance_loss_clip": 1.05698776, "balance_loss_mlp": 1.02663839, "epoch": 0.29206998136235196, "flos": 21909064694400.0, "grad_norm": 2.137436896155721, "language_loss": 0.81402034, "learning_rate": 3.3219340989837586e-06, "loss": 0.83618689, "num_input_tokens_seen": 52024920, "step": 2429, "time_per_iteration": 2.661130666732788 }, { "auxiliary_loss_clip": 0.01174282, "auxiliary_loss_mlp": 0.0103343, "balance_loss_clip": 1.05909109, "balance_loss_mlp": 1.02479947, "epoch": 0.292190224252991, "flos": 23215925220480.0, "grad_norm": 1.9097660838340935, "language_loss": 0.80582821, "learning_rate": 3.3213494467801625e-06, "loss": 0.82790542, "num_input_tokens_seen": 52044095, "step": 2430, "time_per_iteration": 2.6785995960235596 }, { "auxiliary_loss_clip": 0.01113519, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 1.04592931, "balance_loss_mlp": 1.02052474, "epoch": 0.2923104671436301, "flos": 20740818752640.0, "grad_norm": 1.8227797343507777, "language_loss": 0.71225917, "learning_rate": 3.3207645941289063e-06, "loss": 0.73369032, "num_input_tokens_seen": 52062440, "step": 2431, "time_per_iteration": 2.6438469886779785 }, { "auxiliary_loss_clip": 0.01191784, "auxiliary_loss_mlp": 0.00763923, "balance_loss_clip": 1.06377077, "balance_loss_mlp": 1.0002867, "epoch": 0.29243071003426924, "flos": 35809114999680.0, "grad_norm": 1.7465996955748733, "language_loss": 0.80065852, "learning_rate": 3.320179541118711e-06, "loss": 0.82021558, "num_input_tokens_seen": 52084940, "step": 2432, "time_per_iteration": 2.6420559883117676 }, { "auxiliary_loss_clip": 0.01100604, "auxiliary_loss_mlp": 0.01003887, "balance_loss_clip": 1.03223062, "balance_loss_mlp": 1.00246227, "epoch": 0.2925509529249083, "flos": 58081598524800.0, "grad_norm": 1.0249214197562737, "language_loss": 0.60312903, "learning_rate": 3.3195942878383293e-06, "loss": 0.62417388, "num_input_tokens_seen": 52141040, "step": 2433, "time_per_iteration": 3.103043556213379 }, { "auxiliary_loss_clip": 0.01193019, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.06189704, "balance_loss_mlp": 1.02141118, "epoch": 0.2926711958155474, "flos": 21397122103680.0, "grad_norm": 1.9670036998358922, "language_loss": 0.77871996, "learning_rate": 3.319008834376543e-06, "loss": 0.80096328, "num_input_tokens_seen": 52160730, "step": 2434, "time_per_iteration": 2.486548662185669 }, { "auxiliary_loss_clip": 0.01164531, "auxiliary_loss_mlp": 0.01025148, "balance_loss_clip": 1.05157483, "balance_loss_mlp": 1.01607037, "epoch": 0.2927914387061865, "flos": 23185796688000.0, "grad_norm": 2.0954825512035, "language_loss": 0.88984632, "learning_rate": 3.3184231808221654e-06, "loss": 0.91174316, "num_input_tokens_seen": 52175055, "step": 2435, "time_per_iteration": 2.5504398345947266 }, { "auxiliary_loss_clip": 0.01164275, "auxiliary_loss_mlp": 0.01037997, "balance_loss_clip": 1.06017447, "balance_loss_mlp": 1.0284369, "epoch": 0.29291168159682557, "flos": 22455553190400.0, "grad_norm": 1.8407655570802917, "language_loss": 0.62755907, "learning_rate": 3.3178373272640394e-06, "loss": 0.64958179, "num_input_tokens_seen": 52194150, "step": 2436, "time_per_iteration": 2.5636203289031982 }, { "auxiliary_loss_clip": 0.01204314, "auxiliary_loss_mlp": 0.01036166, "balance_loss_clip": 1.06290472, "balance_loss_mlp": 1.02752304, "epoch": 0.2930319244874647, "flos": 21170632896000.0, "grad_norm": 2.177517658942623, "language_loss": 0.85170913, "learning_rate": 3.3172512737910387e-06, "loss": 0.87411386, "num_input_tokens_seen": 52211660, "step": 2437, "time_per_iteration": 2.451910972595215 }, { "auxiliary_loss_clip": 0.01190476, "auxiliary_loss_mlp": 0.01033878, "balance_loss_clip": 1.05854487, "balance_loss_mlp": 1.0245676, "epoch": 0.2931521673781038, "flos": 31357843931520.0, "grad_norm": 2.8979296588964254, "language_loss": 0.87840325, "learning_rate": 3.3166650204920674e-06, "loss": 0.90064681, "num_input_tokens_seen": 52232830, "step": 2438, "time_per_iteration": 2.5606577396392822 }, { "auxiliary_loss_clip": 0.0119167, "auxiliary_loss_mlp": 0.01033876, "balance_loss_clip": 1.06327248, "balance_loss_mlp": 1.02433634, "epoch": 0.29327241026874284, "flos": 24200990778240.0, "grad_norm": 1.5372924706964617, "language_loss": 0.81654668, "learning_rate": 3.316078567456059e-06, "loss": 0.83880216, "num_input_tokens_seen": 52250670, "step": 2439, "time_per_iteration": 2.5211644172668457 }, { "auxiliary_loss_clip": 0.01136472, "auxiliary_loss_mlp": 0.01028248, "balance_loss_clip": 1.05421662, "balance_loss_mlp": 1.02004027, "epoch": 0.29339265315938196, "flos": 24242611662720.0, "grad_norm": 2.1372627892072105, "language_loss": 0.75933146, "learning_rate": 3.3154919147719786e-06, "loss": 0.78097868, "num_input_tokens_seen": 52271685, "step": 2440, "time_per_iteration": 2.6627840995788574 }, { "auxiliary_loss_clip": 0.01190477, "auxiliary_loss_mlp": 0.01030369, "balance_loss_clip": 1.06052673, "balance_loss_mlp": 1.02110076, "epoch": 0.29351289605002107, "flos": 16946641134720.0, "grad_norm": 2.1196209954178333, "language_loss": 0.86603832, "learning_rate": 3.31490506252882e-06, "loss": 0.88824677, "num_input_tokens_seen": 52291065, "step": 2441, "time_per_iteration": 2.5005228519439697 }, { "auxiliary_loss_clip": 0.01147855, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.05032372, "balance_loss_mlp": 1.02397394, "epoch": 0.2936331389406601, "flos": 19829082810240.0, "grad_norm": 1.8255698780371186, "language_loss": 0.84314984, "learning_rate": 3.31431801081561e-06, "loss": 0.86495072, "num_input_tokens_seen": 52310000, "step": 2442, "time_per_iteration": 2.536606788635254 }, { "auxiliary_loss_clip": 0.01087172, "auxiliary_loss_mlp": 0.01011709, "balance_loss_clip": 1.03608942, "balance_loss_mlp": 1.0104872, "epoch": 0.29375338183129923, "flos": 71416844398080.0, "grad_norm": 0.8978043828204144, "language_loss": 0.67893684, "learning_rate": 3.313730759721402e-06, "loss": 0.69992566, "num_input_tokens_seen": 52372930, "step": 2443, "time_per_iteration": 3.2036445140838623 }, { "auxiliary_loss_clip": 0.01172636, "auxiliary_loss_mlp": 0.01035066, "balance_loss_clip": 1.05867529, "balance_loss_mlp": 1.02524877, "epoch": 0.29387362472193834, "flos": 22054502862720.0, "grad_norm": 1.9646828932350573, "language_loss": 0.86361015, "learning_rate": 3.313143309335282e-06, "loss": 0.88568711, "num_input_tokens_seen": 52391420, "step": 2444, "time_per_iteration": 3.3804564476013184 }, { "auxiliary_loss_clip": 0.01158347, "auxiliary_loss_mlp": 0.01034168, "balance_loss_clip": 1.05759907, "balance_loss_mlp": 1.02468514, "epoch": 0.2939938676125774, "flos": 22966418373120.0, "grad_norm": 1.9760051808641652, "language_loss": 0.84948897, "learning_rate": 3.3125556597463665e-06, "loss": 0.87141418, "num_input_tokens_seen": 52410725, "step": 2445, "time_per_iteration": 2.570573568344116 }, { "auxiliary_loss_clip": 0.01187592, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.06216717, "balance_loss_mlp": 1.02493179, "epoch": 0.2941141105032165, "flos": 31358705857920.0, "grad_norm": 1.567619141861922, "language_loss": 0.66339487, "learning_rate": 3.311967811043801e-06, "loss": 0.68560839, "num_input_tokens_seen": 52432645, "step": 2446, "time_per_iteration": 2.623185873031616 }, { "auxiliary_loss_clip": 0.01190521, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.06228948, "balance_loss_mlp": 1.0221504, "epoch": 0.29423435339385556, "flos": 23222138273280.0, "grad_norm": 2.5557226140536344, "language_loss": 0.82255226, "learning_rate": 3.3113797633167617e-06, "loss": 0.84477198, "num_input_tokens_seen": 52450940, "step": 2447, "time_per_iteration": 2.616151809692383 }, { "auxiliary_loss_clip": 0.01203828, "auxiliary_loss_mlp": 0.01029877, "balance_loss_clip": 1.06127059, "balance_loss_mlp": 1.02025044, "epoch": 0.2943545962844947, "flos": 26864054138880.0, "grad_norm": 2.116558527049001, "language_loss": 0.68866307, "learning_rate": 3.310791516654455e-06, "loss": 0.71100014, "num_input_tokens_seen": 52468000, "step": 2448, "time_per_iteration": 3.415570020675659 }, { "auxiliary_loss_clip": 0.01169073, "auxiliary_loss_mlp": 0.01042707, "balance_loss_clip": 1.05729854, "balance_loss_mlp": 1.03240764, "epoch": 0.2944748391751338, "flos": 20231677422720.0, "grad_norm": 1.825830585048486, "language_loss": 0.79675436, "learning_rate": 3.3102030711461177e-06, "loss": 0.81887221, "num_input_tokens_seen": 52487575, "step": 2449, "time_per_iteration": 4.269005537033081 }, { "auxiliary_loss_clip": 0.01164726, "auxiliary_loss_mlp": 0.01024555, "balance_loss_clip": 1.05778742, "balance_loss_mlp": 1.01538229, "epoch": 0.29459508206577284, "flos": 15960965045760.0, "grad_norm": 1.8538415440303369, "language_loss": 0.67971587, "learning_rate": 3.3096144268810156e-06, "loss": 0.70160866, "num_input_tokens_seen": 52506335, "step": 2450, "time_per_iteration": 2.6147491931915283 }, { "auxiliary_loss_clip": 0.01179106, "auxiliary_loss_mlp": 0.01031909, "balance_loss_clip": 1.05796123, "balance_loss_mlp": 1.02250957, "epoch": 0.29471532495641195, "flos": 20412882558720.0, "grad_norm": 1.9659645250214002, "language_loss": 0.7295301, "learning_rate": 3.3090255839484462e-06, "loss": 0.75164026, "num_input_tokens_seen": 52524330, "step": 2451, "time_per_iteration": 2.6420652866363525 }, { "auxiliary_loss_clip": 0.01176882, "auxiliary_loss_mlp": 0.01030402, "balance_loss_clip": 1.05749226, "balance_loss_mlp": 1.02075231, "epoch": 0.29483556784705106, "flos": 20376576887040.0, "grad_norm": 1.925991110262362, "language_loss": 0.85528541, "learning_rate": 3.3084365424377366e-06, "loss": 0.8773582, "num_input_tokens_seen": 52543095, "step": 2452, "time_per_iteration": 2.5142624378204346 }, { "auxiliary_loss_clip": 0.01073416, "auxiliary_loss_mlp": 0.01008848, "balance_loss_clip": 1.04353976, "balance_loss_mlp": 1.00715494, "epoch": 0.2949558107376901, "flos": 68555660595840.0, "grad_norm": 0.72802823671306, "language_loss": 0.5594064, "learning_rate": 3.307847302438245e-06, "loss": 0.58022904, "num_input_tokens_seen": 52597075, "step": 2453, "time_per_iteration": 3.0465087890625 }, { "auxiliary_loss_clip": 0.01128962, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.04890275, "balance_loss_mlp": 1.01953936, "epoch": 0.2950760536283292, "flos": 16107085572480.0, "grad_norm": 2.246497937726557, "language_loss": 0.78037846, "learning_rate": 3.3072578640393562e-06, "loss": 0.80196381, "num_input_tokens_seen": 52614410, "step": 2454, "time_per_iteration": 2.612377405166626 }, { "auxiliary_loss_clip": 0.01174708, "auxiliary_loss_mlp": 0.01030006, "balance_loss_clip": 1.05923843, "balance_loss_mlp": 1.02053535, "epoch": 0.29519629651896834, "flos": 20483626394880.0, "grad_norm": 1.8481795362133178, "language_loss": 0.79444516, "learning_rate": 3.3066682273304886e-06, "loss": 0.81649226, "num_input_tokens_seen": 52632055, "step": 2455, "time_per_iteration": 2.5631654262542725 }, { "auxiliary_loss_clip": 0.01195319, "auxiliary_loss_mlp": 0.0076407, "balance_loss_clip": 1.06213617, "balance_loss_mlp": 1.00032496, "epoch": 0.2953165394096074, "flos": 18916484941440.0, "grad_norm": 1.9910736544904315, "language_loss": 0.78806609, "learning_rate": 3.3060783924010904e-06, "loss": 0.80765998, "num_input_tokens_seen": 52649980, "step": 2456, "time_per_iteration": 2.5193769931793213 }, { "auxiliary_loss_clip": 0.01163171, "auxiliary_loss_mlp": 0.01034789, "balance_loss_clip": 1.05919313, "balance_loss_mlp": 1.02501416, "epoch": 0.2954367823002465, "flos": 20624467622400.0, "grad_norm": 2.471307453115101, "language_loss": 0.85131407, "learning_rate": 3.3054883593406387e-06, "loss": 0.8732937, "num_input_tokens_seen": 52664730, "step": 2457, "time_per_iteration": 2.5412800312042236 }, { "auxiliary_loss_clip": 0.01176856, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.05884945, "balance_loss_mlp": 1.02504992, "epoch": 0.2955570251908856, "flos": 31175525473920.0, "grad_norm": 2.1421960615479567, "language_loss": 0.65481204, "learning_rate": 3.3048981282386404e-06, "loss": 0.67692101, "num_input_tokens_seen": 52686040, "step": 2458, "time_per_iteration": 2.6395134925842285 }, { "auxiliary_loss_clip": 0.0114987, "auxiliary_loss_mlp": 0.01029788, "balance_loss_clip": 1.05724669, "balance_loss_mlp": 1.02119303, "epoch": 0.29567726808152467, "flos": 21650328051840.0, "grad_norm": 1.934980542574379, "language_loss": 0.82540143, "learning_rate": 3.304307699184634e-06, "loss": 0.84719801, "num_input_tokens_seen": 52704630, "step": 2459, "time_per_iteration": 2.562892436981201 }, { "auxiliary_loss_clip": 0.01175545, "auxiliary_loss_mlp": 0.01034541, "balance_loss_clip": 1.06029212, "balance_loss_mlp": 1.02565408, "epoch": 0.2957975109721638, "flos": 24243868638720.0, "grad_norm": 1.7621834198712325, "language_loss": 0.78859651, "learning_rate": 3.3037170722681866e-06, "loss": 0.81069744, "num_input_tokens_seen": 52725465, "step": 2460, "time_per_iteration": 2.54333758354187 }, { "auxiliary_loss_clip": 0.01156518, "auxiliary_loss_mlp": 0.01032624, "balance_loss_clip": 1.0592941, "balance_loss_mlp": 1.02322483, "epoch": 0.29591775386280283, "flos": 13479717352320.0, "grad_norm": 1.761281639389518, "language_loss": 0.68101501, "learning_rate": 3.3031262475788956e-06, "loss": 0.70290643, "num_input_tokens_seen": 52742405, "step": 2461, "time_per_iteration": 2.526418447494507 }, { "auxiliary_loss_clip": 0.01173169, "auxiliary_loss_mlp": 0.01032337, "balance_loss_clip": 1.05831647, "balance_loss_mlp": 1.02332461, "epoch": 0.29603799675344195, "flos": 17749783284480.0, "grad_norm": 1.7226588489405246, "language_loss": 0.73160857, "learning_rate": 3.3025352252063897e-06, "loss": 0.7536636, "num_input_tokens_seen": 52761100, "step": 2462, "time_per_iteration": 2.5071654319763184 }, { "auxiliary_loss_clip": 0.0118954, "auxiliary_loss_mlp": 0.01038403, "balance_loss_clip": 1.06385076, "balance_loss_mlp": 1.02912831, "epoch": 0.29615823964408106, "flos": 22783920347520.0, "grad_norm": 1.9316008937784697, "language_loss": 0.7524932, "learning_rate": 3.3019440052403252e-06, "loss": 0.77477264, "num_input_tokens_seen": 52780965, "step": 2463, "time_per_iteration": 2.5210492610931396 }, { "auxiliary_loss_clip": 0.01176218, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.05787027, "balance_loss_mlp": 1.021909, "epoch": 0.2962784825347201, "flos": 23514199758720.0, "grad_norm": 1.9874792727442891, "language_loss": 0.70831293, "learning_rate": 3.30135258777039e-06, "loss": 0.73038483, "num_input_tokens_seen": 52800335, "step": 2464, "time_per_iteration": 2.548659086227417 }, { "auxiliary_loss_clip": 0.01194284, "auxiliary_loss_mlp": 0.00764029, "balance_loss_clip": 1.06067729, "balance_loss_mlp": 1.00035286, "epoch": 0.2963987254253592, "flos": 16362769559040.0, "grad_norm": 2.0135302193605282, "language_loss": 0.70604622, "learning_rate": 3.3007609728863024e-06, "loss": 0.72562933, "num_input_tokens_seen": 52818425, "step": 2465, "time_per_iteration": 2.4830453395843506 }, { "auxiliary_loss_clip": 0.01124545, "auxiliary_loss_mlp": 0.01029915, "balance_loss_clip": 1.05726361, "balance_loss_mlp": 1.02039647, "epoch": 0.29651896831599833, "flos": 33472263980160.0, "grad_norm": 2.0939213696398236, "language_loss": 0.7299872, "learning_rate": 3.300169160677809e-06, "loss": 0.75153184, "num_input_tokens_seen": 52842340, "step": 2466, "time_per_iteration": 2.7332284450531006 }, { "auxiliary_loss_clip": 0.01168318, "auxiliary_loss_mlp": 0.01027985, "balance_loss_clip": 1.05975151, "balance_loss_mlp": 1.018466, "epoch": 0.2966392112066374, "flos": 23805363404160.0, "grad_norm": 2.252350045765582, "language_loss": 0.7770915, "learning_rate": 3.2995771512346878e-06, "loss": 0.7990545, "num_input_tokens_seen": 52860690, "step": 2467, "time_per_iteration": 2.5818231105804443 }, { "auxiliary_loss_clip": 0.01208669, "auxiliary_loss_mlp": 0.00764232, "balance_loss_clip": 1.06461656, "balance_loss_mlp": 1.00033641, "epoch": 0.2967594540972765, "flos": 19938466702080.0, "grad_norm": 9.878362557482154, "language_loss": 0.73227632, "learning_rate": 3.298984944646746e-06, "loss": 0.7520054, "num_input_tokens_seen": 52879370, "step": 2468, "time_per_iteration": 2.4837563037872314 }, { "auxiliary_loss_clip": 0.01193126, "auxiliary_loss_mlp": 0.0076337, "balance_loss_clip": 1.0625639, "balance_loss_mlp": 1.00027835, "epoch": 0.2968796969879156, "flos": 23732823888000.0, "grad_norm": 1.8758742520349645, "language_loss": 0.81659961, "learning_rate": 3.298392541003822e-06, "loss": 0.83616459, "num_input_tokens_seen": 52898775, "step": 2469, "time_per_iteration": 2.5382080078125 }, { "auxiliary_loss_clip": 0.01174975, "auxiliary_loss_mlp": 0.0103081, "balance_loss_clip": 1.06108212, "balance_loss_mlp": 1.02200603, "epoch": 0.29699993987855466, "flos": 22893699288960.0, "grad_norm": 1.6177336234424458, "language_loss": 0.89689589, "learning_rate": 3.2977999403957806e-06, "loss": 0.91895366, "num_input_tokens_seen": 52917535, "step": 2470, "time_per_iteration": 3.4227635860443115 }, { "auxiliary_loss_clip": 0.01206573, "auxiliary_loss_mlp": 0.01037172, "balance_loss_clip": 1.06557202, "balance_loss_mlp": 1.02745628, "epoch": 0.2971201827691938, "flos": 33832555349760.0, "grad_norm": 2.9443293596005033, "language_loss": 0.66731179, "learning_rate": 3.2972071429125207e-06, "loss": 0.68974924, "num_input_tokens_seen": 52938755, "step": 2471, "time_per_iteration": 2.5778756141662598 }, { "auxiliary_loss_clip": 0.01155426, "auxiliary_loss_mlp": 0.01031614, "balance_loss_clip": 1.05770791, "balance_loss_mlp": 1.02232766, "epoch": 0.2972404256598329, "flos": 22054359208320.0, "grad_norm": 1.9541774829106588, "language_loss": 0.8816393, "learning_rate": 3.2966141486439682e-06, "loss": 0.90350974, "num_input_tokens_seen": 52957945, "step": 2472, "time_per_iteration": 2.54937744140625 }, { "auxiliary_loss_clip": 0.01134126, "auxiliary_loss_mlp": 0.0103045, "balance_loss_clip": 1.05253124, "balance_loss_mlp": 1.02101469, "epoch": 0.29736066855047194, "flos": 31978595796480.0, "grad_norm": 2.4185275375393225, "language_loss": 0.64247286, "learning_rate": 3.29602095768008e-06, "loss": 0.66411865, "num_input_tokens_seen": 52978460, "step": 2473, "time_per_iteration": 2.736271381378174 }, { "auxiliary_loss_clip": 0.01169971, "auxiliary_loss_mlp": 0.0103006, "balance_loss_clip": 1.0612036, "balance_loss_mlp": 1.02129173, "epoch": 0.29748091144111105, "flos": 33510401245440.0, "grad_norm": 1.8397306232531603, "language_loss": 0.63817883, "learning_rate": 3.2954275701108437e-06, "loss": 0.66017914, "num_input_tokens_seen": 52999640, "step": 2474, "time_per_iteration": 2.6391453742980957 }, { "auxiliary_loss_clip": 0.01142061, "auxiliary_loss_mlp": 0.01026019, "balance_loss_clip": 1.05406857, "balance_loss_mlp": 1.01678634, "epoch": 0.29760115433175016, "flos": 41283373409280.0, "grad_norm": 2.338121174326109, "language_loss": 0.68623805, "learning_rate": 3.294833986026275e-06, "loss": 0.70791882, "num_input_tokens_seen": 53022880, "step": 2475, "time_per_iteration": 3.6481127738952637 }, { "auxiliary_loss_clip": 0.01153067, "auxiliary_loss_mlp": 0.01025789, "balance_loss_clip": 1.05640996, "balance_loss_mlp": 1.01688433, "epoch": 0.2977213972223892, "flos": 24493339572480.0, "grad_norm": 2.0947412880151806, "language_loss": 0.85193896, "learning_rate": 3.29424020551642e-06, "loss": 0.87372756, "num_input_tokens_seen": 53041515, "step": 2476, "time_per_iteration": 4.150704622268677 }, { "auxiliary_loss_clip": 0.01210018, "auxiliary_loss_mlp": 0.01035184, "balance_loss_clip": 1.06459832, "balance_loss_mlp": 1.02477074, "epoch": 0.2978416401130283, "flos": 21285116519040.0, "grad_norm": 1.978533878096831, "language_loss": 0.72296906, "learning_rate": 3.2936462286713546e-06, "loss": 0.74542105, "num_input_tokens_seen": 53059865, "step": 2477, "time_per_iteration": 2.480623245239258 }, { "auxiliary_loss_clip": 0.01191504, "auxiliary_loss_mlp": 0.00764295, "balance_loss_clip": 1.06174481, "balance_loss_mlp": 1.00030088, "epoch": 0.2979618830036674, "flos": 25772154554880.0, "grad_norm": 1.996844815724552, "language_loss": 0.77014327, "learning_rate": 3.2930520555811846e-06, "loss": 0.78970122, "num_input_tokens_seen": 53079490, "step": 2478, "time_per_iteration": 2.5560431480407715 }, { "auxiliary_loss_clip": 0.01095932, "auxiliary_loss_mlp": 0.01033001, "balance_loss_clip": 1.04948032, "balance_loss_mlp": 1.02344012, "epoch": 0.2980821258943065, "flos": 23476996247040.0, "grad_norm": 1.6958406021904748, "language_loss": 0.80130553, "learning_rate": 3.292457686336046e-06, "loss": 0.82259488, "num_input_tokens_seen": 53098810, "step": 2479, "time_per_iteration": 2.7030694484710693 }, { "auxiliary_loss_clip": 0.01091854, "auxiliary_loss_mlp": 0.01009119, "balance_loss_clip": 1.03518629, "balance_loss_mlp": 1.00764132, "epoch": 0.2982023687849456, "flos": 69752314195200.0, "grad_norm": 0.874127644883929, "language_loss": 0.61287028, "learning_rate": 3.291863121026105e-06, "loss": 0.63388002, "num_input_tokens_seen": 53162590, "step": 2480, "time_per_iteration": 3.1950089931488037 }, { "auxiliary_loss_clip": 0.01191739, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.06191647, "balance_loss_mlp": 1.02424145, "epoch": 0.29832261167558466, "flos": 29825930741760.0, "grad_norm": 1.8341941873830745, "language_loss": 0.76604187, "learning_rate": 3.2912683597415547e-06, "loss": 0.78829145, "num_input_tokens_seen": 53186675, "step": 2481, "time_per_iteration": 2.6439967155456543 }, { "auxiliary_loss_clip": 0.011644, "auxiliary_loss_mlp": 0.01035994, "balance_loss_clip": 1.05876565, "balance_loss_mlp": 1.02667737, "epoch": 0.29844285456622377, "flos": 33910158683520.0, "grad_norm": 2.051309937754646, "language_loss": 0.78282851, "learning_rate": 3.2906734025726213e-06, "loss": 0.80483246, "num_input_tokens_seen": 53205940, "step": 2482, "time_per_iteration": 2.6849558353424072 }, { "auxiliary_loss_clip": 0.01198728, "auxiliary_loss_mlp": 0.01039036, "balance_loss_clip": 1.06457233, "balance_loss_mlp": 1.02949929, "epoch": 0.2985630974568629, "flos": 23876933253120.0, "grad_norm": 1.7798961196721699, "language_loss": 0.88462925, "learning_rate": 3.290078249609559e-06, "loss": 0.90700692, "num_input_tokens_seen": 53225360, "step": 2483, "time_per_iteration": 2.5498769283294678 }, { "auxiliary_loss_clip": 0.01188429, "auxiliary_loss_mlp": 0.01034313, "balance_loss_clip": 1.06426525, "balance_loss_mlp": 1.02515769, "epoch": 0.29868334034750194, "flos": 21799106184960.0, "grad_norm": 1.8362350643667995, "language_loss": 0.87950534, "learning_rate": 3.2894829009426514e-06, "loss": 0.9017328, "num_input_tokens_seen": 53243195, "step": 2484, "time_per_iteration": 2.5167906284332275 }, { "auxiliary_loss_clip": 0.01188402, "auxiliary_loss_mlp": 0.01033614, "balance_loss_clip": 1.06123793, "balance_loss_mlp": 1.02445829, "epoch": 0.29880358323814105, "flos": 25666649331840.0, "grad_norm": 1.8898556178261638, "language_loss": 0.77748764, "learning_rate": 3.288887356662213e-06, "loss": 0.79970771, "num_input_tokens_seen": 53264530, "step": 2485, "time_per_iteration": 2.5755698680877686 }, { "auxiliary_loss_clip": 0.01094887, "auxiliary_loss_mlp": 0.01006101, "balance_loss_clip": 1.03364277, "balance_loss_mlp": 1.00483787, "epoch": 0.29892382612878016, "flos": 71005846003200.0, "grad_norm": 0.7694142112376068, "language_loss": 0.59711593, "learning_rate": 3.288291616858588e-06, "loss": 0.6181258, "num_input_tokens_seen": 53319920, "step": 2486, "time_per_iteration": 2.9750685691833496 }, { "auxiliary_loss_clip": 0.01141358, "auxiliary_loss_mlp": 0.01027051, "balance_loss_clip": 1.05825543, "balance_loss_mlp": 1.01824117, "epoch": 0.2990440690194192, "flos": 25481134563840.0, "grad_norm": 1.6290586183661782, "language_loss": 0.7689122, "learning_rate": 3.287695681622149e-06, "loss": 0.79059625, "num_input_tokens_seen": 53339270, "step": 2487, "time_per_iteration": 2.6511435508728027 }, { "auxiliary_loss_clip": 0.01179766, "auxiliary_loss_mlp": 0.01027272, "balance_loss_clip": 1.05895138, "balance_loss_mlp": 1.01882613, "epoch": 0.2991643119100583, "flos": 23732357011200.0, "grad_norm": 1.9882470204192673, "language_loss": 0.81252182, "learning_rate": 3.2870995510432982e-06, "loss": 0.83459222, "num_input_tokens_seen": 53357750, "step": 2488, "time_per_iteration": 2.5806496143341064 }, { "auxiliary_loss_clip": 0.01182326, "auxiliary_loss_mlp": 0.01035152, "balance_loss_clip": 1.05971909, "balance_loss_mlp": 1.02672434, "epoch": 0.29928455480069743, "flos": 27417545786880.0, "grad_norm": 1.7434606052828534, "language_loss": 0.76765078, "learning_rate": 3.2865032252124697e-06, "loss": 0.78982556, "num_input_tokens_seen": 53378265, "step": 2489, "time_per_iteration": 2.5856635570526123 }, { "auxiliary_loss_clip": 0.01174397, "auxiliary_loss_mlp": 0.01035883, "balance_loss_clip": 1.0585072, "balance_loss_mlp": 1.02685928, "epoch": 0.2994047976913365, "flos": 33692935184640.0, "grad_norm": 1.3682145206630378, "language_loss": 0.77613735, "learning_rate": 3.2859067042201243e-06, "loss": 0.79824018, "num_input_tokens_seen": 53400305, "step": 2490, "time_per_iteration": 2.6538589000701904 }, { "auxiliary_loss_clip": 0.01110008, "auxiliary_loss_mlp": 0.01033365, "balance_loss_clip": 1.05195951, "balance_loss_mlp": 1.02441239, "epoch": 0.2995250405819756, "flos": 16763963541120.0, "grad_norm": 1.9833230117530085, "language_loss": 0.77764446, "learning_rate": 3.2853099881567544e-06, "loss": 0.79907817, "num_input_tokens_seen": 53418705, "step": 2491, "time_per_iteration": 2.609543561935425 }, { "auxiliary_loss_clip": 0.01202512, "auxiliary_loss_mlp": 0.01034757, "balance_loss_clip": 1.06415343, "balance_loss_mlp": 1.0265789, "epoch": 0.29964528347261465, "flos": 22963976248320.0, "grad_norm": 2.097809545643159, "language_loss": 0.79014528, "learning_rate": 3.284713077112881e-06, "loss": 0.81251794, "num_input_tokens_seen": 53438135, "step": 2492, "time_per_iteration": 2.5228989124298096 }, { "auxiliary_loss_clip": 0.01169831, "auxiliary_loss_mlp": 0.01032711, "balance_loss_clip": 1.06279361, "balance_loss_mlp": 1.02304316, "epoch": 0.29976552636325376, "flos": 16938021870720.0, "grad_norm": 3.0825661009400624, "language_loss": 0.86485302, "learning_rate": 3.284115971179056e-06, "loss": 0.88687849, "num_input_tokens_seen": 53452165, "step": 2493, "time_per_iteration": 2.545217514038086 }, { "auxiliary_loss_clip": 0.01136519, "auxiliary_loss_mlp": 0.01033522, "balance_loss_clip": 1.05784714, "balance_loss_mlp": 1.02459359, "epoch": 0.2998857692538929, "flos": 17056455989760.0, "grad_norm": 1.8039814319555105, "language_loss": 0.78785151, "learning_rate": 3.283518670445859e-06, "loss": 0.80955195, "num_input_tokens_seen": 53470075, "step": 2494, "time_per_iteration": 2.604356050491333 }, { "auxiliary_loss_clip": 0.01074994, "auxiliary_loss_mlp": 0.00753826, "balance_loss_clip": 1.02911782, "balance_loss_mlp": 1.00020719, "epoch": 0.30000601214453193, "flos": 68831528025600.0, "grad_norm": 0.6880120674253565, "language_loss": 0.54308426, "learning_rate": 3.2829211750038995e-06, "loss": 0.56137252, "num_input_tokens_seen": 53538705, "step": 2495, "time_per_iteration": 3.1627235412597656 }, { "auxiliary_loss_clip": 0.01156512, "auxiliary_loss_mlp": 0.01033936, "balance_loss_clip": 1.05592608, "balance_loss_mlp": 1.02512026, "epoch": 0.30012625503517104, "flos": 17603267708160.0, "grad_norm": 1.9796274089981198, "language_loss": 0.89079237, "learning_rate": 3.2823234849438183e-06, "loss": 0.91269684, "num_input_tokens_seen": 53556740, "step": 2496, "time_per_iteration": 2.5424180030822754 }, { "auxiliary_loss_clip": 0.01177575, "auxiliary_loss_mlp": 0.01032121, "balance_loss_clip": 1.06031883, "balance_loss_mlp": 1.02330506, "epoch": 0.30024649792581015, "flos": 21252581775360.0, "grad_norm": 1.9246867939105723, "language_loss": 0.75855315, "learning_rate": 3.2817256003562836e-06, "loss": 0.78065014, "num_input_tokens_seen": 53577115, "step": 2497, "time_per_iteration": 3.4003679752349854 }, { "auxiliary_loss_clip": 0.01134962, "auxiliary_loss_mlp": 0.01040467, "balance_loss_clip": 1.05717862, "balance_loss_mlp": 1.03079939, "epoch": 0.3003667408164492, "flos": 23003262748800.0, "grad_norm": 1.7296803073515479, "language_loss": 0.66278738, "learning_rate": 3.281127521331995e-06, "loss": 0.6845417, "num_input_tokens_seen": 53598295, "step": 2498, "time_per_iteration": 2.653770923614502 }, { "auxiliary_loss_clip": 0.01103264, "auxiliary_loss_mlp": 0.01002994, "balance_loss_clip": 1.02895892, "balance_loss_mlp": 1.00180769, "epoch": 0.3004869837070883, "flos": 64232340750720.0, "grad_norm": 0.8835941656506333, "language_loss": 0.60625434, "learning_rate": 3.2805292479616798e-06, "loss": 0.62731689, "num_input_tokens_seen": 53657160, "step": 2499, "time_per_iteration": 2.97001314163208 }, { "auxiliary_loss_clip": 0.01177163, "auxiliary_loss_mlp": 0.01030096, "balance_loss_clip": 1.05830836, "balance_loss_mlp": 1.0209167, "epoch": 0.30060722659772743, "flos": 26248653400320.0, "grad_norm": 2.2195220560177997, "language_loss": 0.91660655, "learning_rate": 3.2799307803360955e-06, "loss": 0.9386791, "num_input_tokens_seen": 53673090, "step": 2500, "time_per_iteration": 2.6368207931518555 }, { "auxiliary_loss_clip": 0.01199409, "auxiliary_loss_mlp": 0.01033264, "balance_loss_clip": 1.06164384, "balance_loss_mlp": 1.02465725, "epoch": 0.3007274694883665, "flos": 24970879912320.0, "grad_norm": 1.3896059002488632, "language_loss": 0.8169753, "learning_rate": 3.27933211854603e-06, "loss": 0.83930206, "num_input_tokens_seen": 53692145, "step": 2501, "time_per_iteration": 3.4357757568359375 }, { "auxiliary_loss_clip": 0.01176098, "auxiliary_loss_mlp": 0.01030171, "balance_loss_clip": 1.0614953, "balance_loss_mlp": 1.02126038, "epoch": 0.3008477123790056, "flos": 17055845458560.0, "grad_norm": 1.5334444339985713, "language_loss": 0.8710283, "learning_rate": 3.278733262682299e-06, "loss": 0.89309108, "num_input_tokens_seen": 53710000, "step": 2502, "time_per_iteration": 2.5217535495758057 }, { "auxiliary_loss_clip": 0.01202946, "auxiliary_loss_mlp": 0.01029761, "balance_loss_clip": 1.06204009, "balance_loss_mlp": 1.02104104, "epoch": 0.3009679552696447, "flos": 21506398254720.0, "grad_norm": 2.1236831730283776, "language_loss": 0.82534826, "learning_rate": 3.2781342128357484e-06, "loss": 0.84767532, "num_input_tokens_seen": 53729355, "step": 2503, "time_per_iteration": 4.005939960479736 }, { "auxiliary_loss_clip": 0.01159268, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.05540967, "balance_loss_mlp": 1.02297807, "epoch": 0.30108819816028376, "flos": 21134004001920.0, "grad_norm": 2.192504855647531, "language_loss": 0.80452847, "learning_rate": 3.2775349690972547e-06, "loss": 0.826446, "num_input_tokens_seen": 53743505, "step": 2504, "time_per_iteration": 2.5541675090789795 }, { "auxiliary_loss_clip": 0.01086458, "auxiliary_loss_mlp": 0.01005833, "balance_loss_clip": 1.02872694, "balance_loss_mlp": 1.0046525, "epoch": 0.30120844105092287, "flos": 71126434938240.0, "grad_norm": 0.7594273797325565, "language_loss": 0.5185554, "learning_rate": 3.276935531557722e-06, "loss": 0.5394783, "num_input_tokens_seen": 53808725, "step": 2505, "time_per_iteration": 3.1689682006835938 }, { "auxiliary_loss_clip": 0.0114669, "auxiliary_loss_mlp": 0.01035879, "balance_loss_clip": 1.05268741, "balance_loss_mlp": 1.02696228, "epoch": 0.301328683941562, "flos": 20264571302400.0, "grad_norm": 2.0878670427861725, "language_loss": 0.79518831, "learning_rate": 3.2763359003080837e-06, "loss": 0.81701398, "num_input_tokens_seen": 53825680, "step": 2506, "time_per_iteration": 2.5947065353393555 }, { "auxiliary_loss_clip": 0.01077126, "auxiliary_loss_mlp": 0.01004033, "balance_loss_clip": 1.02478361, "balance_loss_mlp": 1.00269222, "epoch": 0.30144892683220104, "flos": 70648212240000.0, "grad_norm": 0.7999185397598263, "language_loss": 0.624529, "learning_rate": 3.2757360754393047e-06, "loss": 0.64534056, "num_input_tokens_seen": 53889750, "step": 2507, "time_per_iteration": 3.1939477920532227 }, { "auxiliary_loss_clip": 0.01185691, "auxiliary_loss_mlp": 0.01028733, "balance_loss_clip": 1.05850244, "balance_loss_mlp": 1.01961398, "epoch": 0.30156916972284015, "flos": 22820549241600.0, "grad_norm": 2.5404471931931263, "language_loss": 0.63833511, "learning_rate": 3.2751360570423767e-06, "loss": 0.66047931, "num_input_tokens_seen": 53908135, "step": 2508, "time_per_iteration": 2.5242459774017334 }, { "auxiliary_loss_clip": 0.01173782, "auxiliary_loss_mlp": 0.01034597, "balance_loss_clip": 1.05881381, "balance_loss_mlp": 1.02562666, "epoch": 0.3016894126134792, "flos": 29899188529920.0, "grad_norm": 2.0208810286106322, "language_loss": 0.7583909, "learning_rate": 3.2745358452083236e-06, "loss": 0.78047472, "num_input_tokens_seen": 53931035, "step": 2509, "time_per_iteration": 2.669614315032959 }, { "auxiliary_loss_clip": 0.01187899, "auxiliary_loss_mlp": 0.01028985, "balance_loss_clip": 1.06250525, "balance_loss_mlp": 1.02098036, "epoch": 0.3018096555041183, "flos": 21546331200000.0, "grad_norm": 1.3699594928564742, "language_loss": 0.82284606, "learning_rate": 3.2739354400281955e-06, "loss": 0.84501493, "num_input_tokens_seen": 53952255, "step": 2510, "time_per_iteration": 2.6003730297088623 }, { "auxiliary_loss_clip": 0.01065413, "auxiliary_loss_mlp": 0.00753369, "balance_loss_clip": 1.02279091, "balance_loss_mlp": 1.00027013, "epoch": 0.3019298983947574, "flos": 59136294597120.0, "grad_norm": 0.8677804210448475, "language_loss": 0.63737381, "learning_rate": 3.2733348415930744e-06, "loss": 0.65556163, "num_input_tokens_seen": 54014125, "step": 2511, "time_per_iteration": 3.18686842918396 }, { "auxiliary_loss_clip": 0.01153976, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 1.05634844, "balance_loss_mlp": 1.02103055, "epoch": 0.3020501412853965, "flos": 34423070941440.0, "grad_norm": 1.753753272570682, "language_loss": 0.80697787, "learning_rate": 3.27273404999407e-06, "loss": 0.82881618, "num_input_tokens_seen": 54036345, "step": 2512, "time_per_iteration": 2.7594552040100098 }, { "auxiliary_loss_clip": 0.01076693, "auxiliary_loss_mlp": 0.01001928, "balance_loss_clip": 1.02240872, "balance_loss_mlp": 1.00051546, "epoch": 0.3021703841760356, "flos": 71008288128000.0, "grad_norm": 0.7945268294047647, "language_loss": 0.60523999, "learning_rate": 3.272133065322322e-06, "loss": 0.62602621, "num_input_tokens_seen": 54094615, "step": 2513, "time_per_iteration": 3.1161978244781494 }, { "auxiliary_loss_clip": 0.01198852, "auxiliary_loss_mlp": 0.01033194, "balance_loss_clip": 1.05981231, "balance_loss_mlp": 1.02458715, "epoch": 0.3022906270666747, "flos": 21510528318720.0, "grad_norm": 2.294592755166712, "language_loss": 0.79291415, "learning_rate": 3.271531887669e-06, "loss": 0.81523466, "num_input_tokens_seen": 54114675, "step": 2514, "time_per_iteration": 2.560870885848999 }, { "auxiliary_loss_clip": 0.01146909, "auxiliary_loss_mlp": 0.01031409, "balance_loss_clip": 1.0526731, "balance_loss_mlp": 1.0221231, "epoch": 0.30241086995731375, "flos": 31132001168640.0, "grad_norm": 2.3291015231033088, "language_loss": 0.63726056, "learning_rate": 3.2709305171253015e-06, "loss": 0.65904373, "num_input_tokens_seen": 54134795, "step": 2515, "time_per_iteration": 2.691617250442505 }, { "auxiliary_loss_clip": 0.0118586, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.06004691, "balance_loss_mlp": 1.02132368, "epoch": 0.30253111284795287, "flos": 23511542152320.0, "grad_norm": 1.767714350659697, "language_loss": 0.7750448, "learning_rate": 3.2703289537824536e-06, "loss": 0.79720253, "num_input_tokens_seen": 54154595, "step": 2516, "time_per_iteration": 2.5222325325012207 }, { "auxiliary_loss_clip": 0.01145503, "auxiliary_loss_mlp": 0.01037356, "balance_loss_clip": 1.05513978, "balance_loss_mlp": 1.0281949, "epoch": 0.302651355738592, "flos": 18725367651840.0, "grad_norm": 2.5750889189759074, "language_loss": 0.79021883, "learning_rate": 3.269727197731714e-06, "loss": 0.81204736, "num_input_tokens_seen": 54167360, "step": 2517, "time_per_iteration": 2.5560317039489746 }, { "auxiliary_loss_clip": 0.01138605, "auxiliary_loss_mlp": 0.01035049, "balance_loss_clip": 1.05475283, "balance_loss_mlp": 1.02629876, "epoch": 0.30277159862923103, "flos": 22418888382720.0, "grad_norm": 1.6179424218917458, "language_loss": 0.78088784, "learning_rate": 3.269125249064367e-06, "loss": 0.8026244, "num_input_tokens_seen": 54187055, "step": 2518, "time_per_iteration": 2.5996103286743164 }, { "auxiliary_loss_clip": 0.01201619, "auxiliary_loss_mlp": 0.01029556, "balance_loss_clip": 1.06070828, "balance_loss_mlp": 1.02100289, "epoch": 0.30289184151987014, "flos": 22273126992000.0, "grad_norm": 1.6123060852828135, "language_loss": 0.83222878, "learning_rate": 3.2685231078717297e-06, "loss": 0.85454059, "num_input_tokens_seen": 54207245, "step": 2519, "time_per_iteration": 2.5161924362182617 }, { "auxiliary_loss_clip": 0.01148396, "auxiliary_loss_mlp": 0.00763591, "balance_loss_clip": 1.05572081, "balance_loss_mlp": 1.00041389, "epoch": 0.30301208441050925, "flos": 25225594231680.0, "grad_norm": 2.634598646533618, "language_loss": 0.75251359, "learning_rate": 3.267920774245145e-06, "loss": 0.77163351, "num_input_tokens_seen": 54226650, "step": 2520, "time_per_iteration": 2.594172239303589 }, { "auxiliary_loss_clip": 0.01191567, "auxiliary_loss_mlp": 0.01038977, "balance_loss_clip": 1.06405401, "balance_loss_mlp": 1.02931499, "epoch": 0.3031323273011483, "flos": 23039245198080.0, "grad_norm": 2.1235876154280096, "language_loss": 0.84863114, "learning_rate": 3.2673182482759876e-06, "loss": 0.87093663, "num_input_tokens_seen": 54245765, "step": 2521, "time_per_iteration": 2.506431818008423 }, { "auxiliary_loss_clip": 0.01186747, "auxiliary_loss_mlp": 0.01029515, "balance_loss_clip": 1.06061029, "balance_loss_mlp": 1.02059186, "epoch": 0.3032525701917874, "flos": 18876695650560.0, "grad_norm": 2.65023783129727, "language_loss": 0.66362751, "learning_rate": 3.266715530055659e-06, "loss": 0.68579012, "num_input_tokens_seen": 54263915, "step": 2522, "time_per_iteration": 2.4898760318756104 }, { "auxiliary_loss_clip": 0.01177743, "auxiliary_loss_mlp": 0.01029182, "balance_loss_clip": 1.0578686, "balance_loss_mlp": 1.02010381, "epoch": 0.30337281308242653, "flos": 17782641250560.0, "grad_norm": 1.5567698222303759, "language_loss": 0.80538595, "learning_rate": 3.2661126196755927e-06, "loss": 0.82745516, "num_input_tokens_seen": 54283025, "step": 2523, "time_per_iteration": 3.3456547260284424 }, { "auxiliary_loss_clip": 0.01093292, "auxiliary_loss_mlp": 0.0100313, "balance_loss_clip": 1.02000046, "balance_loss_mlp": 1.00193238, "epoch": 0.3034930559730656, "flos": 57824298426240.0, "grad_norm": 0.7842039617537705, "language_loss": 0.55972791, "learning_rate": 3.265509517227248e-06, "loss": 0.58069217, "num_input_tokens_seen": 54339840, "step": 2524, "time_per_iteration": 3.0423965454101562 }, { "auxiliary_loss_clip": 0.01173376, "auxiliary_loss_mlp": 0.01029172, "balance_loss_clip": 1.05697024, "balance_loss_mlp": 1.02056503, "epoch": 0.3036132988637047, "flos": 14755587419520.0, "grad_norm": 1.6294588722256331, "language_loss": 0.81412554, "learning_rate": 3.264906222802115e-06, "loss": 0.836151, "num_input_tokens_seen": 54357690, "step": 2525, "time_per_iteration": 2.520167589187622 }, { "auxiliary_loss_clip": 0.01203987, "auxiliary_loss_mlp": 0.01032519, "balance_loss_clip": 1.06117499, "balance_loss_mlp": 1.02296424, "epoch": 0.30373354175434375, "flos": 21033203460480.0, "grad_norm": 1.963109081989034, "language_loss": 0.77811176, "learning_rate": 3.264302736491715e-06, "loss": 0.80047679, "num_input_tokens_seen": 54377810, "step": 2526, "time_per_iteration": 2.5642902851104736 }, { "auxiliary_loss_clip": 0.0118515, "auxiliary_loss_mlp": 0.01028345, "balance_loss_clip": 1.06349432, "balance_loss_mlp": 1.01973176, "epoch": 0.30385378464498286, "flos": 21143233797120.0, "grad_norm": 1.7335452160130531, "language_loss": 0.87648231, "learning_rate": 3.263699058387594e-06, "loss": 0.89861721, "num_input_tokens_seen": 54395245, "step": 2527, "time_per_iteration": 2.5394296646118164 }, { "auxiliary_loss_clip": 0.01152684, "auxiliary_loss_mlp": 0.01035535, "balance_loss_clip": 1.0527072, "balance_loss_mlp": 1.02629042, "epoch": 0.30397402753562197, "flos": 20629244131200.0, "grad_norm": 2.1590391549208627, "language_loss": 0.90146124, "learning_rate": 3.2630951885813315e-06, "loss": 0.92334342, "num_input_tokens_seen": 54412640, "step": 2528, "time_per_iteration": 3.4062328338623047 }, { "auxiliary_loss_clip": 0.01170894, "auxiliary_loss_mlp": 0.01034786, "balance_loss_clip": 1.05523014, "balance_loss_mlp": 1.02592254, "epoch": 0.304094270426261, "flos": 15085678429440.0, "grad_norm": 2.254779655991013, "language_loss": 0.78191692, "learning_rate": 3.262491127164533e-06, "loss": 0.80397367, "num_input_tokens_seen": 54431455, "step": 2529, "time_per_iteration": 3.35711669921875 }, { "auxiliary_loss_clip": 0.01178079, "auxiliary_loss_mlp": 0.00763844, "balance_loss_clip": 1.05872631, "balance_loss_mlp": 1.00048196, "epoch": 0.30421451331690014, "flos": 13845216193920.0, "grad_norm": 2.3109062809461705, "language_loss": 0.80576789, "learning_rate": 3.2618868742288337e-06, "loss": 0.82518709, "num_input_tokens_seen": 54448380, "step": 2530, "time_per_iteration": 2.5056116580963135 }, { "auxiliary_loss_clip": 0.01184193, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.05911517, "balance_loss_mlp": 1.02624011, "epoch": 0.30433475620753925, "flos": 17384212615680.0, "grad_norm": 1.9498125816392136, "language_loss": 0.721264, "learning_rate": 3.261282429865899e-06, "loss": 0.7434535, "num_input_tokens_seen": 54466385, "step": 2531, "time_per_iteration": 2.5009498596191406 }, { "auxiliary_loss_clip": 0.0117875, "auxiliary_loss_mlp": 0.00762875, "balance_loss_clip": 1.06110644, "balance_loss_mlp": 1.00043011, "epoch": 0.3044549990981783, "flos": 18916951818240.0, "grad_norm": 1.7131432541660319, "language_loss": 0.72472709, "learning_rate": 3.2606777941674225e-06, "loss": 0.74414337, "num_input_tokens_seen": 54485040, "step": 2532, "time_per_iteration": 2.523322820663452 }, { "auxiliary_loss_clip": 0.01136456, "auxiliary_loss_mlp": 0.01034818, "balance_loss_clip": 1.05527604, "balance_loss_mlp": 1.02533531, "epoch": 0.3045752419888174, "flos": 21068431724160.0, "grad_norm": 2.048658010233344, "language_loss": 0.84644771, "learning_rate": 3.2600729672251276e-06, "loss": 0.86816043, "num_input_tokens_seen": 54502755, "step": 2533, "time_per_iteration": 2.6052637100219727 }, { "auxiliary_loss_clip": 0.01201848, "auxiliary_loss_mlp": 0.00764031, "balance_loss_clip": 1.06364822, "balance_loss_mlp": 1.00044715, "epoch": 0.3046954848794565, "flos": 29096405516160.0, "grad_norm": 1.957431760527712, "language_loss": 0.65837675, "learning_rate": 3.259467949130765e-06, "loss": 0.67803556, "num_input_tokens_seen": 54524165, "step": 2534, "time_per_iteration": 2.528587579727173 }, { "auxiliary_loss_clip": 0.01175538, "auxiliary_loss_mlp": 0.01032652, "balance_loss_clip": 1.06147075, "balance_loss_mlp": 1.0237174, "epoch": 0.3048157277700956, "flos": 20295346279680.0, "grad_norm": 2.717079864066357, "language_loss": 0.82668656, "learning_rate": 3.2588627399761164e-06, "loss": 0.84876847, "num_input_tokens_seen": 54540160, "step": 2535, "time_per_iteration": 2.52421498298645 }, { "auxiliary_loss_clip": 0.01169873, "auxiliary_loss_mlp": 0.01030812, "balance_loss_clip": 1.05769825, "balance_loss_mlp": 1.02286112, "epoch": 0.3049359706607347, "flos": 22739929165440.0, "grad_norm": 1.8297408170458171, "language_loss": 0.70868313, "learning_rate": 3.2582573398529903e-06, "loss": 0.73068994, "num_input_tokens_seen": 54557515, "step": 2536, "time_per_iteration": 2.525496244430542 }, { "auxiliary_loss_clip": 0.01157652, "auxiliary_loss_mlp": 0.01030913, "balance_loss_clip": 1.05547166, "balance_loss_mlp": 1.02116156, "epoch": 0.3050562135513738, "flos": 18434634969600.0, "grad_norm": 2.2877642762140815, "language_loss": 0.73673189, "learning_rate": 3.2576517488532265e-06, "loss": 0.75861752, "num_input_tokens_seen": 54573865, "step": 2537, "time_per_iteration": 2.5248055458068848 }, { "auxiliary_loss_clip": 0.01183149, "auxiliary_loss_mlp": 0.01031619, "balance_loss_clip": 1.05673671, "balance_loss_mlp": 1.02342308, "epoch": 0.30517645644201286, "flos": 20370327920640.0, "grad_norm": 1.996552654536769, "language_loss": 0.87468207, "learning_rate": 3.257045967068692e-06, "loss": 0.89682972, "num_input_tokens_seen": 54593120, "step": 2538, "time_per_iteration": 2.4772703647613525 }, { "auxiliary_loss_clip": 0.0120401, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.06302202, "balance_loss_mlp": 1.02708817, "epoch": 0.30529669933265197, "flos": 21945118970880.0, "grad_norm": 1.5150788562623656, "language_loss": 0.82151115, "learning_rate": 3.2564399945912848e-06, "loss": 0.84391892, "num_input_tokens_seen": 54612910, "step": 2539, "time_per_iteration": 2.465324878692627 }, { "auxiliary_loss_clip": 0.01147856, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.0562706, "balance_loss_mlp": 1.02314401, "epoch": 0.305416942223291, "flos": 21835411856640.0, "grad_norm": 2.112055452973483, "language_loss": 0.81893671, "learning_rate": 3.2558338315129287e-06, "loss": 0.84072852, "num_input_tokens_seen": 54631055, "step": 2540, "time_per_iteration": 2.579378128051758 }, { "auxiliary_loss_clip": 0.0117988, "auxiliary_loss_mlp": 0.01031593, "balance_loss_clip": 1.05811334, "balance_loss_mlp": 1.02225316, "epoch": 0.30553718511393013, "flos": 33911810709120.0, "grad_norm": 2.3360721068148647, "language_loss": 0.75758135, "learning_rate": 3.2552274779255785e-06, "loss": 0.77969611, "num_input_tokens_seen": 54651985, "step": 2541, "time_per_iteration": 2.568037509918213 }, { "auxiliary_loss_clip": 0.01188058, "auxiliary_loss_mlp": 0.01035742, "balance_loss_clip": 1.06177032, "balance_loss_mlp": 1.0269084, "epoch": 0.30565742800456924, "flos": 22268530051200.0, "grad_norm": 2.816432528797108, "language_loss": 0.7695508, "learning_rate": 3.2546209339212184e-06, "loss": 0.7917887, "num_input_tokens_seen": 54671005, "step": 2542, "time_per_iteration": 2.503436326980591 }, { "auxiliary_loss_clip": 0.01174199, "auxiliary_loss_mlp": 0.0103293, "balance_loss_clip": 1.05772519, "balance_loss_mlp": 1.02360773, "epoch": 0.3057776708952083, "flos": 22565044823040.0, "grad_norm": 4.817841391028305, "language_loss": 0.77093232, "learning_rate": 3.25401419959186e-06, "loss": 0.79300356, "num_input_tokens_seen": 54691615, "step": 2543, "time_per_iteration": 2.577075242996216 }, { "auxiliary_loss_clip": 0.01184035, "auxiliary_loss_mlp": 0.010404, "balance_loss_clip": 1.06369662, "balance_loss_mlp": 1.03090477, "epoch": 0.3058979137858474, "flos": 21799213925760.0, "grad_norm": 1.985366899540621, "language_loss": 0.76251411, "learning_rate": 3.253407275029545e-06, "loss": 0.78475845, "num_input_tokens_seen": 54710520, "step": 2544, "time_per_iteration": 2.552225112915039 }, { "auxiliary_loss_clip": 0.01164375, "auxiliary_loss_mlp": 0.01031448, "balance_loss_clip": 1.06153047, "balance_loss_mlp": 1.02136254, "epoch": 0.3060181566764865, "flos": 26979435601920.0, "grad_norm": 3.547934076312213, "language_loss": 0.79899305, "learning_rate": 3.2528001603263425e-06, "loss": 0.82095122, "num_input_tokens_seen": 54732590, "step": 2545, "time_per_iteration": 2.676417112350464 }, { "auxiliary_loss_clip": 0.01189625, "auxiliary_loss_mlp": 0.01032753, "balance_loss_clip": 1.06468487, "balance_loss_mlp": 1.02368116, "epoch": 0.3061383995671256, "flos": 19865101173120.0, "grad_norm": 1.7584696023057218, "language_loss": 0.81316751, "learning_rate": 3.2521928555743514e-06, "loss": 0.83539128, "num_input_tokens_seen": 54749935, "step": 2546, "time_per_iteration": 2.502725124359131 }, { "auxiliary_loss_clip": 0.01165588, "auxiliary_loss_mlp": 0.00764089, "balance_loss_clip": 1.05534077, "balance_loss_mlp": 1.0003953, "epoch": 0.3062586424577647, "flos": 22127509255680.0, "grad_norm": 1.7343426097250114, "language_loss": 0.67941946, "learning_rate": 3.2515853608657e-06, "loss": 0.69871622, "num_input_tokens_seen": 54767935, "step": 2547, "time_per_iteration": 2.6365807056427 }, { "auxiliary_loss_clip": 0.01183558, "auxiliary_loss_mlp": 0.01034689, "balance_loss_clip": 1.06005025, "balance_loss_mlp": 1.02557516, "epoch": 0.3063788853484038, "flos": 20845497962880.0, "grad_norm": 1.9789148819918887, "language_loss": 0.75077724, "learning_rate": 3.250977676292545e-06, "loss": 0.77295965, "num_input_tokens_seen": 54786175, "step": 2548, "time_per_iteration": 2.62123966217041 }, { "auxiliary_loss_clip": 0.01176399, "auxiliary_loss_mlp": 0.01028232, "balance_loss_clip": 1.05896759, "balance_loss_mlp": 1.01921952, "epoch": 0.30649912823904285, "flos": 16209717707520.0, "grad_norm": 2.1408589079634472, "language_loss": 0.79484624, "learning_rate": 3.2503698019470712e-06, "loss": 0.8168925, "num_input_tokens_seen": 54801945, "step": 2549, "time_per_iteration": 3.4255385398864746 }, { "auxiliary_loss_clip": 0.01185958, "auxiliary_loss_mlp": 0.01035496, "balance_loss_clip": 1.05819559, "balance_loss_mlp": 1.02554846, "epoch": 0.30661937112968196, "flos": 18617815353600.0, "grad_norm": 2.9209145172391393, "language_loss": 0.77924049, "learning_rate": 3.249761737921492e-06, "loss": 0.80145496, "num_input_tokens_seen": 54818475, "step": 2550, "time_per_iteration": 2.5074427127838135 }, { "auxiliary_loss_clip": 0.01172711, "auxiliary_loss_mlp": 0.01039531, "balance_loss_clip": 1.06199098, "balance_loss_mlp": 1.0304476, "epoch": 0.30673961402032107, "flos": 31390809638400.0, "grad_norm": 1.930094948696054, "language_loss": 0.74417871, "learning_rate": 3.249153484308051e-06, "loss": 0.76630116, "num_input_tokens_seen": 54837090, "step": 2551, "time_per_iteration": 2.6570584774017334 }, { "auxiliary_loss_clip": 0.0113492, "auxiliary_loss_mlp": 0.01031278, "balance_loss_clip": 1.05287671, "balance_loss_mlp": 1.02201533, "epoch": 0.3068598569109601, "flos": 20229809915520.0, "grad_norm": 1.8974327612896555, "language_loss": 0.78112525, "learning_rate": 3.2485450411990194e-06, "loss": 0.8027873, "num_input_tokens_seen": 54856445, "step": 2552, "time_per_iteration": 2.609426736831665 }, { "auxiliary_loss_clip": 0.01203668, "auxiliary_loss_mlp": 0.01034169, "balance_loss_clip": 1.06156111, "balance_loss_mlp": 1.02482295, "epoch": 0.30698009980159924, "flos": 29601991399680.0, "grad_norm": 1.67458772363467, "language_loss": 0.82118469, "learning_rate": 3.2479364086866983e-06, "loss": 0.84356302, "num_input_tokens_seen": 54876700, "step": 2553, "time_per_iteration": 2.568513870239258 }, { "auxiliary_loss_clip": 0.01175701, "auxiliary_loss_mlp": 0.00764556, "balance_loss_clip": 1.06309676, "balance_loss_mlp": 1.00036275, "epoch": 0.30710034269223835, "flos": 23842423261440.0, "grad_norm": 2.0202817862949427, "language_loss": 0.81202972, "learning_rate": 3.247327586863416e-06, "loss": 0.83143228, "num_input_tokens_seen": 54897580, "step": 2554, "time_per_iteration": 2.635573387145996 }, { "auxiliary_loss_clip": 0.01164287, "auxiliary_loss_mlp": 0.01031833, "balance_loss_clip": 1.05838156, "balance_loss_mlp": 1.02221262, "epoch": 0.3072205855828774, "flos": 25884986152320.0, "grad_norm": 2.5539249194740923, "language_loss": 0.76973808, "learning_rate": 3.2467185758215304e-06, "loss": 0.79169923, "num_input_tokens_seen": 54917320, "step": 2555, "time_per_iteration": 4.326486110687256 }, { "auxiliary_loss_clip": 0.01164073, "auxiliary_loss_mlp": 0.00764421, "balance_loss_clip": 1.06065357, "balance_loss_mlp": 1.0003407, "epoch": 0.3073408284735165, "flos": 22236390357120.0, "grad_norm": 2.3399816496710737, "language_loss": 0.85378468, "learning_rate": 3.246109375653428e-06, "loss": 0.87306958, "num_input_tokens_seen": 54934085, "step": 2556, "time_per_iteration": 2.566512107849121 }, { "auxiliary_loss_clip": 0.01202081, "auxiliary_loss_mlp": 0.01032906, "balance_loss_clip": 1.06287003, "balance_loss_mlp": 1.02345324, "epoch": 0.30746107136415557, "flos": 19500284689920.0, "grad_norm": 1.8613564569188183, "language_loss": 0.78450274, "learning_rate": 3.2454999864515243e-06, "loss": 0.80685258, "num_input_tokens_seen": 54953460, "step": 2557, "time_per_iteration": 2.485262632369995 }, { "auxiliary_loss_clip": 0.01169424, "auxiliary_loss_mlp": 0.00764566, "balance_loss_clip": 1.05856824, "balance_loss_mlp": 1.00033796, "epoch": 0.3075813142547947, "flos": 21724806902400.0, "grad_norm": 1.9734679764863288, "language_loss": 0.69481289, "learning_rate": 3.244890408308263e-06, "loss": 0.71415275, "num_input_tokens_seen": 54974165, "step": 2558, "time_per_iteration": 2.5684025287628174 }, { "auxiliary_loss_clip": 0.01143103, "auxiliary_loss_mlp": 0.01027458, "balance_loss_clip": 1.0528121, "balance_loss_mlp": 1.01857102, "epoch": 0.3077015571454338, "flos": 24097963593600.0, "grad_norm": 2.0964095580441033, "language_loss": 0.61549819, "learning_rate": 3.2442806413161165e-06, "loss": 0.63720381, "num_input_tokens_seen": 54993810, "step": 2559, "time_per_iteration": 2.6360771656036377 }, { "auxiliary_loss_clip": 0.01145844, "auxiliary_loss_mlp": 0.01036367, "balance_loss_clip": 1.05490255, "balance_loss_mlp": 1.02663326, "epoch": 0.30782180003607285, "flos": 18405476104320.0, "grad_norm": 2.262498332346023, "language_loss": 0.75669622, "learning_rate": 3.243670685567586e-06, "loss": 0.77851838, "num_input_tokens_seen": 55011210, "step": 2560, "time_per_iteration": 2.574000835418701 }, { "auxiliary_loss_clip": 0.01167786, "auxiliary_loss_mlp": 0.00763195, "balance_loss_clip": 1.05660057, "balance_loss_mlp": 1.000337, "epoch": 0.30794204292671196, "flos": 23878549365120.0, "grad_norm": 2.2030241756908877, "language_loss": 0.80321157, "learning_rate": 3.2430605411552012e-06, "loss": 0.82252139, "num_input_tokens_seen": 55031325, "step": 2561, "time_per_iteration": 2.574800968170166 }, { "auxiliary_loss_clip": 0.0106763, "auxiliary_loss_mlp": 0.01002978, "balance_loss_clip": 1.02392912, "balance_loss_mlp": 1.00163066, "epoch": 0.30806228581735107, "flos": 67927800816000.0, "grad_norm": 0.895991085028544, "language_loss": 0.70571971, "learning_rate": 3.2424502081715205e-06, "loss": 0.72642577, "num_input_tokens_seen": 55094440, "step": 2562, "time_per_iteration": 3.1551928520202637 }, { "auxiliary_loss_clip": 0.01173007, "auxiliary_loss_mlp": 0.01035635, "balance_loss_clip": 1.05844688, "balance_loss_mlp": 1.02624738, "epoch": 0.3081825287079901, "flos": 23843213360640.0, "grad_norm": 1.7024109787593373, "language_loss": 0.78258002, "learning_rate": 3.241839686709132e-06, "loss": 0.80466652, "num_input_tokens_seen": 55115375, "step": 2563, "time_per_iteration": 2.5709052085876465 }, { "auxiliary_loss_clip": 0.01182179, "auxiliary_loss_mlp": 0.01033847, "balance_loss_clip": 1.05560935, "balance_loss_mlp": 1.02456093, "epoch": 0.30830277159862923, "flos": 16209969102720.0, "grad_norm": 2.2087492961752875, "language_loss": 0.81828892, "learning_rate": 3.2412289768606495e-06, "loss": 0.84044921, "num_input_tokens_seen": 55131945, "step": 2564, "time_per_iteration": 2.4789037704467773 }, { "auxiliary_loss_clip": 0.01188191, "auxiliary_loss_mlp": 0.01036984, "balance_loss_clip": 1.06065464, "balance_loss_mlp": 1.02835298, "epoch": 0.30842301448926834, "flos": 29349503723520.0, "grad_norm": 1.7893080932667713, "language_loss": 0.82809997, "learning_rate": 3.240618078718718e-06, "loss": 0.85035169, "num_input_tokens_seen": 55153405, "step": 2565, "time_per_iteration": 2.6351876258850098 }, { "auxiliary_loss_clip": 0.01153988, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.05391157, "balance_loss_mlp": 1.02522802, "epoch": 0.3085432573799074, "flos": 21945190798080.0, "grad_norm": 2.7851221701190694, "language_loss": 0.74114138, "learning_rate": 3.240006992376011e-06, "loss": 0.7630266, "num_input_tokens_seen": 55173030, "step": 2566, "time_per_iteration": 2.579993724822998 }, { "auxiliary_loss_clip": 0.01175019, "auxiliary_loss_mlp": 0.0104024, "balance_loss_clip": 1.05949759, "balance_loss_mlp": 1.03106689, "epoch": 0.3086635002705465, "flos": 22054718344320.0, "grad_norm": 2.198744559276731, "language_loss": 0.75980198, "learning_rate": 3.2393957179252284e-06, "loss": 0.78195465, "num_input_tokens_seen": 55189565, "step": 2567, "time_per_iteration": 2.548823833465576 }, { "auxiliary_loss_clip": 0.0119968, "auxiliary_loss_mlp": 0.01032221, "balance_loss_clip": 1.06107259, "balance_loss_mlp": 1.02361417, "epoch": 0.3087837431611856, "flos": 32665925520000.0, "grad_norm": 2.21318109491651, "language_loss": 0.8092196, "learning_rate": 3.2387842554591016e-06, "loss": 0.83153862, "num_input_tokens_seen": 55210380, "step": 2568, "time_per_iteration": 2.575373411178589 }, { "auxiliary_loss_clip": 0.0119888, "auxiliary_loss_mlp": 0.01038499, "balance_loss_clip": 1.0610764, "balance_loss_mlp": 1.02949309, "epoch": 0.3089039860518247, "flos": 17599245384960.0, "grad_norm": 2.0483665100440356, "language_loss": 0.87769192, "learning_rate": 3.238172605070388e-06, "loss": 0.90006578, "num_input_tokens_seen": 55225795, "step": 2569, "time_per_iteration": 2.4413695335388184 }, { "auxiliary_loss_clip": 0.01182767, "auxiliary_loss_mlp": 0.00764516, "balance_loss_clip": 1.05796194, "balance_loss_mlp": 1.00036359, "epoch": 0.3090242289424638, "flos": 14383839611520.0, "grad_norm": 2.2588303445951907, "language_loss": 0.78495175, "learning_rate": 3.2375607668518745e-06, "loss": 0.80442452, "num_input_tokens_seen": 55238830, "step": 2570, "time_per_iteration": 2.4739160537719727 }, { "auxiliary_loss_clip": 0.01161531, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.05574048, "balance_loss_mlp": 1.02392864, "epoch": 0.30914447183310284, "flos": 16068625084800.0, "grad_norm": 2.4154320674368708, "language_loss": 0.90108687, "learning_rate": 3.236948740896377e-06, "loss": 0.923033, "num_input_tokens_seen": 55253630, "step": 2571, "time_per_iteration": 2.5089807510375977 }, { "auxiliary_loss_clip": 0.01184009, "auxiliary_loss_mlp": 0.01033267, "balance_loss_clip": 1.05975842, "balance_loss_mlp": 1.02452934, "epoch": 0.30926471472374195, "flos": 32230221546240.0, "grad_norm": 1.3806592437244731, "language_loss": 0.84018606, "learning_rate": 3.2363365272967384e-06, "loss": 0.86235881, "num_input_tokens_seen": 55276200, "step": 2572, "time_per_iteration": 2.575801134109497 }, { "auxiliary_loss_clip": 0.01182971, "auxiliary_loss_mlp": 0.01036158, "balance_loss_clip": 1.06215227, "balance_loss_mlp": 1.02581048, "epoch": 0.30938495761438106, "flos": 20370722970240.0, "grad_norm": 1.8761373595849862, "language_loss": 0.81487721, "learning_rate": 3.235724126145832e-06, "loss": 0.8370685, "num_input_tokens_seen": 55292235, "step": 2573, "time_per_iteration": 2.4904286861419678 }, { "auxiliary_loss_clip": 0.01174138, "auxiliary_loss_mlp": 0.01036856, "balance_loss_clip": 1.05594969, "balance_loss_mlp": 1.02745676, "epoch": 0.3095052005050201, "flos": 24061155131520.0, "grad_norm": 1.493099705913763, "language_loss": 0.77647698, "learning_rate": 3.235111537536558e-06, "loss": 0.79858696, "num_input_tokens_seen": 55313050, "step": 2574, "time_per_iteration": 2.524824619293213 }, { "auxiliary_loss_clip": 0.01186069, "auxiliary_loss_mlp": 0.01026742, "balance_loss_clip": 1.06078708, "balance_loss_mlp": 1.01811147, "epoch": 0.30962544339565923, "flos": 23401547729280.0, "grad_norm": 1.8130970592096554, "language_loss": 0.8289758, "learning_rate": 3.2344987615618456e-06, "loss": 0.8511039, "num_input_tokens_seen": 55332885, "step": 2575, "time_per_iteration": 2.562351942062378 }, { "auxiliary_loss_clip": 0.01153689, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.05712855, "balance_loss_mlp": 1.03079736, "epoch": 0.30974568628629834, "flos": 33799984692480.0, "grad_norm": 1.5430371627268409, "language_loss": 0.78230202, "learning_rate": 3.2338857983146533e-06, "loss": 0.80423671, "num_input_tokens_seen": 55354385, "step": 2576, "time_per_iteration": 2.6436009407043457 }, { "auxiliary_loss_clip": 0.01159953, "auxiliary_loss_mlp": 0.01027954, "balance_loss_clip": 1.05686581, "balance_loss_mlp": 1.01823258, "epoch": 0.3098659291769374, "flos": 20229594433920.0, "grad_norm": 1.7415915432371871, "language_loss": 0.76381147, "learning_rate": 3.233272647887966e-06, "loss": 0.78569055, "num_input_tokens_seen": 55373275, "step": 2577, "time_per_iteration": 3.376648187637329 }, { "auxiliary_loss_clip": 0.01201486, "auxiliary_loss_mlp": 0.0103839, "balance_loss_clip": 1.06256604, "balance_loss_mlp": 1.02931213, "epoch": 0.3099861720675765, "flos": 24748556682240.0, "grad_norm": 1.5511575236520265, "language_loss": 0.8996799, "learning_rate": 3.2326593103747985e-06, "loss": 0.92207867, "num_input_tokens_seen": 55392290, "step": 2578, "time_per_iteration": 2.5045881271362305 }, { "auxiliary_loss_clip": 0.01182392, "auxiliary_loss_mlp": 0.01033662, "balance_loss_clip": 1.06043911, "balance_loss_mlp": 1.02469134, "epoch": 0.3101064149582156, "flos": 11765485704960.0, "grad_norm": 3.4427079003615595, "language_loss": 0.85038143, "learning_rate": 3.2320457858681936e-06, "loss": 0.87254196, "num_input_tokens_seen": 55410680, "step": 2579, "time_per_iteration": 2.4898135662078857 }, { "auxiliary_loss_clip": 0.01169894, "auxiliary_loss_mlp": 0.01027, "balance_loss_clip": 1.05594587, "balance_loss_mlp": 1.0181253, "epoch": 0.31022665784885467, "flos": 23033247626880.0, "grad_norm": 2.0790316675104306, "language_loss": 0.85322726, "learning_rate": 3.2314320744612228e-06, "loss": 0.87519616, "num_input_tokens_seen": 55425980, "step": 2580, "time_per_iteration": 2.5371475219726562 }, { "auxiliary_loss_clip": 0.0118271, "auxiliary_loss_mlp": 0.01029878, "balance_loss_clip": 1.06065238, "balance_loss_mlp": 1.02134824, "epoch": 0.3103469007394938, "flos": 16289188548480.0, "grad_norm": 1.6085496797856134, "language_loss": 0.76590157, "learning_rate": 3.2308181762469854e-06, "loss": 0.78802741, "num_input_tokens_seen": 55443925, "step": 2581, "time_per_iteration": 2.503220558166504 }, { "auxiliary_loss_clip": 0.01200649, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.0603869, "balance_loss_mlp": 1.02676368, "epoch": 0.3104671436301329, "flos": 30515271626880.0, "grad_norm": 2.1078312850665246, "language_loss": 0.78558457, "learning_rate": 3.230204091318609e-06, "loss": 0.80794752, "num_input_tokens_seen": 55464465, "step": 2582, "time_per_iteration": 4.275298833847046 }, { "auxiliary_loss_clip": 0.01196594, "auxiliary_loss_mlp": 0.00762884, "balance_loss_clip": 1.0596447, "balance_loss_mlp": 1.00037193, "epoch": 0.31058738652077195, "flos": 20047240062720.0, "grad_norm": 1.7650193911695984, "language_loss": 0.84690374, "learning_rate": 3.2295898197692503e-06, "loss": 0.86649853, "num_input_tokens_seen": 55483425, "step": 2583, "time_per_iteration": 2.4883065223693848 }, { "auxiliary_loss_clip": 0.01197684, "auxiliary_loss_mlp": 0.0103378, "balance_loss_clip": 1.05907989, "balance_loss_mlp": 1.02538133, "epoch": 0.31070762941141106, "flos": 28074639237120.0, "grad_norm": 1.6860276225057864, "language_loss": 0.78884518, "learning_rate": 3.228975361692094e-06, "loss": 0.81115985, "num_input_tokens_seen": 55504445, "step": 2584, "time_per_iteration": 2.5420374870300293 }, { "auxiliary_loss_clip": 0.0119043, "auxiliary_loss_mlp": 0.00764102, "balance_loss_clip": 1.05986571, "balance_loss_mlp": 1.00039804, "epoch": 0.31082787230205017, "flos": 20521907314560.0, "grad_norm": 2.476101687890319, "language_loss": 0.80007064, "learning_rate": 3.228360717180352e-06, "loss": 0.81961596, "num_input_tokens_seen": 55521970, "step": 2585, "time_per_iteration": 2.5032527446746826 }, { "auxiliary_loss_clip": 0.01096014, "auxiliary_loss_mlp": 0.00753815, "balance_loss_clip": 1.02352905, "balance_loss_mlp": 1.00082803, "epoch": 0.3109481151926892, "flos": 62445928723200.0, "grad_norm": 0.8243884665032329, "language_loss": 0.59454364, "learning_rate": 3.227745886327266e-06, "loss": 0.61304194, "num_input_tokens_seen": 55580665, "step": 2586, "time_per_iteration": 3.005657434463501 }, { "auxiliary_loss_clip": 0.01095846, "auxiliary_loss_mlp": 0.01005528, "balance_loss_clip": 1.02343237, "balance_loss_mlp": 1.00436008, "epoch": 0.31106835808332833, "flos": 44746744723200.0, "grad_norm": 0.80831643848649, "language_loss": 0.55916452, "learning_rate": 3.227130869226105e-06, "loss": 0.58017826, "num_input_tokens_seen": 55637825, "step": 2587, "time_per_iteration": 3.029630661010742 }, { "auxiliary_loss_clip": 0.01181659, "auxiliary_loss_mlp": 0.01023639, "balance_loss_clip": 1.05678034, "balance_loss_mlp": 1.01563406, "epoch": 0.3111886009739674, "flos": 23403056100480.0, "grad_norm": 2.407839259251215, "language_loss": 0.83063811, "learning_rate": 3.226515665970167e-06, "loss": 0.85269111, "num_input_tokens_seen": 55655365, "step": 2588, "time_per_iteration": 2.5212247371673584 }, { "auxiliary_loss_clip": 0.01182968, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.0575577, "balance_loss_mlp": 1.0193181, "epoch": 0.3113088438646065, "flos": 17530728192000.0, "grad_norm": 2.3574393718216236, "language_loss": 0.85905403, "learning_rate": 3.225900276652777e-06, "loss": 0.8811692, "num_input_tokens_seen": 55672140, "step": 2589, "time_per_iteration": 2.507843017578125 }, { "auxiliary_loss_clip": 0.01174194, "auxiliary_loss_mlp": 0.01035051, "balance_loss_clip": 1.05657244, "balance_loss_mlp": 1.02644444, "epoch": 0.3114290867552456, "flos": 28365802882560.0, "grad_norm": 1.711456013896892, "language_loss": 0.75411683, "learning_rate": 3.2252847013672906e-06, "loss": 0.77620929, "num_input_tokens_seen": 55694800, "step": 2590, "time_per_iteration": 2.6080386638641357 }, { "auxiliary_loss_clip": 0.01145343, "auxiliary_loss_mlp": 0.01027895, "balance_loss_clip": 1.05118597, "balance_loss_mlp": 1.01910901, "epoch": 0.31154932964588467, "flos": 27379157126400.0, "grad_norm": 1.9189915256378762, "language_loss": 0.76221371, "learning_rate": 3.224668940207089e-06, "loss": 0.7839461, "num_input_tokens_seen": 55713785, "step": 2591, "time_per_iteration": 2.7283730506896973 }, { "auxiliary_loss_clip": 0.01129153, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.04771507, "balance_loss_mlp": 1.02913737, "epoch": 0.3116695725365238, "flos": 26541864120960.0, "grad_norm": 1.7434602583579224, "language_loss": 0.86475694, "learning_rate": 3.2240529932655828e-06, "loss": 0.88643205, "num_input_tokens_seen": 55733050, "step": 2592, "time_per_iteration": 2.645447015762329 }, { "auxiliary_loss_clip": 0.01165464, "auxiliary_loss_mlp": 0.01037197, "balance_loss_clip": 1.05661762, "balance_loss_mlp": 1.02818441, "epoch": 0.3117898154271629, "flos": 21177600134400.0, "grad_norm": 2.420129463294857, "language_loss": 0.88537049, "learning_rate": 3.223436860636211e-06, "loss": 0.90739703, "num_input_tokens_seen": 55748685, "step": 2593, "time_per_iteration": 2.565955400466919 }, { "auxiliary_loss_clip": 0.01198644, "auxiliary_loss_mlp": 0.01037605, "balance_loss_clip": 1.06061149, "balance_loss_mlp": 1.02911162, "epoch": 0.31191005831780194, "flos": 27272430840960.0, "grad_norm": 1.6335165093259305, "language_loss": 0.7407468, "learning_rate": 3.2228205424124403e-06, "loss": 0.76310927, "num_input_tokens_seen": 55771840, "step": 2594, "time_per_iteration": 2.57686710357666 }, { "auxiliary_loss_clip": 0.01155782, "auxiliary_loss_mlp": 0.01025274, "balance_loss_clip": 1.05371058, "balance_loss_mlp": 1.01619005, "epoch": 0.31203030120844105, "flos": 12963501043200.0, "grad_norm": 2.4080262192992126, "language_loss": 0.74761307, "learning_rate": 3.222204038687765e-06, "loss": 0.7694236, "num_input_tokens_seen": 55784975, "step": 2595, "time_per_iteration": 2.5144760608673096 }, { "auxiliary_loss_clip": 0.01178774, "auxiliary_loss_mlp": 0.01028422, "balance_loss_clip": 1.05625701, "balance_loss_mlp": 1.02026248, "epoch": 0.31215054409908016, "flos": 27562014288000.0, "grad_norm": 1.7144754742947774, "language_loss": 0.87691414, "learning_rate": 3.221587349555709e-06, "loss": 0.8989861, "num_input_tokens_seen": 55805235, "step": 2596, "time_per_iteration": 2.611163854598999 }, { "auxiliary_loss_clip": 0.01173395, "auxiliary_loss_mlp": 0.01026693, "balance_loss_clip": 1.05747008, "balance_loss_mlp": 1.01781225, "epoch": 0.3122707869897192, "flos": 21506326427520.0, "grad_norm": 1.6947963425357557, "language_loss": 0.69178277, "learning_rate": 3.2209704751098236e-06, "loss": 0.71378362, "num_input_tokens_seen": 55824265, "step": 2597, "time_per_iteration": 2.5642731189727783 }, { "auxiliary_loss_clip": 0.01172083, "auxiliary_loss_mlp": 0.01029978, "balance_loss_clip": 1.05868542, "balance_loss_mlp": 1.02103162, "epoch": 0.31239102988035833, "flos": 15187017674880.0, "grad_norm": 2.015586882940876, "language_loss": 0.82808524, "learning_rate": 3.2203534154436875e-06, "loss": 0.85010588, "num_input_tokens_seen": 55838620, "step": 2598, "time_per_iteration": 2.5268592834472656 }, { "auxiliary_loss_clip": 0.0112323, "auxiliary_loss_mlp": 0.01040877, "balance_loss_clip": 1.0518713, "balance_loss_mlp": 1.03202569, "epoch": 0.31251127277099744, "flos": 22053712763520.0, "grad_norm": 1.9494711890694392, "language_loss": 0.75375074, "learning_rate": 3.2197361706509084e-06, "loss": 0.77539182, "num_input_tokens_seen": 55859375, "step": 2599, "time_per_iteration": 2.629678964614868 }, { "auxiliary_loss_clip": 0.01201196, "auxiliary_loss_mlp": 0.01033082, "balance_loss_clip": 1.05931282, "balance_loss_mlp": 1.02324128, "epoch": 0.3126315156616365, "flos": 15193984913280.0, "grad_norm": 2.536284799310931, "language_loss": 0.8332063, "learning_rate": 3.2191187408251228e-06, "loss": 0.8555491, "num_input_tokens_seen": 55876535, "step": 2600, "time_per_iteration": 2.469064950942993 }, { "auxiliary_loss_clip": 0.01190933, "auxiliary_loss_mlp": 0.0103469, "balance_loss_clip": 1.05906367, "balance_loss_mlp": 1.02470016, "epoch": 0.3127517585522756, "flos": 18145338831360.0, "grad_norm": 2.0636303833563083, "language_loss": 0.7878629, "learning_rate": 3.218501126059993e-06, "loss": 0.81011915, "num_input_tokens_seen": 55891930, "step": 2601, "time_per_iteration": 2.456859588623047 }, { "auxiliary_loss_clip": 0.01186845, "auxiliary_loss_mlp": 0.01028349, "balance_loss_clip": 1.05746675, "balance_loss_mlp": 1.01906276, "epoch": 0.31287200144291466, "flos": 21908633731200.0, "grad_norm": 1.9752061429597036, "language_loss": 0.81559646, "learning_rate": 3.2178833264492116e-06, "loss": 0.83774841, "num_input_tokens_seen": 55910635, "step": 2602, "time_per_iteration": 2.511857271194458 }, { "auxiliary_loss_clip": 0.01193463, "auxiliary_loss_mlp": 0.01027102, "balance_loss_clip": 1.06080794, "balance_loss_mlp": 1.01791739, "epoch": 0.31299224433355377, "flos": 29896997800320.0, "grad_norm": 2.0025992454860844, "language_loss": 0.76295877, "learning_rate": 3.217265342086498e-06, "loss": 0.78516448, "num_input_tokens_seen": 55931125, "step": 2603, "time_per_iteration": 3.4304370880126953 }, { "auxiliary_loss_clip": 0.01161095, "auxiliary_loss_mlp": 0.00765278, "balance_loss_clip": 1.05863607, "balance_loss_mlp": 1.00053036, "epoch": 0.3131124872241929, "flos": 11655886331520.0, "grad_norm": 4.3125303152270975, "language_loss": 0.72939312, "learning_rate": 3.216647173065599e-06, "loss": 0.74865687, "num_input_tokens_seen": 55946590, "step": 2604, "time_per_iteration": 2.5584330558776855 }, { "auxiliary_loss_clip": 0.01169084, "auxiliary_loss_mlp": 0.01034275, "balance_loss_clip": 1.05905271, "balance_loss_mlp": 1.02479219, "epoch": 0.31323273011483194, "flos": 49848785470080.0, "grad_norm": 1.8110611455828947, "language_loss": 0.73628491, "learning_rate": 3.216028819480292e-06, "loss": 0.75831854, "num_input_tokens_seen": 55967930, "step": 2605, "time_per_iteration": 2.773589849472046 }, { "auxiliary_loss_clip": 0.01157006, "auxiliary_loss_mlp": 0.01033852, "balance_loss_clip": 1.0549798, "balance_loss_mlp": 1.02457714, "epoch": 0.31335297300547105, "flos": 22601278667520.0, "grad_norm": 3.366422634366582, "language_loss": 0.75332034, "learning_rate": 3.2154102814243793e-06, "loss": 0.77522898, "num_input_tokens_seen": 55987070, "step": 2606, "time_per_iteration": 2.541259765625 }, { "auxiliary_loss_clip": 0.01160178, "auxiliary_loss_mlp": 0.01034541, "balance_loss_clip": 1.05788648, "balance_loss_mlp": 1.02585053, "epoch": 0.31347321589611016, "flos": 34710858708480.0, "grad_norm": 2.3671053727266798, "language_loss": 0.66825628, "learning_rate": 3.2147915589916937e-06, "loss": 0.69020343, "num_input_tokens_seen": 56008630, "step": 2607, "time_per_iteration": 2.675459861755371 }, { "auxiliary_loss_clip": 0.0116368, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 1.05455661, "balance_loss_mlp": 1.02198386, "epoch": 0.3135934587867492, "flos": 19755789108480.0, "grad_norm": 2.367441191309818, "language_loss": 0.8307637, "learning_rate": 3.2141726522760938e-06, "loss": 0.85271144, "num_input_tokens_seen": 56026690, "step": 2608, "time_per_iteration": 4.257196664810181 }, { "auxiliary_loss_clip": 0.01080903, "auxiliary_loss_mlp": 0.01003236, "balance_loss_clip": 1.02172875, "balance_loss_mlp": 1.00213897, "epoch": 0.3137137016773883, "flos": 65815535583360.0, "grad_norm": 0.7093106504270107, "language_loss": 0.52659452, "learning_rate": 3.213553561371469e-06, "loss": 0.54743588, "num_input_tokens_seen": 56090425, "step": 2609, "time_per_iteration": 3.9358878135681152 }, { "auxiliary_loss_clip": 0.01138251, "auxiliary_loss_mlp": 0.01033492, "balance_loss_clip": 1.05418134, "balance_loss_mlp": 1.02505767, "epoch": 0.31383394456802743, "flos": 16252739222400.0, "grad_norm": 2.1073021052085177, "language_loss": 0.95681918, "learning_rate": 3.212934286371733e-06, "loss": 0.97853661, "num_input_tokens_seen": 56107135, "step": 2610, "time_per_iteration": 2.6059062480926514 }, { "auxiliary_loss_clip": 0.01187071, "auxiliary_loss_mlp": 0.01029251, "balance_loss_clip": 1.06295073, "balance_loss_mlp": 1.01986957, "epoch": 0.3139541874586665, "flos": 38795517613440.0, "grad_norm": 2.136827029398247, "language_loss": 0.830742, "learning_rate": 3.2123148273708304e-06, "loss": 0.85290515, "num_input_tokens_seen": 56127325, "step": 2611, "time_per_iteration": 2.7035458087921143 }, { "auxiliary_loss_clip": 0.01197998, "auxiliary_loss_mlp": 0.01028919, "balance_loss_clip": 1.06103134, "balance_loss_mlp": 1.01931071, "epoch": 0.3140744303493056, "flos": 25046328430080.0, "grad_norm": 3.679265695046637, "language_loss": 0.7698766, "learning_rate": 3.211695184462733e-06, "loss": 0.79214579, "num_input_tokens_seen": 56148500, "step": 2612, "time_per_iteration": 2.5048346519470215 }, { "auxiliary_loss_clip": 0.01063136, "auxiliary_loss_mlp": 0.01003309, "balance_loss_clip": 1.02280474, "balance_loss_mlp": 1.002159, "epoch": 0.3141946732399447, "flos": 72504254782080.0, "grad_norm": 0.8938898730384528, "language_loss": 0.60477912, "learning_rate": 3.2110753577414383e-06, "loss": 0.62544358, "num_input_tokens_seen": 56210080, "step": 2613, "time_per_iteration": 3.12212872505188 }, { "auxiliary_loss_clip": 0.01171845, "auxiliary_loss_mlp": 0.0102732, "balance_loss_clip": 1.05562329, "balance_loss_mlp": 1.01834309, "epoch": 0.31431491613058377, "flos": 19239788280960.0, "grad_norm": 1.8428816284518195, "language_loss": 0.79341722, "learning_rate": 3.2104553473009757e-06, "loss": 0.81540883, "num_input_tokens_seen": 56228200, "step": 2614, "time_per_iteration": 2.6002743244171143 }, { "auxiliary_loss_clip": 0.01134423, "auxiliary_loss_mlp": 0.01029078, "balance_loss_clip": 1.05161834, "balance_loss_mlp": 1.02038729, "epoch": 0.3144351590212229, "flos": 36210596290560.0, "grad_norm": 2.0145890970336846, "language_loss": 0.68034756, "learning_rate": 3.209835153235399e-06, "loss": 0.70198256, "num_input_tokens_seen": 56249755, "step": 2615, "time_per_iteration": 2.747421979904175 }, { "auxiliary_loss_clip": 0.01144817, "auxiliary_loss_mlp": 0.01029793, "balance_loss_clip": 1.05205631, "balance_loss_mlp": 1.02116847, "epoch": 0.314555401911862, "flos": 18551740285440.0, "grad_norm": 1.982516614324758, "language_loss": 0.67601323, "learning_rate": 3.2092147756387916e-06, "loss": 0.69775939, "num_input_tokens_seen": 56270080, "step": 2616, "time_per_iteration": 2.594224691390991 }, { "auxiliary_loss_clip": 0.0116238, "auxiliary_loss_mlp": 0.01031769, "balance_loss_clip": 1.05348861, "balance_loss_mlp": 1.02202916, "epoch": 0.31467564480250104, "flos": 16362877299840.0, "grad_norm": 2.235411078431745, "language_loss": 0.83681297, "learning_rate": 3.208594214605264e-06, "loss": 0.85875446, "num_input_tokens_seen": 56288625, "step": 2617, "time_per_iteration": 2.5438573360443115 }, { "auxiliary_loss_clip": 0.01158166, "auxiliary_loss_mlp": 0.01031737, "balance_loss_clip": 1.05377257, "balance_loss_mlp": 1.02332664, "epoch": 0.31479588769314015, "flos": 21652375127040.0, "grad_norm": 1.9088795915386598, "language_loss": 0.77119839, "learning_rate": 3.2079734702289553e-06, "loss": 0.79309744, "num_input_tokens_seen": 56307520, "step": 2618, "time_per_iteration": 2.5629146099090576 }, { "auxiliary_loss_clip": 0.01080781, "auxiliary_loss_mlp": 0.00753882, "balance_loss_clip": 1.02235162, "balance_loss_mlp": 1.00093412, "epoch": 0.3149161305837792, "flos": 66051072040320.0, "grad_norm": 0.8130474375700619, "language_loss": 0.60432166, "learning_rate": 3.207352542604031e-06, "loss": 0.62266827, "num_input_tokens_seen": 56369855, "step": 2619, "time_per_iteration": 3.16501522064209 }, { "auxiliary_loss_clip": 0.0114237, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 1.05244112, "balance_loss_mlp": 1.02701545, "epoch": 0.3150363734744183, "flos": 28987201192320.0, "grad_norm": 1.472725751268888, "language_loss": 0.78521335, "learning_rate": 3.2067314318246864e-06, "loss": 0.80699223, "num_input_tokens_seen": 56390570, "step": 2620, "time_per_iteration": 2.6150474548339844 }, { "auxiliary_loss_clip": 0.0116014, "auxiliary_loss_mlp": 0.01027545, "balance_loss_clip": 1.05875874, "balance_loss_mlp": 1.01849079, "epoch": 0.31515661636505743, "flos": 27636600879360.0, "grad_norm": 1.834349620895664, "language_loss": 0.77787185, "learning_rate": 3.206110137985143e-06, "loss": 0.79974866, "num_input_tokens_seen": 56410775, "step": 2621, "time_per_iteration": 2.601893663406372 }, { "auxiliary_loss_clip": 0.01142134, "auxiliary_loss_mlp": 0.01031852, "balance_loss_clip": 1.05157673, "balance_loss_mlp": 1.02236319, "epoch": 0.3152768592556965, "flos": 24605632465920.0, "grad_norm": 2.046596875939888, "language_loss": 0.92146122, "learning_rate": 3.2054886611796505e-06, "loss": 0.94320112, "num_input_tokens_seen": 56429770, "step": 2622, "time_per_iteration": 2.5968804359436035 }, { "auxiliary_loss_clip": 0.01092255, "auxiliary_loss_mlp": 0.01005424, "balance_loss_clip": 1.02100754, "balance_loss_mlp": 1.00423741, "epoch": 0.3153971021463356, "flos": 68476908026880.0, "grad_norm": 0.9058345926050926, "language_loss": 0.63517946, "learning_rate": 3.204867001502487e-06, "loss": 0.65615624, "num_input_tokens_seen": 56488425, "step": 2623, "time_per_iteration": 3.0292139053344727 }, { "auxiliary_loss_clip": 0.01200761, "auxiliary_loss_mlp": 0.01037772, "balance_loss_clip": 1.06225538, "balance_loss_mlp": 1.02772212, "epoch": 0.3155173450369747, "flos": 25593714766080.0, "grad_norm": 2.7139432294518127, "language_loss": 0.80709815, "learning_rate": 3.2042451590479567e-06, "loss": 0.82948345, "num_input_tokens_seen": 56508940, "step": 2624, "time_per_iteration": 2.5205681324005127 }, { "auxiliary_loss_clip": 0.01195533, "auxiliary_loss_mlp": 0.01030614, "balance_loss_clip": 1.06097662, "balance_loss_mlp": 1.02177453, "epoch": 0.31563758792761376, "flos": 24309333175680.0, "grad_norm": 1.6041019879332563, "language_loss": 0.86744505, "learning_rate": 3.203623133910394e-06, "loss": 0.88970655, "num_input_tokens_seen": 56527245, "step": 2625, "time_per_iteration": 2.5068979263305664 }, { "auxiliary_loss_clip": 0.0112798, "auxiliary_loss_mlp": 0.01031815, "balance_loss_clip": 1.05176115, "balance_loss_mlp": 1.02295136, "epoch": 0.31575783081825287, "flos": 31903865550720.0, "grad_norm": 2.8678377148333003, "language_loss": 0.77635121, "learning_rate": 3.203000926184158e-06, "loss": 0.79794919, "num_input_tokens_seen": 56546170, "step": 2626, "time_per_iteration": 2.7037241458892822 }, { "auxiliary_loss_clip": 0.01196746, "auxiliary_loss_mlp": 0.0103294, "balance_loss_clip": 1.06001151, "balance_loss_mlp": 1.02461886, "epoch": 0.315878073708892, "flos": 30810960385920.0, "grad_norm": 1.60115179440242, "language_loss": 0.77441078, "learning_rate": 3.202378535963639e-06, "loss": 0.79670763, "num_input_tokens_seen": 56567085, "step": 2627, "time_per_iteration": 2.5422019958496094 }, { "auxiliary_loss_clip": 0.01160373, "auxiliary_loss_mlp": 0.00764797, "balance_loss_clip": 1.0549165, "balance_loss_mlp": 1.00059295, "epoch": 0.31599831659953104, "flos": 22200264253440.0, "grad_norm": 1.7362295986004062, "language_loss": 0.83955228, "learning_rate": 3.2017559633432516e-06, "loss": 0.85880399, "num_input_tokens_seen": 56586715, "step": 2628, "time_per_iteration": 2.552711009979248 }, { "auxiliary_loss_clip": 0.01175928, "auxiliary_loss_mlp": 0.01035718, "balance_loss_clip": 1.05587053, "balance_loss_mlp": 1.02692628, "epoch": 0.31611855949017015, "flos": 25593463370880.0, "grad_norm": 3.1274857547054555, "language_loss": 0.66390634, "learning_rate": 3.2011332084174398e-06, "loss": 0.68602282, "num_input_tokens_seen": 56607585, "step": 2629, "time_per_iteration": 2.5938720703125 }, { "auxiliary_loss_clip": 0.0118026, "auxiliary_loss_mlp": 0.01030081, "balance_loss_clip": 1.05879664, "balance_loss_mlp": 1.02122974, "epoch": 0.31623880238080926, "flos": 20594087694720.0, "grad_norm": 1.806039296557657, "language_loss": 0.89102179, "learning_rate": 3.2005102712806756e-06, "loss": 0.91312522, "num_input_tokens_seen": 56626415, "step": 2630, "time_per_iteration": 3.357517719268799 }, { "auxiliary_loss_clip": 0.01185915, "auxiliary_loss_mlp": 0.01033641, "balance_loss_clip": 1.05891585, "balance_loss_mlp": 1.02447951, "epoch": 0.3163590452714483, "flos": 12784917600000.0, "grad_norm": 2.2428645288132847, "language_loss": 0.72755176, "learning_rate": 3.1998871520274575e-06, "loss": 0.74974728, "num_input_tokens_seen": 56641750, "step": 2631, "time_per_iteration": 2.4795680046081543 }, { "auxiliary_loss_clip": 0.01169185, "auxiliary_loss_mlp": 0.01035875, "balance_loss_clip": 1.05409968, "balance_loss_mlp": 1.02701151, "epoch": 0.3164792881620874, "flos": 23041292273280.0, "grad_norm": 1.9105310617123425, "language_loss": 0.84661406, "learning_rate": 3.199263850752312e-06, "loss": 0.86866462, "num_input_tokens_seen": 56662585, "step": 2632, "time_per_iteration": 2.5387017726898193 }, { "auxiliary_loss_clip": 0.01183183, "auxiliary_loss_mlp": 0.01035867, "balance_loss_clip": 1.05679798, "balance_loss_mlp": 1.02616882, "epoch": 0.31659953105272653, "flos": 18296271780480.0, "grad_norm": 1.9980979232432317, "language_loss": 0.85386515, "learning_rate": 3.198640367549795e-06, "loss": 0.8760556, "num_input_tokens_seen": 56681480, "step": 2633, "time_per_iteration": 2.5256948471069336 }, { "auxiliary_loss_clip": 0.01182296, "auxiliary_loss_mlp": 0.00763639, "balance_loss_clip": 1.05698788, "balance_loss_mlp": 1.00052941, "epoch": 0.3167197739433656, "flos": 25703421880320.0, "grad_norm": 1.6505522849425667, "language_loss": 0.85933113, "learning_rate": 3.198016702514487e-06, "loss": 0.8787905, "num_input_tokens_seen": 56701760, "step": 2634, "time_per_iteration": 2.5501585006713867 }, { "auxiliary_loss_clip": 0.01195496, "auxiliary_loss_mlp": 0.01030855, "balance_loss_clip": 1.05887139, "balance_loss_mlp": 1.02205682, "epoch": 0.3168400168340047, "flos": 23546016230400.0, "grad_norm": 1.583378822339557, "language_loss": 0.84266776, "learning_rate": 3.1973928557409972e-06, "loss": 0.86493123, "num_input_tokens_seen": 56719800, "step": 2635, "time_per_iteration": 5.027114629745483 }, { "auxiliary_loss_clip": 0.01193628, "auxiliary_loss_mlp": 0.01031587, "balance_loss_clip": 1.05843687, "balance_loss_mlp": 1.02272344, "epoch": 0.31696025972464376, "flos": 28366449327360.0, "grad_norm": 1.9198474815738502, "language_loss": 0.71574003, "learning_rate": 3.1967688273239636e-06, "loss": 0.73799217, "num_input_tokens_seen": 56739605, "step": 2636, "time_per_iteration": 2.531128406524658 }, { "auxiliary_loss_clip": 0.01154394, "auxiliary_loss_mlp": 0.01032035, "balance_loss_clip": 1.05565, "balance_loss_mlp": 1.0230881, "epoch": 0.31708050261528287, "flos": 16399111144320.0, "grad_norm": 1.6785021220979, "language_loss": 0.82015347, "learning_rate": 3.1961446173580503e-06, "loss": 0.84201783, "num_input_tokens_seen": 56756545, "step": 2637, "time_per_iteration": 2.5340161323547363 }, { "auxiliary_loss_clip": 0.01165216, "auxiliary_loss_mlp": 0.01032469, "balance_loss_clip": 1.05676579, "balance_loss_mlp": 1.02384412, "epoch": 0.317200745505922, "flos": 26212347728640.0, "grad_norm": 1.648827139568316, "language_loss": 0.77255428, "learning_rate": 3.1955202259379502e-06, "loss": 0.79453117, "num_input_tokens_seen": 56778275, "step": 2638, "time_per_iteration": 2.572671890258789 }, { "auxiliary_loss_clip": 0.01179761, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.05564928, "balance_loss_mlp": 1.02509892, "epoch": 0.31732098839656103, "flos": 31350876693120.0, "grad_norm": 2.1162610733862754, "language_loss": 0.82747221, "learning_rate": 3.194895653158381e-06, "loss": 0.84960705, "num_input_tokens_seen": 56797215, "step": 2639, "time_per_iteration": 2.5851449966430664 }, { "auxiliary_loss_clip": 0.01089319, "auxiliary_loss_mlp": 0.01001988, "balance_loss_clip": 1.01750064, "balance_loss_mlp": 1.00092149, "epoch": 0.31744123128720014, "flos": 58989024835200.0, "grad_norm": 0.9047474169079357, "language_loss": 0.55607307, "learning_rate": 3.194270899114093e-06, "loss": 0.57698613, "num_input_tokens_seen": 56863010, "step": 2640, "time_per_iteration": 3.142221689224243 }, { "auxiliary_loss_clip": 0.01187806, "auxiliary_loss_mlp": 0.01034459, "balance_loss_clip": 1.05908442, "balance_loss_mlp": 1.02434397, "epoch": 0.31756147417783925, "flos": 17417573372160.0, "grad_norm": 2.0362146685051843, "language_loss": 0.81996644, "learning_rate": 3.193645963899858e-06, "loss": 0.84218907, "num_input_tokens_seen": 56880625, "step": 2641, "time_per_iteration": 2.5452685356140137 }, { "auxiliary_loss_clip": 0.01161364, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 1.05526114, "balance_loss_mlp": 1.01926208, "epoch": 0.3176817170684783, "flos": 25481673267840.0, "grad_norm": 1.7722952033375337, "language_loss": 0.84076703, "learning_rate": 3.193020847610479e-06, "loss": 0.86266017, "num_input_tokens_seen": 56900945, "step": 2642, "time_per_iteration": 2.5563437938690186 }, { "auxiliary_loss_clip": 0.01163302, "auxiliary_loss_mlp": 0.01031854, "balance_loss_clip": 1.05836082, "balance_loss_mlp": 1.02279413, "epoch": 0.3178019599591174, "flos": 24972603765120.0, "grad_norm": 2.0340945530910575, "language_loss": 0.71406412, "learning_rate": 3.192395550340787e-06, "loss": 0.73601568, "num_input_tokens_seen": 56918895, "step": 2643, "time_per_iteration": 2.5983755588531494 }, { "auxiliary_loss_clip": 0.01182187, "auxiliary_loss_mlp": 0.01029862, "balance_loss_clip": 1.05884242, "balance_loss_mlp": 1.02141619, "epoch": 0.31792220284975653, "flos": 12422220019200.0, "grad_norm": 1.852321342700682, "language_loss": 0.76504725, "learning_rate": 3.191770072185638e-06, "loss": 0.78716767, "num_input_tokens_seen": 56935890, "step": 2644, "time_per_iteration": 2.5009708404541016 }, { "auxiliary_loss_clip": 0.01180588, "auxiliary_loss_mlp": 0.0103713, "balance_loss_clip": 1.05742192, "balance_loss_mlp": 1.0278852, "epoch": 0.3180424457403956, "flos": 15485759089920.0, "grad_norm": 2.395451684155793, "language_loss": 0.72973293, "learning_rate": 3.191144413239916e-06, "loss": 0.75191009, "num_input_tokens_seen": 56952460, "step": 2645, "time_per_iteration": 2.4313387870788574 }, { "auxiliary_loss_clip": 0.01169339, "auxiliary_loss_mlp": 0.01036704, "balance_loss_clip": 1.05692148, "balance_loss_mlp": 1.02751279, "epoch": 0.3181626886310347, "flos": 26174964648960.0, "grad_norm": 1.983108536040906, "language_loss": 0.88373291, "learning_rate": 3.190518573598534e-06, "loss": 0.90579337, "num_input_tokens_seen": 56969065, "step": 2646, "time_per_iteration": 2.5359935760498047 }, { "auxiliary_loss_clip": 0.01159165, "auxiliary_loss_mlp": 0.01037176, "balance_loss_clip": 1.05368567, "balance_loss_mlp": 1.02776408, "epoch": 0.3182829315216738, "flos": 25483109811840.0, "grad_norm": 2.122215686681643, "language_loss": 0.77337086, "learning_rate": 3.1898925533564308e-06, "loss": 0.79533428, "num_input_tokens_seen": 56990535, "step": 2647, "time_per_iteration": 2.6039280891418457 }, { "auxiliary_loss_clip": 0.01142102, "auxiliary_loss_mlp": 0.01034459, "balance_loss_clip": 1.05286932, "balance_loss_mlp": 1.02556622, "epoch": 0.31840317441231286, "flos": 18113701927680.0, "grad_norm": 2.342769357604192, "language_loss": 0.64212525, "learning_rate": 3.1892663526085733e-06, "loss": 0.66389084, "num_input_tokens_seen": 57008910, "step": 2648, "time_per_iteration": 2.537564516067505 }, { "auxiliary_loss_clip": 0.01088266, "auxiliary_loss_mlp": 0.01004107, "balance_loss_clip": 1.01677966, "balance_loss_mlp": 1.00302792, "epoch": 0.31852341730295197, "flos": 64741948957440.0, "grad_norm": 0.7484988669303206, "language_loss": 0.56965578, "learning_rate": 3.188639971449956e-06, "loss": 0.59057951, "num_input_tokens_seen": 57074960, "step": 2649, "time_per_iteration": 3.0121805667877197 }, { "auxiliary_loss_clip": 0.01197483, "auxiliary_loss_mlp": 0.01028465, "balance_loss_clip": 1.05930972, "balance_loss_mlp": 1.01973331, "epoch": 0.318643660193591, "flos": 20668135582080.0, "grad_norm": 2.331412653515872, "language_loss": 0.72185898, "learning_rate": 3.1880134099756e-06, "loss": 0.74411851, "num_input_tokens_seen": 57094595, "step": 2650, "time_per_iteration": 2.5160577297210693 }, { "auxiliary_loss_clip": 0.01177795, "auxiliary_loss_mlp": 0.0102325, "balance_loss_clip": 1.05430102, "balance_loss_mlp": 1.01451826, "epoch": 0.31876390308423014, "flos": 26943345411840.0, "grad_norm": 1.9460425718120204, "language_loss": 0.69406295, "learning_rate": 3.1873866682805535e-06, "loss": 0.71607339, "num_input_tokens_seen": 57115290, "step": 2651, "time_per_iteration": 2.5709779262542725 }, { "auxiliary_loss_clip": 0.01171562, "auxiliary_loss_mlp": 0.01033712, "balance_loss_clip": 1.05688798, "balance_loss_mlp": 1.02476549, "epoch": 0.31888414597486925, "flos": 18041916597120.0, "grad_norm": 1.938450377907265, "language_loss": 0.88781404, "learning_rate": 3.186759746459894e-06, "loss": 0.90986675, "num_input_tokens_seen": 57134400, "step": 2652, "time_per_iteration": 2.4988715648651123 }, { "auxiliary_loss_clip": 0.01167836, "auxiliary_loss_mlp": 0.01028148, "balance_loss_clip": 1.05722857, "balance_loss_mlp": 1.01909363, "epoch": 0.3190043888655083, "flos": 25149319701120.0, "grad_norm": 1.7924446837430996, "language_loss": 0.79769063, "learning_rate": 3.1861326446087246e-06, "loss": 0.81965047, "num_input_tokens_seen": 57153140, "step": 2653, "time_per_iteration": 2.556647777557373 }, { "auxiliary_loss_clip": 0.01183329, "auxiliary_loss_mlp": 0.01030442, "balance_loss_clip": 1.05700707, "balance_loss_mlp": 1.02064347, "epoch": 0.3191246317561474, "flos": 22053892331520.0, "grad_norm": 2.52207124254031, "language_loss": 0.71587068, "learning_rate": 3.1855053628221763e-06, "loss": 0.73800838, "num_input_tokens_seen": 57172395, "step": 2654, "time_per_iteration": 2.5029003620147705 }, { "auxiliary_loss_clip": 0.01143666, "auxiliary_loss_mlp": 0.01030823, "balance_loss_clip": 1.04982662, "balance_loss_mlp": 1.0214119, "epoch": 0.3192448746467865, "flos": 14901815687040.0, "grad_norm": 2.867701481127341, "language_loss": 0.8983897, "learning_rate": 3.184877901195407e-06, "loss": 0.92013454, "num_input_tokens_seen": 57189090, "step": 2655, "time_per_iteration": 2.5815958976745605 }, { "auxiliary_loss_clip": 0.01074607, "auxiliary_loss_mlp": 0.01010327, "balance_loss_clip": 1.0312084, "balance_loss_mlp": 1.00793076, "epoch": 0.3193651175374256, "flos": 67234832657280.0, "grad_norm": 0.8000441545952708, "language_loss": 0.62858403, "learning_rate": 3.184250259823602e-06, "loss": 0.64943337, "num_input_tokens_seen": 57251620, "step": 2656, "time_per_iteration": 3.179706573486328 }, { "auxiliary_loss_clip": 0.01151327, "auxiliary_loss_mlp": 0.01032392, "balance_loss_clip": 1.05284452, "balance_loss_mlp": 1.02278924, "epoch": 0.3194853604280647, "flos": 12233077977600.0, "grad_norm": 2.10505701184891, "language_loss": 0.81961387, "learning_rate": 3.183622438801974e-06, "loss": 0.84145105, "num_input_tokens_seen": 57266910, "step": 2657, "time_per_iteration": 3.364516496658325 }, { "auxiliary_loss_clip": 0.01198138, "auxiliary_loss_mlp": 0.01037331, "balance_loss_clip": 1.06061089, "balance_loss_mlp": 1.02889752, "epoch": 0.3196056033187038, "flos": 14939917038720.0, "grad_norm": 2.0034519755168665, "language_loss": 0.75626612, "learning_rate": 3.1829944382257637e-06, "loss": 0.77862084, "num_input_tokens_seen": 57285040, "step": 2658, "time_per_iteration": 2.4232571125030518 }, { "auxiliary_loss_clip": 0.01178779, "auxiliary_loss_mlp": 0.01029475, "balance_loss_clip": 1.05815458, "balance_loss_mlp": 1.02107072, "epoch": 0.31972584620934286, "flos": 23768878164480.0, "grad_norm": 2.5705549797136262, "language_loss": 0.81624818, "learning_rate": 3.1823662581902373e-06, "loss": 0.83833075, "num_input_tokens_seen": 57302725, "step": 2659, "time_per_iteration": 2.488865852355957 }, { "auxiliary_loss_clip": 0.0113726, "auxiliary_loss_mlp": 0.01028363, "balance_loss_clip": 1.04705095, "balance_loss_mlp": 1.01924896, "epoch": 0.31984608909998197, "flos": 21251540280960.0, "grad_norm": 2.2266351166514236, "language_loss": 0.74425817, "learning_rate": 3.1817378987906896e-06, "loss": 0.76591432, "num_input_tokens_seen": 57322230, "step": 2660, "time_per_iteration": 2.53401255607605 }, { "auxiliary_loss_clip": 0.0112966, "auxiliary_loss_mlp": 0.01037263, "balance_loss_clip": 1.05116057, "balance_loss_mlp": 1.02820945, "epoch": 0.3199663319906211, "flos": 18296235866880.0, "grad_norm": 1.987371293722694, "language_loss": 0.79672003, "learning_rate": 3.181109360122442e-06, "loss": 0.8183893, "num_input_tokens_seen": 57339820, "step": 2661, "time_per_iteration": 3.4290072917938232 }, { "auxiliary_loss_clip": 0.01146868, "auxiliary_loss_mlp": 0.01029228, "balance_loss_clip": 1.05140936, "balance_loss_mlp": 1.02025127, "epoch": 0.32008657488126013, "flos": 18733627779840.0, "grad_norm": 2.425880746601867, "language_loss": 0.78360391, "learning_rate": 3.1804806422808445e-06, "loss": 0.80536491, "num_input_tokens_seen": 57356955, "step": 2662, "time_per_iteration": 4.198965787887573 }, { "auxiliary_loss_clip": 0.01156041, "auxiliary_loss_mlp": 0.01036098, "balance_loss_clip": 1.05287266, "balance_loss_mlp": 1.02707434, "epoch": 0.32020681777189924, "flos": 20595344670720.0, "grad_norm": 1.7860179000476801, "language_loss": 0.72918212, "learning_rate": 3.1798517453612714e-06, "loss": 0.75110352, "num_input_tokens_seen": 57376760, "step": 2663, "time_per_iteration": 2.5562779903411865 }, { "auxiliary_loss_clip": 0.01179521, "auxiliary_loss_mlp": 0.01034504, "balance_loss_clip": 1.06063056, "balance_loss_mlp": 1.02580774, "epoch": 0.32032706066253835, "flos": 35261692750080.0, "grad_norm": 1.773240893572529, "language_loss": 0.74971962, "learning_rate": 3.1792226694591265e-06, "loss": 0.77185988, "num_input_tokens_seen": 57398145, "step": 2664, "time_per_iteration": 2.658367156982422 }, { "auxiliary_loss_clip": 0.0114839, "auxiliary_loss_mlp": 0.01028116, "balance_loss_clip": 1.0551343, "balance_loss_mlp": 1.01968193, "epoch": 0.3204473035531774, "flos": 15304230731520.0, "grad_norm": 2.03338043129447, "language_loss": 0.80644727, "learning_rate": 3.178593414669841e-06, "loss": 0.82821238, "num_input_tokens_seen": 57416730, "step": 2665, "time_per_iteration": 2.598355293273926 }, { "auxiliary_loss_clip": 0.01183026, "auxiliary_loss_mlp": 0.01026295, "balance_loss_clip": 1.05799913, "balance_loss_mlp": 1.01747918, "epoch": 0.3205675464438165, "flos": 24462564595200.0, "grad_norm": 2.8005865739918296, "language_loss": 0.7041229, "learning_rate": 3.1779639810888707e-06, "loss": 0.72621614, "num_input_tokens_seen": 57436325, "step": 2666, "time_per_iteration": 2.521804094314575 }, { "auxiliary_loss_clip": 0.01179358, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.05889463, "balance_loss_mlp": 1.02268636, "epoch": 0.3206877893344556, "flos": 22456235548800.0, "grad_norm": 1.786391339346807, "language_loss": 0.75545996, "learning_rate": 3.1773343688117013e-06, "loss": 0.77757043, "num_input_tokens_seen": 57457235, "step": 2667, "time_per_iteration": 2.5075602531433105 }, { "auxiliary_loss_clip": 0.01169584, "auxiliary_loss_mlp": 0.00763307, "balance_loss_clip": 1.05522203, "balance_loss_mlp": 1.00067472, "epoch": 0.3208080322250947, "flos": 20412236113920.0, "grad_norm": 4.225036790900256, "language_loss": 0.84068, "learning_rate": 3.1767045779338445e-06, "loss": 0.8600089, "num_input_tokens_seen": 57474895, "step": 2668, "time_per_iteration": 2.5326688289642334 }, { "auxiliary_loss_clip": 0.01176118, "auxiliary_loss_mlp": 0.01026263, "balance_loss_clip": 1.05298924, "balance_loss_mlp": 1.01857424, "epoch": 0.3209282751157338, "flos": 21762118154880.0, "grad_norm": 1.941818888435655, "language_loss": 0.91274035, "learning_rate": 3.176074608550839e-06, "loss": 0.93476427, "num_input_tokens_seen": 57490715, "step": 2669, "time_per_iteration": 2.5570433139801025 }, { "auxiliary_loss_clip": 0.01123878, "auxiliary_loss_mlp": 0.01034385, "balance_loss_clip": 1.05132246, "balance_loss_mlp": 1.02551031, "epoch": 0.32104851800637285, "flos": 22055041566720.0, "grad_norm": 2.381875996007557, "language_loss": 0.82460862, "learning_rate": 3.17544446075825e-06, "loss": 0.84619123, "num_input_tokens_seen": 57509880, "step": 2670, "time_per_iteration": 2.651054859161377 }, { "auxiliary_loss_clip": 0.01168614, "auxiliary_loss_mlp": 0.0103203, "balance_loss_clip": 1.05406141, "balance_loss_mlp": 1.02391744, "epoch": 0.32116876089701196, "flos": 37012301896320.0, "grad_norm": 1.597889973427214, "language_loss": 0.70859063, "learning_rate": 3.174814134651671e-06, "loss": 0.73059708, "num_input_tokens_seen": 57532430, "step": 2671, "time_per_iteration": 2.6753392219543457 }, { "auxiliary_loss_clip": 0.0119022, "auxiliary_loss_mlp": 0.01029587, "balance_loss_clip": 1.05826557, "balance_loss_mlp": 1.0209918, "epoch": 0.3212890037876511, "flos": 21979233912960.0, "grad_norm": 1.9115989746728388, "language_loss": 0.80422068, "learning_rate": 3.1741836303267215e-06, "loss": 0.82641876, "num_input_tokens_seen": 57551965, "step": 2672, "time_per_iteration": 2.563977003097534 }, { "auxiliary_loss_clip": 0.01192452, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.05839014, "balance_loss_mlp": 1.02112865, "epoch": 0.32140924667829013, "flos": 10342345875840.0, "grad_norm": 1.8069210179413981, "language_loss": 0.74914384, "learning_rate": 3.1735529478790496e-06, "loss": 0.77136183, "num_input_tokens_seen": 57569955, "step": 2673, "time_per_iteration": 2.4567229747772217 }, { "auxiliary_loss_clip": 0.01181318, "auxiliary_loss_mlp": 0.01036346, "balance_loss_clip": 1.05734229, "balance_loss_mlp": 1.02689862, "epoch": 0.32152948956892924, "flos": 50798910072960.0, "grad_norm": 2.121830759928693, "language_loss": 0.79461741, "learning_rate": 3.172922087404328e-06, "loss": 0.81679416, "num_input_tokens_seen": 57592215, "step": 2674, "time_per_iteration": 2.7686643600463867 }, { "auxiliary_loss_clip": 0.01091215, "auxiliary_loss_mlp": 0.01002056, "balance_loss_clip": 1.02087653, "balance_loss_mlp": 1.00110841, "epoch": 0.32164973245956835, "flos": 63863250549120.0, "grad_norm": 0.7683724866515861, "language_loss": 0.55282241, "learning_rate": 3.1722910489982586e-06, "loss": 0.57375509, "num_input_tokens_seen": 57652575, "step": 2675, "time_per_iteration": 3.1243398189544678 }, { "auxiliary_loss_clip": 0.01159543, "auxiliary_loss_mlp": 0.01033774, "balance_loss_clip": 1.05268073, "balance_loss_mlp": 1.02466655, "epoch": 0.3217699753502074, "flos": 23513948363520.0, "grad_norm": 1.482625143549675, "language_loss": 0.80013525, "learning_rate": 3.1716598327565694e-06, "loss": 0.82206845, "num_input_tokens_seen": 57672215, "step": 2676, "time_per_iteration": 2.5619418621063232 }, { "auxiliary_loss_clip": 0.01190618, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.05724406, "balance_loss_mlp": 1.02173281, "epoch": 0.3218902182408465, "flos": 19062533640960.0, "grad_norm": 1.5643073483380618, "language_loss": 0.84217495, "learning_rate": 3.171028438775015e-06, "loss": 0.86438751, "num_input_tokens_seen": 57691410, "step": 2677, "time_per_iteration": 2.4965391159057617 }, { "auxiliary_loss_clip": 0.01192187, "auxiliary_loss_mlp": 0.01029001, "balance_loss_clip": 1.05769992, "balance_loss_mlp": 1.02070999, "epoch": 0.3220104611314856, "flos": 20375571306240.0, "grad_norm": 1.858680983702436, "language_loss": 0.84208, "learning_rate": 3.170396867149377e-06, "loss": 0.86429185, "num_input_tokens_seen": 57709415, "step": 2678, "time_per_iteration": 2.4523134231567383 }, { "auxiliary_loss_clip": 0.01130078, "auxiliary_loss_mlp": 0.01036336, "balance_loss_clip": 1.05200458, "balance_loss_mlp": 1.02751482, "epoch": 0.3221307040221247, "flos": 20117014231680.0, "grad_norm": 1.7938712894813198, "language_loss": 0.86117458, "learning_rate": 3.1697651179754653e-06, "loss": 0.88283873, "num_input_tokens_seen": 57728075, "step": 2679, "time_per_iteration": 2.612335205078125 }, { "auxiliary_loss_clip": 0.01152921, "auxiliary_loss_mlp": 0.01031333, "balance_loss_clip": 1.05902421, "balance_loss_mlp": 1.02272058, "epoch": 0.3222509469127638, "flos": 23987789602560.0, "grad_norm": 1.782525146767077, "language_loss": 0.73276806, "learning_rate": 3.1691331913491153e-06, "loss": 0.75461054, "num_input_tokens_seen": 57750645, "step": 2680, "time_per_iteration": 2.628188371658325 }, { "auxiliary_loss_clip": 0.0119234, "auxiliary_loss_mlp": 0.01027515, "balance_loss_clip": 1.05584121, "balance_loss_mlp": 1.01950371, "epoch": 0.32237118980340285, "flos": 17675735397120.0, "grad_norm": 1.990361221118503, "language_loss": 0.84645855, "learning_rate": 3.1685010873661898e-06, "loss": 0.86865705, "num_input_tokens_seen": 57769820, "step": 2681, "time_per_iteration": 2.4477040767669678 }, { "auxiliary_loss_clip": 0.01177423, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.05705523, "balance_loss_mlp": 1.0229274, "epoch": 0.32249143269404196, "flos": 23147982645120.0, "grad_norm": 1.9073689939583216, "language_loss": 0.79787964, "learning_rate": 3.167868806122578e-06, "loss": 0.81997758, "num_input_tokens_seen": 57788870, "step": 2682, "time_per_iteration": 2.5127296447753906 }, { "auxiliary_loss_clip": 0.01170678, "auxiliary_loss_mlp": 0.010305, "balance_loss_clip": 1.0571593, "balance_loss_mlp": 1.02176201, "epoch": 0.32261167558468107, "flos": 24422308427520.0, "grad_norm": 1.7893641358590515, "language_loss": 0.66334766, "learning_rate": 3.1672363477141968e-06, "loss": 0.68535942, "num_input_tokens_seen": 57808165, "step": 2683, "time_per_iteration": 3.403834581375122 }, { "auxiliary_loss_clip": 0.01167707, "auxiliary_loss_mlp": 0.01034313, "balance_loss_clip": 1.05328894, "balance_loss_mlp": 1.02533042, "epoch": 0.3227319184753201, "flos": 30367175852160.0, "grad_norm": 1.7531527269139193, "language_loss": 0.85104823, "learning_rate": 3.1666037122369903e-06, "loss": 0.87306839, "num_input_tokens_seen": 57828825, "step": 2684, "time_per_iteration": 2.5829105377197266 }, { "auxiliary_loss_clip": 0.01177012, "auxiliary_loss_mlp": 0.01028924, "balance_loss_clip": 1.05405569, "balance_loss_mlp": 1.02074575, "epoch": 0.32285216136595923, "flos": 16946174257920.0, "grad_norm": 2.0704149156554, "language_loss": 0.86442816, "learning_rate": 3.165970899786928e-06, "loss": 0.8864876, "num_input_tokens_seen": 57846740, "step": 2685, "time_per_iteration": 2.5022048950195312 }, { "auxiliary_loss_clip": 0.01154311, "auxiliary_loss_mlp": 0.01028351, "balance_loss_clip": 1.05366087, "balance_loss_mlp": 1.02008939, "epoch": 0.32297240425659834, "flos": 21981532383360.0, "grad_norm": 1.6916691925977374, "language_loss": 0.75412464, "learning_rate": 3.1653379104600067e-06, "loss": 0.77595121, "num_input_tokens_seen": 57866885, "step": 2686, "time_per_iteration": 2.5646004676818848 }, { "auxiliary_loss_clip": 0.01176373, "auxiliary_loss_mlp": 0.01029921, "balance_loss_clip": 1.05480671, "balance_loss_mlp": 1.02155828, "epoch": 0.3230926471472374, "flos": 22748045639040.0, "grad_norm": 2.331889449696419, "language_loss": 0.69665074, "learning_rate": 3.164704744352251e-06, "loss": 0.7187137, "num_input_tokens_seen": 57887690, "step": 2687, "time_per_iteration": 2.5375418663024902 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.05488431, "balance_loss_mlp": 1.02422166, "epoch": 0.3232128900378765, "flos": 16942977947520.0, "grad_norm": 2.105842305246496, "language_loss": 0.80590993, "learning_rate": 3.164071401559713e-06, "loss": 0.82799304, "num_input_tokens_seen": 57905090, "step": 2688, "time_per_iteration": 3.3633596897125244 }, { "auxiliary_loss_clip": 0.01164644, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.054299, "balance_loss_mlp": 1.02410626, "epoch": 0.3233331329285156, "flos": 24023736138240.0, "grad_norm": 1.6717343602788888, "language_loss": 0.70919359, "learning_rate": 3.1634378821784674e-06, "loss": 0.73116487, "num_input_tokens_seen": 57925305, "step": 2689, "time_per_iteration": 4.2222888469696045 }, { "auxiliary_loss_clip": 0.01153321, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.05504024, "balance_loss_mlp": 1.02609301, "epoch": 0.3234533758191547, "flos": 18113845582080.0, "grad_norm": 2.4492995973323644, "language_loss": 0.74481857, "learning_rate": 3.1628041863046208e-06, "loss": 0.76669633, "num_input_tokens_seen": 57942720, "step": 2690, "time_per_iteration": 2.558166265487671 }, { "auxiliary_loss_clip": 0.01196038, "auxiliary_loss_mlp": 0.01032378, "balance_loss_clip": 1.05574393, "balance_loss_mlp": 1.02316892, "epoch": 0.3235736187097938, "flos": 16946138344320.0, "grad_norm": 2.21581339529008, "language_loss": 0.90908998, "learning_rate": 3.162170314034304e-06, "loss": 0.93137413, "num_input_tokens_seen": 57960135, "step": 2691, "time_per_iteration": 2.4545295238494873 }, { "auxiliary_loss_clip": 0.01194613, "auxiliary_loss_mlp": 0.01032584, "balance_loss_clip": 1.05597401, "balance_loss_mlp": 1.02337503, "epoch": 0.3236938616004329, "flos": 22127150119680.0, "grad_norm": 1.69881504924168, "language_loss": 0.80864364, "learning_rate": 3.1615362654636738e-06, "loss": 0.83091563, "num_input_tokens_seen": 57980875, "step": 2692, "time_per_iteration": 2.4704015254974365 }, { "auxiliary_loss_clip": 0.01143242, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.05474329, "balance_loss_mlp": 1.02554703, "epoch": 0.32381410449107195, "flos": 17164618819200.0, "grad_norm": 1.6989526200457512, "language_loss": 0.87463701, "learning_rate": 3.1609020406889163e-06, "loss": 0.89640915, "num_input_tokens_seen": 57998310, "step": 2693, "time_per_iteration": 2.5160059928894043 }, { "auxiliary_loss_clip": 0.01167696, "auxiliary_loss_mlp": 0.01033677, "balance_loss_clip": 1.05462456, "balance_loss_mlp": 1.02396131, "epoch": 0.32393434738171106, "flos": 16578125550720.0, "grad_norm": 1.6470579250626114, "language_loss": 0.85137844, "learning_rate": 3.1602676398062416e-06, "loss": 0.87339216, "num_input_tokens_seen": 58017220, "step": 2694, "time_per_iteration": 2.51727294921875 }, { "auxiliary_loss_clip": 0.0117627, "auxiliary_loss_mlp": 0.01028129, "balance_loss_clip": 1.05586851, "balance_loss_mlp": 1.0191462, "epoch": 0.3240545902723502, "flos": 25483612602240.0, "grad_norm": 2.051156024763937, "language_loss": 0.61586684, "learning_rate": 3.1596330629118886e-06, "loss": 0.63791084, "num_input_tokens_seen": 58037190, "step": 2695, "time_per_iteration": 2.571971893310547 }, { "auxiliary_loss_clip": 0.01131357, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.0513432, "balance_loss_mlp": 1.02285552, "epoch": 0.32417483316298923, "flos": 35845851634560.0, "grad_norm": 1.9871282258802616, "language_loss": 0.73243213, "learning_rate": 3.1589983101021223e-06, "loss": 0.75406164, "num_input_tokens_seen": 58055820, "step": 2696, "time_per_iteration": 2.725529909133911 }, { "auxiliary_loss_clip": 0.01165154, "auxiliary_loss_mlp": 0.01028393, "balance_loss_clip": 1.05454111, "balance_loss_mlp": 1.01994681, "epoch": 0.32429507605362834, "flos": 30080501406720.0, "grad_norm": 2.3291311033144093, "language_loss": 0.85074127, "learning_rate": 3.1583633814732337e-06, "loss": 0.87267673, "num_input_tokens_seen": 58075340, "step": 2697, "time_per_iteration": 2.5923664569854736 }, { "auxiliary_loss_clip": 0.01190134, "auxiliary_loss_mlp": 0.01035893, "balance_loss_clip": 1.05493593, "balance_loss_mlp": 1.02710152, "epoch": 0.3244153189442674, "flos": 18223265387520.0, "grad_norm": 2.7287229313480172, "language_loss": 0.71703166, "learning_rate": 3.157728277121541e-06, "loss": 0.73929197, "num_input_tokens_seen": 58093515, "step": 2698, "time_per_iteration": 2.4554076194763184 }, { "auxiliary_loss_clip": 0.01190914, "auxiliary_loss_mlp": 0.01028687, "balance_loss_clip": 1.05290318, "balance_loss_mlp": 1.01988959, "epoch": 0.3245355618349065, "flos": 17710317216000.0, "grad_norm": 2.557176730795796, "language_loss": 0.78560084, "learning_rate": 3.1570929971433897e-06, "loss": 0.80779684, "num_input_tokens_seen": 58109300, "step": 2699, "time_per_iteration": 2.4922051429748535 }, { "auxiliary_loss_clip": 0.01178222, "auxiliary_loss_mlp": 0.01034697, "balance_loss_clip": 1.05738342, "balance_loss_mlp": 1.02638793, "epoch": 0.3246558047255456, "flos": 23440798316160.0, "grad_norm": 2.3486320779730523, "language_loss": 0.83945686, "learning_rate": 3.1564575416351504e-06, "loss": 0.86158609, "num_input_tokens_seen": 58128000, "step": 2700, "time_per_iteration": 2.5409977436065674 }, { "auxiliary_loss_clip": 0.01192635, "auxiliary_loss_mlp": 0.01028964, "balance_loss_clip": 1.05661488, "balance_loss_mlp": 1.01995742, "epoch": 0.32477604761618467, "flos": 21760861178880.0, "grad_norm": 1.8382055656498018, "language_loss": 0.74573141, "learning_rate": 3.155821910693221e-06, "loss": 0.76794744, "num_input_tokens_seen": 58147415, "step": 2701, "time_per_iteration": 2.4598922729492188 }, { "auxiliary_loss_clip": 0.01161604, "auxiliary_loss_mlp": 0.01030281, "balance_loss_clip": 1.05148828, "balance_loss_mlp": 1.02149582, "epoch": 0.3248962905068238, "flos": 19828328624640.0, "grad_norm": 1.77561343765119, "language_loss": 0.85860306, "learning_rate": 3.1551861044140275e-06, "loss": 0.88052189, "num_input_tokens_seen": 58167050, "step": 2702, "time_per_iteration": 2.545163869857788 }, { "auxiliary_loss_clip": 0.01130465, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.0518229, "balance_loss_mlp": 1.0226537, "epoch": 0.3250165333974629, "flos": 23948215793280.0, "grad_norm": 2.518944028727769, "language_loss": 0.77937633, "learning_rate": 3.15455012289402e-06, "loss": 0.80099034, "num_input_tokens_seen": 58186695, "step": 2703, "time_per_iteration": 2.6312096118927 }, { "auxiliary_loss_clip": 0.01179474, "auxiliary_loss_mlp": 0.01029946, "balance_loss_clip": 1.05674911, "balance_loss_mlp": 1.02092755, "epoch": 0.32513677628810195, "flos": 23989333887360.0, "grad_norm": 1.8105436482824648, "language_loss": 0.84446788, "learning_rate": 3.153913966229677e-06, "loss": 0.86656213, "num_input_tokens_seen": 58205815, "step": 2704, "time_per_iteration": 2.520097255706787 }, { "auxiliary_loss_clip": 0.01082358, "auxiliary_loss_mlp": 0.01004924, "balance_loss_clip": 1.02118278, "balance_loss_mlp": 1.00370789, "epoch": 0.32525701917874106, "flos": 70655790009600.0, "grad_norm": 0.6400157666945211, "language_loss": 0.50297797, "learning_rate": 3.1532776345175027e-06, "loss": 0.5238508, "num_input_tokens_seen": 58270960, "step": 2705, "time_per_iteration": 3.094642162322998 }, { "auxiliary_loss_clip": 0.01190474, "auxiliary_loss_mlp": 0.01034261, "balance_loss_clip": 1.05572164, "balance_loss_mlp": 1.02524841, "epoch": 0.32537726206938017, "flos": 19682639061120.0, "grad_norm": 1.9398741628678617, "language_loss": 0.78877556, "learning_rate": 3.1526411278540285e-06, "loss": 0.81102294, "num_input_tokens_seen": 58289390, "step": 2706, "time_per_iteration": 2.4604310989379883 }, { "auxiliary_loss_clip": 0.01170815, "auxiliary_loss_mlp": 0.01033581, "balance_loss_clip": 1.05296147, "balance_loss_mlp": 1.02412748, "epoch": 0.3254975049600192, "flos": 28760999293440.0, "grad_norm": 2.6649028613281573, "language_loss": 0.8103748, "learning_rate": 3.1520044463358116e-06, "loss": 0.83241868, "num_input_tokens_seen": 58306120, "step": 2707, "time_per_iteration": 2.578207015991211 }, { "auxiliary_loss_clip": 0.01177063, "auxiliary_loss_mlp": 0.01028943, "balance_loss_clip": 1.05536747, "balance_loss_mlp": 1.01990354, "epoch": 0.32561774785065833, "flos": 18877378008960.0, "grad_norm": 1.5542523578386402, "language_loss": 0.80113983, "learning_rate": 3.151367590059436e-06, "loss": 0.82319987, "num_input_tokens_seen": 58324545, "step": 2708, "time_per_iteration": 2.4830994606018066 }, { "auxiliary_loss_clip": 0.01193839, "auxiliary_loss_mlp": 0.00763984, "balance_loss_clip": 1.05644178, "balance_loss_mlp": 1.00061584, "epoch": 0.32573799074129745, "flos": 23112107936640.0, "grad_norm": 1.9158896819042956, "language_loss": 0.86814183, "learning_rate": 3.1507305591215117e-06, "loss": 0.88772005, "num_input_tokens_seen": 58342455, "step": 2709, "time_per_iteration": 2.5095303058624268 }, { "auxiliary_loss_clip": 0.01081599, "auxiliary_loss_mlp": 0.01000387, "balance_loss_clip": 1.02072513, "balance_loss_mlp": 0.99936146, "epoch": 0.3258582336319365, "flos": 71237650423680.0, "grad_norm": 0.6710497388181448, "language_loss": 0.55728269, "learning_rate": 3.150093353618677e-06, "loss": 0.57810253, "num_input_tokens_seen": 58407185, "step": 2710, "time_per_iteration": 3.9900460243225098 }, { "auxiliary_loss_clip": 0.01179995, "auxiliary_loss_mlp": 0.01028208, "balance_loss_clip": 1.05258071, "balance_loss_mlp": 1.01936841, "epoch": 0.3259784765225756, "flos": 22456020067200.0, "grad_norm": 2.3641429077897764, "language_loss": 0.88119578, "learning_rate": 3.149455973647596e-06, "loss": 0.90327775, "num_input_tokens_seen": 58425245, "step": 2711, "time_per_iteration": 2.496067762374878 }, { "auxiliary_loss_clip": 0.01139049, "auxiliary_loss_mlp": 0.01028339, "balance_loss_clip": 1.04591322, "balance_loss_mlp": 1.01880252, "epoch": 0.32609871941321467, "flos": 20484811543680.0, "grad_norm": 1.7644762406713717, "language_loss": 0.77162302, "learning_rate": 3.1488184193049563e-06, "loss": 0.79329693, "num_input_tokens_seen": 58444780, "step": 2712, "time_per_iteration": 2.5653631687164307 }, { "auxiliary_loss_clip": 0.0119313, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.05860734, "balance_loss_mlp": 1.02093983, "epoch": 0.3262189623038538, "flos": 22416805393920.0, "grad_norm": 1.5234617118566582, "language_loss": 0.72071731, "learning_rate": 3.1481806906874767e-06, "loss": 0.74294233, "num_input_tokens_seen": 58466090, "step": 2713, "time_per_iteration": 2.537545680999756 }, { "auxiliary_loss_clip": 0.01191152, "auxiliary_loss_mlp": 0.01029131, "balance_loss_clip": 1.05639124, "balance_loss_mlp": 1.02097738, "epoch": 0.3263392051944929, "flos": 20923496346240.0, "grad_norm": 1.5222323815950405, "language_loss": 0.87659419, "learning_rate": 3.147542787891899e-06, "loss": 0.89879698, "num_input_tokens_seen": 58485435, "step": 2714, "time_per_iteration": 2.4770402908325195 }, { "auxiliary_loss_clip": 0.01165721, "auxiliary_loss_mlp": 0.01034286, "balance_loss_clip": 1.05646634, "balance_loss_mlp": 1.02497625, "epoch": 0.32645944808513194, "flos": 24025172682240.0, "grad_norm": 2.226836444538743, "language_loss": 0.75280476, "learning_rate": 3.1469047110149926e-06, "loss": 0.77480483, "num_input_tokens_seen": 58504175, "step": 2715, "time_per_iteration": 3.446967363357544 }, { "auxiliary_loss_clip": 0.01131137, "auxiliary_loss_mlp": 0.01028571, "balance_loss_clip": 1.05160069, "balance_loss_mlp": 1.02020264, "epoch": 0.32657969097577105, "flos": 21032413361280.0, "grad_norm": 1.9436120964076988, "language_loss": 0.85275984, "learning_rate": 3.146266460153554e-06, "loss": 0.87435687, "num_input_tokens_seen": 58523885, "step": 2716, "time_per_iteration": 3.363856077194214 }, { "auxiliary_loss_clip": 0.01162522, "auxiliary_loss_mlp": 0.00763834, "balance_loss_clip": 1.05383217, "balance_loss_mlp": 1.00063705, "epoch": 0.32669993386641016, "flos": 22710267509760.0, "grad_norm": 1.8365836094478587, "language_loss": 0.80179358, "learning_rate": 3.145628035404404e-06, "loss": 0.82105708, "num_input_tokens_seen": 58543085, "step": 2717, "time_per_iteration": 2.5573678016662598 }, { "auxiliary_loss_clip": 0.01079784, "auxiliary_loss_mlp": 0.01004353, "balance_loss_clip": 1.02022147, "balance_loss_mlp": 1.00322008, "epoch": 0.3268201767570492, "flos": 72105718406400.0, "grad_norm": 0.8800989363533821, "language_loss": 0.57526404, "learning_rate": 3.1449894368643922e-06, "loss": 0.5961054, "num_input_tokens_seen": 58605400, "step": 2718, "time_per_iteration": 3.1740405559539795 }, { "auxiliary_loss_clip": 0.01149127, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.05382383, "balance_loss_mlp": 1.02353776, "epoch": 0.32694041964768833, "flos": 24535175938560.0, "grad_norm": 1.4842212095007867, "language_loss": 0.71396226, "learning_rate": 3.1443506646303934e-06, "loss": 0.73577595, "num_input_tokens_seen": 58626700, "step": 2719, "time_per_iteration": 2.5880401134490967 }, { "auxiliary_loss_clip": 0.01181125, "auxiliary_loss_mlp": 0.0102859, "balance_loss_clip": 1.05421257, "balance_loss_mlp": 1.01957762, "epoch": 0.32706066253832744, "flos": 33183003755520.0, "grad_norm": 3.096262231864864, "language_loss": 0.66281915, "learning_rate": 3.1437117187993086e-06, "loss": 0.68491626, "num_input_tokens_seen": 58649020, "step": 2720, "time_per_iteration": 2.5920658111572266 }, { "auxiliary_loss_clip": 0.01142924, "auxiliary_loss_mlp": 0.01036316, "balance_loss_clip": 1.04939818, "balance_loss_mlp": 1.0276618, "epoch": 0.3271809054289665, "flos": 24061622008320.0, "grad_norm": 1.6352889588221644, "language_loss": 0.79710084, "learning_rate": 3.143072599468065e-06, "loss": 0.81889319, "num_input_tokens_seen": 58668845, "step": 2721, "time_per_iteration": 2.5829741954803467 }, { "auxiliary_loss_clip": 0.01162481, "auxiliary_loss_mlp": 0.01026562, "balance_loss_clip": 1.05459476, "balance_loss_mlp": 1.01813388, "epoch": 0.3273011483196056, "flos": 38253769712640.0, "grad_norm": 1.5041987902718992, "language_loss": 0.75658202, "learning_rate": 3.1424333067336174e-06, "loss": 0.77847242, "num_input_tokens_seen": 58691610, "step": 2722, "time_per_iteration": 2.6655757427215576 }, { "auxiliary_loss_clip": 0.01182258, "auxiliary_loss_mlp": 0.01034431, "balance_loss_clip": 1.05412769, "balance_loss_mlp": 1.02504921, "epoch": 0.3274213912102447, "flos": 29054389582080.0, "grad_norm": 1.7790416177480768, "language_loss": 0.78194147, "learning_rate": 3.141793840692945e-06, "loss": 0.80410838, "num_input_tokens_seen": 58712360, "step": 2723, "time_per_iteration": 2.5465664863586426 }, { "auxiliary_loss_clip": 0.01155013, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.05115497, "balance_loss_mlp": 1.02207315, "epoch": 0.32754163410088377, "flos": 29133249891840.0, "grad_norm": 2.141415948292631, "language_loss": 0.61650538, "learning_rate": 3.1411542014430553e-06, "loss": 0.63837183, "num_input_tokens_seen": 58733440, "step": 2724, "time_per_iteration": 2.591707706451416 }, { "auxiliary_loss_clip": 0.01145698, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.04682374, "balance_loss_mlp": 1.02284694, "epoch": 0.3276618769915229, "flos": 20631075724800.0, "grad_norm": 1.6898622287564375, "language_loss": 0.8172431, "learning_rate": 3.1405143890809804e-06, "loss": 0.83901304, "num_input_tokens_seen": 58752735, "step": 2725, "time_per_iteration": 2.615417003631592 }, { "auxiliary_loss_clip": 0.01161647, "auxiliary_loss_mlp": 0.01027014, "balance_loss_clip": 1.05358028, "balance_loss_mlp": 1.01898193, "epoch": 0.327782119882162, "flos": 18657425076480.0, "grad_norm": 1.7748133468683849, "language_loss": 0.69959855, "learning_rate": 3.1398744037037796e-06, "loss": 0.72148514, "num_input_tokens_seen": 58772070, "step": 2726, "time_per_iteration": 2.5282788276672363 }, { "auxiliary_loss_clip": 0.01162873, "auxiliary_loss_mlp": 0.01029437, "balance_loss_clip": 1.05430639, "balance_loss_mlp": 1.02123523, "epoch": 0.32790236277280105, "flos": 21795802133760.0, "grad_norm": 3.5058638702400886, "language_loss": 0.84428805, "learning_rate": 3.139234245408538e-06, "loss": 0.86621118, "num_input_tokens_seen": 58790950, "step": 2727, "time_per_iteration": 2.5232949256896973 }, { "auxiliary_loss_clip": 0.01150785, "auxiliary_loss_mlp": 0.00763251, "balance_loss_clip": 1.0541718, "balance_loss_mlp": 1.0006001, "epoch": 0.32802260566344016, "flos": 23331414424320.0, "grad_norm": 1.4027983021834594, "language_loss": 0.76069951, "learning_rate": 3.1385939142923666e-06, "loss": 0.77983987, "num_input_tokens_seen": 58813340, "step": 2728, "time_per_iteration": 2.567537784576416 }, { "auxiliary_loss_clip": 0.01164984, "auxiliary_loss_mlp": 0.01032246, "balance_loss_clip": 1.05145645, "balance_loss_mlp": 1.02325797, "epoch": 0.3281428485540792, "flos": 24206988349440.0, "grad_norm": 1.961203247980162, "language_loss": 0.78408444, "learning_rate": 3.137953410452405e-06, "loss": 0.80605674, "num_input_tokens_seen": 58833610, "step": 2729, "time_per_iteration": 2.545344829559326 }, { "auxiliary_loss_clip": 0.01158294, "auxiliary_loss_mlp": 0.01033674, "balance_loss_clip": 1.0496335, "balance_loss_mlp": 1.02546036, "epoch": 0.3282630914447183, "flos": 34128962380800.0, "grad_norm": 1.6421596659617923, "language_loss": 0.74445462, "learning_rate": 3.1373127339858146e-06, "loss": 0.76637435, "num_input_tokens_seen": 58856210, "step": 2730, "time_per_iteration": 2.644753932952881 }, { "auxiliary_loss_clip": 0.01142428, "auxiliary_loss_mlp": 0.01027631, "balance_loss_clip": 1.04760504, "balance_loss_mlp": 1.02037692, "epoch": 0.32838333433535744, "flos": 27600726170880.0, "grad_norm": 1.8103094504922241, "language_loss": 0.74423331, "learning_rate": 3.136671884989787e-06, "loss": 0.76593387, "num_input_tokens_seen": 58876120, "step": 2731, "time_per_iteration": 2.620925188064575 }, { "auxiliary_loss_clip": 0.0112396, "auxiliary_loss_mlp": 0.01029609, "balance_loss_clip": 1.04842675, "balance_loss_mlp": 1.02072811, "epoch": 0.3285035772259965, "flos": 12349500935040.0, "grad_norm": 2.1177205652376565, "language_loss": 0.87565303, "learning_rate": 3.1360308635615383e-06, "loss": 0.89718866, "num_input_tokens_seen": 58894660, "step": 2732, "time_per_iteration": 2.6295647621154785 }, { "auxiliary_loss_clip": 0.0116973, "auxiliary_loss_mlp": 0.0102941, "balance_loss_clip": 1.05275822, "balance_loss_mlp": 1.01976609, "epoch": 0.3286238201166356, "flos": 24316084932480.0, "grad_norm": 1.9301541430485614, "language_loss": 0.78520918, "learning_rate": 3.135389669798311e-06, "loss": 0.80720055, "num_input_tokens_seen": 58912720, "step": 2733, "time_per_iteration": 2.5923287868499756 }, { "auxiliary_loss_clip": 0.0117521, "auxiliary_loss_mlp": 0.00763201, "balance_loss_clip": 1.05306721, "balance_loss_mlp": 1.00057244, "epoch": 0.3287440630072747, "flos": 21392812471680.0, "grad_norm": 1.7195785694770358, "language_loss": 0.80045044, "learning_rate": 3.134748303797373e-06, "loss": 0.81983453, "num_input_tokens_seen": 58930090, "step": 2734, "time_per_iteration": 2.5224335193634033 }, { "auxiliary_loss_clip": 0.01136794, "auxiliary_loss_mlp": 0.01032068, "balance_loss_clip": 1.04885435, "balance_loss_mlp": 1.02257895, "epoch": 0.32886430589791377, "flos": 23732536579200.0, "grad_norm": 1.789089162321881, "language_loss": 0.81015676, "learning_rate": 3.1341067656560203e-06, "loss": 0.8318454, "num_input_tokens_seen": 58947935, "step": 2735, "time_per_iteration": 2.609121084213257 }, { "auxiliary_loss_clip": 0.01173713, "auxiliary_loss_mlp": 0.01030072, "balance_loss_clip": 1.05412519, "balance_loss_mlp": 1.02098203, "epoch": 0.3289845487885529, "flos": 22418708814720.0, "grad_norm": 2.005109041925455, "language_loss": 0.86447185, "learning_rate": 3.133465055471572e-06, "loss": 0.88650972, "num_input_tokens_seen": 58967720, "step": 2736, "time_per_iteration": 3.4520626068115234 }, { "auxiliary_loss_clip": 0.01143588, "auxiliary_loss_mlp": 0.01028348, "balance_loss_clip": 1.04840648, "balance_loss_mlp": 1.01983643, "epoch": 0.329104791679192, "flos": 19682603147520.0, "grad_norm": 3.9249019448475315, "language_loss": 0.66175836, "learning_rate": 3.1328231733413767e-06, "loss": 0.68347776, "num_input_tokens_seen": 58984360, "step": 2737, "time_per_iteration": 2.5393660068511963 }, { "auxiliary_loss_clip": 0.0117341, "auxiliary_loss_mlp": 0.01035398, "balance_loss_clip": 1.0530076, "balance_loss_mlp": 1.02602851, "epoch": 0.32922503456983104, "flos": 15997234803840.0, "grad_norm": 2.517977310543383, "language_loss": 0.91078639, "learning_rate": 3.1321811193628067e-06, "loss": 0.93287444, "num_input_tokens_seen": 59002505, "step": 2738, "time_per_iteration": 2.5558552742004395 }, { "auxiliary_loss_clip": 0.01180166, "auxiliary_loss_mlp": 0.00764131, "balance_loss_clip": 1.0562675, "balance_loss_mlp": 1.00049877, "epoch": 0.32934527746047015, "flos": 26834069260800.0, "grad_norm": 1.692209889409344, "language_loss": 0.69912708, "learning_rate": 3.131538893633261e-06, "loss": 0.71856999, "num_input_tokens_seen": 59022065, "step": 2739, "time_per_iteration": 2.58483624458313 }, { "auxiliary_loss_clip": 0.01193664, "auxiliary_loss_mlp": 0.01031653, "balance_loss_clip": 1.05687737, "balance_loss_mlp": 1.02320099, "epoch": 0.32946552035110926, "flos": 23403774372480.0, "grad_norm": 2.0316138901416703, "language_loss": 0.78344131, "learning_rate": 3.130896496250165e-06, "loss": 0.80569446, "num_input_tokens_seen": 59041890, "step": 2740, "time_per_iteration": 2.502199411392212 }, { "auxiliary_loss_clip": 0.01193416, "auxiliary_loss_mlp": 0.01030122, "balance_loss_clip": 1.05590367, "balance_loss_mlp": 1.02155089, "epoch": 0.3295857632417483, "flos": 14172470029440.0, "grad_norm": 2.057239755727636, "language_loss": 0.86492032, "learning_rate": 3.1302539273109693e-06, "loss": 0.88715571, "num_input_tokens_seen": 59058715, "step": 2741, "time_per_iteration": 4.060149669647217 }, { "auxiliary_loss_clip": 0.01158753, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 1.05486703, "balance_loss_mlp": 1.02543414, "epoch": 0.32970600613238743, "flos": 22196708807040.0, "grad_norm": 1.6136790967051624, "language_loss": 0.803855, "learning_rate": 3.1296111869131513e-06, "loss": 0.82579249, "num_input_tokens_seen": 59076140, "step": 2742, "time_per_iteration": 3.3554325103759766 }, { "auxiliary_loss_clip": 0.01192164, "auxiliary_loss_mlp": 0.01030759, "balance_loss_clip": 1.05575359, "balance_loss_mlp": 1.02224731, "epoch": 0.32982624902302654, "flos": 22053784590720.0, "grad_norm": 1.8611195951837967, "language_loss": 0.85763323, "learning_rate": 3.1289682751542153e-06, "loss": 0.87986243, "num_input_tokens_seen": 59095700, "step": 2743, "time_per_iteration": 2.464463233947754 }, { "auxiliary_loss_clip": 0.01176162, "auxiliary_loss_mlp": 0.0102958, "balance_loss_clip": 1.05371094, "balance_loss_mlp": 1.020872, "epoch": 0.3299464919136656, "flos": 18661626967680.0, "grad_norm": 1.9409555574641155, "language_loss": 0.71338344, "learning_rate": 3.1283251921316883e-06, "loss": 0.73544085, "num_input_tokens_seen": 59113445, "step": 2744, "time_per_iteration": 2.4867451190948486 }, { "auxiliary_loss_clip": 0.01134698, "auxiliary_loss_mlp": 0.01034318, "balance_loss_clip": 1.05118132, "balance_loss_mlp": 1.0244832, "epoch": 0.3300667348043047, "flos": 13407357404160.0, "grad_norm": 2.913798876066495, "language_loss": 0.81023258, "learning_rate": 3.1276819379431277e-06, "loss": 0.83192277, "num_input_tokens_seen": 59131535, "step": 2745, "time_per_iteration": 2.5624399185180664 }, { "auxiliary_loss_clip": 0.01173215, "auxiliary_loss_mlp": 0.00764189, "balance_loss_clip": 1.05453289, "balance_loss_mlp": 1.00064063, "epoch": 0.33018697769494376, "flos": 15742556398080.0, "grad_norm": 2.0936198590434225, "language_loss": 0.75473702, "learning_rate": 3.1270385126861134e-06, "loss": 0.77411103, "num_input_tokens_seen": 59149520, "step": 2746, "time_per_iteration": 2.4873788356781006 }, { "auxiliary_loss_clip": 0.01195702, "auxiliary_loss_mlp": 0.01034259, "balance_loss_clip": 1.0569973, "balance_loss_mlp": 1.02432871, "epoch": 0.3303072205855829, "flos": 18258601392000.0, "grad_norm": 1.8831606611305816, "language_loss": 0.82025903, "learning_rate": 3.1263949164582533e-06, "loss": 0.84255868, "num_input_tokens_seen": 59169170, "step": 2747, "time_per_iteration": 2.4668898582458496 }, { "auxiliary_loss_clip": 0.01192521, "auxiliary_loss_mlp": 0.01031519, "balance_loss_clip": 1.05309093, "balance_loss_mlp": 1.02266765, "epoch": 0.330427463476222, "flos": 17749424148480.0, "grad_norm": 4.539365216926317, "language_loss": 0.78308821, "learning_rate": 3.1257511493571797e-06, "loss": 0.80532861, "num_input_tokens_seen": 59187675, "step": 2748, "time_per_iteration": 2.4170444011688232 }, { "auxiliary_loss_clip": 0.01150153, "auxiliary_loss_mlp": 0.01031558, "balance_loss_clip": 1.05056405, "balance_loss_mlp": 1.02291489, "epoch": 0.33054770636686104, "flos": 27162580072320.0, "grad_norm": 1.9490683824747084, "language_loss": 0.78467047, "learning_rate": 3.125107211480552e-06, "loss": 0.80648762, "num_input_tokens_seen": 59207610, "step": 2749, "time_per_iteration": 2.579108476638794 }, { "auxiliary_loss_clip": 0.01119104, "auxiliary_loss_mlp": 0.01037455, "balance_loss_clip": 1.04829955, "balance_loss_mlp": 1.02853203, "epoch": 0.33066794925750015, "flos": 20117193799680.0, "grad_norm": 1.7075292249442584, "language_loss": 0.79815769, "learning_rate": 3.124463102926054e-06, "loss": 0.81972325, "num_input_tokens_seen": 59226945, "step": 2750, "time_per_iteration": 2.5984702110290527 }, { "auxiliary_loss_clip": 0.0107862, "auxiliary_loss_mlp": 0.01002748, "balance_loss_clip": 1.02483416, "balance_loss_mlp": 1.00163329, "epoch": 0.33078819214813926, "flos": 70642609718400.0, "grad_norm": 0.7729465978551084, "language_loss": 0.6163817, "learning_rate": 3.1238188237913984e-06, "loss": 0.63719541, "num_input_tokens_seen": 59291485, "step": 2751, "time_per_iteration": 3.1397552490234375 }, { "auxiliary_loss_clip": 0.01199678, "auxiliary_loss_mlp": 0.01034006, "balance_loss_clip": 1.05940342, "balance_loss_mlp": 1.02455831, "epoch": 0.3309084350387783, "flos": 21141940907520.0, "grad_norm": 2.5169710047634193, "language_loss": 0.76210451, "learning_rate": 3.1231743741743202e-06, "loss": 0.78444135, "num_input_tokens_seen": 59310990, "step": 2752, "time_per_iteration": 2.4696764945983887 }, { "auxiliary_loss_clip": 0.01172715, "auxiliary_loss_mlp": 0.01033895, "balance_loss_clip": 1.05092049, "balance_loss_mlp": 1.02516294, "epoch": 0.3310286779294174, "flos": 14209350318720.0, "grad_norm": 2.4352076894350305, "language_loss": 0.83444971, "learning_rate": 3.122529754172582e-06, "loss": 0.85651577, "num_input_tokens_seen": 59327875, "step": 2753, "time_per_iteration": 2.48481822013855 }, { "auxiliary_loss_clip": 0.0118076, "auxiliary_loss_mlp": 0.01035805, "balance_loss_clip": 1.05689394, "balance_loss_mlp": 1.02652478, "epoch": 0.33114892082005654, "flos": 20778130005120.0, "grad_norm": 1.9102499010941272, "language_loss": 0.73010021, "learning_rate": 3.1218849638839736e-06, "loss": 0.75226587, "num_input_tokens_seen": 59347135, "step": 2754, "time_per_iteration": 2.5567264556884766 }, { "auxiliary_loss_clip": 0.01137548, "auxiliary_loss_mlp": 0.01036108, "balance_loss_clip": 1.04534566, "balance_loss_mlp": 1.0262078, "epoch": 0.3312691637106956, "flos": 17090750499840.0, "grad_norm": 1.6547585653169383, "language_loss": 0.785092, "learning_rate": 3.121240003406307e-06, "loss": 0.80682856, "num_input_tokens_seen": 59365985, "step": 2755, "time_per_iteration": 2.542545795440674 }, { "auxiliary_loss_clip": 0.01153884, "auxiliary_loss_mlp": 0.01031601, "balance_loss_clip": 1.05309296, "balance_loss_mlp": 1.02196908, "epoch": 0.3313894066013347, "flos": 29456230008960.0, "grad_norm": 2.5690401948586534, "language_loss": 0.72341871, "learning_rate": 3.120594872837425e-06, "loss": 0.74527359, "num_input_tokens_seen": 59384655, "step": 2756, "time_per_iteration": 2.600107192993164 }, { "auxiliary_loss_clip": 0.01078267, "auxiliary_loss_mlp": 0.00754039, "balance_loss_clip": 1.02038193, "balance_loss_mlp": 1.00154066, "epoch": 0.3315096494919738, "flos": 61419242280960.0, "grad_norm": 0.8304241367334306, "language_loss": 0.62385881, "learning_rate": 3.1199495722751906e-06, "loss": 0.64218187, "num_input_tokens_seen": 59444185, "step": 2757, "time_per_iteration": 3.1084578037261963 }, { "auxiliary_loss_clip": 0.0113728, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.04860616, "balance_loss_mlp": 1.02584529, "epoch": 0.33162989238261287, "flos": 21653057485440.0, "grad_norm": 1.5867036401819143, "language_loss": 0.84134471, "learning_rate": 3.1193041018174972e-06, "loss": 0.86307156, "num_input_tokens_seen": 59464900, "step": 2758, "time_per_iteration": 2.5904479026794434 }, { "auxiliary_loss_clip": 0.01182796, "auxiliary_loss_mlp": 0.01028725, "balance_loss_clip": 1.05649698, "balance_loss_mlp": 1.01961744, "epoch": 0.331750135273252, "flos": 22674787850880.0, "grad_norm": 2.518464440665844, "language_loss": 0.94788033, "learning_rate": 3.118658461562261e-06, "loss": 0.96999556, "num_input_tokens_seen": 59481000, "step": 2759, "time_per_iteration": 2.5496444702148438 }, { "auxiliary_loss_clip": 0.0116562, "auxiliary_loss_mlp": 0.01035404, "balance_loss_clip": 1.05583405, "balance_loss_mlp": 1.02552152, "epoch": 0.33187037816389103, "flos": 22746896403840.0, "grad_norm": 1.6533177678466706, "language_loss": 0.8507998, "learning_rate": 3.118012651607426e-06, "loss": 0.87281001, "num_input_tokens_seen": 59502605, "step": 2760, "time_per_iteration": 2.5263097286224365 }, { "auxiliary_loss_clip": 0.01193611, "auxiliary_loss_mlp": 0.01037843, "balance_loss_clip": 1.05622172, "balance_loss_mlp": 1.02765107, "epoch": 0.33199062105453014, "flos": 19203769918080.0, "grad_norm": 2.4254441292340485, "language_loss": 0.83478636, "learning_rate": 3.1173666720509603e-06, "loss": 0.85710084, "num_input_tokens_seen": 59519540, "step": 2761, "time_per_iteration": 2.4219202995300293 }, { "auxiliary_loss_clip": 0.01168581, "auxiliary_loss_mlp": 0.01030734, "balance_loss_clip": 1.05312145, "balance_loss_mlp": 1.02172756, "epoch": 0.33211086394516925, "flos": 31577006764800.0, "grad_norm": 1.6923913410383216, "language_loss": 0.68148553, "learning_rate": 3.116720522990859e-06, "loss": 0.70347857, "num_input_tokens_seen": 59540415, "step": 2762, "time_per_iteration": 2.5745129585266113 }, { "auxiliary_loss_clip": 0.01125152, "auxiliary_loss_mlp": 0.01033436, "balance_loss_clip": 1.05001163, "balance_loss_mlp": 1.02435267, "epoch": 0.3322311068358083, "flos": 17932496791680.0, "grad_norm": 1.9306088334356015, "language_loss": 0.61995107, "learning_rate": 3.116074204525142e-06, "loss": 0.64153695, "num_input_tokens_seen": 59558590, "step": 2763, "time_per_iteration": 3.3974854946136475 }, { "auxiliary_loss_clip": 0.0117086, "auxiliary_loss_mlp": 0.01029229, "balance_loss_clip": 1.05436826, "balance_loss_mlp": 1.02089024, "epoch": 0.3323513497264474, "flos": 32269831269120.0, "grad_norm": 1.4331940432253774, "language_loss": 0.83622736, "learning_rate": 3.1154277167518553e-06, "loss": 0.85822821, "num_input_tokens_seen": 59580205, "step": 2764, "time_per_iteration": 2.5662119388580322 }, { "auxiliary_loss_clip": 0.01062631, "auxiliary_loss_mlp": 0.01001813, "balance_loss_clip": 1.01838255, "balance_loss_mlp": 1.00066233, "epoch": 0.33247159261708653, "flos": 52668674588160.0, "grad_norm": 0.7804633477763853, "language_loss": 0.59487081, "learning_rate": 3.114781059769072e-06, "loss": 0.61551523, "num_input_tokens_seen": 59631530, "step": 2765, "time_per_iteration": 2.976964235305786 }, { "auxiliary_loss_clip": 0.01162952, "auxiliary_loss_mlp": 0.01030335, "balance_loss_clip": 1.05310428, "balance_loss_mlp": 1.02079248, "epoch": 0.3325918355077256, "flos": 27125232906240.0, "grad_norm": 2.743449929994885, "language_loss": 0.67059427, "learning_rate": 3.1141342336748874e-06, "loss": 0.69252712, "num_input_tokens_seen": 59651090, "step": 2766, "time_per_iteration": 2.6151795387268066 }, { "auxiliary_loss_clip": 0.01177667, "auxiliary_loss_mlp": 0.01034036, "balance_loss_clip": 1.05664277, "balance_loss_mlp": 1.02500534, "epoch": 0.3327120783983647, "flos": 23664414435840.0, "grad_norm": 1.4137262903912304, "language_loss": 0.82222188, "learning_rate": 3.1134872385674253e-06, "loss": 0.84433889, "num_input_tokens_seen": 59675245, "step": 2767, "time_per_iteration": 2.5766637325286865 }, { "auxiliary_loss_clip": 0.01167436, "auxiliary_loss_mlp": 0.01034197, "balance_loss_clip": 1.05121827, "balance_loss_mlp": 1.02471995, "epoch": 0.3328323212890038, "flos": 19171378828800.0, "grad_norm": 1.6515907177798275, "language_loss": 0.85343361, "learning_rate": 3.1128400745448353e-06, "loss": 0.8754499, "num_input_tokens_seen": 59694625, "step": 2768, "time_per_iteration": 3.3697237968444824 }, { "auxiliary_loss_clip": 0.01180835, "auxiliary_loss_mlp": 0.01032246, "balance_loss_clip": 1.05591393, "balance_loss_mlp": 1.02310896, "epoch": 0.33295256417964286, "flos": 37706347463040.0, "grad_norm": 2.169925744566654, "language_loss": 0.62822759, "learning_rate": 3.11219274170529e-06, "loss": 0.65035838, "num_input_tokens_seen": 59716435, "step": 2769, "time_per_iteration": 3.335397720336914 }, { "auxiliary_loss_clip": 0.01157557, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.05101037, "balance_loss_mlp": 1.02562094, "epoch": 0.333072807070282, "flos": 26505989412480.0, "grad_norm": 14.509614954692156, "language_loss": 0.81849802, "learning_rate": 3.1115452401469903e-06, "loss": 0.84041399, "num_input_tokens_seen": 59736835, "step": 2770, "time_per_iteration": 2.556903123855591 }, { "auxiliary_loss_clip": 0.01125532, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.04597116, "balance_loss_mlp": 1.02481604, "epoch": 0.3331930499609211, "flos": 21430913823360.0, "grad_norm": 2.1390266964790836, "language_loss": 0.86334103, "learning_rate": 3.1108975699681613e-06, "loss": 0.88493425, "num_input_tokens_seen": 59754230, "step": 2771, "time_per_iteration": 2.5740745067596436 }, { "auxiliary_loss_clip": 0.01147929, "auxiliary_loss_mlp": 0.01036992, "balance_loss_clip": 1.05150056, "balance_loss_mlp": 1.02792573, "epoch": 0.33331329285156014, "flos": 20659947281280.0, "grad_norm": 1.874550957165198, "language_loss": 0.71346569, "learning_rate": 3.1102497312670542e-06, "loss": 0.73531485, "num_input_tokens_seen": 59772235, "step": 2772, "time_per_iteration": 2.539773464202881 }, { "auxiliary_loss_clip": 0.01153109, "auxiliary_loss_mlp": 0.01038073, "balance_loss_clip": 1.05132282, "balance_loss_mlp": 1.02885818, "epoch": 0.33343353574219925, "flos": 28001596930560.0, "grad_norm": 2.8438995920134076, "language_loss": 0.8049711, "learning_rate": 3.109601724141946e-06, "loss": 0.8268829, "num_input_tokens_seen": 59791230, "step": 2773, "time_per_iteration": 2.547266721725464 }, { "auxiliary_loss_clip": 0.01161656, "auxiliary_loss_mlp": 0.01029194, "balance_loss_clip": 1.05396748, "balance_loss_mlp": 1.02053893, "epoch": 0.33355377863283836, "flos": 23764963582080.0, "grad_norm": 1.7184250157778536, "language_loss": 0.68006015, "learning_rate": 3.108953548691138e-06, "loss": 0.70196861, "num_input_tokens_seen": 59811315, "step": 2774, "time_per_iteration": 2.53174090385437 }, { "auxiliary_loss_clip": 0.01195884, "auxiliary_loss_mlp": 0.01029654, "balance_loss_clip": 1.05773067, "balance_loss_mlp": 1.02047515, "epoch": 0.3336740215234774, "flos": 37779677078400.0, "grad_norm": 2.1293331143453287, "language_loss": 0.72533524, "learning_rate": 3.108305205012959e-06, "loss": 0.74759054, "num_input_tokens_seen": 59832010, "step": 2775, "time_per_iteration": 2.579028844833374 }, { "auxiliary_loss_clip": 0.01161332, "auxiliary_loss_mlp": 0.01029798, "balance_loss_clip": 1.05292702, "balance_loss_mlp": 1.02032089, "epoch": 0.3337942644141165, "flos": 25519056347520.0, "grad_norm": 2.3667259888482577, "language_loss": 0.87442344, "learning_rate": 3.107656693205761e-06, "loss": 0.89633477, "num_input_tokens_seen": 59851450, "step": 2776, "time_per_iteration": 2.5330100059509277 }, { "auxiliary_loss_clip": 0.01197016, "auxiliary_loss_mlp": 0.01035707, "balance_loss_clip": 1.05699849, "balance_loss_mlp": 1.0258956, "epoch": 0.3339145073047556, "flos": 25989844930560.0, "grad_norm": 2.9084244473816225, "language_loss": 0.70411718, "learning_rate": 3.107008013367924e-06, "loss": 0.72644442, "num_input_tokens_seen": 59870245, "step": 2777, "time_per_iteration": 2.4657881259918213 }, { "auxiliary_loss_clip": 0.01148842, "auxiliary_loss_mlp": 0.01031527, "balance_loss_clip": 1.05172324, "balance_loss_mlp": 1.02202642, "epoch": 0.3340347501953947, "flos": 19062569554560.0, "grad_norm": 2.0964109295983344, "language_loss": 0.86988473, "learning_rate": 3.1063591655978507e-06, "loss": 0.89168835, "num_input_tokens_seen": 59886195, "step": 2778, "time_per_iteration": 2.5176336765289307 }, { "auxiliary_loss_clip": 0.01122109, "auxiliary_loss_mlp": 0.0103238, "balance_loss_clip": 1.0455277, "balance_loss_mlp": 1.02304578, "epoch": 0.3341549930860338, "flos": 18109715518080.0, "grad_norm": 1.728711784970176, "language_loss": 0.79506189, "learning_rate": 3.105710149993972e-06, "loss": 0.81660682, "num_input_tokens_seen": 59905525, "step": 2779, "time_per_iteration": 2.5531671047210693 }, { "auxiliary_loss_clip": 0.01196644, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.0569284, "balance_loss_mlp": 1.02102017, "epoch": 0.33427523597667286, "flos": 22674967418880.0, "grad_norm": 1.7292401895339276, "language_loss": 0.85241753, "learning_rate": 3.1050609666547427e-06, "loss": 0.87468749, "num_input_tokens_seen": 59925085, "step": 2780, "time_per_iteration": 2.4527883529663086 }, { "auxiliary_loss_clip": 0.01158706, "auxiliary_loss_mlp": 0.01039448, "balance_loss_clip": 1.05448353, "balance_loss_mlp": 1.03050768, "epoch": 0.33439547886731197, "flos": 22638338524800.0, "grad_norm": 1.8283661288281738, "language_loss": 0.77297407, "learning_rate": 3.104411615678644e-06, "loss": 0.79495561, "num_input_tokens_seen": 59943935, "step": 2781, "time_per_iteration": 2.5432119369506836 }, { "auxiliary_loss_clip": 0.0116099, "auxiliary_loss_mlp": 0.0103476, "balance_loss_clip": 1.05300725, "balance_loss_mlp": 1.02479994, "epoch": 0.3345157217579511, "flos": 24096383395200.0, "grad_norm": 4.294028980692174, "language_loss": 0.73754764, "learning_rate": 3.1037620971641803e-06, "loss": 0.75950515, "num_input_tokens_seen": 59963725, "step": 2782, "time_per_iteration": 2.5091447830200195 }, { "auxiliary_loss_clip": 0.0119616, "auxiliary_loss_mlp": 0.01036294, "balance_loss_clip": 1.05814433, "balance_loss_mlp": 1.02685869, "epoch": 0.33463596464859013, "flos": 18989491334400.0, "grad_norm": 2.1889981021101548, "language_loss": 0.64687735, "learning_rate": 3.1031124112098844e-06, "loss": 0.66920185, "num_input_tokens_seen": 59981935, "step": 2783, "time_per_iteration": 2.4288344383239746 }, { "auxiliary_loss_clip": 0.0116814, "auxiliary_loss_mlp": 0.01026885, "balance_loss_clip": 1.05526125, "balance_loss_mlp": 1.01790881, "epoch": 0.33475620753922924, "flos": 20375607219840.0, "grad_norm": 2.059659066138216, "language_loss": 0.72374785, "learning_rate": 3.1024625579143127e-06, "loss": 0.74569809, "num_input_tokens_seen": 59999455, "step": 2784, "time_per_iteration": 2.489598035812378 }, { "auxiliary_loss_clip": 0.01193086, "auxiliary_loss_mlp": 0.0103534, "balance_loss_clip": 1.05673635, "balance_loss_mlp": 1.02617896, "epoch": 0.33487645042986836, "flos": 18182578256640.0, "grad_norm": 1.808290904659665, "language_loss": 0.73041236, "learning_rate": 3.101812537376048e-06, "loss": 0.75269663, "num_input_tokens_seen": 60018475, "step": 2785, "time_per_iteration": 2.443201780319214 }, { "auxiliary_loss_clip": 0.01158049, "auxiliary_loss_mlp": 0.00763856, "balance_loss_clip": 1.05207539, "balance_loss_mlp": 1.00059342, "epoch": 0.3349966933205074, "flos": 25848824135040.0, "grad_norm": 1.9729312251774827, "language_loss": 0.84468871, "learning_rate": 3.1011623496936973e-06, "loss": 0.86390775, "num_input_tokens_seen": 60036770, "step": 2786, "time_per_iteration": 2.5408260822296143 }, { "auxiliary_loss_clip": 0.01193375, "auxiliary_loss_mlp": 0.01031668, "balance_loss_clip": 1.05780411, "balance_loss_mlp": 1.02285278, "epoch": 0.3351169362111465, "flos": 28111447699200.0, "grad_norm": 1.7474483151917581, "language_loss": 0.6976797, "learning_rate": 3.100511994965893e-06, "loss": 0.71993011, "num_input_tokens_seen": 60056725, "step": 2787, "time_per_iteration": 2.5296106338500977 }, { "auxiliary_loss_clip": 0.01174502, "auxiliary_loss_mlp": 0.01032182, "balance_loss_clip": 1.05610752, "balance_loss_mlp": 1.02318156, "epoch": 0.33523717910178563, "flos": 22673315393280.0, "grad_norm": 1.7022789766044617, "language_loss": 0.84408545, "learning_rate": 3.0998614732912947e-06, "loss": 0.86615229, "num_input_tokens_seen": 60076100, "step": 2788, "time_per_iteration": 2.4613378047943115 }, { "auxiliary_loss_clip": 0.01181942, "auxiliary_loss_mlp": 0.01034897, "balance_loss_clip": 1.05838513, "balance_loss_mlp": 1.02564025, "epoch": 0.3353574219924247, "flos": 15669801400320.0, "grad_norm": 2.4432268464487645, "language_loss": 0.67808342, "learning_rate": 3.0992107847685855e-06, "loss": 0.70025182, "num_input_tokens_seen": 60093815, "step": 2789, "time_per_iteration": 3.2646031379699707 }, { "auxiliary_loss_clip": 0.01171927, "auxiliary_loss_mlp": 0.01040012, "balance_loss_clip": 1.05956745, "balance_loss_mlp": 1.0304513, "epoch": 0.3354776648830638, "flos": 24790644443520.0, "grad_norm": 1.5583806863305008, "language_loss": 0.79321051, "learning_rate": 3.0985599294964736e-06, "loss": 0.81532985, "num_input_tokens_seen": 60113370, "step": 2790, "time_per_iteration": 2.546794891357422 }, { "auxiliary_loss_clip": 0.01162839, "auxiliary_loss_mlp": 0.01039022, "balance_loss_clip": 1.05497122, "balance_loss_mlp": 1.02819812, "epoch": 0.33559790777370285, "flos": 28694852398080.0, "grad_norm": 2.081542223193698, "language_loss": 0.70130706, "learning_rate": 3.097908907573695e-06, "loss": 0.72332573, "num_input_tokens_seen": 60131350, "step": 2791, "time_per_iteration": 2.5969111919403076 }, { "auxiliary_loss_clip": 0.01124099, "auxiliary_loss_mlp": 0.01037581, "balance_loss_clip": 1.05260944, "balance_loss_mlp": 1.02872729, "epoch": 0.33571815066434196, "flos": 22235779825920.0, "grad_norm": 1.9882523961761898, "language_loss": 0.89591956, "learning_rate": 3.0972577190990067e-06, "loss": 0.91753638, "num_input_tokens_seen": 60149830, "step": 2792, "time_per_iteration": 2.596595525741577 }, { "auxiliary_loss_clip": 0.01156373, "auxiliary_loss_mlp": 0.01033906, "balance_loss_clip": 1.0543313, "balance_loss_mlp": 1.02497685, "epoch": 0.3358383935549811, "flos": 23842279607040.0, "grad_norm": 1.7828988341915757, "language_loss": 0.80069876, "learning_rate": 3.096606364171196e-06, "loss": 0.82260156, "num_input_tokens_seen": 60169620, "step": 2793, "time_per_iteration": 2.5602777004241943 }, { "auxiliary_loss_clip": 0.01135225, "auxiliary_loss_mlp": 0.01033361, "balance_loss_clip": 1.04930151, "balance_loss_mlp": 1.02370477, "epoch": 0.33595863644562013, "flos": 22267308988800.0, "grad_norm": 2.3224747717232224, "language_loss": 0.85101867, "learning_rate": 3.0959548428890703e-06, "loss": 0.87270457, "num_input_tokens_seen": 60188490, "step": 2794, "time_per_iteration": 2.544663429260254 }, { "auxiliary_loss_clip": 0.01177984, "auxiliary_loss_mlp": 0.01039454, "balance_loss_clip": 1.05831838, "balance_loss_mlp": 1.03011346, "epoch": 0.33607887933625924, "flos": 20119779578880.0, "grad_norm": 1.5196336030087647, "language_loss": 0.84115517, "learning_rate": 3.095303155351468e-06, "loss": 0.86332953, "num_input_tokens_seen": 60208695, "step": 2795, "time_per_iteration": 3.3538596630096436 }, { "auxiliary_loss_clip": 0.01132007, "auxiliary_loss_mlp": 0.01035944, "balance_loss_clip": 1.05377054, "balance_loss_mlp": 1.02703881, "epoch": 0.33619912222689835, "flos": 19318109886720.0, "grad_norm": 2.175559617833444, "language_loss": 0.79453677, "learning_rate": 3.0946513016572464e-06, "loss": 0.81621623, "num_input_tokens_seen": 60227600, "step": 2796, "time_per_iteration": 3.410482883453369 }, { "auxiliary_loss_clip": 0.01184565, "auxiliary_loss_mlp": 0.01033796, "balance_loss_clip": 1.05628538, "balance_loss_mlp": 1.02425289, "epoch": 0.3363193651175374, "flos": 16800664262400.0, "grad_norm": 3.020137812732992, "language_loss": 0.76981485, "learning_rate": 3.0939992819052938e-06, "loss": 0.79199839, "num_input_tokens_seen": 60245110, "step": 2797, "time_per_iteration": 2.4818058013916016 }, { "auxiliary_loss_clip": 0.01169443, "auxiliary_loss_mlp": 0.01030983, "balance_loss_clip": 1.0585146, "balance_loss_mlp": 1.02157736, "epoch": 0.3364396080081765, "flos": 23550289948800.0, "grad_norm": 2.0811658074619106, "language_loss": 0.81283718, "learning_rate": 3.0933470961945193e-06, "loss": 0.83484143, "num_input_tokens_seen": 60263405, "step": 2798, "time_per_iteration": 2.548593521118164 }, { "auxiliary_loss_clip": 0.01164715, "auxiliary_loss_mlp": 0.01036986, "balance_loss_clip": 1.05844295, "balance_loss_mlp": 1.02821207, "epoch": 0.3365598508988156, "flos": 28037902602240.0, "grad_norm": 1.5933206515666067, "language_loss": 0.68220854, "learning_rate": 3.0926947446238597e-06, "loss": 0.70422554, "num_input_tokens_seen": 60282975, "step": 2799, "time_per_iteration": 2.603602886199951 }, { "auxiliary_loss_clip": 0.01183793, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.05471015, "balance_loss_mlp": 1.0252049, "epoch": 0.3366800937894547, "flos": 16982767238400.0, "grad_norm": 2.39436592149514, "language_loss": 0.82461476, "learning_rate": 3.092042227292276e-06, "loss": 0.84680331, "num_input_tokens_seen": 60299810, "step": 2800, "time_per_iteration": 2.5505878925323486 }, { "auxiliary_loss_clip": 0.01191723, "auxiliary_loss_mlp": 0.01030951, "balance_loss_clip": 1.05854976, "balance_loss_mlp": 1.02246976, "epoch": 0.3368003366800938, "flos": 23915321913600.0, "grad_norm": 1.5730277935919512, "language_loss": 0.88006878, "learning_rate": 3.0913895442987557e-06, "loss": 0.90229547, "num_input_tokens_seen": 60320775, "step": 2801, "time_per_iteration": 2.4913220405578613 }, { "auxiliary_loss_clip": 0.01153269, "auxiliary_loss_mlp": 0.00764059, "balance_loss_clip": 1.05524945, "balance_loss_mlp": 1.00052059, "epoch": 0.3369205795707329, "flos": 24791219061120.0, "grad_norm": 1.5405413196578854, "language_loss": 0.86292446, "learning_rate": 3.090736695742308e-06, "loss": 0.88209772, "num_input_tokens_seen": 60341905, "step": 2802, "time_per_iteration": 2.5626161098480225 }, { "auxiliary_loss_clip": 0.01131308, "auxiliary_loss_mlp": 0.01032308, "balance_loss_clip": 1.05089545, "balance_loss_mlp": 1.02376688, "epoch": 0.33704082246137196, "flos": 17931096161280.0, "grad_norm": 2.2596717581977095, "language_loss": 0.52499092, "learning_rate": 3.0900836817219713e-06, "loss": 0.5466271, "num_input_tokens_seen": 60358335, "step": 2803, "time_per_iteration": 2.552642583847046 }, { "auxiliary_loss_clip": 0.01192919, "auxiliary_loss_mlp": 0.01032191, "balance_loss_clip": 1.05703592, "balance_loss_mlp": 1.02365518, "epoch": 0.33716106535201107, "flos": 21286517149440.0, "grad_norm": 1.6339236632469183, "language_loss": 0.83554125, "learning_rate": 3.089430502336807e-06, "loss": 0.85779238, "num_input_tokens_seen": 60378305, "step": 2804, "time_per_iteration": 2.4783382415771484 }, { "auxiliary_loss_clip": 0.01183403, "auxiliary_loss_mlp": 0.01029714, "balance_loss_clip": 1.05715764, "balance_loss_mlp": 1.02036202, "epoch": 0.3372813082426502, "flos": 18402962152320.0, "grad_norm": 3.299616317513142, "language_loss": 0.89962888, "learning_rate": 3.088777157685902e-06, "loss": 0.92176002, "num_input_tokens_seen": 60393895, "step": 2805, "time_per_iteration": 2.476597547531128 }, { "auxiliary_loss_clip": 0.01162708, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 1.05594015, "balance_loss_mlp": 1.01829433, "epoch": 0.33740155113328923, "flos": 17201391367680.0, "grad_norm": 1.8817690805679947, "language_loss": 0.85918957, "learning_rate": 3.088123647868367e-06, "loss": 0.88108909, "num_input_tokens_seen": 60410445, "step": 2806, "time_per_iteration": 2.489069700241089 }, { "auxiliary_loss_clip": 0.01184795, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.05695033, "balance_loss_mlp": 1.02505589, "epoch": 0.33752179402392835, "flos": 29058950609280.0, "grad_norm": 1.841258936596911, "language_loss": 0.81294703, "learning_rate": 3.0874699729833405e-06, "loss": 0.83513343, "num_input_tokens_seen": 60431815, "step": 2807, "time_per_iteration": 2.571648120880127 }, { "auxiliary_loss_clip": 0.01162052, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.05567408, "balance_loss_mlp": 1.01874042, "epoch": 0.3376420369145674, "flos": 25080730680960.0, "grad_norm": 1.5942560599041764, "language_loss": 0.79916126, "learning_rate": 3.086816133129983e-06, "loss": 0.82106173, "num_input_tokens_seen": 60452075, "step": 2808, "time_per_iteration": 2.542128801345825 }, { "auxiliary_loss_clip": 0.01195746, "auxiliary_loss_mlp": 0.01028453, "balance_loss_clip": 1.06124699, "balance_loss_mlp": 1.01963139, "epoch": 0.3377622798052065, "flos": 27490624007040.0, "grad_norm": 1.6209068587423892, "language_loss": 0.76131868, "learning_rate": 3.0861621284074826e-06, "loss": 0.78356069, "num_input_tokens_seen": 60472600, "step": 2809, "time_per_iteration": 2.5211522579193115 }, { "auxiliary_loss_clip": 0.01173977, "auxiliary_loss_mlp": 0.010321, "balance_loss_clip": 1.05811119, "balance_loss_mlp": 1.02384448, "epoch": 0.3378825226958456, "flos": 21975211589760.0, "grad_norm": 1.4633976848118593, "language_loss": 0.73133975, "learning_rate": 3.085507958915051e-06, "loss": 0.75340056, "num_input_tokens_seen": 60491030, "step": 2810, "time_per_iteration": 2.4954710006713867 }, { "auxiliary_loss_clip": 0.0116423, "auxiliary_loss_mlp": 0.01032624, "balance_loss_clip": 1.05806351, "balance_loss_mlp": 1.02296162, "epoch": 0.3380027655864847, "flos": 42523189200000.0, "grad_norm": 1.9593642710409174, "language_loss": 0.71038061, "learning_rate": 3.084853624751925e-06, "loss": 0.73234916, "num_input_tokens_seen": 60512615, "step": 2811, "time_per_iteration": 2.677703857421875 }, { "auxiliary_loss_clip": 0.01156172, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.05782747, "balance_loss_mlp": 1.02499461, "epoch": 0.3381230084771238, "flos": 26725080418560.0, "grad_norm": 1.6767827979958827, "language_loss": 0.85907954, "learning_rate": 3.0841991260173668e-06, "loss": 0.88098156, "num_input_tokens_seen": 60532520, "step": 2812, "time_per_iteration": 2.593587875366211 }, { "auxiliary_loss_clip": 0.01198465, "auxiliary_loss_mlp": 0.01029983, "balance_loss_clip": 1.0609982, "balance_loss_mlp": 1.02039814, "epoch": 0.3382432513677629, "flos": 22710375250560.0, "grad_norm": 1.826504349550865, "language_loss": 0.80315924, "learning_rate": 3.0835444628106634e-06, "loss": 0.82544374, "num_input_tokens_seen": 60551500, "step": 2813, "time_per_iteration": 2.4562697410583496 }, { "auxiliary_loss_clip": 0.01193363, "auxiliary_loss_mlp": 0.00764117, "balance_loss_clip": 1.058496, "balance_loss_mlp": 1.00055718, "epoch": 0.33836349425840195, "flos": 22122409524480.0, "grad_norm": 1.7304601418190944, "language_loss": 0.8320021, "learning_rate": 3.082889635231126e-06, "loss": 0.85157686, "num_input_tokens_seen": 60570160, "step": 2814, "time_per_iteration": 2.4557278156280518 }, { "auxiliary_loss_clip": 0.01168663, "auxiliary_loss_mlp": 0.01028183, "balance_loss_clip": 1.05481958, "balance_loss_mlp": 1.01848519, "epoch": 0.33848373714904106, "flos": 27308090067840.0, "grad_norm": 2.672041583900298, "language_loss": 0.76411843, "learning_rate": 3.0822346433780925e-06, "loss": 0.78608686, "num_input_tokens_seen": 60590885, "step": 2815, "time_per_iteration": 2.65100359916687 }, { "auxiliary_loss_clip": 0.01179063, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 1.05459321, "balance_loss_mlp": 1.01936984, "epoch": 0.3386039800396802, "flos": 25848716394240.0, "grad_norm": 1.9189750251488258, "language_loss": 0.87047362, "learning_rate": 3.0815794873509237e-06, "loss": 0.89255226, "num_input_tokens_seen": 60609170, "step": 2816, "time_per_iteration": 3.373980760574341 }, { "auxiliary_loss_clip": 0.01196763, "auxiliary_loss_mlp": 0.01030862, "balance_loss_clip": 1.06155539, "balance_loss_mlp": 1.02186131, "epoch": 0.33872422293031923, "flos": 18880646146560.0, "grad_norm": 1.7859634282738086, "language_loss": 0.73045981, "learning_rate": 3.0809241672490066e-06, "loss": 0.75273609, "num_input_tokens_seen": 60627340, "step": 2817, "time_per_iteration": 2.4223473072052 }, { "auxiliary_loss_clip": 0.01170535, "auxiliary_loss_mlp": 0.01028023, "balance_loss_clip": 1.05873156, "balance_loss_mlp": 1.01927888, "epoch": 0.33884446582095834, "flos": 23146977064320.0, "grad_norm": 1.6399070242588367, "language_loss": 0.84991193, "learning_rate": 3.080268683171753e-06, "loss": 0.87189746, "num_input_tokens_seen": 60647630, "step": 2818, "time_per_iteration": 2.541529893875122 }, { "auxiliary_loss_clip": 0.01178853, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.05573595, "balance_loss_mlp": 1.02034044, "epoch": 0.33896470871159745, "flos": 15997342544640.0, "grad_norm": 3.465952927681238, "language_loss": 0.89056385, "learning_rate": 3.0796130352185985e-06, "loss": 0.91263998, "num_input_tokens_seen": 60664485, "step": 2819, "time_per_iteration": 2.4549951553344727 }, { "auxiliary_loss_clip": 0.0115203, "auxiliary_loss_mlp": 0.00764486, "balance_loss_clip": 1.05051506, "balance_loss_mlp": 1.00056481, "epoch": 0.3390849516022365, "flos": 34495754112000.0, "grad_norm": 1.7206782321086551, "language_loss": 0.66293436, "learning_rate": 3.0789572234890057e-06, "loss": 0.68209958, "num_input_tokens_seen": 60686125, "step": 2820, "time_per_iteration": 2.6476056575775146 }, { "auxiliary_loss_clip": 0.01168583, "auxiliary_loss_mlp": 0.01031529, "balance_loss_clip": 1.05967259, "balance_loss_mlp": 1.02205217, "epoch": 0.3392051944928756, "flos": 16180307447040.0, "grad_norm": 1.5980720703876337, "language_loss": 0.77722597, "learning_rate": 3.0783012480824596e-06, "loss": 0.79922712, "num_input_tokens_seen": 60705270, "step": 2821, "time_per_iteration": 3.373331308364868 }, { "auxiliary_loss_clip": 0.01192974, "auxiliary_loss_mlp": 0.01035819, "balance_loss_clip": 1.05705953, "balance_loss_mlp": 1.02707541, "epoch": 0.33932543738351467, "flos": 17086656349440.0, "grad_norm": 2.05781483801845, "language_loss": 0.74176073, "learning_rate": 3.077645109098471e-06, "loss": 0.7640487, "num_input_tokens_seen": 60721540, "step": 2822, "time_per_iteration": 3.177468776702881 }, { "auxiliary_loss_clip": 0.01138654, "auxiliary_loss_mlp": 0.01031232, "balance_loss_clip": 1.05350184, "balance_loss_mlp": 1.02254176, "epoch": 0.3394456802741538, "flos": 22126970551680.0, "grad_norm": 1.8771677630903518, "language_loss": 0.72189283, "learning_rate": 3.076988806636577e-06, "loss": 0.74359167, "num_input_tokens_seen": 60739300, "step": 2823, "time_per_iteration": 3.2664270401000977 }, { "auxiliary_loss_clip": 0.01171885, "auxiliary_loss_mlp": 0.00764276, "balance_loss_clip": 1.05893779, "balance_loss_mlp": 1.00064969, "epoch": 0.3395659231647929, "flos": 25226887121280.0, "grad_norm": 1.8426482199465122, "language_loss": 0.88545227, "learning_rate": 3.0763323407963377e-06, "loss": 0.90481389, "num_input_tokens_seen": 60758910, "step": 2824, "time_per_iteration": 2.5418875217437744 }, { "auxiliary_loss_clip": 0.01177069, "auxiliary_loss_mlp": 0.01030703, "balance_loss_clip": 1.0530386, "balance_loss_mlp": 1.02266824, "epoch": 0.33968616605543195, "flos": 29096477343360.0, "grad_norm": 1.7419393378233285, "language_loss": 0.80139029, "learning_rate": 3.075675711677337e-06, "loss": 0.82346797, "num_input_tokens_seen": 60779005, "step": 2825, "time_per_iteration": 2.550868034362793 }, { "auxiliary_loss_clip": 0.01161744, "auxiliary_loss_mlp": 0.01038405, "balance_loss_clip": 1.05785692, "balance_loss_mlp": 1.02902889, "epoch": 0.33980640894607106, "flos": 21433966479360.0, "grad_norm": 1.875569005835158, "language_loss": 0.77708316, "learning_rate": 3.0750189193791865e-06, "loss": 0.7990846, "num_input_tokens_seen": 60798590, "step": 2826, "time_per_iteration": 2.492832660675049 }, { "auxiliary_loss_clip": 0.01177994, "auxiliary_loss_mlp": 0.01028095, "balance_loss_clip": 1.05595779, "balance_loss_mlp": 1.01919031, "epoch": 0.33992665183671017, "flos": 32490035596800.0, "grad_norm": 2.25449747897589, "language_loss": 0.70592326, "learning_rate": 3.0743619640015203e-06, "loss": 0.72798419, "num_input_tokens_seen": 60818840, "step": 2827, "time_per_iteration": 2.5631790161132812 }, { "auxiliary_loss_clip": 0.0116979, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.05369449, "balance_loss_mlp": 1.02279806, "epoch": 0.3400468947273492, "flos": 17055414495360.0, "grad_norm": 3.858596552100289, "language_loss": 0.92709273, "learning_rate": 3.073704845643999e-06, "loss": 0.9491148, "num_input_tokens_seen": 60835965, "step": 2828, "time_per_iteration": 2.5107100009918213 }, { "auxiliary_loss_clip": 0.01181236, "auxiliary_loss_mlp": 0.01039316, "balance_loss_clip": 1.05490315, "balance_loss_mlp": 1.02958906, "epoch": 0.34016713761798834, "flos": 16872988296960.0, "grad_norm": 2.7177534372300367, "language_loss": 0.77822864, "learning_rate": 3.0730475644063063e-06, "loss": 0.80043423, "num_input_tokens_seen": 60851065, "step": 2829, "time_per_iteration": 2.4641716480255127 }, { "auxiliary_loss_clip": 0.01156822, "auxiliary_loss_mlp": 0.00763451, "balance_loss_clip": 1.05148387, "balance_loss_mlp": 1.00060332, "epoch": 0.34028738050862745, "flos": 21907161273600.0, "grad_norm": 1.6332431121648796, "language_loss": 0.64891446, "learning_rate": 3.072390120388151e-06, "loss": 0.66811717, "num_input_tokens_seen": 60869390, "step": 2830, "time_per_iteration": 2.4921460151672363 }, { "auxiliary_loss_clip": 0.01181327, "auxiliary_loss_mlp": 0.01029944, "balance_loss_clip": 1.05773854, "balance_loss_mlp": 1.02050829, "epoch": 0.3404076233992665, "flos": 22746034477440.0, "grad_norm": 2.7823955540506313, "language_loss": 0.70972896, "learning_rate": 3.071732513689267e-06, "loss": 0.73184162, "num_input_tokens_seen": 60887925, "step": 2831, "time_per_iteration": 2.477276563644409 }, { "auxiliary_loss_clip": 0.01183593, "auxiliary_loss_mlp": 0.01034559, "balance_loss_clip": 1.06125474, "balance_loss_mlp": 1.02537358, "epoch": 0.3405278662899056, "flos": 17052361839360.0, "grad_norm": 2.119739785314952, "language_loss": 0.6717006, "learning_rate": 3.0710747444094134e-06, "loss": 0.69388211, "num_input_tokens_seen": 60905955, "step": 2832, "time_per_iteration": 2.439854621887207 }, { "auxiliary_loss_clip": 0.01168702, "auxiliary_loss_mlp": 0.01033059, "balance_loss_clip": 1.05697453, "balance_loss_mlp": 1.02399278, "epoch": 0.3406481091805447, "flos": 42813131783040.0, "grad_norm": 1.7877636726489856, "language_loss": 0.64813149, "learning_rate": 3.070416812648372e-06, "loss": 0.67014909, "num_input_tokens_seen": 60929405, "step": 2833, "time_per_iteration": 2.6894752979278564 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01031857, "balance_loss_clip": 1.04871488, "balance_loss_mlp": 1.02310157, "epoch": 0.3407683520711838, "flos": 26761457917440.0, "grad_norm": 2.0562421399888353, "language_loss": 0.65065026, "learning_rate": 3.069758718505951e-06, "loss": 0.67243946, "num_input_tokens_seen": 60951145, "step": 2834, "time_per_iteration": 2.5993926525115967 }, { "auxiliary_loss_clip": 0.01194028, "auxiliary_loss_mlp": 0.0103552, "balance_loss_clip": 1.05948353, "balance_loss_mlp": 1.02622747, "epoch": 0.3408885949618229, "flos": 28767643309440.0, "grad_norm": 1.5872817200681593, "language_loss": 0.79973483, "learning_rate": 3.0691004620819836e-06, "loss": 0.82203031, "num_input_tokens_seen": 60971275, "step": 2835, "time_per_iteration": 2.5071046352386475 }, { "auxiliary_loss_clip": 0.01047716, "auxiliary_loss_mlp": 0.01001519, "balance_loss_clip": 1.02581668, "balance_loss_mlp": 1.00033307, "epoch": 0.341008837852462, "flos": 63576252881280.0, "grad_norm": 0.794941977036189, "language_loss": 0.60230374, "learning_rate": 3.0684420434763254e-06, "loss": 0.62279606, "num_input_tokens_seen": 61037460, "step": 2836, "time_per_iteration": 3.1616110801696777 }, { "auxiliary_loss_clip": 0.0114123, "auxiliary_loss_mlp": 0.01035426, "balance_loss_clip": 1.05393887, "balance_loss_mlp": 1.02721214, "epoch": 0.34112908074310105, "flos": 20812173120000.0, "grad_norm": 1.875328642869392, "language_loss": 0.77162451, "learning_rate": 3.06778346278886e-06, "loss": 0.79339111, "num_input_tokens_seen": 61056295, "step": 2837, "time_per_iteration": 2.584921360015869 }, { "auxiliary_loss_clip": 0.01196858, "auxiliary_loss_mlp": 0.01028809, "balance_loss_clip": 1.06109297, "balance_loss_mlp": 1.01985598, "epoch": 0.34124932363374016, "flos": 24976446520320.0, "grad_norm": 1.6778330485733912, "language_loss": 0.78711438, "learning_rate": 3.0671247201194906e-06, "loss": 0.80937111, "num_input_tokens_seen": 61078430, "step": 2838, "time_per_iteration": 2.5037589073181152 }, { "auxiliary_loss_clip": 0.01150946, "auxiliary_loss_mlp": 0.0103295, "balance_loss_clip": 1.05079067, "balance_loss_mlp": 1.02421236, "epoch": 0.3413695665243792, "flos": 28402970480640.0, "grad_norm": 1.7403362728454987, "language_loss": 0.75793797, "learning_rate": 3.066465815568151e-06, "loss": 0.77977693, "num_input_tokens_seen": 61099260, "step": 2839, "time_per_iteration": 2.672842264175415 }, { "auxiliary_loss_clip": 0.01179379, "auxiliary_loss_mlp": 0.01027021, "balance_loss_clip": 1.05462503, "balance_loss_mlp": 1.01860511, "epoch": 0.34148980941501833, "flos": 25302012416640.0, "grad_norm": 1.799863540739093, "language_loss": 0.69140643, "learning_rate": 3.0658067492347947e-06, "loss": 0.7134704, "num_input_tokens_seen": 61121900, "step": 2840, "time_per_iteration": 2.557950258255005 }, { "auxiliary_loss_clip": 0.01101928, "auxiliary_loss_mlp": 0.01031228, "balance_loss_clip": 1.04898286, "balance_loss_mlp": 1.0218823, "epoch": 0.34161005230565744, "flos": 17530081747200.0, "grad_norm": 3.3886022029106737, "language_loss": 0.66399121, "learning_rate": 3.065147521219402e-06, "loss": 0.68532276, "num_input_tokens_seen": 61141155, "step": 2841, "time_per_iteration": 2.627209186553955 }, { "auxiliary_loss_clip": 0.01159118, "auxiliary_loss_mlp": 0.01035578, "balance_loss_clip": 1.05848575, "balance_loss_mlp": 1.02666676, "epoch": 0.3417302951962965, "flos": 43650101566080.0, "grad_norm": 1.4637670655832455, "language_loss": 0.74448383, "learning_rate": 3.064488131621977e-06, "loss": 0.7664308, "num_input_tokens_seen": 61164480, "step": 2842, "time_per_iteration": 2.6990067958831787 }, { "auxiliary_loss_clip": 0.01171966, "auxiliary_loss_mlp": 0.01035033, "balance_loss_clip": 1.05330873, "balance_loss_mlp": 1.02562118, "epoch": 0.3418505380869356, "flos": 30882207012480.0, "grad_norm": 1.8366930298179058, "language_loss": 0.73696601, "learning_rate": 3.063828580542549e-06, "loss": 0.75903594, "num_input_tokens_seen": 61185675, "step": 2843, "time_per_iteration": 3.413469076156616 }, { "auxiliary_loss_clip": 0.01162901, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.05430579, "balance_loss_mlp": 1.02448463, "epoch": 0.3419707809775747, "flos": 19463871277440.0, "grad_norm": 3.4930476029731596, "language_loss": 0.73393798, "learning_rate": 3.0631688680811706e-06, "loss": 0.75589663, "num_input_tokens_seen": 61205300, "step": 2844, "time_per_iteration": 2.522000312805176 }, { "auxiliary_loss_clip": 0.01195029, "auxiliary_loss_mlp": 0.0103834, "balance_loss_clip": 1.0588218, "balance_loss_mlp": 1.02940512, "epoch": 0.3420910238682138, "flos": 28727818104960.0, "grad_norm": 2.5380852845665056, "language_loss": 0.75948596, "learning_rate": 3.062508994337921e-06, "loss": 0.7818197, "num_input_tokens_seen": 61224905, "step": 2845, "time_per_iteration": 2.5299503803253174 }, { "auxiliary_loss_clip": 0.01179607, "auxiliary_loss_mlp": 0.01030705, "balance_loss_clip": 1.05426919, "balance_loss_mlp": 1.02167535, "epoch": 0.3422112667588529, "flos": 21397265758080.0, "grad_norm": 1.9335868565547332, "language_loss": 0.79656082, "learning_rate": 3.0618489594129013e-06, "loss": 0.81866395, "num_input_tokens_seen": 61243045, "step": 2846, "time_per_iteration": 2.472214460372925 }, { "auxiliary_loss_clip": 0.01156333, "auxiliary_loss_mlp": 0.01030199, "balance_loss_clip": 1.05567932, "balance_loss_mlp": 1.02195549, "epoch": 0.342331509649492, "flos": 13881450038400.0, "grad_norm": 2.8792717115825157, "language_loss": 0.71314698, "learning_rate": 3.061188763406239e-06, "loss": 0.73501229, "num_input_tokens_seen": 61259190, "step": 2847, "time_per_iteration": 2.5289528369903564 }, { "auxiliary_loss_clip": 0.01159031, "auxiliary_loss_mlp": 0.01036551, "balance_loss_clip": 1.05303383, "balance_loss_mlp": 1.02731848, "epoch": 0.34245175254013105, "flos": 28621450955520.0, "grad_norm": 2.0738726317178307, "language_loss": 0.82062089, "learning_rate": 3.060528406418085e-06, "loss": 0.84257674, "num_input_tokens_seen": 61279040, "step": 2848, "time_per_iteration": 3.411067485809326 }, { "auxiliary_loss_clip": 0.01156205, "auxiliary_loss_mlp": 0.01032724, "balance_loss_clip": 1.05335414, "balance_loss_mlp": 1.02458823, "epoch": 0.34257199543077016, "flos": 34127058960000.0, "grad_norm": 1.7329179094990828, "language_loss": 0.61690629, "learning_rate": 3.0598678885486145e-06, "loss": 0.63879555, "num_input_tokens_seen": 61301580, "step": 2849, "time_per_iteration": 4.286153793334961 }, { "auxiliary_loss_clip": 0.01152153, "auxiliary_loss_mlp": 0.00763574, "balance_loss_clip": 1.05148506, "balance_loss_mlp": 1.00064421, "epoch": 0.34269223832140927, "flos": 19974018188160.0, "grad_norm": 1.7104306537581189, "language_loss": 0.74597412, "learning_rate": 3.0592072098980282e-06, "loss": 0.76513135, "num_input_tokens_seen": 61321240, "step": 2850, "time_per_iteration": 2.5296194553375244 }, { "auxiliary_loss_clip": 0.01156511, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.05220044, "balance_loss_mlp": 1.02336204, "epoch": 0.3428124812120483, "flos": 27235658292480.0, "grad_norm": 2.0927009317449117, "language_loss": 0.72429907, "learning_rate": 3.0585463705665514e-06, "loss": 0.7461887, "num_input_tokens_seen": 61341615, "step": 2851, "time_per_iteration": 2.558152437210083 }, { "auxiliary_loss_clip": 0.01147678, "auxiliary_loss_mlp": 0.0103319, "balance_loss_clip": 1.05035079, "balance_loss_mlp": 1.02469015, "epoch": 0.34293272410268744, "flos": 24570871079040.0, "grad_norm": 2.3775123087698216, "language_loss": 0.7083534, "learning_rate": 3.0578853706544304e-06, "loss": 0.73016202, "num_input_tokens_seen": 61359005, "step": 2852, "time_per_iteration": 2.5719010829925537 }, { "auxiliary_loss_clip": 0.01151815, "auxiliary_loss_mlp": 0.00764177, "balance_loss_clip": 1.05299461, "balance_loss_mlp": 1.00068247, "epoch": 0.34305296699332655, "flos": 21506865131520.0, "grad_norm": 2.533326538591344, "language_loss": 0.65476978, "learning_rate": 3.0572242102619404e-06, "loss": 0.67392975, "num_input_tokens_seen": 61376160, "step": 2853, "time_per_iteration": 2.5469963550567627 }, { "auxiliary_loss_clip": 0.01161826, "auxiliary_loss_mlp": 0.01033595, "balance_loss_clip": 1.05493999, "balance_loss_mlp": 1.02486849, "epoch": 0.3431732098839656, "flos": 24056665931520.0, "grad_norm": 1.7055883971682353, "language_loss": 0.80535883, "learning_rate": 3.0565628894893784e-06, "loss": 0.82731307, "num_input_tokens_seen": 61396795, "step": 2854, "time_per_iteration": 2.5753142833709717 }, { "auxiliary_loss_clip": 0.01171231, "auxiliary_loss_mlp": 0.01031877, "balance_loss_clip": 1.05597413, "balance_loss_mlp": 1.0231812, "epoch": 0.3432934527746047, "flos": 16800879744000.0, "grad_norm": 1.6235873889745867, "language_loss": 0.74764627, "learning_rate": 3.0559014084370655e-06, "loss": 0.76967728, "num_input_tokens_seen": 61415320, "step": 2855, "time_per_iteration": 2.5247013568878174 }, { "auxiliary_loss_clip": 0.01168402, "auxiliary_loss_mlp": 0.01029502, "balance_loss_clip": 1.05393612, "balance_loss_mlp": 1.02011454, "epoch": 0.34341369566524377, "flos": 23439720908160.0, "grad_norm": 1.6235014409123967, "language_loss": 0.78580523, "learning_rate": 3.055239767205349e-06, "loss": 0.80778426, "num_input_tokens_seen": 61437070, "step": 2856, "time_per_iteration": 2.5534403324127197 }, { "auxiliary_loss_clip": 0.01179992, "auxiliary_loss_mlp": 0.01033105, "balance_loss_clip": 1.0611037, "balance_loss_mlp": 1.02428365, "epoch": 0.3435339385558829, "flos": 17267466435840.0, "grad_norm": 1.8040581741170942, "language_loss": 0.78219175, "learning_rate": 3.054577965894599e-06, "loss": 0.80432272, "num_input_tokens_seen": 61453215, "step": 2857, "time_per_iteration": 2.477968454360962 }, { "auxiliary_loss_clip": 0.01175765, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.06136703, "balance_loss_mlp": 1.02341461, "epoch": 0.343654181446522, "flos": 22199366413440.0, "grad_norm": 1.7485965374897743, "language_loss": 0.70266128, "learning_rate": 3.0539160046052094e-06, "loss": 0.724756, "num_input_tokens_seen": 61472915, "step": 2858, "time_per_iteration": 2.531754493713379 }, { "auxiliary_loss_clip": 0.01156407, "auxiliary_loss_mlp": 0.01036703, "balance_loss_clip": 1.05174041, "balance_loss_mlp": 1.02662969, "epoch": 0.34377442433716104, "flos": 19901801894400.0, "grad_norm": 2.3387116561969172, "language_loss": 0.70100939, "learning_rate": 3.0532538834376003e-06, "loss": 0.72294044, "num_input_tokens_seen": 61492475, "step": 2859, "time_per_iteration": 2.5214338302612305 }, { "auxiliary_loss_clip": 0.01181855, "auxiliary_loss_mlp": 0.01035845, "balance_loss_clip": 1.05591464, "balance_loss_mlp": 1.02688098, "epoch": 0.34389466722780015, "flos": 22197678474240.0, "grad_norm": 1.7691035631254464, "language_loss": 0.78658879, "learning_rate": 3.0525916024922143e-06, "loss": 0.80876577, "num_input_tokens_seen": 61511660, "step": 2860, "time_per_iteration": 2.461674213409424 }, { "auxiliary_loss_clip": 0.01162466, "auxiliary_loss_mlp": 0.0103288, "balance_loss_clip": 1.05317581, "balance_loss_mlp": 1.02409387, "epoch": 0.34401491011843927, "flos": 18624567110400.0, "grad_norm": 2.8709792150611073, "language_loss": 0.84490848, "learning_rate": 3.0519291618695193e-06, "loss": 0.86686194, "num_input_tokens_seen": 61529060, "step": 2861, "time_per_iteration": 2.4676291942596436 }, { "auxiliary_loss_clip": 0.01140195, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 1.0478785, "balance_loss_mlp": 1.02580857, "epoch": 0.3441351530090783, "flos": 17858197509120.0, "grad_norm": 1.5933669564669064, "language_loss": 0.75363326, "learning_rate": 3.0512665616700065e-06, "loss": 0.77537769, "num_input_tokens_seen": 61548125, "step": 2862, "time_per_iteration": 2.509248971939087 }, { "auxiliary_loss_clip": 0.01126235, "auxiliary_loss_mlp": 0.01035955, "balance_loss_clip": 1.04855323, "balance_loss_mlp": 1.02740145, "epoch": 0.34425539589971743, "flos": 23112754381440.0, "grad_norm": 1.8061499002611905, "language_loss": 0.89301401, "learning_rate": 3.0506038019941933e-06, "loss": 0.91463596, "num_input_tokens_seen": 61568135, "step": 2863, "time_per_iteration": 2.658419132232666 }, { "auxiliary_loss_clip": 0.01149556, "auxiliary_loss_mlp": 0.01027002, "balance_loss_clip": 1.05410445, "balance_loss_mlp": 1.01834762, "epoch": 0.34437563879035654, "flos": 21907699977600.0, "grad_norm": 3.3593188937805154, "language_loss": 0.67939389, "learning_rate": 3.049940882942617e-06, "loss": 0.70115948, "num_input_tokens_seen": 61586920, "step": 2864, "time_per_iteration": 2.616769790649414 }, { "auxiliary_loss_clip": 0.01191192, "auxiliary_loss_mlp": 0.01031707, "balance_loss_clip": 1.05491972, "balance_loss_mlp": 1.02242041, "epoch": 0.3444958816809956, "flos": 23076915586560.0, "grad_norm": 10.237397370839172, "language_loss": 0.80480927, "learning_rate": 3.0492778046158448e-06, "loss": 0.82703823, "num_input_tokens_seen": 61608340, "step": 2865, "time_per_iteration": 2.4431538581848145 }, { "auxiliary_loss_clip": 0.01178974, "auxiliary_loss_mlp": 0.01032186, "balance_loss_clip": 1.05908966, "balance_loss_mlp": 1.0240171, "epoch": 0.3446161245716347, "flos": 21908633731200.0, "grad_norm": 1.9731235851958284, "language_loss": 0.77011299, "learning_rate": 3.0486145671144633e-06, "loss": 0.79222465, "num_input_tokens_seen": 61628130, "step": 2866, "time_per_iteration": 2.4715888500213623 }, { "auxiliary_loss_clip": 0.01099033, "auxiliary_loss_mlp": 0.01035573, "balance_loss_clip": 1.0438087, "balance_loss_mlp": 1.0259831, "epoch": 0.3447363674622738, "flos": 25112834461440.0, "grad_norm": 2.543567343040239, "language_loss": 0.76915431, "learning_rate": 3.047951170539086e-06, "loss": 0.7905004, "num_input_tokens_seen": 61647755, "step": 2867, "time_per_iteration": 2.6193974018096924 }, { "auxiliary_loss_clip": 0.01148248, "auxiliary_loss_mlp": 0.01039723, "balance_loss_clip": 1.05726612, "balance_loss_mlp": 1.03155136, "epoch": 0.3448566103529129, "flos": 11984684451840.0, "grad_norm": 1.8275632005147604, "language_loss": 0.83997416, "learning_rate": 3.047287614990349e-06, "loss": 0.8618539, "num_input_tokens_seen": 61665675, "step": 2868, "time_per_iteration": 2.513073205947876 }, { "auxiliary_loss_clip": 0.01158404, "auxiliary_loss_mlp": 0.01031191, "balance_loss_clip": 1.05368495, "balance_loss_mlp": 1.02187431, "epoch": 0.344976853243552, "flos": 40187882465280.0, "grad_norm": 2.342040551024905, "language_loss": 0.62204897, "learning_rate": 3.046623900568914e-06, "loss": 0.64394498, "num_input_tokens_seen": 61688240, "step": 2869, "time_per_iteration": 2.650275468826294 }, { "auxiliary_loss_clip": 0.01159393, "auxiliary_loss_mlp": 0.0103366, "balance_loss_clip": 1.05224645, "balance_loss_mlp": 1.0245111, "epoch": 0.34509709613419104, "flos": 28723652127360.0, "grad_norm": 5.212375758726025, "language_loss": 0.69895619, "learning_rate": 3.045960027375465e-06, "loss": 0.72088671, "num_input_tokens_seen": 61706075, "step": 2870, "time_per_iteration": 3.398531675338745 }, { "auxiliary_loss_clip": 0.01182474, "auxiliary_loss_mlp": 0.01032133, "balance_loss_clip": 1.05527139, "balance_loss_mlp": 1.02251267, "epoch": 0.34521733902483015, "flos": 29967597982080.0, "grad_norm": 4.708531668857249, "language_loss": 0.82405508, "learning_rate": 3.045295995510711e-06, "loss": 0.84620118, "num_input_tokens_seen": 61723045, "step": 2871, "time_per_iteration": 2.5406501293182373 }, { "auxiliary_loss_clip": 0.01159498, "auxiliary_loss_mlp": 0.01031837, "balance_loss_clip": 1.05462241, "balance_loss_mlp": 1.02331328, "epoch": 0.34533758191546926, "flos": 27923059843200.0, "grad_norm": 2.0246450591988254, "language_loss": 0.7362324, "learning_rate": 3.0446318050753865e-06, "loss": 0.75814581, "num_input_tokens_seen": 61743525, "step": 2872, "time_per_iteration": 2.5530900955200195 }, { "auxiliary_loss_clip": 0.01171955, "auxiliary_loss_mlp": 0.01031196, "balance_loss_clip": 1.05520427, "balance_loss_mlp": 1.02305448, "epoch": 0.3454578248061083, "flos": 27125879351040.0, "grad_norm": 2.013350686971414, "language_loss": 0.77449453, "learning_rate": 3.0439674561702474e-06, "loss": 0.79652596, "num_input_tokens_seen": 61763025, "step": 2873, "time_per_iteration": 2.594564914703369 }, { "auxiliary_loss_clip": 0.01174804, "auxiliary_loss_mlp": 0.01031586, "balance_loss_clip": 1.05588913, "balance_loss_mlp": 1.02320588, "epoch": 0.3455780676967474, "flos": 19024899166080.0, "grad_norm": 2.378899857448509, "language_loss": 0.88083708, "learning_rate": 3.043302948896076e-06, "loss": 0.90290093, "num_input_tokens_seen": 61781630, "step": 2874, "time_per_iteration": 2.458533525466919 }, { "auxiliary_loss_clip": 0.01124766, "auxiliary_loss_mlp": 0.01033048, "balance_loss_clip": 1.05099654, "balance_loss_mlp": 1.02399421, "epoch": 0.34569831058738654, "flos": 34496005507200.0, "grad_norm": 2.1854991874801555, "language_loss": 0.60771257, "learning_rate": 3.0426382833536756e-06, "loss": 0.6292907, "num_input_tokens_seen": 61804985, "step": 2875, "time_per_iteration": 4.31379508972168 }, { "auxiliary_loss_clip": 0.01143287, "auxiliary_loss_mlp": 0.01030138, "balance_loss_clip": 1.04923487, "balance_loss_mlp": 1.02156055, "epoch": 0.3458185534780256, "flos": 31138681098240.0, "grad_norm": 2.264687999635517, "language_loss": 0.77959049, "learning_rate": 3.041973459643877e-06, "loss": 0.80132473, "num_input_tokens_seen": 61824440, "step": 2876, "time_per_iteration": 3.3671629428863525 }, { "auxiliary_loss_clip": 0.0112489, "auxiliary_loss_mlp": 0.01030125, "balance_loss_clip": 1.04592729, "balance_loss_mlp": 1.02159607, "epoch": 0.3459387963686647, "flos": 32452508862720.0, "grad_norm": 2.1421198147396643, "language_loss": 0.67067444, "learning_rate": 3.0413084778675334e-06, "loss": 0.69222462, "num_input_tokens_seen": 61845690, "step": 2877, "time_per_iteration": 2.657008409500122 }, { "auxiliary_loss_clip": 0.01153671, "auxiliary_loss_mlp": 0.00763408, "balance_loss_clip": 1.04934549, "balance_loss_mlp": 1.00064945, "epoch": 0.3460590392593038, "flos": 24675658030080.0, "grad_norm": 2.0852301773977917, "language_loss": 0.8401655, "learning_rate": 3.0406433381255214e-06, "loss": 0.85933626, "num_input_tokens_seen": 61863725, "step": 2878, "time_per_iteration": 2.548107385635376 }, { "auxiliary_loss_clip": 0.01175807, "auxiliary_loss_mlp": 0.01027541, "balance_loss_clip": 1.05775738, "balance_loss_mlp": 1.01937246, "epoch": 0.34617928214994287, "flos": 18807316531200.0, "grad_norm": 3.048187612612579, "language_loss": 0.82509327, "learning_rate": 3.0399780405187425e-06, "loss": 0.84712672, "num_input_tokens_seen": 61882720, "step": 2879, "time_per_iteration": 2.463813066482544 }, { "auxiliary_loss_clip": 0.01174518, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.05523729, "balance_loss_mlp": 1.01909065, "epoch": 0.346299525040582, "flos": 24857653265280.0, "grad_norm": 2.4315994336817637, "language_loss": 0.78618073, "learning_rate": 3.0393125851481216e-06, "loss": 0.80819643, "num_input_tokens_seen": 61902595, "step": 2880, "time_per_iteration": 2.514599084854126 }, { "auxiliary_loss_clip": 0.01146137, "auxiliary_loss_mlp": 0.01024417, "balance_loss_clip": 1.05446315, "balance_loss_mlp": 1.01624501, "epoch": 0.3464197679312211, "flos": 16434914025600.0, "grad_norm": 2.2383979210936613, "language_loss": 0.86767185, "learning_rate": 3.038646972114608e-06, "loss": 0.88937736, "num_input_tokens_seen": 61918920, "step": 2881, "time_per_iteration": 2.5247910022735596 }, { "auxiliary_loss_clip": 0.0114468, "auxiliary_loss_mlp": 0.01041295, "balance_loss_clip": 1.05382347, "balance_loss_mlp": 1.03275335, "epoch": 0.34654001082186014, "flos": 22382474970240.0, "grad_norm": 1.7562835057200905, "language_loss": 0.67518997, "learning_rate": 3.037981201519174e-06, "loss": 0.69704974, "num_input_tokens_seen": 61939520, "step": 2882, "time_per_iteration": 2.561526298522949 }, { "auxiliary_loss_clip": 0.01178516, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.05910099, "balance_loss_mlp": 1.02461433, "epoch": 0.34666025371249926, "flos": 19573901614080.0, "grad_norm": 2.7169571989647916, "language_loss": 0.71286345, "learning_rate": 3.0373152734628175e-06, "loss": 0.73497903, "num_input_tokens_seen": 61957800, "step": 2883, "time_per_iteration": 2.4900712966918945 }, { "auxiliary_loss_clip": 0.01172466, "auxiliary_loss_mlp": 0.01027372, "balance_loss_clip": 1.0540055, "balance_loss_mlp": 1.01847339, "epoch": 0.34678049660313837, "flos": 15267637751040.0, "grad_norm": 2.152190947245872, "language_loss": 0.76223826, "learning_rate": 3.0366491880465584e-06, "loss": 0.78423667, "num_input_tokens_seen": 61975820, "step": 2884, "time_per_iteration": 2.4612512588500977 }, { "auxiliary_loss_clip": 0.01195421, "auxiliary_loss_mlp": 0.0103248, "balance_loss_clip": 1.059726, "balance_loss_mlp": 1.02328932, "epoch": 0.3469007394937774, "flos": 21181550630400.0, "grad_norm": 1.613784129173304, "language_loss": 0.82047951, "learning_rate": 3.035982945371443e-06, "loss": 0.84275854, "num_input_tokens_seen": 61997515, "step": 2885, "time_per_iteration": 2.4990787506103516 }, { "auxiliary_loss_clip": 0.01170348, "auxiliary_loss_mlp": 0.01029612, "balance_loss_clip": 1.05553651, "balance_loss_mlp": 1.02049804, "epoch": 0.34702098238441653, "flos": 22375471818240.0, "grad_norm": 2.055747682232124, "language_loss": 0.85413849, "learning_rate": 3.035316545538537e-06, "loss": 0.87613809, "num_input_tokens_seen": 62016310, "step": 2886, "time_per_iteration": 2.521435260772705 }, { "auxiliary_loss_clip": 0.01165092, "auxiliary_loss_mlp": 0.01031167, "balance_loss_clip": 1.06063163, "balance_loss_mlp": 1.02253628, "epoch": 0.3471412252750556, "flos": 22929430343040.0, "grad_norm": 2.649077036405507, "language_loss": 0.7893368, "learning_rate": 3.034649988648935e-06, "loss": 0.81129944, "num_input_tokens_seen": 62036075, "step": 2887, "time_per_iteration": 2.516091823577881 }, { "auxiliary_loss_clip": 0.01167655, "auxiliary_loss_mlp": 0.01026649, "balance_loss_clip": 1.0553093, "balance_loss_mlp": 1.01782775, "epoch": 0.3472614681656947, "flos": 21324259365120.0, "grad_norm": 1.682397715866157, "language_loss": 0.8068493, "learning_rate": 3.033983274803752e-06, "loss": 0.82879233, "num_input_tokens_seen": 62055865, "step": 2888, "time_per_iteration": 2.536759853363037 }, { "auxiliary_loss_clip": 0.01160019, "auxiliary_loss_mlp": 0.01031533, "balance_loss_clip": 1.05383372, "balance_loss_mlp": 1.02261615, "epoch": 0.3473817110563338, "flos": 23475739271040.0, "grad_norm": 4.246333451645234, "language_loss": 0.7272895, "learning_rate": 3.0333164041041283e-06, "loss": 0.74920505, "num_input_tokens_seen": 62072180, "step": 2889, "time_per_iteration": 2.5323047637939453 }, { "auxiliary_loss_clip": 0.01121529, "auxiliary_loss_mlp": 0.01025175, "balance_loss_clip": 1.04906702, "balance_loss_mlp": 1.01687253, "epoch": 0.34750195394697286, "flos": 22346025644160.0, "grad_norm": 2.175184053088743, "language_loss": 0.71810549, "learning_rate": 3.032649376651228e-06, "loss": 0.73957253, "num_input_tokens_seen": 62091600, "step": 2890, "time_per_iteration": 2.6042559146881104 }, { "auxiliary_loss_clip": 0.01150244, "auxiliary_loss_mlp": 0.01027838, "balance_loss_clip": 1.05282807, "balance_loss_mlp": 1.01841486, "epoch": 0.347622196837612, "flos": 29095004885760.0, "grad_norm": 3.4002649271359715, "language_loss": 0.75715297, "learning_rate": 3.031982192546238e-06, "loss": 0.77893382, "num_input_tokens_seen": 62114695, "step": 2891, "time_per_iteration": 2.593083381652832 }, { "auxiliary_loss_clip": 0.01178937, "auxiliary_loss_mlp": 0.01032835, "balance_loss_clip": 1.05558956, "balance_loss_mlp": 1.02431178, "epoch": 0.3477424397282511, "flos": 22455732758400.0, "grad_norm": 1.996430127765452, "language_loss": 0.94606566, "learning_rate": 3.0313148518903696e-06, "loss": 0.9681834, "num_input_tokens_seen": 62134520, "step": 2892, "time_per_iteration": 2.484234571456909 }, { "auxiliary_loss_clip": 0.01167416, "auxiliary_loss_mlp": 0.01028234, "balance_loss_clip": 1.05686927, "balance_loss_mlp": 1.01939476, "epoch": 0.34786268261889014, "flos": 15778790242560.0, "grad_norm": 2.0046582628429657, "language_loss": 0.81266308, "learning_rate": 3.030647354784859e-06, "loss": 0.83461958, "num_input_tokens_seen": 62151560, "step": 2893, "time_per_iteration": 2.498206377029419 }, { "auxiliary_loss_clip": 0.01150919, "auxiliary_loss_mlp": 0.01033242, "balance_loss_clip": 1.05435252, "balance_loss_mlp": 1.02456331, "epoch": 0.34798292550952925, "flos": 20777627214720.0, "grad_norm": 1.7405890773186343, "language_loss": 0.76891041, "learning_rate": 3.029979701330964e-06, "loss": 0.79075205, "num_input_tokens_seen": 62170985, "step": 2894, "time_per_iteration": 2.5509817600250244 }, { "auxiliary_loss_clip": 0.0116866, "auxiliary_loss_mlp": 0.01032342, "balance_loss_clip": 1.05564237, "balance_loss_mlp": 1.023646, "epoch": 0.34810316840016836, "flos": 19937820257280.0, "grad_norm": 2.271278945655884, "language_loss": 0.80084401, "learning_rate": 3.029311891629966e-06, "loss": 0.82285404, "num_input_tokens_seen": 62189440, "step": 2895, "time_per_iteration": 2.5065693855285645 }, { "auxiliary_loss_clip": 0.01159761, "auxiliary_loss_mlp": 0.01036377, "balance_loss_clip": 1.05475485, "balance_loss_mlp": 1.02717996, "epoch": 0.3482234112908074, "flos": 23623296341760.0, "grad_norm": 1.7273877714629253, "language_loss": 0.73963678, "learning_rate": 3.0286439257831744e-06, "loss": 0.76159817, "num_input_tokens_seen": 62208910, "step": 2896, "time_per_iteration": 3.395796298980713 }, { "auxiliary_loss_clip": 0.01198129, "auxiliary_loss_mlp": 0.01037214, "balance_loss_clip": 1.05938578, "balance_loss_mlp": 1.02655709, "epoch": 0.3483436541814465, "flos": 23986712194560.0, "grad_norm": 3.16386010005365, "language_loss": 0.71699721, "learning_rate": 3.0279758038919156e-06, "loss": 0.73935062, "num_input_tokens_seen": 62227135, "step": 2897, "time_per_iteration": 2.4587204456329346 }, { "auxiliary_loss_clip": 0.01179106, "auxiliary_loss_mlp": 0.01032963, "balance_loss_clip": 1.05689836, "balance_loss_mlp": 1.02384305, "epoch": 0.34846389707208564, "flos": 22638338524800.0, "grad_norm": 1.6537049686488743, "language_loss": 0.77854073, "learning_rate": 3.0273075260575455e-06, "loss": 0.80066139, "num_input_tokens_seen": 62246035, "step": 2898, "time_per_iteration": 2.4945077896118164 }, { "auxiliary_loss_clip": 0.01166885, "auxiliary_loss_mlp": 0.01035079, "balance_loss_clip": 1.05392015, "balance_loss_mlp": 1.02496982, "epoch": 0.3485841399627247, "flos": 21792857218560.0, "grad_norm": 1.8870563275997037, "language_loss": 0.80711806, "learning_rate": 3.0266390923814396e-06, "loss": 0.82913768, "num_input_tokens_seen": 62264095, "step": 2899, "time_per_iteration": 2.508171319961548 }, { "auxiliary_loss_clip": 0.01170637, "auxiliary_loss_mlp": 0.01037961, "balance_loss_clip": 1.06040359, "balance_loss_mlp": 1.02800679, "epoch": 0.3487043828533638, "flos": 17019036996480.0, "grad_norm": 1.8152172148819326, "language_loss": 0.82229769, "learning_rate": 3.0259705029650008e-06, "loss": 0.8443836, "num_input_tokens_seen": 62282025, "step": 2900, "time_per_iteration": 2.485881805419922 }, { "auxiliary_loss_clip": 0.01180616, "auxiliary_loss_mlp": 0.01026141, "balance_loss_clip": 1.05577207, "balance_loss_mlp": 1.01788545, "epoch": 0.34882462574400286, "flos": 22601135013120.0, "grad_norm": 1.6252599133425325, "language_loss": 0.72880208, "learning_rate": 3.025301757909652e-06, "loss": 0.75086969, "num_input_tokens_seen": 62302220, "step": 2901, "time_per_iteration": 2.4926414489746094 }, { "auxiliary_loss_clip": 0.0115377, "auxiliary_loss_mlp": 0.00764605, "balance_loss_clip": 1.05450368, "balance_loss_mlp": 1.00065589, "epoch": 0.34894486863464197, "flos": 29861518141440.0, "grad_norm": 1.783490674131647, "language_loss": 0.80567622, "learning_rate": 3.024632857316842e-06, "loss": 0.82485998, "num_input_tokens_seen": 62323535, "step": 2902, "time_per_iteration": 4.201333999633789 }, { "auxiliary_loss_clip": 0.0118364, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.06009841, "balance_loss_mlp": 1.01966381, "epoch": 0.3490651115252811, "flos": 22122265870080.0, "grad_norm": 1.919950527440679, "language_loss": 0.77397877, "learning_rate": 3.0239638012880412e-06, "loss": 0.79610479, "num_input_tokens_seen": 62343430, "step": 2903, "time_per_iteration": 3.2788820266723633 }, { "auxiliary_loss_clip": 0.0112799, "auxiliary_loss_mlp": 0.01031676, "balance_loss_clip": 1.04885983, "balance_loss_mlp": 1.02212143, "epoch": 0.34918535441592014, "flos": 12676682943360.0, "grad_norm": 2.3410715635561457, "language_loss": 0.81419742, "learning_rate": 3.0232945899247466e-06, "loss": 0.83579409, "num_input_tokens_seen": 62360365, "step": 2904, "time_per_iteration": 2.7015492916107178 }, { "auxiliary_loss_clip": 0.01178911, "auxiliary_loss_mlp": 0.01037029, "balance_loss_clip": 1.05403042, "balance_loss_mlp": 1.02729511, "epoch": 0.34930559730655925, "flos": 23185617120000.0, "grad_norm": 1.776881880303038, "language_loss": 0.77339458, "learning_rate": 3.022625223328476e-06, "loss": 0.79555392, "num_input_tokens_seen": 62382105, "step": 2905, "time_per_iteration": 2.6516213417053223 }, { "auxiliary_loss_clip": 0.01186029, "auxiliary_loss_mlp": 0.01035203, "balance_loss_clip": 1.05825877, "balance_loss_mlp": 1.02438486, "epoch": 0.34942584019719836, "flos": 22855023319680.0, "grad_norm": 1.393738410815465, "language_loss": 0.69099772, "learning_rate": 3.0219557016007723e-06, "loss": 0.71321005, "num_input_tokens_seen": 62402235, "step": 2906, "time_per_iteration": 2.589181661605835 }, { "auxiliary_loss_clip": 0.01176101, "auxiliary_loss_mlp": 0.01030392, "balance_loss_clip": 1.05693066, "balance_loss_mlp": 1.02091467, "epoch": 0.3495460830878374, "flos": 24426043441920.0, "grad_norm": 1.8369676914442659, "language_loss": 0.6969316, "learning_rate": 3.021286024843202e-06, "loss": 0.71899647, "num_input_tokens_seen": 62420430, "step": 2907, "time_per_iteration": 2.490055799484253 }, { "auxiliary_loss_clip": 0.0109323, "auxiliary_loss_mlp": 0.01002864, "balance_loss_clip": 1.02513075, "balance_loss_mlp": 1.00180876, "epoch": 0.3496663259784765, "flos": 70008749389440.0, "grad_norm": 1.0726586884330434, "language_loss": 0.6485374, "learning_rate": 3.0206161931573526e-06, "loss": 0.66949832, "num_input_tokens_seen": 62472980, "step": 2908, "time_per_iteration": 2.960287570953369 }, { "auxiliary_loss_clip": 0.01160997, "auxiliary_loss_mlp": 0.01035073, "balance_loss_clip": 1.05074692, "balance_loss_mlp": 1.0265795, "epoch": 0.34978656886911563, "flos": 28692805322880.0, "grad_norm": 1.603290085295026, "language_loss": 0.92862695, "learning_rate": 3.0199462066448388e-06, "loss": 0.95058757, "num_input_tokens_seen": 62495175, "step": 2909, "time_per_iteration": 2.577993392944336 }, { "auxiliary_loss_clip": 0.01181659, "auxiliary_loss_mlp": 0.01030685, "balance_loss_clip": 1.05983531, "balance_loss_mlp": 1.02118409, "epoch": 0.3499068117597547, "flos": 21142156389120.0, "grad_norm": 1.8314536908633274, "language_loss": 0.69664884, "learning_rate": 3.019276065407296e-06, "loss": 0.71877223, "num_input_tokens_seen": 62514295, "step": 2910, "time_per_iteration": 2.47017765045166 }, { "auxiliary_loss_clip": 0.01138413, "auxiliary_loss_mlp": 0.01035426, "balance_loss_clip": 1.05168366, "balance_loss_mlp": 1.02610993, "epoch": 0.3500270546503938, "flos": 22782699285120.0, "grad_norm": 1.8468371536414188, "language_loss": 0.80348551, "learning_rate": 3.018605769546385e-06, "loss": 0.82522392, "num_input_tokens_seen": 62534850, "step": 2911, "time_per_iteration": 2.575129508972168 }, { "auxiliary_loss_clip": 0.0117745, "auxiliary_loss_mlp": 0.01034137, "balance_loss_clip": 1.05382395, "balance_loss_mlp": 1.0245049, "epoch": 0.3501472975410329, "flos": 22894058424960.0, "grad_norm": 1.804305570846599, "language_loss": 0.79593498, "learning_rate": 3.017935319163788e-06, "loss": 0.81805086, "num_input_tokens_seen": 62553810, "step": 2912, "time_per_iteration": 2.48646879196167 }, { "auxiliary_loss_clip": 0.01180931, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.05763423, "balance_loss_mlp": 1.0240376, "epoch": 0.35026754043167196, "flos": 25446588658560.0, "grad_norm": 3.3857023562322914, "language_loss": 0.70608127, "learning_rate": 3.017264714361213e-06, "loss": 0.72823489, "num_input_tokens_seen": 62573460, "step": 2913, "time_per_iteration": 2.4986729621887207 }, { "auxiliary_loss_clip": 0.01164549, "auxiliary_loss_mlp": 0.00764494, "balance_loss_clip": 1.05506444, "balance_loss_mlp": 1.00079739, "epoch": 0.3503877833223111, "flos": 19573757959680.0, "grad_norm": 1.8962038542750148, "language_loss": 0.82142198, "learning_rate": 3.016593955240389e-06, "loss": 0.84071243, "num_input_tokens_seen": 62592150, "step": 2914, "time_per_iteration": 2.5073235034942627 }, { "auxiliary_loss_clip": 0.01079893, "auxiliary_loss_mlp": 0.010011, "balance_loss_clip": 1.02345228, "balance_loss_mlp": 1.0000447, "epoch": 0.3505080262129502, "flos": 65072075880960.0, "grad_norm": 0.822748475037358, "language_loss": 0.63760668, "learning_rate": 3.015923041903071e-06, "loss": 0.65841663, "num_input_tokens_seen": 62658275, "step": 2915, "time_per_iteration": 3.100501537322998 }, { "auxiliary_loss_clip": 0.01181317, "auxiliary_loss_mlp": 0.01030381, "balance_loss_clip": 1.06030369, "balance_loss_mlp": 1.02055824, "epoch": 0.35062826910358924, "flos": 29314562768640.0, "grad_norm": 2.3166085606717055, "language_loss": 0.8339988, "learning_rate": 3.0152519744510347e-06, "loss": 0.85611582, "num_input_tokens_seen": 62678075, "step": 2916, "time_per_iteration": 2.5334339141845703 }, { "auxiliary_loss_clip": 0.01152459, "auxiliary_loss_mlp": 0.01033822, "balance_loss_clip": 1.05300522, "balance_loss_mlp": 1.0243094, "epoch": 0.35074851199422835, "flos": 23987717775360.0, "grad_norm": 1.8136290003301474, "language_loss": 0.82546228, "learning_rate": 3.014580752986081e-06, "loss": 0.84732509, "num_input_tokens_seen": 62696950, "step": 2917, "time_per_iteration": 2.5631556510925293 }, { "auxiliary_loss_clip": 0.01139605, "auxiliary_loss_mlp": 0.01035894, "balance_loss_clip": 1.05364418, "balance_loss_mlp": 1.02705419, "epoch": 0.3508687548848674, "flos": 15224436668160.0, "grad_norm": 2.3067430055861204, "language_loss": 0.78285944, "learning_rate": 3.0139093776100345e-06, "loss": 0.80461442, "num_input_tokens_seen": 62713540, "step": 2918, "time_per_iteration": 2.5434465408325195 }, { "auxiliary_loss_clip": 0.01190736, "auxiliary_loss_mlp": 0.01029792, "balance_loss_clip": 1.05679631, "balance_loss_mlp": 1.02061892, "epoch": 0.3509889977755065, "flos": 21361750185600.0, "grad_norm": 1.7939210097053386, "language_loss": 0.75197208, "learning_rate": 3.013237848424741e-06, "loss": 0.77417737, "num_input_tokens_seen": 62732925, "step": 2919, "time_per_iteration": 2.4598734378814697 }, { "auxiliary_loss_clip": 0.01166497, "auxiliary_loss_mlp": 0.01034552, "balance_loss_clip": 1.05639613, "balance_loss_mlp": 1.0256592, "epoch": 0.35110924066614563, "flos": 19135360465920.0, "grad_norm": 4.647650362100503, "language_loss": 0.75478858, "learning_rate": 3.012566165532072e-06, "loss": 0.77679908, "num_input_tokens_seen": 62751715, "step": 2920, "time_per_iteration": 2.514832019805908 }, { "auxiliary_loss_clip": 0.01129612, "auxiliary_loss_mlp": 0.01034139, "balance_loss_clip": 1.05160165, "balance_loss_mlp": 1.02517498, "epoch": 0.3512294835567847, "flos": 21980885938560.0, "grad_norm": 1.9924305951716799, "language_loss": 0.76581919, "learning_rate": 3.0118943290339207e-06, "loss": 0.78745663, "num_input_tokens_seen": 62771925, "step": 2921, "time_per_iteration": 2.6265459060668945 }, { "auxiliary_loss_clip": 0.01142973, "auxiliary_loss_mlp": 0.01031763, "balance_loss_clip": 1.0510267, "balance_loss_mlp": 1.02229762, "epoch": 0.3513497264474238, "flos": 17817294896640.0, "grad_norm": 2.1042732354260294, "language_loss": 0.68369895, "learning_rate": 3.011222339032204e-06, "loss": 0.7054463, "num_input_tokens_seen": 62790075, "step": 2922, "time_per_iteration": 2.5352516174316406 }, { "auxiliary_loss_clip": 0.01191693, "auxiliary_loss_mlp": 0.01037479, "balance_loss_clip": 1.05668116, "balance_loss_mlp": 1.02787066, "epoch": 0.3514699693380629, "flos": 26943417239040.0, "grad_norm": 1.7984535259243666, "language_loss": 0.69699448, "learning_rate": 3.0105501956288626e-06, "loss": 0.7192862, "num_input_tokens_seen": 62810545, "step": 2923, "time_per_iteration": 3.3468363285064697 }, { "auxiliary_loss_clip": 0.0118585, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.05740762, "balance_loss_mlp": 1.02428055, "epoch": 0.35159021222870196, "flos": 15267565923840.0, "grad_norm": 1.9478105332390803, "language_loss": 0.72798163, "learning_rate": 3.0098778989258602e-06, "loss": 0.7501806, "num_input_tokens_seen": 62829155, "step": 2924, "time_per_iteration": 2.454888105392456 }, { "auxiliary_loss_clip": 0.01145254, "auxiliary_loss_mlp": 0.01036472, "balance_loss_clip": 1.05203819, "balance_loss_mlp": 1.02738249, "epoch": 0.35171045511934107, "flos": 13984154000640.0, "grad_norm": 2.951258086630555, "language_loss": 0.8815009, "learning_rate": 3.009205449025183e-06, "loss": 0.90331817, "num_input_tokens_seen": 62845350, "step": 2925, "time_per_iteration": 2.51007080078125 }, { "auxiliary_loss_clip": 0.01146694, "auxiliary_loss_mlp": 0.01033294, "balance_loss_clip": 1.05057228, "balance_loss_mlp": 1.02404332, "epoch": 0.3518306980099802, "flos": 14283434119680.0, "grad_norm": 1.9469640914707576, "language_loss": 0.63222831, "learning_rate": 3.008532846028842e-06, "loss": 0.65402818, "num_input_tokens_seen": 62862110, "step": 2926, "time_per_iteration": 2.513120412826538 }, { "auxiliary_loss_clip": 0.01197455, "auxiliary_loss_mlp": 0.01037714, "balance_loss_clip": 1.06008434, "balance_loss_mlp": 1.02777159, "epoch": 0.35195094090061924, "flos": 27052872958080.0, "grad_norm": 2.6667906545515296, "language_loss": 0.72278976, "learning_rate": 3.0078600900388694e-06, "loss": 0.74514151, "num_input_tokens_seen": 62882415, "step": 2927, "time_per_iteration": 2.4651401042938232 }, { "auxiliary_loss_clip": 0.01138861, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.04865503, "balance_loss_mlp": 1.02143335, "epoch": 0.35207118379125835, "flos": 25629266252160.0, "grad_norm": 1.8224458948767925, "language_loss": 0.73845279, "learning_rate": 3.007187181157323e-06, "loss": 0.76015192, "num_input_tokens_seen": 62902425, "step": 2928, "time_per_iteration": 2.5711920261383057 }, { "auxiliary_loss_clip": 0.0111219, "auxiliary_loss_mlp": 0.01028789, "balance_loss_clip": 1.04720783, "balance_loss_mlp": 1.01982498, "epoch": 0.35219142668189746, "flos": 18004713085440.0, "grad_norm": 2.219903385204011, "language_loss": 0.68447924, "learning_rate": 3.006514119486282e-06, "loss": 0.70588905, "num_input_tokens_seen": 62919255, "step": 2929, "time_per_iteration": 4.204680681228638 }, { "auxiliary_loss_clip": 0.01142461, "auxiliary_loss_mlp": 0.01027718, "balance_loss_clip": 1.05109572, "balance_loss_mlp": 1.01898611, "epoch": 0.3523116695725365, "flos": 14028109269120.0, "grad_norm": 1.7345613065517642, "language_loss": 0.69631529, "learning_rate": 3.005840905127849e-06, "loss": 0.7180171, "num_input_tokens_seen": 62936160, "step": 2930, "time_per_iteration": 2.516960382461548 }, { "auxiliary_loss_clip": 0.01193654, "auxiliary_loss_mlp": 0.00764166, "balance_loss_clip": 1.06096816, "balance_loss_mlp": 1.00087798, "epoch": 0.3524319124631756, "flos": 21433966479360.0, "grad_norm": 2.3048826319079865, "language_loss": 0.87105119, "learning_rate": 3.0051675381841516e-06, "loss": 0.89062941, "num_input_tokens_seen": 62953470, "step": 2931, "time_per_iteration": 2.4634788036346436 }, { "auxiliary_loss_clip": 0.01106299, "auxiliary_loss_mlp": 0.01035957, "balance_loss_clip": 1.04673111, "balance_loss_mlp": 1.02665281, "epoch": 0.3525521553538147, "flos": 26322773114880.0, "grad_norm": 1.600391766366554, "language_loss": 0.76800299, "learning_rate": 3.0044940187573363e-06, "loss": 0.78942549, "num_input_tokens_seen": 62974480, "step": 2932, "time_per_iteration": 2.670949935913086 }, { "auxiliary_loss_clip": 0.01182458, "auxiliary_loss_mlp": 0.01034208, "balance_loss_clip": 1.05614138, "balance_loss_mlp": 1.02508903, "epoch": 0.3526723982444538, "flos": 21543314457600.0, "grad_norm": 1.7523460779030162, "language_loss": 0.65343928, "learning_rate": 3.003820346949578e-06, "loss": 0.67560595, "num_input_tokens_seen": 62992560, "step": 2933, "time_per_iteration": 2.576967477798462 }, { "auxiliary_loss_clip": 0.01192051, "auxiliary_loss_mlp": 0.01033635, "balance_loss_clip": 1.05648708, "balance_loss_mlp": 1.02417004, "epoch": 0.3527926411350929, "flos": 23733649900800.0, "grad_norm": 1.8096476756193962, "language_loss": 0.79540652, "learning_rate": 3.003146522863071e-06, "loss": 0.81766337, "num_input_tokens_seen": 63013445, "step": 2934, "time_per_iteration": 2.5080907344818115 }, { "auxiliary_loss_clip": 0.0116387, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.05665207, "balance_loss_mlp": 1.02315843, "epoch": 0.35291288402573195, "flos": 30445461544320.0, "grad_norm": 2.1890740253179786, "language_loss": 0.85630852, "learning_rate": 3.0024725466000345e-06, "loss": 0.87826431, "num_input_tokens_seen": 63033400, "step": 2935, "time_per_iteration": 2.5992751121520996 }, { "auxiliary_loss_clip": 0.01179127, "auxiliary_loss_mlp": 0.01027721, "balance_loss_clip": 1.05850887, "balance_loss_mlp": 1.01947129, "epoch": 0.35303312691637107, "flos": 23112179763840.0, "grad_norm": 1.662825815363589, "language_loss": 0.78672743, "learning_rate": 3.0017984182627087e-06, "loss": 0.80879587, "num_input_tokens_seen": 63052725, "step": 2936, "time_per_iteration": 2.496722459793091 }, { "auxiliary_loss_clip": 0.01148166, "auxiliary_loss_mlp": 0.00764392, "balance_loss_clip": 1.0506084, "balance_loss_mlp": 1.00084865, "epoch": 0.3531533698070102, "flos": 21835699165440.0, "grad_norm": 18.760500707031348, "language_loss": 0.82230437, "learning_rate": 3.00112413795336e-06, "loss": 0.84143001, "num_input_tokens_seen": 63072560, "step": 2937, "time_per_iteration": 2.68031644821167 }, { "auxiliary_loss_clip": 0.01159717, "auxiliary_loss_mlp": 0.01032299, "balance_loss_clip": 1.05035639, "balance_loss_mlp": 1.02319145, "epoch": 0.35327361269764923, "flos": 15778969810560.0, "grad_norm": 1.8526473721634917, "language_loss": 0.8030709, "learning_rate": 3.000449705774275e-06, "loss": 0.82499111, "num_input_tokens_seen": 63090800, "step": 2938, "time_per_iteration": 2.502084732055664 }, { "auxiliary_loss_clip": 0.01179959, "auxiliary_loss_mlp": 0.01027559, "balance_loss_clip": 1.05810738, "balance_loss_mlp": 1.01852274, "epoch": 0.35339385558828834, "flos": 22090413484800.0, "grad_norm": 2.5738479897557567, "language_loss": 0.71376145, "learning_rate": 2.9997751218277654e-06, "loss": 0.73583663, "num_input_tokens_seen": 63108955, "step": 2939, "time_per_iteration": 2.479315996170044 }, { "auxiliary_loss_clip": 0.01195111, "auxiliary_loss_mlp": 0.01031689, "balance_loss_clip": 1.06016338, "balance_loss_mlp": 1.02243233, "epoch": 0.35351409847892745, "flos": 24165008328960.0, "grad_norm": 1.883787022771459, "language_loss": 0.77750146, "learning_rate": 2.999100386216166e-06, "loss": 0.79976946, "num_input_tokens_seen": 63127895, "step": 2940, "time_per_iteration": 2.445446729660034 }, { "auxiliary_loss_clip": 0.01164061, "auxiliary_loss_mlp": 0.01030784, "balance_loss_clip": 1.05529094, "balance_loss_mlp": 1.02202201, "epoch": 0.3536343413695665, "flos": 27052298340480.0, "grad_norm": 1.8192266325998778, "language_loss": 0.7428627, "learning_rate": 2.998425499041831e-06, "loss": 0.76481116, "num_input_tokens_seen": 63148410, "step": 2941, "time_per_iteration": 2.552973508834839 }, { "auxiliary_loss_clip": 0.01077976, "auxiliary_loss_mlp": 0.01001691, "balance_loss_clip": 1.02188063, "balance_loss_mlp": 1.00062418, "epoch": 0.3537545842602056, "flos": 65991066370560.0, "grad_norm": 1.3645667159315482, "language_loss": 0.64607334, "learning_rate": 2.997750460407142e-06, "loss": 0.66687, "num_input_tokens_seen": 63209765, "step": 2942, "time_per_iteration": 3.092388868331909 }, { "auxiliary_loss_clip": 0.01150851, "auxiliary_loss_mlp": 0.01028865, "balance_loss_clip": 1.04984069, "balance_loss_mlp": 1.01954281, "epoch": 0.35387482715084473, "flos": 18436897526400.0, "grad_norm": 2.0288066105800926, "language_loss": 0.70046389, "learning_rate": 2.997075270414501e-06, "loss": 0.72226107, "num_input_tokens_seen": 63226980, "step": 2943, "time_per_iteration": 2.515299081802368 }, { "auxiliary_loss_clip": 0.01068478, "auxiliary_loss_mlp": 0.01000876, "balance_loss_clip": 1.02372503, "balance_loss_mlp": 0.9996959, "epoch": 0.3539950700414838, "flos": 65588579498880.0, "grad_norm": 0.6984994446207212, "language_loss": 0.57756382, "learning_rate": 2.9963999291663347e-06, "loss": 0.59825736, "num_input_tokens_seen": 63292760, "step": 2944, "time_per_iteration": 3.091273784637451 }, { "auxiliary_loss_clip": 0.01138614, "auxiliary_loss_mlp": 0.01031943, "balance_loss_clip": 1.0551548, "balance_loss_mlp": 1.02364039, "epoch": 0.3541153129321229, "flos": 20521655919360.0, "grad_norm": 2.0815134501411916, "language_loss": 0.73934811, "learning_rate": 2.9957244367650915e-06, "loss": 0.76105368, "num_input_tokens_seen": 63309005, "step": 2945, "time_per_iteration": 2.538546323776245 }, { "auxiliary_loss_clip": 0.01131511, "auxiliary_loss_mlp": 0.01029031, "balance_loss_clip": 1.05332553, "balance_loss_mlp": 1.0197742, "epoch": 0.354235555822762, "flos": 19573578391680.0, "grad_norm": 1.7673821560908987, "language_loss": 0.83683157, "learning_rate": 2.9950487933132425e-06, "loss": 0.85843694, "num_input_tokens_seen": 63326420, "step": 2946, "time_per_iteration": 2.5497591495513916 }, { "auxiliary_loss_clip": 0.01180689, "auxiliary_loss_mlp": 0.01030523, "balance_loss_clip": 1.05506551, "balance_loss_mlp": 1.02214837, "epoch": 0.35435579871340106, "flos": 20777268078720.0, "grad_norm": 2.542723600213033, "language_loss": 0.71537042, "learning_rate": 2.994372998913283e-06, "loss": 0.73748249, "num_input_tokens_seen": 63344925, "step": 2947, "time_per_iteration": 2.4698736667633057 }, { "auxiliary_loss_clip": 0.01165576, "auxiliary_loss_mlp": 0.0103328, "balance_loss_clip": 1.05880082, "balance_loss_mlp": 1.02449429, "epoch": 0.35447604160404017, "flos": 23951807153280.0, "grad_norm": 2.848637827391769, "language_loss": 0.62397897, "learning_rate": 2.99369705366773e-06, "loss": 0.64596748, "num_input_tokens_seen": 63365170, "step": 2948, "time_per_iteration": 2.5240981578826904 }, { "auxiliary_loss_clip": 0.01161566, "auxiliary_loss_mlp": 0.01028654, "balance_loss_clip": 1.05478275, "balance_loss_mlp": 1.01985049, "epoch": 0.3545962844946792, "flos": 23435662671360.0, "grad_norm": 2.057421355231936, "language_loss": 0.82543683, "learning_rate": 2.9930209576791244e-06, "loss": 0.84733903, "num_input_tokens_seen": 63383645, "step": 2949, "time_per_iteration": 2.5266072750091553 }, { "auxiliary_loss_clip": 0.01176316, "auxiliary_loss_mlp": 0.01032345, "balance_loss_clip": 1.05739665, "balance_loss_mlp": 1.02411985, "epoch": 0.35471652738531834, "flos": 22085134185600.0, "grad_norm": 1.9203970493867377, "language_loss": 0.63537633, "learning_rate": 2.9923447110500285e-06, "loss": 0.65746289, "num_input_tokens_seen": 63402390, "step": 2950, "time_per_iteration": 3.2974531650543213 }, { "auxiliary_loss_clip": 0.01166821, "auxiliary_loss_mlp": 0.01037163, "balance_loss_clip": 1.05480599, "balance_loss_mlp": 1.02830553, "epoch": 0.35483677027595745, "flos": 27341881787520.0, "grad_norm": 2.0659287671097832, "language_loss": 0.75342083, "learning_rate": 2.9916683138830295e-06, "loss": 0.77546066, "num_input_tokens_seen": 63423055, "step": 2951, "time_per_iteration": 2.5069446563720703 }, { "auxiliary_loss_clip": 0.01161715, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.05561662, "balance_loss_mlp": 1.02758908, "epoch": 0.3549570131665965, "flos": 13516166678400.0, "grad_norm": 1.781025458989053, "language_loss": 0.80889213, "learning_rate": 2.9909917662807353e-06, "loss": 0.83087969, "num_input_tokens_seen": 63440855, "step": 2952, "time_per_iteration": 2.465482234954834 }, { "auxiliary_loss_clip": 0.01174228, "auxiliary_loss_mlp": 0.01039138, "balance_loss_clip": 1.05458212, "balance_loss_mlp": 1.03014016, "epoch": 0.3550772560572356, "flos": 20887549810560.0, "grad_norm": 2.237762260331911, "language_loss": 0.69040287, "learning_rate": 2.9903150683457783e-06, "loss": 0.71253651, "num_input_tokens_seen": 63459400, "step": 2953, "time_per_iteration": 2.4536101818084717 }, { "auxiliary_loss_clip": 0.01164352, "auxiliary_loss_mlp": 0.01030517, "balance_loss_clip": 1.05339575, "balance_loss_mlp": 1.02217865, "epoch": 0.3551974989478747, "flos": 20194042947840.0, "grad_norm": 1.8619957723256158, "language_loss": 0.64979893, "learning_rate": 2.9896382201808126e-06, "loss": 0.67174768, "num_input_tokens_seen": 63476800, "step": 2954, "time_per_iteration": 2.5076870918273926 }, { "auxiliary_loss_clip": 0.01192801, "auxiliary_loss_mlp": 0.01031973, "balance_loss_clip": 1.05734336, "balance_loss_mlp": 1.02325916, "epoch": 0.3553177418385138, "flos": 19828831415040.0, "grad_norm": 2.3509888501696268, "language_loss": 0.81204355, "learning_rate": 2.988961221888516e-06, "loss": 0.83429134, "num_input_tokens_seen": 63493475, "step": 2955, "time_per_iteration": 4.845149993896484 }, { "auxiliary_loss_clip": 0.01137827, "auxiliary_loss_mlp": 0.01028094, "balance_loss_clip": 1.04986143, "balance_loss_mlp": 1.01952302, "epoch": 0.3554379847291529, "flos": 14829132516480.0, "grad_norm": 3.1613313026581826, "language_loss": 0.78882468, "learning_rate": 2.988284073571589e-06, "loss": 0.81048393, "num_input_tokens_seen": 63509560, "step": 2956, "time_per_iteration": 2.496170997619629 }, { "auxiliary_loss_clip": 0.01180597, "auxiliary_loss_mlp": 0.00763554, "balance_loss_clip": 1.05845833, "balance_loss_mlp": 1.00084758, "epoch": 0.355558227619792, "flos": 20485350247680.0, "grad_norm": 2.170774794026298, "language_loss": 0.72462809, "learning_rate": 2.9876067753327528e-06, "loss": 0.74406964, "num_input_tokens_seen": 63527290, "step": 2957, "time_per_iteration": 2.477116346359253 }, { "auxiliary_loss_clip": 0.0118025, "auxiliary_loss_mlp": 0.01040496, "balance_loss_clip": 1.05509901, "balance_loss_mlp": 1.03102446, "epoch": 0.35567847051043106, "flos": 37663613256960.0, "grad_norm": 1.9891017048844266, "language_loss": 0.80440634, "learning_rate": 2.986929327274754e-06, "loss": 0.82661378, "num_input_tokens_seen": 63547870, "step": 2958, "time_per_iteration": 2.586879014968872 }, { "auxiliary_loss_clip": 0.01179266, "auxiliary_loss_mlp": 0.01033146, "balance_loss_clip": 1.0598433, "balance_loss_mlp": 1.02468848, "epoch": 0.35579871340107017, "flos": 26943058103040.0, "grad_norm": 1.682022100539393, "language_loss": 0.7870841, "learning_rate": 2.9862517295003617e-06, "loss": 0.80920827, "num_input_tokens_seen": 63568285, "step": 2959, "time_per_iteration": 2.517982244491577 }, { "auxiliary_loss_clip": 0.01143212, "auxiliary_loss_mlp": 0.01029474, "balance_loss_clip": 1.04818618, "balance_loss_mlp": 1.02129018, "epoch": 0.3559189562917093, "flos": 28293335193600.0, "grad_norm": 1.6866107263385866, "language_loss": 0.7256726, "learning_rate": 2.9855739821123654e-06, "loss": 0.74739945, "num_input_tokens_seen": 63589865, "step": 2960, "time_per_iteration": 2.5814740657806396 }, { "auxiliary_loss_clip": 0.01172072, "auxiliary_loss_mlp": 0.0102601, "balance_loss_clip": 1.05543816, "balance_loss_mlp": 1.01732564, "epoch": 0.35603919918234833, "flos": 25664063552640.0, "grad_norm": 1.795013740185745, "language_loss": 0.81711829, "learning_rate": 2.98489608521358e-06, "loss": 0.83909911, "num_input_tokens_seen": 63609805, "step": 2961, "time_per_iteration": 2.533064365386963 }, { "auxiliary_loss_clip": 0.01181756, "auxiliary_loss_mlp": 0.00763806, "balance_loss_clip": 1.05550539, "balance_loss_mlp": 1.00084913, "epoch": 0.35615944207298744, "flos": 23000856537600.0, "grad_norm": 1.914961886565478, "language_loss": 0.79416561, "learning_rate": 2.9842180389068425e-06, "loss": 0.81362116, "num_input_tokens_seen": 63627115, "step": 2962, "time_per_iteration": 2.484043598175049 }, { "auxiliary_loss_clip": 0.01058521, "auxiliary_loss_mlp": 0.01005315, "balance_loss_clip": 1.02967417, "balance_loss_mlp": 1.00359869, "epoch": 0.35627968496362655, "flos": 68251283723520.0, "grad_norm": 0.7663418270421275, "language_loss": 0.5926497, "learning_rate": 2.98353984329501e-06, "loss": 0.61328804, "num_input_tokens_seen": 63691460, "step": 2963, "time_per_iteration": 3.149522542953491 }, { "auxiliary_loss_clip": 0.01163954, "auxiliary_loss_mlp": 0.01029451, "balance_loss_clip": 1.05596471, "balance_loss_mlp": 1.02062416, "epoch": 0.3563999278542656, "flos": 22641714403200.0, "grad_norm": 1.728939880323328, "language_loss": 0.70818675, "learning_rate": 2.982861498480965e-06, "loss": 0.7301209, "num_input_tokens_seen": 63713840, "step": 2964, "time_per_iteration": 2.5470314025878906 }, { "auxiliary_loss_clip": 0.01143081, "auxiliary_loss_mlp": 0.0103367, "balance_loss_clip": 1.04901266, "balance_loss_mlp": 1.02518821, "epoch": 0.3565201707449047, "flos": 25952533678080.0, "grad_norm": 1.6449619186871334, "language_loss": 0.82461715, "learning_rate": 2.9821830045676122e-06, "loss": 0.84638464, "num_input_tokens_seen": 63733540, "step": 2965, "time_per_iteration": 2.5800325870513916 }, { "auxiliary_loss_clip": 0.01193302, "auxiliary_loss_mlp": 0.01032377, "balance_loss_clip": 1.05826831, "balance_loss_mlp": 1.02447963, "epoch": 0.3566404136355438, "flos": 28475725478400.0, "grad_norm": 1.9779485650648483, "language_loss": 0.72989655, "learning_rate": 2.9815043616578793e-06, "loss": 0.7521534, "num_input_tokens_seen": 63754335, "step": 2966, "time_per_iteration": 2.4848713874816895 }, { "auxiliary_loss_clip": 0.01143199, "auxiliary_loss_mlp": 0.0103879, "balance_loss_clip": 1.04881489, "balance_loss_mlp": 1.03006339, "epoch": 0.3567606565261829, "flos": 38363117690880.0, "grad_norm": 2.0899447486052254, "language_loss": 0.77082103, "learning_rate": 2.9808255698547145e-06, "loss": 0.79264092, "num_input_tokens_seen": 63777135, "step": 2967, "time_per_iteration": 2.7020087242126465 }, { "auxiliary_loss_clip": 0.0117731, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 1.05788755, "balance_loss_mlp": 1.02135789, "epoch": 0.356880899416822, "flos": 21981029592960.0, "grad_norm": 2.0127120577905826, "language_loss": 0.79628217, "learning_rate": 2.9801466292610913e-06, "loss": 0.81835496, "num_input_tokens_seen": 63797020, "step": 2968, "time_per_iteration": 2.4610321521759033 }, { "auxiliary_loss_clip": 0.01174793, "auxiliary_loss_mlp": 0.0102627, "balance_loss_clip": 1.05442548, "balance_loss_mlp": 1.01826167, "epoch": 0.35700114230746105, "flos": 18989132198400.0, "grad_norm": 2.046760480636488, "language_loss": 0.8116557, "learning_rate": 2.979467539980003e-06, "loss": 0.83366632, "num_input_tokens_seen": 63813810, "step": 2969, "time_per_iteration": 2.444014310836792 }, { "auxiliary_loss_clip": 0.01179484, "auxiliary_loss_mlp": 0.01036124, "balance_loss_clip": 1.05624914, "balance_loss_mlp": 1.0277735, "epoch": 0.35712138519810016, "flos": 19756112330880.0, "grad_norm": 2.07334719715132, "language_loss": 0.7704581, "learning_rate": 2.978788302114468e-06, "loss": 0.79261422, "num_input_tokens_seen": 63830925, "step": 2970, "time_per_iteration": 2.4472720623016357 }, { "auxiliary_loss_clip": 0.01174011, "auxiliary_loss_mlp": 0.0103306, "balance_loss_clip": 1.05488682, "balance_loss_mlp": 1.02411962, "epoch": 0.35724162808873927, "flos": 35183012008320.0, "grad_norm": 1.956911213403874, "language_loss": 0.83020657, "learning_rate": 2.9781089157675255e-06, "loss": 0.85227728, "num_input_tokens_seen": 63849385, "step": 2971, "time_per_iteration": 2.5607309341430664 }, { "auxiliary_loss_clip": 0.01172844, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.05708694, "balance_loss_mlp": 1.02533662, "epoch": 0.3573618709793783, "flos": 25556726736000.0, "grad_norm": 1.4080445607258598, "language_loss": 0.88316917, "learning_rate": 2.977429381042238e-06, "loss": 0.90523559, "num_input_tokens_seen": 63870060, "step": 2972, "time_per_iteration": 2.490654468536377 }, { "auxiliary_loss_clip": 0.01162222, "auxiliary_loss_mlp": 0.01027474, "balance_loss_clip": 1.05405188, "balance_loss_mlp": 1.01970732, "epoch": 0.35748211387001744, "flos": 29132352051840.0, "grad_norm": 2.1795992832952686, "language_loss": 0.88873875, "learning_rate": 2.9767496980416913e-06, "loss": 0.91063571, "num_input_tokens_seen": 63889355, "step": 2973, "time_per_iteration": 2.5249698162078857 }, { "auxiliary_loss_clip": 0.0115881, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.05237639, "balance_loss_mlp": 1.02408481, "epoch": 0.35760235676065655, "flos": 13954169122560.0, "grad_norm": 3.159873043654318, "language_loss": 0.80567002, "learning_rate": 2.9760698668689914e-06, "loss": 0.82759136, "num_input_tokens_seen": 63905580, "step": 2974, "time_per_iteration": 2.4530551433563232 }, { "auxiliary_loss_clip": 0.0117614, "auxiliary_loss_mlp": 0.01025103, "balance_loss_clip": 1.05403423, "balance_loss_mlp": 1.0172708, "epoch": 0.3577225996512956, "flos": 44018688977280.0, "grad_norm": 1.8480005907925756, "language_loss": 0.71325302, "learning_rate": 2.975389887627269e-06, "loss": 0.73526549, "num_input_tokens_seen": 63928180, "step": 2975, "time_per_iteration": 2.646256685256958 }, { "auxiliary_loss_clip": 0.01152168, "auxiliary_loss_mlp": 0.01033488, "balance_loss_clip": 1.05320835, "balance_loss_mlp": 1.02555513, "epoch": 0.3578428425419347, "flos": 17055199013760.0, "grad_norm": 2.228301652818403, "language_loss": 0.90113956, "learning_rate": 2.9747097604196764e-06, "loss": 0.9229961, "num_input_tokens_seen": 63944825, "step": 2976, "time_per_iteration": 3.3176894187927246 }, { "auxiliary_loss_clip": 0.01048579, "auxiliary_loss_mlp": 0.01002636, "balance_loss_clip": 1.02354014, "balance_loss_mlp": 1.00149727, "epoch": 0.3579630854325738, "flos": 71676550707840.0, "grad_norm": 0.6782342843040509, "language_loss": 0.56674582, "learning_rate": 2.9740294853493875e-06, "loss": 0.58725792, "num_input_tokens_seen": 64016385, "step": 2977, "time_per_iteration": 3.346116781234741 }, { "auxiliary_loss_clip": 0.01137286, "auxiliary_loss_mlp": 0.0102882, "balance_loss_clip": 1.04993582, "balance_loss_mlp": 1.02073145, "epoch": 0.3580833283232129, "flos": 25046651652480.0, "grad_norm": 2.0318924444903796, "language_loss": 0.67117083, "learning_rate": 2.9733490625196008e-06, "loss": 0.69283187, "num_input_tokens_seen": 64036245, "step": 2978, "time_per_iteration": 2.5833258628845215 }, { "auxiliary_loss_clip": 0.01134314, "auxiliary_loss_mlp": 0.01030004, "balance_loss_clip": 1.04870224, "balance_loss_mlp": 1.02261877, "epoch": 0.358203571213852, "flos": 13953127628160.0, "grad_norm": 3.068179925146427, "language_loss": 0.75730401, "learning_rate": 2.9726684920335353e-06, "loss": 0.77894723, "num_input_tokens_seen": 64054110, "step": 2979, "time_per_iteration": 2.497767210006714 }, { "auxiliary_loss_clip": 0.01192178, "auxiliary_loss_mlp": 0.00763745, "balance_loss_clip": 1.05593419, "balance_loss_mlp": 1.00068951, "epoch": 0.35832381410449105, "flos": 20302457172480.0, "grad_norm": 2.2464413454239467, "language_loss": 0.81978083, "learning_rate": 2.971987773994432e-06, "loss": 0.83934009, "num_input_tokens_seen": 64070295, "step": 2980, "time_per_iteration": 2.4605650901794434 }, { "auxiliary_loss_clip": 0.0116609, "auxiliary_loss_mlp": 0.01024595, "balance_loss_clip": 1.05126548, "balance_loss_mlp": 1.0166198, "epoch": 0.35844405699513016, "flos": 16983234115200.0, "grad_norm": 2.0029866291361884, "language_loss": 0.83118755, "learning_rate": 2.9713069085055566e-06, "loss": 0.8530944, "num_input_tokens_seen": 64088605, "step": 2981, "time_per_iteration": 2.476473331451416 }, { "auxiliary_loss_clip": 0.01146202, "auxiliary_loss_mlp": 0.01025839, "balance_loss_clip": 1.0511632, "balance_loss_mlp": 1.01742864, "epoch": 0.35856429988576927, "flos": 23216858974080.0, "grad_norm": 1.9629050954993847, "language_loss": 0.78956056, "learning_rate": 2.9706258956701958e-06, "loss": 0.81128097, "num_input_tokens_seen": 64108595, "step": 2982, "time_per_iteration": 4.155234098434448 }, { "auxiliary_loss_clip": 0.01177914, "auxiliary_loss_mlp": 0.01030959, "balance_loss_clip": 1.05516458, "balance_loss_mlp": 1.02272201, "epoch": 0.3586845427764083, "flos": 23034576430080.0, "grad_norm": 2.6952859822389312, "language_loss": 0.77377182, "learning_rate": 2.9699447355916575e-06, "loss": 0.79586053, "num_input_tokens_seen": 64127405, "step": 2983, "time_per_iteration": 2.6154489517211914 }, { "auxiliary_loss_clip": 0.01188941, "auxiliary_loss_mlp": 0.00763004, "balance_loss_clip": 1.05645323, "balance_loss_mlp": 1.00072801, "epoch": 0.35880478566704743, "flos": 20010682995840.0, "grad_norm": 2.0530629640322355, "language_loss": 0.74110591, "learning_rate": 2.969263428373275e-06, "loss": 0.76062536, "num_input_tokens_seen": 64145755, "step": 2984, "time_per_iteration": 2.435987710952759 }, { "auxiliary_loss_clip": 0.01162489, "auxiliary_loss_mlp": 0.01032509, "balance_loss_clip": 1.05279374, "balance_loss_mlp": 1.02428913, "epoch": 0.35892502855768654, "flos": 13699095667200.0, "grad_norm": 1.996497047089602, "language_loss": 0.78986418, "learning_rate": 2.9685819741184007e-06, "loss": 0.81181413, "num_input_tokens_seen": 64164195, "step": 2985, "time_per_iteration": 2.4754245281219482 }, { "auxiliary_loss_clip": 0.01140273, "auxiliary_loss_mlp": 0.01032343, "balance_loss_clip": 1.05050433, "balance_loss_mlp": 1.02383435, "epoch": 0.3590452714483256, "flos": 18114096977280.0, "grad_norm": 6.0183160916630785, "language_loss": 0.68161905, "learning_rate": 2.967900372930411e-06, "loss": 0.70334518, "num_input_tokens_seen": 64182705, "step": 2986, "time_per_iteration": 2.6570496559143066 }, { "auxiliary_loss_clip": 0.01154199, "auxiliary_loss_mlp": 0.01032721, "balance_loss_clip": 1.05138612, "balance_loss_mlp": 1.02426291, "epoch": 0.3591655143389647, "flos": 17749352321280.0, "grad_norm": 3.9236801382411315, "language_loss": 0.79233557, "learning_rate": 2.9672186249127046e-06, "loss": 0.81420475, "num_input_tokens_seen": 64202170, "step": 2987, "time_per_iteration": 2.469970703125 }, { "auxiliary_loss_clip": 0.01160333, "auxiliary_loss_mlp": 0.01036119, "balance_loss_clip": 1.05470395, "balance_loss_mlp": 1.02813792, "epoch": 0.3592857572296038, "flos": 25224409082880.0, "grad_norm": 1.9482356707264374, "language_loss": 0.78970307, "learning_rate": 2.9665367301687014e-06, "loss": 0.81166756, "num_input_tokens_seen": 64220415, "step": 2988, "time_per_iteration": 2.5062496662139893 }, { "auxiliary_loss_clip": 0.0115295, "auxiliary_loss_mlp": 0.01029108, "balance_loss_clip": 1.05173755, "balance_loss_mlp": 1.0210495, "epoch": 0.3594060001202429, "flos": 29384408764800.0, "grad_norm": 2.1563760533526044, "language_loss": 0.76961029, "learning_rate": 2.965854688801845e-06, "loss": 0.79143089, "num_input_tokens_seen": 64242475, "step": 2989, "time_per_iteration": 2.5602898597717285 }, { "auxiliary_loss_clip": 0.01170718, "auxiliary_loss_mlp": 0.0102655, "balance_loss_clip": 1.04943419, "balance_loss_mlp": 1.01852095, "epoch": 0.359526243010882, "flos": 17052900543360.0, "grad_norm": 1.925616359896462, "language_loss": 0.76317143, "learning_rate": 2.9651725009156005e-06, "loss": 0.78514421, "num_input_tokens_seen": 64260220, "step": 2990, "time_per_iteration": 2.4543933868408203 }, { "auxiliary_loss_clip": 0.0115328, "auxiliary_loss_mlp": 0.01032245, "balance_loss_clip": 1.04994416, "balance_loss_mlp": 1.02341127, "epoch": 0.3596464859015211, "flos": 22965089569920.0, "grad_norm": 1.5976815336725965, "language_loss": 0.74403501, "learning_rate": 2.964490166613454e-06, "loss": 0.76589024, "num_input_tokens_seen": 64280145, "step": 2991, "time_per_iteration": 2.511255979537964 }, { "auxiliary_loss_clip": 0.0108966, "auxiliary_loss_mlp": 0.01000824, "balance_loss_clip": 1.02266264, "balance_loss_mlp": 0.99978685, "epoch": 0.35976672879216015, "flos": 54739462590720.0, "grad_norm": 0.7590962906847478, "language_loss": 0.57717937, "learning_rate": 2.963807685998917e-06, "loss": 0.59808421, "num_input_tokens_seen": 64336010, "step": 2992, "time_per_iteration": 2.8447141647338867 }, { "auxiliary_loss_clip": 0.01135906, "auxiliary_loss_mlp": 0.01029592, "balance_loss_clip": 1.04918718, "balance_loss_mlp": 1.02121711, "epoch": 0.35988697168279926, "flos": 43139020901760.0, "grad_norm": 1.469671895656916, "language_loss": 0.77936363, "learning_rate": 2.9631250591755196e-06, "loss": 0.8010186, "num_input_tokens_seen": 64358725, "step": 2993, "time_per_iteration": 2.7509398460388184 }, { "auxiliary_loss_clip": 0.01156387, "auxiliary_loss_mlp": 0.01031661, "balance_loss_clip": 1.05406249, "balance_loss_mlp": 1.02260745, "epoch": 0.36000721457343837, "flos": 35845600239360.0, "grad_norm": 1.6352496961306806, "language_loss": 0.57549322, "learning_rate": 2.962442286246817e-06, "loss": 0.59737372, "num_input_tokens_seen": 64381555, "step": 2994, "time_per_iteration": 2.613842725753784 }, { "auxiliary_loss_clip": 0.01164761, "auxiliary_loss_mlp": 0.01029919, "balance_loss_clip": 1.05320179, "balance_loss_mlp": 1.02185488, "epoch": 0.3601274574640774, "flos": 18291100222080.0, "grad_norm": 2.1585049943428656, "language_loss": 0.69675332, "learning_rate": 2.9617593673163853e-06, "loss": 0.71870017, "num_input_tokens_seen": 64400375, "step": 2995, "time_per_iteration": 2.481973648071289 }, { "auxiliary_loss_clip": 0.01162208, "auxiliary_loss_mlp": 0.01024018, "balance_loss_clip": 1.0502317, "balance_loss_mlp": 1.01640046, "epoch": 0.36024770035471654, "flos": 13333955961600.0, "grad_norm": 2.2758988078744617, "language_loss": 0.77393389, "learning_rate": 2.9610763024878216e-06, "loss": 0.79579616, "num_input_tokens_seen": 64415880, "step": 2996, "time_per_iteration": 2.4539296627044678 }, { "auxiliary_loss_clip": 0.01153675, "auxiliary_loss_mlp": 0.01035445, "balance_loss_clip": 1.05115676, "balance_loss_mlp": 1.02703476, "epoch": 0.3603679432453556, "flos": 20267013427200.0, "grad_norm": 1.6758543819927192, "language_loss": 0.91451299, "learning_rate": 2.960393091864747e-06, "loss": 0.93640423, "num_input_tokens_seen": 64434260, "step": 2997, "time_per_iteration": 2.489513635635376 }, { "auxiliary_loss_clip": 0.01162383, "auxiliary_loss_mlp": 0.01024365, "balance_loss_clip": 1.05561626, "balance_loss_mlp": 1.01630664, "epoch": 0.3604881861359947, "flos": 22451135817600.0, "grad_norm": 1.8164362400208431, "language_loss": 0.74771369, "learning_rate": 2.959709735550804e-06, "loss": 0.76958114, "num_input_tokens_seen": 64453855, "step": 2998, "time_per_iteration": 2.49832820892334 }, { "auxiliary_loss_clip": 0.01133679, "auxiliary_loss_mlp": 0.01025359, "balance_loss_clip": 1.04901218, "balance_loss_mlp": 1.01730657, "epoch": 0.3606084290266338, "flos": 22054251467520.0, "grad_norm": 2.3783355842547937, "language_loss": 0.76083314, "learning_rate": 2.9590262336496575e-06, "loss": 0.78242362, "num_input_tokens_seen": 64473585, "step": 2999, "time_per_iteration": 2.5815391540527344 }, { "auxiliary_loss_clip": 0.01142755, "auxiliary_loss_mlp": 0.01034449, "balance_loss_clip": 1.05298042, "balance_loss_mlp": 1.02572334, "epoch": 0.36072867191727287, "flos": 15632921111040.0, "grad_norm": 2.0668966412379697, "language_loss": 0.85315132, "learning_rate": 2.9583425862649936e-06, "loss": 0.87492335, "num_input_tokens_seen": 64491720, "step": 3000, "time_per_iteration": 2.5043280124664307 }, { "auxiliary_loss_clip": 0.01190005, "auxiliary_loss_mlp": 0.01028325, "balance_loss_clip": 1.05618858, "balance_loss_mlp": 1.02005816, "epoch": 0.360848914807912, "flos": 19677000625920.0, "grad_norm": 1.9197797231320495, "language_loss": 0.73385042, "learning_rate": 2.9576587935005215e-06, "loss": 0.75603372, "num_input_tokens_seen": 64509800, "step": 3001, "time_per_iteration": 2.433746337890625 }, { "auxiliary_loss_clip": 0.01175934, "auxiliary_loss_mlp": 0.01028569, "balance_loss_clip": 1.05320883, "balance_loss_mlp": 1.02019477, "epoch": 0.3609691576985511, "flos": 18877808972160.0, "grad_norm": 2.2343689547062087, "language_loss": 0.71778381, "learning_rate": 2.9569748554599713e-06, "loss": 0.73982882, "num_input_tokens_seen": 64525410, "step": 3002, "time_per_iteration": 2.4360578060150146 }, { "auxiliary_loss_clip": 0.01161677, "auxiliary_loss_mlp": 0.01032589, "balance_loss_clip": 1.05496955, "balance_loss_mlp": 1.02442932, "epoch": 0.36108940058919015, "flos": 42224088648960.0, "grad_norm": 4.244675395062702, "language_loss": 0.73288316, "learning_rate": 2.956290772247097e-06, "loss": 0.75482583, "num_input_tokens_seen": 64544085, "step": 3003, "time_per_iteration": 3.464754343032837 }, { "auxiliary_loss_clip": 0.01124095, "auxiliary_loss_mlp": 0.01033909, "balance_loss_clip": 1.04968941, "balance_loss_mlp": 1.0259521, "epoch": 0.36120964347982926, "flos": 23185150243200.0, "grad_norm": 1.6944738342495917, "language_loss": 0.73045415, "learning_rate": 2.9556065439656724e-06, "loss": 0.75203419, "num_input_tokens_seen": 64563135, "step": 3004, "time_per_iteration": 2.5628418922424316 }, { "auxiliary_loss_clip": 0.01110862, "auxiliary_loss_mlp": 0.01029254, "balance_loss_clip": 1.04467463, "balance_loss_mlp": 1.02067125, "epoch": 0.36132988637046837, "flos": 18113055482880.0, "grad_norm": 2.3794617465351733, "language_loss": 0.81456739, "learning_rate": 2.9549221707194952e-06, "loss": 0.83596849, "num_input_tokens_seen": 64581985, "step": 3005, "time_per_iteration": 2.602739095687866 }, { "auxiliary_loss_clip": 0.01176126, "auxiliary_loss_mlp": 0.01025335, "balance_loss_clip": 1.05500448, "balance_loss_mlp": 1.01719332, "epoch": 0.3614501292611074, "flos": 27813101333760.0, "grad_norm": 1.8741744879628925, "language_loss": 0.72265118, "learning_rate": 2.954237652612384e-06, "loss": 0.74466574, "num_input_tokens_seen": 64601035, "step": 3006, "time_per_iteration": 2.5227699279785156 }, { "auxiliary_loss_clip": 0.01155368, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.05020416, "balance_loss_mlp": 1.01942468, "epoch": 0.36157037215174653, "flos": 22634926732800.0, "grad_norm": 1.8622469017009153, "language_loss": 0.84637547, "learning_rate": 2.9535529897481796e-06, "loss": 0.86820012, "num_input_tokens_seen": 64618580, "step": 3007, "time_per_iteration": 2.5105478763580322 }, { "auxiliary_loss_clip": 0.01190097, "auxiliary_loss_mlp": 0.0102716, "balance_loss_clip": 1.05491745, "balance_loss_mlp": 1.01880932, "epoch": 0.36169061504238564, "flos": 12600839376000.0, "grad_norm": 2.3578117338497746, "language_loss": 0.76721716, "learning_rate": 2.9528681822307446e-06, "loss": 0.78938973, "num_input_tokens_seen": 64635430, "step": 3008, "time_per_iteration": 3.230964422225952 }, { "auxiliary_loss_clip": 0.01172885, "auxiliary_loss_mlp": 0.00762667, "balance_loss_clip": 1.05761623, "balance_loss_mlp": 1.00062132, "epoch": 0.3618108579330247, "flos": 26684644682880.0, "grad_norm": 2.1050139023473884, "language_loss": 0.8221277, "learning_rate": 2.952183230163964e-06, "loss": 0.84148324, "num_input_tokens_seen": 64655005, "step": 3009, "time_per_iteration": 4.012263774871826 }, { "auxiliary_loss_clip": 0.01142407, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.05066669, "balance_loss_mlp": 1.01778948, "epoch": 0.3619311008236638, "flos": 22817029708800.0, "grad_norm": 1.8362056113763947, "language_loss": 0.72998989, "learning_rate": 2.9514981336517448e-06, "loss": 0.75166965, "num_input_tokens_seen": 64674775, "step": 3010, "time_per_iteration": 2.5612552165985107 }, { "auxiliary_loss_clip": 0.01174296, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.05639648, "balance_loss_mlp": 1.02302694, "epoch": 0.36205134371430286, "flos": 25919603884800.0, "grad_norm": 2.1586767954913344, "language_loss": 0.81476468, "learning_rate": 2.950812892798015e-06, "loss": 0.83682394, "num_input_tokens_seen": 64695670, "step": 3011, "time_per_iteration": 2.4976279735565186 }, { "auxiliary_loss_clip": 0.01129894, "auxiliary_loss_mlp": 0.00763099, "balance_loss_clip": 1.0525856, "balance_loss_mlp": 1.00069559, "epoch": 0.362171586604942, "flos": 26139592730880.0, "grad_norm": 7.740300828705119, "language_loss": 0.87423897, "learning_rate": 2.9501275077067256e-06, "loss": 0.89316893, "num_input_tokens_seen": 64716290, "step": 3012, "time_per_iteration": 2.5927493572235107 }, { "auxiliary_loss_clip": 0.01101595, "auxiliary_loss_mlp": 0.01024346, "balance_loss_clip": 1.04460859, "balance_loss_mlp": 1.01646662, "epoch": 0.3622918294955811, "flos": 28074208273920.0, "grad_norm": 1.4297100715115676, "language_loss": 0.88562202, "learning_rate": 2.949441978481848e-06, "loss": 0.90688145, "num_input_tokens_seen": 64737190, "step": 3013, "time_per_iteration": 2.665468215942383 }, { "auxiliary_loss_clip": 0.01149607, "auxiliary_loss_mlp": 0.01028959, "balance_loss_clip": 1.04914284, "balance_loss_mlp": 1.02026808, "epoch": 0.36241207238622014, "flos": 19828005402240.0, "grad_norm": 2.4466537836984323, "language_loss": 0.79897869, "learning_rate": 2.9487563052273778e-06, "loss": 0.8207643, "num_input_tokens_seen": 64753950, "step": 3014, "time_per_iteration": 2.503666877746582 }, { "auxiliary_loss_clip": 0.01171805, "auxiliary_loss_mlp": 0.01031463, "balance_loss_clip": 1.05699706, "balance_loss_mlp": 1.02327895, "epoch": 0.36253231527685925, "flos": 21397158017280.0, "grad_norm": 1.8469445822922441, "language_loss": 0.85834885, "learning_rate": 2.94807048804733e-06, "loss": 0.88038158, "num_input_tokens_seen": 64773570, "step": 3015, "time_per_iteration": 2.452136278152466 }, { "auxiliary_loss_clip": 0.01148504, "auxiliary_loss_mlp": 0.01027574, "balance_loss_clip": 1.0476234, "balance_loss_mlp": 1.01873493, "epoch": 0.36265255816749836, "flos": 18362885552640.0, "grad_norm": 1.859501233370567, "language_loss": 0.89644897, "learning_rate": 2.9473845270457434e-06, "loss": 0.91820973, "num_input_tokens_seen": 64790385, "step": 3016, "time_per_iteration": 2.5093281269073486 }, { "auxiliary_loss_clip": 0.01151882, "auxiliary_loss_mlp": 0.01028777, "balance_loss_clip": 1.04930615, "balance_loss_mlp": 1.02036715, "epoch": 0.3627728010581374, "flos": 18660046769280.0, "grad_norm": 2.2555949725112887, "language_loss": 0.70363128, "learning_rate": 2.946698422326677e-06, "loss": 0.72543782, "num_input_tokens_seen": 64807845, "step": 3017, "time_per_iteration": 2.476022958755493 }, { "auxiliary_loss_clip": 0.01127007, "auxiliary_loss_mlp": 0.01028513, "balance_loss_clip": 1.04501915, "balance_loss_mlp": 1.02031136, "epoch": 0.36289304394877653, "flos": 27524272072320.0, "grad_norm": 2.2585039808337632, "language_loss": 0.79826194, "learning_rate": 2.946012173994213e-06, "loss": 0.81981713, "num_input_tokens_seen": 64827630, "step": 3018, "time_per_iteration": 2.61464262008667 }, { "auxiliary_loss_clip": 0.0116869, "auxiliary_loss_mlp": 0.01027502, "balance_loss_clip": 1.05535173, "balance_loss_mlp": 1.01940143, "epoch": 0.36301328683941564, "flos": 34533244932480.0, "grad_norm": 1.5639918027379245, "language_loss": 0.67593777, "learning_rate": 2.945325782152454e-06, "loss": 0.69789964, "num_input_tokens_seen": 64850665, "step": 3019, "time_per_iteration": 2.5781259536743164 }, { "auxiliary_loss_clip": 0.01159809, "auxiliary_loss_mlp": 0.01024673, "balance_loss_clip": 1.04933882, "balance_loss_mlp": 1.01688886, "epoch": 0.3631335297300547, "flos": 19025976574080.0, "grad_norm": 2.47884559230756, "language_loss": 0.78729206, "learning_rate": 2.9446392469055257e-06, "loss": 0.80913687, "num_input_tokens_seen": 64868700, "step": 3020, "time_per_iteration": 2.483788251876831 }, { "auxiliary_loss_clip": 0.01140368, "auxiliary_loss_mlp": 0.01028218, "balance_loss_clip": 1.0542922, "balance_loss_mlp": 1.02014184, "epoch": 0.3632537726206938, "flos": 19536769929600.0, "grad_norm": 1.7360396790473371, "language_loss": 0.79997796, "learning_rate": 2.9439525683575745e-06, "loss": 0.82166386, "num_input_tokens_seen": 64887620, "step": 3021, "time_per_iteration": 2.506786823272705 }, { "auxiliary_loss_clip": 0.011932, "auxiliary_loss_mlp": 0.0103072, "balance_loss_clip": 1.05840254, "balance_loss_mlp": 1.02205336, "epoch": 0.3633740155113329, "flos": 21068611292160.0, "grad_norm": 2.3537419886204636, "language_loss": 0.7490477, "learning_rate": 2.9432657466127694e-06, "loss": 0.77128685, "num_input_tokens_seen": 64907190, "step": 3022, "time_per_iteration": 2.429471254348755 }, { "auxiliary_loss_clip": 0.01133027, "auxiliary_loss_mlp": 0.01027058, "balance_loss_clip": 1.05306089, "balance_loss_mlp": 1.01889479, "epoch": 0.36349425840197197, "flos": 20298722158080.0, "grad_norm": 1.6351722895222163, "language_loss": 0.76420426, "learning_rate": 2.9425787817753007e-06, "loss": 0.78580511, "num_input_tokens_seen": 64925850, "step": 3023, "time_per_iteration": 2.567260503768921 }, { "auxiliary_loss_clip": 0.01147785, "auxiliary_loss_mlp": 0.01030199, "balance_loss_clip": 1.05193591, "balance_loss_mlp": 1.02221227, "epoch": 0.3636145012926111, "flos": 29716762331520.0, "grad_norm": 1.9424965757930548, "language_loss": 0.71516395, "learning_rate": 2.94189167394938e-06, "loss": 0.73694378, "num_input_tokens_seen": 64948285, "step": 3024, "time_per_iteration": 2.604113817214966 }, { "auxiliary_loss_clip": 0.01191209, "auxiliary_loss_mlp": 0.0103109, "balance_loss_clip": 1.05912459, "balance_loss_mlp": 1.02292967, "epoch": 0.3637347441832502, "flos": 21431847576960.0, "grad_norm": 1.8814627423968688, "language_loss": 0.81169844, "learning_rate": 2.941204423239241e-06, "loss": 0.83392143, "num_input_tokens_seen": 64967160, "step": 3025, "time_per_iteration": 2.430171251296997 }, { "auxiliary_loss_clip": 0.0117084, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.05368423, "balance_loss_mlp": 1.02141404, "epoch": 0.36385498707388925, "flos": 29533941083520.0, "grad_norm": 2.0726574161555837, "language_loss": 0.75780177, "learning_rate": 2.9405170297491395e-06, "loss": 0.77980763, "num_input_tokens_seen": 64987155, "step": 3026, "time_per_iteration": 2.5254337787628174 }, { "auxiliary_loss_clip": 0.01109924, "auxiliary_loss_mlp": 0.00763453, "balance_loss_clip": 1.05187631, "balance_loss_mlp": 1.00072742, "epoch": 0.36397522996452836, "flos": 22236569925120.0, "grad_norm": 2.6072112493235218, "language_loss": 0.80024898, "learning_rate": 2.939829493583353e-06, "loss": 0.81898272, "num_input_tokens_seen": 65003800, "step": 3027, "time_per_iteration": 2.594695806503296 }, { "auxiliary_loss_clip": 0.01136742, "auxiliary_loss_mlp": 0.01028336, "balance_loss_clip": 1.0462985, "balance_loss_mlp": 1.02015245, "epoch": 0.3640954728551674, "flos": 21506505995520.0, "grad_norm": 2.690110434807627, "language_loss": 0.83562315, "learning_rate": 2.939141814846179e-06, "loss": 0.85727394, "num_input_tokens_seen": 65021215, "step": 3028, "time_per_iteration": 2.5258214473724365 }, { "auxiliary_loss_clip": 0.01160532, "auxiliary_loss_mlp": 0.01025117, "balance_loss_clip": 1.05197728, "balance_loss_mlp": 1.01651645, "epoch": 0.3642157157458065, "flos": 17712867081600.0, "grad_norm": 1.655076761386125, "language_loss": 0.82297587, "learning_rate": 2.938453993641938e-06, "loss": 0.84483242, "num_input_tokens_seen": 65039590, "step": 3029, "time_per_iteration": 3.347519636154175 }, { "auxiliary_loss_clip": 0.01161037, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.05648255, "balance_loss_mlp": 1.02283382, "epoch": 0.36433595863644563, "flos": 17639537466240.0, "grad_norm": 2.0226674329179355, "language_loss": 0.70361483, "learning_rate": 2.937766030074973e-06, "loss": 0.72553992, "num_input_tokens_seen": 65056845, "step": 3030, "time_per_iteration": 2.472032308578491 }, { "auxiliary_loss_clip": 0.01151273, "auxiliary_loss_mlp": 0.01030014, "balance_loss_clip": 1.05215549, "balance_loss_mlp": 1.02192557, "epoch": 0.3644562015270847, "flos": 26833279161600.0, "grad_norm": 1.7956553124666847, "language_loss": 0.8272683, "learning_rate": 2.937077924249646e-06, "loss": 0.8490811, "num_input_tokens_seen": 65079435, "step": 3031, "time_per_iteration": 2.582489490509033 }, { "auxiliary_loss_clip": 0.01165928, "auxiliary_loss_mlp": 0.01027589, "balance_loss_clip": 1.05304193, "balance_loss_mlp": 1.01915455, "epoch": 0.3645764444177238, "flos": 14282715847680.0, "grad_norm": 2.079976720707225, "language_loss": 0.76207024, "learning_rate": 2.9363896762703443e-06, "loss": 0.7840054, "num_input_tokens_seen": 65096500, "step": 3032, "time_per_iteration": 2.459040880203247 }, { "auxiliary_loss_clip": 0.01189664, "auxiliary_loss_mlp": 0.01030266, "balance_loss_clip": 1.05624104, "balance_loss_mlp": 1.02109265, "epoch": 0.3646966873083629, "flos": 20667489137280.0, "grad_norm": 1.6881177954387239, "language_loss": 0.84259558, "learning_rate": 2.9357012862414725e-06, "loss": 0.86479485, "num_input_tokens_seen": 65115860, "step": 3033, "time_per_iteration": 2.466895818710327 }, { "auxiliary_loss_clip": 0.01174257, "auxiliary_loss_mlp": 0.01030996, "balance_loss_clip": 1.05515027, "balance_loss_mlp": 1.02252591, "epoch": 0.36481693019900197, "flos": 27782613665280.0, "grad_norm": 1.8057515910878763, "language_loss": 0.71681011, "learning_rate": 2.9350127542674593e-06, "loss": 0.73886263, "num_input_tokens_seen": 65138070, "step": 3034, "time_per_iteration": 2.518522262573242 }, { "auxiliary_loss_clip": 0.01168921, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.05580187, "balance_loss_mlp": 1.02570701, "epoch": 0.3649371730896411, "flos": 19712588025600.0, "grad_norm": 1.7788645567277077, "language_loss": 0.76612991, "learning_rate": 2.934324080452755e-06, "loss": 0.7881583, "num_input_tokens_seen": 65155860, "step": 3035, "time_per_iteration": 3.360319137573242 }, { "auxiliary_loss_clip": 0.01135732, "auxiliary_loss_mlp": 0.00763618, "balance_loss_clip": 1.0464834, "balance_loss_mlp": 1.00079393, "epoch": 0.3650574159802802, "flos": 24750496016640.0, "grad_norm": 1.4653325612181396, "language_loss": 0.78154784, "learning_rate": 2.9336352649018307e-06, "loss": 0.80054134, "num_input_tokens_seen": 65175930, "step": 3036, "time_per_iteration": 3.4139373302459717 }, { "auxiliary_loss_clip": 0.01163501, "auxiliary_loss_mlp": 0.01033235, "balance_loss_clip": 1.05402935, "balance_loss_mlp": 1.02508688, "epoch": 0.36517765887091924, "flos": 32853487363200.0, "grad_norm": 2.170677541433056, "language_loss": 0.70108736, "learning_rate": 2.9329463077191783e-06, "loss": 0.72305477, "num_input_tokens_seen": 65199305, "step": 3037, "time_per_iteration": 2.6024792194366455 }, { "auxiliary_loss_clip": 0.01131867, "auxiliary_loss_mlp": 0.01023347, "balance_loss_clip": 1.05067217, "balance_loss_mlp": 1.01478767, "epoch": 0.36529790176155835, "flos": 20120318282880.0, "grad_norm": 2.21491010429096, "language_loss": 0.63917327, "learning_rate": 2.9322572090093135e-06, "loss": 0.66072547, "num_input_tokens_seen": 65218010, "step": 3038, "time_per_iteration": 2.5622024536132812 }, { "auxiliary_loss_clip": 0.01132215, "auxiliary_loss_mlp": 0.01031505, "balance_loss_clip": 1.04761934, "balance_loss_mlp": 1.02286851, "epoch": 0.36541814465219746, "flos": 17639573379840.0, "grad_norm": 3.135063262481432, "language_loss": 0.76357126, "learning_rate": 2.9315679688767713e-06, "loss": 0.78520846, "num_input_tokens_seen": 65236020, "step": 3039, "time_per_iteration": 2.531160354614258 }, { "auxiliary_loss_clip": 0.01155675, "auxiliary_loss_mlp": 0.01028757, "balance_loss_clip": 1.05148506, "balance_loss_mlp": 1.02046919, "epoch": 0.3655383875428365, "flos": 22674356887680.0, "grad_norm": 1.5834482963541567, "language_loss": 0.66304111, "learning_rate": 2.9308785874261085e-06, "loss": 0.68488538, "num_input_tokens_seen": 65256210, "step": 3040, "time_per_iteration": 2.5264980792999268 }, { "auxiliary_loss_clip": 0.01191947, "auxiliary_loss_mlp": 0.01032058, "balance_loss_clip": 1.05893612, "balance_loss_mlp": 1.02393413, "epoch": 0.36565863043347563, "flos": 21981173247360.0, "grad_norm": 1.7571565599158954, "language_loss": 0.81965959, "learning_rate": 2.9301890647619045e-06, "loss": 0.84189957, "num_input_tokens_seen": 65275505, "step": 3041, "time_per_iteration": 2.4477994441986084 }, { "auxiliary_loss_clip": 0.01169142, "auxiliary_loss_mlp": 0.0103692, "balance_loss_clip": 1.05534339, "balance_loss_mlp": 1.02774668, "epoch": 0.36577887332411474, "flos": 24827632473600.0, "grad_norm": 1.8183491614990759, "language_loss": 0.8045131, "learning_rate": 2.929499400988759e-06, "loss": 0.82657367, "num_input_tokens_seen": 65296665, "step": 3042, "time_per_iteration": 2.5319924354553223 }, { "auxiliary_loss_clip": 0.01175415, "auxiliary_loss_mlp": 0.01035678, "balance_loss_clip": 1.05617571, "balance_loss_mlp": 1.02664185, "epoch": 0.3658991162147538, "flos": 28293191539200.0, "grad_norm": 1.957000854596714, "language_loss": 0.65246022, "learning_rate": 2.9288095962112927e-06, "loss": 0.67457116, "num_input_tokens_seen": 65317370, "step": 3043, "time_per_iteration": 2.5277597904205322 }, { "auxiliary_loss_clip": 0.01190413, "auxiliary_loss_mlp": 0.0102892, "balance_loss_clip": 1.05742955, "balance_loss_mlp": 1.02027154, "epoch": 0.3660193591053929, "flos": 17785550252160.0, "grad_norm": 1.8756394006013428, "language_loss": 0.84880692, "learning_rate": 2.9281196505341503e-06, "loss": 0.87100029, "num_input_tokens_seen": 65334540, "step": 3044, "time_per_iteration": 2.408989429473877 }, { "auxiliary_loss_clip": 0.01126781, "auxiliary_loss_mlp": 0.00763246, "balance_loss_clip": 1.05176497, "balance_loss_mlp": 1.00071049, "epoch": 0.36613960199603196, "flos": 10342776839040.0, "grad_norm": 2.169158605704409, "language_loss": 0.78737581, "learning_rate": 2.9274295640619946e-06, "loss": 0.80627608, "num_input_tokens_seen": 65351670, "step": 3045, "time_per_iteration": 2.5475940704345703 }, { "auxiliary_loss_clip": 0.01145906, "auxiliary_loss_mlp": 0.01026332, "balance_loss_clip": 1.05023766, "balance_loss_mlp": 1.01859498, "epoch": 0.36625984488667107, "flos": 19755609540480.0, "grad_norm": 7.06908958868098, "language_loss": 0.78218085, "learning_rate": 2.9267393368995103e-06, "loss": 0.80390322, "num_input_tokens_seen": 65370900, "step": 3046, "time_per_iteration": 2.515636444091797 }, { "auxiliary_loss_clip": 0.01192896, "auxiliary_loss_mlp": 0.01031641, "balance_loss_clip": 1.05931115, "balance_loss_mlp": 1.02339768, "epoch": 0.3663800877773102, "flos": 17674262939520.0, "grad_norm": 2.106238838173772, "language_loss": 0.73751926, "learning_rate": 2.926048969151407e-06, "loss": 0.75976467, "num_input_tokens_seen": 65388185, "step": 3047, "time_per_iteration": 2.4002580642700195 }, { "auxiliary_loss_clip": 0.0113082, "auxiliary_loss_mlp": 0.01028509, "balance_loss_clip": 1.05459344, "balance_loss_mlp": 1.01987267, "epoch": 0.36650033066794924, "flos": 20303606407680.0, "grad_norm": 1.9309487664588667, "language_loss": 0.68730938, "learning_rate": 2.92535846092241e-06, "loss": 0.70890272, "num_input_tokens_seen": 65407200, "step": 3048, "time_per_iteration": 2.5554943084716797 }, { "auxiliary_loss_clip": 0.01165626, "auxiliary_loss_mlp": 0.01031549, "balance_loss_clip": 1.05683959, "balance_loss_mlp": 1.02328801, "epoch": 0.36662057355858835, "flos": 24716237420160.0, "grad_norm": 2.3744138426062364, "language_loss": 0.82654011, "learning_rate": 2.9246678123172704e-06, "loss": 0.84851193, "num_input_tokens_seen": 65427290, "step": 3049, "time_per_iteration": 2.5517947673797607 }, { "auxiliary_loss_clip": 0.01192093, "auxiliary_loss_mlp": 0.0103574, "balance_loss_clip": 1.05754066, "balance_loss_mlp": 1.02699268, "epoch": 0.36674081644922746, "flos": 12385267902720.0, "grad_norm": 2.156203795964543, "language_loss": 0.74395168, "learning_rate": 2.9239770234407596e-06, "loss": 0.76622999, "num_input_tokens_seen": 65445595, "step": 3050, "time_per_iteration": 2.405888557434082 }, { "auxiliary_loss_clip": 0.01176614, "auxiliary_loss_mlp": 0.01027405, "balance_loss_clip": 1.05471969, "balance_loss_mlp": 1.01870859, "epoch": 0.3668610593398665, "flos": 21105922544640.0, "grad_norm": 1.5836866752241445, "language_loss": 0.68324101, "learning_rate": 2.9232860943976686e-06, "loss": 0.7052812, "num_input_tokens_seen": 65466330, "step": 3051, "time_per_iteration": 2.472597360610962 }, { "auxiliary_loss_clip": 0.01159864, "auxiliary_loss_mlp": 0.01025299, "balance_loss_clip": 1.05535579, "balance_loss_mlp": 1.01712132, "epoch": 0.3669813022305056, "flos": 26758082039040.0, "grad_norm": 1.6676428651189341, "language_loss": 0.83898771, "learning_rate": 2.9225950252928115e-06, "loss": 0.86083931, "num_input_tokens_seen": 65487180, "step": 3052, "time_per_iteration": 2.5359652042388916 }, { "auxiliary_loss_clip": 0.01178571, "auxiliary_loss_mlp": 0.00763803, "balance_loss_clip": 1.0590893, "balance_loss_mlp": 1.0006907, "epoch": 0.36710154512114473, "flos": 19099521671040.0, "grad_norm": 2.181883091419619, "language_loss": 0.81489873, "learning_rate": 2.9219038162310217e-06, "loss": 0.83432251, "num_input_tokens_seen": 65505380, "step": 3053, "time_per_iteration": 2.4488818645477295 }, { "auxiliary_loss_clip": 0.01109769, "auxiliary_loss_mlp": 0.01032799, "balance_loss_clip": 1.05098641, "balance_loss_mlp": 1.02443075, "epoch": 0.3672217880117838, "flos": 20812029465600.0, "grad_norm": 2.055129007854651, "language_loss": 0.8281616, "learning_rate": 2.921212467317157e-06, "loss": 0.84958726, "num_input_tokens_seen": 65524825, "step": 3054, "time_per_iteration": 2.6163485050201416 }, { "auxiliary_loss_clip": 0.01149447, "auxiliary_loss_mlp": 0.01031691, "balance_loss_clip": 1.05147016, "balance_loss_mlp": 1.02282739, "epoch": 0.3673420309024229, "flos": 13590394133760.0, "grad_norm": 1.8860546042439028, "language_loss": 0.80125493, "learning_rate": 2.920520978656093e-06, "loss": 0.82306629, "num_input_tokens_seen": 65541790, "step": 3055, "time_per_iteration": 2.4380366802215576 }, { "auxiliary_loss_clip": 0.01187204, "auxiliary_loss_mlp": 0.00763264, "balance_loss_clip": 1.05628812, "balance_loss_mlp": 1.00071919, "epoch": 0.367462273793062, "flos": 28986877969920.0, "grad_norm": 1.9361724785641345, "language_loss": 0.7692529, "learning_rate": 2.919829350352729e-06, "loss": 0.78875756, "num_input_tokens_seen": 65563395, "step": 3056, "time_per_iteration": 3.316824436187744 }, { "auxiliary_loss_clip": 0.01096426, "auxiliary_loss_mlp": 0.01009773, "balance_loss_clip": 1.02844, "balance_loss_mlp": 1.00865889, "epoch": 0.36758251668370107, "flos": 62643148346880.0, "grad_norm": 0.7570547829826064, "language_loss": 0.60030913, "learning_rate": 2.919137582511983e-06, "loss": 0.62137109, "num_input_tokens_seen": 65619835, "step": 3057, "time_per_iteration": 2.945138931274414 }, { "auxiliary_loss_clip": 0.011577, "auxiliary_loss_mlp": 0.01029523, "balance_loss_clip": 1.05977607, "balance_loss_mlp": 1.02150011, "epoch": 0.3677027595743402, "flos": 12713886455040.0, "grad_norm": 1.8878518158817985, "language_loss": 0.64157999, "learning_rate": 2.918445675238797e-06, "loss": 0.66345227, "num_input_tokens_seen": 65636760, "step": 3058, "time_per_iteration": 2.4971792697906494 }, { "auxiliary_loss_clip": 0.01191634, "auxiliary_loss_mlp": 0.0102699, "balance_loss_clip": 1.05725479, "balance_loss_mlp": 1.01840043, "epoch": 0.36782300246497923, "flos": 25046579825280.0, "grad_norm": 1.7956780496879863, "language_loss": 0.69444293, "learning_rate": 2.917753628638132e-06, "loss": 0.71662921, "num_input_tokens_seen": 65657065, "step": 3059, "time_per_iteration": 2.475940465927124 }, { "auxiliary_loss_clip": 0.01163311, "auxiliary_loss_mlp": 0.01027772, "balance_loss_clip": 1.05672216, "balance_loss_mlp": 1.01886678, "epoch": 0.36794324535561834, "flos": 17419512706560.0, "grad_norm": 2.0690827775904648, "language_loss": 0.70161605, "learning_rate": 2.9170614428149716e-06, "loss": 0.72352684, "num_input_tokens_seen": 65675400, "step": 3060, "time_per_iteration": 2.4628260135650635 }, { "auxiliary_loss_clip": 0.01144074, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.05368018, "balance_loss_mlp": 1.02645516, "epoch": 0.36806348824625745, "flos": 24089128848000.0, "grad_norm": 3.063828600476903, "language_loss": 0.86817425, "learning_rate": 2.9163691178743195e-06, "loss": 0.88997173, "num_input_tokens_seen": 65694050, "step": 3061, "time_per_iteration": 3.4269320964813232 }, { "auxiliary_loss_clip": 0.01172165, "auxiliary_loss_mlp": 0.01028956, "balance_loss_clip": 1.05546153, "balance_loss_mlp": 1.0203371, "epoch": 0.3681837311368965, "flos": 20521871400960.0, "grad_norm": 1.7580222715142169, "language_loss": 0.7723487, "learning_rate": 2.9156766539212006e-06, "loss": 0.79435992, "num_input_tokens_seen": 65711695, "step": 3062, "time_per_iteration": 3.234375 }, { "auxiliary_loss_clip": 0.01178483, "auxiliary_loss_mlp": 0.01040463, "balance_loss_clip": 1.05431294, "balance_loss_mlp": 1.03173709, "epoch": 0.3683039740275356, "flos": 21466644877440.0, "grad_norm": 1.8889714124636634, "language_loss": 0.71276712, "learning_rate": 2.9149840510606614e-06, "loss": 0.73495656, "num_input_tokens_seen": 65730350, "step": 3063, "time_per_iteration": 2.477159261703491 }, { "auxiliary_loss_clip": 0.01080107, "auxiliary_loss_mlp": 0.00754051, "balance_loss_clip": 1.02423072, "balance_loss_mlp": 1.0013026, "epoch": 0.36842421691817473, "flos": 70380999987840.0, "grad_norm": 1.026092408576825, "language_loss": 0.64183807, "learning_rate": 2.914291309397769e-06, "loss": 0.66017962, "num_input_tokens_seen": 65787820, "step": 3064, "time_per_iteration": 3.151080846786499 }, { "auxiliary_loss_clip": 0.01107949, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 1.04762554, "balance_loss_mlp": 1.01931953, "epoch": 0.3685444598088138, "flos": 23331378510720.0, "grad_norm": 2.2887398058454718, "language_loss": 0.78912234, "learning_rate": 2.9135984290376117e-06, "loss": 0.81048763, "num_input_tokens_seen": 65806685, "step": 3065, "time_per_iteration": 2.6122238636016846 }, { "auxiliary_loss_clip": 0.01115805, "auxiliary_loss_mlp": 0.0103187, "balance_loss_clip": 1.04797053, "balance_loss_mlp": 1.02332294, "epoch": 0.3686647026994529, "flos": 23070271570560.0, "grad_norm": 1.683605623804839, "language_loss": 0.82786429, "learning_rate": 2.9129054100853e-06, "loss": 0.84934103, "num_input_tokens_seen": 65825525, "step": 3066, "time_per_iteration": 2.601835012435913 }, { "auxiliary_loss_clip": 0.01162742, "auxiliary_loss_mlp": 0.01028063, "balance_loss_clip": 1.05429351, "balance_loss_mlp": 1.01949191, "epoch": 0.368784945590092, "flos": 25119909440640.0, "grad_norm": 1.6604839501720707, "language_loss": 0.75971091, "learning_rate": 2.912212252645963e-06, "loss": 0.78161895, "num_input_tokens_seen": 65848110, "step": 3067, "time_per_iteration": 2.551729679107666 }, { "auxiliary_loss_clip": 0.01180342, "auxiliary_loss_mlp": 0.01029661, "balance_loss_clip": 1.05453181, "balance_loss_mlp": 1.02104223, "epoch": 0.36890518848073106, "flos": 18442284566400.0, "grad_norm": 2.0223062882198777, "language_loss": 0.76295191, "learning_rate": 2.9115189568247523e-06, "loss": 0.78505194, "num_input_tokens_seen": 65865670, "step": 3068, "time_per_iteration": 2.447573184967041 }, { "auxiliary_loss_clip": 0.0112468, "auxiliary_loss_mlp": 0.01032262, "balance_loss_clip": 1.0548141, "balance_loss_mlp": 1.02363145, "epoch": 0.36902543137137017, "flos": 16362446336640.0, "grad_norm": 1.9020381572551877, "language_loss": 0.92165548, "learning_rate": 2.910825522726841e-06, "loss": 0.94322485, "num_input_tokens_seen": 65883195, "step": 3069, "time_per_iteration": 2.52885103225708 }, { "auxiliary_loss_clip": 0.01125887, "auxiliary_loss_mlp": 0.01032038, "balance_loss_clip": 1.04700148, "balance_loss_mlp": 1.02358055, "epoch": 0.3691456742620093, "flos": 12275596702080.0, "grad_norm": 1.9053274525265584, "language_loss": 0.77213216, "learning_rate": 2.9101319504574215e-06, "loss": 0.79371142, "num_input_tokens_seen": 65899635, "step": 3070, "time_per_iteration": 2.5252184867858887 }, { "auxiliary_loss_clip": 0.01164673, "auxiliary_loss_mlp": 0.01031266, "balance_loss_clip": 1.05078316, "balance_loss_mlp": 1.02218807, "epoch": 0.36926591715264834, "flos": 17786412178560.0, "grad_norm": 1.8746489748561939, "language_loss": 0.76626921, "learning_rate": 2.909438240121709e-06, "loss": 0.78822857, "num_input_tokens_seen": 65919910, "step": 3071, "time_per_iteration": 2.4977846145629883 }, { "auxiliary_loss_clip": 0.01154717, "auxiliary_loss_mlp": 0.01025087, "balance_loss_clip": 1.05415845, "balance_loss_mlp": 1.01650381, "epoch": 0.36938616004328745, "flos": 28948309741440.0, "grad_norm": 1.740603464270166, "language_loss": 0.70372581, "learning_rate": 2.908744391824939e-06, "loss": 0.72552383, "num_input_tokens_seen": 65940930, "step": 3072, "time_per_iteration": 2.576730728149414 }, { "auxiliary_loss_clip": 0.01119397, "auxiliary_loss_mlp": 0.01027762, "balance_loss_clip": 1.04758203, "balance_loss_mlp": 1.01906002, "epoch": 0.36950640293392656, "flos": 29205394358400.0, "grad_norm": 1.6289692631132964, "language_loss": 0.78989983, "learning_rate": 2.908050405672367e-06, "loss": 0.81137145, "num_input_tokens_seen": 65960475, "step": 3073, "time_per_iteration": 2.6455554962158203 }, { "auxiliary_loss_clip": 0.01164723, "auxiliary_loss_mlp": 0.01030034, "balance_loss_clip": 1.05042887, "balance_loss_mlp": 1.02133775, "epoch": 0.3696266458245656, "flos": 24827776128000.0, "grad_norm": 1.7226198747933599, "language_loss": 0.7927829, "learning_rate": 2.9073562817692703e-06, "loss": 0.81473047, "num_input_tokens_seen": 65979160, "step": 3074, "time_per_iteration": 2.52937912940979 }, { "auxiliary_loss_clip": 0.0105101, "auxiliary_loss_mlp": 0.01005299, "balance_loss_clip": 1.02100444, "balance_loss_mlp": 1.00398815, "epoch": 0.3697468887152047, "flos": 59887257264000.0, "grad_norm": 0.7234587820058428, "language_loss": 0.56523782, "learning_rate": 2.9066620202209468e-06, "loss": 0.58580089, "num_input_tokens_seen": 66041650, "step": 3075, "time_per_iteration": 3.073305368423462 }, { "auxiliary_loss_clip": 0.01137561, "auxiliary_loss_mlp": 0.01028035, "balance_loss_clip": 1.0513308, "balance_loss_mlp": 1.01965511, "epoch": 0.3698671316058438, "flos": 26137581569280.0, "grad_norm": 1.901466344564647, "language_loss": 0.77521539, "learning_rate": 2.905967621132716e-06, "loss": 0.7968713, "num_input_tokens_seen": 66059260, "step": 3076, "time_per_iteration": 2.55488920211792 }, { "auxiliary_loss_clip": 0.01164719, "auxiliary_loss_mlp": 0.01030537, "balance_loss_clip": 1.05217421, "balance_loss_mlp": 1.02199626, "epoch": 0.3699873744964829, "flos": 24607464059520.0, "grad_norm": 1.7925147239513082, "language_loss": 0.75092286, "learning_rate": 2.9052730846099172e-06, "loss": 0.77287543, "num_input_tokens_seen": 66080605, "step": 3077, "time_per_iteration": 2.532503843307495 }, { "auxiliary_loss_clip": 0.01067245, "auxiliary_loss_mlp": 0.0100115, "balance_loss_clip": 1.02181292, "balance_loss_mlp": 0.99992198, "epoch": 0.370107617387122, "flos": 64885340050560.0, "grad_norm": 0.8510180624822664, "language_loss": 0.60826206, "learning_rate": 2.9045784107579123e-06, "loss": 0.62894601, "num_input_tokens_seen": 66140710, "step": 3078, "time_per_iteration": 3.093737840652466 }, { "auxiliary_loss_clip": 0.01188915, "auxiliary_loss_mlp": 0.01027928, "balance_loss_clip": 1.05695796, "balance_loss_mlp": 1.01915979, "epoch": 0.37022786027776106, "flos": 15961683317760.0, "grad_norm": 2.928882061265177, "language_loss": 0.66829419, "learning_rate": 2.9038835996820807e-06, "loss": 0.69046259, "num_input_tokens_seen": 66158320, "step": 3079, "time_per_iteration": 2.416414737701416 }, { "auxiliary_loss_clip": 0.01149601, "auxiliary_loss_mlp": 0.01030944, "balance_loss_clip": 1.04854727, "balance_loss_mlp": 1.02205062, "epoch": 0.37034810316840017, "flos": 18546927863040.0, "grad_norm": 1.776021176644969, "language_loss": 0.79789054, "learning_rate": 2.903188651487826e-06, "loss": 0.81969607, "num_input_tokens_seen": 66176875, "step": 3080, "time_per_iteration": 2.505650758743286 }, { "auxiliary_loss_clip": 0.01179263, "auxiliary_loss_mlp": 0.01029699, "balance_loss_clip": 1.05684805, "balance_loss_mlp": 1.02081203, "epoch": 0.3704683460590393, "flos": 17821927751040.0, "grad_norm": 2.434656837212417, "language_loss": 0.86916959, "learning_rate": 2.902493566280571e-06, "loss": 0.89125919, "num_input_tokens_seen": 66194980, "step": 3081, "time_per_iteration": 2.4463279247283936 }, { "auxiliary_loss_clip": 0.01159309, "auxiliary_loss_mlp": 0.01027246, "balance_loss_clip": 1.0534687, "balance_loss_mlp": 1.01810908, "epoch": 0.37058858894967833, "flos": 14134081368960.0, "grad_norm": 8.847284099012898, "language_loss": 0.81579638, "learning_rate": 2.9017983441657595e-06, "loss": 0.83766186, "num_input_tokens_seen": 66212310, "step": 3082, "time_per_iteration": 2.4657857418060303 }, { "auxiliary_loss_clip": 0.01131881, "auxiliary_loss_mlp": 0.01027357, "balance_loss_clip": 1.0462513, "balance_loss_mlp": 1.01853573, "epoch": 0.37070883184031744, "flos": 13954492344960.0, "grad_norm": 2.7018465906506113, "language_loss": 0.75281215, "learning_rate": 2.9011029852488564e-06, "loss": 0.77440453, "num_input_tokens_seen": 66229545, "step": 3083, "time_per_iteration": 3.397322654724121 }, { "auxiliary_loss_clip": 0.01084857, "auxiliary_loss_mlp": 0.01000838, "balance_loss_clip": 1.01707315, "balance_loss_mlp": 0.99968141, "epoch": 0.37082907473095655, "flos": 52315419306240.0, "grad_norm": 0.9860272000605642, "language_loss": 0.62533206, "learning_rate": 2.9004074896353465e-06, "loss": 0.64618897, "num_input_tokens_seen": 66283545, "step": 3084, "time_per_iteration": 2.960568904876709 }, { "auxiliary_loss_clip": 0.0119141, "auxiliary_loss_mlp": 0.01026881, "balance_loss_clip": 1.06215191, "balance_loss_mlp": 1.01940107, "epoch": 0.3709493176215956, "flos": 15998096730240.0, "grad_norm": 1.7764345363015674, "language_loss": 0.81793094, "learning_rate": 2.8997118574307362e-06, "loss": 0.84011394, "num_input_tokens_seen": 66300500, "step": 3085, "time_per_iteration": 2.4666192531585693 }, { "auxiliary_loss_clip": 0.01152056, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.05247355, "balance_loss_mlp": 1.02201104, "epoch": 0.3710695605122347, "flos": 20959837931520.0, "grad_norm": 2.2091659403696626, "language_loss": 0.74374753, "learning_rate": 2.899016088740553e-06, "loss": 0.76557302, "num_input_tokens_seen": 66318610, "step": 3086, "time_per_iteration": 2.5196192264556885 }, { "auxiliary_loss_clip": 0.01129173, "auxiliary_loss_mlp": 0.01025001, "balance_loss_clip": 1.04856503, "balance_loss_mlp": 1.0173955, "epoch": 0.37118980340287383, "flos": 14355578586240.0, "grad_norm": 1.7841428804893666, "language_loss": 0.78998601, "learning_rate": 2.898320183670344e-06, "loss": 0.81152773, "num_input_tokens_seen": 66336025, "step": 3087, "time_per_iteration": 3.3778507709503174 }, { "auxiliary_loss_clip": 0.01132859, "auxiliary_loss_mlp": 0.01029608, "balance_loss_clip": 1.05426693, "balance_loss_mlp": 1.02101898, "epoch": 0.3713100462935129, "flos": 25885381201920.0, "grad_norm": 1.8573648256064648, "language_loss": 0.88930058, "learning_rate": 2.8976241423256767e-06, "loss": 0.91092527, "num_input_tokens_seen": 66356120, "step": 3088, "time_per_iteration": 4.074788331985474 }, { "auxiliary_loss_clip": 0.01153462, "auxiliary_loss_mlp": 0.01031042, "balance_loss_clip": 1.05084324, "balance_loss_mlp": 1.02316236, "epoch": 0.371430289184152, "flos": 30518934814080.0, "grad_norm": 1.8199088292121155, "language_loss": 0.67931402, "learning_rate": 2.896927964812142e-06, "loss": 0.70115912, "num_input_tokens_seen": 66376685, "step": 3089, "time_per_iteration": 2.5718977451324463 }, { "auxiliary_loss_clip": 0.01160492, "auxiliary_loss_mlp": 0.0102944, "balance_loss_clip": 1.0568192, "balance_loss_mlp": 1.02039194, "epoch": 0.37155053207479105, "flos": 15742233175680.0, "grad_norm": 2.3323219440267846, "language_loss": 0.75253165, "learning_rate": 2.8962316512353465e-06, "loss": 0.77443093, "num_input_tokens_seen": 66394230, "step": 3090, "time_per_iteration": 2.484325647354126 }, { "auxiliary_loss_clip": 0.01111884, "auxiliary_loss_mlp": 0.01031081, "balance_loss_clip": 1.04433692, "balance_loss_mlp": 1.02259326, "epoch": 0.37167077496543016, "flos": 23404061681280.0, "grad_norm": 1.5289820764494213, "language_loss": 0.74942881, "learning_rate": 2.8955352017009233e-06, "loss": 0.77085841, "num_input_tokens_seen": 66413475, "step": 3091, "time_per_iteration": 2.6065900325775146 }, { "auxiliary_loss_clip": 0.01160184, "auxiliary_loss_mlp": 0.0103184, "balance_loss_clip": 1.0563153, "balance_loss_mlp": 1.023054, "epoch": 0.3717910178560693, "flos": 22088653718400.0, "grad_norm": 3.1642929808981046, "language_loss": 0.77126801, "learning_rate": 2.8948386163145212e-06, "loss": 0.79318821, "num_input_tokens_seen": 66432685, "step": 3092, "time_per_iteration": 2.5059359073638916 }, { "auxiliary_loss_clip": 0.01179229, "auxiliary_loss_mlp": 0.01030179, "balance_loss_clip": 1.05617952, "balance_loss_mlp": 1.02188253, "epoch": 0.3719112607467083, "flos": 26939969533440.0, "grad_norm": 1.8237267847060572, "language_loss": 0.79071808, "learning_rate": 2.8941418951818135e-06, "loss": 0.81281215, "num_input_tokens_seen": 66452245, "step": 3093, "time_per_iteration": 2.4883203506469727 }, { "auxiliary_loss_clip": 0.01146028, "auxiliary_loss_mlp": 0.01032827, "balance_loss_clip": 1.05118442, "balance_loss_mlp": 1.02492309, "epoch": 0.37203150363734744, "flos": 12166500119040.0, "grad_norm": 2.3930619083808815, "language_loss": 0.71113873, "learning_rate": 2.8934450384084903e-06, "loss": 0.7329272, "num_input_tokens_seen": 66469760, "step": 3094, "time_per_iteration": 2.5112884044647217 }, { "auxiliary_loss_clip": 0.01155341, "auxiliary_loss_mlp": 0.01028875, "balance_loss_clip": 1.05298972, "balance_loss_mlp": 1.02022672, "epoch": 0.37215174652798655, "flos": 23697595624320.0, "grad_norm": 1.8284074983025054, "language_loss": 0.69826859, "learning_rate": 2.8927480461002653e-06, "loss": 0.72011077, "num_input_tokens_seen": 66489730, "step": 3095, "time_per_iteration": 2.5354344844818115 }, { "auxiliary_loss_clip": 0.01159411, "auxiliary_loss_mlp": 0.01037185, "balance_loss_clip": 1.05086005, "balance_loss_mlp": 1.02761877, "epoch": 0.3722719894186256, "flos": 17887751424000.0, "grad_norm": 3.192339297479285, "language_loss": 0.86417294, "learning_rate": 2.892050918362872e-06, "loss": 0.88613892, "num_input_tokens_seen": 66504785, "step": 3096, "time_per_iteration": 2.472733974456787 }, { "auxiliary_loss_clip": 0.010186, "auxiliary_loss_mlp": 0.01003817, "balance_loss_clip": 1.01639569, "balance_loss_mlp": 1.00242794, "epoch": 0.3723922323092647, "flos": 62419891363200.0, "grad_norm": 0.8496562944322783, "language_loss": 0.55903089, "learning_rate": 2.8913536553020626e-06, "loss": 0.5792551, "num_input_tokens_seen": 66558840, "step": 3097, "time_per_iteration": 3.5324528217315674 }, { "auxiliary_loss_clip": 0.01123517, "auxiliary_loss_mlp": 0.01027855, "balance_loss_clip": 1.04819775, "balance_loss_mlp": 1.01969504, "epoch": 0.3725124751999038, "flos": 23039747988480.0, "grad_norm": 2.1081206833218884, "language_loss": 0.84853786, "learning_rate": 2.8906562570236137e-06, "loss": 0.87005162, "num_input_tokens_seen": 66576750, "step": 3098, "time_per_iteration": 2.938476324081421 }, { "auxiliary_loss_clip": 0.01112249, "auxiliary_loss_mlp": 0.01034002, "balance_loss_clip": 1.04490685, "balance_loss_mlp": 1.02617621, "epoch": 0.3726327180905429, "flos": 20920551431040.0, "grad_norm": 1.4888644190420457, "language_loss": 0.76502621, "learning_rate": 2.889958723633318e-06, "loss": 0.78648865, "num_input_tokens_seen": 66595690, "step": 3099, "time_per_iteration": 2.6059322357177734 }, { "auxiliary_loss_clip": 0.01145617, "auxiliary_loss_mlp": 0.01028392, "balance_loss_clip": 1.05128145, "balance_loss_mlp": 1.01993418, "epoch": 0.372752960981182, "flos": 30592156688640.0, "grad_norm": 1.852895213674984, "language_loss": 0.73783535, "learning_rate": 2.889261055236992e-06, "loss": 0.75957549, "num_input_tokens_seen": 66617905, "step": 3100, "time_per_iteration": 2.6023433208465576 }, { "auxiliary_loss_clip": 0.01157703, "auxiliary_loss_mlp": 0.01026852, "balance_loss_clip": 1.0545969, "balance_loss_mlp": 1.01884675, "epoch": 0.3728732038718211, "flos": 25116749043840.0, "grad_norm": 1.9898948264863654, "language_loss": 0.82810473, "learning_rate": 2.8885632519404704e-06, "loss": 0.84995031, "num_input_tokens_seen": 66638175, "step": 3101, "time_per_iteration": 2.5411581993103027 }, { "auxiliary_loss_clip": 0.01161053, "auxiliary_loss_mlp": 0.01028527, "balance_loss_clip": 1.0559783, "balance_loss_mlp": 1.02005136, "epoch": 0.37299344676246016, "flos": 25302048330240.0, "grad_norm": 1.996871536090944, "language_loss": 0.75514901, "learning_rate": 2.8878653138496107e-06, "loss": 0.77704489, "num_input_tokens_seen": 66658670, "step": 3102, "time_per_iteration": 2.539137601852417 }, { "auxiliary_loss_clip": 0.01112825, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.04293156, "balance_loss_mlp": 1.02227378, "epoch": 0.37311368965309927, "flos": 23842531002240.0, "grad_norm": 2.252413582131106, "language_loss": 0.76544374, "learning_rate": 2.8871672410702878e-06, "loss": 0.78688169, "num_input_tokens_seen": 66676030, "step": 3103, "time_per_iteration": 2.6596555709838867 }, { "auxiliary_loss_clip": 0.01154628, "auxiliary_loss_mlp": 0.01029672, "balance_loss_clip": 1.05094266, "balance_loss_mlp": 1.02044511, "epoch": 0.3732339325437384, "flos": 25811943845760.0, "grad_norm": 1.8148055266422836, "language_loss": 0.82128298, "learning_rate": 2.8864690337084008e-06, "loss": 0.84312606, "num_input_tokens_seen": 66695305, "step": 3104, "time_per_iteration": 2.5743672847747803 }, { "auxiliary_loss_clip": 0.01169001, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.05298913, "balance_loss_mlp": 1.02213144, "epoch": 0.37335417543437743, "flos": 26208433146240.0, "grad_norm": 1.6933106138856056, "language_loss": 0.78106463, "learning_rate": 2.885770691869866e-06, "loss": 0.80306888, "num_input_tokens_seen": 66716185, "step": 3105, "time_per_iteration": 2.5092620849609375 }, { "auxiliary_loss_clip": 0.01170532, "auxiliary_loss_mlp": 0.0103245, "balance_loss_clip": 1.05424166, "balance_loss_mlp": 1.02455795, "epoch": 0.37347441832501654, "flos": 24023879792640.0, "grad_norm": 2.4903035712098958, "language_loss": 0.7450949, "learning_rate": 2.8850722156606207e-06, "loss": 0.76712465, "num_input_tokens_seen": 66734575, "step": 3106, "time_per_iteration": 2.496767044067383 }, { "auxiliary_loss_clip": 0.01167309, "auxiliary_loss_mlp": 0.01035184, "balance_loss_clip": 1.05325627, "balance_loss_mlp": 1.02698195, "epoch": 0.3735946612156556, "flos": 19714922409600.0, "grad_norm": 1.5434421292237188, "language_loss": 0.66832906, "learning_rate": 2.8843736051866252e-06, "loss": 0.69035399, "num_input_tokens_seen": 66753500, "step": 3107, "time_per_iteration": 2.4811577796936035 }, { "auxiliary_loss_clip": 0.01127216, "auxiliary_loss_mlp": 0.00763304, "balance_loss_clip": 1.04810548, "balance_loss_mlp": 1.00075698, "epoch": 0.3737149041062947, "flos": 23039604334080.0, "grad_norm": 1.698243829242408, "language_loss": 0.69489014, "learning_rate": 2.8836748605538557e-06, "loss": 0.7137953, "num_input_tokens_seen": 66775140, "step": 3108, "time_per_iteration": 2.6121256351470947 }, { "auxiliary_loss_clip": 0.01164737, "auxiliary_loss_mlp": 0.01025373, "balance_loss_clip": 1.05248666, "balance_loss_mlp": 1.01650369, "epoch": 0.3738351469969338, "flos": 34678108483200.0, "grad_norm": 3.0445117398238133, "language_loss": 0.63462484, "learning_rate": 2.882975981868313e-06, "loss": 0.65652591, "num_input_tokens_seen": 66795525, "step": 3109, "time_per_iteration": 3.6112146377563477 }, { "auxiliary_loss_clip": 0.01173385, "auxiliary_loss_mlp": 0.01027485, "balance_loss_clip": 1.05592632, "balance_loss_mlp": 1.01895547, "epoch": 0.3739553898875729, "flos": 43507967448960.0, "grad_norm": 2.392857825358932, "language_loss": 0.68686962, "learning_rate": 2.882276969236016e-06, "loss": 0.70887834, "num_input_tokens_seen": 66816885, "step": 3110, "time_per_iteration": 2.663795232772827 }, { "auxiliary_loss_clip": 0.01156132, "auxiliary_loss_mlp": 0.01029268, "balance_loss_clip": 1.05128145, "balance_loss_mlp": 1.0205245, "epoch": 0.374075632778212, "flos": 12856487448960.0, "grad_norm": 2.046303823997311, "language_loss": 0.76305211, "learning_rate": 2.881577822763005e-06, "loss": 0.78490615, "num_input_tokens_seen": 66834835, "step": 3111, "time_per_iteration": 2.464536190032959 }, { "auxiliary_loss_clip": 0.01171917, "auxiliary_loss_mlp": 0.01024302, "balance_loss_clip": 1.05332112, "balance_loss_mlp": 1.01644015, "epoch": 0.3741958756688511, "flos": 26024031699840.0, "grad_norm": 1.7865842072161702, "language_loss": 0.87259078, "learning_rate": 2.880878542555338e-06, "loss": 0.89455301, "num_input_tokens_seen": 66852600, "step": 3112, "time_per_iteration": 2.4953858852386475 }, { "auxiliary_loss_clip": 0.01189556, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.05648804, "balance_loss_mlp": 1.02138758, "epoch": 0.37431611855949015, "flos": 21433894652160.0, "grad_norm": 2.258291429189346, "language_loss": 0.80836713, "learning_rate": 2.8801791287190976e-06, "loss": 0.83056241, "num_input_tokens_seen": 66870595, "step": 3113, "time_per_iteration": 2.4246292114257812 }, { "auxiliary_loss_clip": 0.01173967, "auxiliary_loss_mlp": 0.01027955, "balance_loss_clip": 1.05243027, "balance_loss_mlp": 1.01953852, "epoch": 0.37443636145012926, "flos": 24207096090240.0, "grad_norm": 2.8971806978876233, "language_loss": 0.8596946, "learning_rate": 2.8794795813603817e-06, "loss": 0.88171387, "num_input_tokens_seen": 66886060, "step": 3114, "time_per_iteration": 3.3249690532684326 }, { "auxiliary_loss_clip": 0.01178232, "auxiliary_loss_mlp": 0.01031788, "balance_loss_clip": 1.0536319, "balance_loss_mlp": 1.02303791, "epoch": 0.3745566043407684, "flos": 15378601841280.0, "grad_norm": 1.8865316440154212, "language_loss": 0.81706363, "learning_rate": 2.878779900585314e-06, "loss": 0.83916384, "num_input_tokens_seen": 66903900, "step": 3115, "time_per_iteration": 3.2313897609710693 }, { "auxiliary_loss_clip": 0.01163875, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 1.0530107, "balance_loss_mlp": 1.02072275, "epoch": 0.37467684723140743, "flos": 24608218245120.0, "grad_norm": 1.5205094399668995, "language_loss": 0.75291908, "learning_rate": 2.8780800865000336e-06, "loss": 0.7748493, "num_input_tokens_seen": 66925210, "step": 3116, "time_per_iteration": 2.549765110015869 }, { "auxiliary_loss_clip": 0.01076843, "auxiliary_loss_mlp": 0.01006435, "balance_loss_clip": 1.01985288, "balance_loss_mlp": 1.00526702, "epoch": 0.37479709012204654, "flos": 64377491610240.0, "grad_norm": 0.975965565243997, "language_loss": 0.59196055, "learning_rate": 2.877380139210702e-06, "loss": 0.61279333, "num_input_tokens_seen": 66983880, "step": 3117, "time_per_iteration": 3.0146188735961914 }, { "auxiliary_loss_clip": 0.01146186, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.05080748, "balance_loss_mlp": 1.02374387, "epoch": 0.37491733301268565, "flos": 23803962773760.0, "grad_norm": 2.1955290934198333, "language_loss": 0.76614767, "learning_rate": 2.876680058823501e-06, "loss": 0.78793955, "num_input_tokens_seen": 67004280, "step": 3118, "time_per_iteration": 2.557805299758911 }, { "auxiliary_loss_clip": 0.01148362, "auxiliary_loss_mlp": 0.01030191, "balance_loss_clip": 1.04963684, "balance_loss_mlp": 1.02151227, "epoch": 0.3750375759033247, "flos": 32160950167680.0, "grad_norm": 1.6945757924703555, "language_loss": 0.6601609, "learning_rate": 2.8759798454446314e-06, "loss": 0.68194646, "num_input_tokens_seen": 67027445, "step": 3119, "time_per_iteration": 2.588473081588745 }, { "auxiliary_loss_clip": 0.0117607, "auxiliary_loss_mlp": 0.01038566, "balance_loss_clip": 1.0547415, "balance_loss_mlp": 1.0303998, "epoch": 0.3751578187939638, "flos": 23367791923200.0, "grad_norm": 2.007764817957837, "language_loss": 0.81639218, "learning_rate": 2.8752794991803173e-06, "loss": 0.83853859, "num_input_tokens_seen": 67045130, "step": 3120, "time_per_iteration": 2.4711804389953613 }, { "auxiliary_loss_clip": 0.01156865, "auxiliary_loss_mlp": 0.01032258, "balance_loss_clip": 1.05206013, "balance_loss_mlp": 1.02383029, "epoch": 0.37527806168460287, "flos": 14605731878400.0, "grad_norm": 2.614544852039984, "language_loss": 0.75237489, "learning_rate": 2.8745790201367976e-06, "loss": 0.77426612, "num_input_tokens_seen": 67060885, "step": 3121, "time_per_iteration": 2.4698715209960938 }, { "auxiliary_loss_clip": 0.01190986, "auxiliary_loss_mlp": 0.01036863, "balance_loss_clip": 1.05686259, "balance_loss_mlp": 1.02804184, "epoch": 0.375398304575242, "flos": 26390823431040.0, "grad_norm": 3.286901787291444, "language_loss": 0.84436589, "learning_rate": 2.8738784084203373e-06, "loss": 0.86664438, "num_input_tokens_seen": 67080960, "step": 3122, "time_per_iteration": 2.469667673110962 }, { "auxiliary_loss_clip": 0.01149316, "auxiliary_loss_mlp": 0.01026991, "balance_loss_clip": 1.04714763, "balance_loss_mlp": 1.01893294, "epoch": 0.3755185474658811, "flos": 22236605838720.0, "grad_norm": 1.6258532256173992, "language_loss": 0.78889155, "learning_rate": 2.873177664137216e-06, "loss": 0.81065464, "num_input_tokens_seen": 67101890, "step": 3123, "time_per_iteration": 2.5259437561035156 }, { "auxiliary_loss_clip": 0.01138532, "auxiliary_loss_mlp": 0.01024502, "balance_loss_clip": 1.05095363, "balance_loss_mlp": 1.01613986, "epoch": 0.37563879035652015, "flos": 30812935633920.0, "grad_norm": 1.700286020426204, "language_loss": 0.69339776, "learning_rate": 2.8724767873937384e-06, "loss": 0.71502805, "num_input_tokens_seen": 67126010, "step": 3124, "time_per_iteration": 2.6263017654418945 }, { "auxiliary_loss_clip": 0.01158443, "auxiliary_loss_mlp": 0.01034542, "balance_loss_clip": 1.0518086, "balance_loss_mlp": 1.02633464, "epoch": 0.37575903324715926, "flos": 20773533064320.0, "grad_norm": 2.5622354056585364, "language_loss": 0.87532628, "learning_rate": 2.871775778296225e-06, "loss": 0.89725614, "num_input_tokens_seen": 67143100, "step": 3125, "time_per_iteration": 2.4712038040161133 }, { "auxiliary_loss_clip": 0.01176876, "auxiliary_loss_mlp": 0.0103428, "balance_loss_clip": 1.05744195, "balance_loss_mlp": 1.02479053, "epoch": 0.37587927613779837, "flos": 18697681244160.0, "grad_norm": 1.9549841962074688, "language_loss": 0.78357667, "learning_rate": 2.8710746369510196e-06, "loss": 0.8056882, "num_input_tokens_seen": 67161085, "step": 3126, "time_per_iteration": 2.443674325942993 }, { "auxiliary_loss_clip": 0.01153252, "auxiliary_loss_mlp": 0.01029526, "balance_loss_clip": 1.05298221, "balance_loss_mlp": 1.02138352, "epoch": 0.3759995190284374, "flos": 13624796384640.0, "grad_norm": 2.392028777529309, "language_loss": 0.83229095, "learning_rate": 2.8703733634644846e-06, "loss": 0.8541187, "num_input_tokens_seen": 67175840, "step": 3127, "time_per_iteration": 2.449815273284912 }, { "auxiliary_loss_clip": 0.01184592, "auxiliary_loss_mlp": 0.01028502, "balance_loss_clip": 1.05540895, "balance_loss_mlp": 1.02022266, "epoch": 0.37611976191907653, "flos": 20484847457280.0, "grad_norm": 1.765232265266723, "language_loss": 0.79220593, "learning_rate": 2.869671957943002e-06, "loss": 0.8143369, "num_input_tokens_seen": 67194995, "step": 3128, "time_per_iteration": 2.4282548427581787 }, { "auxiliary_loss_clip": 0.0115593, "auxiliary_loss_mlp": 0.01028357, "balance_loss_clip": 1.05754912, "balance_loss_mlp": 1.01995301, "epoch": 0.37624000480971564, "flos": 21141797253120.0, "grad_norm": 1.7464556906682398, "language_loss": 0.73718297, "learning_rate": 2.8689704204929747e-06, "loss": 0.75902581, "num_input_tokens_seen": 67214175, "step": 3129, "time_per_iteration": 2.496959924697876 }, { "auxiliary_loss_clip": 0.01186897, "auxiliary_loss_mlp": 0.01030624, "balance_loss_clip": 1.05572617, "balance_loss_mlp": 1.02212489, "epoch": 0.3763602477003547, "flos": 22564470205440.0, "grad_norm": 1.723356042838573, "language_loss": 0.81053418, "learning_rate": 2.8682687512208253e-06, "loss": 0.83270943, "num_input_tokens_seen": 67233185, "step": 3130, "time_per_iteration": 2.434842348098755 }, { "auxiliary_loss_clip": 0.01179706, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.05539799, "balance_loss_mlp": 1.02483177, "epoch": 0.3764804905909938, "flos": 27526857851520.0, "grad_norm": 2.198284008165177, "language_loss": 0.80208343, "learning_rate": 2.8675669502329972e-06, "loss": 0.82421649, "num_input_tokens_seen": 67254715, "step": 3131, "time_per_iteration": 2.5020923614501953 }, { "auxiliary_loss_clip": 0.01175319, "auxiliary_loss_mlp": 0.00763399, "balance_loss_clip": 1.0556109, "balance_loss_mlp": 1.00060058, "epoch": 0.3766007334816329, "flos": 22528092706560.0, "grad_norm": 2.4549663549152756, "language_loss": 0.85911304, "learning_rate": 2.866865017635952e-06, "loss": 0.87850022, "num_input_tokens_seen": 67272535, "step": 3132, "time_per_iteration": 2.4791293144226074 }, { "auxiliary_loss_clip": 0.01144208, "auxiliary_loss_mlp": 0.01027583, "balance_loss_clip": 1.05422175, "balance_loss_mlp": 1.01894581, "epoch": 0.376720976372272, "flos": 25957166532480.0, "grad_norm": 1.5446627872810743, "language_loss": 0.79567385, "learning_rate": 2.866162953536174e-06, "loss": 0.81739175, "num_input_tokens_seen": 67293505, "step": 3133, "time_per_iteration": 2.5564582347869873 }, { "auxiliary_loss_clip": 0.01156655, "auxiliary_loss_mlp": 0.00763008, "balance_loss_clip": 1.05179453, "balance_loss_mlp": 1.00053525, "epoch": 0.3768412192629111, "flos": 18041162411520.0, "grad_norm": 1.6167623823912223, "language_loss": 0.75061142, "learning_rate": 2.8654607580401634e-06, "loss": 0.76980805, "num_input_tokens_seen": 67313240, "step": 3134, "time_per_iteration": 2.5095746517181396 }, { "auxiliary_loss_clip": 0.01074758, "auxiliary_loss_mlp": 0.01001733, "balance_loss_clip": 1.01861119, "balance_loss_mlp": 1.00059497, "epoch": 0.3769614621535502, "flos": 62989472304000.0, "grad_norm": 0.8811534456221024, "language_loss": 0.65200293, "learning_rate": 2.8647584312544446e-06, "loss": 0.67276788, "num_input_tokens_seen": 67378445, "step": 3135, "time_per_iteration": 3.070974588394165 }, { "auxiliary_loss_clip": 0.01138074, "auxiliary_loss_mlp": 0.00763079, "balance_loss_clip": 1.04827738, "balance_loss_mlp": 1.00057137, "epoch": 0.37708170504418925, "flos": 23661685002240.0, "grad_norm": 1.3930429172460972, "language_loss": 0.85039067, "learning_rate": 2.864055973285559e-06, "loss": 0.86940217, "num_input_tokens_seen": 67400445, "step": 3136, "time_per_iteration": 3.4620230197906494 }, { "auxiliary_loss_clip": 0.01148162, "auxiliary_loss_mlp": 0.01033728, "balance_loss_clip": 1.05047369, "balance_loss_mlp": 1.02497792, "epoch": 0.37720194793482836, "flos": 24423170353920.0, "grad_norm": 1.7110220162382623, "language_loss": 0.8608647, "learning_rate": 2.8633533842400698e-06, "loss": 0.88268363, "num_input_tokens_seen": 67420645, "step": 3137, "time_per_iteration": 2.5413384437561035 }, { "auxiliary_loss_clip": 0.01172878, "auxiliary_loss_mlp": 0.00763479, "balance_loss_clip": 1.05405211, "balance_loss_mlp": 1.00065172, "epoch": 0.3773221908254674, "flos": 20996502739200.0, "grad_norm": 1.8067888065358066, "language_loss": 0.77202499, "learning_rate": 2.862650664224558e-06, "loss": 0.79138851, "num_input_tokens_seen": 67439495, "step": 3138, "time_per_iteration": 2.474343776702881 }, { "auxiliary_loss_clip": 0.01171471, "auxiliary_loss_mlp": 0.01025909, "balance_loss_clip": 1.05762529, "balance_loss_mlp": 1.01851761, "epoch": 0.37744243371610653, "flos": 37631724958080.0, "grad_norm": 1.400180293354287, "language_loss": 0.6971063, "learning_rate": 2.861947813345627e-06, "loss": 0.71908009, "num_input_tokens_seen": 67462195, "step": 3139, "time_per_iteration": 2.6568715572357178 }, { "auxiliary_loss_clip": 0.0119042, "auxiliary_loss_mlp": 0.00763213, "balance_loss_clip": 1.05854702, "balance_loss_mlp": 1.000525, "epoch": 0.37756267660674564, "flos": 26140526484480.0, "grad_norm": 2.9436515015122033, "language_loss": 0.72500849, "learning_rate": 2.8612448317098974e-06, "loss": 0.74454486, "num_input_tokens_seen": 67482530, "step": 3140, "time_per_iteration": 3.3453752994537354 }, { "auxiliary_loss_clip": 0.01147114, "auxiliary_loss_mlp": 0.00763002, "balance_loss_clip": 1.05067492, "balance_loss_mlp": 1.00060177, "epoch": 0.3776829194973847, "flos": 19427888828160.0, "grad_norm": 2.1740787755318682, "language_loss": 0.83223599, "learning_rate": 2.8605417194240114e-06, "loss": 0.85133713, "num_input_tokens_seen": 67500890, "step": 3141, "time_per_iteration": 2.5429534912109375 }, { "auxiliary_loss_clip": 0.01164911, "auxiliary_loss_mlp": 0.01027254, "balance_loss_clip": 1.05115652, "balance_loss_mlp": 1.01951098, "epoch": 0.3778031623880238, "flos": 17382309194880.0, "grad_norm": 2.9788640345191495, "language_loss": 0.78936994, "learning_rate": 2.8598384765946315e-06, "loss": 0.81129164, "num_input_tokens_seen": 67519545, "step": 3142, "time_per_iteration": 3.2364134788513184 }, { "auxiliary_loss_clip": 0.01185699, "auxiliary_loss_mlp": 0.0102849, "balance_loss_clip": 1.05346358, "balance_loss_mlp": 1.02037144, "epoch": 0.3779234052786629, "flos": 27125843437440.0, "grad_norm": 1.8769201854494677, "language_loss": 0.7180388, "learning_rate": 2.8591351033284377e-06, "loss": 0.74018073, "num_input_tokens_seen": 67539275, "step": 3143, "time_per_iteration": 3.2351582050323486 }, { "auxiliary_loss_clip": 0.011739, "auxiliary_loss_mlp": 0.01026651, "balance_loss_clip": 1.05205965, "balance_loss_mlp": 1.01873887, "epoch": 0.37804364816930197, "flos": 19682639061120.0, "grad_norm": 2.015355793944523, "language_loss": 0.83983552, "learning_rate": 2.8584315997321325e-06, "loss": 0.86184108, "num_input_tokens_seen": 67558280, "step": 3144, "time_per_iteration": 2.50832462310791 }, { "auxiliary_loss_clip": 0.01186059, "auxiliary_loss_mlp": 0.0102712, "balance_loss_clip": 1.05478227, "balance_loss_mlp": 1.01867449, "epoch": 0.3781638910599411, "flos": 22702905221760.0, "grad_norm": 4.6456889317445595, "language_loss": 0.780828, "learning_rate": 2.8577279659124356e-06, "loss": 0.8029598, "num_input_tokens_seen": 67575955, "step": 3145, "time_per_iteration": 2.43192458152771 }, { "auxiliary_loss_clip": 0.0116686, "auxiliary_loss_mlp": 0.01026431, "balance_loss_clip": 1.0515486, "balance_loss_mlp": 1.01945472, "epoch": 0.3782841339505802, "flos": 14647604158080.0, "grad_norm": 1.746516935686437, "language_loss": 0.83336186, "learning_rate": 2.857024201976089e-06, "loss": 0.85529482, "num_input_tokens_seen": 67593515, "step": 3146, "time_per_iteration": 2.4630327224731445 }, { "auxiliary_loss_clip": 0.01155457, "auxiliary_loss_mlp": 0.01026075, "balance_loss_clip": 1.05408263, "balance_loss_mlp": 1.01752472, "epoch": 0.37840437684121925, "flos": 32818223185920.0, "grad_norm": 1.8728539770349364, "language_loss": 0.73618233, "learning_rate": 2.8563203080298516e-06, "loss": 0.75799763, "num_input_tokens_seen": 67614290, "step": 3147, "time_per_iteration": 2.5908761024475098 }, { "auxiliary_loss_clip": 0.01157094, "auxiliary_loss_mlp": 0.00763172, "balance_loss_clip": 1.05285096, "balance_loss_mlp": 1.00055099, "epoch": 0.37852461973185836, "flos": 18369206346240.0, "grad_norm": 2.079168619004708, "language_loss": 0.89210075, "learning_rate": 2.855616284180505e-06, "loss": 0.9113034, "num_input_tokens_seen": 67631340, "step": 3148, "time_per_iteration": 2.492553472518921 }, { "auxiliary_loss_clip": 0.01077776, "auxiliary_loss_mlp": 0.01002141, "balance_loss_clip": 1.01902878, "balance_loss_mlp": 1.0008595, "epoch": 0.37864486262249747, "flos": 59500680117120.0, "grad_norm": 0.8727031367201991, "language_loss": 0.66163015, "learning_rate": 2.8549121305348477e-06, "loss": 0.68242937, "num_input_tokens_seen": 67691125, "step": 3149, "time_per_iteration": 3.014672040939331 }, { "auxiliary_loss_clip": 0.01171632, "auxiliary_loss_mlp": 0.01030233, "balance_loss_clip": 1.05333805, "balance_loss_mlp": 1.02304149, "epoch": 0.3787651055131365, "flos": 23363015414400.0, "grad_norm": 2.0741758902264285, "language_loss": 0.83218241, "learning_rate": 2.8542078471997006e-06, "loss": 0.85420108, "num_input_tokens_seen": 67708740, "step": 3150, "time_per_iteration": 2.4627511501312256 }, { "auxiliary_loss_clip": 0.01170095, "auxiliary_loss_mlp": 0.01024641, "balance_loss_clip": 1.05196786, "balance_loss_mlp": 1.01736963, "epoch": 0.37888534840377563, "flos": 24601394661120.0, "grad_norm": 1.6798829687995607, "language_loss": 0.75739157, "learning_rate": 2.8535034342819013e-06, "loss": 0.77933896, "num_input_tokens_seen": 67726150, "step": 3151, "time_per_iteration": 2.4999353885650635 }, { "auxiliary_loss_clip": 0.01180723, "auxiliary_loss_mlp": 0.01031003, "balance_loss_clip": 1.0526576, "balance_loss_mlp": 1.0228374, "epoch": 0.37900559129441475, "flos": 23986891762560.0, "grad_norm": 1.8985002836540235, "language_loss": 0.72623253, "learning_rate": 2.85279889188831e-06, "loss": 0.74834985, "num_input_tokens_seen": 67746525, "step": 3152, "time_per_iteration": 2.4679253101348877 }, { "auxiliary_loss_clip": 0.01140687, "auxiliary_loss_mlp": 0.01025549, "balance_loss_clip": 1.04550624, "balance_loss_mlp": 1.01670969, "epoch": 0.3791258341850538, "flos": 24644667571200.0, "grad_norm": 1.8830747005249437, "language_loss": 0.81117892, "learning_rate": 2.852094220125805e-06, "loss": 0.83284134, "num_input_tokens_seen": 67766035, "step": 3153, "time_per_iteration": 2.553452968597412 }, { "auxiliary_loss_clip": 0.01172363, "auxiliary_loss_mlp": 0.01036381, "balance_loss_clip": 1.05350137, "balance_loss_mlp": 1.02798831, "epoch": 0.3792460770756929, "flos": 17420841509760.0, "grad_norm": 2.011030628955631, "language_loss": 0.71175748, "learning_rate": 2.8513894191012846e-06, "loss": 0.73384488, "num_input_tokens_seen": 67785015, "step": 3154, "time_per_iteration": 2.4432120323181152 }, { "auxiliary_loss_clip": 0.01185751, "auxiliary_loss_mlp": 0.01029871, "balance_loss_clip": 1.05495906, "balance_loss_mlp": 1.02147841, "epoch": 0.37936631996633197, "flos": 24206557386240.0, "grad_norm": 1.4748569945703571, "language_loss": 0.78986973, "learning_rate": 2.8506844889216664e-06, "loss": 0.8120259, "num_input_tokens_seen": 67804400, "step": 3155, "time_per_iteration": 2.451220989227295 }, { "auxiliary_loss_clip": 0.0107217, "auxiliary_loss_mlp": 0.01003007, "balance_loss_clip": 1.01990426, "balance_loss_mlp": 1.00193441, "epoch": 0.3794865628569711, "flos": 70297114752000.0, "grad_norm": 0.8607819496203047, "language_loss": 0.62838602, "learning_rate": 2.849979429693887e-06, "loss": 0.64913774, "num_input_tokens_seen": 67865385, "step": 3156, "time_per_iteration": 3.119034767150879 }, { "auxiliary_loss_clip": 0.01181803, "auxiliary_loss_mlp": 0.01027025, "balance_loss_clip": 1.05283451, "balance_loss_mlp": 1.01924086, "epoch": 0.3796068057476102, "flos": 15779364860160.0, "grad_norm": 2.0084870282797436, "language_loss": 0.74210215, "learning_rate": 2.8492742415249042e-06, "loss": 0.76419044, "num_input_tokens_seen": 67883030, "step": 3157, "time_per_iteration": 2.5124590396881104 }, { "auxiliary_loss_clip": 0.01181307, "auxiliary_loss_mlp": 0.01024898, "balance_loss_clip": 1.05115533, "balance_loss_mlp": 1.01722717, "epoch": 0.37972704863824924, "flos": 25191694771200.0, "grad_norm": 1.6691102650817446, "language_loss": 0.76180249, "learning_rate": 2.848568924521694e-06, "loss": 0.78386456, "num_input_tokens_seen": 67903810, "step": 3158, "time_per_iteration": 2.460641860961914 }, { "auxiliary_loss_clip": 0.01161962, "auxiliary_loss_mlp": 0.01024617, "balance_loss_clip": 1.04823327, "balance_loss_mlp": 1.01620042, "epoch": 0.37984729152888835, "flos": 26210372480640.0, "grad_norm": 1.7492059876551591, "language_loss": 0.73334646, "learning_rate": 2.8478634787912526e-06, "loss": 0.75521225, "num_input_tokens_seen": 67921865, "step": 3159, "time_per_iteration": 2.494974136352539 }, { "auxiliary_loss_clip": 0.01170244, "auxiliary_loss_mlp": 0.0102926, "balance_loss_clip": 1.05121291, "balance_loss_mlp": 1.02118349, "epoch": 0.37996753441952746, "flos": 25629302165760.0, "grad_norm": 1.9521277796201606, "language_loss": 0.76368785, "learning_rate": 2.847157904440596e-06, "loss": 0.78568292, "num_input_tokens_seen": 67941595, "step": 3160, "time_per_iteration": 2.488924503326416 }, { "auxiliary_loss_clip": 0.01168733, "auxiliary_loss_mlp": 0.01028002, "balance_loss_clip": 1.05106163, "balance_loss_mlp": 1.0202713, "epoch": 0.3800877773101665, "flos": 20118414862080.0, "grad_norm": 1.5681746370415068, "language_loss": 0.73709714, "learning_rate": 2.846452201576759e-06, "loss": 0.75906456, "num_input_tokens_seen": 67960970, "step": 3161, "time_per_iteration": 2.504160165786743 }, { "auxiliary_loss_clip": 0.01066212, "auxiliary_loss_mlp": 0.01001792, "balance_loss_clip": 1.01723588, "balance_loss_mlp": 1.00053406, "epoch": 0.38020802020080563, "flos": 63053608037760.0, "grad_norm": 0.8522443023093426, "language_loss": 0.62794399, "learning_rate": 2.845746370306795e-06, "loss": 0.64862406, "num_input_tokens_seen": 68026160, "step": 3162, "time_per_iteration": 3.1608963012695312 }, { "auxiliary_loss_clip": 0.01170603, "auxiliary_loss_mlp": 0.01028638, "balance_loss_clip": 1.05271387, "balance_loss_mlp": 1.02056789, "epoch": 0.38032826309144474, "flos": 21288420570240.0, "grad_norm": 2.0289624462301235, "language_loss": 0.78352225, "learning_rate": 2.84504041073778e-06, "loss": 0.80551469, "num_input_tokens_seen": 68044575, "step": 3163, "time_per_iteration": 3.2978944778442383 }, { "auxiliary_loss_clip": 0.01149744, "auxiliary_loss_mlp": 0.01035938, "balance_loss_clip": 1.05061769, "balance_loss_mlp": 1.02728987, "epoch": 0.3804485059820838, "flos": 18954119416320.0, "grad_norm": 1.68518794615801, "language_loss": 0.78961897, "learning_rate": 2.844334322976806e-06, "loss": 0.81147581, "num_input_tokens_seen": 68064790, "step": 3164, "time_per_iteration": 2.4990808963775635 }, { "auxiliary_loss_clip": 0.01129429, "auxiliary_loss_mlp": 0.01035414, "balance_loss_clip": 1.04753578, "balance_loss_mlp": 1.02714658, "epoch": 0.3805687488727229, "flos": 21833759831040.0, "grad_norm": 1.7590650241902899, "language_loss": 0.8353042, "learning_rate": 2.8436281071309866e-06, "loss": 0.85695261, "num_input_tokens_seen": 68083330, "step": 3165, "time_per_iteration": 2.578946352005005 }, { "auxiliary_loss_clip": 0.01043514, "auxiliary_loss_mlp": 0.01003189, "balance_loss_clip": 1.01465905, "balance_loss_mlp": 1.00191975, "epoch": 0.380688991763362, "flos": 58546209968640.0, "grad_norm": 0.7280922055584209, "language_loss": 0.5307098, "learning_rate": 2.842921763307455e-06, "loss": 0.55117679, "num_input_tokens_seen": 68146140, "step": 3166, "time_per_iteration": 3.1237878799438477 }, { "auxiliary_loss_clip": 0.01147546, "auxiliary_loss_mlp": 0.01026673, "balance_loss_clip": 1.048563, "balance_loss_mlp": 1.01918101, "epoch": 0.38080923465400107, "flos": 23799509487360.0, "grad_norm": 1.76003533843277, "language_loss": 0.82244015, "learning_rate": 2.842215291613361e-06, "loss": 0.84418237, "num_input_tokens_seen": 68164520, "step": 3167, "time_per_iteration": 3.350708484649658 }, { "auxiliary_loss_clip": 0.01008367, "auxiliary_loss_mlp": 0.01000715, "balance_loss_clip": 1.01497889, "balance_loss_mlp": 0.99937433, "epoch": 0.3809294775446402, "flos": 54969866380800.0, "grad_norm": 0.8370077518291396, "language_loss": 0.59312236, "learning_rate": 2.8415086921558774e-06, "loss": 0.61321318, "num_input_tokens_seen": 68227945, "step": 3168, "time_per_iteration": 3.525718927383423 }, { "auxiliary_loss_clip": 0.01137842, "auxiliary_loss_mlp": 0.01025148, "balance_loss_clip": 1.04266143, "balance_loss_mlp": 1.01776886, "epoch": 0.38104972043527924, "flos": 24643697904000.0, "grad_norm": 1.6110889269009732, "language_loss": 0.78616571, "learning_rate": 2.840801965042194e-06, "loss": 0.80779564, "num_input_tokens_seen": 68247405, "step": 3169, "time_per_iteration": 4.051271438598633 }, { "auxiliary_loss_clip": 0.01145332, "auxiliary_loss_mlp": 0.01025198, "balance_loss_clip": 1.04595494, "balance_loss_mlp": 1.01638222, "epoch": 0.38116996332591835, "flos": 22856783086080.0, "grad_norm": 1.791710454339517, "language_loss": 0.83780932, "learning_rate": 2.840095110379521e-06, "loss": 0.85951465, "num_input_tokens_seen": 68266925, "step": 3170, "time_per_iteration": 2.502591848373413 }, { "auxiliary_loss_clip": 0.01039021, "auxiliary_loss_mlp": 0.01001923, "balance_loss_clip": 1.01768005, "balance_loss_mlp": 1.00064707, "epoch": 0.38129020621655746, "flos": 60836160804480.0, "grad_norm": 0.7370650773994835, "language_loss": 0.53917062, "learning_rate": 2.8393881282750884e-06, "loss": 0.55958009, "num_input_tokens_seen": 68329755, "step": 3171, "time_per_iteration": 3.2006514072418213 }, { "auxiliary_loss_clip": 0.01152919, "auxiliary_loss_mlp": 0.01026307, "balance_loss_clip": 1.05122185, "balance_loss_mlp": 1.01796198, "epoch": 0.3814104491071965, "flos": 21648101408640.0, "grad_norm": 2.1167779888947877, "language_loss": 0.78833842, "learning_rate": 2.838681018836144e-06, "loss": 0.8101306, "num_input_tokens_seen": 68347075, "step": 3172, "time_per_iteration": 2.709188938140869 }, { "auxiliary_loss_clip": 0.01140165, "auxiliary_loss_mlp": 0.00762271, "balance_loss_clip": 1.04590583, "balance_loss_mlp": 1.00048661, "epoch": 0.3815306919978356, "flos": 19099090707840.0, "grad_norm": 3.1572789620936867, "language_loss": 0.78345037, "learning_rate": 2.837973782169955e-06, "loss": 0.80247474, "num_input_tokens_seen": 68365450, "step": 3173, "time_per_iteration": 2.5054147243499756 }, { "auxiliary_loss_clip": 0.01083746, "auxiliary_loss_mlp": 0.01003965, "balance_loss_clip": 1.01695681, "balance_loss_mlp": 1.00276124, "epoch": 0.38165093488847474, "flos": 67067918156160.0, "grad_norm": 0.8089844439337767, "language_loss": 0.59189057, "learning_rate": 2.8372664183838096e-06, "loss": 0.6127677, "num_input_tokens_seen": 68428470, "step": 3174, "time_per_iteration": 3.059325695037842 }, { "auxiliary_loss_clip": 0.01181734, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 1.05307579, "balance_loss_mlp": 1.02084374, "epoch": 0.3817711777791138, "flos": 22341105480960.0, "grad_norm": 2.2264495596923406, "language_loss": 0.68303359, "learning_rate": 2.836558927585015e-06, "loss": 0.70513928, "num_input_tokens_seen": 68445440, "step": 3175, "time_per_iteration": 2.4270942211151123 }, { "auxiliary_loss_clip": 0.01171557, "auxiliary_loss_mlp": 0.01032745, "balance_loss_clip": 1.05329132, "balance_loss_mlp": 1.02558327, "epoch": 0.3818914206697529, "flos": 22820621068800.0, "grad_norm": 1.8640719254165987, "language_loss": 0.82636273, "learning_rate": 2.8358513098808957e-06, "loss": 0.84840578, "num_input_tokens_seen": 68465755, "step": 3176, "time_per_iteration": 2.477747917175293 }, { "auxiliary_loss_clip": 0.01117358, "auxiliary_loss_mlp": 0.010266, "balance_loss_clip": 1.04605341, "balance_loss_mlp": 1.01851141, "epoch": 0.382011663560392, "flos": 24386074583040.0, "grad_norm": 1.7021520832148063, "language_loss": 0.76432312, "learning_rate": 2.835143565378798e-06, "loss": 0.78576267, "num_input_tokens_seen": 68486220, "step": 3177, "time_per_iteration": 2.5898873805999756 }, { "auxiliary_loss_clip": 0.01109025, "auxiliary_loss_mlp": 0.01021945, "balance_loss_clip": 1.04546857, "balance_loss_mlp": 1.01432776, "epoch": 0.38213190645103107, "flos": 21981568296960.0, "grad_norm": 2.1506874675296968, "language_loss": 0.78206319, "learning_rate": 2.8344356941860847e-06, "loss": 0.80337286, "num_input_tokens_seen": 68505850, "step": 3178, "time_per_iteration": 2.603647232055664 }, { "auxiliary_loss_clip": 0.01137521, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.05007863, "balance_loss_mlp": 1.02045584, "epoch": 0.3822521493416702, "flos": 35516945773440.0, "grad_norm": 2.206334388246954, "language_loss": 0.66234505, "learning_rate": 2.8337276964101403e-06, "loss": 0.6840027, "num_input_tokens_seen": 68526290, "step": 3179, "time_per_iteration": 2.696027994155884 }, { "auxiliary_loss_clip": 0.01169485, "auxiliary_loss_mlp": 0.01027547, "balance_loss_clip": 1.05123329, "balance_loss_mlp": 1.02000737, "epoch": 0.3823723922323093, "flos": 21069904181760.0, "grad_norm": 4.254980769317907, "language_loss": 0.7682572, "learning_rate": 2.833019572158367e-06, "loss": 0.79022747, "num_input_tokens_seen": 68544725, "step": 3180, "time_per_iteration": 2.488067388534546 }, { "auxiliary_loss_clip": 0.0115579, "auxiliary_loss_mlp": 0.01027538, "balance_loss_clip": 1.05235767, "balance_loss_mlp": 1.0198189, "epoch": 0.38249263512294834, "flos": 19789149864960.0, "grad_norm": 1.9127208185256865, "language_loss": 0.79979289, "learning_rate": 2.8323113215381872e-06, "loss": 0.82162619, "num_input_tokens_seen": 68563070, "step": 3181, "time_per_iteration": 2.5239553451538086 }, { "auxiliary_loss_clip": 0.01135962, "auxiliary_loss_mlp": 0.01031835, "balance_loss_clip": 1.04739428, "balance_loss_mlp": 1.02378821, "epoch": 0.38261287801358745, "flos": 21433930565760.0, "grad_norm": 3.3638053866364523, "language_loss": 0.76438922, "learning_rate": 2.831602944657042e-06, "loss": 0.78606719, "num_input_tokens_seen": 68581150, "step": 3182, "time_per_iteration": 2.53416109085083 }, { "auxiliary_loss_clip": 0.01161672, "auxiliary_loss_mlp": 0.01022724, "balance_loss_clip": 1.0515033, "balance_loss_mlp": 1.01535058, "epoch": 0.38273312090422656, "flos": 21981568296960.0, "grad_norm": 2.4542562068155216, "language_loss": 0.7424916, "learning_rate": 2.830894441622391e-06, "loss": 0.76433551, "num_input_tokens_seen": 68597800, "step": 3183, "time_per_iteration": 2.496521472930908 }, { "auxiliary_loss_clip": 0.01137753, "auxiliary_loss_mlp": 0.00762416, "balance_loss_clip": 1.04529953, "balance_loss_mlp": 1.00051236, "epoch": 0.3828533637948656, "flos": 24790895838720.0, "grad_norm": 2.063231703597195, "language_loss": 0.80283439, "learning_rate": 2.8301858125417134e-06, "loss": 0.82183611, "num_input_tokens_seen": 68617640, "step": 3184, "time_per_iteration": 2.565103769302368 }, { "auxiliary_loss_clip": 0.01155518, "auxiliary_loss_mlp": 0.01025726, "balance_loss_clip": 1.05390048, "balance_loss_mlp": 1.01862717, "epoch": 0.38297360668550473, "flos": 22455445449600.0, "grad_norm": 1.7793085337085333, "language_loss": 0.73788238, "learning_rate": 2.8294770575225082e-06, "loss": 0.75969481, "num_input_tokens_seen": 68637770, "step": 3185, "time_per_iteration": 2.5026817321777344 }, { "auxiliary_loss_clip": 0.01171711, "auxiliary_loss_mlp": 0.01030731, "balance_loss_clip": 1.05589783, "balance_loss_mlp": 1.02291048, "epoch": 0.3830938495761438, "flos": 24896903852160.0, "grad_norm": 1.8995471365824692, "language_loss": 0.84415257, "learning_rate": 2.828768176672293e-06, "loss": 0.86617702, "num_input_tokens_seen": 68656885, "step": 3186, "time_per_iteration": 2.499512195587158 }, { "auxiliary_loss_clip": 0.01137167, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.0474906, "balance_loss_mlp": 1.01999009, "epoch": 0.3832140924667829, "flos": 33036236784000.0, "grad_norm": 1.7281969888694617, "language_loss": 0.71628308, "learning_rate": 2.8280591700986044e-06, "loss": 0.73793435, "num_input_tokens_seen": 68678750, "step": 3187, "time_per_iteration": 2.6123199462890625 }, { "auxiliary_loss_clip": 0.01156838, "auxiliary_loss_mlp": 0.01030437, "balance_loss_clip": 1.04954779, "balance_loss_mlp": 1.02254581, "epoch": 0.383334335357422, "flos": 31903721896320.0, "grad_norm": 2.1807006866721266, "language_loss": 0.75257981, "learning_rate": 2.827350037908999e-06, "loss": 0.77445263, "num_input_tokens_seen": 68698190, "step": 3188, "time_per_iteration": 2.575436592102051 }, { "auxiliary_loss_clip": 0.01144218, "auxiliary_loss_mlp": 0.01031584, "balance_loss_clip": 1.04833341, "balance_loss_mlp": 1.02313757, "epoch": 0.38345457824806106, "flos": 19791915212160.0, "grad_norm": 2.4878051564816346, "language_loss": 0.7945323, "learning_rate": 2.8266407802110496e-06, "loss": 0.81629032, "num_input_tokens_seen": 68716445, "step": 3189, "time_per_iteration": 3.3202626705169678 }, { "auxiliary_loss_clip": 0.01102949, "auxiliary_loss_mlp": 0.01027768, "balance_loss_clip": 1.04342628, "balance_loss_mlp": 1.01929879, "epoch": 0.3835748211387002, "flos": 22419391173120.0, "grad_norm": 1.6348695623562643, "language_loss": 0.75552332, "learning_rate": 2.8259313971123515e-06, "loss": 0.77683049, "num_input_tokens_seen": 68737565, "step": 3190, "time_per_iteration": 2.631824493408203 }, { "auxiliary_loss_clip": 0.01165465, "auxiliary_loss_mlp": 0.01026559, "balance_loss_clip": 1.05310309, "balance_loss_mlp": 1.01922727, "epoch": 0.3836950640293393, "flos": 25118436983040.0, "grad_norm": 1.4647485065380128, "language_loss": 0.78014088, "learning_rate": 2.8252218887205166e-06, "loss": 0.80206108, "num_input_tokens_seen": 68758255, "step": 3191, "time_per_iteration": 2.5090203285217285 }, { "auxiliary_loss_clip": 0.01113985, "auxiliary_loss_mlp": 0.01027931, "balance_loss_clip": 1.04734313, "balance_loss_mlp": 1.02055228, "epoch": 0.38381530691997834, "flos": 21799213925760.0, "grad_norm": 1.7546562748941634, "language_loss": 0.8057124, "learning_rate": 2.824512255143178e-06, "loss": 0.82713151, "num_input_tokens_seen": 68777490, "step": 3192, "time_per_iteration": 2.5692050457000732 }, { "auxiliary_loss_clip": 0.011409, "auxiliary_loss_mlp": 0.01022626, "balance_loss_clip": 1.04771256, "balance_loss_mlp": 1.01530337, "epoch": 0.38393554981061745, "flos": 21252689516160.0, "grad_norm": 1.5942724656274099, "language_loss": 0.79068297, "learning_rate": 2.8238024964879855e-06, "loss": 0.81231821, "num_input_tokens_seen": 68798385, "step": 3193, "time_per_iteration": 3.372929811477661 }, { "auxiliary_loss_clip": 0.01184641, "auxiliary_loss_mlp": 0.01028911, "balance_loss_clip": 1.05431449, "balance_loss_mlp": 1.02031624, "epoch": 0.38405579270125656, "flos": 17019360218880.0, "grad_norm": 2.2270292550574275, "language_loss": 0.76967895, "learning_rate": 2.8230926128626095e-06, "loss": 0.79181457, "num_input_tokens_seen": 68816880, "step": 3194, "time_per_iteration": 2.4135050773620605 }, { "auxiliary_loss_clip": 0.01150216, "auxiliary_loss_mlp": 0.01030705, "balance_loss_clip": 1.04955673, "balance_loss_mlp": 1.02208662, "epoch": 0.3841760355918956, "flos": 21835375943040.0, "grad_norm": 1.9727650672589567, "language_loss": 0.79329073, "learning_rate": 2.822382604374738e-06, "loss": 0.8150999, "num_input_tokens_seen": 68835805, "step": 3195, "time_per_iteration": 2.4910168647766113 }, { "auxiliary_loss_clip": 0.01155221, "auxiliary_loss_mlp": 0.01033727, "balance_loss_clip": 1.05323637, "balance_loss_mlp": 1.02565098, "epoch": 0.3842962784825347, "flos": 25915114684800.0, "grad_norm": 2.0532166436378527, "language_loss": 0.65856123, "learning_rate": 2.8216724711320793e-06, "loss": 0.68045068, "num_input_tokens_seen": 68854930, "step": 3196, "time_per_iteration": 3.3022334575653076 }, { "auxiliary_loss_clip": 0.01181011, "auxiliary_loss_mlp": 0.00762096, "balance_loss_clip": 1.05351889, "balance_loss_mlp": 1.0004859, "epoch": 0.38441652137317384, "flos": 25337492075520.0, "grad_norm": 1.47877781050593, "language_loss": 0.79610753, "learning_rate": 2.820962213242361e-06, "loss": 0.81553864, "num_input_tokens_seen": 68874260, "step": 3197, "time_per_iteration": 2.4673094749450684 }, { "auxiliary_loss_clip": 0.01165806, "auxiliary_loss_mlp": 0.01030596, "balance_loss_clip": 1.05499542, "balance_loss_mlp": 1.02281749, "epoch": 0.3845367642638129, "flos": 18113486446080.0, "grad_norm": 2.0839808971588023, "language_loss": 0.84196889, "learning_rate": 2.8202518308133264e-06, "loss": 0.86393297, "num_input_tokens_seen": 68891535, "step": 3198, "time_per_iteration": 2.4500091075897217 }, { "auxiliary_loss_clip": 0.01183405, "auxiliary_loss_mlp": 0.010283, "balance_loss_clip": 1.05307496, "balance_loss_mlp": 1.01988339, "epoch": 0.384657007154452, "flos": 25228395492480.0, "grad_norm": 1.7723069316482476, "language_loss": 0.73268163, "learning_rate": 2.8195413239527426e-06, "loss": 0.75479865, "num_input_tokens_seen": 68911275, "step": 3199, "time_per_iteration": 2.4608614444732666 }, { "auxiliary_loss_clip": 0.01165938, "auxiliary_loss_mlp": 0.01025012, "balance_loss_clip": 1.05086684, "balance_loss_mlp": 1.01728714, "epoch": 0.38477725004509106, "flos": 19865855358720.0, "grad_norm": 2.984715420168236, "language_loss": 0.80649495, "learning_rate": 2.8188306927683906e-06, "loss": 0.82840443, "num_input_tokens_seen": 68930745, "step": 3200, "time_per_iteration": 2.479733467102051 }, { "auxiliary_loss_clip": 0.01157421, "auxiliary_loss_mlp": 0.01026386, "balance_loss_clip": 1.05219245, "balance_loss_mlp": 1.01882255, "epoch": 0.38489749293573017, "flos": 18259391491200.0, "grad_norm": 2.0538072666640517, "language_loss": 0.74708247, "learning_rate": 2.818119937368074e-06, "loss": 0.76892054, "num_input_tokens_seen": 68949380, "step": 3201, "time_per_iteration": 2.48122501373291 }, { "auxiliary_loss_clip": 0.01174571, "auxiliary_loss_mlp": 0.01026306, "balance_loss_clip": 1.05240822, "balance_loss_mlp": 1.01789594, "epoch": 0.3850177358263693, "flos": 24389163152640.0, "grad_norm": 1.7977164773743752, "language_loss": 0.6552316, "learning_rate": 2.817409057859613e-06, "loss": 0.67724037, "num_input_tokens_seen": 68968370, "step": 3202, "time_per_iteration": 2.495089530944824 }, { "auxiliary_loss_clip": 0.01119938, "auxiliary_loss_mlp": 0.01029223, "balance_loss_clip": 1.04628098, "balance_loss_mlp": 1.02043748, "epoch": 0.38513797871700833, "flos": 17671533505920.0, "grad_norm": 1.8553361074795838, "language_loss": 0.79098183, "learning_rate": 2.8166980543508482e-06, "loss": 0.81247342, "num_input_tokens_seen": 68984260, "step": 3203, "time_per_iteration": 2.5306320190429688 }, { "auxiliary_loss_clip": 0.01184912, "auxiliary_loss_mlp": 0.01027669, "balance_loss_clip": 1.05515623, "balance_loss_mlp": 1.0195688, "epoch": 0.38525822160764744, "flos": 25739583897600.0, "grad_norm": 2.2440143030083197, "language_loss": 0.79745102, "learning_rate": 2.815986926949638e-06, "loss": 0.81957686, "num_input_tokens_seen": 69002760, "step": 3204, "time_per_iteration": 2.4764270782470703 }, { "auxiliary_loss_clip": 0.01169411, "auxiliary_loss_mlp": 0.0102993, "balance_loss_clip": 1.05345321, "balance_loss_mlp": 1.02224088, "epoch": 0.38537846449828655, "flos": 20193647898240.0, "grad_norm": 1.9366310159940756, "language_loss": 0.80485642, "learning_rate": 2.8152756757638597e-06, "loss": 0.82684982, "num_input_tokens_seen": 69021260, "step": 3205, "time_per_iteration": 2.525256633758545 }, { "auxiliary_loss_clip": 0.01168349, "auxiliary_loss_mlp": 0.01025131, "balance_loss_clip": 1.05362892, "balance_loss_mlp": 1.01730502, "epoch": 0.3854987073889256, "flos": 23039352938880.0, "grad_norm": 1.957318819849449, "language_loss": 0.84805369, "learning_rate": 2.8145643009014093e-06, "loss": 0.86998856, "num_input_tokens_seen": 69039755, "step": 3206, "time_per_iteration": 2.469200611114502 }, { "auxiliary_loss_clip": 0.01169757, "auxiliary_loss_mlp": 0.01027836, "balance_loss_clip": 1.05223334, "balance_loss_mlp": 1.02062714, "epoch": 0.3856189502795647, "flos": 20190631155840.0, "grad_norm": 1.8727171650986616, "language_loss": 0.79482269, "learning_rate": 2.813852802470202e-06, "loss": 0.81679857, "num_input_tokens_seen": 69057650, "step": 3207, "time_per_iteration": 2.4493305683135986 }, { "auxiliary_loss_clip": 0.01149093, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 1.04925859, "balance_loss_mlp": 1.01988482, "epoch": 0.38573919317020383, "flos": 25702631781120.0, "grad_norm": 1.8564654416961834, "language_loss": 0.72477341, "learning_rate": 2.8131411805781717e-06, "loss": 0.74654734, "num_input_tokens_seen": 69077775, "step": 3208, "time_per_iteration": 2.52998685836792 }, { "auxiliary_loss_clip": 0.0115728, "auxiliary_loss_mlp": 0.0103279, "balance_loss_clip": 1.05320108, "balance_loss_mlp": 1.02411771, "epoch": 0.3858594360608429, "flos": 29821405628160.0, "grad_norm": 2.4154988743530725, "language_loss": 0.64251673, "learning_rate": 2.8124294353332707e-06, "loss": 0.66441751, "num_input_tokens_seen": 69096450, "step": 3209, "time_per_iteration": 2.5405120849609375 }, { "auxiliary_loss_clip": 0.01147448, "auxiliary_loss_mlp": 0.01029025, "balance_loss_clip": 1.05024803, "balance_loss_mlp": 1.02114272, "epoch": 0.385979678951482, "flos": 24790428961920.0, "grad_norm": 2.0734802543577624, "language_loss": 0.77002323, "learning_rate": 2.8117175668434713e-06, "loss": 0.79178798, "num_input_tokens_seen": 69116110, "step": 3210, "time_per_iteration": 2.569413185119629 }, { "auxiliary_loss_clip": 0.0118342, "auxiliary_loss_mlp": 0.01025331, "balance_loss_clip": 1.05286443, "balance_loss_mlp": 1.01746917, "epoch": 0.3860999218421211, "flos": 21287881866240.0, "grad_norm": 2.156595512260456, "language_loss": 0.70295525, "learning_rate": 2.811005575216762e-06, "loss": 0.72504276, "num_input_tokens_seen": 69134825, "step": 3211, "time_per_iteration": 2.426785469055176 }, { "auxiliary_loss_clip": 0.01136072, "auxiliary_loss_mlp": 0.01029479, "balance_loss_clip": 1.04940605, "balance_loss_mlp": 1.02164114, "epoch": 0.38622016473276016, "flos": 24536720223360.0, "grad_norm": 1.4879811042686577, "language_loss": 0.78962409, "learning_rate": 2.8102934605611513e-06, "loss": 0.81127954, "num_input_tokens_seen": 69156460, "step": 3212, "time_per_iteration": 2.630581855773926 }, { "auxiliary_loss_clip": 0.01162519, "auxiliary_loss_mlp": 0.01027802, "balance_loss_clip": 1.05406499, "balance_loss_mlp": 1.02010679, "epoch": 0.3863404076233993, "flos": 20558212986240.0, "grad_norm": 1.9334712343041998, "language_loss": 0.67580163, "learning_rate": 2.8095812229846665e-06, "loss": 0.69770491, "num_input_tokens_seen": 69176420, "step": 3213, "time_per_iteration": 2.4756698608398438 }, { "auxiliary_loss_clip": 0.01155649, "auxiliary_loss_mlp": 0.01027104, "balance_loss_clip": 1.05022776, "balance_loss_mlp": 1.01885474, "epoch": 0.3864606505140384, "flos": 22346277039360.0, "grad_norm": 2.2897727708591593, "language_loss": 0.69092166, "learning_rate": 2.808868862595355e-06, "loss": 0.71274924, "num_input_tokens_seen": 69196665, "step": 3214, "time_per_iteration": 2.5003111362457275 }, { "auxiliary_loss_clip": 0.01173008, "auxiliary_loss_mlp": 0.01029896, "balance_loss_clip": 1.05276656, "balance_loss_mlp": 1.02217674, "epoch": 0.38658089340467744, "flos": 25703601448320.0, "grad_norm": 1.794278604247436, "language_loss": 0.79630697, "learning_rate": 2.8081563795012795e-06, "loss": 0.81833601, "num_input_tokens_seen": 69216290, "step": 3215, "time_per_iteration": 2.505335569381714 }, { "auxiliary_loss_clip": 0.01162649, "auxiliary_loss_mlp": 0.01025337, "balance_loss_clip": 1.0503298, "balance_loss_mlp": 1.01706409, "epoch": 0.38670113629531655, "flos": 33802534558080.0, "grad_norm": 1.7089105332158925, "language_loss": 0.73840106, "learning_rate": 2.807443773810524e-06, "loss": 0.76028097, "num_input_tokens_seen": 69237550, "step": 3216, "time_per_iteration": 3.442344903945923 }, { "auxiliary_loss_clip": 0.01140815, "auxiliary_loss_mlp": 0.01029235, "balance_loss_clip": 1.0520426, "balance_loss_mlp": 1.02167737, "epoch": 0.3868213791859556, "flos": 23331522165120.0, "grad_norm": 1.7494564552128187, "language_loss": 0.89491987, "learning_rate": 2.80673104563119e-06, "loss": 0.91662037, "num_input_tokens_seen": 69258175, "step": 3217, "time_per_iteration": 2.5437746047973633 }, { "auxiliary_loss_clip": 0.01167748, "auxiliary_loss_mlp": 0.01026458, "balance_loss_clip": 1.05380428, "balance_loss_mlp": 1.01898932, "epoch": 0.3869416220765947, "flos": 18441530380800.0, "grad_norm": 1.8935414531063413, "language_loss": 0.78779399, "learning_rate": 2.8060181950713976e-06, "loss": 0.80973607, "num_input_tokens_seen": 69274965, "step": 3218, "time_per_iteration": 2.445941925048828 }, { "auxiliary_loss_clip": 0.01140586, "auxiliary_loss_mlp": 0.01030357, "balance_loss_clip": 1.04957235, "balance_loss_mlp": 1.02179217, "epoch": 0.3870618649672338, "flos": 15632992938240.0, "grad_norm": 2.0349587359910974, "language_loss": 0.80779105, "learning_rate": 2.805305222239286e-06, "loss": 0.82950044, "num_input_tokens_seen": 69292220, "step": 3219, "time_per_iteration": 2.5065672397613525 }, { "auxiliary_loss_clip": 0.01152443, "auxiliary_loss_mlp": 0.01032107, "balance_loss_clip": 1.05142903, "balance_loss_mlp": 1.02383995, "epoch": 0.3871821078578729, "flos": 23513804709120.0, "grad_norm": 1.8508080525203485, "language_loss": 0.74011767, "learning_rate": 2.8045921272430118e-06, "loss": 0.76196313, "num_input_tokens_seen": 69311900, "step": 3220, "time_per_iteration": 3.388962984085083 }, { "auxiliary_loss_clip": 0.01177741, "auxiliary_loss_mlp": 0.01029053, "balance_loss_clip": 1.05307615, "balance_loss_mlp": 1.02063656, "epoch": 0.387302350748512, "flos": 17778259791360.0, "grad_norm": 2.8917845533421893, "language_loss": 0.7632134, "learning_rate": 2.803878910190753e-06, "loss": 0.7852813, "num_input_tokens_seen": 69328820, "step": 3221, "time_per_iteration": 2.4334919452667236 }, { "auxiliary_loss_clip": 0.01173599, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.0524317, "balance_loss_mlp": 1.02012825, "epoch": 0.3874225936391511, "flos": 11503409097600.0, "grad_norm": 2.2293242821823527, "language_loss": 0.82540745, "learning_rate": 2.8031655711907017e-06, "loss": 0.84742689, "num_input_tokens_seen": 69342525, "step": 3222, "time_per_iteration": 3.234394073486328 }, { "auxiliary_loss_clip": 0.01176365, "auxiliary_loss_mlp": 0.01028772, "balance_loss_clip": 1.05663884, "balance_loss_mlp": 1.02091599, "epoch": 0.38754283652979016, "flos": 21945154884480.0, "grad_norm": 2.0200332788840996, "language_loss": 0.80665362, "learning_rate": 2.8024521103510723e-06, "loss": 0.82870495, "num_input_tokens_seen": 69359295, "step": 3223, "time_per_iteration": 3.2388243675231934 }, { "auxiliary_loss_clip": 0.01168977, "auxiliary_loss_mlp": 0.01026865, "balance_loss_clip": 1.05062079, "balance_loss_mlp": 1.01933742, "epoch": 0.38766307942042927, "flos": 21175984022400.0, "grad_norm": 1.6623459373167608, "language_loss": 0.74894947, "learning_rate": 2.8017385277800952e-06, "loss": 0.77090788, "num_input_tokens_seen": 69377650, "step": 3224, "time_per_iteration": 2.4779410362243652 }, { "auxiliary_loss_clip": 0.01147191, "auxiliary_loss_mlp": 0.01030838, "balance_loss_clip": 1.05246294, "balance_loss_mlp": 1.02270842, "epoch": 0.3877833223110684, "flos": 27417294391680.0, "grad_norm": 2.1022691550027806, "language_loss": 0.74999893, "learning_rate": 2.8010248235860213e-06, "loss": 0.77177918, "num_input_tokens_seen": 69397765, "step": 3225, "time_per_iteration": 2.5907249450683594 }, { "auxiliary_loss_clip": 0.01064901, "auxiliary_loss_mlp": 0.00753871, "balance_loss_clip": 1.01622009, "balance_loss_mlp": 1.00104749, "epoch": 0.38790356520170743, "flos": 64500019879680.0, "grad_norm": 0.8316356263695989, "language_loss": 0.62737465, "learning_rate": 2.8003109978771192e-06, "loss": 0.64556241, "num_input_tokens_seen": 69458930, "step": 3226, "time_per_iteration": 3.1397085189819336 }, { "auxiliary_loss_clip": 0.01132178, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 1.04446375, "balance_loss_mlp": 1.02046585, "epoch": 0.38802380809234654, "flos": 22345415112960.0, "grad_norm": 1.9025751700006426, "language_loss": 0.78832626, "learning_rate": 2.799597050761674e-06, "loss": 0.80993408, "num_input_tokens_seen": 69475135, "step": 3227, "time_per_iteration": 2.5487070083618164 }, { "auxiliary_loss_clip": 0.01185842, "auxiliary_loss_mlp": 0.0102982, "balance_loss_clip": 1.0546875, "balance_loss_mlp": 1.02154112, "epoch": 0.38814405098298566, "flos": 25261361199360.0, "grad_norm": 2.2978969370769353, "language_loss": 0.78845787, "learning_rate": 2.7988829823479924e-06, "loss": 0.81061447, "num_input_tokens_seen": 69493525, "step": 3228, "time_per_iteration": 2.4963016510009766 }, { "auxiliary_loss_clip": 0.01151832, "auxiliary_loss_mlp": 0.01034917, "balance_loss_clip": 1.04949355, "balance_loss_mlp": 1.02595854, "epoch": 0.3882642938736247, "flos": 18841180078080.0, "grad_norm": 1.8006423740272943, "language_loss": 0.64153314, "learning_rate": 2.7981687927443976e-06, "loss": 0.66340065, "num_input_tokens_seen": 69510325, "step": 3229, "time_per_iteration": 2.518388509750366 }, { "auxiliary_loss_clip": 0.01170466, "auxiliary_loss_mlp": 0.01024373, "balance_loss_clip": 1.05142999, "balance_loss_mlp": 1.01699448, "epoch": 0.3883845367642638, "flos": 21652806090240.0, "grad_norm": 2.1126366535360583, "language_loss": 0.85848439, "learning_rate": 2.797454482059231e-06, "loss": 0.88043278, "num_input_tokens_seen": 69530480, "step": 3230, "time_per_iteration": 2.4896163940429688 }, { "auxiliary_loss_clip": 0.01188312, "auxiliary_loss_mlp": 0.01022747, "balance_loss_clip": 1.05571604, "balance_loss_mlp": 1.0150522, "epoch": 0.3885047796549029, "flos": 20557530627840.0, "grad_norm": 1.6989963090614713, "language_loss": 0.84420633, "learning_rate": 2.7967400504008537e-06, "loss": 0.86631697, "num_input_tokens_seen": 69549780, "step": 3231, "time_per_iteration": 2.447850227355957 }, { "auxiliary_loss_clip": 0.01037035, "auxiliary_loss_mlp": 0.01001894, "balance_loss_clip": 1.01485324, "balance_loss_mlp": 1.00045156, "epoch": 0.388625022545542, "flos": 64325491695360.0, "grad_norm": 0.8022526018366798, "language_loss": 0.57465744, "learning_rate": 2.7960254978776456e-06, "loss": 0.59504676, "num_input_tokens_seen": 69611870, "step": 3232, "time_per_iteration": 3.1105988025665283 }, { "auxiliary_loss_clip": 0.01192168, "auxiliary_loss_mlp": 0.01036197, "balance_loss_clip": 1.05890322, "balance_loss_mlp": 1.02764356, "epoch": 0.3887452654361811, "flos": 18113881495680.0, "grad_norm": 2.205559368003149, "language_loss": 0.81482327, "learning_rate": 2.7953108245980006e-06, "loss": 0.83710694, "num_input_tokens_seen": 69630385, "step": 3233, "time_per_iteration": 2.427133321762085 }, { "auxiliary_loss_clip": 0.01150324, "auxiliary_loss_mlp": 0.01028908, "balance_loss_clip": 1.05208707, "balance_loss_mlp": 1.02127862, "epoch": 0.38886550832682015, "flos": 24975261371520.0, "grad_norm": 1.732846160751345, "language_loss": 0.73512185, "learning_rate": 2.7945960306703365e-06, "loss": 0.75691414, "num_input_tokens_seen": 69653370, "step": 3234, "time_per_iteration": 2.526604652404785 }, { "auxiliary_loss_clip": 0.0117497, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.05309224, "balance_loss_mlp": 1.02005076, "epoch": 0.38898575121745926, "flos": 27199496275200.0, "grad_norm": 1.588899129368604, "language_loss": 0.65811324, "learning_rate": 2.7938811162030865e-06, "loss": 0.6801464, "num_input_tokens_seen": 69673635, "step": 3235, "time_per_iteration": 2.509488105773926 }, { "auxiliary_loss_clip": 0.01170715, "auxiliary_loss_mlp": 0.01032393, "balance_loss_clip": 1.05484247, "balance_loss_mlp": 1.02483487, "epoch": 0.3891059941080984, "flos": 28763728727040.0, "grad_norm": 1.7277976198209395, "language_loss": 0.82470143, "learning_rate": 2.793166081304702e-06, "loss": 0.84673244, "num_input_tokens_seen": 69694130, "step": 3236, "time_per_iteration": 2.5227341651916504 }, { "auxiliary_loss_clip": 0.01147911, "auxiliary_loss_mlp": 0.01031027, "balance_loss_clip": 1.04874611, "balance_loss_mlp": 1.02245021, "epoch": 0.38922623699873743, "flos": 22893447893760.0, "grad_norm": 2.0276868238012424, "language_loss": 0.8219837, "learning_rate": 2.7924509260836543e-06, "loss": 0.84377313, "num_input_tokens_seen": 69713255, "step": 3237, "time_per_iteration": 2.543348550796509 }, { "auxiliary_loss_clip": 0.0114166, "auxiliary_loss_mlp": 0.01025844, "balance_loss_clip": 1.04836845, "balance_loss_mlp": 1.01775575, "epoch": 0.38934647988937654, "flos": 19792418002560.0, "grad_norm": 2.050684782806791, "language_loss": 0.68545651, "learning_rate": 2.791735650648431e-06, "loss": 0.70713162, "num_input_tokens_seen": 69732375, "step": 3238, "time_per_iteration": 2.5303473472595215 }, { "auxiliary_loss_clip": 0.01155465, "auxiliary_loss_mlp": 0.01027622, "balance_loss_clip": 1.05193543, "balance_loss_mlp": 1.01992679, "epoch": 0.38946672278001565, "flos": 19202081978880.0, "grad_norm": 1.950539965555811, "language_loss": 0.74656129, "learning_rate": 2.791020255107538e-06, "loss": 0.76839209, "num_input_tokens_seen": 69749745, "step": 3239, "time_per_iteration": 2.480377674102783 }, { "auxiliary_loss_clip": 0.01137207, "auxiliary_loss_mlp": 0.01023849, "balance_loss_clip": 1.04646337, "balance_loss_mlp": 1.01620817, "epoch": 0.3895869656706547, "flos": 24936477661440.0, "grad_norm": 1.5719149804481403, "language_loss": 0.80506647, "learning_rate": 2.7903047395695023e-06, "loss": 0.82667708, "num_input_tokens_seen": 69769645, "step": 3240, "time_per_iteration": 2.578160285949707 }, { "auxiliary_loss_clip": 0.01171727, "auxiliary_loss_mlp": 0.00763544, "balance_loss_clip": 1.05551887, "balance_loss_mlp": 1.00050211, "epoch": 0.3897072085612938, "flos": 24133622820480.0, "grad_norm": 2.2504149998124725, "language_loss": 0.90364242, "learning_rate": 2.789589104142865e-06, "loss": 0.92299509, "num_input_tokens_seen": 69787270, "step": 3241, "time_per_iteration": 2.501742362976074 }, { "auxiliary_loss_clip": 0.01147688, "auxiliary_loss_mlp": 0.01030536, "balance_loss_clip": 1.05282593, "balance_loss_mlp": 1.02234602, "epoch": 0.3898274514519329, "flos": 17166342672000.0, "grad_norm": 1.636412462729543, "language_loss": 0.76452243, "learning_rate": 2.7888733489361895e-06, "loss": 0.78630471, "num_input_tokens_seen": 69805685, "step": 3242, "time_per_iteration": 3.352848768234253 }, { "auxiliary_loss_clip": 0.01082728, "auxiliary_loss_mlp": 0.0100187, "balance_loss_clip": 1.01639056, "balance_loss_mlp": 1.0006541, "epoch": 0.389947694342572, "flos": 66074807952000.0, "grad_norm": 0.7385728826113218, "language_loss": 0.58735919, "learning_rate": 2.788157474058054e-06, "loss": 0.6082052, "num_input_tokens_seen": 69867960, "step": 3243, "time_per_iteration": 3.1290664672851562 }, { "auxiliary_loss_clip": 0.01183456, "auxiliary_loss_mlp": 0.01025089, "balance_loss_clip": 1.05488944, "balance_loss_mlp": 1.01723337, "epoch": 0.3900679372332111, "flos": 25740912700800.0, "grad_norm": 2.033501168117817, "language_loss": 0.69751221, "learning_rate": 2.7874414796170555e-06, "loss": 0.7195977, "num_input_tokens_seen": 69889450, "step": 3244, "time_per_iteration": 2.499852180480957 }, { "auxiliary_loss_clip": 0.0116801, "auxiliary_loss_mlp": 0.01030488, "balance_loss_clip": 1.0513339, "balance_loss_mlp": 1.02149379, "epoch": 0.3901881801238502, "flos": 11801611808640.0, "grad_norm": 2.7595706859646376, "language_loss": 0.83891404, "learning_rate": 2.7867253657218113e-06, "loss": 0.86089897, "num_input_tokens_seen": 69903340, "step": 3245, "time_per_iteration": 2.443608045578003 }, { "auxiliary_loss_clip": 0.01153751, "auxiliary_loss_mlp": 0.0076292, "balance_loss_clip": 1.04865181, "balance_loss_mlp": 1.00037992, "epoch": 0.39030842301448926, "flos": 27308951994240.0, "grad_norm": 1.6194609997213272, "language_loss": 0.73074603, "learning_rate": 2.7860091324809544e-06, "loss": 0.74991274, "num_input_tokens_seen": 69924400, "step": 3246, "time_per_iteration": 3.4300479888916016 }, { "auxiliary_loss_clip": 0.01169921, "auxiliary_loss_mlp": 0.01024381, "balance_loss_clip": 1.05628026, "balance_loss_mlp": 1.01642418, "epoch": 0.39042866590512837, "flos": 27163334257920.0, "grad_norm": 1.6558130652737997, "language_loss": 0.81046462, "learning_rate": 2.7852927800031377e-06, "loss": 0.83240759, "num_input_tokens_seen": 69944565, "step": 3247, "time_per_iteration": 2.5320394039154053 }, { "auxiliary_loss_clip": 0.01160907, "auxiliary_loss_mlp": 0.01026287, "balance_loss_clip": 1.05256021, "balance_loss_mlp": 1.0188725, "epoch": 0.3905489087957674, "flos": 29716115886720.0, "grad_norm": 1.6210112064440096, "language_loss": 0.82835734, "learning_rate": 2.7845763083970298e-06, "loss": 0.85022926, "num_input_tokens_seen": 69964965, "step": 3248, "time_per_iteration": 2.5642549991607666 }, { "auxiliary_loss_clip": 0.01162749, "auxiliary_loss_mlp": 0.01026318, "balance_loss_clip": 1.05101204, "balance_loss_mlp": 1.01803303, "epoch": 0.39066915168640653, "flos": 24498618871680.0, "grad_norm": 1.7764446345116056, "language_loss": 0.81763041, "learning_rate": 2.7838597177713205e-06, "loss": 0.83952105, "num_input_tokens_seen": 69986055, "step": 3249, "time_per_iteration": 3.9981529712677 }, { "auxiliary_loss_clip": 0.01103956, "auxiliary_loss_mlp": 0.01033713, "balance_loss_clip": 1.04911184, "balance_loss_mlp": 1.02456367, "epoch": 0.39078939457704565, "flos": 20558572122240.0, "grad_norm": 1.6756429597870666, "language_loss": 0.73753262, "learning_rate": 2.7831430082347143e-06, "loss": 0.75890934, "num_input_tokens_seen": 70005260, "step": 3250, "time_per_iteration": 2.58373761177063 }, { "auxiliary_loss_clip": 0.0117313, "auxiliary_loss_mlp": 0.0076195, "balance_loss_clip": 1.05479407, "balance_loss_mlp": 1.0004319, "epoch": 0.3909096374676847, "flos": 22783417557120.0, "grad_norm": 2.2673661345969656, "language_loss": 0.82241976, "learning_rate": 2.7824261798959373e-06, "loss": 0.84177053, "num_input_tokens_seen": 70023440, "step": 3251, "time_per_iteration": 2.4760334491729736 }, { "auxiliary_loss_clip": 0.01159385, "auxiliary_loss_mlp": 0.01031265, "balance_loss_clip": 1.04956758, "balance_loss_mlp": 1.02312326, "epoch": 0.3910298803583238, "flos": 23003119094400.0, "grad_norm": 3.302233109838475, "language_loss": 0.79711246, "learning_rate": 2.78170923286373e-06, "loss": 0.81901896, "num_input_tokens_seen": 70043040, "step": 3252, "time_per_iteration": 2.503570795059204 }, { "auxiliary_loss_clip": 0.01095594, "auxiliary_loss_mlp": 0.01029322, "balance_loss_clip": 1.04935503, "balance_loss_mlp": 1.02100134, "epoch": 0.3911501232489629, "flos": 24316264500480.0, "grad_norm": 2.524476181282204, "language_loss": 0.84054238, "learning_rate": 2.780992167246854e-06, "loss": 0.86179155, "num_input_tokens_seen": 70060565, "step": 3253, "time_per_iteration": 2.6338977813720703 }, { "auxiliary_loss_clip": 0.01063074, "auxiliary_loss_mlp": 0.01000957, "balance_loss_clip": 1.01545167, "balance_loss_mlp": 0.99963409, "epoch": 0.391270366139602, "flos": 60869054684160.0, "grad_norm": 0.9770640126092774, "language_loss": 0.7217567, "learning_rate": 2.7802749831540883e-06, "loss": 0.74239695, "num_input_tokens_seen": 70119465, "step": 3254, "time_per_iteration": 3.1400845050811768 }, { "auxiliary_loss_clip": 0.01135714, "auxiliary_loss_mlp": 0.01024722, "balance_loss_clip": 1.05231929, "balance_loss_mlp": 1.01810563, "epoch": 0.3913906090302411, "flos": 21543494025600.0, "grad_norm": 1.8329094666211643, "language_loss": 0.81702727, "learning_rate": 2.7795576806942268e-06, "loss": 0.83863163, "num_input_tokens_seen": 70138270, "step": 3255, "time_per_iteration": 2.5657639503479004 }, { "auxiliary_loss_clip": 0.01065919, "auxiliary_loss_mlp": 0.01015118, "balance_loss_clip": 1.0305686, "balance_loss_mlp": 1.01323438, "epoch": 0.3915108519208802, "flos": 49839953702400.0, "grad_norm": 0.7636547175130955, "language_loss": 0.54911411, "learning_rate": 2.778840259976085e-06, "loss": 0.56992459, "num_input_tokens_seen": 70193500, "step": 3256, "time_per_iteration": 3.0667998790740967 }, { "auxiliary_loss_clip": 0.01172861, "auxiliary_loss_mlp": 0.01029573, "balance_loss_clip": 1.05391896, "balance_loss_mlp": 1.02144861, "epoch": 0.39163109481151925, "flos": 16506447960960.0, "grad_norm": 2.222199025902046, "language_loss": 0.76880467, "learning_rate": 2.778122721108495e-06, "loss": 0.79082906, "num_input_tokens_seen": 70211730, "step": 3257, "time_per_iteration": 2.4855129718780518 }, { "auxiliary_loss_clip": 0.01169594, "auxiliary_loss_mlp": 0.01025829, "balance_loss_clip": 1.05525732, "balance_loss_mlp": 1.01809812, "epoch": 0.39175133770215836, "flos": 26067484177920.0, "grad_norm": 1.864806882877144, "language_loss": 0.88906801, "learning_rate": 2.7774050642003076e-06, "loss": 0.91102219, "num_input_tokens_seen": 70232540, "step": 3258, "time_per_iteration": 2.519649028778076 }, { "auxiliary_loss_clip": 0.01189462, "auxiliary_loss_mlp": 0.01031847, "balance_loss_clip": 1.05702865, "balance_loss_mlp": 1.02312708, "epoch": 0.3918715805927975, "flos": 21872076664320.0, "grad_norm": 2.824304961261037, "language_loss": 0.93794262, "learning_rate": 2.7766872893603896e-06, "loss": 0.96015573, "num_input_tokens_seen": 70252515, "step": 3259, "time_per_iteration": 2.465919017791748 }, { "auxiliary_loss_clip": 0.01170958, "auxiliary_loss_mlp": 0.01034205, "balance_loss_clip": 1.05340433, "balance_loss_mlp": 1.02673042, "epoch": 0.39199182348343653, "flos": 20376181837440.0, "grad_norm": 1.8652807224471422, "language_loss": 0.72857916, "learning_rate": 2.7759693966976275e-06, "loss": 0.7506308, "num_input_tokens_seen": 70271020, "step": 3260, "time_per_iteration": 2.4618334770202637 }, { "auxiliary_loss_clip": 0.01139869, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.04971039, "balance_loss_mlp": 1.02138758, "epoch": 0.39211206637407564, "flos": 21683545153920.0, "grad_norm": 1.8205055729924071, "language_loss": 0.85115933, "learning_rate": 2.7752513863209242e-06, "loss": 0.87285715, "num_input_tokens_seen": 70289600, "step": 3261, "time_per_iteration": 2.5293543338775635 }, { "auxiliary_loss_clip": 0.01151067, "auxiliary_loss_mlp": 0.00762549, "balance_loss_clip": 1.05329037, "balance_loss_mlp": 1.00041938, "epoch": 0.39223230926471475, "flos": 21066276908160.0, "grad_norm": 1.8292048440569295, "language_loss": 0.84580135, "learning_rate": 2.774533258339203e-06, "loss": 0.86493748, "num_input_tokens_seen": 70307060, "step": 3262, "time_per_iteration": 2.505244493484497 }, { "auxiliary_loss_clip": 0.01127517, "auxiliary_loss_mlp": 0.01032921, "balance_loss_clip": 1.04422212, "balance_loss_mlp": 1.02389729, "epoch": 0.3923525521553538, "flos": 17603016312960.0, "grad_norm": 2.1113699509031942, "language_loss": 0.7951684, "learning_rate": 2.7738150128614014e-06, "loss": 0.81677282, "num_input_tokens_seen": 70324465, "step": 3263, "time_per_iteration": 2.55241322517395 }, { "auxiliary_loss_clip": 0.01136683, "auxiliary_loss_mlp": 0.01032606, "balance_loss_clip": 1.05254436, "balance_loss_mlp": 1.02444577, "epoch": 0.3924727950459929, "flos": 20558284813440.0, "grad_norm": 1.6557518437488754, "language_loss": 0.89292258, "learning_rate": 2.7730966499964777e-06, "loss": 0.91461545, "num_input_tokens_seen": 70341415, "step": 3264, "time_per_iteration": 2.530471086502075 }, { "auxiliary_loss_clip": 0.01189101, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.05687809, "balance_loss_mlp": 1.02475333, "epoch": 0.39259303793663197, "flos": 16216110328320.0, "grad_norm": 2.5280735980282953, "language_loss": 0.80623281, "learning_rate": 2.772378169853408e-06, "loss": 0.82845283, "num_input_tokens_seen": 70358985, "step": 3265, "time_per_iteration": 2.406672716140747 }, { "auxiliary_loss_clip": 0.01143242, "auxiliary_loss_mlp": 0.01029589, "balance_loss_clip": 1.05250764, "balance_loss_mlp": 1.02130437, "epoch": 0.3927132808272711, "flos": 16797001075200.0, "grad_norm": 1.6516614693086602, "language_loss": 0.74094671, "learning_rate": 2.771659572541183e-06, "loss": 0.76267505, "num_input_tokens_seen": 70376915, "step": 3266, "time_per_iteration": 2.5126864910125732 }, { "auxiliary_loss_clip": 0.01176372, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 1.05828738, "balance_loss_mlp": 1.01848209, "epoch": 0.3928335237179102, "flos": 20267228908800.0, "grad_norm": 3.2506458650837264, "language_loss": 0.86854154, "learning_rate": 2.7709408581688143e-06, "loss": 0.89057237, "num_input_tokens_seen": 70396900, "step": 3267, "time_per_iteration": 2.467857599258423 }, { "auxiliary_loss_clip": 0.01153162, "auxiliary_loss_mlp": 0.01032149, "balance_loss_clip": 1.05427825, "balance_loss_mlp": 1.02444792, "epoch": 0.39295376660854925, "flos": 24973250209920.0, "grad_norm": 2.6326070050291, "language_loss": 0.8742345, "learning_rate": 2.7702220268453307e-06, "loss": 0.89608765, "num_input_tokens_seen": 70417260, "step": 3268, "time_per_iteration": 2.5738706588745117 }, { "auxiliary_loss_clip": 0.01158392, "auxiliary_loss_mlp": 0.01030681, "balance_loss_clip": 1.05179596, "balance_loss_mlp": 1.02229166, "epoch": 0.39307400949918836, "flos": 18697788984960.0, "grad_norm": 1.9635488571225173, "language_loss": 0.85025442, "learning_rate": 2.7695030786797785e-06, "loss": 0.87214512, "num_input_tokens_seen": 70433155, "step": 3269, "time_per_iteration": 3.3012287616729736 }, { "auxiliary_loss_clip": 0.01123626, "auxiliary_loss_mlp": 0.0102671, "balance_loss_clip": 1.04863238, "balance_loss_mlp": 1.01891422, "epoch": 0.39319425238982747, "flos": 22415476590720.0, "grad_norm": 2.439163504877315, "language_loss": 0.74610567, "learning_rate": 2.7687840137812206e-06, "loss": 0.76760912, "num_input_tokens_seen": 70451240, "step": 3270, "time_per_iteration": 2.560053825378418 }, { "auxiliary_loss_clip": 0.01066677, "auxiliary_loss_mlp": 0.01009795, "balance_loss_clip": 1.01438951, "balance_loss_mlp": 1.00873423, "epoch": 0.3933144952804665, "flos": 66192954762240.0, "grad_norm": 0.8055337839960424, "language_loss": 0.62112021, "learning_rate": 2.7680648322587395e-06, "loss": 0.64188492, "num_input_tokens_seen": 70516115, "step": 3271, "time_per_iteration": 3.0788683891296387 }, { "auxiliary_loss_clip": 0.01184828, "auxiliary_loss_mlp": 0.01025247, "balance_loss_clip": 1.05506277, "balance_loss_mlp": 1.01798701, "epoch": 0.39343473817110564, "flos": 15487159720320.0, "grad_norm": 2.014715414844573, "language_loss": 0.81232679, "learning_rate": 2.7673455342214334e-06, "loss": 0.83442754, "num_input_tokens_seen": 70533105, "step": 3272, "time_per_iteration": 3.252774238586426 }, { "auxiliary_loss_clip": 0.01171341, "auxiliary_loss_mlp": 0.01029245, "balance_loss_clip": 1.05465829, "balance_loss_mlp": 1.021842, "epoch": 0.39355498106174475, "flos": 21324905809920.0, "grad_norm": 1.7149714659231003, "language_loss": 0.76088572, "learning_rate": 2.7666261197784198e-06, "loss": 0.78289151, "num_input_tokens_seen": 70551920, "step": 3273, "time_per_iteration": 2.4684314727783203 }, { "auxiliary_loss_clip": 0.01154392, "auxiliary_loss_mlp": 0.01027443, "balance_loss_clip": 1.05688882, "balance_loss_mlp": 1.01951253, "epoch": 0.3936752239523838, "flos": 13296357400320.0, "grad_norm": 1.8535559958076178, "language_loss": 0.76550823, "learning_rate": 2.7659065890388336e-06, "loss": 0.78732657, "num_input_tokens_seen": 70567920, "step": 3274, "time_per_iteration": 2.4735236167907715 }, { "auxiliary_loss_clip": 0.0116069, "auxiliary_loss_mlp": 0.01032118, "balance_loss_clip": 1.05290723, "balance_loss_mlp": 1.02391624, "epoch": 0.3937954668430229, "flos": 16800161472000.0, "grad_norm": 1.8541142995945012, "language_loss": 0.84696734, "learning_rate": 2.7651869421118266e-06, "loss": 0.86889541, "num_input_tokens_seen": 70584530, "step": 3275, "time_per_iteration": 2.4858474731445312 }, { "auxiliary_loss_clip": 0.01177781, "auxiliary_loss_mlp": 0.01030098, "balance_loss_clip": 1.05752325, "balance_loss_mlp": 1.02223027, "epoch": 0.393915709733662, "flos": 21064229832960.0, "grad_norm": 1.8418147506381601, "language_loss": 0.82987547, "learning_rate": 2.76446717910657e-06, "loss": 0.85195422, "num_input_tokens_seen": 70605235, "step": 3276, "time_per_iteration": 4.063091516494751 }, { "auxiliary_loss_clip": 0.01169298, "auxiliary_loss_mlp": 0.01028292, "balance_loss_clip": 1.05397987, "balance_loss_mlp": 1.0208652, "epoch": 0.3940359526243011, "flos": 17165265264000.0, "grad_norm": 2.278352460426256, "language_loss": 0.76940465, "learning_rate": 2.763747300132249e-06, "loss": 0.79138058, "num_input_tokens_seen": 70622675, "step": 3277, "time_per_iteration": 2.530925750732422 }, { "auxiliary_loss_clip": 0.01186039, "auxiliary_loss_mlp": 0.01027329, "balance_loss_clip": 1.05651355, "balance_loss_mlp": 1.01966929, "epoch": 0.3941561955149402, "flos": 20995856294400.0, "grad_norm": 2.4553229721494967, "language_loss": 0.86797357, "learning_rate": 2.7630273052980704e-06, "loss": 0.89010721, "num_input_tokens_seen": 70643265, "step": 3278, "time_per_iteration": 2.4568777084350586 }, { "auxiliary_loss_clip": 0.01147155, "auxiliary_loss_mlp": 0.01028119, "balance_loss_clip": 1.05091381, "balance_loss_mlp": 1.02043939, "epoch": 0.39427643840557924, "flos": 18843406721280.0, "grad_norm": 2.1139731394065517, "language_loss": 0.67189413, "learning_rate": 2.7623071947132554e-06, "loss": 0.69364685, "num_input_tokens_seen": 70660295, "step": 3279, "time_per_iteration": 2.4969375133514404 }, { "auxiliary_loss_clip": 0.0116263, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.0519774, "balance_loss_mlp": 1.02210319, "epoch": 0.39439668129621835, "flos": 23258659426560.0, "grad_norm": 2.0309250766719122, "language_loss": 0.78960013, "learning_rate": 2.7615869684870458e-06, "loss": 0.81152523, "num_input_tokens_seen": 70679605, "step": 3280, "time_per_iteration": 2.5203399658203125 }, { "auxiliary_loss_clip": 0.0116898, "auxiliary_loss_mlp": 0.01025797, "balance_loss_clip": 1.05459476, "balance_loss_mlp": 1.01813805, "epoch": 0.39451692418685746, "flos": 26652289507200.0, "grad_norm": 1.6496987635799056, "language_loss": 0.84804529, "learning_rate": 2.7608666267286986e-06, "loss": 0.86999303, "num_input_tokens_seen": 70699835, "step": 3281, "time_per_iteration": 2.522454023361206 }, { "auxiliary_loss_clip": 0.01110992, "auxiliary_loss_mlp": 0.01028882, "balance_loss_clip": 1.04392552, "balance_loss_mlp": 1.02028084, "epoch": 0.3946371670774965, "flos": 18258709132800.0, "grad_norm": 2.1428145301837414, "language_loss": 0.8665821, "learning_rate": 2.760146169547489e-06, "loss": 0.88798082, "num_input_tokens_seen": 70716600, "step": 3282, "time_per_iteration": 2.5824480056762695 }, { "auxiliary_loss_clip": 0.01159153, "auxiliary_loss_mlp": 0.01029247, "balance_loss_clip": 1.056229, "balance_loss_mlp": 1.0214746, "epoch": 0.39475740996813563, "flos": 24206126423040.0, "grad_norm": 1.4804869744378997, "language_loss": 0.76509339, "learning_rate": 2.75942559705271e-06, "loss": 0.78697741, "num_input_tokens_seen": 70736335, "step": 3283, "time_per_iteration": 2.5750155448913574 }, { "auxiliary_loss_clip": 0.01168846, "auxiliary_loss_mlp": 0.01031608, "balance_loss_clip": 1.05418956, "balance_loss_mlp": 1.0238657, "epoch": 0.39487765285877474, "flos": 19317858491520.0, "grad_norm": 1.885595811139525, "language_loss": 0.89396888, "learning_rate": 2.7587049093536713e-06, "loss": 0.91597342, "num_input_tokens_seen": 70752665, "step": 3284, "time_per_iteration": 2.457979679107666 }, { "auxiliary_loss_clip": 0.01175037, "auxiliary_loss_mlp": 0.01033707, "balance_loss_clip": 1.05383873, "balance_loss_mlp": 1.02561891, "epoch": 0.3949978957494138, "flos": 17311744926720.0, "grad_norm": 1.828702407432217, "language_loss": 0.80783391, "learning_rate": 2.757984106559701e-06, "loss": 0.82992136, "num_input_tokens_seen": 70771650, "step": 3285, "time_per_iteration": 2.451009750366211 }, { "auxiliary_loss_clip": 0.01150458, "auxiliary_loss_mlp": 0.01029947, "balance_loss_clip": 1.05323124, "balance_loss_mlp": 1.02134013, "epoch": 0.3951181386400529, "flos": 36317861280000.0, "grad_norm": 2.131549892746281, "language_loss": 0.71350396, "learning_rate": 2.7572631887801446e-06, "loss": 0.73530793, "num_input_tokens_seen": 70793275, "step": 3286, "time_per_iteration": 2.6279146671295166 }, { "auxiliary_loss_clip": 0.01173297, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.05439973, "balance_loss_mlp": 1.02444983, "epoch": 0.395238381530692, "flos": 23110348170240.0, "grad_norm": 1.736145692358794, "language_loss": 0.76463598, "learning_rate": 2.7565421561243654e-06, "loss": 0.78669786, "num_input_tokens_seen": 70811440, "step": 3287, "time_per_iteration": 2.483351230621338 }, { "auxiliary_loss_clip": 0.01135227, "auxiliary_loss_mlp": 0.01027277, "balance_loss_clip": 1.04888797, "balance_loss_mlp": 1.01925969, "epoch": 0.3953586244213311, "flos": 24347614095360.0, "grad_norm": 2.104480105140653, "language_loss": 0.82103282, "learning_rate": 2.7558210087017413e-06, "loss": 0.8426578, "num_input_tokens_seen": 70831375, "step": 3288, "time_per_iteration": 2.595792531967163 }, { "auxiliary_loss_clip": 0.01140659, "auxiliary_loss_mlp": 0.01029322, "balance_loss_clip": 1.05560422, "balance_loss_mlp": 1.02059579, "epoch": 0.3954788673119702, "flos": 23440080044160.0, "grad_norm": 1.7170317189933366, "language_loss": 0.73595458, "learning_rate": 2.7550997466216724e-06, "loss": 0.75765437, "num_input_tokens_seen": 70849170, "step": 3289, "time_per_iteration": 2.534391403198242 }, { "auxiliary_loss_clip": 0.01156417, "auxiliary_loss_mlp": 0.01030762, "balance_loss_clip": 1.05793762, "balance_loss_mlp": 1.02278113, "epoch": 0.3955991102026093, "flos": 17494063384320.0, "grad_norm": 1.795017574319635, "language_loss": 0.81494516, "learning_rate": 2.7543783699935714e-06, "loss": 0.83681697, "num_input_tokens_seen": 70867200, "step": 3290, "time_per_iteration": 2.4821085929870605 }, { "auxiliary_loss_clip": 0.0117174, "auxiliary_loss_mlp": 0.01029309, "balance_loss_clip": 1.05730796, "balance_loss_mlp": 1.02088094, "epoch": 0.39571935309324835, "flos": 18221326053120.0, "grad_norm": 3.3685546131809874, "language_loss": 0.86355239, "learning_rate": 2.753656878926872e-06, "loss": 0.8855629, "num_input_tokens_seen": 70883080, "step": 3291, "time_per_iteration": 2.444225549697876 }, { "auxiliary_loss_clip": 0.01145082, "auxiliary_loss_mlp": 0.01028411, "balance_loss_clip": 1.04885352, "balance_loss_mlp": 1.01986361, "epoch": 0.39583959598388746, "flos": 17748813617280.0, "grad_norm": 1.757528766332802, "language_loss": 0.74279284, "learning_rate": 2.752935273531023e-06, "loss": 0.76452774, "num_input_tokens_seen": 70901230, "step": 3292, "time_per_iteration": 2.4704136848449707 }, { "auxiliary_loss_clip": 0.01172147, "auxiliary_loss_mlp": 0.01026466, "balance_loss_clip": 1.05391765, "balance_loss_mlp": 1.01784158, "epoch": 0.39595983887452657, "flos": 19352368483200.0, "grad_norm": 1.9902570651557974, "language_loss": 0.78126264, "learning_rate": 2.752213553915492e-06, "loss": 0.80324876, "num_input_tokens_seen": 70919585, "step": 3293, "time_per_iteration": 2.4594690799713135 }, { "auxiliary_loss_clip": 0.01060207, "auxiliary_loss_mlp": 0.01005951, "balance_loss_clip": 1.01696682, "balance_loss_mlp": 1.00484204, "epoch": 0.3960800817651656, "flos": 60682282940160.0, "grad_norm": 0.8258775305275187, "language_loss": 0.66093391, "learning_rate": 2.751491720189762e-06, "loss": 0.6815955, "num_input_tokens_seen": 70977695, "step": 3294, "time_per_iteration": 3.0303730964660645 }, { "auxiliary_loss_clip": 0.01155984, "auxiliary_loss_mlp": 0.00762895, "balance_loss_clip": 1.05299711, "balance_loss_mlp": 1.00045896, "epoch": 0.39620032465580474, "flos": 16836718538880.0, "grad_norm": 2.099570115804247, "language_loss": 0.91771901, "learning_rate": 2.7507697724633364e-06, "loss": 0.93690777, "num_input_tokens_seen": 70994455, "step": 3295, "time_per_iteration": 3.3124825954437256 }, { "auxiliary_loss_clip": 0.01050161, "auxiliary_loss_mlp": 0.01019746, "balance_loss_clip": 1.02417922, "balance_loss_mlp": 1.01763618, "epoch": 0.3963205675464438, "flos": 69071445941760.0, "grad_norm": 0.779361968435171, "language_loss": 0.54674315, "learning_rate": 2.7500477108457327e-06, "loss": 0.56744224, "num_input_tokens_seen": 71046465, "step": 3296, "time_per_iteration": 2.927067279815674 }, { "auxiliary_loss_clip": 0.0112561, "auxiliary_loss_mlp": 0.01029293, "balance_loss_clip": 1.04574406, "balance_loss_mlp": 1.02032816, "epoch": 0.3964408104370829, "flos": 25667439431040.0, "grad_norm": 1.8923014506886564, "language_loss": 0.80510932, "learning_rate": 2.7493255354464877e-06, "loss": 0.82665831, "num_input_tokens_seen": 71064275, "step": 3297, "time_per_iteration": 2.58343243598938 }, { "auxiliary_loss_clip": 0.01099265, "auxiliary_loss_mlp": 0.0102735, "balance_loss_clip": 1.04591966, "balance_loss_mlp": 1.01944637, "epoch": 0.396561053327722, "flos": 24277480790400.0, "grad_norm": 2.2940253043828736, "language_loss": 0.76256281, "learning_rate": 2.748603246375156e-06, "loss": 0.78382897, "num_input_tokens_seen": 71082290, "step": 3298, "time_per_iteration": 2.6529412269592285 }, { "auxiliary_loss_clip": 0.01185318, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 1.05652261, "balance_loss_mlp": 1.02679586, "epoch": 0.39668129621836107, "flos": 20522302364160.0, "grad_norm": 2.2860200728620304, "language_loss": 0.70224375, "learning_rate": 2.7478808437413055e-06, "loss": 0.72444689, "num_input_tokens_seen": 71101700, "step": 3299, "time_per_iteration": 3.29171085357666 }, { "auxiliary_loss_clip": 0.01131616, "auxiliary_loss_mlp": 0.01027461, "balance_loss_clip": 1.05585361, "balance_loss_mlp": 1.01921213, "epoch": 0.3968015391090002, "flos": 27052585649280.0, "grad_norm": 1.8374207464210466, "language_loss": 0.66066754, "learning_rate": 2.7471583276545263e-06, "loss": 0.68225831, "num_input_tokens_seen": 71122360, "step": 3300, "time_per_iteration": 2.6148300170898438 }, { "auxiliary_loss_clip": 0.0115726, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.0520978, "balance_loss_mlp": 1.0227536, "epoch": 0.3969217819996393, "flos": 12531819392640.0, "grad_norm": 2.314079434148456, "language_loss": 0.70782506, "learning_rate": 2.7464356982244224e-06, "loss": 0.72970188, "num_input_tokens_seen": 71140360, "step": 3301, "time_per_iteration": 2.4854118824005127 }, { "auxiliary_loss_clip": 0.01074202, "auxiliary_loss_mlp": 0.01004612, "balance_loss_clip": 1.02616191, "balance_loss_mlp": 1.0026803, "epoch": 0.39704202489027834, "flos": 66241399230720.0, "grad_norm": 0.7736484151504005, "language_loss": 0.61720312, "learning_rate": 2.745712955560617e-06, "loss": 0.63799131, "num_input_tokens_seen": 71196565, "step": 3302, "time_per_iteration": 3.7873544692993164 }, { "auxiliary_loss_clip": 0.01113514, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.04912424, "balance_loss_mlp": 1.02062714, "epoch": 0.39716226778091746, "flos": 16982982720000.0, "grad_norm": 3.027195481383852, "language_loss": 0.77458537, "learning_rate": 2.7449900997727496e-06, "loss": 0.79601711, "num_input_tokens_seen": 71214675, "step": 3303, "time_per_iteration": 3.357720375061035 }, { "auxiliary_loss_clip": 0.01156318, "auxiliary_loss_mlp": 0.01031392, "balance_loss_clip": 1.05588877, "balance_loss_mlp": 1.02373898, "epoch": 0.39728251067155657, "flos": 23477139901440.0, "grad_norm": 2.061190945910885, "language_loss": 0.84094739, "learning_rate": 2.744267130970476e-06, "loss": 0.86282444, "num_input_tokens_seen": 71234400, "step": 3304, "time_per_iteration": 2.509260892868042 }, { "auxiliary_loss_clip": 0.01153412, "auxiliary_loss_mlp": 0.01030891, "balance_loss_clip": 1.05415094, "balance_loss_mlp": 1.02217698, "epoch": 0.3974027535621956, "flos": 20704441253760.0, "grad_norm": 1.738133313676558, "language_loss": 0.77289069, "learning_rate": 2.7435440492634697e-06, "loss": 0.7947337, "num_input_tokens_seen": 71253725, "step": 3305, "time_per_iteration": 2.5091795921325684 }, { "auxiliary_loss_clip": 0.01155957, "auxiliary_loss_mlp": 0.01032191, "balance_loss_clip": 1.05122185, "balance_loss_mlp": 1.02337551, "epoch": 0.39752299645283473, "flos": 21543278544000.0, "grad_norm": 2.6393113719047503, "language_loss": 0.67367053, "learning_rate": 2.7428208547614228e-06, "loss": 0.69555199, "num_input_tokens_seen": 71273220, "step": 3306, "time_per_iteration": 2.504953384399414 }, { "auxiliary_loss_clip": 0.01173746, "auxiliary_loss_mlp": 0.01032724, "balance_loss_clip": 1.05654716, "balance_loss_mlp": 1.02468312, "epoch": 0.39764323934347384, "flos": 19208295031680.0, "grad_norm": 1.9824610939563567, "language_loss": 0.77812493, "learning_rate": 2.742097547574043e-06, "loss": 0.80018967, "num_input_tokens_seen": 71291445, "step": 3307, "time_per_iteration": 2.4473907947540283 }, { "auxiliary_loss_clip": 0.01162014, "auxiliary_loss_mlp": 0.00762786, "balance_loss_clip": 1.05259252, "balance_loss_mlp": 1.00036943, "epoch": 0.3977634822341129, "flos": 20850202644480.0, "grad_norm": 1.9633182423986277, "language_loss": 0.77651799, "learning_rate": 2.7413741278110544e-06, "loss": 0.795766, "num_input_tokens_seen": 71310135, "step": 3308, "time_per_iteration": 2.5101242065429688 }, { "auxiliary_loss_clip": 0.01162292, "auxiliary_loss_mlp": 0.01031036, "balance_loss_clip": 1.05502284, "balance_loss_mlp": 1.02219105, "epoch": 0.397883725124752, "flos": 39786042038400.0, "grad_norm": 2.3258358048190773, "language_loss": 0.68837166, "learning_rate": 2.7406505955822016e-06, "loss": 0.71030498, "num_input_tokens_seen": 71331160, "step": 3309, "time_per_iteration": 2.662421941757202 }, { "auxiliary_loss_clip": 0.01157974, "auxiliary_loss_mlp": 0.01030105, "balance_loss_clip": 1.05229855, "balance_loss_mlp": 1.02196276, "epoch": 0.39800396801539106, "flos": 17379507934080.0, "grad_norm": 4.099665470047776, "language_loss": 0.66162884, "learning_rate": 2.7399269509972415e-06, "loss": 0.68350959, "num_input_tokens_seen": 71345315, "step": 3310, "time_per_iteration": 2.463991403579712 }, { "auxiliary_loss_clip": 0.01148981, "auxiliary_loss_mlp": 0.01029731, "balance_loss_clip": 1.047032, "balance_loss_mlp": 1.02070713, "epoch": 0.3981242109060302, "flos": 19202764337280.0, "grad_norm": 2.4049500361340743, "language_loss": 0.85456431, "learning_rate": 2.7392031941659514e-06, "loss": 0.87635148, "num_input_tokens_seen": 71363160, "step": 3311, "time_per_iteration": 2.4707274436950684 }, { "auxiliary_loss_clip": 0.01158328, "auxiliary_loss_mlp": 0.01038866, "balance_loss_clip": 1.05653465, "balance_loss_mlp": 1.03034306, "epoch": 0.3982444537966693, "flos": 24565124903040.0, "grad_norm": 1.7721481224473024, "language_loss": 0.85882705, "learning_rate": 2.7384793251981244e-06, "loss": 0.88079906, "num_input_tokens_seen": 71382145, "step": 3312, "time_per_iteration": 2.520378589630127 }, { "auxiliary_loss_clip": 0.01177656, "auxiliary_loss_mlp": 0.01030183, "balance_loss_clip": 1.05459666, "balance_loss_mlp": 1.02215445, "epoch": 0.39836469668730834, "flos": 26213856099840.0, "grad_norm": 1.8308992717947343, "language_loss": 0.81013483, "learning_rate": 2.737755344203571e-06, "loss": 0.83221328, "num_input_tokens_seen": 71402095, "step": 3313, "time_per_iteration": 2.517637252807617 }, { "auxiliary_loss_clip": 0.01177489, "auxiliary_loss_mlp": 0.01031432, "balance_loss_clip": 1.05829382, "balance_loss_mlp": 1.02346861, "epoch": 0.39848493957794745, "flos": 27636134002560.0, "grad_norm": 1.7078794558468287, "language_loss": 0.79774928, "learning_rate": 2.7370312512921186e-06, "loss": 0.81983846, "num_input_tokens_seen": 71423875, "step": 3314, "time_per_iteration": 2.5207886695861816 }, { "auxiliary_loss_clip": 0.01160218, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.05208588, "balance_loss_mlp": 1.02687252, "epoch": 0.39860518246858656, "flos": 12239326944000.0, "grad_norm": 2.4720947779905376, "language_loss": 0.76794827, "learning_rate": 2.736307046573611e-06, "loss": 0.78991288, "num_input_tokens_seen": 71439745, "step": 3315, "time_per_iteration": 2.466719150543213 }, { "auxiliary_loss_clip": 0.01184243, "auxiliary_loss_mlp": 0.01025953, "balance_loss_clip": 1.05461872, "balance_loss_mlp": 1.01816869, "epoch": 0.3987254253592256, "flos": 22379135005440.0, "grad_norm": 1.6471847826049513, "language_loss": 0.82011878, "learning_rate": 2.73558273015791e-06, "loss": 0.84222078, "num_input_tokens_seen": 71459575, "step": 3316, "time_per_iteration": 2.444368362426758 }, { "auxiliary_loss_clip": 0.01189895, "auxiliary_loss_mlp": 0.01032375, "balance_loss_clip": 1.05685544, "balance_loss_mlp": 1.02311206, "epoch": 0.3988456682498647, "flos": 23514020190720.0, "grad_norm": 2.686505729113726, "language_loss": 0.70428991, "learning_rate": 2.734858302154894e-06, "loss": 0.72651255, "num_input_tokens_seen": 71481075, "step": 3317, "time_per_iteration": 2.4510557651519775 }, { "auxiliary_loss_clip": 0.01152969, "auxiliary_loss_mlp": 0.01033165, "balance_loss_clip": 1.05282629, "balance_loss_mlp": 1.02468944, "epoch": 0.39896591114050384, "flos": 19208761908480.0, "grad_norm": 2.3979298983238664, "language_loss": 0.76490843, "learning_rate": 2.734133762674457e-06, "loss": 0.78676975, "num_input_tokens_seen": 71500665, "step": 3318, "time_per_iteration": 2.485837697982788 }, { "auxiliary_loss_clip": 0.01159016, "auxiliary_loss_mlp": 0.01031311, "balance_loss_clip": 1.05365241, "balance_loss_mlp": 1.02275157, "epoch": 0.3990861540311429, "flos": 28401031146240.0, "grad_norm": 2.025857031041854, "language_loss": 0.70574152, "learning_rate": 2.7334091118265124e-06, "loss": 0.7276448, "num_input_tokens_seen": 71522560, "step": 3319, "time_per_iteration": 2.5475356578826904 }, { "auxiliary_loss_clip": 0.01073373, "auxiliary_loss_mlp": 0.01011207, "balance_loss_clip": 1.01629364, "balance_loss_mlp": 1.01012802, "epoch": 0.399206396921782, "flos": 61758563086080.0, "grad_norm": 0.6793530427591613, "language_loss": 0.57815641, "learning_rate": 2.732684349720989e-06, "loss": 0.59900218, "num_input_tokens_seen": 71590520, "step": 3320, "time_per_iteration": 3.0509047508239746 }, { "auxiliary_loss_clip": 0.01148158, "auxiliary_loss_mlp": 0.01028459, "balance_loss_clip": 1.05172324, "balance_loss_mlp": 1.02026963, "epoch": 0.3993266398124211, "flos": 28074567409920.0, "grad_norm": 1.7190728990591573, "language_loss": 0.75219119, "learning_rate": 2.7319594764678318e-06, "loss": 0.77395737, "num_input_tokens_seen": 71612620, "step": 3321, "time_per_iteration": 2.583740711212158 }, { "auxiliary_loss_clip": 0.01134205, "auxiliary_loss_mlp": 0.01035025, "balance_loss_clip": 1.0516715, "balance_loss_mlp": 1.02612579, "epoch": 0.39944688270306017, "flos": 23225083188480.0, "grad_norm": 1.7419892730984303, "language_loss": 0.83222747, "learning_rate": 2.7312344921770044e-06, "loss": 0.85391974, "num_input_tokens_seen": 71634320, "step": 3322, "time_per_iteration": 3.4414803981781006 }, { "auxiliary_loss_clip": 0.01156907, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.05087423, "balance_loss_mlp": 1.02412939, "epoch": 0.3995671255936993, "flos": 19390433921280.0, "grad_norm": 2.1001649743543584, "language_loss": 0.78482485, "learning_rate": 2.7305093969584857e-06, "loss": 0.80671394, "num_input_tokens_seen": 71653145, "step": 3323, "time_per_iteration": 2.519064426422119 }, { "auxiliary_loss_clip": 0.01165869, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 1.05294657, "balance_loss_mlp": 1.02350187, "epoch": 0.3996873684843384, "flos": 23842638743040.0, "grad_norm": 2.1214910284084754, "language_loss": 0.80035973, "learning_rate": 2.729784190922272e-06, "loss": 0.8223334, "num_input_tokens_seen": 71674580, "step": 3324, "time_per_iteration": 2.483886957168579 }, { "auxiliary_loss_clip": 0.01062868, "auxiliary_loss_mlp": 0.0100181, "balance_loss_clip": 1.01761556, "balance_loss_mlp": 1.00079083, "epoch": 0.39980761137497745, "flos": 66576877280640.0, "grad_norm": 0.93636568524594, "language_loss": 0.57192075, "learning_rate": 2.729058874178378e-06, "loss": 0.59256756, "num_input_tokens_seen": 71745260, "step": 3325, "time_per_iteration": 3.1218621730804443 }, { "auxiliary_loss_clip": 0.01148908, "auxiliary_loss_mlp": 0.01032106, "balance_loss_clip": 1.05112278, "balance_loss_mlp": 1.02364182, "epoch": 0.39992785426561656, "flos": 28549162834560.0, "grad_norm": 2.227029100420481, "language_loss": 0.69408619, "learning_rate": 2.7283334468368315e-06, "loss": 0.71589637, "num_input_tokens_seen": 71766540, "step": 3326, "time_per_iteration": 3.4614078998565674 }, { "auxiliary_loss_clip": 0.01094289, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.04281664, "balance_loss_mlp": 1.02024627, "epoch": 0.4000480971562556, "flos": 15049408671360.0, "grad_norm": 1.794609623628467, "language_loss": 0.73329663, "learning_rate": 2.72760790900768e-06, "loss": 0.75452864, "num_input_tokens_seen": 71783125, "step": 3327, "time_per_iteration": 2.5859737396240234 }, { "auxiliary_loss_clip": 0.01187686, "auxiliary_loss_mlp": 0.01028176, "balance_loss_clip": 1.05732679, "balance_loss_mlp": 1.02002192, "epoch": 0.4001683400468947, "flos": 23915609222400.0, "grad_norm": 3.023837708122538, "language_loss": 0.78608185, "learning_rate": 2.7268822608009875e-06, "loss": 0.80824047, "num_input_tokens_seen": 71802500, "step": 3328, "time_per_iteration": 2.461385726928711 }, { "auxiliary_loss_clip": 0.01148319, "auxiliary_loss_mlp": 0.01030315, "balance_loss_clip": 1.0515132, "balance_loss_mlp": 1.02224481, "epoch": 0.40028858293753383, "flos": 24352677912960.0, "grad_norm": 2.002312701284234, "language_loss": 0.78241062, "learning_rate": 2.726156502326834e-06, "loss": 0.80419701, "num_input_tokens_seen": 71823800, "step": 3329, "time_per_iteration": 3.4065518379211426 }, { "auxiliary_loss_clip": 0.01037392, "auxiliary_loss_mlp": 0.01006301, "balance_loss_clip": 1.02475178, "balance_loss_mlp": 1.00453663, "epoch": 0.4004088258281729, "flos": 66787025800320.0, "grad_norm": 0.6954955816553068, "language_loss": 0.60248381, "learning_rate": 2.725430633695316e-06, "loss": 0.62292069, "num_input_tokens_seen": 71886880, "step": 3330, "time_per_iteration": 3.9855246543884277 }, { "auxiliary_loss_clip": 0.01081976, "auxiliary_loss_mlp": 0.01001849, "balance_loss_clip": 1.01486754, "balance_loss_mlp": 1.00086582, "epoch": 0.400529068718812, "flos": 58598386473600.0, "grad_norm": 0.8859229209750509, "language_loss": 0.57941341, "learning_rate": 2.7247046550165485e-06, "loss": 0.60025156, "num_input_tokens_seen": 71939005, "step": 3331, "time_per_iteration": 3.096722364425659 }, { "auxiliary_loss_clip": 0.0118897, "auxiliary_loss_mlp": 0.01029622, "balance_loss_clip": 1.05753112, "balance_loss_mlp": 1.02147448, "epoch": 0.4006493116094511, "flos": 25377460934400.0, "grad_norm": 1.405170299905092, "language_loss": 0.76002222, "learning_rate": 2.7239785664006606e-06, "loss": 0.78220809, "num_input_tokens_seen": 71962545, "step": 3332, "time_per_iteration": 2.495569944381714 }, { "auxiliary_loss_clip": 0.01070862, "auxiliary_loss_mlp": 0.01002334, "balance_loss_clip": 1.01329231, "balance_loss_mlp": 1.00127268, "epoch": 0.40076955450009016, "flos": 60280729822080.0, "grad_norm": 0.8899218323031897, "language_loss": 0.6179955, "learning_rate": 2.7232523679578002e-06, "loss": 0.63872755, "num_input_tokens_seen": 72025625, "step": 3333, "time_per_iteration": 3.0788426399230957 }, { "auxiliary_loss_clip": 0.01168915, "auxiliary_loss_mlp": 0.01026054, "balance_loss_clip": 1.05561137, "balance_loss_mlp": 1.01842427, "epoch": 0.4008897973907293, "flos": 16617268396800.0, "grad_norm": 2.2695694963287236, "language_loss": 0.79483086, "learning_rate": 2.7225260597981295e-06, "loss": 0.81678057, "num_input_tokens_seen": 72043330, "step": 3334, "time_per_iteration": 2.439711332321167 }, { "auxiliary_loss_clip": 0.01144179, "auxiliary_loss_mlp": 0.00763604, "balance_loss_clip": 1.05355275, "balance_loss_mlp": 1.0002563, "epoch": 0.4010100402813684, "flos": 15377344865280.0, "grad_norm": 2.8254115072612023, "language_loss": 0.78385192, "learning_rate": 2.721799642031831e-06, "loss": 0.80292976, "num_input_tokens_seen": 72059500, "step": 3335, "time_per_iteration": 2.494485378265381 }, { "auxiliary_loss_clip": 0.01160482, "auxiliary_loss_mlp": 0.01032835, "balance_loss_clip": 1.04908133, "balance_loss_mlp": 1.02450252, "epoch": 0.40113028317200744, "flos": 13298835438720.0, "grad_norm": 1.926169863973025, "language_loss": 0.78056395, "learning_rate": 2.721073114769101e-06, "loss": 0.80249709, "num_input_tokens_seen": 72077175, "step": 3336, "time_per_iteration": 2.4852471351623535 }, { "auxiliary_loss_clip": 0.01138502, "auxiliary_loss_mlp": 0.01032638, "balance_loss_clip": 1.05059814, "balance_loss_mlp": 1.02478826, "epoch": 0.40125052606264655, "flos": 20668027841280.0, "grad_norm": 1.7518925359226891, "language_loss": 0.75076759, "learning_rate": 2.7203464781201523e-06, "loss": 0.772479, "num_input_tokens_seen": 72096490, "step": 3337, "time_per_iteration": 2.537952423095703 }, { "auxiliary_loss_clip": 0.01187696, "auxiliary_loss_mlp": 0.01032416, "balance_loss_clip": 1.05787134, "balance_loss_mlp": 1.02460778, "epoch": 0.40137076895328566, "flos": 24607679541120.0, "grad_norm": 2.6937475281623735, "language_loss": 0.78147501, "learning_rate": 2.719619732195215e-06, "loss": 0.80367613, "num_input_tokens_seen": 72118130, "step": 3338, "time_per_iteration": 2.4856388568878174 }, { "auxiliary_loss_clip": 0.01144691, "auxiliary_loss_mlp": 0.01026656, "balance_loss_clip": 1.05097723, "balance_loss_mlp": 1.01882362, "epoch": 0.4014910118439247, "flos": 24206593299840.0, "grad_norm": 1.5229698028392253, "language_loss": 0.72668022, "learning_rate": 2.7188928771045377e-06, "loss": 0.74839365, "num_input_tokens_seen": 72139450, "step": 3339, "time_per_iteration": 2.5688064098358154 }, { "auxiliary_loss_clip": 0.01138505, "auxiliary_loss_mlp": 0.01028556, "balance_loss_clip": 1.04984426, "balance_loss_mlp": 1.02093291, "epoch": 0.4016112547345638, "flos": 26725080418560.0, "grad_norm": 1.8002474030386153, "language_loss": 0.79796803, "learning_rate": 2.7181659129583815e-06, "loss": 0.81963861, "num_input_tokens_seen": 72159040, "step": 3340, "time_per_iteration": 2.5726680755615234 }, { "auxiliary_loss_clip": 0.0114701, "auxiliary_loss_mlp": 0.0102788, "balance_loss_clip": 1.0453589, "balance_loss_mlp": 1.0197382, "epoch": 0.4017314976252029, "flos": 21288025520640.0, "grad_norm": 1.7388359306240395, "language_loss": 0.75771153, "learning_rate": 2.7174388398670276e-06, "loss": 0.77946043, "num_input_tokens_seen": 72178220, "step": 3341, "time_per_iteration": 2.530214548110962 }, { "auxiliary_loss_clip": 0.01186283, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.05222499, "balance_loss_mlp": 1.02652526, "epoch": 0.401851740515842, "flos": 25484690010240.0, "grad_norm": 2.952699611839034, "language_loss": 0.92222655, "learning_rate": 2.716711657940773e-06, "loss": 0.94443727, "num_input_tokens_seen": 72199230, "step": 3342, "time_per_iteration": 2.481170415878296 }, { "auxiliary_loss_clip": 0.01052325, "auxiliary_loss_mlp": 0.0100261, "balance_loss_clip": 1.01494741, "balance_loss_mlp": 1.0016917, "epoch": 0.4019719834064811, "flos": 55395334978560.0, "grad_norm": 0.8103047138159378, "language_loss": 0.56454337, "learning_rate": 2.7159843672899284e-06, "loss": 0.58509266, "num_input_tokens_seen": 72263430, "step": 3343, "time_per_iteration": 3.2316973209381104 }, { "auxiliary_loss_clip": 0.01173996, "auxiliary_loss_mlp": 0.01031473, "balance_loss_clip": 1.056916, "balance_loss_mlp": 1.02306914, "epoch": 0.40209222629712016, "flos": 18180100218240.0, "grad_norm": 1.9041991478084044, "language_loss": 0.81355417, "learning_rate": 2.715256968024825e-06, "loss": 0.83560884, "num_input_tokens_seen": 72280505, "step": 3344, "time_per_iteration": 2.44474720954895 }, { "auxiliary_loss_clip": 0.0116394, "auxiliary_loss_mlp": 0.01031575, "balance_loss_clip": 1.05348635, "balance_loss_mlp": 1.02326047, "epoch": 0.40221246918775927, "flos": 25961009287680.0, "grad_norm": 1.488175491406286, "language_loss": 0.82278991, "learning_rate": 2.7145294602558083e-06, "loss": 0.84474498, "num_input_tokens_seen": 72301215, "step": 3345, "time_per_iteration": 2.579402208328247 }, { "auxiliary_loss_clip": 0.01172224, "auxiliary_loss_mlp": 0.01026417, "balance_loss_clip": 1.05363774, "balance_loss_mlp": 1.01757193, "epoch": 0.4023327120783984, "flos": 33838912056960.0, "grad_norm": 1.6961621906706548, "language_loss": 0.70874989, "learning_rate": 2.713801844093241e-06, "loss": 0.73073626, "num_input_tokens_seen": 72322365, "step": 3346, "time_per_iteration": 2.5805366039276123 }, { "auxiliary_loss_clip": 0.01172387, "auxiliary_loss_mlp": 0.0103336, "balance_loss_clip": 1.05469322, "balance_loss_mlp": 1.02582586, "epoch": 0.40245295496903744, "flos": 26900252069760.0, "grad_norm": 4.329955084016142, "language_loss": 0.88497341, "learning_rate": 2.7130741196475014e-06, "loss": 0.90703082, "num_input_tokens_seen": 72340495, "step": 3347, "time_per_iteration": 2.5114622116088867 }, { "auxiliary_loss_clip": 0.01163658, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.0569824, "balance_loss_mlp": 1.02243531, "epoch": 0.40257319785967655, "flos": 36902738436480.0, "grad_norm": 1.7347704773764612, "language_loss": 0.79523528, "learning_rate": 2.7123462870289848e-06, "loss": 0.81718373, "num_input_tokens_seen": 72360545, "step": 3348, "time_per_iteration": 3.445404052734375 }, { "auxiliary_loss_clip": 0.01158163, "auxiliary_loss_mlp": 0.01027335, "balance_loss_clip": 1.04989457, "balance_loss_mlp": 1.0188446, "epoch": 0.40269344075031566, "flos": 24353180703360.0, "grad_norm": 1.5570079333617244, "language_loss": 0.81313539, "learning_rate": 2.711618346348102e-06, "loss": 0.83499038, "num_input_tokens_seen": 72381070, "step": 3349, "time_per_iteration": 2.5344419479370117 }, { "auxiliary_loss_clip": 0.01151823, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 1.05239129, "balance_loss_mlp": 1.02533638, "epoch": 0.4028136836409547, "flos": 14389657614720.0, "grad_norm": 1.7204618575155866, "language_loss": 0.63533592, "learning_rate": 2.7108902977152825e-06, "loss": 0.65718997, "num_input_tokens_seen": 72398970, "step": 3350, "time_per_iteration": 2.4805235862731934 }, { "auxiliary_loss_clip": 0.01168873, "auxiliary_loss_mlp": 0.0102978, "balance_loss_clip": 1.05275476, "balance_loss_mlp": 1.02146256, "epoch": 0.4029339265315938, "flos": 26136037284480.0, "grad_norm": 2.2334036188005415, "language_loss": 0.75367528, "learning_rate": 2.7101621412409704e-06, "loss": 0.77566183, "num_input_tokens_seen": 72418455, "step": 3351, "time_per_iteration": 2.525186538696289 }, { "auxiliary_loss_clip": 0.01186413, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.05487633, "balance_loss_mlp": 1.02405632, "epoch": 0.40305416942223293, "flos": 23256325042560.0, "grad_norm": 1.8354931310081248, "language_loss": 0.85628814, "learning_rate": 2.7094338770356256e-06, "loss": 0.8784759, "num_input_tokens_seen": 72437540, "step": 3352, "time_per_iteration": 3.27976393699646 }, { "auxiliary_loss_clip": 0.01154016, "auxiliary_loss_mlp": 0.01030767, "balance_loss_clip": 1.05235922, "balance_loss_mlp": 1.02269602, "epoch": 0.403174412312872, "flos": 27089645506560.0, "grad_norm": 2.629118436396993, "language_loss": 0.63947296, "learning_rate": 2.708705505209726e-06, "loss": 0.66132081, "num_input_tokens_seen": 72458315, "step": 3353, "time_per_iteration": 2.5375888347625732 }, { "auxiliary_loss_clip": 0.01122439, "auxiliary_loss_mlp": 0.01026105, "balance_loss_clip": 1.04585087, "balance_loss_mlp": 1.01825547, "epoch": 0.4032946552035111, "flos": 21756336065280.0, "grad_norm": 1.9407194038834914, "language_loss": 0.91799098, "learning_rate": 2.7079770258737646e-06, "loss": 0.93947643, "num_input_tokens_seen": 72476225, "step": 3354, "time_per_iteration": 2.582780599594116 }, { "auxiliary_loss_clip": 0.01140371, "auxiliary_loss_mlp": 0.01030516, "balance_loss_clip": 1.04916334, "balance_loss_mlp": 1.02134299, "epoch": 0.4034148980941502, "flos": 17343956448000.0, "grad_norm": 3.0027357754337194, "language_loss": 0.75169086, "learning_rate": 2.707248439138251e-06, "loss": 0.77339977, "num_input_tokens_seen": 72492460, "step": 3355, "time_per_iteration": 3.221224069595337 }, { "auxiliary_loss_clip": 0.01156202, "auxiliary_loss_mlp": 0.01031533, "balance_loss_clip": 1.05601764, "balance_loss_mlp": 1.02368355, "epoch": 0.40353514098478926, "flos": 22017838055040.0, "grad_norm": 1.7240781659763247, "language_loss": 0.65557688, "learning_rate": 2.7065197451137114e-06, "loss": 0.67745423, "num_input_tokens_seen": 72513840, "step": 3356, "time_per_iteration": 2.5538177490234375 }, { "auxiliary_loss_clip": 0.01158999, "auxiliary_loss_mlp": 0.01030862, "balance_loss_clip": 1.0540297, "balance_loss_mlp": 1.02254725, "epoch": 0.4036553838754284, "flos": 14246446089600.0, "grad_norm": 2.2061355480707143, "language_loss": 0.67547947, "learning_rate": 2.7057909439106894e-06, "loss": 0.69737804, "num_input_tokens_seen": 72531695, "step": 3357, "time_per_iteration": 3.2442209720611572 }, { "auxiliary_loss_clip": 0.01164787, "auxiliary_loss_mlp": 0.00763371, "balance_loss_clip": 1.05338669, "balance_loss_mlp": 1.00018477, "epoch": 0.40377562676606743, "flos": 24790644443520.0, "grad_norm": 2.709797235465548, "language_loss": 0.78505218, "learning_rate": 2.7050620356397417e-06, "loss": 0.80433381, "num_input_tokens_seen": 72550645, "step": 3358, "time_per_iteration": 2.5643270015716553 }, { "auxiliary_loss_clip": 0.01184855, "auxiliary_loss_mlp": 0.01025171, "balance_loss_clip": 1.05713582, "balance_loss_mlp": 1.01741993, "epoch": 0.40389586965670654, "flos": 24061226958720.0, "grad_norm": 1.7007413928700912, "language_loss": 0.72321671, "learning_rate": 2.7043330204114437e-06, "loss": 0.74531698, "num_input_tokens_seen": 72569355, "step": 3359, "time_per_iteration": 2.469942092895508 }, { "auxiliary_loss_clip": 0.01181486, "auxiliary_loss_mlp": 0.01028704, "balance_loss_clip": 1.05370665, "balance_loss_mlp": 1.02064538, "epoch": 0.40401611254734565, "flos": 16399613934720.0, "grad_norm": 2.0329688455435386, "language_loss": 0.85432428, "learning_rate": 2.7036038983363862e-06, "loss": 0.87642622, "num_input_tokens_seen": 72585960, "step": 3360, "time_per_iteration": 2.3977837562561035 }, { "auxiliary_loss_clip": 0.01168561, "auxiliary_loss_mlp": 0.0102978, "balance_loss_clip": 1.05547059, "balance_loss_mlp": 1.02231503, "epoch": 0.4041363554379847, "flos": 23988220565760.0, "grad_norm": 1.7792993108996924, "language_loss": 0.84526694, "learning_rate": 2.702874669525177e-06, "loss": 0.86725038, "num_input_tokens_seen": 72604440, "step": 3361, "time_per_iteration": 2.482865571975708 }, { "auxiliary_loss_clip": 0.01149164, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 1.05765474, "balance_loss_mlp": 1.0224092, "epoch": 0.4042565983286238, "flos": 28401964899840.0, "grad_norm": 2.085179581183854, "language_loss": 0.69770318, "learning_rate": 2.7021453340884394e-06, "loss": 0.71950603, "num_input_tokens_seen": 72622165, "step": 3362, "time_per_iteration": 2.5826261043548584 }, { "auxiliary_loss_clip": 0.01149949, "auxiliary_loss_mlp": 0.00762972, "balance_loss_clip": 1.05432153, "balance_loss_mlp": 1.00017297, "epoch": 0.40437684121926293, "flos": 17710963660800.0, "grad_norm": 2.270351245482001, "language_loss": 0.73048806, "learning_rate": 2.7014158921368125e-06, "loss": 0.74961734, "num_input_tokens_seen": 72640490, "step": 3363, "time_per_iteration": 2.470259666442871 }, { "auxiliary_loss_clip": 0.01187606, "auxiliary_loss_mlp": 0.01032017, "balance_loss_clip": 1.05740142, "balance_loss_mlp": 1.02351713, "epoch": 0.404497084109902, "flos": 24018959629440.0, "grad_norm": 2.6346032838777114, "language_loss": 0.86016029, "learning_rate": 2.700686343780953e-06, "loss": 0.88235652, "num_input_tokens_seen": 72660360, "step": 3364, "time_per_iteration": 2.4738242626190186 }, { "auxiliary_loss_clip": 0.0115954, "auxiliary_loss_mlp": 0.01028306, "balance_loss_clip": 1.05222845, "balance_loss_mlp": 1.01986027, "epoch": 0.4046173270005411, "flos": 22929861306240.0, "grad_norm": 1.5938177062569232, "language_loss": 0.88310719, "learning_rate": 2.699956689131532e-06, "loss": 0.90498561, "num_input_tokens_seen": 72680345, "step": 3365, "time_per_iteration": 2.5097124576568604 }, { "auxiliary_loss_clip": 0.01162785, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 1.0553143, "balance_loss_mlp": 1.02044165, "epoch": 0.4047375698911802, "flos": 20668135582080.0, "grad_norm": 2.7726547696088293, "language_loss": 0.84795403, "learning_rate": 2.699226928299238e-06, "loss": 0.86987019, "num_input_tokens_seen": 72698365, "step": 3366, "time_per_iteration": 2.494499444961548 }, { "auxiliary_loss_clip": 0.01176626, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.05664659, "balance_loss_mlp": 1.02492237, "epoch": 0.40485781278181926, "flos": 28912865996160.0, "grad_norm": 2.337205935409372, "language_loss": 0.79043329, "learning_rate": 2.698497061394774e-06, "loss": 0.81252658, "num_input_tokens_seen": 72716850, "step": 3367, "time_per_iteration": 2.506425380706787 }, { "auxiliary_loss_clip": 0.01152816, "auxiliary_loss_mlp": 0.00762704, "balance_loss_clip": 1.05440688, "balance_loss_mlp": 1.00016999, "epoch": 0.40497805567245837, "flos": 23148377694720.0, "grad_norm": 3.083649654128188, "language_loss": 0.80556291, "learning_rate": 2.6977670885288627e-06, "loss": 0.82471812, "num_input_tokens_seen": 72738250, "step": 3368, "time_per_iteration": 2.554211139678955 }, { "auxiliary_loss_clip": 0.01147126, "auxiliary_loss_mlp": 0.01032159, "balance_loss_clip": 1.05112576, "balance_loss_mlp": 1.02369559, "epoch": 0.4050982985630975, "flos": 16289404030080.0, "grad_norm": 1.6620764289472865, "language_loss": 0.75343096, "learning_rate": 2.6970370098122378e-06, "loss": 0.77522385, "num_input_tokens_seen": 72755235, "step": 3369, "time_per_iteration": 2.4633595943450928 }, { "auxiliary_loss_clip": 0.01186289, "auxiliary_loss_mlp": 0.01027713, "balance_loss_clip": 1.05559802, "balance_loss_mlp": 1.0200001, "epoch": 0.40521854145373654, "flos": 34459484353920.0, "grad_norm": 1.5045027977382917, "language_loss": 0.86840677, "learning_rate": 2.6963068253556535e-06, "loss": 0.8905468, "num_input_tokens_seen": 72776620, "step": 3370, "time_per_iteration": 2.544238328933716 }, { "auxiliary_loss_clip": 0.01180854, "auxiliary_loss_mlp": 0.01031945, "balance_loss_clip": 1.05515289, "balance_loss_mlp": 1.02268815, "epoch": 0.40533878434437565, "flos": 25331099454720.0, "grad_norm": 2.1361957414435926, "language_loss": 0.85629225, "learning_rate": 2.6955765352698763e-06, "loss": 0.87842023, "num_input_tokens_seen": 72796765, "step": 3371, "time_per_iteration": 2.522141218185425 }, { "auxiliary_loss_clip": 0.01190627, "auxiliary_loss_mlp": 0.01027998, "balance_loss_clip": 1.05681491, "balance_loss_mlp": 1.01903987, "epoch": 0.40545902723501476, "flos": 15012061505280.0, "grad_norm": 2.0548145858599893, "language_loss": 0.73062837, "learning_rate": 2.6948461396656923e-06, "loss": 0.75281459, "num_input_tokens_seen": 72814175, "step": 3372, "time_per_iteration": 2.4054653644561768 }, { "auxiliary_loss_clip": 0.01179153, "auxiliary_loss_mlp": 0.01032196, "balance_loss_clip": 1.05636632, "balance_loss_mlp": 1.02388716, "epoch": 0.4055792701256538, "flos": 25521103422720.0, "grad_norm": 2.447431264885515, "language_loss": 0.74588716, "learning_rate": 2.6941156386539013e-06, "loss": 0.7680006, "num_input_tokens_seen": 72834125, "step": 3373, "time_per_iteration": 2.485348701477051 }, { "auxiliary_loss_clip": 0.01158329, "auxiliary_loss_mlp": 0.01033555, "balance_loss_clip": 1.05734408, "balance_loss_mlp": 1.02516556, "epoch": 0.4056995130162929, "flos": 19574583972480.0, "grad_norm": 2.137650033651343, "language_loss": 0.80775881, "learning_rate": 2.6933850323453203e-06, "loss": 0.82967764, "num_input_tokens_seen": 72852570, "step": 3374, "time_per_iteration": 2.4789068698883057 }, { "auxiliary_loss_clip": 0.01187417, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.05851412, "balance_loss_mlp": 1.01972985, "epoch": 0.405819755906932, "flos": 15413794191360.0, "grad_norm": 2.007796301734701, "language_loss": 0.74573904, "learning_rate": 2.6926543208507806e-06, "loss": 0.76789308, "num_input_tokens_seen": 72871250, "step": 3375, "time_per_iteration": 3.2331912517547607 }, { "auxiliary_loss_clip": 0.01172506, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.05546772, "balance_loss_mlp": 1.01922894, "epoch": 0.4059399987975711, "flos": 21433930565760.0, "grad_norm": 1.9151486571471033, "language_loss": 0.8007406, "learning_rate": 2.6919235042811316e-06, "loss": 0.82274562, "num_input_tokens_seen": 72890035, "step": 3376, "time_per_iteration": 2.4446563720703125 }, { "auxiliary_loss_clip": 0.01143438, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.05294466, "balance_loss_mlp": 1.02448356, "epoch": 0.4060602416882102, "flos": 25556942217600.0, "grad_norm": 3.089289128033694, "language_loss": 0.76336491, "learning_rate": 2.691192582747237e-06, "loss": 0.78513372, "num_input_tokens_seen": 72909665, "step": 3377, "time_per_iteration": 2.5499448776245117 }, { "auxiliary_loss_clip": 0.01191227, "auxiliary_loss_mlp": 0.01025928, "balance_loss_clip": 1.05952299, "balance_loss_mlp": 1.01813221, "epoch": 0.40618048457884925, "flos": 23766759262080.0, "grad_norm": 1.7072330383906646, "language_loss": 0.73954666, "learning_rate": 2.6904615563599765e-06, "loss": 0.76171815, "num_input_tokens_seen": 72929465, "step": 3378, "time_per_iteration": 2.4370408058166504 }, { "auxiliary_loss_clip": 0.01138206, "auxiliary_loss_mlp": 0.01025638, "balance_loss_clip": 1.0499233, "balance_loss_mlp": 1.0176599, "epoch": 0.40630072746948837, "flos": 17639681120640.0, "grad_norm": 2.5686773493979973, "language_loss": 0.83398104, "learning_rate": 2.6897304252302477e-06, "loss": 0.85561949, "num_input_tokens_seen": 72946785, "step": 3379, "time_per_iteration": 3.4193224906921387 }, { "auxiliary_loss_clip": 0.01047831, "auxiliary_loss_mlp": 0.01000571, "balance_loss_clip": 1.01428914, "balance_loss_mlp": 0.99953955, "epoch": 0.4064209703601275, "flos": 60836053063680.0, "grad_norm": 0.7849656510100202, "language_loss": 0.54760706, "learning_rate": 2.688999189468962e-06, "loss": 0.56809115, "num_input_tokens_seen": 73003215, "step": 3380, "time_per_iteration": 2.9865474700927734 }, { "auxiliary_loss_clip": 0.01172775, "auxiliary_loss_mlp": 0.01033578, "balance_loss_clip": 1.0563426, "balance_loss_mlp": 1.02579331, "epoch": 0.40654121325076653, "flos": 24024346669440.0, "grad_norm": 2.5553775502011926, "language_loss": 0.76012647, "learning_rate": 2.6882678491870464e-06, "loss": 0.78218997, "num_input_tokens_seen": 73023650, "step": 3381, "time_per_iteration": 3.308553695678711 }, { "auxiliary_loss_clip": 0.01176911, "auxiliary_loss_mlp": 0.01024662, "balance_loss_clip": 1.05644083, "balance_loss_mlp": 1.01563787, "epoch": 0.40666145614140564, "flos": 27344252085120.0, "grad_norm": 1.6598282134544586, "language_loss": 0.71465707, "learning_rate": 2.6875364044954453e-06, "loss": 0.73667282, "num_input_tokens_seen": 73043880, "step": 3382, "time_per_iteration": 2.5174951553344727 }, { "auxiliary_loss_clip": 0.01153549, "auxiliary_loss_mlp": 0.01027292, "balance_loss_clip": 1.0483793, "balance_loss_mlp": 1.01939476, "epoch": 0.40678169903204475, "flos": 26176724415360.0, "grad_norm": 1.5373131935642868, "language_loss": 0.82119489, "learning_rate": 2.6868048555051185e-06, "loss": 0.84300327, "num_input_tokens_seen": 73065410, "step": 3383, "time_per_iteration": 2.553271770477295 }, { "auxiliary_loss_clip": 0.01164721, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 1.05202174, "balance_loss_mlp": 1.02447939, "epoch": 0.4069019419226838, "flos": 28622420622720.0, "grad_norm": 2.7480694585289087, "language_loss": 0.85636979, "learning_rate": 2.686073202327041e-06, "loss": 0.87834615, "num_input_tokens_seen": 73084410, "step": 3384, "time_per_iteration": 3.3438169956207275 }, { "auxiliary_loss_clip": 0.01147696, "auxiliary_loss_mlp": 0.0103518, "balance_loss_clip": 1.04973865, "balance_loss_mlp": 1.02645969, "epoch": 0.4070221848133229, "flos": 25229006023680.0, "grad_norm": 1.671546420932362, "language_loss": 0.73064798, "learning_rate": 2.6853414450722043e-06, "loss": 0.75247675, "num_input_tokens_seen": 73104075, "step": 3385, "time_per_iteration": 2.544618844985962 }, { "auxiliary_loss_clip": 0.01171394, "auxiliary_loss_mlp": 0.01027367, "balance_loss_clip": 1.05489922, "balance_loss_mlp": 1.01890278, "epoch": 0.40714242770396203, "flos": 18405224709120.0, "grad_norm": 1.7653393830609239, "language_loss": 0.85523063, "learning_rate": 2.684609583851616e-06, "loss": 0.87721825, "num_input_tokens_seen": 73122250, "step": 3386, "time_per_iteration": 2.448624849319458 }, { "auxiliary_loss_clip": 0.01130067, "auxiliary_loss_mlp": 0.01030279, "balance_loss_clip": 1.05005169, "balance_loss_mlp": 1.02222037, "epoch": 0.4072626705946011, "flos": 30228920403840.0, "grad_norm": 1.5110664228785189, "language_loss": 0.80705535, "learning_rate": 2.683877618776297e-06, "loss": 0.82865882, "num_input_tokens_seen": 73144505, "step": 3387, "time_per_iteration": 2.6386239528656006 }, { "auxiliary_loss_clip": 0.01151296, "auxiliary_loss_mlp": 0.01033124, "balance_loss_clip": 1.04902768, "balance_loss_mlp": 1.02413559, "epoch": 0.4073829134852402, "flos": 21834549930240.0, "grad_norm": 2.329699160904908, "language_loss": 0.73962247, "learning_rate": 2.6831455499572876e-06, "loss": 0.76146668, "num_input_tokens_seen": 73162440, "step": 3388, "time_per_iteration": 2.504287004470825 }, { "auxiliary_loss_clip": 0.01189, "auxiliary_loss_mlp": 0.01028595, "balance_loss_clip": 1.05571914, "balance_loss_mlp": 1.02000022, "epoch": 0.40750315637587925, "flos": 25260211964160.0, "grad_norm": 1.7834216739595714, "language_loss": 0.77691239, "learning_rate": 2.682413377505641e-06, "loss": 0.79908836, "num_input_tokens_seen": 73181245, "step": 3389, "time_per_iteration": 2.4684269428253174 }, { "auxiliary_loss_clip": 0.01173233, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.0538801, "balance_loss_mlp": 1.01969528, "epoch": 0.40762339926651836, "flos": 19712767593600.0, "grad_norm": 2.2829060372494556, "language_loss": 0.7656498, "learning_rate": 2.6816811015324284e-06, "loss": 0.78766358, "num_input_tokens_seen": 73199295, "step": 3390, "time_per_iteration": 2.4656755924224854 }, { "auxiliary_loss_clip": 0.01080487, "auxiliary_loss_mlp": 0.01000646, "balance_loss_clip": 1.0143044, "balance_loss_mlp": 0.99968034, "epoch": 0.40774364215715747, "flos": 71449307314560.0, "grad_norm": 0.7269881696067815, "language_loss": 0.56734681, "learning_rate": 2.6809487221487343e-06, "loss": 0.58815813, "num_input_tokens_seen": 73258780, "step": 3391, "time_per_iteration": 2.9244205951690674 }, { "auxiliary_loss_clip": 0.01164119, "auxiliary_loss_mlp": 0.01022943, "balance_loss_clip": 1.05148816, "balance_loss_mlp": 1.01450348, "epoch": 0.4078638850477965, "flos": 15084134144640.0, "grad_norm": 2.7218408774333094, "language_loss": 0.81857866, "learning_rate": 2.6802162394656605e-06, "loss": 0.84044933, "num_input_tokens_seen": 73275490, "step": 3392, "time_per_iteration": 2.4352774620056152 }, { "auxiliary_loss_clip": 0.01153499, "auxiliary_loss_mlp": 0.01030939, "balance_loss_clip": 1.04879951, "balance_loss_mlp": 1.0229224, "epoch": 0.40798412793843564, "flos": 23842890138240.0, "grad_norm": 1.6492507823067226, "language_loss": 0.71605313, "learning_rate": 2.679483653594324e-06, "loss": 0.73789752, "num_input_tokens_seen": 73297260, "step": 3393, "time_per_iteration": 2.553133010864258 }, { "auxiliary_loss_clip": 0.01176084, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.05544233, "balance_loss_mlp": 1.02255297, "epoch": 0.40810437082907475, "flos": 21065774117760.0, "grad_norm": 4.107292579256087, "language_loss": 0.75852883, "learning_rate": 2.678750964645857e-06, "loss": 0.78059822, "num_input_tokens_seen": 73316340, "step": 3394, "time_per_iteration": 2.469615936279297 }, { "auxiliary_loss_clip": 0.0118087, "auxiliary_loss_mlp": 0.01033674, "balance_loss_clip": 1.06152833, "balance_loss_mlp": 1.02521014, "epoch": 0.4082246137197138, "flos": 11321377948800.0, "grad_norm": 2.4602551635508556, "language_loss": 0.8330512, "learning_rate": 2.6780181727314094e-06, "loss": 0.8551966, "num_input_tokens_seen": 73331245, "step": 3395, "time_per_iteration": 2.4695208072662354 }, { "auxiliary_loss_clip": 0.01146659, "auxiliary_loss_mlp": 0.00762777, "balance_loss_clip": 1.05050862, "balance_loss_mlp": 1.00010419, "epoch": 0.4083448566103529, "flos": 19062569554560.0, "grad_norm": 1.888437769656481, "language_loss": 0.78091037, "learning_rate": 2.6772852779621435e-06, "loss": 0.80000478, "num_input_tokens_seen": 73349105, "step": 3396, "time_per_iteration": 2.50990891456604 }, { "auxiliary_loss_clip": 0.01170266, "auxiliary_loss_mlp": 0.00762535, "balance_loss_clip": 1.05872989, "balance_loss_mlp": 1.00008106, "epoch": 0.408465099500992, "flos": 23550254035200.0, "grad_norm": 2.3388074612949405, "language_loss": 0.86841989, "learning_rate": 2.676552280449239e-06, "loss": 0.88774788, "num_input_tokens_seen": 73368990, "step": 3397, "time_per_iteration": 2.494401454925537 }, { "auxiliary_loss_clip": 0.01163707, "auxiliary_loss_mlp": 0.0103192, "balance_loss_clip": 1.05220544, "balance_loss_mlp": 1.02306247, "epoch": 0.4085853423916311, "flos": 12750012558720.0, "grad_norm": 2.2422508009404467, "language_loss": 0.76202452, "learning_rate": 2.6758191803038917e-06, "loss": 0.78398085, "num_input_tokens_seen": 73387485, "step": 3398, "time_per_iteration": 2.436962366104126 }, { "auxiliary_loss_clip": 0.01107744, "auxiliary_loss_mlp": 0.0102908, "balance_loss_clip": 1.04918516, "balance_loss_mlp": 1.02071166, "epoch": 0.4087055852822702, "flos": 24353072962560.0, "grad_norm": 1.6875299277262752, "language_loss": 0.82741207, "learning_rate": 2.6750859776373125e-06, "loss": 0.84878027, "num_input_tokens_seen": 73406940, "step": 3399, "time_per_iteration": 2.617870569229126 }, { "auxiliary_loss_clip": 0.01029972, "auxiliary_loss_mlp": 0.01006581, "balance_loss_clip": 1.01360369, "balance_loss_mlp": 1.00530553, "epoch": 0.4088258281729093, "flos": 66387950720640.0, "grad_norm": 0.7750689225180801, "language_loss": 0.604289, "learning_rate": 2.674352672560727e-06, "loss": 0.62465453, "num_input_tokens_seen": 73468385, "step": 3400, "time_per_iteration": 3.161284923553467 }, { "auxiliary_loss_clip": 0.011438, "auxiliary_loss_mlp": 0.01026294, "balance_loss_clip": 1.05002904, "balance_loss_mlp": 1.01752019, "epoch": 0.40894607106354836, "flos": 20449260057600.0, "grad_norm": 1.896731507224513, "language_loss": 0.76985514, "learning_rate": 2.673619265185377e-06, "loss": 0.79155612, "num_input_tokens_seen": 73488225, "step": 3401, "time_per_iteration": 2.543379306793213 }, { "auxiliary_loss_clip": 0.01175115, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.05433238, "balance_loss_mlp": 1.02643752, "epoch": 0.40906631395418747, "flos": 27053627143680.0, "grad_norm": 1.6220312349977828, "language_loss": 0.78030384, "learning_rate": 2.672885755622521e-06, "loss": 0.80240524, "num_input_tokens_seen": 73510640, "step": 3402, "time_per_iteration": 3.3825464248657227 }, { "auxiliary_loss_clip": 0.01128153, "auxiliary_loss_mlp": 0.01029671, "balance_loss_clip": 1.04652154, "balance_loss_mlp": 1.02126062, "epoch": 0.4091865568448266, "flos": 25484151306240.0, "grad_norm": 2.122783565191288, "language_loss": 0.70295417, "learning_rate": 2.67215214398343e-06, "loss": 0.72453237, "num_input_tokens_seen": 73530655, "step": 3403, "time_per_iteration": 2.606388807296753 }, { "auxiliary_loss_clip": 0.01131998, "auxiliary_loss_mlp": 0.01031105, "balance_loss_clip": 1.04633403, "balance_loss_mlp": 1.02207541, "epoch": 0.40930679973546563, "flos": 28657864368000.0, "grad_norm": 2.4252370637363017, "language_loss": 0.78026772, "learning_rate": 2.671418430379393e-06, "loss": 0.80189872, "num_input_tokens_seen": 73549340, "step": 3404, "time_per_iteration": 2.6188619136810303 }, { "auxiliary_loss_clip": 0.01186454, "auxiliary_loss_mlp": 0.0102691, "balance_loss_clip": 1.05508924, "balance_loss_mlp": 1.01846433, "epoch": 0.40942704262610474, "flos": 20886292834560.0, "grad_norm": 3.0820917807794084, "language_loss": 0.82934201, "learning_rate": 2.670684614921715e-06, "loss": 0.85147572, "num_input_tokens_seen": 73568315, "step": 3405, "time_per_iteration": 2.432753324508667 }, { "auxiliary_loss_clip": 0.01159272, "auxiliary_loss_mlp": 0.01031815, "balance_loss_clip": 1.0510478, "balance_loss_mlp": 1.02314281, "epoch": 0.4095472855167438, "flos": 21618080616960.0, "grad_norm": 3.0741149815124618, "language_loss": 0.69075906, "learning_rate": 2.6699506977217128e-06, "loss": 0.71266997, "num_input_tokens_seen": 73588490, "step": 3406, "time_per_iteration": 3.3827433586120605 }, { "auxiliary_loss_clip": 0.01171514, "auxiliary_loss_mlp": 0.01026027, "balance_loss_clip": 1.05772305, "balance_loss_mlp": 1.01787865, "epoch": 0.4096675284073829, "flos": 27926112499200.0, "grad_norm": 2.0136362758093114, "language_loss": 0.69915766, "learning_rate": 2.6692166788907233e-06, "loss": 0.72113311, "num_input_tokens_seen": 73608685, "step": 3407, "time_per_iteration": 2.5616419315338135 }, { "auxiliary_loss_clip": 0.01159127, "auxiliary_loss_mlp": 0.01031652, "balance_loss_clip": 1.05208826, "balance_loss_mlp": 1.022735, "epoch": 0.409787771298022, "flos": 19206607092480.0, "grad_norm": 1.8887130947309867, "language_loss": 0.76980293, "learning_rate": 2.6684825585400957e-06, "loss": 0.79171073, "num_input_tokens_seen": 73627630, "step": 3408, "time_per_iteration": 3.2694413661956787 }, { "auxiliary_loss_clip": 0.01053915, "auxiliary_loss_mlp": 0.01001098, "balance_loss_clip": 1.01382983, "balance_loss_mlp": 0.99999523, "epoch": 0.4099080141886611, "flos": 59269234832640.0, "grad_norm": 0.8230670138384596, "language_loss": 0.65103191, "learning_rate": 2.6677483367811947e-06, "loss": 0.67158204, "num_input_tokens_seen": 73687670, "step": 3409, "time_per_iteration": 3.1678109169006348 }, { "auxiliary_loss_clip": 0.01173871, "auxiliary_loss_mlp": 0.01025946, "balance_loss_clip": 1.05198944, "balance_loss_mlp": 1.0180124, "epoch": 0.4100282570793002, "flos": 21906443001600.0, "grad_norm": 1.6943569846300655, "language_loss": 0.75456792, "learning_rate": 2.6670140137254028e-06, "loss": 0.77656609, "num_input_tokens_seen": 73707145, "step": 3410, "time_per_iteration": 3.225379705429077 }, { "auxiliary_loss_clip": 0.01125115, "auxiliary_loss_mlp": 0.01027262, "balance_loss_clip": 1.04895771, "balance_loss_mlp": 1.01903653, "epoch": 0.4101484999699393, "flos": 18551596631040.0, "grad_norm": 2.175898670822533, "language_loss": 0.89666653, "learning_rate": 2.666279589484115e-06, "loss": 0.91819024, "num_input_tokens_seen": 73725045, "step": 3411, "time_per_iteration": 2.541153907775879 }, { "auxiliary_loss_clip": 0.01127034, "auxiliary_loss_mlp": 0.01024678, "balance_loss_clip": 1.04674268, "balance_loss_mlp": 1.01692891, "epoch": 0.41026874286057835, "flos": 19094529680640.0, "grad_norm": 1.9883902105830888, "language_loss": 0.80995059, "learning_rate": 2.6655450641687435e-06, "loss": 0.83146769, "num_input_tokens_seen": 73742610, "step": 3412, "time_per_iteration": 2.542750597000122 }, { "auxiliary_loss_clip": 0.01187967, "auxiliary_loss_mlp": 0.01029251, "balance_loss_clip": 1.05880988, "balance_loss_mlp": 1.02080488, "epoch": 0.41038898575121746, "flos": 31209568588800.0, "grad_norm": 1.64470787294491, "language_loss": 0.69458604, "learning_rate": 2.664810437890715e-06, "loss": 0.71675819, "num_input_tokens_seen": 73764280, "step": 3413, "time_per_iteration": 2.5264055728912354 }, { "auxiliary_loss_clip": 0.01109018, "auxiliary_loss_mlp": 0.01028718, "balance_loss_clip": 1.0532831, "balance_loss_mlp": 1.02134216, "epoch": 0.41050922864185657, "flos": 14355865895040.0, "grad_norm": 4.289189759480733, "language_loss": 0.79475296, "learning_rate": 2.6640757107614714e-06, "loss": 0.81613034, "num_input_tokens_seen": 73782375, "step": 3414, "time_per_iteration": 2.571340799331665 }, { "auxiliary_loss_clip": 0.0113986, "auxiliary_loss_mlp": 0.01026666, "balance_loss_clip": 1.05248928, "balance_loss_mlp": 1.01807749, "epoch": 0.4106294715324956, "flos": 30956290813440.0, "grad_norm": 2.09267630270332, "language_loss": 0.68926632, "learning_rate": 2.6633408828924697e-06, "loss": 0.7109316, "num_input_tokens_seen": 73801240, "step": 3415, "time_per_iteration": 2.6018478870391846 }, { "auxiliary_loss_clip": 0.01153017, "auxiliary_loss_mlp": 0.01036531, "balance_loss_clip": 1.05497253, "balance_loss_mlp": 1.02856743, "epoch": 0.41074971442313474, "flos": 24457321209600.0, "grad_norm": 1.5546567530053277, "language_loss": 0.7016114, "learning_rate": 2.662605954395185e-06, "loss": 0.72350681, "num_input_tokens_seen": 73821200, "step": 3416, "time_per_iteration": 2.5898351669311523 }, { "auxiliary_loss_clip": 0.01173896, "auxiliary_loss_mlp": 0.01025115, "balance_loss_clip": 1.0532558, "balance_loss_mlp": 1.01743186, "epoch": 0.41086995731377385, "flos": 21542991235200.0, "grad_norm": 1.7748157847477455, "language_loss": 0.83809614, "learning_rate": 2.6618709253811027e-06, "loss": 0.86008626, "num_input_tokens_seen": 73840655, "step": 3417, "time_per_iteration": 2.5107195377349854 }, { "auxiliary_loss_clip": 0.01185247, "auxiliary_loss_mlp": 0.01026453, "balance_loss_clip": 1.05799937, "balance_loss_mlp": 1.01951528, "epoch": 0.4109902002044129, "flos": 20702753314560.0, "grad_norm": 1.6345945953980663, "language_loss": 0.87304175, "learning_rate": 2.6611357959617277e-06, "loss": 0.89515877, "num_input_tokens_seen": 73860275, "step": 3418, "time_per_iteration": 2.4438281059265137 }, { "auxiliary_loss_clip": 0.01138697, "auxiliary_loss_mlp": 0.01035899, "balance_loss_clip": 1.04998684, "balance_loss_mlp": 1.02713096, "epoch": 0.411110443095052, "flos": 18179992477440.0, "grad_norm": 1.7857831501145531, "language_loss": 0.90923345, "learning_rate": 2.660400566248578e-06, "loss": 0.93097949, "num_input_tokens_seen": 73878400, "step": 3419, "time_per_iteration": 2.5350852012634277 }, { "auxiliary_loss_clip": 0.01144094, "auxiliary_loss_mlp": 0.0103262, "balance_loss_clip": 1.04989147, "balance_loss_mlp": 1.0232445, "epoch": 0.41123068598569107, "flos": 14575244209920.0, "grad_norm": 2.2793699006355044, "language_loss": 0.66624498, "learning_rate": 2.6596652363531876e-06, "loss": 0.68801212, "num_input_tokens_seen": 73894275, "step": 3420, "time_per_iteration": 2.5832343101501465 }, { "auxiliary_loss_clip": 0.01184643, "auxiliary_loss_mlp": 0.01026623, "balance_loss_clip": 1.05510128, "balance_loss_mlp": 1.01871634, "epoch": 0.4113509288763302, "flos": 21177995184000.0, "grad_norm": 1.4748049450188518, "language_loss": 0.78456891, "learning_rate": 2.6589298063871055e-06, "loss": 0.80668163, "num_input_tokens_seen": 73914450, "step": 3421, "time_per_iteration": 2.444382429122925 }, { "auxiliary_loss_clip": 0.0118418, "auxiliary_loss_mlp": 0.0102781, "balance_loss_clip": 1.05450559, "balance_loss_mlp": 1.01916766, "epoch": 0.4114711717669693, "flos": 18442212739200.0, "grad_norm": 1.898926543113717, "language_loss": 0.69418907, "learning_rate": 2.658194276461895e-06, "loss": 0.71630895, "num_input_tokens_seen": 73932375, "step": 3422, "time_per_iteration": 2.4294989109039307 }, { "auxiliary_loss_clip": 0.01159555, "auxiliary_loss_mlp": 0.01028998, "balance_loss_clip": 1.05047846, "balance_loss_mlp": 1.01993859, "epoch": 0.41159141465760835, "flos": 27233395735680.0, "grad_norm": 2.1380857066121157, "language_loss": 0.67110312, "learning_rate": 2.6574586466891368e-06, "loss": 0.69298863, "num_input_tokens_seen": 73952850, "step": 3423, "time_per_iteration": 2.564086675643921 }, { "auxiliary_loss_clip": 0.01155646, "auxiliary_loss_mlp": 0.00762543, "balance_loss_clip": 1.05102348, "balance_loss_mlp": 1.00011778, "epoch": 0.41171165754824746, "flos": 20006876154240.0, "grad_norm": 3.1246470547510246, "language_loss": 0.64620644, "learning_rate": 2.6567229171804247e-06, "loss": 0.66538835, "num_input_tokens_seen": 73970735, "step": 3424, "time_per_iteration": 2.495319128036499 }, { "auxiliary_loss_clip": 0.01149574, "auxiliary_loss_mlp": 0.01038115, "balance_loss_clip": 1.0475421, "balance_loss_mlp": 1.02885866, "epoch": 0.41183190043888657, "flos": 18004318035840.0, "grad_norm": 2.5008071495225725, "language_loss": 0.87688345, "learning_rate": 2.655987088047368e-06, "loss": 0.89876032, "num_input_tokens_seen": 73989080, "step": 3425, "time_per_iteration": 2.4999303817749023 }, { "auxiliary_loss_clip": 0.01150703, "auxiliary_loss_mlp": 0.01031948, "balance_loss_clip": 1.04946041, "balance_loss_mlp": 1.02320433, "epoch": 0.4119521433295256, "flos": 27163370171520.0, "grad_norm": 2.463150626199949, "language_loss": 0.78562689, "learning_rate": 2.6552511594015912e-06, "loss": 0.80745339, "num_input_tokens_seen": 74009470, "step": 3426, "time_per_iteration": 2.5535061359405518 }, { "auxiliary_loss_clip": 0.01155337, "auxiliary_loss_mlp": 0.01028563, "balance_loss_clip": 1.04829454, "balance_loss_mlp": 1.01948524, "epoch": 0.41207238622016473, "flos": 15122020014720.0, "grad_norm": 1.915374764288529, "language_loss": 0.85259998, "learning_rate": 2.654515131354735e-06, "loss": 0.874439, "num_input_tokens_seen": 74027735, "step": 3427, "time_per_iteration": 2.483039379119873 }, { "auxiliary_loss_clip": 0.01143483, "auxiliary_loss_mlp": 0.010282, "balance_loss_clip": 1.05152774, "balance_loss_mlp": 1.02076781, "epoch": 0.41219262911080384, "flos": 27052872958080.0, "grad_norm": 2.4105201343755893, "language_loss": 0.84782207, "learning_rate": 2.653779004018453e-06, "loss": 0.8695389, "num_input_tokens_seen": 74048300, "step": 3428, "time_per_iteration": 3.4264934062957764 }, { "auxiliary_loss_clip": 0.01149484, "auxiliary_loss_mlp": 0.01022603, "balance_loss_clip": 1.05072546, "balance_loss_mlp": 1.01456213, "epoch": 0.4123128720014429, "flos": 24686360282880.0, "grad_norm": 1.8627355113254442, "language_loss": 0.82521957, "learning_rate": 2.653042777504417e-06, "loss": 0.8469404, "num_input_tokens_seen": 74070890, "step": 3429, "time_per_iteration": 2.5591697692871094 }, { "auxiliary_loss_clip": 0.01165088, "auxiliary_loss_mlp": 0.01025107, "balance_loss_clip": 1.05193996, "balance_loss_mlp": 1.01668453, "epoch": 0.412433114892082, "flos": 26244774731520.0, "grad_norm": 1.7746401224407709, "language_loss": 0.79560387, "learning_rate": 2.6523064519243105e-06, "loss": 0.81750584, "num_input_tokens_seen": 74090460, "step": 3430, "time_per_iteration": 2.5351781845092773 }, { "auxiliary_loss_clip": 0.01171847, "auxiliary_loss_mlp": 0.01033445, "balance_loss_clip": 1.05414796, "balance_loss_mlp": 1.024647, "epoch": 0.4125533577827211, "flos": 21361031913600.0, "grad_norm": 2.4688279827538633, "language_loss": 0.79005921, "learning_rate": 2.6515700273898333e-06, "loss": 0.81211209, "num_input_tokens_seen": 74108335, "step": 3431, "time_per_iteration": 2.4668734073638916 }, { "auxiliary_loss_clip": 0.01146948, "auxiliary_loss_mlp": 0.01032093, "balance_loss_clip": 1.05278516, "balance_loss_mlp": 1.02311659, "epoch": 0.4126736006733602, "flos": 26067556005120.0, "grad_norm": 1.8607911076876527, "language_loss": 0.68730193, "learning_rate": 2.6508335040127018e-06, "loss": 0.70909238, "num_input_tokens_seen": 74128030, "step": 3432, "time_per_iteration": 3.360414981842041 }, { "auxiliary_loss_clip": 0.01175638, "auxiliary_loss_mlp": 0.01029592, "balance_loss_clip": 1.05609465, "balance_loss_mlp": 1.02158082, "epoch": 0.4127938435639993, "flos": 25666146541440.0, "grad_norm": 1.9936524485733829, "language_loss": 0.77026403, "learning_rate": 2.6500968819046446e-06, "loss": 0.79231644, "num_input_tokens_seen": 74148330, "step": 3433, "time_per_iteration": 2.518282413482666 }, { "auxiliary_loss_clip": 0.01128342, "auxiliary_loss_mlp": 0.01029444, "balance_loss_clip": 1.04418397, "balance_loss_mlp": 1.02111101, "epoch": 0.4129140864546384, "flos": 17995914253440.0, "grad_norm": 2.3148799500342108, "language_loss": 0.58738357, "learning_rate": 2.649360161177408e-06, "loss": 0.60896146, "num_input_tokens_seen": 74163390, "step": 3434, "time_per_iteration": 2.4858314990997314 }, { "auxiliary_loss_clip": 0.01178267, "auxiliary_loss_mlp": 0.0102849, "balance_loss_clip": 1.05378246, "balance_loss_mlp": 1.02030063, "epoch": 0.41303432934527745, "flos": 23732895715200.0, "grad_norm": 2.182380081991096, "language_loss": 0.73062766, "learning_rate": 2.6486233419427504e-06, "loss": 0.7526952, "num_input_tokens_seen": 74183205, "step": 3435, "time_per_iteration": 3.205916166305542 }, { "auxiliary_loss_clip": 0.01133665, "auxiliary_loss_mlp": 0.01025716, "balance_loss_clip": 1.04971623, "balance_loss_mlp": 1.0169239, "epoch": 0.41315457223591656, "flos": 19755286318080.0, "grad_norm": 2.440945576826226, "language_loss": 0.75222224, "learning_rate": 2.6478864243124484e-06, "loss": 0.77381611, "num_input_tokens_seen": 74202870, "step": 3436, "time_per_iteration": 2.5269200801849365 }, { "auxiliary_loss_clip": 0.01170382, "auxiliary_loss_mlp": 0.01022709, "balance_loss_clip": 1.05038667, "balance_loss_mlp": 1.01523471, "epoch": 0.4132748151265556, "flos": 20923316778240.0, "grad_norm": 3.817682558471954, "language_loss": 0.8523711, "learning_rate": 2.6471494083982903e-06, "loss": 0.87430197, "num_input_tokens_seen": 74222255, "step": 3437, "time_per_iteration": 3.264099597930908 }, { "auxiliary_loss_clip": 0.01144507, "auxiliary_loss_mlp": 0.0102554, "balance_loss_clip": 1.0489893, "balance_loss_mlp": 1.01801169, "epoch": 0.4133950580171947, "flos": 32232520016640.0, "grad_norm": 1.6900278272702725, "language_loss": 0.74777019, "learning_rate": 2.6464122943120818e-06, "loss": 0.76947063, "num_input_tokens_seen": 74242480, "step": 3438, "time_per_iteration": 2.6100597381591797 }, { "auxiliary_loss_clip": 0.01142309, "auxiliary_loss_mlp": 0.01025588, "balance_loss_clip": 1.05198514, "balance_loss_mlp": 1.01710606, "epoch": 0.41351530090783384, "flos": 23292487059840.0, "grad_norm": 2.8451429782575164, "language_loss": 0.82532728, "learning_rate": 2.645675082165642e-06, "loss": 0.8470062, "num_input_tokens_seen": 74258690, "step": 3439, "time_per_iteration": 2.526207447052002 }, { "auxiliary_loss_clip": 0.0115597, "auxiliary_loss_mlp": 0.01030741, "balance_loss_clip": 1.05181456, "balance_loss_mlp": 1.0220449, "epoch": 0.4136355437984729, "flos": 25593571111680.0, "grad_norm": 1.9902155610622383, "language_loss": 0.75291467, "learning_rate": 2.644937772070806e-06, "loss": 0.77478182, "num_input_tokens_seen": 74277135, "step": 3440, "time_per_iteration": 2.5322275161743164 }, { "auxiliary_loss_clip": 0.01185336, "auxiliary_loss_mlp": 0.0102605, "balance_loss_clip": 1.05458558, "balance_loss_mlp": 1.01757407, "epoch": 0.413755786689112, "flos": 19828615933440.0, "grad_norm": 2.225910037209359, "language_loss": 0.83527458, "learning_rate": 2.6442003641394225e-06, "loss": 0.8573885, "num_input_tokens_seen": 74294730, "step": 3441, "time_per_iteration": 2.4213078022003174 }, { "auxiliary_loss_clip": 0.01153504, "auxiliary_loss_mlp": 0.01027351, "balance_loss_clip": 1.04937196, "balance_loss_mlp": 1.01957822, "epoch": 0.4138760295797511, "flos": 26870446759680.0, "grad_norm": 1.564187576833067, "language_loss": 0.83831024, "learning_rate": 2.643462858483356e-06, "loss": 0.86011875, "num_input_tokens_seen": 74315015, "step": 3442, "time_per_iteration": 2.5670652389526367 }, { "auxiliary_loss_clip": 0.01125422, "auxiliary_loss_mlp": 0.01031912, "balance_loss_clip": 1.04939699, "balance_loss_mlp": 1.02306676, "epoch": 0.41399627247039017, "flos": 16399254798720.0, "grad_norm": 1.8494849239088171, "language_loss": 0.72705418, "learning_rate": 2.6427252552144856e-06, "loss": 0.74862754, "num_input_tokens_seen": 74333665, "step": 3443, "time_per_iteration": 2.5562806129455566 }, { "auxiliary_loss_clip": 0.0118492, "auxiliary_loss_mlp": 0.01033638, "balance_loss_clip": 1.05346036, "balance_loss_mlp": 1.02501965, "epoch": 0.4141165153610293, "flos": 22930220442240.0, "grad_norm": 2.140378889839288, "language_loss": 0.75185335, "learning_rate": 2.6419875544447044e-06, "loss": 0.77403891, "num_input_tokens_seen": 74355065, "step": 3444, "time_per_iteration": 2.5564467906951904 }, { "auxiliary_loss_clip": 0.01185332, "auxiliary_loss_mlp": 0.01032517, "balance_loss_clip": 1.05342913, "balance_loss_mlp": 1.02368903, "epoch": 0.4142367582516684, "flos": 25192556697600.0, "grad_norm": 1.764905075102477, "language_loss": 0.71788776, "learning_rate": 2.6412497562859218e-06, "loss": 0.74006623, "num_input_tokens_seen": 74376345, "step": 3445, "time_per_iteration": 2.4802005290985107 }, { "auxiliary_loss_clip": 0.01173914, "auxiliary_loss_mlp": 0.01028674, "balance_loss_clip": 1.05203605, "balance_loss_mlp": 1.02009642, "epoch": 0.41435700114230745, "flos": 21690476478720.0, "grad_norm": 2.1571402197682286, "language_loss": 0.76271755, "learning_rate": 2.6405118608500617e-06, "loss": 0.78474349, "num_input_tokens_seen": 74395170, "step": 3446, "time_per_iteration": 2.4645159244537354 }, { "auxiliary_loss_clip": 0.01136026, "auxiliary_loss_mlp": 0.01026862, "balance_loss_clip": 1.0526613, "balance_loss_mlp": 1.01902962, "epoch": 0.41447724403294656, "flos": 25995160143360.0, "grad_norm": 2.178869720562306, "language_loss": 0.81265402, "learning_rate": 2.6397738682490613e-06, "loss": 0.83428293, "num_input_tokens_seen": 74416070, "step": 3447, "time_per_iteration": 2.579881191253662 }, { "auxiliary_loss_clip": 0.01183748, "auxiliary_loss_mlp": 0.01026675, "balance_loss_clip": 1.05414104, "balance_loss_mlp": 1.01855087, "epoch": 0.41459748692358567, "flos": 18259678800000.0, "grad_norm": 2.4376251487091425, "language_loss": 0.75320101, "learning_rate": 2.6390357785948734e-06, "loss": 0.77530515, "num_input_tokens_seen": 74433185, "step": 3448, "time_per_iteration": 2.4142868518829346 }, { "auxiliary_loss_clip": 0.01171666, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.0545063, "balance_loss_mlp": 1.02364588, "epoch": 0.4147177298142247, "flos": 24168456034560.0, "grad_norm": 1.705961680271532, "language_loss": 0.80198503, "learning_rate": 2.6382975919994667e-06, "loss": 0.82402205, "num_input_tokens_seen": 74453760, "step": 3449, "time_per_iteration": 2.4977855682373047 }, { "auxiliary_loss_clip": 0.01157141, "auxiliary_loss_mlp": 0.01026213, "balance_loss_clip": 1.05161262, "balance_loss_mlp": 1.01876807, "epoch": 0.41483797270486383, "flos": 20084659056000.0, "grad_norm": 1.5213965130436213, "language_loss": 0.72941345, "learning_rate": 2.637559308574822e-06, "loss": 0.75124699, "num_input_tokens_seen": 74473505, "step": 3450, "time_per_iteration": 2.498060464859009 }, { "auxiliary_loss_clip": 0.01183914, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.0540688, "balance_loss_mlp": 1.02298522, "epoch": 0.4149582155955029, "flos": 30081040110720.0, "grad_norm": 2.125749644174213, "language_loss": 0.70971662, "learning_rate": 2.6368209284329376e-06, "loss": 0.73186505, "num_input_tokens_seen": 74494135, "step": 3451, "time_per_iteration": 2.496692657470703 }, { "auxiliary_loss_clip": 0.01167887, "auxiliary_loss_mlp": 0.01029783, "balance_loss_clip": 1.04943669, "balance_loss_mlp": 1.02143788, "epoch": 0.415078458486142, "flos": 16764394504320.0, "grad_norm": 1.8856325771597418, "language_loss": 0.74971163, "learning_rate": 2.636082451685825e-06, "loss": 0.77168834, "num_input_tokens_seen": 74512335, "step": 3452, "time_per_iteration": 2.426292657852173 }, { "auxiliary_loss_clip": 0.01159274, "auxiliary_loss_mlp": 0.01025832, "balance_loss_clip": 1.05352104, "balance_loss_mlp": 1.01776731, "epoch": 0.4151987013767811, "flos": 26033692458240.0, "grad_norm": 1.5080057706844285, "language_loss": 0.86405468, "learning_rate": 2.6353438784455094e-06, "loss": 0.88590568, "num_input_tokens_seen": 74535620, "step": 3453, "time_per_iteration": 2.5559446811676025 }, { "auxiliary_loss_clip": 0.01154006, "auxiliary_loss_mlp": 0.01030048, "balance_loss_clip": 1.05315661, "balance_loss_mlp": 1.02099359, "epoch": 0.41531894426742016, "flos": 24608002763520.0, "grad_norm": 2.2212199936848736, "language_loss": 0.72012186, "learning_rate": 2.6346052088240326e-06, "loss": 0.74196237, "num_input_tokens_seen": 74555140, "step": 3454, "time_per_iteration": 2.524017333984375 }, { "auxiliary_loss_clip": 0.01156956, "auxiliary_loss_mlp": 0.01029259, "balance_loss_clip": 1.04999161, "balance_loss_mlp": 1.02067614, "epoch": 0.4154391871580593, "flos": 14975791747200.0, "grad_norm": 2.11453114294564, "language_loss": 0.7770049, "learning_rate": 2.63386644293345e-06, "loss": 0.79886699, "num_input_tokens_seen": 74571485, "step": 3455, "time_per_iteration": 3.2934179306030273 }, { "auxiliary_loss_clip": 0.0113483, "auxiliary_loss_mlp": 0.01029339, "balance_loss_clip": 1.04548907, "balance_loss_mlp": 1.02170956, "epoch": 0.4155594300486984, "flos": 14647173194880.0, "grad_norm": 2.52187612389324, "language_loss": 0.82948858, "learning_rate": 2.633127580885833e-06, "loss": 0.85113031, "num_input_tokens_seen": 74585985, "step": 3456, "time_per_iteration": 2.497652769088745 }, { "auxiliary_loss_clip": 0.01183387, "auxiliary_loss_mlp": 0.01032019, "balance_loss_clip": 1.05622053, "balance_loss_mlp": 1.02385318, "epoch": 0.41567967293933744, "flos": 29497276275840.0, "grad_norm": 2.0215106515898817, "language_loss": 0.64998156, "learning_rate": 2.632388622793265e-06, "loss": 0.67213559, "num_input_tokens_seen": 74605140, "step": 3457, "time_per_iteration": 2.494825839996338 }, { "auxiliary_loss_clip": 0.01170144, "auxiliary_loss_mlp": 0.01029691, "balance_loss_clip": 1.05432391, "balance_loss_mlp": 1.02155519, "epoch": 0.41579991582997655, "flos": 19238387650560.0, "grad_norm": 1.7349252082218465, "language_loss": 0.67804873, "learning_rate": 2.6316495687678457e-06, "loss": 0.70004714, "num_input_tokens_seen": 74623790, "step": 3458, "time_per_iteration": 2.468646287918091 }, { "auxiliary_loss_clip": 0.01120027, "auxiliary_loss_mlp": 0.01024914, "balance_loss_clip": 1.04591346, "balance_loss_mlp": 1.01664686, "epoch": 0.41592015872061566, "flos": 24462061804800.0, "grad_norm": 2.88710394098374, "language_loss": 0.76728904, "learning_rate": 2.6309104189216887e-06, "loss": 0.78873843, "num_input_tokens_seen": 74641355, "step": 3459, "time_per_iteration": 3.4454345703125 }, { "auxiliary_loss_clip": 0.01128556, "auxiliary_loss_mlp": 0.00763053, "balance_loss_clip": 1.0470171, "balance_loss_mlp": 1.00022221, "epoch": 0.4160404016112547, "flos": 20775651966720.0, "grad_norm": 2.385908597627895, "language_loss": 0.74884462, "learning_rate": 2.630171173366923e-06, "loss": 0.76776075, "num_input_tokens_seen": 74657155, "step": 3460, "time_per_iteration": 2.5208818912506104 }, { "auxiliary_loss_clip": 0.01124696, "auxiliary_loss_mlp": 0.01025763, "balance_loss_clip": 1.04675901, "balance_loss_mlp": 1.0173943, "epoch": 0.41616064450189383, "flos": 13916462820480.0, "grad_norm": 2.436091787653902, "language_loss": 0.74587989, "learning_rate": 2.629431832215691e-06, "loss": 0.76738441, "num_input_tokens_seen": 74671960, "step": 3461, "time_per_iteration": 2.5265612602233887 }, { "auxiliary_loss_clip": 0.01148906, "auxiliary_loss_mlp": 0.01022683, "balance_loss_clip": 1.0486927, "balance_loss_mlp": 1.01473808, "epoch": 0.41628088739253294, "flos": 20010826650240.0, "grad_norm": 1.5957769260540788, "language_loss": 0.87647814, "learning_rate": 2.628692395580151e-06, "loss": 0.89819407, "num_input_tokens_seen": 74692050, "step": 3462, "time_per_iteration": 3.3256468772888184 }, { "auxiliary_loss_clip": 0.01095397, "auxiliary_loss_mlp": 0.01027883, "balance_loss_clip": 1.04392695, "balance_loss_mlp": 1.01952004, "epoch": 0.416401130283172, "flos": 29168801377920.0, "grad_norm": 1.9496326518963858, "language_loss": 0.79516274, "learning_rate": 2.6279528635724747e-06, "loss": 0.81639552, "num_input_tokens_seen": 74712205, "step": 3463, "time_per_iteration": 3.423001527786255 }, { "auxiliary_loss_clip": 0.01169522, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.05216312, "balance_loss_mlp": 1.01903939, "epoch": 0.4165213731738111, "flos": 16246813478400.0, "grad_norm": 3.344907597717156, "language_loss": 0.78575432, "learning_rate": 2.627213236304848e-06, "loss": 0.80772573, "num_input_tokens_seen": 74729005, "step": 3464, "time_per_iteration": 2.4564061164855957 }, { "auxiliary_loss_clip": 0.01173144, "auxiliary_loss_mlp": 0.01026151, "balance_loss_clip": 1.05422044, "balance_loss_mlp": 1.01782405, "epoch": 0.4166416160644502, "flos": 33765438787200.0, "grad_norm": 1.9955075768981707, "language_loss": 0.70592636, "learning_rate": 2.626473513889472e-06, "loss": 0.72791928, "num_input_tokens_seen": 74751385, "step": 3465, "time_per_iteration": 2.591647148132324 }, { "auxiliary_loss_clip": 0.01159776, "auxiliary_loss_mlp": 0.01033299, "balance_loss_clip": 1.05077195, "balance_loss_mlp": 1.02512693, "epoch": 0.41676185895508927, "flos": 20917498775040.0, "grad_norm": 1.806155721163052, "language_loss": 0.83063149, "learning_rate": 2.625733696438562e-06, "loss": 0.85256219, "num_input_tokens_seen": 74768890, "step": 3466, "time_per_iteration": 2.469381332397461 }, { "auxiliary_loss_clip": 0.01152033, "auxiliary_loss_mlp": 0.01032967, "balance_loss_clip": 1.05059648, "balance_loss_mlp": 1.0249387, "epoch": 0.4168821018457284, "flos": 18406122549120.0, "grad_norm": 1.6478267641127824, "language_loss": 0.75351608, "learning_rate": 2.6249937840643476e-06, "loss": 0.77536607, "num_input_tokens_seen": 74787195, "step": 3467, "time_per_iteration": 2.4977591037750244 }, { "auxiliary_loss_clip": 0.01183263, "auxiliary_loss_mlp": 0.0076253, "balance_loss_clip": 1.05469751, "balance_loss_mlp": 1.00021815, "epoch": 0.41700234473636744, "flos": 18698399516160.0, "grad_norm": 1.8950878755130762, "language_loss": 0.66750717, "learning_rate": 2.6242537768790733e-06, "loss": 0.68696517, "num_input_tokens_seen": 74806350, "step": 3468, "time_per_iteration": 2.4356980323791504 }, { "auxiliary_loss_clip": 0.01170564, "auxiliary_loss_mlp": 0.01028495, "balance_loss_clip": 1.05308676, "balance_loss_mlp": 1.01998937, "epoch": 0.41712258762700655, "flos": 31033283616000.0, "grad_norm": 1.8723668590831297, "language_loss": 0.68660635, "learning_rate": 2.6235136749949975e-06, "loss": 0.70859694, "num_input_tokens_seen": 74829800, "step": 3469, "time_per_iteration": 2.580603837966919 }, { "auxiliary_loss_clip": 0.01180828, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.05264091, "balance_loss_mlp": 1.02044451, "epoch": 0.41724283051764566, "flos": 35914763877120.0, "grad_norm": 2.1710000814226094, "language_loss": 0.61695391, "learning_rate": 2.6227734785243924e-06, "loss": 0.63905704, "num_input_tokens_seen": 74849760, "step": 3470, "time_per_iteration": 2.570401906967163 }, { "auxiliary_loss_clip": 0.01103893, "auxiliary_loss_mlp": 0.01025199, "balance_loss_clip": 1.04281986, "balance_loss_mlp": 1.0173018, "epoch": 0.4173630734082847, "flos": 25333649320320.0, "grad_norm": 11.067771064294295, "language_loss": 0.79270077, "learning_rate": 2.6220331875795466e-06, "loss": 0.81399167, "num_input_tokens_seen": 74869110, "step": 3471, "time_per_iteration": 2.623163938522339 }, { "auxiliary_loss_clip": 0.01165239, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.05230427, "balance_loss_mlp": 1.02164721, "epoch": 0.4174833162989238, "flos": 26685398868480.0, "grad_norm": 1.653054843406505, "language_loss": 0.7503953, "learning_rate": 2.62129280227276e-06, "loss": 0.77234662, "num_input_tokens_seen": 74889110, "step": 3472, "time_per_iteration": 2.5103964805603027 }, { "auxiliary_loss_clip": 0.01172667, "auxiliary_loss_mlp": 0.01034589, "balance_loss_clip": 1.05238307, "balance_loss_mlp": 1.0259347, "epoch": 0.41760355918956293, "flos": 74739584010240.0, "grad_norm": 1.9367760155530178, "language_loss": 0.68340445, "learning_rate": 2.62055232271635e-06, "loss": 0.705477, "num_input_tokens_seen": 74916260, "step": 3473, "time_per_iteration": 2.8592793941497803 }, { "auxiliary_loss_clip": 0.01130314, "auxiliary_loss_mlp": 0.01027395, "balance_loss_clip": 1.04696918, "balance_loss_mlp": 1.01933026, "epoch": 0.417723802080202, "flos": 14317513148160.0, "grad_norm": 1.945403972843583, "language_loss": 0.87755072, "learning_rate": 2.619811749022646e-06, "loss": 0.89912778, "num_input_tokens_seen": 74931570, "step": 3474, "time_per_iteration": 2.4995055198669434 }, { "auxiliary_loss_clip": 0.01170781, "auxiliary_loss_mlp": 0.01031733, "balance_loss_clip": 1.05418348, "balance_loss_mlp": 1.02320981, "epoch": 0.4178440449708411, "flos": 14643797316480.0, "grad_norm": 2.291969290459892, "language_loss": 0.71711504, "learning_rate": 2.6190710813039917e-06, "loss": 0.73914027, "num_input_tokens_seen": 74944695, "step": 3475, "time_per_iteration": 2.429954767227173 }, { "auxiliary_loss_clip": 0.01119317, "auxiliary_loss_mlp": 0.00763779, "balance_loss_clip": 1.04352081, "balance_loss_mlp": 1.00018811, "epoch": 0.4179642878614802, "flos": 21507296094720.0, "grad_norm": 2.1346174732178995, "language_loss": 0.83984888, "learning_rate": 2.618330319672747e-06, "loss": 0.85867989, "num_input_tokens_seen": 74964115, "step": 3476, "time_per_iteration": 2.5777759552001953 }, { "auxiliary_loss_clip": 0.01183848, "auxiliary_loss_mlp": 0.01027566, "balance_loss_clip": 1.0543288, "balance_loss_mlp": 1.01938236, "epoch": 0.41808453075211927, "flos": 18441997257600.0, "grad_norm": 2.360735482633372, "language_loss": 0.91842341, "learning_rate": 2.617589464241284e-06, "loss": 0.94053757, "num_input_tokens_seen": 74978515, "step": 3477, "time_per_iteration": 2.418271780014038 }, { "auxiliary_loss_clip": 0.01142993, "auxiliary_loss_mlp": 0.01023973, "balance_loss_clip": 1.05011487, "balance_loss_mlp": 1.01650465, "epoch": 0.4182047736427584, "flos": 20301020628480.0, "grad_norm": 1.989406817569932, "language_loss": 0.74753785, "learning_rate": 2.6168485151219914e-06, "loss": 0.76920748, "num_input_tokens_seen": 74998135, "step": 3478, "time_per_iteration": 2.545653820037842 }, { "auxiliary_loss_clip": 0.01169943, "auxiliary_loss_mlp": 0.01026444, "balance_loss_clip": 1.05275559, "balance_loss_mlp": 1.01809978, "epoch": 0.4183250165333975, "flos": 18876623823360.0, "grad_norm": 7.008066589815805, "language_loss": 0.71727669, "learning_rate": 2.616107472427269e-06, "loss": 0.73924047, "num_input_tokens_seen": 75012830, "step": 3479, "time_per_iteration": 2.4293010234832764 }, { "auxiliary_loss_clip": 0.01175003, "auxiliary_loss_mlp": 0.01025116, "balance_loss_clip": 1.05257463, "balance_loss_mlp": 1.01675367, "epoch": 0.41844525942403654, "flos": 17740050698880.0, "grad_norm": 2.9363689596502995, "language_loss": 0.76565349, "learning_rate": 2.615366336269533e-06, "loss": 0.78765464, "num_input_tokens_seen": 75026495, "step": 3480, "time_per_iteration": 2.436577796936035 }, { "auxiliary_loss_clip": 0.01185547, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.05353904, "balance_loss_mlp": 1.02694905, "epoch": 0.41856550231467565, "flos": 18361377181440.0, "grad_norm": 2.6185338129962386, "language_loss": 0.80420965, "learning_rate": 2.6146251067612126e-06, "loss": 0.82642591, "num_input_tokens_seen": 75041970, "step": 3481, "time_per_iteration": 2.412691354751587 }, { "auxiliary_loss_clip": 0.01168067, "auxiliary_loss_mlp": 0.01028688, "balance_loss_clip": 1.05354917, "balance_loss_mlp": 1.02039719, "epoch": 0.41868574520531476, "flos": 22781801445120.0, "grad_norm": 1.5559728647902487, "language_loss": 0.82525861, "learning_rate": 2.6138837840147525e-06, "loss": 0.84722614, "num_input_tokens_seen": 75061005, "step": 3482, "time_per_iteration": 3.317591428756714 }, { "auxiliary_loss_clip": 0.01138692, "auxiliary_loss_mlp": 0.01028026, "balance_loss_clip": 1.04896796, "balance_loss_mlp": 1.01991975, "epoch": 0.4188059880959538, "flos": 13699167494400.0, "grad_norm": 2.2546182438774176, "language_loss": 0.76345217, "learning_rate": 2.6131423681426103e-06, "loss": 0.7851193, "num_input_tokens_seen": 75076920, "step": 3483, "time_per_iteration": 2.500379800796509 }, { "auxiliary_loss_clip": 0.01183503, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.0551976, "balance_loss_mlp": 1.01827884, "epoch": 0.41892623098659293, "flos": 37818281220480.0, "grad_norm": 1.6276821832869575, "language_loss": 0.72819209, "learning_rate": 2.6124008592572587e-06, "loss": 0.75028282, "num_input_tokens_seen": 75100905, "step": 3484, "time_per_iteration": 2.60031795501709 }, { "auxiliary_loss_clip": 0.01186815, "auxiliary_loss_mlp": 0.01028761, "balance_loss_clip": 1.05375266, "balance_loss_mlp": 1.01998127, "epoch": 0.419046473877232, "flos": 23258874908160.0, "grad_norm": 3.085197331183568, "language_loss": 0.82063305, "learning_rate": 2.6116592574711835e-06, "loss": 0.84278882, "num_input_tokens_seen": 75119205, "step": 3485, "time_per_iteration": 2.4477596282958984 }, { "auxiliary_loss_clip": 0.01187818, "auxiliary_loss_mlp": 0.01037582, "balance_loss_clip": 1.05585194, "balance_loss_mlp": 1.02883172, "epoch": 0.4191667167678711, "flos": 20741034234240.0, "grad_norm": 1.7808387905703396, "language_loss": 0.84116077, "learning_rate": 2.6109175628968853e-06, "loss": 0.86341482, "num_input_tokens_seen": 75138970, "step": 3486, "time_per_iteration": 3.312459707260132 }, { "auxiliary_loss_clip": 0.01160745, "auxiliary_loss_mlp": 0.01028275, "balance_loss_clip": 1.05087364, "balance_loss_mlp": 1.02054977, "epoch": 0.4192869596585102, "flos": 23586416052480.0, "grad_norm": 1.8839716590873883, "language_loss": 0.82592857, "learning_rate": 2.610175775646878e-06, "loss": 0.84781879, "num_input_tokens_seen": 75157550, "step": 3487, "time_per_iteration": 2.485684394836426 }, { "auxiliary_loss_clip": 0.01152335, "auxiliary_loss_mlp": 0.01029144, "balance_loss_clip": 1.04782867, "balance_loss_mlp": 1.02097869, "epoch": 0.41940720254914926, "flos": 25081269384960.0, "grad_norm": 1.9322295648543002, "language_loss": 0.7250151, "learning_rate": 2.6094338958336907e-06, "loss": 0.74682987, "num_input_tokens_seen": 75176220, "step": 3488, "time_per_iteration": 3.348875045776367 }, { "auxiliary_loss_clip": 0.01155766, "auxiliary_loss_mlp": 0.01027057, "balance_loss_clip": 1.05182672, "balance_loss_mlp": 1.01898026, "epoch": 0.41952744543978837, "flos": 15554132628480.0, "grad_norm": 2.0032949221501264, "language_loss": 0.82409155, "learning_rate": 2.608691923569867e-06, "loss": 0.84591979, "num_input_tokens_seen": 75193095, "step": 3489, "time_per_iteration": 2.5133297443389893 }, { "auxiliary_loss_clip": 0.01171894, "auxiliary_loss_mlp": 0.01029811, "balance_loss_clip": 1.05367529, "balance_loss_mlp": 1.02166343, "epoch": 0.4196476883304275, "flos": 24644775312000.0, "grad_norm": 1.6185373153168154, "language_loss": 0.75680375, "learning_rate": 2.6079498589679616e-06, "loss": 0.77882075, "num_input_tokens_seen": 75214185, "step": 3490, "time_per_iteration": 3.2565455436706543 }, { "auxiliary_loss_clip": 0.01105126, "auxiliary_loss_mlp": 0.01034595, "balance_loss_clip": 1.0413866, "balance_loss_mlp": 1.02512932, "epoch": 0.41976793122106654, "flos": 24531333183360.0, "grad_norm": 2.767977131496339, "language_loss": 0.75896966, "learning_rate": 2.6072077021405465e-06, "loss": 0.7803669, "num_input_tokens_seen": 75233020, "step": 3491, "time_per_iteration": 2.621495485305786 }, { "auxiliary_loss_clip": 0.01148216, "auxiliary_loss_mlp": 0.01033635, "balance_loss_clip": 1.04925704, "balance_loss_mlp": 1.0259335, "epoch": 0.41988817411170565, "flos": 21175301664000.0, "grad_norm": 1.697596358884439, "language_loss": 0.68812382, "learning_rate": 2.6064654532002054e-06, "loss": 0.70994234, "num_input_tokens_seen": 75252030, "step": 3492, "time_per_iteration": 2.519561767578125 }, { "auxiliary_loss_clip": 0.01184474, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.05488694, "balance_loss_mlp": 1.02591014, "epoch": 0.42000841700234476, "flos": 31649402626560.0, "grad_norm": 3.9879169386297106, "language_loss": 0.75748897, "learning_rate": 2.6057231122595375e-06, "loss": 0.7796734, "num_input_tokens_seen": 75273340, "step": 3493, "time_per_iteration": 2.5316569805145264 }, { "auxiliary_loss_clip": 0.01155415, "auxiliary_loss_mlp": 0.01031245, "balance_loss_clip": 1.04857957, "balance_loss_mlp": 1.022614, "epoch": 0.4201286598929838, "flos": 21281525159040.0, "grad_norm": 1.6304732473793624, "language_loss": 0.72744763, "learning_rate": 2.604980679431154e-06, "loss": 0.74931419, "num_input_tokens_seen": 75291580, "step": 3494, "time_per_iteration": 2.5114362239837646 }, { "auxiliary_loss_clip": 0.01170537, "auxiliary_loss_mlp": 0.01027123, "balance_loss_clip": 1.04991305, "balance_loss_mlp": 1.01922584, "epoch": 0.4202489027836229, "flos": 18546532813440.0, "grad_norm": 1.990481141980804, "language_loss": 0.7480433, "learning_rate": 2.604238154827684e-06, "loss": 0.77002001, "num_input_tokens_seen": 75308205, "step": 3495, "time_per_iteration": 2.4571053981781006 }, { "auxiliary_loss_clip": 0.01169991, "auxiliary_loss_mlp": 0.01024945, "balance_loss_clip": 1.05244744, "balance_loss_mlp": 1.01691628, "epoch": 0.42036914567426203, "flos": 19317643009920.0, "grad_norm": 1.8816238367556346, "language_loss": 0.72657776, "learning_rate": 2.6034955385617656e-06, "loss": 0.74852717, "num_input_tokens_seen": 75326535, "step": 3496, "time_per_iteration": 2.450303554534912 }, { "auxiliary_loss_clip": 0.01051489, "auxiliary_loss_mlp": 0.0100441, "balance_loss_clip": 1.01800179, "balance_loss_mlp": 1.00332499, "epoch": 0.4204893885649011, "flos": 67842942935040.0, "grad_norm": 0.8101331624344884, "language_loss": 0.61675388, "learning_rate": 2.6027528307460544e-06, "loss": 0.63731289, "num_input_tokens_seen": 75390540, "step": 3497, "time_per_iteration": 3.181201219558716 }, { "auxiliary_loss_clip": 0.01183969, "auxiliary_loss_mlp": 0.01025552, "balance_loss_clip": 1.05389047, "balance_loss_mlp": 1.01807761, "epoch": 0.4206096314555402, "flos": 21908777385600.0, "grad_norm": 2.066648623615736, "language_loss": 0.86420363, "learning_rate": 2.602010031493217e-06, "loss": 0.88629889, "num_input_tokens_seen": 75408770, "step": 3498, "time_per_iteration": 2.477231740951538 }, { "auxiliary_loss_clip": 0.01136994, "auxiliary_loss_mlp": 0.01032969, "balance_loss_clip": 1.04853821, "balance_loss_mlp": 1.02462125, "epoch": 0.42072987434617926, "flos": 29278185269760.0, "grad_norm": 1.9352074136477548, "language_loss": 0.86761618, "learning_rate": 2.6012671409159367e-06, "loss": 0.88931578, "num_input_tokens_seen": 75430105, "step": 3499, "time_per_iteration": 2.5866153240203857 }, { "auxiliary_loss_clip": 0.01150318, "auxiliary_loss_mlp": 0.01029862, "balance_loss_clip": 1.05144715, "balance_loss_mlp": 1.02129054, "epoch": 0.42085011723681837, "flos": 27600726170880.0, "grad_norm": 1.8928820323263145, "language_loss": 0.81662506, "learning_rate": 2.6005241591269097e-06, "loss": 0.83842683, "num_input_tokens_seen": 75449475, "step": 3500, "time_per_iteration": 2.536463975906372 }, { "auxiliary_loss_clip": 0.01135685, "auxiliary_loss_mlp": 0.01030621, "balance_loss_clip": 1.05114555, "balance_loss_mlp": 1.02265811, "epoch": 0.4209703601274575, "flos": 27818632028160.0, "grad_norm": 1.7220329486448918, "language_loss": 0.79491782, "learning_rate": 2.5997810862388454e-06, "loss": 0.81658089, "num_input_tokens_seen": 75469315, "step": 3501, "time_per_iteration": 2.63895845413208 }, { "auxiliary_loss_clip": 0.01155156, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.04859161, "balance_loss_mlp": 1.02044547, "epoch": 0.42109060301809653, "flos": 27525529048320.0, "grad_norm": 1.9726864276113674, "language_loss": 0.75441056, "learning_rate": 2.599037922364467e-06, "loss": 0.77624857, "num_input_tokens_seen": 75488215, "step": 3502, "time_per_iteration": 2.5483884811401367 }, { "auxiliary_loss_clip": 0.01135715, "auxiliary_loss_mlp": 0.01025802, "balance_loss_clip": 1.04949701, "balance_loss_mlp": 1.01752949, "epoch": 0.42121084590873564, "flos": 29314275459840.0, "grad_norm": 2.1339329649153487, "language_loss": 0.75387096, "learning_rate": 2.5982946676165112e-06, "loss": 0.77548611, "num_input_tokens_seen": 75507985, "step": 3503, "time_per_iteration": 2.5771071910858154 }, { "auxiliary_loss_clip": 0.01057201, "auxiliary_loss_mlp": 0.01003984, "balance_loss_clip": 1.03015924, "balance_loss_mlp": 1.00238693, "epoch": 0.42133108879937475, "flos": 67398835178880.0, "grad_norm": 0.7289518483371384, "language_loss": 0.57607305, "learning_rate": 2.5975513221077313e-06, "loss": 0.59668487, "num_input_tokens_seen": 75571955, "step": 3504, "time_per_iteration": 3.1944680213928223 }, { "auxiliary_loss_clip": 0.01147057, "auxiliary_loss_mlp": 0.01032023, "balance_loss_clip": 1.05017352, "balance_loss_mlp": 1.02370226, "epoch": 0.4214513316900138, "flos": 23106038538240.0, "grad_norm": 2.582364925543384, "language_loss": 0.88772297, "learning_rate": 2.5968078859508897e-06, "loss": 0.90951371, "num_input_tokens_seen": 75589155, "step": 3505, "time_per_iteration": 2.4954302310943604 }, { "auxiliary_loss_clip": 0.0116895, "auxiliary_loss_mlp": 0.01028058, "balance_loss_clip": 1.05304158, "balance_loss_mlp": 1.02011251, "epoch": 0.4215715745806529, "flos": 15336190857600.0, "grad_norm": 2.0607230537098666, "language_loss": 0.80054593, "learning_rate": 2.5960643592587673e-06, "loss": 0.82251602, "num_input_tokens_seen": 75606565, "step": 3506, "time_per_iteration": 2.4433610439300537 }, { "auxiliary_loss_clip": 0.01141058, "auxiliary_loss_mlp": 0.0102741, "balance_loss_clip": 1.04900134, "balance_loss_mlp": 1.01985788, "epoch": 0.42169181747129203, "flos": 22127257860480.0, "grad_norm": 2.0123536169178435, "language_loss": 0.81577432, "learning_rate": 2.5953207421441553e-06, "loss": 0.83745903, "num_input_tokens_seen": 75625165, "step": 3507, "time_per_iteration": 2.540236473083496 }, { "auxiliary_loss_clip": 0.01145165, "auxiliary_loss_mlp": 0.01031624, "balance_loss_clip": 1.05258, "balance_loss_mlp": 1.0239414, "epoch": 0.4218120603619311, "flos": 22630724841600.0, "grad_norm": 2.151817061630336, "language_loss": 0.75429857, "learning_rate": 2.5945770347198603e-06, "loss": 0.77606642, "num_input_tokens_seen": 75643320, "step": 3508, "time_per_iteration": 3.3331243991851807 }, { "auxiliary_loss_clip": 0.01150369, "auxiliary_loss_mlp": 0.01022054, "balance_loss_clip": 1.04902506, "balance_loss_mlp": 1.01468706, "epoch": 0.4219323032525702, "flos": 19682818629120.0, "grad_norm": 1.6912967643455545, "language_loss": 0.81626594, "learning_rate": 2.593833237098701e-06, "loss": 0.83799016, "num_input_tokens_seen": 75660920, "step": 3509, "time_per_iteration": 2.510230779647827 }, { "auxiliary_loss_clip": 0.01166519, "auxiliary_loss_mlp": 0.01029738, "balance_loss_clip": 1.04867649, "balance_loss_mlp": 1.02115536, "epoch": 0.4220525461432093, "flos": 30190747224960.0, "grad_norm": 1.8069589798945016, "language_loss": 0.6215992, "learning_rate": 2.593089349393512e-06, "loss": 0.64356172, "num_input_tokens_seen": 75681410, "step": 3510, "time_per_iteration": 2.546098470687866 }, { "auxiliary_loss_clip": 0.01170095, "auxiliary_loss_mlp": 0.01025437, "balance_loss_clip": 1.05638552, "balance_loss_mlp": 1.01773643, "epoch": 0.42217278903384836, "flos": 24315941278080.0, "grad_norm": 2.119552249181925, "language_loss": 0.83804792, "learning_rate": 2.592345371717141e-06, "loss": 0.86000323, "num_input_tokens_seen": 75700940, "step": 3511, "time_per_iteration": 2.493624448776245 }, { "auxiliary_loss_clip": 0.01172858, "auxiliary_loss_mlp": 0.01029833, "balance_loss_clip": 1.05883813, "balance_loss_mlp": 1.02148247, "epoch": 0.42229303192448747, "flos": 17092474352640.0, "grad_norm": 2.330988490580442, "language_loss": 0.71804029, "learning_rate": 2.591601304182448e-06, "loss": 0.74006718, "num_input_tokens_seen": 75718910, "step": 3512, "time_per_iteration": 3.300139904022217 }, { "auxiliary_loss_clip": 0.01156208, "auxiliary_loss_mlp": 0.01027477, "balance_loss_clip": 1.05470657, "balance_loss_mlp": 1.02029443, "epoch": 0.4224132748151266, "flos": 22784530878720.0, "grad_norm": 1.7998042832617744, "language_loss": 0.79349375, "learning_rate": 2.5908571469023067e-06, "loss": 0.81533056, "num_input_tokens_seen": 75738395, "step": 3513, "time_per_iteration": 2.5223584175109863 }, { "auxiliary_loss_clip": 0.01184027, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.05492163, "balance_loss_mlp": 1.02240252, "epoch": 0.42253351770576564, "flos": 17819090576640.0, "grad_norm": 2.464266565644855, "language_loss": 0.75665057, "learning_rate": 2.5901128999896067e-06, "loss": 0.77879429, "num_input_tokens_seen": 75753825, "step": 3514, "time_per_iteration": 2.401898145675659 }, { "auxiliary_loss_clip": 0.01170024, "auxiliary_loss_mlp": 0.01027642, "balance_loss_clip": 1.05601716, "balance_loss_mlp": 1.01997733, "epoch": 0.42265376059640475, "flos": 28512390286080.0, "grad_norm": 1.591482796493947, "language_loss": 0.67968857, "learning_rate": 2.5893685635572487e-06, "loss": 0.70166528, "num_input_tokens_seen": 75774675, "step": 3515, "time_per_iteration": 3.2679989337921143 }, { "auxiliary_loss_clip": 0.01157476, "auxiliary_loss_mlp": 0.01029621, "balance_loss_clip": 1.05532002, "balance_loss_mlp": 1.02132416, "epoch": 0.4227740034870438, "flos": 16253349753600.0, "grad_norm": 1.8927749769020958, "language_loss": 0.68916708, "learning_rate": 2.5886241377181483e-06, "loss": 0.71103811, "num_input_tokens_seen": 75793545, "step": 3516, "time_per_iteration": 2.508251428604126 }, { "auxiliary_loss_clip": 0.01174479, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 1.05570459, "balance_loss_mlp": 1.0204556, "epoch": 0.4228942463776829, "flos": 25295691623040.0, "grad_norm": 1.7004153494485628, "language_loss": 0.81485415, "learning_rate": 2.587879622585234e-06, "loss": 0.83688939, "num_input_tokens_seen": 75812145, "step": 3517, "time_per_iteration": 3.2553234100341797 }, { "auxiliary_loss_clip": 0.01169647, "auxiliary_loss_mlp": 0.01031274, "balance_loss_clip": 1.05565786, "balance_loss_mlp": 1.02340662, "epoch": 0.423014489268322, "flos": 26395779507840.0, "grad_norm": 2.354801716907938, "language_loss": 0.75580376, "learning_rate": 2.5871350182714486e-06, "loss": 0.77781296, "num_input_tokens_seen": 75833025, "step": 3518, "time_per_iteration": 2.603123188018799 }, { "auxiliary_loss_clip": 0.01183822, "auxiliary_loss_mlp": 0.01026397, "balance_loss_clip": 1.0559907, "balance_loss_mlp": 1.01895237, "epoch": 0.4231347321589611, "flos": 17274002711040.0, "grad_norm": 2.167273863329668, "language_loss": 0.80323267, "learning_rate": 2.586390324889748e-06, "loss": 0.82533491, "num_input_tokens_seen": 75848925, "step": 3519, "time_per_iteration": 2.456437110900879 }, { "auxiliary_loss_clip": 0.01171034, "auxiliary_loss_mlp": 0.01031623, "balance_loss_clip": 1.05701935, "balance_loss_mlp": 1.02421975, "epoch": 0.4232549750496002, "flos": 22999635475200.0, "grad_norm": 1.8381712125618284, "language_loss": 0.67579174, "learning_rate": 2.5856455425531003e-06, "loss": 0.69781828, "num_input_tokens_seen": 75870400, "step": 3520, "time_per_iteration": 2.4923095703125 }, { "auxiliary_loss_clip": 0.01171643, "auxiliary_loss_mlp": 0.0102523, "balance_loss_clip": 1.05672765, "balance_loss_mlp": 1.01748681, "epoch": 0.4233752179402393, "flos": 21248343970560.0, "grad_norm": 1.7376983949816318, "language_loss": 0.80391335, "learning_rate": 2.5849006713744902e-06, "loss": 0.82588208, "num_input_tokens_seen": 75889195, "step": 3521, "time_per_iteration": 2.4542765617370605 }, { "auxiliary_loss_clip": 0.01154491, "auxiliary_loss_mlp": 0.01028206, "balance_loss_clip": 1.0523802, "balance_loss_mlp": 1.02002847, "epoch": 0.42349546083087836, "flos": 20704297599360.0, "grad_norm": 2.044537984655198, "language_loss": 0.72736812, "learning_rate": 2.5841557114669135e-06, "loss": 0.7491951, "num_input_tokens_seen": 75906055, "step": 3522, "time_per_iteration": 2.4920802116394043 }, { "auxiliary_loss_clip": 0.01189986, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.05591679, "balance_loss_mlp": 1.02198803, "epoch": 0.42361570372151747, "flos": 18585065128320.0, "grad_norm": 9.759805701092263, "language_loss": 0.67134333, "learning_rate": 2.58341066294338e-06, "loss": 0.69354922, "num_input_tokens_seen": 75922720, "step": 3523, "time_per_iteration": 2.4124362468719482 }, { "auxiliary_loss_clip": 0.0113476, "auxiliary_loss_mlp": 0.00762849, "balance_loss_clip": 1.05186999, "balance_loss_mlp": 1.00041032, "epoch": 0.4237359466121566, "flos": 20959478795520.0, "grad_norm": 2.3431021267201904, "language_loss": 0.85559934, "learning_rate": 2.5826655259169124e-06, "loss": 0.87457538, "num_input_tokens_seen": 75941375, "step": 3524, "time_per_iteration": 2.602137327194214 }, { "auxiliary_loss_clip": 0.01188391, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.05849552, "balance_loss_mlp": 1.02301896, "epoch": 0.42385618950279563, "flos": 18038181582720.0, "grad_norm": 2.1170359855964445, "language_loss": 0.90596938, "learning_rate": 2.5819203005005475e-06, "loss": 0.92816412, "num_input_tokens_seen": 75958710, "step": 3525, "time_per_iteration": 2.5590906143188477 }, { "auxiliary_loss_clip": 0.01152699, "auxiliary_loss_mlp": 0.01031834, "balance_loss_clip": 1.05451465, "balance_loss_mlp": 1.02406764, "epoch": 0.42397643239343474, "flos": 23769129559680.0, "grad_norm": 2.016743251961724, "language_loss": 0.78911197, "learning_rate": 2.581174986807336e-06, "loss": 0.81095731, "num_input_tokens_seen": 75978945, "step": 3526, "time_per_iteration": 2.5470805168151855 }, { "auxiliary_loss_clip": 0.01162197, "auxiliary_loss_mlp": 0.00762812, "balance_loss_clip": 1.05285668, "balance_loss_mlp": 1.00040948, "epoch": 0.42409667528407385, "flos": 16545088016640.0, "grad_norm": 2.2881186211263764, "language_loss": 0.9098134, "learning_rate": 2.580429584950341e-06, "loss": 0.92906344, "num_input_tokens_seen": 75994695, "step": 3527, "time_per_iteration": 2.4381277561187744 }, { "auxiliary_loss_clip": 0.0114904, "auxiliary_loss_mlp": 0.01026268, "balance_loss_clip": 1.05291605, "balance_loss_mlp": 1.01747012, "epoch": 0.4242169181747129, "flos": 16034186920320.0, "grad_norm": 2.0953409189123198, "language_loss": 0.66436046, "learning_rate": 2.5796840950426397e-06, "loss": 0.68611354, "num_input_tokens_seen": 76011780, "step": 3528, "time_per_iteration": 2.5010526180267334 }, { "auxiliary_loss_clip": 0.0116075, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.05131042, "balance_loss_mlp": 1.02163661, "epoch": 0.424337161065352, "flos": 20084012611200.0, "grad_norm": 1.8718419769513992, "language_loss": 0.65617955, "learning_rate": 2.578938517197322e-06, "loss": 0.67807996, "num_input_tokens_seen": 76029875, "step": 3529, "time_per_iteration": 2.463440179824829 }, { "auxiliary_loss_clip": 0.01146128, "auxiliary_loss_mlp": 0.01027713, "balance_loss_clip": 1.05065584, "balance_loss_mlp": 1.01976764, "epoch": 0.4244574039559911, "flos": 23878369797120.0, "grad_norm": 2.1711844819422317, "language_loss": 0.62226725, "learning_rate": 2.5781928515274916e-06, "loss": 0.6440056, "num_input_tokens_seen": 76048595, "step": 3530, "time_per_iteration": 2.522073268890381 }, { "auxiliary_loss_clip": 0.01175823, "auxiliary_loss_mlp": 0.01028923, "balance_loss_clip": 1.0578289, "balance_loss_mlp": 1.02146935, "epoch": 0.4245776468466302, "flos": 17565920542080.0, "grad_norm": 1.91141545402392, "language_loss": 0.67646921, "learning_rate": 2.577447098146265e-06, "loss": 0.69851661, "num_input_tokens_seen": 76065770, "step": 3531, "time_per_iteration": 2.4430432319641113 }, { "auxiliary_loss_clip": 0.01142812, "auxiliary_loss_mlp": 0.0103582, "balance_loss_clip": 1.05106425, "balance_loss_mlp": 1.02803016, "epoch": 0.4246978897372693, "flos": 27776256958080.0, "grad_norm": 1.749077554336442, "language_loss": 0.78845739, "learning_rate": 2.5767012571667724e-06, "loss": 0.81024373, "num_input_tokens_seen": 76085250, "step": 3532, "time_per_iteration": 2.579741954803467 }, { "auxiliary_loss_clip": 0.0117314, "auxiliary_loss_mlp": 0.01025766, "balance_loss_clip": 1.05347848, "balance_loss_mlp": 1.01669431, "epoch": 0.42481813262790835, "flos": 15596615439360.0, "grad_norm": 2.0263533953766473, "language_loss": 0.67991364, "learning_rate": 2.5759553287021587e-06, "loss": 0.70190275, "num_input_tokens_seen": 76103580, "step": 3533, "time_per_iteration": 2.4520928859710693 }, { "auxiliary_loss_clip": 0.01155958, "auxiliary_loss_mlp": 0.01027463, "balance_loss_clip": 1.05504012, "balance_loss_mlp": 1.01883841, "epoch": 0.42493837551854746, "flos": 23951088881280.0, "grad_norm": 2.0860933239739214, "language_loss": 0.77319604, "learning_rate": 2.5752093128655786e-06, "loss": 0.79503024, "num_input_tokens_seen": 76121825, "step": 3534, "time_per_iteration": 2.5165352821350098 }, { "auxiliary_loss_clip": 0.01148916, "auxiliary_loss_mlp": 0.01025351, "balance_loss_clip": 1.05022907, "balance_loss_mlp": 1.01698267, "epoch": 0.4250586184091866, "flos": 20813466009600.0, "grad_norm": 2.29887617168106, "language_loss": 0.73720282, "learning_rate": 2.574463209770204e-06, "loss": 0.75894547, "num_input_tokens_seen": 76141140, "step": 3535, "time_per_iteration": 3.3060848712921143 }, { "auxiliary_loss_clip": 0.0114067, "auxiliary_loss_mlp": 0.01031257, "balance_loss_clip": 1.05043602, "balance_loss_mlp": 1.02297163, "epoch": 0.42517886129982563, "flos": 30371018607360.0, "grad_norm": 1.719612265982136, "language_loss": 0.79694527, "learning_rate": 2.5737170195292165e-06, "loss": 0.81866461, "num_input_tokens_seen": 76164475, "step": 3536, "time_per_iteration": 2.6351466178894043 }, { "auxiliary_loss_clip": 0.01141957, "auxiliary_loss_mlp": 0.01028683, "balance_loss_clip": 1.04971194, "balance_loss_mlp": 1.01990306, "epoch": 0.42529910419046474, "flos": 20080636732800.0, "grad_norm": 2.000824768490275, "language_loss": 0.78247178, "learning_rate": 2.572970742255814e-06, "loss": 0.80417824, "num_input_tokens_seen": 76182965, "step": 3537, "time_per_iteration": 2.5373334884643555 }, { "auxiliary_loss_clip": 0.01171066, "auxiliary_loss_mlp": 0.01028203, "balance_loss_clip": 1.05716693, "balance_loss_mlp": 1.02056146, "epoch": 0.42541934708110385, "flos": 22632448694400.0, "grad_norm": 1.723061995218477, "language_loss": 0.81909847, "learning_rate": 2.5722243780632046e-06, "loss": 0.84109116, "num_input_tokens_seen": 76201230, "step": 3538, "time_per_iteration": 2.487924098968506 }, { "auxiliary_loss_clip": 0.01043385, "auxiliary_loss_mlp": 0.01004675, "balance_loss_clip": 1.02050674, "balance_loss_mlp": 1.00367403, "epoch": 0.4255395899717429, "flos": 66200676186240.0, "grad_norm": 0.7732521984192919, "language_loss": 0.6047954, "learning_rate": 2.5714779270646125e-06, "loss": 0.62527603, "num_input_tokens_seen": 76262000, "step": 3539, "time_per_iteration": 3.9092202186584473 }, { "auxiliary_loss_clip": 0.01162388, "auxiliary_loss_mlp": 0.0076273, "balance_loss_clip": 1.05590367, "balance_loss_mlp": 1.00038922, "epoch": 0.425659832862382, "flos": 17931814433280.0, "grad_norm": 2.6841389965686613, "language_loss": 0.77799147, "learning_rate": 2.5707313893732735e-06, "loss": 0.79724264, "num_input_tokens_seen": 76280540, "step": 3540, "time_per_iteration": 2.483970880508423 }, { "auxiliary_loss_clip": 0.01091331, "auxiliary_loss_mlp": 0.0102913, "balance_loss_clip": 1.04120362, "balance_loss_mlp": 1.02106535, "epoch": 0.4257800757530211, "flos": 24022550989440.0, "grad_norm": 1.6796403842919871, "language_loss": 0.77069861, "learning_rate": 2.5699847651024364e-06, "loss": 0.79190314, "num_input_tokens_seen": 76301180, "step": 3541, "time_per_iteration": 2.65840482711792 }, { "auxiliary_loss_clip": 0.01169351, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.0560087, "balance_loss_mlp": 1.02093124, "epoch": 0.4259003186436602, "flos": 23696015425920.0, "grad_norm": 3.6313339743253934, "language_loss": 0.76857674, "learning_rate": 2.5692380543653627e-06, "loss": 0.7905584, "num_input_tokens_seen": 76319335, "step": 3542, "time_per_iteration": 3.2987897396087646 }, { "auxiliary_loss_clip": 0.01175421, "auxiliary_loss_mlp": 0.00763123, "balance_loss_clip": 1.05565202, "balance_loss_mlp": 1.00047863, "epoch": 0.4260205615342993, "flos": 15259772672640.0, "grad_norm": 2.0202996225289915, "language_loss": 0.69757605, "learning_rate": 2.5684912572753293e-06, "loss": 0.7169615, "num_input_tokens_seen": 76335010, "step": 3543, "time_per_iteration": 2.4561078548431396 }, { "auxiliary_loss_clip": 0.01181584, "auxiliary_loss_mlp": 0.01022966, "balance_loss_clip": 1.0555141, "balance_loss_mlp": 1.01529479, "epoch": 0.4261408044249384, "flos": 30665306736000.0, "grad_norm": 1.7189432327803047, "language_loss": 0.84336442, "learning_rate": 2.5677443739456245e-06, "loss": 0.86540991, "num_input_tokens_seen": 76356670, "step": 3544, "time_per_iteration": 3.206366777420044 }, { "auxiliary_loss_clip": 0.01158528, "auxiliary_loss_mlp": 0.01023892, "balance_loss_clip": 1.05613399, "balance_loss_mlp": 1.01613712, "epoch": 0.42626104731557746, "flos": 23257905240960.0, "grad_norm": 2.3803050649695447, "language_loss": 0.79395318, "learning_rate": 2.5669974044895495e-06, "loss": 0.8157773, "num_input_tokens_seen": 76373065, "step": 3545, "time_per_iteration": 2.509317636489868 }, { "auxiliary_loss_clip": 0.01149294, "auxiliary_loss_mlp": 0.0102665, "balance_loss_clip": 1.051085, "balance_loss_mlp": 1.01846027, "epoch": 0.42638129020621657, "flos": 25884770670720.0, "grad_norm": 1.8138901651688484, "language_loss": 0.79966331, "learning_rate": 2.5662503490204187e-06, "loss": 0.82142276, "num_input_tokens_seen": 76393230, "step": 3546, "time_per_iteration": 2.57382869720459 }, { "auxiliary_loss_clip": 0.0115434, "auxiliary_loss_mlp": 0.01022785, "balance_loss_clip": 1.0512619, "balance_loss_mlp": 1.01500082, "epoch": 0.4265015330968556, "flos": 26502362138880.0, "grad_norm": 2.0335016009604905, "language_loss": 0.76303583, "learning_rate": 2.5655032076515603e-06, "loss": 0.78480709, "num_input_tokens_seen": 76412555, "step": 3547, "time_per_iteration": 2.564664840698242 }, { "auxiliary_loss_clip": 0.01158405, "auxiliary_loss_mlp": 0.01027115, "balance_loss_clip": 1.05305004, "balance_loss_mlp": 1.01913369, "epoch": 0.42662177598749473, "flos": 24389522288640.0, "grad_norm": 2.8284615523244767, "language_loss": 0.82254064, "learning_rate": 2.5647559804963155e-06, "loss": 0.84439582, "num_input_tokens_seen": 76432485, "step": 3548, "time_per_iteration": 2.5287795066833496 }, { "auxiliary_loss_clip": 0.01138534, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.05347896, "balance_loss_mlp": 1.02463269, "epoch": 0.42674201887813384, "flos": 23148629089920.0, "grad_norm": 1.981451402656454, "language_loss": 0.78943217, "learning_rate": 2.5640086676680364e-06, "loss": 0.81114197, "num_input_tokens_seen": 76453980, "step": 3549, "time_per_iteration": 2.6089608669281006 }, { "auxiliary_loss_clip": 0.01173815, "auxiliary_loss_mlp": 0.01027844, "balance_loss_clip": 1.05610466, "balance_loss_mlp": 1.0191896, "epoch": 0.4268622617687729, "flos": 21689614552320.0, "grad_norm": 2.294518990482537, "language_loss": 0.80787909, "learning_rate": 2.5632612692800923e-06, "loss": 0.82989573, "num_input_tokens_seen": 76473045, "step": 3550, "time_per_iteration": 2.4722511768341064 }, { "auxiliary_loss_clip": 0.01143307, "auxiliary_loss_mlp": 0.01036092, "balance_loss_clip": 1.05048192, "balance_loss_mlp": 1.02725244, "epoch": 0.426982504659412, "flos": 23440151871360.0, "grad_norm": 1.9535029950736376, "language_loss": 0.75238627, "learning_rate": 2.5625137854458603e-06, "loss": 0.77418023, "num_input_tokens_seen": 76492060, "step": 3551, "time_per_iteration": 2.5510425567626953 }, { "auxiliary_loss_clip": 0.01160055, "auxiliary_loss_mlp": 0.0102903, "balance_loss_clip": 1.05492651, "balance_loss_mlp": 1.02156794, "epoch": 0.4271027475500511, "flos": 18916556768640.0, "grad_norm": 1.9293302171028055, "language_loss": 0.80180645, "learning_rate": 2.561766216278735e-06, "loss": 0.82369727, "num_input_tokens_seen": 76509655, "step": 3552, "time_per_iteration": 2.4863080978393555 }, { "auxiliary_loss_clip": 0.01127058, "auxiliary_loss_mlp": 0.01028716, "balance_loss_clip": 1.05039573, "balance_loss_mlp": 1.02047825, "epoch": 0.4272229904406902, "flos": 26870554500480.0, "grad_norm": 1.7434203483395674, "language_loss": 0.80870008, "learning_rate": 2.561018561892121e-06, "loss": 0.83025783, "num_input_tokens_seen": 76528795, "step": 3553, "time_per_iteration": 2.5821099281311035 }, { "auxiliary_loss_clip": 0.01153879, "auxiliary_loss_mlp": 0.01033323, "balance_loss_clip": 1.05030298, "balance_loss_mlp": 1.02555037, "epoch": 0.4273432333313293, "flos": 23951376190080.0, "grad_norm": 1.5333451459252483, "language_loss": 0.76568747, "learning_rate": 2.5602708223994363e-06, "loss": 0.78755951, "num_input_tokens_seen": 76550660, "step": 3554, "time_per_iteration": 2.5269036293029785 }, { "auxiliary_loss_clip": 0.01143208, "auxiliary_loss_mlp": 0.01025685, "balance_loss_clip": 1.04758322, "balance_loss_mlp": 1.01794851, "epoch": 0.4274634762219684, "flos": 29570354496000.0, "grad_norm": 2.1282753049550367, "language_loss": 0.67153227, "learning_rate": 2.559522997914115e-06, "loss": 0.69322121, "num_input_tokens_seen": 76570240, "step": 3555, "time_per_iteration": 2.5892279148101807 }, { "auxiliary_loss_clip": 0.01184469, "auxiliary_loss_mlp": 0.01029943, "balance_loss_clip": 1.05815864, "balance_loss_mlp": 1.02251947, "epoch": 0.42758371911260745, "flos": 21434146047360.0, "grad_norm": 1.9272640727724966, "language_loss": 0.84807754, "learning_rate": 2.558775088549599e-06, "loss": 0.87022161, "num_input_tokens_seen": 76589820, "step": 3556, "time_per_iteration": 2.4482078552246094 }, { "auxiliary_loss_clip": 0.0117736, "auxiliary_loss_mlp": 0.01028798, "balance_loss_clip": 1.05546021, "balance_loss_mlp": 1.02057838, "epoch": 0.42770396200324656, "flos": 14752822072320.0, "grad_norm": 4.6819192163826555, "language_loss": 0.66522568, "learning_rate": 2.5580270944193467e-06, "loss": 0.68728727, "num_input_tokens_seen": 76606640, "step": 3557, "time_per_iteration": 2.419623851776123 }, { "auxiliary_loss_clip": 0.01084349, "auxiliary_loss_mlp": 0.01003471, "balance_loss_clip": 1.02027535, "balance_loss_mlp": 1.0024159, "epoch": 0.4278242048938857, "flos": 70654712601600.0, "grad_norm": 0.7595784660188609, "language_loss": 0.55505395, "learning_rate": 2.557279015636827e-06, "loss": 0.57593215, "num_input_tokens_seen": 76667050, "step": 3558, "time_per_iteration": 3.0204498767852783 }, { "auxiliary_loss_clip": 0.01070572, "auxiliary_loss_mlp": 0.01003582, "balance_loss_clip": 1.0200609, "balance_loss_mlp": 1.00245595, "epoch": 0.42794444778452473, "flos": 69366165033600.0, "grad_norm": 0.7784422243866626, "language_loss": 0.61264527, "learning_rate": 2.5565308523155245e-06, "loss": 0.63338685, "num_input_tokens_seen": 76726650, "step": 3559, "time_per_iteration": 2.9806787967681885 }, { "auxiliary_loss_clip": 0.01123368, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.04993415, "balance_loss_mlp": 1.02137637, "epoch": 0.42806469067516384, "flos": 18215328481920.0, "grad_norm": 2.2434340829085153, "language_loss": 0.81738192, "learning_rate": 2.5557826045689336e-06, "loss": 0.838911, "num_input_tokens_seen": 76742890, "step": 3560, "time_per_iteration": 2.5365028381347656 }, { "auxiliary_loss_clip": 0.01050918, "auxiliary_loss_mlp": 0.01005069, "balance_loss_clip": 1.02442741, "balance_loss_mlp": 1.00351954, "epoch": 0.4281849335658029, "flos": 54535814432640.0, "grad_norm": 0.818732295499881, "language_loss": 0.58867776, "learning_rate": 2.5550342725105643e-06, "loss": 0.60923767, "num_input_tokens_seen": 76801055, "step": 3561, "time_per_iteration": 3.0466389656066895 }, { "auxiliary_loss_clip": 0.0117374, "auxiliary_loss_mlp": 0.01034155, "balance_loss_clip": 1.0586741, "balance_loss_mlp": 1.02611387, "epoch": 0.428305176456442, "flos": 17274828723840.0, "grad_norm": 1.9251193512602596, "language_loss": 0.81000805, "learning_rate": 2.554285856253937e-06, "loss": 0.83208698, "num_input_tokens_seen": 76819890, "step": 3562, "time_per_iteration": 3.329968214035034 }, { "auxiliary_loss_clip": 0.01154903, "auxiliary_loss_mlp": 0.01030535, "balance_loss_clip": 1.05482984, "balance_loss_mlp": 1.02243209, "epoch": 0.4284254193470811, "flos": 26359509749760.0, "grad_norm": 2.103650819117505, "language_loss": 0.77569038, "learning_rate": 2.5535373559125855e-06, "loss": 0.79754472, "num_input_tokens_seen": 76840255, "step": 3563, "time_per_iteration": 2.5613696575164795 }, { "auxiliary_loss_clip": 0.01102388, "auxiliary_loss_mlp": 0.01024007, "balance_loss_clip": 1.04678929, "balance_loss_mlp": 1.01523948, "epoch": 0.42854566223772017, "flos": 29714248379520.0, "grad_norm": 1.80926251509629, "language_loss": 0.81813461, "learning_rate": 2.552788771600057e-06, "loss": 0.83939862, "num_input_tokens_seen": 76860565, "step": 3564, "time_per_iteration": 2.8759994506835938 }, { "auxiliary_loss_clip": 0.01146431, "auxiliary_loss_mlp": 0.01032696, "balance_loss_clip": 1.05467808, "balance_loss_mlp": 1.02420259, "epoch": 0.4286659051283593, "flos": 22018161277440.0, "grad_norm": 2.0687220530542243, "language_loss": 0.82019293, "learning_rate": 2.5520401034299118e-06, "loss": 0.84198427, "num_input_tokens_seen": 76878325, "step": 3565, "time_per_iteration": 2.961338758468628 }, { "auxiliary_loss_clip": 0.01175253, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 1.05667877, "balance_loss_mlp": 1.02370882, "epoch": 0.4287861480189984, "flos": 13334422838400.0, "grad_norm": 2.2319618979316096, "language_loss": 0.87335134, "learning_rate": 2.551291351515722e-06, "loss": 0.89542651, "num_input_tokens_seen": 76895340, "step": 3566, "time_per_iteration": 3.603695869445801 }, { "auxiliary_loss_clip": 0.01136179, "auxiliary_loss_mlp": 0.00763665, "balance_loss_clip": 1.04632497, "balance_loss_mlp": 1.00047851, "epoch": 0.42890639090963745, "flos": 26651535321600.0, "grad_norm": 1.6067015940367897, "language_loss": 0.85390478, "learning_rate": 2.5505425159710726e-06, "loss": 0.87290323, "num_input_tokens_seen": 76915150, "step": 3567, "time_per_iteration": 2.674638509750366 }, { "auxiliary_loss_clip": 0.01163183, "auxiliary_loss_mlp": 0.00763388, "balance_loss_clip": 1.05157232, "balance_loss_mlp": 1.00046396, "epoch": 0.42902663380027656, "flos": 24055768091520.0, "grad_norm": 3.9916115142268445, "language_loss": 0.83194745, "learning_rate": 2.549793596909561e-06, "loss": 0.85121316, "num_input_tokens_seen": 76933770, "step": 3568, "time_per_iteration": 2.529055118560791 }, { "auxiliary_loss_clip": 0.01154539, "auxiliary_loss_mlp": 0.01029289, "balance_loss_clip": 1.05505311, "balance_loss_mlp": 1.02096212, "epoch": 0.42914687669091567, "flos": 15632561975040.0, "grad_norm": 1.9891471433547236, "language_loss": 0.65879661, "learning_rate": 2.5490445944447976e-06, "loss": 0.68063486, "num_input_tokens_seen": 76952265, "step": 3569, "time_per_iteration": 3.33054518699646 }, { "auxiliary_loss_clip": 0.01169898, "auxiliary_loss_mlp": 0.01027862, "balance_loss_clip": 1.05364561, "balance_loss_mlp": 1.01987469, "epoch": 0.4292671195815547, "flos": 31467802440960.0, "grad_norm": 2.4909204966518628, "language_loss": 0.65166163, "learning_rate": 2.548295508690406e-06, "loss": 0.67363918, "num_input_tokens_seen": 76973560, "step": 3570, "time_per_iteration": 3.2226977348327637 }, { "auxiliary_loss_clip": 0.01173847, "auxiliary_loss_mlp": 0.01027592, "balance_loss_clip": 1.05355704, "balance_loss_mlp": 1.01922894, "epoch": 0.42938736247219383, "flos": 30257756046720.0, "grad_norm": 1.6986809529245595, "language_loss": 0.76484245, "learning_rate": 2.5475463397600217e-06, "loss": 0.78685689, "num_input_tokens_seen": 76993640, "step": 3571, "time_per_iteration": 2.5285048484802246 }, { "auxiliary_loss_clip": 0.01191717, "auxiliary_loss_mlp": 0.01029933, "balance_loss_clip": 1.05914819, "balance_loss_mlp": 1.02160597, "epoch": 0.42950760536283294, "flos": 29349683291520.0, "grad_norm": 2.443079276247546, "language_loss": 0.77560741, "learning_rate": 2.546797087767293e-06, "loss": 0.79782391, "num_input_tokens_seen": 77013765, "step": 3572, "time_per_iteration": 2.487550735473633 }, { "auxiliary_loss_clip": 0.01124443, "auxiliary_loss_mlp": 0.01034076, "balance_loss_clip": 1.04902542, "balance_loss_mlp": 1.02599955, "epoch": 0.429627848253472, "flos": 26869943969280.0, "grad_norm": 1.8032635849306458, "language_loss": 0.87161362, "learning_rate": 2.546047752825881e-06, "loss": 0.89319885, "num_input_tokens_seen": 77034370, "step": 3573, "time_per_iteration": 2.6012799739837646 }, { "auxiliary_loss_clip": 0.01132383, "auxiliary_loss_mlp": 0.01031203, "balance_loss_clip": 1.05017209, "balance_loss_mlp": 1.02296543, "epoch": 0.4297480911441111, "flos": 13881270470400.0, "grad_norm": 2.242408211206307, "language_loss": 0.93161988, "learning_rate": 2.5452983350494595e-06, "loss": 0.95325571, "num_input_tokens_seen": 77049925, "step": 3574, "time_per_iteration": 2.532256603240967 }, { "auxiliary_loss_clip": 0.01170554, "auxiliary_loss_mlp": 0.00763107, "balance_loss_clip": 1.05412567, "balance_loss_mlp": 1.00056624, "epoch": 0.4298683340347502, "flos": 20741141975040.0, "grad_norm": 2.1450186344560365, "language_loss": 0.65653044, "learning_rate": 2.544548834551713e-06, "loss": 0.67586708, "num_input_tokens_seen": 77068930, "step": 3575, "time_per_iteration": 2.500807762145996 }, { "auxiliary_loss_clip": 0.01139115, "auxiliary_loss_mlp": 0.00763196, "balance_loss_clip": 1.05166554, "balance_loss_mlp": 1.00045967, "epoch": 0.4299885769253893, "flos": 20882126856960.0, "grad_norm": 2.4004906846527336, "language_loss": 0.94127178, "learning_rate": 2.5437992514463424e-06, "loss": 0.9602949, "num_input_tokens_seen": 77082255, "step": 3576, "time_per_iteration": 2.515387773513794 }, { "auxiliary_loss_clip": 0.01174261, "auxiliary_loss_mlp": 0.01028046, "balance_loss_clip": 1.05637813, "balance_loss_mlp": 1.01946259, "epoch": 0.4301088198160284, "flos": 25484618183040.0, "grad_norm": 1.7281665072426735, "language_loss": 0.87893593, "learning_rate": 2.5430495858470565e-06, "loss": 0.90095901, "num_input_tokens_seen": 77101725, "step": 3577, "time_per_iteration": 2.4934825897216797 }, { "auxiliary_loss_clip": 0.01169641, "auxiliary_loss_mlp": 0.01032265, "balance_loss_clip": 1.05549073, "balance_loss_mlp": 1.0241704, "epoch": 0.43022906270666744, "flos": 18259427404800.0, "grad_norm": 2.78896037698433, "language_loss": 0.77113187, "learning_rate": 2.54229983786758e-06, "loss": 0.7931509, "num_input_tokens_seen": 77119670, "step": 3578, "time_per_iteration": 2.425658702850342 }, { "auxiliary_loss_clip": 0.01156303, "auxiliary_loss_mlp": 0.01029303, "balance_loss_clip": 1.05145121, "balance_loss_mlp": 1.02094698, "epoch": 0.43034930559730655, "flos": 23399536567680.0, "grad_norm": 1.8501987499236685, "language_loss": 0.85085428, "learning_rate": 2.541550007621651e-06, "loss": 0.87271035, "num_input_tokens_seen": 77138160, "step": 3579, "time_per_iteration": 2.4952118396759033 }, { "auxiliary_loss_clip": 0.01171266, "auxiliary_loss_mlp": 0.01028783, "balance_loss_clip": 1.05745423, "balance_loss_mlp": 1.02094483, "epoch": 0.43046954848794566, "flos": 28184382264960.0, "grad_norm": 2.139920714655677, "language_loss": 0.79701084, "learning_rate": 2.5408000952230156e-06, "loss": 0.81901133, "num_input_tokens_seen": 77156950, "step": 3580, "time_per_iteration": 2.5249781608581543 }, { "auxiliary_loss_clip": 0.01152759, "auxiliary_loss_mlp": 0.01027642, "balance_loss_clip": 1.05095124, "balance_loss_mlp": 1.01902294, "epoch": 0.4305897913785847, "flos": 28580476515840.0, "grad_norm": 2.0773310101012012, "language_loss": 0.90524906, "learning_rate": 2.5400501007854357e-06, "loss": 0.92705309, "num_input_tokens_seen": 77176395, "step": 3581, "time_per_iteration": 2.5997626781463623 }, { "auxiliary_loss_clip": 0.01126468, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.04613614, "balance_loss_mlp": 1.02560103, "epoch": 0.43071003426922383, "flos": 20448721353600.0, "grad_norm": 1.761173417858187, "language_loss": 0.75274009, "learning_rate": 2.539300024422685e-06, "loss": 0.77433741, "num_input_tokens_seen": 77194340, "step": 3582, "time_per_iteration": 2.556225299835205 }, { "auxiliary_loss_clip": 0.01050716, "auxiliary_loss_mlp": 0.01003189, "balance_loss_clip": 1.01982164, "balance_loss_mlp": 1.00197935, "epoch": 0.43083027715986294, "flos": 51997969883520.0, "grad_norm": 0.7906574745758691, "language_loss": 0.60928702, "learning_rate": 2.538549866248549e-06, "loss": 0.62982607, "num_input_tokens_seen": 77249320, "step": 3583, "time_per_iteration": 2.9549360275268555 }, { "auxiliary_loss_clip": 0.01173343, "auxiliary_loss_mlp": 0.0102745, "balance_loss_clip": 1.05350137, "balance_loss_mlp": 1.01874781, "epoch": 0.430950520050502, "flos": 16690885320960.0, "grad_norm": 2.3415440895236714, "language_loss": 0.80943739, "learning_rate": 2.5377996263768274e-06, "loss": 0.83144534, "num_input_tokens_seen": 77267400, "step": 3584, "time_per_iteration": 2.446281671524048 }, { "auxiliary_loss_clip": 0.01156413, "auxiliary_loss_mlp": 0.01033755, "balance_loss_clip": 1.04978085, "balance_loss_mlp": 1.02570295, "epoch": 0.4310707629411411, "flos": 24608433726720.0, "grad_norm": 1.7468793875230286, "language_loss": 0.68138158, "learning_rate": 2.5370493049213293e-06, "loss": 0.70328331, "num_input_tokens_seen": 77287045, "step": 3585, "time_per_iteration": 2.524519205093384 }, { "auxiliary_loss_clip": 0.010918, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 1.04665244, "balance_loss_mlp": 1.02145815, "epoch": 0.4311910058317802, "flos": 26432983019520.0, "grad_norm": 2.0215579887339024, "language_loss": 0.80402058, "learning_rate": 2.536298901995878e-06, "loss": 0.82524341, "num_input_tokens_seen": 77306255, "step": 3586, "time_per_iteration": 2.703721046447754 }, { "auxiliary_loss_clip": 0.01160498, "auxiliary_loss_mlp": 0.01027067, "balance_loss_clip": 1.05462074, "balance_loss_mlp": 1.0187639, "epoch": 0.43131124872241927, "flos": 25155891889920.0, "grad_norm": 1.614446675531778, "language_loss": 0.80473924, "learning_rate": 2.535548417714311e-06, "loss": 0.82661486, "num_input_tokens_seen": 77325555, "step": 3587, "time_per_iteration": 2.740952730178833 }, { "auxiliary_loss_clip": 0.01178517, "auxiliary_loss_mlp": 0.01029529, "balance_loss_clip": 1.05543554, "balance_loss_mlp": 1.02067208, "epoch": 0.4314314916130584, "flos": 21614812479360.0, "grad_norm": 1.533642986622583, "language_loss": 0.86980295, "learning_rate": 2.534797852190474e-06, "loss": 0.89188343, "num_input_tokens_seen": 77345735, "step": 3588, "time_per_iteration": 3.4060065746307373 }, { "auxiliary_loss_clip": 0.01169581, "auxiliary_loss_mlp": 0.01037813, "balance_loss_clip": 1.05319691, "balance_loss_mlp": 1.02904487, "epoch": 0.4315517345036975, "flos": 19275016544640.0, "grad_norm": 2.009137048748634, "language_loss": 0.81502658, "learning_rate": 2.5340472055382283e-06, "loss": 0.83710051, "num_input_tokens_seen": 77361765, "step": 3589, "time_per_iteration": 2.4752395153045654 }, { "auxiliary_loss_clip": 0.01139034, "auxiliary_loss_mlp": 0.01026132, "balance_loss_clip": 1.04677331, "balance_loss_mlp": 1.01827312, "epoch": 0.43167197739433655, "flos": 24273853516800.0, "grad_norm": 2.8294804541610383, "language_loss": 0.81204522, "learning_rate": 2.5332964778714468e-06, "loss": 0.83369684, "num_input_tokens_seen": 77378950, "step": 3590, "time_per_iteration": 2.5702152252197266 }, { "auxiliary_loss_clip": 0.0114217, "auxiliary_loss_mlp": 0.01026267, "balance_loss_clip": 1.05348587, "balance_loss_mlp": 1.0186739, "epoch": 0.43179222028497566, "flos": 16867816738560.0, "grad_norm": 1.5957164668142962, "language_loss": 0.66312826, "learning_rate": 2.5325456693040123e-06, "loss": 0.68481266, "num_input_tokens_seen": 77396145, "step": 3591, "time_per_iteration": 2.5571916103363037 }, { "auxiliary_loss_clip": 0.01179218, "auxiliary_loss_mlp": 0.01027707, "balance_loss_clip": 1.05383968, "balance_loss_mlp": 1.01914167, "epoch": 0.43191246317561477, "flos": 17639214243840.0, "grad_norm": 2.2300435099828704, "language_loss": 0.74680722, "learning_rate": 2.531794779949824e-06, "loss": 0.76887643, "num_input_tokens_seen": 77414045, "step": 3592, "time_per_iteration": 3.300826072692871 }, { "auxiliary_loss_clip": 0.01134499, "auxiliary_loss_mlp": 0.01027657, "balance_loss_clip": 1.04850733, "balance_loss_mlp": 1.01959825, "epoch": 0.4320327060662538, "flos": 23878800760320.0, "grad_norm": 1.6716766956956526, "language_loss": 0.87860763, "learning_rate": 2.5310438099227903e-06, "loss": 0.90022922, "num_input_tokens_seen": 77431310, "step": 3593, "time_per_iteration": 2.5510101318359375 }, { "auxiliary_loss_clip": 0.01072462, "auxiliary_loss_mlp": 0.01001891, "balance_loss_clip": 1.01909506, "balance_loss_mlp": 1.00081182, "epoch": 0.43215294895689293, "flos": 66394917959040.0, "grad_norm": 0.9191302928058255, "language_loss": 0.53381926, "learning_rate": 2.530292759336833e-06, "loss": 0.55456281, "num_input_tokens_seen": 77492045, "step": 3594, "time_per_iteration": 3.1054184436798096 }, { "auxiliary_loss_clip": 0.01157345, "auxiliary_loss_mlp": 0.01028151, "balance_loss_clip": 1.05487216, "balance_loss_mlp": 1.01938283, "epoch": 0.432273191847532, "flos": 20594267262720.0, "grad_norm": 2.3427555431985314, "language_loss": 0.69422531, "learning_rate": 2.5295416283058855e-06, "loss": 0.71608031, "num_input_tokens_seen": 77510910, "step": 3595, "time_per_iteration": 2.497631311416626 }, { "auxiliary_loss_clip": 0.01153369, "auxiliary_loss_mlp": 0.00762782, "balance_loss_clip": 1.05268729, "balance_loss_mlp": 1.00050402, "epoch": 0.4323934347381711, "flos": 19282127437440.0, "grad_norm": 1.5675448187427266, "language_loss": 0.65933645, "learning_rate": 2.5287904169438943e-06, "loss": 0.67849797, "num_input_tokens_seen": 77530115, "step": 3596, "time_per_iteration": 4.03898024559021 }, { "auxiliary_loss_clip": 0.01110511, "auxiliary_loss_mlp": 0.01038306, "balance_loss_clip": 1.05006278, "balance_loss_mlp": 1.02889407, "epoch": 0.4325136776288102, "flos": 21726315273600.0, "grad_norm": 3.051767216861184, "language_loss": 0.64282525, "learning_rate": 2.528039125364817e-06, "loss": 0.66431338, "num_input_tokens_seen": 77548920, "step": 3597, "time_per_iteration": 2.6449201107025146 }, { "auxiliary_loss_clip": 0.01145312, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.05067301, "balance_loss_mlp": 1.02229559, "epoch": 0.43263392051944927, "flos": 22340746344960.0, "grad_norm": 2.246116252924554, "language_loss": 0.75853992, "learning_rate": 2.5272877536826246e-06, "loss": 0.78030181, "num_input_tokens_seen": 77567715, "step": 3598, "time_per_iteration": 2.5189919471740723 }, { "auxiliary_loss_clip": 0.0113151, "auxiliary_loss_mlp": 0.01030876, "balance_loss_clip": 1.04659212, "balance_loss_mlp": 1.02204812, "epoch": 0.4327541634100884, "flos": 29168406328320.0, "grad_norm": 2.3783946157873723, "language_loss": 0.70116448, "learning_rate": 2.5265363020112986e-06, "loss": 0.72278833, "num_input_tokens_seen": 77588035, "step": 3599, "time_per_iteration": 2.6355364322662354 }, { "auxiliary_loss_clip": 0.01172324, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.05613291, "balance_loss_mlp": 1.02587986, "epoch": 0.4328744063007275, "flos": 26067448264320.0, "grad_norm": 3.323915847623867, "language_loss": 0.83648968, "learning_rate": 2.5257847704648344e-06, "loss": 0.85856009, "num_input_tokens_seen": 77609265, "step": 3600, "time_per_iteration": 2.5086514949798584 }, { "auxiliary_loss_clip": 0.01184066, "auxiliary_loss_mlp": 0.01028052, "balance_loss_clip": 1.05506432, "balance_loss_mlp": 1.02017796, "epoch": 0.43299464919136654, "flos": 16581357774720.0, "grad_norm": 1.9252807626380084, "language_loss": 0.75428838, "learning_rate": 2.525033159157239e-06, "loss": 0.77640957, "num_input_tokens_seen": 77625580, "step": 3601, "time_per_iteration": 2.3959898948669434 }, { "auxiliary_loss_clip": 0.01169547, "auxiliary_loss_mlp": 0.01039193, "balance_loss_clip": 1.05401242, "balance_loss_mlp": 1.02986491, "epoch": 0.43311489208200565, "flos": 16107265140480.0, "grad_norm": 2.006134952642327, "language_loss": 0.77326339, "learning_rate": 2.52428146820253e-06, "loss": 0.79535079, "num_input_tokens_seen": 77643835, "step": 3602, "time_per_iteration": 2.4670159816741943 }, { "auxiliary_loss_clip": 0.01146354, "auxiliary_loss_mlp": 0.01031665, "balance_loss_clip": 1.05238581, "balance_loss_mlp": 1.02189553, "epoch": 0.43323513497264476, "flos": 22930220442240.0, "grad_norm": 1.7307430856872235, "language_loss": 0.81849921, "learning_rate": 2.52352969771474e-06, "loss": 0.84027946, "num_input_tokens_seen": 77663060, "step": 3603, "time_per_iteration": 2.5484559535980225 }, { "auxiliary_loss_clip": 0.01159581, "auxiliary_loss_mlp": 0.01029624, "balance_loss_clip": 1.05276, "balance_loss_mlp": 1.02141619, "epoch": 0.4333553778632838, "flos": 25299031587840.0, "grad_norm": 2.356006697098909, "language_loss": 0.88193458, "learning_rate": 2.5227778478079106e-06, "loss": 0.90382659, "num_input_tokens_seen": 77682470, "step": 3604, "time_per_iteration": 2.5340633392333984 }, { "auxiliary_loss_clip": 0.01167145, "auxiliary_loss_mlp": 0.01033376, "balance_loss_clip": 1.05174792, "balance_loss_mlp": 1.02540648, "epoch": 0.43347562075392293, "flos": 19387165783680.0, "grad_norm": 1.5783330184256474, "language_loss": 0.76814699, "learning_rate": 2.522025918596098e-06, "loss": 0.79015219, "num_input_tokens_seen": 77700770, "step": 3605, "time_per_iteration": 2.449148654937744 }, { "auxiliary_loss_clip": 0.01173961, "auxiliary_loss_mlp": 0.0102963, "balance_loss_clip": 1.05479336, "balance_loss_mlp": 1.02148247, "epoch": 0.43359586364456204, "flos": 26325969425280.0, "grad_norm": 2.181342040543769, "language_loss": 0.65379238, "learning_rate": 2.521273910193368e-06, "loss": 0.67582834, "num_input_tokens_seen": 77723950, "step": 3606, "time_per_iteration": 2.523129940032959 }, { "auxiliary_loss_clip": 0.01178969, "auxiliary_loss_mlp": 0.01027876, "balance_loss_clip": 1.05606961, "balance_loss_mlp": 1.01939464, "epoch": 0.4337161065352011, "flos": 15989261984640.0, "grad_norm": 2.609366644306554, "language_loss": 0.87145531, "learning_rate": 2.5205218227138006e-06, "loss": 0.89352375, "num_input_tokens_seen": 77736905, "step": 3607, "time_per_iteration": 2.421447277069092 }, { "auxiliary_loss_clip": 0.0118613, "auxiliary_loss_mlp": 0.01027583, "balance_loss_clip": 1.05574131, "balance_loss_mlp": 1.01929164, "epoch": 0.4338363494258402, "flos": 20224710184320.0, "grad_norm": 2.258393103108662, "language_loss": 0.79083574, "learning_rate": 2.519769656271486e-06, "loss": 0.81297284, "num_input_tokens_seen": 77754325, "step": 3608, "time_per_iteration": 2.4304187297821045 }, { "auxiliary_loss_clip": 0.01120083, "auxiliary_loss_mlp": 0.01031254, "balance_loss_clip": 1.0486685, "balance_loss_mlp": 1.02252769, "epoch": 0.43395659231647926, "flos": 20083904870400.0, "grad_norm": 2.317198853509111, "language_loss": 0.67356873, "learning_rate": 2.5190174109805285e-06, "loss": 0.69508207, "num_input_tokens_seen": 77774150, "step": 3609, "time_per_iteration": 2.546675443649292 }, { "auxiliary_loss_clip": 0.01147647, "auxiliary_loss_mlp": 0.01026939, "balance_loss_clip": 1.04997301, "balance_loss_mlp": 1.01796293, "epoch": 0.43407683520711837, "flos": 19901801894400.0, "grad_norm": 2.10948214248333, "language_loss": 0.64168036, "learning_rate": 2.518265086955042e-06, "loss": 0.66342616, "num_input_tokens_seen": 77791870, "step": 3610, "time_per_iteration": 2.5092434883117676 }, { "auxiliary_loss_clip": 0.01185956, "auxiliary_loss_mlp": 0.01037496, "balance_loss_clip": 1.05473149, "balance_loss_mlp": 1.02936292, "epoch": 0.4341970780977575, "flos": 23108732058240.0, "grad_norm": 1.6843623205361662, "language_loss": 0.83678395, "learning_rate": 2.5175126843091534e-06, "loss": 0.85901845, "num_input_tokens_seen": 77811240, "step": 3611, "time_per_iteration": 2.4476075172424316 }, { "auxiliary_loss_clip": 0.01158178, "auxiliary_loss_mlp": 0.01023769, "balance_loss_clip": 1.05048442, "balance_loss_mlp": 1.01560867, "epoch": 0.43431732098839654, "flos": 37408288406400.0, "grad_norm": 2.1042005140210893, "language_loss": 0.75145274, "learning_rate": 2.5167602031570034e-06, "loss": 0.77327222, "num_input_tokens_seen": 77831425, "step": 3612, "time_per_iteration": 2.617098093032837 }, { "auxiliary_loss_clip": 0.0118606, "auxiliary_loss_mlp": 0.01025393, "balance_loss_clip": 1.05638385, "balance_loss_mlp": 1.01708436, "epoch": 0.43443756387903565, "flos": 31868206323840.0, "grad_norm": 1.6016382545762038, "language_loss": 0.73373705, "learning_rate": 2.51600764361274e-06, "loss": 0.75585163, "num_input_tokens_seen": 77852950, "step": 3613, "time_per_iteration": 2.501828908920288 }, { "auxiliary_loss_clip": 0.01188277, "auxiliary_loss_mlp": 0.01026601, "balance_loss_clip": 1.05649614, "balance_loss_mlp": 1.01847661, "epoch": 0.43455780676967476, "flos": 23477139901440.0, "grad_norm": 2.1886245207445425, "language_loss": 0.78908598, "learning_rate": 2.5152550057905283e-06, "loss": 0.81123483, "num_input_tokens_seen": 77872840, "step": 3614, "time_per_iteration": 2.4389216899871826 }, { "auxiliary_loss_clip": 0.01174111, "auxiliary_loss_mlp": 0.0076366, "balance_loss_clip": 1.05603015, "balance_loss_mlp": 1.00063324, "epoch": 0.4346780496603138, "flos": 24207060176640.0, "grad_norm": 2.2492235974098054, "language_loss": 0.76705825, "learning_rate": 2.5145022898045415e-06, "loss": 0.78643596, "num_input_tokens_seen": 77892025, "step": 3615, "time_per_iteration": 3.308350086212158 }, { "auxiliary_loss_clip": 0.01159266, "auxiliary_loss_mlp": 0.0103332, "balance_loss_clip": 1.04948032, "balance_loss_mlp": 1.02433205, "epoch": 0.4347982925509529, "flos": 17092366611840.0, "grad_norm": 2.026332633776362, "language_loss": 0.89337057, "learning_rate": 2.5137494957689664e-06, "loss": 0.91529644, "num_input_tokens_seen": 77907635, "step": 3616, "time_per_iteration": 2.4904677867889404 }, { "auxiliary_loss_clip": 0.01061576, "auxiliary_loss_mlp": 0.01005692, "balance_loss_clip": 1.01835418, "balance_loss_mlp": 1.00462496, "epoch": 0.43491853544159204, "flos": 60945544696320.0, "grad_norm": 0.7613973989573046, "language_loss": 0.5735485, "learning_rate": 2.5129966237980016e-06, "loss": 0.59422117, "num_input_tokens_seen": 77970630, "step": 3617, "time_per_iteration": 3.0938665866851807 }, { "auxiliary_loss_clip": 0.01142707, "auxiliary_loss_mlp": 0.01025509, "balance_loss_clip": 1.04907811, "balance_loss_mlp": 1.017272, "epoch": 0.4350387783322311, "flos": 21944652094080.0, "grad_norm": 1.8988439242893407, "language_loss": 0.77993286, "learning_rate": 2.512243674005857e-06, "loss": 0.80161512, "num_input_tokens_seen": 77989995, "step": 3618, "time_per_iteration": 2.55068302154541 }, { "auxiliary_loss_clip": 0.01113454, "auxiliary_loss_mlp": 0.01030697, "balance_loss_clip": 1.048311, "balance_loss_mlp": 1.02201319, "epoch": 0.4351590212228702, "flos": 25082705928960.0, "grad_norm": 1.7363392803079163, "language_loss": 0.85896677, "learning_rate": 2.5114906465067537e-06, "loss": 0.88040829, "num_input_tokens_seen": 78010980, "step": 3619, "time_per_iteration": 3.4609029293060303 }, { "auxiliary_loss_clip": 0.01172403, "auxiliary_loss_mlp": 0.01026266, "balance_loss_clip": 1.05167282, "balance_loss_mlp": 1.01766515, "epoch": 0.4352792641135093, "flos": 21506541909120.0, "grad_norm": 1.949125982000857, "language_loss": 0.75005817, "learning_rate": 2.5107375414149264e-06, "loss": 0.77204478, "num_input_tokens_seen": 78030225, "step": 3620, "time_per_iteration": 2.509326457977295 }, { "auxiliary_loss_clip": 0.01118833, "auxiliary_loss_mlp": 0.01029265, "balance_loss_clip": 1.04273868, "balance_loss_mlp": 1.02034283, "epoch": 0.43539950700414837, "flos": 16253457494400.0, "grad_norm": 2.311843330396648, "language_loss": 0.71927226, "learning_rate": 2.5099843588446197e-06, "loss": 0.74075323, "num_input_tokens_seen": 78048545, "step": 3621, "time_per_iteration": 2.5363385677337646 }, { "auxiliary_loss_clip": 0.01138254, "auxiliary_loss_mlp": 0.01031514, "balance_loss_clip": 1.05215633, "balance_loss_mlp": 1.02281761, "epoch": 0.4355197498947875, "flos": 16691819074560.0, "grad_norm": 1.6071659894416652, "language_loss": 0.61504698, "learning_rate": 2.509231098910091e-06, "loss": 0.63674462, "num_input_tokens_seen": 78068415, "step": 3622, "time_per_iteration": 3.354856252670288 }, { "auxiliary_loss_clip": 0.01155929, "auxiliary_loss_mlp": 0.01029368, "balance_loss_clip": 1.0559597, "balance_loss_mlp": 1.02075517, "epoch": 0.4356399927854266, "flos": 16362733645440.0, "grad_norm": 2.2405377597638565, "language_loss": 0.7461971, "learning_rate": 2.508477761725611e-06, "loss": 0.76805007, "num_input_tokens_seen": 78086690, "step": 3623, "time_per_iteration": 3.242828607559204 }, { "auxiliary_loss_clip": 0.01176179, "auxiliary_loss_mlp": 0.01029671, "balance_loss_clip": 1.05535257, "balance_loss_mlp": 1.02109432, "epoch": 0.43576023567606564, "flos": 17202037812480.0, "grad_norm": 2.402791545855353, "language_loss": 0.80649513, "learning_rate": 2.507724347405458e-06, "loss": 0.82855362, "num_input_tokens_seen": 78104640, "step": 3624, "time_per_iteration": 2.445854663848877 }, { "auxiliary_loss_clip": 0.01120554, "auxiliary_loss_mlp": 0.01029629, "balance_loss_clip": 1.04393983, "balance_loss_mlp": 1.02121329, "epoch": 0.43588047856670475, "flos": 15917656222080.0, "grad_norm": 2.233436473490909, "language_loss": 0.82006812, "learning_rate": 2.5069708560639243e-06, "loss": 0.84156996, "num_input_tokens_seen": 78122550, "step": 3625, "time_per_iteration": 2.5374538898468018 }, { "auxiliary_loss_clip": 0.01146066, "auxiliary_loss_mlp": 0.01028852, "balance_loss_clip": 1.05117929, "balance_loss_mlp": 1.02008963, "epoch": 0.4360007214573438, "flos": 23659566099840.0, "grad_norm": 2.705413983559013, "language_loss": 0.61605096, "learning_rate": 2.5062172878153158e-06, "loss": 0.6378001, "num_input_tokens_seen": 78141825, "step": 3626, "time_per_iteration": 2.5436885356903076 }, { "auxiliary_loss_clip": 0.01124464, "auxiliary_loss_mlp": 0.01032089, "balance_loss_clip": 1.05002499, "balance_loss_mlp": 1.02250445, "epoch": 0.4361209643479829, "flos": 21978767036160.0, "grad_norm": 1.9465188996827472, "language_loss": 0.87297785, "learning_rate": 2.505463642773947e-06, "loss": 0.89454341, "num_input_tokens_seen": 78161790, "step": 3627, "time_per_iteration": 2.6227526664733887 }, { "auxiliary_loss_clip": 0.01144854, "auxiliary_loss_mlp": 0.00763519, "balance_loss_clip": 1.05143547, "balance_loss_mlp": 1.00067449, "epoch": 0.43624120723862203, "flos": 17420159151360.0, "grad_norm": 2.514219648593826, "language_loss": 0.75091147, "learning_rate": 2.504709921054146e-06, "loss": 0.76999521, "num_input_tokens_seen": 78178605, "step": 3628, "time_per_iteration": 2.509385824203491 }, { "auxiliary_loss_clip": 0.01139182, "auxiliary_loss_mlp": 0.01031962, "balance_loss_clip": 1.04682803, "balance_loss_mlp": 1.02279472, "epoch": 0.4363614501292611, "flos": 17895293280000.0, "grad_norm": 4.513723805289859, "language_loss": 0.83737111, "learning_rate": 2.50395612277025e-06, "loss": 0.85908252, "num_input_tokens_seen": 78194460, "step": 3629, "time_per_iteration": 2.5140767097473145 }, { "auxiliary_loss_clip": 0.01160723, "auxiliary_loss_mlp": 0.01028449, "balance_loss_clip": 1.0512743, "balance_loss_mlp": 1.02008033, "epoch": 0.4364816930199002, "flos": 20302888135680.0, "grad_norm": 2.1144942238942255, "language_loss": 0.72881603, "learning_rate": 2.503202248036612e-06, "loss": 0.75070775, "num_input_tokens_seen": 78213315, "step": 3630, "time_per_iteration": 2.5045201778411865 }, { "auxiliary_loss_clip": 0.0118435, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.0543474, "balance_loss_mlp": 1.02489638, "epoch": 0.4366019359105393, "flos": 24061334699520.0, "grad_norm": 1.7372950643847136, "language_loss": 0.73373204, "learning_rate": 2.5024482969675927e-06, "loss": 0.75591266, "num_input_tokens_seen": 78233270, "step": 3631, "time_per_iteration": 2.45857310295105 }, { "auxiliary_loss_clip": 0.01133253, "auxiliary_loss_mlp": 0.01025508, "balance_loss_clip": 1.04852903, "balance_loss_mlp": 1.01764035, "epoch": 0.43672217880117836, "flos": 21754109422080.0, "grad_norm": 2.022253891462564, "language_loss": 0.84468782, "learning_rate": 2.501694269677566e-06, "loss": 0.86627543, "num_input_tokens_seen": 78251040, "step": 3632, "time_per_iteration": 2.5529282093048096 }, { "auxiliary_loss_clip": 0.01176267, "auxiliary_loss_mlp": 0.01026618, "balance_loss_clip": 1.05301142, "balance_loss_mlp": 1.01802921, "epoch": 0.4368424216918175, "flos": 18035200753920.0, "grad_norm": 1.9302069716994963, "language_loss": 0.80720055, "learning_rate": 2.500940166280918e-06, "loss": 0.82922935, "num_input_tokens_seen": 78269470, "step": 3633, "time_per_iteration": 2.469496011734009 }, { "auxiliary_loss_clip": 0.01167428, "auxiliary_loss_mlp": 0.01031377, "balance_loss_clip": 1.05148768, "balance_loss_mlp": 1.0229255, "epoch": 0.4369626645824566, "flos": 25447127362560.0, "grad_norm": 1.9898720563025336, "language_loss": 0.79129708, "learning_rate": 2.500185986892045e-06, "loss": 0.81328511, "num_input_tokens_seen": 78288955, "step": 3634, "time_per_iteration": 2.5015029907226562 }, { "auxiliary_loss_clip": 0.0116851, "auxiliary_loss_mlp": 0.01032771, "balance_loss_clip": 1.05206907, "balance_loss_mlp": 1.02424169, "epoch": 0.43708290747309564, "flos": 25302694775040.0, "grad_norm": 2.160090104061873, "language_loss": 0.77396762, "learning_rate": 2.499431731625355e-06, "loss": 0.79598045, "num_input_tokens_seen": 78307980, "step": 3635, "time_per_iteration": 2.495213031768799 }, { "auxiliary_loss_clip": 0.011879, "auxiliary_loss_mlp": 0.01029413, "balance_loss_clip": 1.05529296, "balance_loss_mlp": 1.01998973, "epoch": 0.43720315036373475, "flos": 31575103344000.0, "grad_norm": 1.8733933394012074, "language_loss": 0.79531556, "learning_rate": 2.4986774005952686e-06, "loss": 0.81748867, "num_input_tokens_seen": 78330355, "step": 3636, "time_per_iteration": 2.529128074645996 }, { "auxiliary_loss_clip": 0.01171313, "auxiliary_loss_mlp": 0.01029927, "balance_loss_clip": 1.05647755, "balance_loss_mlp": 1.02175498, "epoch": 0.43732339325437386, "flos": 23112000195840.0, "grad_norm": 1.9465508311630513, "language_loss": 0.84558129, "learning_rate": 2.4979229939162166e-06, "loss": 0.86759365, "num_input_tokens_seen": 78349135, "step": 3637, "time_per_iteration": 2.4494831562042236 }, { "auxiliary_loss_clip": 0.01168537, "auxiliary_loss_mlp": 0.01023937, "balance_loss_clip": 1.05422688, "balance_loss_mlp": 1.01566434, "epoch": 0.4374436361450129, "flos": 27746272080000.0, "grad_norm": 1.6012624121032137, "language_loss": 0.80400336, "learning_rate": 2.4971685117026433e-06, "loss": 0.82592809, "num_input_tokens_seen": 78368900, "step": 3638, "time_per_iteration": 2.498469114303589 }, { "auxiliary_loss_clip": 0.01172505, "auxiliary_loss_mlp": 0.01026223, "balance_loss_clip": 1.05378866, "balance_loss_mlp": 1.01771784, "epoch": 0.437563879035652, "flos": 24172370616960.0, "grad_norm": 1.467348615614611, "language_loss": 0.76309681, "learning_rate": 2.4964139540690018e-06, "loss": 0.78508413, "num_input_tokens_seen": 78392235, "step": 3639, "time_per_iteration": 2.51043701171875 }, { "auxiliary_loss_clip": 0.01145479, "auxiliary_loss_mlp": 0.01029246, "balance_loss_clip": 1.0515523, "balance_loss_mlp": 1.02043033, "epoch": 0.4376841219262911, "flos": 23477211728640.0, "grad_norm": 2.1019965662682916, "language_loss": 0.72508317, "learning_rate": 2.495659321129758e-06, "loss": 0.74683046, "num_input_tokens_seen": 78409980, "step": 3640, "time_per_iteration": 2.566718578338623 }, { "auxiliary_loss_clip": 0.01166287, "auxiliary_loss_mlp": 0.01036522, "balance_loss_clip": 1.04983711, "balance_loss_mlp": 1.02836776, "epoch": 0.4378043648169302, "flos": 25447809720960.0, "grad_norm": 1.7008280931151147, "language_loss": 0.75023711, "learning_rate": 2.494904612999389e-06, "loss": 0.77226526, "num_input_tokens_seen": 78428690, "step": 3641, "time_per_iteration": 2.5219779014587402 }, { "auxiliary_loss_clip": 0.01067198, "auxiliary_loss_mlp": 0.01003329, "balance_loss_clip": 1.0167594, "balance_loss_mlp": 1.00212479, "epoch": 0.4379246077075693, "flos": 53914056986880.0, "grad_norm": 0.748602284354078, "language_loss": 0.56544054, "learning_rate": 2.4941498297923843e-06, "loss": 0.58614576, "num_input_tokens_seen": 78489260, "step": 3642, "time_per_iteration": 3.7832112312316895 }, { "auxiliary_loss_clip": 0.0117018, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.05351484, "balance_loss_mlp": 1.01771307, "epoch": 0.43804485059820836, "flos": 20588305605120.0, "grad_norm": 1.9664160691605947, "language_loss": 0.6983552, "learning_rate": 2.4933949716232424e-06, "loss": 0.72031558, "num_input_tokens_seen": 78506785, "step": 3643, "time_per_iteration": 2.4410908222198486 }, { "auxiliary_loss_clip": 0.01142508, "auxiliary_loss_mlp": 0.0103006, "balance_loss_clip": 1.05175757, "balance_loss_mlp": 1.02110696, "epoch": 0.43816509348884747, "flos": 23876214981120.0, "grad_norm": 2.329121432420207, "language_loss": 0.73704511, "learning_rate": 2.492640038606476e-06, "loss": 0.75877076, "num_input_tokens_seen": 78525150, "step": 3644, "time_per_iteration": 2.539618730545044 }, { "auxiliary_loss_clip": 0.0117145, "auxiliary_loss_mlp": 0.01030598, "balance_loss_clip": 1.05210304, "balance_loss_mlp": 1.02188373, "epoch": 0.4382853363794866, "flos": 14684448533760.0, "grad_norm": 1.9267487199619864, "language_loss": 0.78487146, "learning_rate": 2.491885030856608e-06, "loss": 0.80689198, "num_input_tokens_seen": 78543245, "step": 3645, "time_per_iteration": 2.428431272506714 }, { "auxiliary_loss_clip": 0.01160944, "auxiliary_loss_mlp": 0.01028709, "balance_loss_clip": 1.05413651, "balance_loss_mlp": 1.02057326, "epoch": 0.43840557927012563, "flos": 17165301177600.0, "grad_norm": 2.085209038771853, "language_loss": 0.82645285, "learning_rate": 2.4911299484881713e-06, "loss": 0.84834945, "num_input_tokens_seen": 78560775, "step": 3646, "time_per_iteration": 3.336498498916626 }, { "auxiliary_loss_clip": 0.01150848, "auxiliary_loss_mlp": 0.01025502, "balance_loss_clip": 1.04993951, "balance_loss_mlp": 1.01747894, "epoch": 0.43852582216076474, "flos": 19390685316480.0, "grad_norm": 5.4379245278726245, "language_loss": 0.81367928, "learning_rate": 2.490374791615712e-06, "loss": 0.83544278, "num_input_tokens_seen": 78580800, "step": 3647, "time_per_iteration": 2.5091891288757324 }, { "auxiliary_loss_clip": 0.0119299, "auxiliary_loss_mlp": 0.00763706, "balance_loss_clip": 1.05729151, "balance_loss_mlp": 1.00067329, "epoch": 0.43864606505140386, "flos": 18075133699200.0, "grad_norm": 2.7574399281404096, "language_loss": 0.77927428, "learning_rate": 2.4896195603537867e-06, "loss": 0.79884124, "num_input_tokens_seen": 78595410, "step": 3648, "time_per_iteration": 2.397385358810425 }, { "auxiliary_loss_clip": 0.01125462, "auxiliary_loss_mlp": 0.01028919, "balance_loss_clip": 1.05315149, "balance_loss_mlp": 1.02045488, "epoch": 0.4387663079420429, "flos": 19644896845440.0, "grad_norm": 2.0292940268414967, "language_loss": 0.73760599, "learning_rate": 2.488864254816964e-06, "loss": 0.75914979, "num_input_tokens_seen": 78614100, "step": 3649, "time_per_iteration": 3.2876951694488525 }, { "auxiliary_loss_clip": 0.01174093, "auxiliary_loss_mlp": 0.01036956, "balance_loss_clip": 1.05545819, "balance_loss_mlp": 1.02809286, "epoch": 0.438886550832682, "flos": 19719339782400.0, "grad_norm": 6.459611713590826, "language_loss": 0.6875475, "learning_rate": 2.4881088751198218e-06, "loss": 0.70965803, "num_input_tokens_seen": 78632260, "step": 3650, "time_per_iteration": 3.2314679622650146 }, { "auxiliary_loss_clip": 0.01160188, "auxiliary_loss_mlp": 0.01028267, "balance_loss_clip": 1.05129576, "balance_loss_mlp": 1.01964176, "epoch": 0.43900679372332113, "flos": 14536675981440.0, "grad_norm": 2.725906032242637, "language_loss": 0.64348853, "learning_rate": 2.4873534213769517e-06, "loss": 0.66537309, "num_input_tokens_seen": 78647490, "step": 3651, "time_per_iteration": 2.4474642276763916 }, { "auxiliary_loss_clip": 0.01139304, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.05315351, "balance_loss_mlp": 1.02360094, "epoch": 0.4391270366139602, "flos": 24056234968320.0, "grad_norm": 1.6682082602519543, "language_loss": 0.71810389, "learning_rate": 2.4865978937029547e-06, "loss": 0.73981506, "num_input_tokens_seen": 78666470, "step": 3652, "time_per_iteration": 2.5351741313934326 }, { "auxiliary_loss_clip": 0.01119484, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.04930758, "balance_loss_mlp": 1.0236547, "epoch": 0.4392472795045993, "flos": 31538510363520.0, "grad_norm": 1.5912349278894915, "language_loss": 0.6605354, "learning_rate": 2.485842292212445e-06, "loss": 0.68205631, "num_input_tokens_seen": 78687685, "step": 3653, "time_per_iteration": 2.638922929763794 }, { "auxiliary_loss_clip": 0.011882, "auxiliary_loss_mlp": 0.01030056, "balance_loss_clip": 1.05671668, "balance_loss_mlp": 1.02149725, "epoch": 0.4393675223952384, "flos": 14866300114560.0, "grad_norm": 1.9302416867016723, "language_loss": 0.80380476, "learning_rate": 2.485086617020045e-06, "loss": 0.82598734, "num_input_tokens_seen": 78706180, "step": 3654, "time_per_iteration": 2.4113941192626953 }, { "auxiliary_loss_clip": 0.01149401, "auxiliary_loss_mlp": 0.010245, "balance_loss_clip": 1.04959059, "balance_loss_mlp": 1.01587486, "epoch": 0.43948776528587746, "flos": 14825900292480.0, "grad_norm": 2.1060905431072774, "language_loss": 0.81650442, "learning_rate": 2.4843308682403903e-06, "loss": 0.83824337, "num_input_tokens_seen": 78723095, "step": 3655, "time_per_iteration": 2.4563703536987305 }, { "auxiliary_loss_clip": 0.01185694, "auxiliary_loss_mlp": 0.01028623, "balance_loss_clip": 1.05482495, "balance_loss_mlp": 1.02050447, "epoch": 0.4396080081765166, "flos": 13914523486080.0, "grad_norm": 1.771312483835851, "language_loss": 0.82842958, "learning_rate": 2.4835750459881294e-06, "loss": 0.85057271, "num_input_tokens_seen": 78739720, "step": 3656, "time_per_iteration": 2.3916516304016113 }, { "auxiliary_loss_clip": 0.01149079, "auxiliary_loss_mlp": 0.01036869, "balance_loss_clip": 1.04897952, "balance_loss_mlp": 1.0275346, "epoch": 0.43972825106715563, "flos": 18222978078720.0, "grad_norm": 1.883646452895321, "language_loss": 0.81660694, "learning_rate": 2.4828191503779177e-06, "loss": 0.83846641, "num_input_tokens_seen": 78757820, "step": 3657, "time_per_iteration": 2.466453790664673 }, { "auxiliary_loss_clip": 0.01142291, "auxiliary_loss_mlp": 0.01026673, "balance_loss_clip": 1.04992008, "balance_loss_mlp": 1.01807773, "epoch": 0.43984849395779474, "flos": 16873239692160.0, "grad_norm": 2.1980635967594306, "language_loss": 0.89555985, "learning_rate": 2.482063181524425e-06, "loss": 0.9172495, "num_input_tokens_seen": 78773720, "step": 3658, "time_per_iteration": 2.5018508434295654 }, { "auxiliary_loss_clip": 0.01189443, "auxiliary_loss_mlp": 0.01038289, "balance_loss_clip": 1.0567528, "balance_loss_mlp": 1.02955055, "epoch": 0.43996873684843385, "flos": 18691504104960.0, "grad_norm": 2.2157591733945674, "language_loss": 0.81577098, "learning_rate": 2.4813071395423307e-06, "loss": 0.83804834, "num_input_tokens_seen": 78791285, "step": 3659, "time_per_iteration": 2.408621311187744 }, { "auxiliary_loss_clip": 0.01173055, "auxiliary_loss_mlp": 0.01033051, "balance_loss_clip": 1.05435991, "balance_loss_mlp": 1.0237174, "epoch": 0.4400889797390729, "flos": 23653460787840.0, "grad_norm": 1.8011518847937302, "language_loss": 0.64649594, "learning_rate": 2.4805510245463263e-06, "loss": 0.66855699, "num_input_tokens_seen": 78811440, "step": 3660, "time_per_iteration": 2.487126588821411 }, { "auxiliary_loss_clip": 0.01170902, "auxiliary_loss_mlp": 0.01033402, "balance_loss_clip": 1.05266976, "balance_loss_mlp": 1.02436018, "epoch": 0.440209222629712, "flos": 23149203707520.0, "grad_norm": 2.164366007879305, "language_loss": 0.60653222, "learning_rate": 2.4797948366511137e-06, "loss": 0.62857521, "num_input_tokens_seen": 78831150, "step": 3661, "time_per_iteration": 2.4744760990142822 }, { "auxiliary_loss_clip": 0.01144475, "auxiliary_loss_mlp": 0.01034919, "balance_loss_clip": 1.04944253, "balance_loss_mlp": 1.02641368, "epoch": 0.4403294655203511, "flos": 24823394668800.0, "grad_norm": 2.3243586023971243, "language_loss": 0.76402169, "learning_rate": 2.4790385759714055e-06, "loss": 0.7858156, "num_input_tokens_seen": 78850215, "step": 3662, "time_per_iteration": 2.62807559967041 }, { "auxiliary_loss_clip": 0.01171244, "auxiliary_loss_mlp": 0.01029695, "balance_loss_clip": 1.05690086, "balance_loss_mlp": 1.02120781, "epoch": 0.4404497084109902, "flos": 22565080736640.0, "grad_norm": 1.6277729916584753, "language_loss": 0.71002841, "learning_rate": 2.478282242621926e-06, "loss": 0.73203778, "num_input_tokens_seen": 78870675, "step": 3663, "time_per_iteration": 2.535227060317993 }, { "auxiliary_loss_clip": 0.01051195, "auxiliary_loss_mlp": 0.0100454, "balance_loss_clip": 1.01939011, "balance_loss_mlp": 1.00321126, "epoch": 0.4405699513016293, "flos": 64967073448320.0, "grad_norm": 0.8409831315378775, "language_loss": 0.59559989, "learning_rate": 2.477525836717411e-06, "loss": 0.61615723, "num_input_tokens_seen": 78938440, "step": 3664, "time_per_iteration": 3.2285568714141846 }, { "auxiliary_loss_clip": 0.01170862, "auxiliary_loss_mlp": 0.01030515, "balance_loss_clip": 1.05177999, "balance_loss_mlp": 1.02216434, "epoch": 0.4406901941922684, "flos": 35661952978560.0, "grad_norm": 2.331508199438346, "language_loss": 0.79460138, "learning_rate": 2.476769358372606e-06, "loss": 0.81661516, "num_input_tokens_seen": 78960090, "step": 3665, "time_per_iteration": 2.5901997089385986 }, { "auxiliary_loss_clip": 0.01139117, "auxiliary_loss_mlp": 0.01026376, "balance_loss_clip": 1.05162907, "balance_loss_mlp": 1.01894355, "epoch": 0.44081043708290746, "flos": 18040767361920.0, "grad_norm": 3.17328299839823, "language_loss": 0.75045919, "learning_rate": 2.4760128077022683e-06, "loss": 0.7721141, "num_input_tokens_seen": 78978225, "step": 3666, "time_per_iteration": 2.565070629119873 }, { "auxiliary_loss_clip": 0.01121405, "auxiliary_loss_mlp": 0.01025168, "balance_loss_clip": 1.05060518, "balance_loss_mlp": 1.01710391, "epoch": 0.44093067997354657, "flos": 30153507799680.0, "grad_norm": 1.682342936976867, "language_loss": 0.6852901, "learning_rate": 2.4752561848211672e-06, "loss": 0.70675582, "num_input_tokens_seen": 79000625, "step": 3667, "time_per_iteration": 2.6126208305358887 }, { "auxiliary_loss_clip": 0.011731, "auxiliary_loss_mlp": 0.01034368, "balance_loss_clip": 1.05914748, "balance_loss_mlp": 1.02599335, "epoch": 0.4410509228641857, "flos": 23255068066560.0, "grad_norm": 1.7725359982161295, "language_loss": 0.71227455, "learning_rate": 2.4744994898440797e-06, "loss": 0.73434925, "num_input_tokens_seen": 79019415, "step": 3668, "time_per_iteration": 2.4678609371185303 }, { "auxiliary_loss_clip": 0.0114822, "auxiliary_loss_mlp": 0.0103604, "balance_loss_clip": 1.05159271, "balance_loss_mlp": 1.02711105, "epoch": 0.44117116575482473, "flos": 19500571998720.0, "grad_norm": 1.958928770721048, "language_loss": 0.83773434, "learning_rate": 2.473742722885797e-06, "loss": 0.85957688, "num_input_tokens_seen": 79038435, "step": 3669, "time_per_iteration": 3.371425151824951 }, { "auxiliary_loss_clip": 0.01176063, "auxiliary_loss_mlp": 0.00763588, "balance_loss_clip": 1.05887735, "balance_loss_mlp": 1.00067163, "epoch": 0.44129140864546385, "flos": 27053124353280.0, "grad_norm": 2.0819537096438743, "language_loss": 0.65102518, "learning_rate": 2.4729858840611197e-06, "loss": 0.67042172, "num_input_tokens_seen": 79057345, "step": 3670, "time_per_iteration": 2.549913167953491 }, { "auxiliary_loss_clip": 0.01187426, "auxiliary_loss_mlp": 0.01026531, "balance_loss_clip": 1.05753946, "balance_loss_mlp": 1.01855588, "epoch": 0.4414116515361029, "flos": 26102101910400.0, "grad_norm": 4.809380728779214, "language_loss": 0.72867262, "learning_rate": 2.4722289734848605e-06, "loss": 0.75081217, "num_input_tokens_seen": 79077810, "step": 3671, "time_per_iteration": 2.4851484298706055 }, { "auxiliary_loss_clip": 0.01143818, "auxiliary_loss_mlp": 0.01028882, "balance_loss_clip": 1.05617523, "balance_loss_mlp": 1.02051973, "epoch": 0.441531894426742, "flos": 21906083865600.0, "grad_norm": 2.0540421767820445, "language_loss": 0.77928531, "learning_rate": 2.471471991271841e-06, "loss": 0.80101234, "num_input_tokens_seen": 79094935, "step": 3672, "time_per_iteration": 3.3719658851623535 }, { "auxiliary_loss_clip": 0.01164399, "auxiliary_loss_mlp": 0.01026875, "balance_loss_clip": 1.05276191, "balance_loss_mlp": 1.01832795, "epoch": 0.4416521373173811, "flos": 23437099215360.0, "grad_norm": 1.919010769417049, "language_loss": 0.79262859, "learning_rate": 2.470714937536896e-06, "loss": 0.81454134, "num_input_tokens_seen": 79113660, "step": 3673, "time_per_iteration": 2.470294237136841 }, { "auxiliary_loss_clip": 0.01125172, "auxiliary_loss_mlp": 0.0103085, "balance_loss_clip": 1.0489316, "balance_loss_mlp": 1.02226722, "epoch": 0.4417723802080202, "flos": 20334345471360.0, "grad_norm": 1.661876865895167, "language_loss": 0.70376062, "learning_rate": 2.469957812394868e-06, "loss": 0.72532082, "num_input_tokens_seen": 79132470, "step": 3674, "time_per_iteration": 2.565774917602539 }, { "auxiliary_loss_clip": 0.01186638, "auxiliary_loss_mlp": 0.01029041, "balance_loss_clip": 1.05809879, "balance_loss_mlp": 1.02052379, "epoch": 0.4418926230986593, "flos": 18880682060160.0, "grad_norm": 1.8807257481700441, "language_loss": 0.76321942, "learning_rate": 2.4692006159606148e-06, "loss": 0.78537619, "num_input_tokens_seen": 79150000, "step": 3675, "time_per_iteration": 2.4257280826568604 }, { "auxiliary_loss_clip": 0.01185678, "auxiliary_loss_mlp": 0.01030094, "balance_loss_clip": 1.05552435, "balance_loss_mlp": 1.02163005, "epoch": 0.4420128659892984, "flos": 19464409981440.0, "grad_norm": 1.8759780191007915, "language_loss": 0.78543627, "learning_rate": 2.468443348349e-06, "loss": 0.80759394, "num_input_tokens_seen": 79167875, "step": 3676, "time_per_iteration": 3.1443326473236084 }, { "auxiliary_loss_clip": 0.01127999, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.0489912, "balance_loss_mlp": 1.02381372, "epoch": 0.44213310887993745, "flos": 17894359526400.0, "grad_norm": 2.4823724357011616, "language_loss": 0.82399458, "learning_rate": 2.467686009674902e-06, "loss": 0.84560943, "num_input_tokens_seen": 79182325, "step": 3677, "time_per_iteration": 3.320093870162964 }, { "auxiliary_loss_clip": 0.01166445, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.05101466, "balance_loss_mlp": 1.02099323, "epoch": 0.44225335177057656, "flos": 19204667758080.0, "grad_norm": 1.8654601966993722, "language_loss": 0.85179412, "learning_rate": 2.466928600053209e-06, "loss": 0.87375748, "num_input_tokens_seen": 79197630, "step": 3678, "time_per_iteration": 2.4332807064056396 }, { "auxiliary_loss_clip": 0.0115601, "auxiliary_loss_mlp": 0.01026681, "balance_loss_clip": 1.05135798, "balance_loss_mlp": 1.01846766, "epoch": 0.4423735946612157, "flos": 23471321898240.0, "grad_norm": 1.8248486645074782, "language_loss": 0.71393889, "learning_rate": 2.466171119598818e-06, "loss": 0.73576581, "num_input_tokens_seen": 79217600, "step": 3679, "time_per_iteration": 2.5327563285827637 }, { "auxiliary_loss_clip": 0.01177879, "auxiliary_loss_mlp": 0.01033968, "balance_loss_clip": 1.05263925, "balance_loss_mlp": 1.02529597, "epoch": 0.44249383755185473, "flos": 26685398868480.0, "grad_norm": 1.8872031919742798, "language_loss": 0.77459937, "learning_rate": 2.465413568426639e-06, "loss": 0.79671788, "num_input_tokens_seen": 79238550, "step": 3680, "time_per_iteration": 2.516883134841919 }, { "auxiliary_loss_clip": 0.01167014, "auxiliary_loss_mlp": 0.010246, "balance_loss_clip": 1.05362177, "balance_loss_mlp": 1.01685166, "epoch": 0.44261408044249384, "flos": 23147659422720.0, "grad_norm": 2.471646533675412, "language_loss": 0.81122637, "learning_rate": 2.464655946651591e-06, "loss": 0.83314252, "num_input_tokens_seen": 79257555, "step": 3681, "time_per_iteration": 2.476849317550659 }, { "auxiliary_loss_clip": 0.01177036, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.0574224, "balance_loss_mlp": 1.0207355, "epoch": 0.44273432333313295, "flos": 24462564595200.0, "grad_norm": 2.0268041719936543, "language_loss": 0.81043327, "learning_rate": 2.4638982543886065e-06, "loss": 0.83249509, "num_input_tokens_seen": 79277595, "step": 3682, "time_per_iteration": 2.4976933002471924 }, { "auxiliary_loss_clip": 0.01174802, "auxiliary_loss_mlp": 0.0103535, "balance_loss_clip": 1.0558691, "balance_loss_mlp": 1.02689242, "epoch": 0.442854566223772, "flos": 17528932512000.0, "grad_norm": 9.69083777776178, "language_loss": 0.87227261, "learning_rate": 2.4631404917526254e-06, "loss": 0.89437413, "num_input_tokens_seen": 79294550, "step": 3683, "time_per_iteration": 2.4609546661376953 }, { "auxiliary_loss_clip": 0.01165195, "auxiliary_loss_mlp": 0.01027326, "balance_loss_clip": 1.05210638, "balance_loss_mlp": 1.01943398, "epoch": 0.4429748091144111, "flos": 24896293320960.0, "grad_norm": 1.6920251277605036, "language_loss": 0.79115558, "learning_rate": 2.4623826588586e-06, "loss": 0.81308079, "num_input_tokens_seen": 79314820, "step": 3684, "time_per_iteration": 2.491969108581543 }, { "auxiliary_loss_clip": 0.01151848, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 1.04962802, "balance_loss_mlp": 1.02289438, "epoch": 0.4430950520050502, "flos": 21614704738560.0, "grad_norm": 1.7836736265672408, "language_loss": 0.82626003, "learning_rate": 2.461624755821492e-06, "loss": 0.84809738, "num_input_tokens_seen": 79334300, "step": 3685, "time_per_iteration": 2.506840229034424 }, { "auxiliary_loss_clip": 0.01142849, "auxiliary_loss_mlp": 0.01024839, "balance_loss_clip": 1.05109644, "balance_loss_mlp": 1.01698887, "epoch": 0.4432152948956893, "flos": 24572271709440.0, "grad_norm": 1.7522927979709753, "language_loss": 0.76716673, "learning_rate": 2.4608667827562763e-06, "loss": 0.78884363, "num_input_tokens_seen": 79353630, "step": 3686, "time_per_iteration": 2.5531423091888428 }, { "auxiliary_loss_clip": 0.01176659, "auxiliary_loss_mlp": 0.01031689, "balance_loss_clip": 1.055722, "balance_loss_mlp": 1.02314723, "epoch": 0.4433355377863284, "flos": 21762261809280.0, "grad_norm": 2.12396996293802, "language_loss": 0.89869851, "learning_rate": 2.460108739777936e-06, "loss": 0.92078197, "num_input_tokens_seen": 79372765, "step": 3687, "time_per_iteration": 2.4826157093048096 }, { "auxiliary_loss_clip": 0.01157357, "auxiliary_loss_mlp": 0.01029924, "balance_loss_clip": 1.0546813, "balance_loss_mlp": 1.02138841, "epoch": 0.44345578067696745, "flos": 20084479488000.0, "grad_norm": 1.5974800149121162, "language_loss": 0.76480985, "learning_rate": 2.4593506270014656e-06, "loss": 0.78668267, "num_input_tokens_seen": 79391735, "step": 3688, "time_per_iteration": 2.4870541095733643 }, { "auxiliary_loss_clip": 0.01161638, "auxiliary_loss_mlp": 0.01027759, "balance_loss_clip": 1.05108809, "balance_loss_mlp": 1.01942611, "epoch": 0.44357602356760656, "flos": 24169497528960.0, "grad_norm": 1.6461131333415144, "language_loss": 0.82080132, "learning_rate": 2.45859244454187e-06, "loss": 0.84269536, "num_input_tokens_seen": 79411525, "step": 3689, "time_per_iteration": 2.550636053085327 }, { "auxiliary_loss_clip": 0.01169545, "auxiliary_loss_mlp": 0.01027946, "balance_loss_clip": 1.05495358, "balance_loss_mlp": 1.02015519, "epoch": 0.44369626645824567, "flos": 22707717644160.0, "grad_norm": 1.741849472187036, "language_loss": 0.66401744, "learning_rate": 2.4578341925141655e-06, "loss": 0.68599236, "num_input_tokens_seen": 79430740, "step": 3690, "time_per_iteration": 2.4910192489624023 }, { "auxiliary_loss_clip": 0.01179462, "auxiliary_loss_mlp": 0.01025428, "balance_loss_clip": 1.05479777, "balance_loss_mlp": 1.01664853, "epoch": 0.4438165093488847, "flos": 38030225420160.0, "grad_norm": 1.9121724741111004, "language_loss": 0.71961272, "learning_rate": 2.457075871033378e-06, "loss": 0.74166155, "num_input_tokens_seen": 79452615, "step": 3691, "time_per_iteration": 2.6078453063964844 }, { "auxiliary_loss_clip": 0.01143557, "auxiliary_loss_mlp": 0.01025529, "balance_loss_clip": 1.05234361, "balance_loss_mlp": 1.01735091, "epoch": 0.44393675223952384, "flos": 15523213996800.0, "grad_norm": 1.9985631047790897, "language_loss": 0.88896966, "learning_rate": 2.4563174802145445e-06, "loss": 0.91066051, "num_input_tokens_seen": 79469865, "step": 3692, "time_per_iteration": 2.485480546951294 }, { "auxiliary_loss_clip": 0.01063936, "auxiliary_loss_mlp": 0.01003626, "balance_loss_clip": 1.02172112, "balance_loss_mlp": 1.00226104, "epoch": 0.44405699513016295, "flos": 64574893779840.0, "grad_norm": 0.6334892616799727, "language_loss": 0.48645616, "learning_rate": 2.455559020172712e-06, "loss": 0.50713181, "num_input_tokens_seen": 79537220, "step": 3693, "time_per_iteration": 3.1618378162384033 }, { "auxiliary_loss_clip": 0.01136697, "auxiliary_loss_mlp": 0.01038172, "balance_loss_clip": 1.05596483, "balance_loss_mlp": 1.02953506, "epoch": 0.444177238020802, "flos": 23987394552960.0, "grad_norm": 2.397392671542597, "language_loss": 0.89748502, "learning_rate": 2.4548004910229385e-06, "loss": 0.91923368, "num_input_tokens_seen": 79554795, "step": 3694, "time_per_iteration": 2.628120183944702 }, { "auxiliary_loss_clip": 0.01176037, "auxiliary_loss_mlp": 0.00764013, "balance_loss_clip": 1.05644131, "balance_loss_mlp": 1.00070786, "epoch": 0.4442974809114411, "flos": 22563069575040.0, "grad_norm": 2.2172444591353107, "language_loss": 0.87127411, "learning_rate": 2.4540418928802913e-06, "loss": 0.89067459, "num_input_tokens_seen": 79573530, "step": 3695, "time_per_iteration": 2.4922924041748047 }, { "auxiliary_loss_clip": 0.01156449, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.0512948, "balance_loss_mlp": 1.02289367, "epoch": 0.4444177238020802, "flos": 17675699483520.0, "grad_norm": 2.8643462519079854, "language_loss": 0.65817106, "learning_rate": 2.4532832258598506e-06, "loss": 0.68005383, "num_input_tokens_seen": 79591360, "step": 3696, "time_per_iteration": 3.226816415786743 }, { "auxiliary_loss_clip": 0.01184374, "auxiliary_loss_mlp": 0.01026062, "balance_loss_clip": 1.05612481, "balance_loss_mlp": 1.01759839, "epoch": 0.4445379666927193, "flos": 28621594609920.0, "grad_norm": 1.7962066310276905, "language_loss": 0.80778468, "learning_rate": 2.4525244900767047e-06, "loss": 0.82988906, "num_input_tokens_seen": 79612175, "step": 3697, "time_per_iteration": 2.503439426422119 }, { "auxiliary_loss_clip": 0.01072983, "auxiliary_loss_mlp": 0.01001959, "balance_loss_clip": 1.02577829, "balance_loss_mlp": 1.00073075, "epoch": 0.4446582095833584, "flos": 70487370115200.0, "grad_norm": 0.7726120850558645, "language_loss": 0.60552257, "learning_rate": 2.4517656856459536e-06, "loss": 0.62627202, "num_input_tokens_seen": 79678020, "step": 3698, "time_per_iteration": 3.151158332824707 }, { "auxiliary_loss_clip": 0.01170439, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.05225992, "balance_loss_mlp": 1.02581656, "epoch": 0.4447784524739975, "flos": 26505199313280.0, "grad_norm": 1.656308441090315, "language_loss": 0.68097639, "learning_rate": 2.4510068126827073e-06, "loss": 0.70301998, "num_input_tokens_seen": 79699020, "step": 3699, "time_per_iteration": 3.3736932277679443 }, { "auxiliary_loss_clip": 0.01158756, "auxiliary_loss_mlp": 0.01035414, "balance_loss_clip": 1.05345583, "balance_loss_mlp": 1.02707577, "epoch": 0.44489869536463655, "flos": 11656209553920.0, "grad_norm": 6.3595659630556005, "language_loss": 0.81421053, "learning_rate": 2.450247871302086e-06, "loss": 0.8361522, "num_input_tokens_seen": 79716795, "step": 3700, "time_per_iteration": 2.4832677841186523 }, { "auxiliary_loss_clip": 0.01175798, "auxiliary_loss_mlp": 0.01027704, "balance_loss_clip": 1.0545212, "balance_loss_mlp": 1.01982427, "epoch": 0.44501893825527566, "flos": 20448469958400.0, "grad_norm": 2.313075096495458, "language_loss": 0.83370793, "learning_rate": 2.44948886161922e-06, "loss": 0.85574299, "num_input_tokens_seen": 79735810, "step": 3701, "time_per_iteration": 2.5322635173797607 }, { "auxiliary_loss_clip": 0.01175748, "auxiliary_loss_mlp": 0.01026403, "balance_loss_clip": 1.05587935, "balance_loss_mlp": 1.01868701, "epoch": 0.4451391811459148, "flos": 18261079430400.0, "grad_norm": 1.6084210201101368, "language_loss": 0.85110223, "learning_rate": 2.4487297837492524e-06, "loss": 0.87312371, "num_input_tokens_seen": 79754975, "step": 3702, "time_per_iteration": 3.270439863204956 }, { "auxiliary_loss_clip": 0.01142784, "auxiliary_loss_mlp": 0.01029359, "balance_loss_clip": 1.05190527, "balance_loss_mlp": 1.02097893, "epoch": 0.44525942403655383, "flos": 16910155895040.0, "grad_norm": 1.9247161420488583, "language_loss": 0.62377113, "learning_rate": 2.4479706378073323e-06, "loss": 0.64549261, "num_input_tokens_seen": 79773515, "step": 3703, "time_per_iteration": 2.6739790439605713 }, { "auxiliary_loss_clip": 0.01131238, "auxiliary_loss_mlp": 0.01027209, "balance_loss_clip": 1.0463264, "balance_loss_mlp": 1.01921606, "epoch": 0.44537966692719294, "flos": 23258838994560.0, "grad_norm": 1.5727117342734176, "language_loss": 0.83750415, "learning_rate": 2.447211423908623e-06, "loss": 0.8590886, "num_input_tokens_seen": 79793560, "step": 3704, "time_per_iteration": 3.283668279647827 }, { "auxiliary_loss_clip": 0.01173876, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.05376112, "balance_loss_mlp": 1.01920366, "epoch": 0.445499909817832, "flos": 21724160457600.0, "grad_norm": 2.295431497373646, "language_loss": 0.7439239, "learning_rate": 2.4464521421682966e-06, "loss": 0.7659353, "num_input_tokens_seen": 79811150, "step": 3705, "time_per_iteration": 2.464444875717163 }, { "auxiliary_loss_clip": 0.01166224, "auxiliary_loss_mlp": 0.01024693, "balance_loss_clip": 1.05462337, "balance_loss_mlp": 1.01706958, "epoch": 0.4456201527084711, "flos": 23987969170560.0, "grad_norm": 1.4100867967009847, "language_loss": 0.87436712, "learning_rate": 2.4456927927015345e-06, "loss": 0.89627624, "num_input_tokens_seen": 79832190, "step": 3706, "time_per_iteration": 2.4772579669952393 }, { "auxiliary_loss_clip": 0.0116715, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 1.0576086, "balance_loss_mlp": 1.02279973, "epoch": 0.4457403955991102, "flos": 18807065136000.0, "grad_norm": 2.034032312400158, "language_loss": 0.76659322, "learning_rate": 2.4449333756235307e-06, "loss": 0.78858328, "num_input_tokens_seen": 79848905, "step": 3707, "time_per_iteration": 2.479116439819336 }, { "auxiliary_loss_clip": 0.01176042, "auxiliary_loss_mlp": 0.01036432, "balance_loss_clip": 1.05490422, "balance_loss_mlp": 1.02784932, "epoch": 0.4458606384897493, "flos": 19207756327680.0, "grad_norm": 2.225344106043708, "language_loss": 0.78542346, "learning_rate": 2.4441738910494876e-06, "loss": 0.80754817, "num_input_tokens_seen": 79863640, "step": 3708, "time_per_iteration": 2.4359796047210693 }, { "auxiliary_loss_clip": 0.01164405, "auxiliary_loss_mlp": 0.01033269, "balance_loss_clip": 1.05187988, "balance_loss_mlp": 1.02489424, "epoch": 0.4459808813803884, "flos": 21361283308800.0, "grad_norm": 1.6699179867984635, "language_loss": 0.82155168, "learning_rate": 2.4434143390946176e-06, "loss": 0.84352839, "num_input_tokens_seen": 79882450, "step": 3709, "time_per_iteration": 2.4934604167938232 }, { "auxiliary_loss_clip": 0.01140056, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.04868197, "balance_loss_mlp": 1.02072358, "epoch": 0.4461011242710275, "flos": 23288967527040.0, "grad_norm": 1.8439956291296107, "language_loss": 0.85271466, "learning_rate": 2.4426547198741457e-06, "loss": 0.87440795, "num_input_tokens_seen": 79900655, "step": 3710, "time_per_iteration": 2.5815532207489014 }, { "auxiliary_loss_clip": 0.01129064, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.05202425, "balance_loss_mlp": 1.0228343, "epoch": 0.44622136716166655, "flos": 20193001453440.0, "grad_norm": 2.3639526633457377, "language_loss": 0.74590313, "learning_rate": 2.441895033503305e-06, "loss": 0.76750344, "num_input_tokens_seen": 79918575, "step": 3711, "time_per_iteration": 2.5526912212371826 }, { "auxiliary_loss_clip": 0.0117043, "auxiliary_loss_mlp": 0.01032191, "balance_loss_clip": 1.05340171, "balance_loss_mlp": 1.0232501, "epoch": 0.44634161005230566, "flos": 21283033530240.0, "grad_norm": 1.6951554902840211, "language_loss": 0.81890422, "learning_rate": 2.4411352800973375e-06, "loss": 0.84093046, "num_input_tokens_seen": 79937010, "step": 3712, "time_per_iteration": 2.4535343647003174 }, { "auxiliary_loss_clip": 0.01138319, "auxiliary_loss_mlp": 0.01027984, "balance_loss_clip": 1.0491575, "balance_loss_mlp": 1.0192399, "epoch": 0.44646185294294477, "flos": 22929358515840.0, "grad_norm": 2.282796522320087, "language_loss": 0.7557168, "learning_rate": 2.4403754597715005e-06, "loss": 0.77737975, "num_input_tokens_seen": 79956455, "step": 3713, "time_per_iteration": 2.531940460205078 }, { "auxiliary_loss_clip": 0.01159664, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.04897785, "balance_loss_mlp": 1.0256598, "epoch": 0.4465820958335838, "flos": 22637692080000.0, "grad_norm": 2.360868212339568, "language_loss": 0.92910033, "learning_rate": 2.4396155726410553e-06, "loss": 0.95104623, "num_input_tokens_seen": 79975065, "step": 3714, "time_per_iteration": 2.5019724369049072 }, { "auxiliary_loss_clip": 0.01177362, "auxiliary_loss_mlp": 0.01027234, "balance_loss_clip": 1.0527432, "balance_loss_mlp": 1.01925325, "epoch": 0.44670233872422294, "flos": 22672525294080.0, "grad_norm": 2.4848116740977697, "language_loss": 0.91114068, "learning_rate": 2.438855618821278e-06, "loss": 0.93318659, "num_input_tokens_seen": 79990865, "step": 3715, "time_per_iteration": 2.5113203525543213 }, { "auxiliary_loss_clip": 0.01162222, "auxiliary_loss_mlp": 0.01031138, "balance_loss_clip": 1.04904628, "balance_loss_mlp": 1.02264428, "epoch": 0.44682258161486205, "flos": 23582178247680.0, "grad_norm": 1.6282300358166526, "language_loss": 0.67165083, "learning_rate": 2.4380955984274517e-06, "loss": 0.69358444, "num_input_tokens_seen": 80009520, "step": 3716, "time_per_iteration": 2.4700863361358643 }, { "auxiliary_loss_clip": 0.01169011, "auxiliary_loss_mlp": 0.01035671, "balance_loss_clip": 1.05113196, "balance_loss_mlp": 1.02758896, "epoch": 0.4469428245055011, "flos": 26501356558080.0, "grad_norm": 1.658853914927923, "language_loss": 0.76939428, "learning_rate": 2.4373355115748716e-06, "loss": 0.79144108, "num_input_tokens_seen": 80030350, "step": 3717, "time_per_iteration": 2.5420291423797607 }, { "auxiliary_loss_clip": 0.01150738, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.05203068, "balance_loss_mlp": 1.02274573, "epoch": 0.4470630673961402, "flos": 21504925797120.0, "grad_norm": 1.686356699989055, "language_loss": 0.72169089, "learning_rate": 2.436575358378842e-06, "loss": 0.74351323, "num_input_tokens_seen": 80049840, "step": 3718, "time_per_iteration": 2.50437331199646 }, { "auxiliary_loss_clip": 0.01167251, "auxiliary_loss_mlp": 0.0103191, "balance_loss_clip": 1.05460501, "balance_loss_mlp": 1.02271855, "epoch": 0.44718331028677927, "flos": 16173986653440.0, "grad_norm": 3.6961995642420487, "language_loss": 0.83117914, "learning_rate": 2.4358151389546782e-06, "loss": 0.85317075, "num_input_tokens_seen": 80066525, "step": 3719, "time_per_iteration": 2.4982588291168213 }, { "auxiliary_loss_clip": 0.01185726, "auxiliary_loss_mlp": 0.01031734, "balance_loss_clip": 1.05546641, "balance_loss_mlp": 1.02307987, "epoch": 0.4473035531774184, "flos": 19681238430720.0, "grad_norm": 2.215861306443257, "language_loss": 0.76690769, "learning_rate": 2.4350548534177035e-06, "loss": 0.78908229, "num_input_tokens_seen": 80083355, "step": 3720, "time_per_iteration": 2.399819850921631 }, { "auxiliary_loss_clip": 0.01142642, "auxiliary_loss_mlp": 0.01032894, "balance_loss_clip": 1.0513339, "balance_loss_mlp": 1.02521062, "epoch": 0.4474237960680575, "flos": 41427590515200.0, "grad_norm": 1.5486249133206893, "language_loss": 0.6682502, "learning_rate": 2.434294501883254e-06, "loss": 0.69000554, "num_input_tokens_seen": 80106450, "step": 3721, "time_per_iteration": 2.7164411544799805 }, { "auxiliary_loss_clip": 0.01146654, "auxiliary_loss_mlp": 0.0103011, "balance_loss_clip": 1.04784989, "balance_loss_mlp": 1.02148521, "epoch": 0.44754403895869654, "flos": 22891328991360.0, "grad_norm": 1.6817321486877723, "language_loss": 0.65677023, "learning_rate": 2.433534084466674e-06, "loss": 0.67853791, "num_input_tokens_seen": 80125670, "step": 3722, "time_per_iteration": 3.3199877738952637 }, { "auxiliary_loss_clip": 0.01181532, "auxiliary_loss_mlp": 0.01027752, "balance_loss_clip": 1.05417609, "balance_loss_mlp": 1.01946068, "epoch": 0.44766428184933565, "flos": 25630271832960.0, "grad_norm": 1.6413005810510426, "language_loss": 0.7104544, "learning_rate": 2.4327736012833178e-06, "loss": 0.73254716, "num_input_tokens_seen": 80147390, "step": 3723, "time_per_iteration": 2.4865431785583496 }, { "auxiliary_loss_clip": 0.01171549, "auxiliary_loss_mlp": 0.01033204, "balance_loss_clip": 1.05430961, "balance_loss_mlp": 1.02479935, "epoch": 0.44778452473997477, "flos": 20448972748800.0, "grad_norm": 2.0468483539001556, "language_loss": 0.76963258, "learning_rate": 2.4320130524485506e-06, "loss": 0.7916801, "num_input_tokens_seen": 80166185, "step": 3724, "time_per_iteration": 2.446181058883667 }, { "auxiliary_loss_clip": 0.01151242, "auxiliary_loss_mlp": 0.010251, "balance_loss_clip": 1.05543447, "balance_loss_mlp": 1.01764607, "epoch": 0.4479047676306138, "flos": 21975462984960.0, "grad_norm": 4.294670207796363, "language_loss": 0.79626513, "learning_rate": 2.431252438077746e-06, "loss": 0.81802857, "num_input_tokens_seen": 80185685, "step": 3725, "time_per_iteration": 3.3264570236206055 }, { "auxiliary_loss_clip": 0.01174841, "auxiliary_loss_mlp": 0.00763459, "balance_loss_clip": 1.05286789, "balance_loss_mlp": 1.00054598, "epoch": 0.44802501052125293, "flos": 21467219495040.0, "grad_norm": 2.3404871547673305, "language_loss": 0.7730754, "learning_rate": 2.4304917582862906e-06, "loss": 0.79245836, "num_input_tokens_seen": 80204865, "step": 3726, "time_per_iteration": 2.4551732540130615 }, { "auxiliary_loss_clip": 0.01183236, "auxiliary_loss_mlp": 0.01028018, "balance_loss_clip": 1.05398405, "balance_loss_mlp": 1.01988196, "epoch": 0.44814525341189204, "flos": 22126970551680.0, "grad_norm": 1.904719991320197, "language_loss": 0.8765772, "learning_rate": 2.4297310131895774e-06, "loss": 0.89868975, "num_input_tokens_seen": 80223410, "step": 3727, "time_per_iteration": 2.4239656925201416 }, { "auxiliary_loss_clip": 0.01169893, "auxiliary_loss_mlp": 0.01031785, "balance_loss_clip": 1.05285311, "balance_loss_mlp": 1.02307653, "epoch": 0.4482654963025311, "flos": 16653933204480.0, "grad_norm": 1.94850857925264, "language_loss": 0.74924505, "learning_rate": 2.4289702029030113e-06, "loss": 0.77126175, "num_input_tokens_seen": 80240880, "step": 3728, "time_per_iteration": 3.241889715194702 }, { "auxiliary_loss_clip": 0.01171947, "auxiliary_loss_mlp": 0.01027051, "balance_loss_clip": 1.05626512, "balance_loss_mlp": 1.01886129, "epoch": 0.4483857391931702, "flos": 18841251905280.0, "grad_norm": 1.7704911897689553, "language_loss": 0.83301461, "learning_rate": 2.4282093275420057e-06, "loss": 0.85500461, "num_input_tokens_seen": 80259910, "step": 3729, "time_per_iteration": 2.490283489227295 }, { "auxiliary_loss_clip": 0.01175912, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 1.05572116, "balance_loss_mlp": 1.02324331, "epoch": 0.4485059820838093, "flos": 20372590477440.0, "grad_norm": 2.1172448537654196, "language_loss": 0.70855886, "learning_rate": 2.4274483872219863e-06, "loss": 0.73062837, "num_input_tokens_seen": 80277270, "step": 3730, "time_per_iteration": 2.428694009780884 }, { "auxiliary_loss_clip": 0.01166611, "auxiliary_loss_mlp": 0.01028713, "balance_loss_clip": 1.05191278, "balance_loss_mlp": 1.02068686, "epoch": 0.4486262249744484, "flos": 20047742853120.0, "grad_norm": 1.7888091077398294, "language_loss": 0.93992376, "learning_rate": 2.426687382058386e-06, "loss": 0.96187705, "num_input_tokens_seen": 80295550, "step": 3731, "time_per_iteration": 3.2148828506469727 }, { "auxiliary_loss_clip": 0.0107036, "auxiliary_loss_mlp": 0.01004385, "balance_loss_clip": 1.02395248, "balance_loss_mlp": 1.00322914, "epoch": 0.4487464678650875, "flos": 64595684776320.0, "grad_norm": 0.8617764825105491, "language_loss": 0.59841961, "learning_rate": 2.425926312166649e-06, "loss": 0.61916709, "num_input_tokens_seen": 80348425, "step": 3732, "time_per_iteration": 2.925603151321411 }, { "auxiliary_loss_clip": 0.01161711, "auxiliary_loss_mlp": 0.01025832, "balance_loss_clip": 1.05454564, "balance_loss_mlp": 1.01705253, "epoch": 0.4488667107557266, "flos": 20769798049920.0, "grad_norm": 2.0758542749481492, "language_loss": 0.72705811, "learning_rate": 2.42516517766223e-06, "loss": 0.74893355, "num_input_tokens_seen": 80366505, "step": 3733, "time_per_iteration": 2.46803617477417 }, { "auxiliary_loss_clip": 0.01184597, "auxiliary_loss_mlp": 0.01027602, "balance_loss_clip": 1.05712795, "balance_loss_mlp": 1.01927507, "epoch": 0.44898695364636565, "flos": 23951735326080.0, "grad_norm": 2.402057180054235, "language_loss": 0.68073934, "learning_rate": 2.4244039786605907e-06, "loss": 0.70286131, "num_input_tokens_seen": 80387510, "step": 3734, "time_per_iteration": 2.461834669113159 }, { "auxiliary_loss_clip": 0.01126024, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.04622388, "balance_loss_mlp": 1.01857138, "epoch": 0.44910719653700476, "flos": 18624351628800.0, "grad_norm": 2.326495437943824, "language_loss": 0.82343304, "learning_rate": 2.4236427152772055e-06, "loss": 0.84496337, "num_input_tokens_seen": 80405915, "step": 3735, "time_per_iteration": 2.5284368991851807 }, { "auxiliary_loss_clip": 0.01036357, "auxiliary_loss_mlp": 0.01002037, "balance_loss_clip": 1.01779127, "balance_loss_mlp": 1.00080955, "epoch": 0.4492274394276438, "flos": 57033435749760.0, "grad_norm": 0.828588795770772, "language_loss": 0.57338703, "learning_rate": 2.422881387627557e-06, "loss": 0.59377098, "num_input_tokens_seen": 80458365, "step": 3736, "time_per_iteration": 2.827345371246338 }, { "auxiliary_loss_clip": 0.01159183, "auxiliary_loss_mlp": 0.01024103, "balance_loss_clip": 1.05367064, "balance_loss_mlp": 1.01621163, "epoch": 0.4493476823182829, "flos": 23254888498560.0, "grad_norm": 1.5475513603628974, "language_loss": 0.7744534, "learning_rate": 2.422119995827139e-06, "loss": 0.79628628, "num_input_tokens_seen": 80478490, "step": 3737, "time_per_iteration": 2.501750946044922 }, { "auxiliary_loss_clip": 0.01173609, "auxiliary_loss_mlp": 0.01028821, "balance_loss_clip": 1.05444133, "balance_loss_mlp": 1.02088773, "epoch": 0.44946792520892204, "flos": 15815131827840.0, "grad_norm": 2.839449450650751, "language_loss": 0.74221724, "learning_rate": 2.4213585399914528e-06, "loss": 0.76424158, "num_input_tokens_seen": 80495695, "step": 3738, "time_per_iteration": 2.4529948234558105 }, { "auxiliary_loss_clip": 0.01171071, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 1.05458117, "balance_loss_mlp": 1.02071047, "epoch": 0.4495881680995611, "flos": 19610063631360.0, "grad_norm": 1.738964745627871, "language_loss": 0.85120904, "learning_rate": 2.4205970202360113e-06, "loss": 0.87320769, "num_input_tokens_seen": 80515260, "step": 3739, "time_per_iteration": 2.488502025604248 }, { "auxiliary_loss_clip": 0.01118295, "auxiliary_loss_mlp": 0.01027346, "balance_loss_clip": 1.04790878, "balance_loss_mlp": 1.01862633, "epoch": 0.4497084109902002, "flos": 26031465815040.0, "grad_norm": 1.8509698700314674, "language_loss": 0.77969623, "learning_rate": 2.4198354366763354e-06, "loss": 0.80115259, "num_input_tokens_seen": 80533900, "step": 3740, "time_per_iteration": 2.5950775146484375 }, { "auxiliary_loss_clip": 0.01160249, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.05328619, "balance_loss_mlp": 1.01825058, "epoch": 0.4498286538808393, "flos": 14793688771200.0, "grad_norm": 2.2453774512902474, "language_loss": 0.78749019, "learning_rate": 2.4190737894279587e-06, "loss": 0.80935907, "num_input_tokens_seen": 80551270, "step": 3741, "time_per_iteration": 2.5710997581481934 }, { "auxiliary_loss_clip": 0.01130927, "auxiliary_loss_mlp": 0.01027482, "balance_loss_clip": 1.04537284, "balance_loss_mlp": 1.01971555, "epoch": 0.44994889677147837, "flos": 15450171690240.0, "grad_norm": 2.0451407789837037, "language_loss": 0.80230498, "learning_rate": 2.4183120786064203e-06, "loss": 0.82388908, "num_input_tokens_seen": 80568145, "step": 3742, "time_per_iteration": 2.496428966522217 }, { "auxiliary_loss_clip": 0.01170937, "auxiliary_loss_mlp": 0.00762754, "balance_loss_clip": 1.05686092, "balance_loss_mlp": 1.00051403, "epoch": 0.4500691396621175, "flos": 21798316085760.0, "grad_norm": 2.2590118283325, "language_loss": 0.85631835, "learning_rate": 2.417550304327273e-06, "loss": 0.87565523, "num_input_tokens_seen": 80586185, "step": 3743, "time_per_iteration": 2.535747766494751 }, { "auxiliary_loss_clip": 0.01185947, "auxiliary_loss_mlp": 0.01033036, "balance_loss_clip": 1.05553615, "balance_loss_mlp": 1.02451897, "epoch": 0.4501893825527566, "flos": 32382016421760.0, "grad_norm": 1.6238272100642819, "language_loss": 0.75857115, "learning_rate": 2.4167884667060763e-06, "loss": 0.780761, "num_input_tokens_seen": 80608895, "step": 3744, "time_per_iteration": 2.560973882675171 }, { "auxiliary_loss_clip": 0.011562, "auxiliary_loss_mlp": 0.01031031, "balance_loss_clip": 1.05149508, "balance_loss_mlp": 1.02273417, "epoch": 0.45030962544339564, "flos": 16544944362240.0, "grad_norm": 2.0841093455484585, "language_loss": 0.87233818, "learning_rate": 2.4160265658584e-06, "loss": 0.89421046, "num_input_tokens_seen": 80623785, "step": 3745, "time_per_iteration": 2.479842185974121 }, { "auxiliary_loss_clip": 0.01175097, "auxiliary_loss_mlp": 0.01026984, "balance_loss_clip": 1.05516124, "balance_loss_mlp": 1.01866317, "epoch": 0.45042986833403476, "flos": 19573039687680.0, "grad_norm": 1.9375556610580837, "language_loss": 0.68178022, "learning_rate": 2.4152646018998253e-06, "loss": 0.7038011, "num_input_tokens_seen": 80642735, "step": 3746, "time_per_iteration": 2.4767673015594482 }, { "auxiliary_loss_clip": 0.01167417, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.05326366, "balance_loss_mlp": 1.02888441, "epoch": 0.45055011122467387, "flos": 23112467072640.0, "grad_norm": 2.4059667216935856, "language_loss": 0.71638024, "learning_rate": 2.4145025749459403e-06, "loss": 0.73842341, "num_input_tokens_seen": 80663760, "step": 3747, "time_per_iteration": 2.540201187133789 }, { "auxiliary_loss_clip": 0.01100442, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.04926336, "balance_loss_mlp": 1.02842689, "epoch": 0.4506703541153129, "flos": 19934623946880.0, "grad_norm": 3.1361848550496743, "language_loss": 0.70132393, "learning_rate": 2.413740485112344e-06, "loss": 0.72269857, "num_input_tokens_seen": 80682100, "step": 3748, "time_per_iteration": 2.6768319606781006 }, { "auxiliary_loss_clip": 0.01149807, "auxiliary_loss_mlp": 0.0102574, "balance_loss_clip": 1.05397427, "balance_loss_mlp": 1.01729381, "epoch": 0.45079059700595203, "flos": 19499530504320.0, "grad_norm": 1.6708286229857756, "language_loss": 0.82146597, "learning_rate": 2.412978332514646e-06, "loss": 0.84322149, "num_input_tokens_seen": 80700880, "step": 3749, "time_per_iteration": 3.3277199268341064 }, { "auxiliary_loss_clip": 0.01160279, "auxiliary_loss_mlp": 0.01025385, "balance_loss_clip": 1.05437994, "balance_loss_mlp": 1.01671219, "epoch": 0.4509108398965911, "flos": 27636313570560.0, "grad_norm": 2.5039223583520607, "language_loss": 0.71925116, "learning_rate": 2.4122161172684623e-06, "loss": 0.74110776, "num_input_tokens_seen": 80721675, "step": 3750, "time_per_iteration": 2.653538942337036 }, { "auxiliary_loss_clip": 0.01159929, "auxiliary_loss_mlp": 0.0103592, "balance_loss_clip": 1.05353808, "balance_loss_mlp": 1.02708101, "epoch": 0.4510310827872302, "flos": 20995712640000.0, "grad_norm": 4.125961595740625, "language_loss": 0.83858192, "learning_rate": 2.4114538394894216e-06, "loss": 0.86054045, "num_input_tokens_seen": 80739315, "step": 3751, "time_per_iteration": 2.5357472896575928 }, { "auxiliary_loss_clip": 0.01152362, "auxiliary_loss_mlp": 0.01025031, "balance_loss_clip": 1.04778242, "balance_loss_mlp": 1.01704431, "epoch": 0.4511513256778693, "flos": 16216684945920.0, "grad_norm": 1.9909336664787296, "language_loss": 0.83124518, "learning_rate": 2.410691499293161e-06, "loss": 0.85301912, "num_input_tokens_seen": 80757470, "step": 3752, "time_per_iteration": 3.3600518703460693 }, { "auxiliary_loss_clip": 0.0116877, "auxiliary_loss_mlp": 0.01026002, "balance_loss_clip": 1.05273366, "balance_loss_mlp": 1.01769948, "epoch": 0.45127156856850836, "flos": 25186702780800.0, "grad_norm": 1.586291049664566, "language_loss": 0.74620748, "learning_rate": 2.409929096795326e-06, "loss": 0.76815522, "num_input_tokens_seen": 80777840, "step": 3753, "time_per_iteration": 2.5574533939361572 }, { "auxiliary_loss_clip": 0.01170796, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.05291188, "balance_loss_mlp": 1.02073836, "epoch": 0.4513918114591475, "flos": 20412523422720.0, "grad_norm": 1.8514644660422697, "language_loss": 0.79435682, "learning_rate": 2.409166632111573e-06, "loss": 0.81636047, "num_input_tokens_seen": 80795975, "step": 3754, "time_per_iteration": 2.5205471515655518 }, { "auxiliary_loss_clip": 0.01178116, "auxiliary_loss_mlp": 0.01026187, "balance_loss_clip": 1.05422628, "balance_loss_mlp": 1.01752615, "epoch": 0.4515120543497866, "flos": 26648482665600.0, "grad_norm": 1.8717835396622062, "language_loss": 0.80601835, "learning_rate": 2.4084041053575674e-06, "loss": 0.82806146, "num_input_tokens_seen": 80815395, "step": 3755, "time_per_iteration": 3.294748544692993 }, { "auxiliary_loss_clip": 0.0116126, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 1.05414999, "balance_loss_mlp": 1.01704431, "epoch": 0.45163229724042564, "flos": 20595093275520.0, "grad_norm": 2.241044103456221, "language_loss": 0.7234416, "learning_rate": 2.4076415166489834e-06, "loss": 0.74530923, "num_input_tokens_seen": 80834805, "step": 3756, "time_per_iteration": 2.4837145805358887 }, { "auxiliary_loss_clip": 0.01133074, "auxiliary_loss_mlp": 0.0103185, "balance_loss_clip": 1.05036211, "balance_loss_mlp": 1.02398205, "epoch": 0.45175254013106475, "flos": 21689004021120.0, "grad_norm": 2.168584034022373, "language_loss": 0.78813004, "learning_rate": 2.406878866101506e-06, "loss": 0.80977929, "num_input_tokens_seen": 80853770, "step": 3757, "time_per_iteration": 2.6080996990203857 }, { "auxiliary_loss_clip": 0.0118539, "auxiliary_loss_mlp": 0.01028538, "balance_loss_clip": 1.05763328, "balance_loss_mlp": 1.02081299, "epoch": 0.45187278302170386, "flos": 18878850466560.0, "grad_norm": 1.9961197332170317, "language_loss": 0.78091234, "learning_rate": 2.4061161538308273e-06, "loss": 0.80305159, "num_input_tokens_seen": 80870615, "step": 3758, "time_per_iteration": 3.177151679992676 }, { "auxiliary_loss_clip": 0.0116982, "auxiliary_loss_mlp": 0.01027524, "balance_loss_clip": 1.05452859, "balance_loss_mlp": 1.01926851, "epoch": 0.4519930259123429, "flos": 18582479349120.0, "grad_norm": 1.8890221530485913, "language_loss": 0.88957685, "learning_rate": 2.4053533799526523e-06, "loss": 0.91155028, "num_input_tokens_seen": 80886335, "step": 3759, "time_per_iteration": 2.453939199447632 }, { "auxiliary_loss_clip": 0.01150136, "auxiliary_loss_mlp": 0.01031065, "balance_loss_clip": 1.05201554, "balance_loss_mlp": 1.02288747, "epoch": 0.452113268802982, "flos": 25192377129600.0, "grad_norm": 1.6118747846566377, "language_loss": 0.8601687, "learning_rate": 2.404590544582691e-06, "loss": 0.88198078, "num_input_tokens_seen": 80904570, "step": 3760, "time_per_iteration": 2.5175836086273193 }, { "auxiliary_loss_clip": 0.01129549, "auxiliary_loss_mlp": 0.0103248, "balance_loss_clip": 1.04423451, "balance_loss_mlp": 1.02413559, "epoch": 0.45223351169362114, "flos": 39378922312320.0, "grad_norm": 1.7108712405252762, "language_loss": 0.80917883, "learning_rate": 2.403827647836666e-06, "loss": 0.8307991, "num_input_tokens_seen": 80925125, "step": 3761, "time_per_iteration": 2.7732365131378174 }, { "auxiliary_loss_clip": 0.01185998, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.05456853, "balance_loss_mlp": 1.02126551, "epoch": 0.4523537545842602, "flos": 21582169994880.0, "grad_norm": 1.8969500765232759, "language_loss": 0.69223797, "learning_rate": 2.4030646898303075e-06, "loss": 0.71439648, "num_input_tokens_seen": 80946615, "step": 3762, "time_per_iteration": 2.5373876094818115 }, { "auxiliary_loss_clip": 0.01161509, "auxiliary_loss_mlp": 0.0103565, "balance_loss_clip": 1.05302978, "balance_loss_mlp": 1.02747226, "epoch": 0.4524739974748993, "flos": 28439527547520.0, "grad_norm": 2.2145640261037527, "language_loss": 0.81676447, "learning_rate": 2.4023016706793566e-06, "loss": 0.83873606, "num_input_tokens_seen": 80966410, "step": 3763, "time_per_iteration": 2.604886531829834 }, { "auxiliary_loss_clip": 0.01056349, "auxiliary_loss_mlp": 0.01003428, "balance_loss_clip": 1.01989651, "balance_loss_mlp": 1.00230718, "epoch": 0.4525942403655384, "flos": 61556492148480.0, "grad_norm": 0.7605593372196167, "language_loss": 0.56882799, "learning_rate": 2.401538590499561e-06, "loss": 0.58942574, "num_input_tokens_seen": 81026865, "step": 3764, "time_per_iteration": 3.249357223510742 }, { "auxiliary_loss_clip": 0.01173899, "auxiliary_loss_mlp": 0.00763177, "balance_loss_clip": 1.05513811, "balance_loss_mlp": 1.00044131, "epoch": 0.45271448325617747, "flos": 27529838680320.0, "grad_norm": 2.130759408348567, "language_loss": 0.71776503, "learning_rate": 2.400775449406682e-06, "loss": 0.73713577, "num_input_tokens_seen": 81050060, "step": 3765, "time_per_iteration": 2.6272900104522705 }, { "auxiliary_loss_clip": 0.01169285, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.0516479, "balance_loss_mlp": 1.02304423, "epoch": 0.4528347261468166, "flos": 22452608275200.0, "grad_norm": 1.7924407815249894, "language_loss": 0.72836196, "learning_rate": 2.400012247516485e-06, "loss": 0.75036359, "num_input_tokens_seen": 81070625, "step": 3766, "time_per_iteration": 2.55890154838562 }, { "auxiliary_loss_clip": 0.01144676, "auxiliary_loss_mlp": 0.01028603, "balance_loss_clip": 1.04866433, "balance_loss_mlp": 1.02052093, "epoch": 0.45295496903745563, "flos": 21103875469440.0, "grad_norm": 1.8522468763111193, "language_loss": 0.90215921, "learning_rate": 2.3992489849447484e-06, "loss": 0.92389202, "num_input_tokens_seen": 81089080, "step": 3767, "time_per_iteration": 2.5342369079589844 }, { "auxiliary_loss_clip": 0.01146481, "auxiliary_loss_mlp": 0.01028071, "balance_loss_clip": 1.04963183, "balance_loss_mlp": 1.01989055, "epoch": 0.45307521192809475, "flos": 23221168606080.0, "grad_norm": 1.593612532900772, "language_loss": 0.78877389, "learning_rate": 2.3984856618072584e-06, "loss": 0.8105194, "num_input_tokens_seen": 81109115, "step": 3768, "time_per_iteration": 2.526691198348999 }, { "auxiliary_loss_clip": 0.01147603, "auxiliary_loss_mlp": 0.01032843, "balance_loss_clip": 1.05243802, "balance_loss_mlp": 1.02458751, "epoch": 0.45319545481873386, "flos": 15560094286080.0, "grad_norm": 1.9856900933397736, "language_loss": 0.73791873, "learning_rate": 2.3977222782198098e-06, "loss": 0.75972319, "num_input_tokens_seen": 81127750, "step": 3769, "time_per_iteration": 2.4951794147491455 }, { "auxiliary_loss_clip": 0.01133546, "auxiliary_loss_mlp": 0.010372, "balance_loss_clip": 1.04953289, "balance_loss_mlp": 1.02744913, "epoch": 0.4533156977093729, "flos": 21944759834880.0, "grad_norm": 1.5617537540052147, "language_loss": 0.75110316, "learning_rate": 2.3969588342982077e-06, "loss": 0.77281058, "num_input_tokens_seen": 81147125, "step": 3770, "time_per_iteration": 2.5324506759643555 }, { "auxiliary_loss_clip": 0.01168656, "auxiliary_loss_mlp": 0.01029871, "balance_loss_clip": 1.05515385, "balance_loss_mlp": 1.02136564, "epoch": 0.453435940600012, "flos": 24242180699520.0, "grad_norm": 1.5061330962328552, "language_loss": 0.72658408, "learning_rate": 2.396195330158267e-06, "loss": 0.74856937, "num_input_tokens_seen": 81167015, "step": 3771, "time_per_iteration": 2.49704909324646 }, { "auxiliary_loss_clip": 0.01184709, "auxiliary_loss_mlp": 0.01027536, "balance_loss_clip": 1.05495811, "balance_loss_mlp": 1.01910233, "epoch": 0.45355618349065113, "flos": 23440367352960.0, "grad_norm": 1.6790523680169442, "language_loss": 0.79651469, "learning_rate": 2.3954317659158094e-06, "loss": 0.81863713, "num_input_tokens_seen": 81187350, "step": 3772, "time_per_iteration": 2.465524435043335 }, { "auxiliary_loss_clip": 0.01080716, "auxiliary_loss_mlp": 0.01000878, "balance_loss_clip": 1.01787233, "balance_loss_mlp": 0.99976921, "epoch": 0.4536764263812902, "flos": 66903161448960.0, "grad_norm": 0.8945363325813748, "language_loss": 0.56934851, "learning_rate": 2.394668141686667e-06, "loss": 0.59016442, "num_input_tokens_seen": 81249315, "step": 3773, "time_per_iteration": 3.051337718963623 }, { "auxiliary_loss_clip": 0.0116444, "auxiliary_loss_mlp": 0.01028266, "balance_loss_clip": 1.0500102, "balance_loss_mlp": 1.0203923, "epoch": 0.4537966692719293, "flos": 42739766254080.0, "grad_norm": 2.0690370994660694, "language_loss": 0.69371283, "learning_rate": 2.3939044575866813e-06, "loss": 0.71563989, "num_input_tokens_seen": 81272065, "step": 3774, "time_per_iteration": 2.6522011756896973 }, { "auxiliary_loss_clip": 0.01150021, "auxiliary_loss_mlp": 0.0076308, "balance_loss_clip": 1.04903984, "balance_loss_mlp": 1.0004549, "epoch": 0.4539169121625684, "flos": 35549480517120.0, "grad_norm": 2.1064599192742692, "language_loss": 0.75257522, "learning_rate": 2.3931407137317024e-06, "loss": 0.77170622, "num_input_tokens_seen": 81292220, "step": 3775, "time_per_iteration": 2.6045687198638916 }, { "auxiliary_loss_clip": 0.01138393, "auxiliary_loss_mlp": 0.01032629, "balance_loss_clip": 1.04696679, "balance_loss_mlp": 1.0239501, "epoch": 0.45403715505320746, "flos": 18514716341760.0, "grad_norm": 1.6599063037356312, "language_loss": 0.85206449, "learning_rate": 2.3923769102375907e-06, "loss": 0.87377477, "num_input_tokens_seen": 81311085, "step": 3776, "time_per_iteration": 3.357940912246704 }, { "auxiliary_loss_clip": 0.01141734, "auxiliary_loss_mlp": 0.01034878, "balance_loss_clip": 1.04974818, "balance_loss_mlp": 1.0260148, "epoch": 0.4541573979438466, "flos": 25045825639680.0, "grad_norm": 2.5178914633459057, "language_loss": 0.78348935, "learning_rate": 2.391613047220213e-06, "loss": 0.80525553, "num_input_tokens_seen": 81330985, "step": 3777, "time_per_iteration": 2.5558977127075195 }, { "auxiliary_loss_clip": 0.01133966, "auxiliary_loss_mlp": 0.01026218, "balance_loss_clip": 1.04943693, "balance_loss_mlp": 1.01796913, "epoch": 0.4542776408344857, "flos": 18332397884160.0, "grad_norm": 2.0353926887114646, "language_loss": 0.79293275, "learning_rate": 2.390849124795447e-06, "loss": 0.81453454, "num_input_tokens_seen": 81346985, "step": 3778, "time_per_iteration": 2.5255467891693115 }, { "auxiliary_loss_clip": 0.01185832, "auxiliary_loss_mlp": 0.01027391, "balance_loss_clip": 1.05525661, "balance_loss_mlp": 1.01935673, "epoch": 0.45439788372512474, "flos": 20701173116160.0, "grad_norm": 2.0985837238693246, "language_loss": 0.84202588, "learning_rate": 2.3900851430791804e-06, "loss": 0.86415815, "num_input_tokens_seen": 81365005, "step": 3779, "time_per_iteration": 3.2632970809936523 }, { "auxiliary_loss_clip": 0.01187104, "auxiliary_loss_mlp": 0.01032497, "balance_loss_clip": 1.05380249, "balance_loss_mlp": 1.0232048, "epoch": 0.45451812661576385, "flos": 22309432663680.0, "grad_norm": 2.3253835641828347, "language_loss": 0.84480727, "learning_rate": 2.389321102187307e-06, "loss": 0.86700326, "num_input_tokens_seen": 81383785, "step": 3780, "time_per_iteration": 2.4269118309020996 }, { "auxiliary_loss_clip": 0.0115723, "auxiliary_loss_mlp": 0.00763807, "balance_loss_clip": 1.05281854, "balance_loss_mlp": 1.00042903, "epoch": 0.4546383695064029, "flos": 21763303303680.0, "grad_norm": 1.8859639538980257, "language_loss": 0.8174938, "learning_rate": 2.3885570022357326e-06, "loss": 0.83670413, "num_input_tokens_seen": 81402915, "step": 3781, "time_per_iteration": 3.2148799896240234 }, { "auxiliary_loss_clip": 0.01051458, "auxiliary_loss_mlp": 0.01005437, "balance_loss_clip": 1.01503432, "balance_loss_mlp": 1.00407243, "epoch": 0.454758612397042, "flos": 64242755694720.0, "grad_norm": 0.8152794847847367, "language_loss": 0.60924393, "learning_rate": 2.38779284334037e-06, "loss": 0.6298129, "num_input_tokens_seen": 81467890, "step": 3782, "time_per_iteration": 3.1294891834259033 }, { "auxiliary_loss_clip": 0.01115241, "auxiliary_loss_mlp": 0.01030779, "balance_loss_clip": 1.04538035, "balance_loss_mlp": 1.02230966, "epoch": 0.4548788552876811, "flos": 27304175485440.0, "grad_norm": 1.92047963167719, "language_loss": 0.78968054, "learning_rate": 2.387028625617141e-06, "loss": 0.81114072, "num_input_tokens_seen": 81487105, "step": 3783, "time_per_iteration": 2.592634677886963 }, { "auxiliary_loss_clip": 0.01142262, "auxiliary_loss_mlp": 0.01028999, "balance_loss_clip": 1.04887736, "balance_loss_mlp": 1.02086866, "epoch": 0.4549990981783202, "flos": 22857142222080.0, "grad_norm": 1.8124798147360146, "language_loss": 0.84771532, "learning_rate": 2.3862643491819766e-06, "loss": 0.86942792, "num_input_tokens_seen": 81505670, "step": 3784, "time_per_iteration": 3.275575876235962 }, { "auxiliary_loss_clip": 0.01165969, "auxiliary_loss_mlp": 0.01028451, "balance_loss_clip": 1.04948103, "balance_loss_mlp": 1.02048779, "epoch": 0.4551193410689593, "flos": 23258587599360.0, "grad_norm": 1.7297506885774971, "language_loss": 0.84381652, "learning_rate": 2.3855000141508186e-06, "loss": 0.86576068, "num_input_tokens_seen": 81525825, "step": 3785, "time_per_iteration": 2.4763951301574707 }, { "auxiliary_loss_clip": 0.01161859, "auxiliary_loss_mlp": 0.01031976, "balance_loss_clip": 1.05582595, "balance_loss_mlp": 1.02308285, "epoch": 0.4552395839595984, "flos": 20777519473920.0, "grad_norm": 2.094454743344971, "language_loss": 0.83995044, "learning_rate": 2.3847356206396143e-06, "loss": 0.86188877, "num_input_tokens_seen": 81543135, "step": 3786, "time_per_iteration": 2.498401165008545 }, { "auxiliary_loss_clip": 0.0118489, "auxiliary_loss_mlp": 0.01027389, "balance_loss_clip": 1.0552392, "balance_loss_mlp": 1.01892543, "epoch": 0.45535982685023746, "flos": 23257510191360.0, "grad_norm": 1.5030745684404934, "language_loss": 0.78684652, "learning_rate": 2.3839711687643227e-06, "loss": 0.80896932, "num_input_tokens_seen": 81564360, "step": 3787, "time_per_iteration": 2.4950311183929443 }, { "auxiliary_loss_clip": 0.01171028, "auxiliary_loss_mlp": 0.01030485, "balance_loss_clip": 1.05355573, "balance_loss_mlp": 1.02124071, "epoch": 0.45548006974087657, "flos": 19646117907840.0, "grad_norm": 1.9704479939940027, "language_loss": 0.73939848, "learning_rate": 2.38320665864091e-06, "loss": 0.76141357, "num_input_tokens_seen": 81583710, "step": 3788, "time_per_iteration": 2.4467294216156006 }, { "auxiliary_loss_clip": 0.01114179, "auxiliary_loss_mlp": 0.01027913, "balance_loss_clip": 1.04512429, "balance_loss_mlp": 1.01945496, "epoch": 0.4556003126315157, "flos": 20047778766720.0, "grad_norm": 1.7535612715074247, "language_loss": 0.82016379, "learning_rate": 2.3824420903853516e-06, "loss": 0.84158474, "num_input_tokens_seen": 81602175, "step": 3789, "time_per_iteration": 2.6008527278900146 }, { "auxiliary_loss_clip": 0.01170612, "auxiliary_loss_mlp": 0.01028043, "balance_loss_clip": 1.0553441, "balance_loss_mlp": 1.01972854, "epoch": 0.45572055552215474, "flos": 22959738443520.0, "grad_norm": 2.1186014518042464, "language_loss": 0.81559336, "learning_rate": 2.3816774641136324e-06, "loss": 0.83757997, "num_input_tokens_seen": 81619430, "step": 3790, "time_per_iteration": 2.4550976753234863 }, { "auxiliary_loss_clip": 0.01168393, "auxiliary_loss_mlp": 0.00763052, "balance_loss_clip": 1.05338836, "balance_loss_mlp": 1.00040102, "epoch": 0.45584079841279385, "flos": 33109925535360.0, "grad_norm": 1.8800406877513136, "language_loss": 0.71113884, "learning_rate": 2.380912779941745e-06, "loss": 0.73045325, "num_input_tokens_seen": 81642550, "step": 3791, "time_per_iteration": 2.550929546356201 }, { "auxiliary_loss_clip": 0.01171577, "auxiliary_loss_mlp": 0.01037613, "balance_loss_clip": 1.05014384, "balance_loss_mlp": 1.0277189, "epoch": 0.45596104130343296, "flos": 27272179445760.0, "grad_norm": 1.8494476814431424, "language_loss": 0.82886106, "learning_rate": 2.3801480379856918e-06, "loss": 0.85095298, "num_input_tokens_seen": 81664260, "step": 3792, "time_per_iteration": 2.5273876190185547 }, { "auxiliary_loss_clip": 0.01158897, "auxiliary_loss_mlp": 0.01033136, "balance_loss_clip": 1.05326509, "balance_loss_mlp": 1.02513969, "epoch": 0.456081284194072, "flos": 21579799697280.0, "grad_norm": 1.7286153991026363, "language_loss": 0.83240891, "learning_rate": 2.379383238361484e-06, "loss": 0.85432923, "num_input_tokens_seen": 81683620, "step": 3793, "time_per_iteration": 2.487133741378784 }, { "auxiliary_loss_clip": 0.01166915, "auxiliary_loss_mlp": 0.01029734, "balance_loss_clip": 1.05071235, "balance_loss_mlp": 1.02147329, "epoch": 0.4562015270847111, "flos": 35918822113920.0, "grad_norm": 2.4842672936168753, "language_loss": 0.79355621, "learning_rate": 2.3786183811851407e-06, "loss": 0.81552267, "num_input_tokens_seen": 81704325, "step": 3794, "time_per_iteration": 2.5594379901885986 }, { "auxiliary_loss_clip": 0.01186374, "auxiliary_loss_mlp": 0.01028778, "balance_loss_clip": 1.05726063, "balance_loss_mlp": 1.02073193, "epoch": 0.45632176997535023, "flos": 13589783602560.0, "grad_norm": 1.6789033066313022, "language_loss": 0.80015802, "learning_rate": 2.3778534665726892e-06, "loss": 0.82230949, "num_input_tokens_seen": 81721155, "step": 3795, "time_per_iteration": 2.379777669906616 }, { "auxiliary_loss_clip": 0.01159261, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.05166936, "balance_loss_mlp": 1.0261054, "epoch": 0.4564420128659893, "flos": 32635401937920.0, "grad_norm": 2.786734573177213, "language_loss": 0.72400242, "learning_rate": 2.377088494640168e-06, "loss": 0.74593616, "num_input_tokens_seen": 81742905, "step": 3796, "time_per_iteration": 2.5297863483428955 }, { "auxiliary_loss_clip": 0.01164883, "auxiliary_loss_mlp": 0.0103014, "balance_loss_clip": 1.05331361, "balance_loss_mlp": 1.02206349, "epoch": 0.4565622557566284, "flos": 20377690208640.0, "grad_norm": 4.242723131690308, "language_loss": 0.78136063, "learning_rate": 2.3763234655036216e-06, "loss": 0.80331087, "num_input_tokens_seen": 81762105, "step": 3797, "time_per_iteration": 2.4528684616088867 }, { "auxiliary_loss_clip": 0.01137969, "auxiliary_loss_mlp": 0.01028854, "balance_loss_clip": 1.04526961, "balance_loss_mlp": 1.02058685, "epoch": 0.45668249864726745, "flos": 25374372364800.0, "grad_norm": 2.0709775984358423, "language_loss": 0.86847389, "learning_rate": 2.3755583792791046e-06, "loss": 0.89014214, "num_input_tokens_seen": 81781975, "step": 3798, "time_per_iteration": 2.539468288421631 }, { "auxiliary_loss_clip": 0.01168977, "auxiliary_loss_mlp": 0.01024162, "balance_loss_clip": 1.05152678, "balance_loss_mlp": 1.01635385, "epoch": 0.45680274153790656, "flos": 15559806977280.0, "grad_norm": 2.146732562241273, "language_loss": 0.74601626, "learning_rate": 2.3747932360826803e-06, "loss": 0.76794761, "num_input_tokens_seen": 81798905, "step": 3799, "time_per_iteration": 2.445308208465576 }, { "auxiliary_loss_clip": 0.01169416, "auxiliary_loss_mlp": 0.01029373, "balance_loss_clip": 1.05378258, "balance_loss_mlp": 1.02067089, "epoch": 0.4569229844285457, "flos": 19792884879360.0, "grad_norm": 3.0735226956985437, "language_loss": 0.8221522, "learning_rate": 2.3740280360304205e-06, "loss": 0.84414005, "num_input_tokens_seen": 81816630, "step": 3800, "time_per_iteration": 2.4315550327301025 }, { "auxiliary_loss_clip": 0.01140648, "auxiliary_loss_mlp": 0.01029107, "balance_loss_clip": 1.05232584, "balance_loss_mlp": 1.02033925, "epoch": 0.45704322731918473, "flos": 24093941270400.0, "grad_norm": 1.665226475130565, "language_loss": 0.68208665, "learning_rate": 2.3732627792384038e-06, "loss": 0.70378417, "num_input_tokens_seen": 81837700, "step": 3801, "time_per_iteration": 2.583601713180542 }, { "auxiliary_loss_clip": 0.01184084, "auxiliary_loss_mlp": 0.01027911, "balance_loss_clip": 1.05372906, "balance_loss_mlp": 1.01939917, "epoch": 0.45716347020982384, "flos": 31317803245440.0, "grad_norm": 3.0154721142397674, "language_loss": 0.75556082, "learning_rate": 2.3724974658227207e-06, "loss": 0.77768075, "num_input_tokens_seen": 81858490, "step": 3802, "time_per_iteration": 2.497620105743408 }, { "auxiliary_loss_clip": 0.01154197, "auxiliary_loss_mlp": 0.00763119, "balance_loss_clip": 1.05299437, "balance_loss_mlp": 1.0003581, "epoch": 0.45728371310046295, "flos": 26501392471680.0, "grad_norm": 2.28540710772405, "language_loss": 0.71281683, "learning_rate": 2.3717320958994687e-06, "loss": 0.73199004, "num_input_tokens_seen": 81876050, "step": 3803, "time_per_iteration": 3.362089157104492 }, { "auxiliary_loss_clip": 0.01137497, "auxiliary_loss_mlp": 0.01026956, "balance_loss_clip": 1.04297101, "balance_loss_mlp": 1.0193181, "epoch": 0.457403955991102, "flos": 17929408222080.0, "grad_norm": 2.0194345487736043, "language_loss": 0.70159721, "learning_rate": 2.3709666695847534e-06, "loss": 0.72324181, "num_input_tokens_seen": 81894230, "step": 3804, "time_per_iteration": 2.48596453666687 }, { "auxiliary_loss_clip": 0.01118354, "auxiliary_loss_mlp": 0.01028125, "balance_loss_clip": 1.04534924, "balance_loss_mlp": 1.02048957, "epoch": 0.4575241988817411, "flos": 42230660837760.0, "grad_norm": 1.5583618484445732, "language_loss": 0.69771802, "learning_rate": 2.370201186994689e-06, "loss": 0.71918273, "num_input_tokens_seen": 81917915, "step": 3805, "time_per_iteration": 2.735992670059204 }, { "auxiliary_loss_clip": 0.01146528, "auxiliary_loss_mlp": 0.01026787, "balance_loss_clip": 1.05204535, "balance_loss_mlp": 1.01884794, "epoch": 0.45764444177238023, "flos": 30117309868800.0, "grad_norm": 2.056254989275629, "language_loss": 0.69927001, "learning_rate": 2.369435648245399e-06, "loss": 0.72100317, "num_input_tokens_seen": 81938130, "step": 3806, "time_per_iteration": 3.4126553535461426 }, { "auxiliary_loss_clip": 0.01155552, "auxiliary_loss_mlp": 0.01038305, "balance_loss_clip": 1.05150104, "balance_loss_mlp": 1.02960253, "epoch": 0.4577646846630193, "flos": 24060293205120.0, "grad_norm": 1.679726526005014, "language_loss": 0.84967977, "learning_rate": 2.368670053453015e-06, "loss": 0.87161839, "num_input_tokens_seen": 81959820, "step": 3807, "time_per_iteration": 2.535785436630249 }, { "auxiliary_loss_clip": 0.01178461, "auxiliary_loss_mlp": 0.0102995, "balance_loss_clip": 1.05724096, "balance_loss_mlp": 1.02100909, "epoch": 0.4578849275536584, "flos": 17418578952960.0, "grad_norm": 2.164443176570588, "language_loss": 0.7445218, "learning_rate": 2.3679044027336757e-06, "loss": 0.76660591, "num_input_tokens_seen": 81975710, "step": 3808, "time_per_iteration": 3.157886505126953 }, { "auxiliary_loss_clip": 0.01183969, "auxiliary_loss_mlp": 0.01028808, "balance_loss_clip": 1.05366921, "balance_loss_mlp": 1.01993835, "epoch": 0.4580051704442975, "flos": 13510169107200.0, "grad_norm": 4.034110780753551, "language_loss": 0.69298583, "learning_rate": 2.3671386962035326e-06, "loss": 0.71511364, "num_input_tokens_seen": 81993180, "step": 3809, "time_per_iteration": 2.4340832233428955 }, { "auxiliary_loss_clip": 0.01169907, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.05239487, "balance_loss_mlp": 1.02232683, "epoch": 0.45812541333493656, "flos": 18037606965120.0, "grad_norm": 1.9833936227468862, "language_loss": 0.68344605, "learning_rate": 2.3663729339787405e-06, "loss": 0.70545149, "num_input_tokens_seen": 82010115, "step": 3810, "time_per_iteration": 2.450411558151245 }, { "auxiliary_loss_clip": 0.0118401, "auxiliary_loss_mlp": 0.01027722, "balance_loss_clip": 1.05341601, "balance_loss_mlp": 1.01924682, "epoch": 0.45824565622557567, "flos": 20222196232320.0, "grad_norm": 2.343059868702884, "language_loss": 0.7365219, "learning_rate": 2.365607116175466e-06, "loss": 0.75863922, "num_input_tokens_seen": 82025540, "step": 3811, "time_per_iteration": 3.1849732398986816 }, { "auxiliary_loss_clip": 0.01181244, "auxiliary_loss_mlp": 0.01025431, "balance_loss_clip": 1.05337, "balance_loss_mlp": 1.01740277, "epoch": 0.4583658991162148, "flos": 19864885691520.0, "grad_norm": 2.6216325182817033, "language_loss": 0.67094958, "learning_rate": 2.3648412429098825e-06, "loss": 0.69301629, "num_input_tokens_seen": 82043890, "step": 3812, "time_per_iteration": 2.469998359680176 }, { "auxiliary_loss_clip": 0.0113707, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.0495019, "balance_loss_mlp": 1.02555776, "epoch": 0.45848614200685384, "flos": 21029935322880.0, "grad_norm": 2.107825383099602, "language_loss": 0.82321841, "learning_rate": 2.364075314298172e-06, "loss": 0.84493411, "num_input_tokens_seen": 82061345, "step": 3813, "time_per_iteration": 2.5379202365875244 }, { "auxiliary_loss_clip": 0.0117526, "auxiliary_loss_mlp": 0.00763366, "balance_loss_clip": 1.05519104, "balance_loss_mlp": 1.0002799, "epoch": 0.45860638489749295, "flos": 21069293650560.0, "grad_norm": 2.1540155947863795, "language_loss": 0.70433772, "learning_rate": 2.3633093304565267e-06, "loss": 0.72372401, "num_input_tokens_seen": 82080400, "step": 3814, "time_per_iteration": 2.4544529914855957 }, { "auxiliary_loss_clip": 0.01188547, "auxiliary_loss_mlp": 0.01028813, "balance_loss_clip": 1.05673897, "balance_loss_mlp": 1.02019429, "epoch": 0.458726627788132, "flos": 26833889692800.0, "grad_norm": 1.9495933503349816, "language_loss": 0.63341087, "learning_rate": 2.3625432915011443e-06, "loss": 0.65558445, "num_input_tokens_seen": 82102310, "step": 3815, "time_per_iteration": 2.4853720664978027 }, { "auxiliary_loss_clip": 0.01150028, "auxiliary_loss_mlp": 0.01034011, "balance_loss_clip": 1.05091357, "balance_loss_mlp": 1.02560687, "epoch": 0.4588468706787711, "flos": 24097927680000.0, "grad_norm": 1.6675263893726129, "language_loss": 0.65215021, "learning_rate": 2.3617771975482334e-06, "loss": 0.67399061, "num_input_tokens_seen": 82121140, "step": 3816, "time_per_iteration": 2.496014356613159 }, { "auxiliary_loss_clip": 0.01120809, "auxiliary_loss_mlp": 0.0102628, "balance_loss_clip": 1.04783297, "balance_loss_mlp": 1.01863861, "epoch": 0.4589671135694102, "flos": 17889331622400.0, "grad_norm": 1.658072832036926, "language_loss": 0.74629033, "learning_rate": 2.3610110487140083e-06, "loss": 0.76776111, "num_input_tokens_seen": 82139575, "step": 3817, "time_per_iteration": 2.5430617332458496 }, { "auxiliary_loss_clip": 0.01156699, "auxiliary_loss_mlp": 0.01033461, "balance_loss_clip": 1.0539124, "balance_loss_mlp": 1.02509832, "epoch": 0.4590873564600493, "flos": 25626967781760.0, "grad_norm": 1.794561486391742, "language_loss": 0.8051284, "learning_rate": 2.360244845114695e-06, "loss": 0.82702994, "num_input_tokens_seen": 82159195, "step": 3818, "time_per_iteration": 2.528165578842163 }, { "auxiliary_loss_clip": 0.01150289, "auxiliary_loss_mlp": 0.01028064, "balance_loss_clip": 1.05344081, "balance_loss_mlp": 1.01942766, "epoch": 0.4592075993506884, "flos": 18514788168960.0, "grad_norm": 2.0803525924065585, "language_loss": 0.68684983, "learning_rate": 2.3594785868665245e-06, "loss": 0.7086333, "num_input_tokens_seen": 82175500, "step": 3819, "time_per_iteration": 2.473207712173462 }, { "auxiliary_loss_clip": 0.01143613, "auxiliary_loss_mlp": 0.00763172, "balance_loss_clip": 1.05111611, "balance_loss_mlp": 1.00028408, "epoch": 0.4593278422413275, "flos": 20631111638400.0, "grad_norm": 2.3835872762282255, "language_loss": 0.80651808, "learning_rate": 2.3587122740857386e-06, "loss": 0.82558596, "num_input_tokens_seen": 82192600, "step": 3820, "time_per_iteration": 2.5180118083953857 }, { "auxiliary_loss_clip": 0.01165594, "auxiliary_loss_mlp": 0.0102739, "balance_loss_clip": 1.0503968, "balance_loss_mlp": 1.01959991, "epoch": 0.45944808513196655, "flos": 21358517961600.0, "grad_norm": 1.7674697716118215, "language_loss": 0.77810109, "learning_rate": 2.357945906888586e-06, "loss": 0.80003089, "num_input_tokens_seen": 82212040, "step": 3821, "time_per_iteration": 2.4755611419677734 }, { "auxiliary_loss_clip": 0.01172685, "auxiliary_loss_mlp": 0.01032657, "balance_loss_clip": 1.05465913, "balance_loss_mlp": 1.02368617, "epoch": 0.45956832802260567, "flos": 21427789340160.0, "grad_norm": 6.857630860590178, "language_loss": 0.79621851, "learning_rate": 2.357179485391324e-06, "loss": 0.81827199, "num_input_tokens_seen": 82229895, "step": 3822, "time_per_iteration": 2.4633724689483643 }, { "auxiliary_loss_clip": 0.01184048, "auxiliary_loss_mlp": 0.0102639, "balance_loss_clip": 1.05670071, "balance_loss_mlp": 1.01808143, "epoch": 0.4596885709132448, "flos": 22382654538240.0, "grad_norm": 3.3939572568218463, "language_loss": 0.86541921, "learning_rate": 2.3564130097102173e-06, "loss": 0.88752359, "num_input_tokens_seen": 82249550, "step": 3823, "time_per_iteration": 2.431030750274658 }, { "auxiliary_loss_clip": 0.01148772, "auxiliary_loss_mlp": 0.01026805, "balance_loss_clip": 1.05376208, "balance_loss_mlp": 1.01805544, "epoch": 0.45980881380388383, "flos": 28981957806720.0, "grad_norm": 1.753479658580137, "language_loss": 0.75157654, "learning_rate": 2.355646479961541e-06, "loss": 0.77333236, "num_input_tokens_seen": 82268860, "step": 3824, "time_per_iteration": 2.6220083236694336 }, { "auxiliary_loss_clip": 0.01182269, "auxiliary_loss_mlp": 0.01026609, "balance_loss_clip": 1.05386233, "balance_loss_mlp": 1.01755476, "epoch": 0.45992905669452294, "flos": 33396599980800.0, "grad_norm": 2.1645082455652007, "language_loss": 0.71528888, "learning_rate": 2.354879896261576e-06, "loss": 0.73737764, "num_input_tokens_seen": 82289070, "step": 3825, "time_per_iteration": 2.5155882835388184 }, { "auxiliary_loss_clip": 0.01138515, "auxiliary_loss_mlp": 0.01030489, "balance_loss_clip": 1.05234957, "balance_loss_mlp": 1.02254975, "epoch": 0.46004929958516205, "flos": 36318184502400.0, "grad_norm": 1.8018786347648352, "language_loss": 0.56658053, "learning_rate": 2.3541132587266133e-06, "loss": 0.58827055, "num_input_tokens_seen": 82311790, "step": 3826, "time_per_iteration": 2.650313377380371 }, { "auxiliary_loss_clip": 0.01147093, "auxiliary_loss_mlp": 0.01026609, "balance_loss_clip": 1.0502224, "balance_loss_mlp": 1.01831234, "epoch": 0.4601695424758011, "flos": 17238451224960.0, "grad_norm": 1.9524804197617511, "language_loss": 0.69235218, "learning_rate": 2.3533465674729515e-06, "loss": 0.71408916, "num_input_tokens_seen": 82329020, "step": 3827, "time_per_iteration": 2.5046355724334717 }, { "auxiliary_loss_clip": 0.01184545, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.05534017, "balance_loss_mlp": 1.0213778, "epoch": 0.4602897853664402, "flos": 15888425529600.0, "grad_norm": 1.9207453384285458, "language_loss": 0.72819066, "learning_rate": 2.352579822616895e-06, "loss": 0.75033784, "num_input_tokens_seen": 82346455, "step": 3828, "time_per_iteration": 2.4058139324188232 }, { "auxiliary_loss_clip": 0.01157537, "auxiliary_loss_mlp": 0.01026007, "balance_loss_clip": 1.05190682, "balance_loss_mlp": 1.01799881, "epoch": 0.4604100282570793, "flos": 25412617370880.0, "grad_norm": 1.68594507157666, "language_loss": 0.77542496, "learning_rate": 2.351813024274761e-06, "loss": 0.7972604, "num_input_tokens_seen": 82367810, "step": 3829, "time_per_iteration": 3.362781286239624 }, { "auxiliary_loss_clip": 0.01145407, "auxiliary_loss_mlp": 0.01031358, "balance_loss_clip": 1.05089378, "balance_loss_mlp": 1.02305484, "epoch": 0.4605302711477184, "flos": 27630711048960.0, "grad_norm": 1.7564613473035389, "language_loss": 0.73894072, "learning_rate": 2.3510461725628693e-06, "loss": 0.76070833, "num_input_tokens_seen": 82388275, "step": 3830, "time_per_iteration": 2.5806941986083984 }, { "auxiliary_loss_clip": 0.01143263, "auxiliary_loss_mlp": 0.01028616, "balance_loss_clip": 1.05027628, "balance_loss_mlp": 1.02035475, "epoch": 0.4606505140383575, "flos": 23839657914240.0, "grad_norm": 1.7831035193181826, "language_loss": 0.71025854, "learning_rate": 2.350279267597554e-06, "loss": 0.73197734, "num_input_tokens_seen": 82408915, "step": 3831, "time_per_iteration": 2.596791982650757 }, { "auxiliary_loss_clip": 0.01171302, "auxiliary_loss_mlp": 0.01030528, "balance_loss_clip": 1.05531764, "balance_loss_mlp": 1.02201009, "epoch": 0.46077075692899655, "flos": 16107013745280.0, "grad_norm": 2.1336997328389207, "language_loss": 0.82908005, "learning_rate": 2.3495123094951515e-06, "loss": 0.8510983, "num_input_tokens_seen": 82427260, "step": 3832, "time_per_iteration": 2.4435081481933594 }, { "auxiliary_loss_clip": 0.01148236, "auxiliary_loss_mlp": 0.01023938, "balance_loss_clip": 1.05136442, "balance_loss_mlp": 1.01548648, "epoch": 0.46089099981963566, "flos": 48798147634560.0, "grad_norm": 1.9922786381061162, "language_loss": 0.75751519, "learning_rate": 2.34874529837201e-06, "loss": 0.77923691, "num_input_tokens_seen": 82450805, "step": 3833, "time_per_iteration": 3.5840744972229004 }, { "auxiliary_loss_clip": 0.01107561, "auxiliary_loss_mlp": 0.01022866, "balance_loss_clip": 1.04419804, "balance_loss_mlp": 1.01472449, "epoch": 0.46101124271027477, "flos": 19099234362240.0, "grad_norm": 2.171825253013662, "language_loss": 0.79380012, "learning_rate": 2.347978234344483e-06, "loss": 0.81510437, "num_input_tokens_seen": 82467010, "step": 3834, "time_per_iteration": 2.5585708618164062 }, { "auxiliary_loss_clip": 0.01173899, "auxiliary_loss_mlp": 0.0103647, "balance_loss_clip": 1.0553906, "balance_loss_mlp": 1.02728498, "epoch": 0.4611314856009138, "flos": 39347931853440.0, "grad_norm": 1.8323993137472896, "language_loss": 0.69013137, "learning_rate": 2.347211117528935e-06, "loss": 0.71223503, "num_input_tokens_seen": 82489310, "step": 3835, "time_per_iteration": 3.407909393310547 }, { "auxiliary_loss_clip": 0.01150674, "auxiliary_loss_mlp": 0.01030206, "balance_loss_clip": 1.05464113, "balance_loss_mlp": 1.02174199, "epoch": 0.46125172849155294, "flos": 20810772489600.0, "grad_norm": 1.5469928731392724, "language_loss": 0.71623951, "learning_rate": 2.3464439480417374e-06, "loss": 0.73804832, "num_input_tokens_seen": 82508830, "step": 3836, "time_per_iteration": 2.569108724594116 }, { "auxiliary_loss_clip": 0.01171436, "auxiliary_loss_mlp": 0.01032622, "balance_loss_clip": 1.0537746, "balance_loss_mlp": 1.0240624, "epoch": 0.46137197138219205, "flos": 17930808852480.0, "grad_norm": 2.350554078955025, "language_loss": 0.77454567, "learning_rate": 2.3456767259992676e-06, "loss": 0.79658628, "num_input_tokens_seen": 82526475, "step": 3837, "time_per_iteration": 2.458272933959961 }, { "auxiliary_loss_clip": 0.0118257, "auxiliary_loss_mlp": 0.00763591, "balance_loss_clip": 1.05277205, "balance_loss_mlp": 1.00021482, "epoch": 0.4614922142728311, "flos": 16836610798080.0, "grad_norm": 2.6381895017292294, "language_loss": 0.88564372, "learning_rate": 2.3449094515179135e-06, "loss": 0.90510535, "num_input_tokens_seen": 82543935, "step": 3838, "time_per_iteration": 3.176616907119751 }, { "auxiliary_loss_clip": 0.01160292, "auxiliary_loss_mlp": 0.01027995, "balance_loss_clip": 1.05135155, "balance_loss_mlp": 1.01922071, "epoch": 0.4616124571634702, "flos": 26614906427520.0, "grad_norm": 1.6624241672844116, "language_loss": 0.81702513, "learning_rate": 2.34414212471407e-06, "loss": 0.83890796, "num_input_tokens_seen": 82563730, "step": 3839, "time_per_iteration": 2.5248005390167236 }, { "auxiliary_loss_clip": 0.01176128, "auxiliary_loss_mlp": 0.01026621, "balance_loss_clip": 1.05352831, "balance_loss_mlp": 1.01817524, "epoch": 0.4617327000541093, "flos": 20340127560960.0, "grad_norm": 1.939585706846273, "language_loss": 0.72500026, "learning_rate": 2.3433747457041394e-06, "loss": 0.74702775, "num_input_tokens_seen": 82582435, "step": 3840, "time_per_iteration": 2.439389228820801 }, { "auxiliary_loss_clip": 0.01143336, "auxiliary_loss_mlp": 0.01030357, "balance_loss_clip": 1.05171943, "balance_loss_mlp": 1.02132726, "epoch": 0.4618529429447484, "flos": 29570749545600.0, "grad_norm": 1.9176713613126162, "language_loss": 0.84948707, "learning_rate": 2.342607314604533e-06, "loss": 0.87122405, "num_input_tokens_seen": 82602185, "step": 3841, "time_per_iteration": 2.584968090057373 }, { "auxiliary_loss_clip": 0.01172325, "auxiliary_loss_mlp": 0.01031801, "balance_loss_clip": 1.05704713, "balance_loss_mlp": 1.02339697, "epoch": 0.4619731858353875, "flos": 19787030962560.0, "grad_norm": 1.8793781865567267, "language_loss": 0.83882046, "learning_rate": 2.3418398315316694e-06, "loss": 0.86086166, "num_input_tokens_seen": 82620005, "step": 3842, "time_per_iteration": 2.4580259323120117 }, { "auxiliary_loss_clip": 0.01185784, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.05766439, "balance_loss_mlp": 1.03060961, "epoch": 0.4620934287260266, "flos": 18951138587520.0, "grad_norm": 2.2831325224672594, "language_loss": 0.77869642, "learning_rate": 2.3410722966019755e-06, "loss": 0.8009457, "num_input_tokens_seen": 82635120, "step": 3843, "time_per_iteration": 2.396442174911499 }, { "auxiliary_loss_clip": 0.01168172, "auxiliary_loss_mlp": 0.0102786, "balance_loss_clip": 1.0526011, "balance_loss_mlp": 1.01926446, "epoch": 0.46221367161666566, "flos": 37341674634240.0, "grad_norm": 1.85605789797141, "language_loss": 0.65926111, "learning_rate": 2.3403047099318848e-06, "loss": 0.68122143, "num_input_tokens_seen": 82659190, "step": 3844, "time_per_iteration": 2.5998647212982178 }, { "auxiliary_loss_clip": 0.01122707, "auxiliary_loss_mlp": 0.01024872, "balance_loss_clip": 1.0470562, "balance_loss_mlp": 1.01669991, "epoch": 0.46233391450730477, "flos": 14428549065600.0, "grad_norm": 2.099728382360556, "language_loss": 0.75131309, "learning_rate": 2.3395370716378405e-06, "loss": 0.77278888, "num_input_tokens_seen": 82676635, "step": 3845, "time_per_iteration": 2.537075996398926 }, { "auxiliary_loss_clip": 0.01173542, "auxiliary_loss_mlp": 0.01032499, "balance_loss_clip": 1.05349278, "balance_loss_mlp": 1.02434552, "epoch": 0.4624541573979438, "flos": 22493044010880.0, "grad_norm": 2.342088528436429, "language_loss": 0.72600794, "learning_rate": 2.338769381836292e-06, "loss": 0.74806833, "num_input_tokens_seen": 82696245, "step": 3846, "time_per_iteration": 2.4618353843688965 }, { "auxiliary_loss_clip": 0.01140005, "auxiliary_loss_mlp": 0.01034251, "balance_loss_clip": 1.05322385, "balance_loss_mlp": 1.02590084, "epoch": 0.46257440028858293, "flos": 14465070218880.0, "grad_norm": 2.3751685649516583, "language_loss": 0.73069572, "learning_rate": 2.3380016406436984e-06, "loss": 0.75243831, "num_input_tokens_seen": 82713725, "step": 3847, "time_per_iteration": 2.506568193435669 }, { "auxiliary_loss_clip": 0.01129059, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.05363083, "balance_loss_mlp": 1.02552223, "epoch": 0.46269464317922204, "flos": 23332204523520.0, "grad_norm": 2.1321927862599144, "language_loss": 0.8112278, "learning_rate": 2.337233848176524e-06, "loss": 0.8328656, "num_input_tokens_seen": 82731495, "step": 3848, "time_per_iteration": 2.5549581050872803 }, { "auxiliary_loss_clip": 0.01121183, "auxiliary_loss_mlp": 0.01028705, "balance_loss_clip": 1.04880321, "balance_loss_mlp": 1.02044988, "epoch": 0.4628148860698611, "flos": 18552027594240.0, "grad_norm": 2.0328690964437954, "language_loss": 0.8365525, "learning_rate": 2.3364660045512435e-06, "loss": 0.85805136, "num_input_tokens_seen": 82750255, "step": 3849, "time_per_iteration": 2.545180082321167 }, { "auxiliary_loss_clip": 0.01062477, "auxiliary_loss_mlp": 0.01003105, "balance_loss_clip": 1.02090359, "balance_loss_mlp": 1.002038, "epoch": 0.4629351289605002, "flos": 70667569670400.0, "grad_norm": 1.0325058665732432, "language_loss": 0.58241451, "learning_rate": 2.335698109884337e-06, "loss": 0.60307026, "num_input_tokens_seen": 82815460, "step": 3850, "time_per_iteration": 3.229626178741455 }, { "auxiliary_loss_clip": 0.01050997, "auxiliary_loss_mlp": 0.01003792, "balance_loss_clip": 1.03229666, "balance_loss_mlp": 1.00217104, "epoch": 0.4630553718511393, "flos": 59687200465920.0, "grad_norm": 0.7892037359808743, "language_loss": 0.59842908, "learning_rate": 2.334930164292294e-06, "loss": 0.61897695, "num_input_tokens_seen": 82878010, "step": 3851, "time_per_iteration": 3.260227918624878 }, { "auxiliary_loss_clip": 0.01120033, "auxiliary_loss_mlp": 0.01028735, "balance_loss_clip": 1.04607761, "balance_loss_mlp": 1.02115333, "epoch": 0.4631756147417784, "flos": 15960605909760.0, "grad_norm": 2.0209326958860667, "language_loss": 0.79903626, "learning_rate": 2.334162167891612e-06, "loss": 0.82052398, "num_input_tokens_seen": 82895275, "step": 3852, "time_per_iteration": 2.555870294570923 }, { "auxiliary_loss_clip": 0.01157046, "auxiliary_loss_mlp": 0.0103239, "balance_loss_clip": 1.05050707, "balance_loss_mlp": 1.02332997, "epoch": 0.4632958576324175, "flos": 16472907636480.0, "grad_norm": 2.4383968191116066, "language_loss": 0.75190365, "learning_rate": 2.333394120798795e-06, "loss": 0.77379799, "num_input_tokens_seen": 82914010, "step": 3853, "time_per_iteration": 2.510403871536255 }, { "auxiliary_loss_clip": 0.01153787, "auxiliary_loss_mlp": 0.01023884, "balance_loss_clip": 1.0492934, "balance_loss_mlp": 1.01502132, "epoch": 0.4634161005230566, "flos": 22346492520960.0, "grad_norm": 2.247237940543874, "language_loss": 0.71964312, "learning_rate": 2.3326260231303545e-06, "loss": 0.74141979, "num_input_tokens_seen": 82932610, "step": 3854, "time_per_iteration": 2.5033962726593018 }, { "auxiliary_loss_clip": 0.01182588, "auxiliary_loss_mlp": 0.01025743, "balance_loss_clip": 1.05686522, "balance_loss_mlp": 1.01766622, "epoch": 0.46353634341369565, "flos": 15742233175680.0, "grad_norm": 1.6235630507052292, "language_loss": 0.86317807, "learning_rate": 2.331857875002811e-06, "loss": 0.88526136, "num_input_tokens_seen": 82951210, "step": 3855, "time_per_iteration": 2.417605400085449 }, { "auxiliary_loss_clip": 0.01157342, "auxiliary_loss_mlp": 0.01032131, "balance_loss_clip": 1.05550969, "balance_loss_mlp": 1.02400088, "epoch": 0.46365658630433476, "flos": 28329820433280.0, "grad_norm": 1.6870892582817414, "language_loss": 0.75995284, "learning_rate": 2.3310896765326916e-06, "loss": 0.7818476, "num_input_tokens_seen": 82972210, "step": 3856, "time_per_iteration": 3.3949105739593506 }, { "auxiliary_loss_clip": 0.01138665, "auxiliary_loss_mlp": 0.01033063, "balance_loss_clip": 1.05045128, "balance_loss_mlp": 1.02427781, "epoch": 0.46377682919497387, "flos": 24608074590720.0, "grad_norm": 1.5815510143294145, "language_loss": 0.84264368, "learning_rate": 2.330321427836531e-06, "loss": 0.86436093, "num_input_tokens_seen": 82994080, "step": 3857, "time_per_iteration": 2.5600061416625977 }, { "auxiliary_loss_clip": 0.01165121, "auxiliary_loss_mlp": 0.01026022, "balance_loss_clip": 1.05233979, "balance_loss_mlp": 1.01702785, "epoch": 0.4638970720856129, "flos": 19060953442560.0, "grad_norm": 1.605440585062139, "language_loss": 0.82842326, "learning_rate": 2.3295531290308733e-06, "loss": 0.8503347, "num_input_tokens_seen": 83012230, "step": 3858, "time_per_iteration": 2.44889760017395 }, { "auxiliary_loss_clip": 0.01185408, "auxiliary_loss_mlp": 0.00763224, "balance_loss_clip": 1.05544567, "balance_loss_mlp": 1.00024676, "epoch": 0.46401731497625204, "flos": 18471012468480.0, "grad_norm": 2.515361157905502, "language_loss": 0.75979018, "learning_rate": 2.3287847802322678e-06, "loss": 0.77927655, "num_input_tokens_seen": 83027800, "step": 3859, "time_per_iteration": 2.388307571411133 }, { "auxiliary_loss_clip": 0.01164749, "auxiliary_loss_mlp": 0.01026611, "balance_loss_clip": 1.05549955, "balance_loss_mlp": 1.01785541, "epoch": 0.4641375578668911, "flos": 26067053214720.0, "grad_norm": 1.730979245222694, "language_loss": 0.83645052, "learning_rate": 2.3280163815572723e-06, "loss": 0.85836411, "num_input_tokens_seen": 83048395, "step": 3860, "time_per_iteration": 3.386744499206543 }, { "auxiliary_loss_clip": 0.01146104, "auxiliary_loss_mlp": 0.01026458, "balance_loss_clip": 1.04920149, "balance_loss_mlp": 1.01835442, "epoch": 0.4642578007575302, "flos": 19570382081280.0, "grad_norm": 2.2315802115092533, "language_loss": 0.76647878, "learning_rate": 2.3272479331224522e-06, "loss": 0.78820431, "num_input_tokens_seen": 83065825, "step": 3861, "time_per_iteration": 2.4834401607513428 }, { "auxiliary_loss_clip": 0.01185677, "auxiliary_loss_mlp": 0.01025621, "balance_loss_clip": 1.05557144, "balance_loss_mlp": 1.01763952, "epoch": 0.4643780436481693, "flos": 28186249772160.0, "grad_norm": 1.659882677175064, "language_loss": 0.77978754, "learning_rate": 2.3264794350443817e-06, "loss": 0.80190051, "num_input_tokens_seen": 83087920, "step": 3862, "time_per_iteration": 3.3304646015167236 }, { "auxiliary_loss_clip": 0.01170686, "auxiliary_loss_mlp": 0.01024434, "balance_loss_clip": 1.05011213, "balance_loss_mlp": 1.01621449, "epoch": 0.46449828653880837, "flos": 25375270204800.0, "grad_norm": 1.8237793170602534, "language_loss": 0.78528023, "learning_rate": 2.3257108874396396e-06, "loss": 0.80723143, "num_input_tokens_seen": 83109015, "step": 3863, "time_per_iteration": 2.516768217086792 }, { "auxiliary_loss_clip": 0.01155008, "auxiliary_loss_mlp": 0.01037081, "balance_loss_clip": 1.0512743, "balance_loss_mlp": 1.02834868, "epoch": 0.4646185294294475, "flos": 16034330574720.0, "grad_norm": 1.9480766458141583, "language_loss": 0.73751605, "learning_rate": 2.3249422904248152e-06, "loss": 0.75943691, "num_input_tokens_seen": 83127450, "step": 3864, "time_per_iteration": 3.2613096237182617 }, { "auxiliary_loss_clip": 0.01171798, "auxiliary_loss_mlp": 0.01028525, "balance_loss_clip": 1.0532186, "balance_loss_mlp": 1.02071977, "epoch": 0.4647387723200866, "flos": 26363101109760.0, "grad_norm": 1.4045701801822494, "language_loss": 0.86915839, "learning_rate": 2.324173644116504e-06, "loss": 0.89116168, "num_input_tokens_seen": 83150300, "step": 3865, "time_per_iteration": 2.5376906394958496 }, { "auxiliary_loss_clip": 0.01165689, "auxiliary_loss_mlp": 0.01026539, "balance_loss_clip": 1.0545218, "balance_loss_mlp": 1.01884961, "epoch": 0.46485901521072565, "flos": 27160209774720.0, "grad_norm": 2.0221487893706342, "language_loss": 0.81538665, "learning_rate": 2.3234049486313087e-06, "loss": 0.83730888, "num_input_tokens_seen": 83171750, "step": 3866, "time_per_iteration": 2.511503219604492 }, { "auxiliary_loss_clip": 0.01166536, "auxiliary_loss_mlp": 0.01026238, "balance_loss_clip": 1.05282724, "balance_loss_mlp": 1.01875186, "epoch": 0.46497925810136476, "flos": 24279851088000.0, "grad_norm": 2.5105874711915845, "language_loss": 0.75770646, "learning_rate": 2.322636204085839e-06, "loss": 0.77963424, "num_input_tokens_seen": 83191820, "step": 3867, "time_per_iteration": 2.4915366172790527 }, { "auxiliary_loss_clip": 0.01144801, "auxiliary_loss_mlp": 0.01032367, "balance_loss_clip": 1.04699802, "balance_loss_mlp": 1.02395701, "epoch": 0.46509950099200387, "flos": 16253134272000.0, "grad_norm": 2.1503541626242586, "language_loss": 0.78623295, "learning_rate": 2.3218674105967143e-06, "loss": 0.80800462, "num_input_tokens_seen": 83210085, "step": 3868, "time_per_iteration": 2.469186544418335 }, { "auxiliary_loss_clip": 0.01148773, "auxiliary_loss_mlp": 0.01029401, "balance_loss_clip": 1.04975271, "balance_loss_mlp": 1.02129507, "epoch": 0.4652197438826429, "flos": 23442270773760.0, "grad_norm": 1.5589792835783813, "language_loss": 0.8346957, "learning_rate": 2.3210985682805593e-06, "loss": 0.85647738, "num_input_tokens_seen": 83231865, "step": 3869, "time_per_iteration": 2.558034658432007 }, { "auxiliary_loss_clip": 0.01185475, "auxiliary_loss_mlp": 0.01026578, "balance_loss_clip": 1.05710042, "balance_loss_mlp": 1.01839995, "epoch": 0.46533998677328203, "flos": 16216397637120.0, "grad_norm": 2.2559173279609803, "language_loss": 0.68302858, "learning_rate": 2.320329677254007e-06, "loss": 0.70514911, "num_input_tokens_seen": 83249195, "step": 3870, "time_per_iteration": 2.3962395191192627 }, { "auxiliary_loss_clip": 0.0118221, "auxiliary_loss_mlp": 0.01026913, "balance_loss_clip": 1.05451286, "balance_loss_mlp": 1.01862216, "epoch": 0.46546022966392114, "flos": 21141869080320.0, "grad_norm": 2.7025901157905556, "language_loss": 0.72160763, "learning_rate": 2.319560737633697e-06, "loss": 0.74369889, "num_input_tokens_seen": 83267915, "step": 3871, "time_per_iteration": 2.4524593353271484 }, { "auxiliary_loss_clip": 0.011467, "auxiliary_loss_mlp": 0.01028435, "balance_loss_clip": 1.04927289, "balance_loss_mlp": 1.01963747, "epoch": 0.4655804725545602, "flos": 41171942442240.0, "grad_norm": 1.7049153411179157, "language_loss": 0.6811558, "learning_rate": 2.3187917495362775e-06, "loss": 0.70290715, "num_input_tokens_seen": 83292325, "step": 3872, "time_per_iteration": 2.7060701847076416 }, { "auxiliary_loss_clip": 0.01126992, "auxiliary_loss_mlp": 0.01035594, "balance_loss_clip": 1.05036211, "balance_loss_mlp": 1.0270648, "epoch": 0.4657007154451993, "flos": 19570956698880.0, "grad_norm": 2.5185733140767805, "language_loss": 0.77119905, "learning_rate": 2.318022713078403e-06, "loss": 0.79282498, "num_input_tokens_seen": 83306905, "step": 3873, "time_per_iteration": 2.528513193130493 }, { "auxiliary_loss_clip": 0.01154149, "auxiliary_loss_mlp": 0.01032861, "balance_loss_clip": 1.05237174, "balance_loss_mlp": 1.02481437, "epoch": 0.4658209583358384, "flos": 15517826956800.0, "grad_norm": 2.1639419508203273, "language_loss": 0.84712434, "learning_rate": 2.3172536283767354e-06, "loss": 0.86899447, "num_input_tokens_seen": 83320665, "step": 3874, "time_per_iteration": 2.446023464202881 }, { "auxiliary_loss_clip": 0.01138724, "auxiliary_loss_mlp": 0.01025478, "balance_loss_clip": 1.05147064, "balance_loss_mlp": 1.01665688, "epoch": 0.4659412012264775, "flos": 14903180403840.0, "grad_norm": 2.42409469856878, "language_loss": 0.80850017, "learning_rate": 2.3164844955479447e-06, "loss": 0.83014214, "num_input_tokens_seen": 83336475, "step": 3875, "time_per_iteration": 2.4981679916381836 }, { "auxiliary_loss_clip": 0.01136046, "auxiliary_loss_mlp": 0.01026874, "balance_loss_clip": 1.04999542, "balance_loss_mlp": 1.01835036, "epoch": 0.4660614441171166, "flos": 24425612478720.0, "grad_norm": 1.6047061721604563, "language_loss": 0.70705074, "learning_rate": 2.3157153147087082e-06, "loss": 0.7286799, "num_input_tokens_seen": 83358365, "step": 3876, "time_per_iteration": 2.606342315673828 }, { "auxiliary_loss_clip": 0.01135429, "auxiliary_loss_mlp": 0.01027477, "balance_loss_clip": 1.0527494, "balance_loss_mlp": 1.01957297, "epoch": 0.46618168700775564, "flos": 22091095843200.0, "grad_norm": 1.8187304219263447, "language_loss": 0.83066249, "learning_rate": 2.314946085975709e-06, "loss": 0.85229158, "num_input_tokens_seen": 83377345, "step": 3877, "time_per_iteration": 2.5547399520874023 }, { "auxiliary_loss_clip": 0.01134007, "auxiliary_loss_mlp": 0.01029665, "balance_loss_clip": 1.0530827, "balance_loss_mlp": 1.02146959, "epoch": 0.46630192989839475, "flos": 26176975810560.0, "grad_norm": 1.6979508638888605, "language_loss": 0.82295507, "learning_rate": 2.3141768094656393e-06, "loss": 0.84459186, "num_input_tokens_seen": 83395920, "step": 3878, "time_per_iteration": 2.5586094856262207 }, { "auxiliary_loss_clip": 0.01105843, "auxiliary_loss_mlp": 0.0102888, "balance_loss_clip": 1.04454708, "balance_loss_mlp": 1.02097595, "epoch": 0.46642217278903386, "flos": 11509622150400.0, "grad_norm": 2.3558260735724907, "language_loss": 0.82952344, "learning_rate": 2.3134074852951966e-06, "loss": 0.85087067, "num_input_tokens_seen": 83412510, "step": 3879, "time_per_iteration": 2.580610513687134 }, { "auxiliary_loss_clip": 0.01119884, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.04363728, "balance_loss_mlp": 1.02316689, "epoch": 0.4665424156796729, "flos": 32306819299200.0, "grad_norm": 1.6116375456185172, "language_loss": 0.77827567, "learning_rate": 2.312638113581088e-06, "loss": 0.79978389, "num_input_tokens_seen": 83432995, "step": 3880, "time_per_iteration": 2.635580062866211 }, { "auxiliary_loss_clip": 0.01168217, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.05089295, "balance_loss_mlp": 1.02157664, "epoch": 0.46666265857031203, "flos": 18436179254400.0, "grad_norm": 2.573096329465983, "language_loss": 0.77883887, "learning_rate": 2.311868694440027e-06, "loss": 0.80081832, "num_input_tokens_seen": 83447415, "step": 3881, "time_per_iteration": 2.4474406242370605 }, { "auxiliary_loss_clip": 0.01086467, "auxiliary_loss_mlp": 0.01004618, "balance_loss_clip": 1.02231169, "balance_loss_mlp": 1.00350344, "epoch": 0.46678290146095114, "flos": 68438989221120.0, "grad_norm": 0.732041733792607, "language_loss": 0.62523288, "learning_rate": 2.3110992279887323e-06, "loss": 0.64614373, "num_input_tokens_seen": 83519340, "step": 3882, "time_per_iteration": 3.1565401554107666 }, { "auxiliary_loss_clip": 0.01147862, "auxiliary_loss_mlp": 0.01028464, "balance_loss_clip": 1.05331504, "balance_loss_mlp": 1.02008343, "epoch": 0.4669031443515902, "flos": 17712507945600.0, "grad_norm": 2.2361374047629696, "language_loss": 0.84642488, "learning_rate": 2.310329714343932e-06, "loss": 0.86818814, "num_input_tokens_seen": 83535490, "step": 3883, "time_per_iteration": 3.3573288917541504 }, { "auxiliary_loss_clip": 0.01150141, "auxiliary_loss_mlp": 0.01023739, "balance_loss_clip": 1.05067158, "balance_loss_mlp": 1.01568651, "epoch": 0.4670233872422293, "flos": 23947748916480.0, "grad_norm": 2.0114157679523506, "language_loss": 0.8227917, "learning_rate": 2.309560153622361e-06, "loss": 0.84453058, "num_input_tokens_seen": 83552400, "step": 3884, "time_per_iteration": 2.5054590702056885 }, { "auxiliary_loss_clip": 0.01140206, "auxiliary_loss_mlp": 0.01029606, "balance_loss_clip": 1.05041337, "balance_loss_mlp": 1.02079654, "epoch": 0.4671436301328684, "flos": 28111268131200.0, "grad_norm": 1.907342007010759, "language_loss": 0.74632037, "learning_rate": 2.3087905459407602e-06, "loss": 0.76801848, "num_input_tokens_seen": 83571340, "step": 3885, "time_per_iteration": 2.583822250366211 }, { "auxiliary_loss_clip": 0.01074645, "auxiliary_loss_mlp": 0.010012, "balance_loss_clip": 1.01950741, "balance_loss_mlp": 1.00006795, "epoch": 0.46726387302350747, "flos": 69369684566400.0, "grad_norm": 0.7899103192596285, "language_loss": 0.62985563, "learning_rate": 2.3080208914158795e-06, "loss": 0.65061414, "num_input_tokens_seen": 83634340, "step": 3886, "time_per_iteration": 3.9647345542907715 }, { "auxiliary_loss_clip": 0.01153786, "auxiliary_loss_mlp": 0.01026643, "balance_loss_clip": 1.05349183, "balance_loss_mlp": 1.01876879, "epoch": 0.4673841159141466, "flos": 25519666878720.0, "grad_norm": 2.180772374069027, "language_loss": 0.72144479, "learning_rate": 2.3072511901644753e-06, "loss": 0.74324906, "num_input_tokens_seen": 83653410, "step": 3887, "time_per_iteration": 2.528275489807129 }, { "auxiliary_loss_clip": 0.01183669, "auxiliary_loss_mlp": 0.01024972, "balance_loss_clip": 1.05625856, "balance_loss_mlp": 1.01689017, "epoch": 0.4675043588047857, "flos": 24499265316480.0, "grad_norm": 1.7702952062515918, "language_loss": 0.80995989, "learning_rate": 2.306481442303309e-06, "loss": 0.83204627, "num_input_tokens_seen": 83672985, "step": 3888, "time_per_iteration": 3.247695207595825 }, { "auxiliary_loss_clip": 0.01170851, "auxiliary_loss_mlp": 0.0102845, "balance_loss_clip": 1.05205393, "balance_loss_mlp": 1.01982522, "epoch": 0.46762460169542475, "flos": 20960771685120.0, "grad_norm": 1.885880303298299, "language_loss": 0.73227996, "learning_rate": 2.3057116479491515e-06, "loss": 0.75427294, "num_input_tokens_seen": 83692395, "step": 3889, "time_per_iteration": 2.4668185710906982 }, { "auxiliary_loss_clip": 0.01163405, "auxiliary_loss_mlp": 0.01029331, "balance_loss_clip": 1.04871678, "balance_loss_mlp": 1.0213145, "epoch": 0.46774484458606386, "flos": 19171666137600.0, "grad_norm": 1.8307758839218344, "language_loss": 0.76158923, "learning_rate": 2.30494180721878e-06, "loss": 0.78351659, "num_input_tokens_seen": 83709735, "step": 3890, "time_per_iteration": 2.437392234802246 }, { "auxiliary_loss_clip": 0.01166558, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.0514648, "balance_loss_mlp": 1.02476072, "epoch": 0.4678650874767029, "flos": 17967689141760.0, "grad_norm": 1.984678040888482, "language_loss": 0.89754528, "learning_rate": 2.3041719202289794e-06, "loss": 0.91953576, "num_input_tokens_seen": 83725910, "step": 3891, "time_per_iteration": 3.1704788208007812 }, { "auxiliary_loss_clip": 0.01168691, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.05288756, "balance_loss_mlp": 1.02413988, "epoch": 0.467985330367342, "flos": 21360816432000.0, "grad_norm": 1.664477723877608, "language_loss": 0.80685574, "learning_rate": 2.30340198709654e-06, "loss": 0.82886505, "num_input_tokens_seen": 83745745, "step": 3892, "time_per_iteration": 2.475205659866333 }, { "auxiliary_loss_clip": 0.01157089, "auxiliary_loss_mlp": 0.01030517, "balance_loss_clip": 1.04808879, "balance_loss_mlp": 1.02253628, "epoch": 0.46810557325798113, "flos": 20521835487360.0, "grad_norm": 2.178028118512589, "language_loss": 0.74731648, "learning_rate": 2.3026320079382605e-06, "loss": 0.76919258, "num_input_tokens_seen": 83762680, "step": 3893, "time_per_iteration": 2.478456974029541 }, { "auxiliary_loss_clip": 0.01181422, "auxiliary_loss_mlp": 0.01024631, "balance_loss_clip": 1.05420351, "balance_loss_mlp": 1.01641786, "epoch": 0.4682258161486202, "flos": 30117848572800.0, "grad_norm": 1.9910731750170454, "language_loss": 0.76259762, "learning_rate": 2.3018619828709454e-06, "loss": 0.78465819, "num_input_tokens_seen": 83784220, "step": 3894, "time_per_iteration": 2.4849154949188232 }, { "auxiliary_loss_clip": 0.01165536, "auxiliary_loss_mlp": 0.00762786, "balance_loss_clip": 1.05523419, "balance_loss_mlp": 1.00016975, "epoch": 0.4683460590392593, "flos": 25293357239040.0, "grad_norm": 1.941504932459736, "language_loss": 0.8214823, "learning_rate": 2.3010919120114084e-06, "loss": 0.84076548, "num_input_tokens_seen": 83800750, "step": 3895, "time_per_iteration": 2.471978187561035 }, { "auxiliary_loss_clip": 0.01162205, "auxiliary_loss_mlp": 0.01032129, "balance_loss_clip": 1.04592657, "balance_loss_mlp": 1.02428472, "epoch": 0.4684663019298984, "flos": 15368330551680.0, "grad_norm": 2.936894641281481, "language_loss": 0.66227627, "learning_rate": 2.3003217954764672e-06, "loss": 0.6842196, "num_input_tokens_seen": 83815455, "step": 3896, "time_per_iteration": 2.399125814437866 }, { "auxiliary_loss_clip": 0.01168341, "auxiliary_loss_mlp": 0.01026117, "balance_loss_clip": 1.04838705, "balance_loss_mlp": 1.01792777, "epoch": 0.46858654482053747, "flos": 27778842737280.0, "grad_norm": 2.183958799472053, "language_loss": 0.7964617, "learning_rate": 2.299551633382949e-06, "loss": 0.81840628, "num_input_tokens_seen": 83835765, "step": 3897, "time_per_iteration": 2.498365879058838 }, { "auxiliary_loss_clip": 0.01146293, "auxiliary_loss_mlp": 0.01029172, "balance_loss_clip": 1.04847121, "balance_loss_mlp": 1.02112532, "epoch": 0.4687067877111766, "flos": 18040623707520.0, "grad_norm": 1.8873209137383269, "language_loss": 0.85583794, "learning_rate": 2.2987814258476854e-06, "loss": 0.87759262, "num_input_tokens_seen": 83853565, "step": 3898, "time_per_iteration": 2.459045886993408 }, { "auxiliary_loss_clip": 0.01126318, "auxiliary_loss_mlp": 0.01025096, "balance_loss_clip": 1.04328096, "balance_loss_mlp": 1.01698685, "epoch": 0.4688270306018157, "flos": 16977380198400.0, "grad_norm": 2.3232820564551893, "language_loss": 0.67809123, "learning_rate": 2.2980111729875177e-06, "loss": 0.69960535, "num_input_tokens_seen": 83869815, "step": 3899, "time_per_iteration": 2.5148820877075195 }, { "auxiliary_loss_clip": 0.01149602, "auxiliary_loss_mlp": 0.01033599, "balance_loss_clip": 1.05162537, "balance_loss_mlp": 1.02554631, "epoch": 0.46894727349245474, "flos": 17821640442240.0, "grad_norm": 1.7038890059976342, "language_loss": 0.82639122, "learning_rate": 2.2972408749192917e-06, "loss": 0.84822327, "num_input_tokens_seen": 83887545, "step": 3900, "time_per_iteration": 2.454667568206787 }, { "auxiliary_loss_clip": 0.01167148, "auxiliary_loss_mlp": 0.00762295, "balance_loss_clip": 1.05345964, "balance_loss_mlp": 1.00015974, "epoch": 0.46906751638309385, "flos": 21471349559040.0, "grad_norm": 1.9022616262999756, "language_loss": 0.66984117, "learning_rate": 2.296470531759861e-06, "loss": 0.68913567, "num_input_tokens_seen": 83905645, "step": 3901, "time_per_iteration": 2.5472047328948975 }, { "auxiliary_loss_clip": 0.01134553, "auxiliary_loss_mlp": 0.01027173, "balance_loss_clip": 1.04632318, "balance_loss_mlp": 1.01845884, "epoch": 0.46918775927373296, "flos": 20337829090560.0, "grad_norm": 2.104826364359253, "language_loss": 0.79586041, "learning_rate": 2.2957001436260866e-06, "loss": 0.81747758, "num_input_tokens_seen": 83922705, "step": 3902, "time_per_iteration": 2.50502610206604 }, { "auxiliary_loss_clip": 0.01149945, "auxiliary_loss_mlp": 0.01030722, "balance_loss_clip": 1.04967451, "balance_loss_mlp": 1.02277637, "epoch": 0.469308002164372, "flos": 18403249461120.0, "grad_norm": 1.6900208873205684, "language_loss": 0.73074389, "learning_rate": 2.294929710634836e-06, "loss": 0.75255048, "num_input_tokens_seen": 83940795, "step": 3903, "time_per_iteration": 2.4626681804656982 }, { "auxiliary_loss_clip": 0.01163985, "auxiliary_loss_mlp": 0.01032766, "balance_loss_clip": 1.04815865, "balance_loss_mlp": 1.02452862, "epoch": 0.46942824505501113, "flos": 37962067363200.0, "grad_norm": 1.7988188607603042, "language_loss": 0.6076479, "learning_rate": 2.2941592329029823e-06, "loss": 0.62961543, "num_input_tokens_seen": 83961900, "step": 3904, "time_per_iteration": 2.6006581783294678 }, { "auxiliary_loss_clip": 0.01164936, "auxiliary_loss_mlp": 0.01031881, "balance_loss_clip": 1.05158997, "balance_loss_mlp": 1.02277911, "epoch": 0.46954848794565024, "flos": 21872507627520.0, "grad_norm": 2.013836162482629, "language_loss": 0.79043615, "learning_rate": 2.2933887105474067e-06, "loss": 0.81240427, "num_input_tokens_seen": 83980075, "step": 3905, "time_per_iteration": 2.468151807785034 }, { "auxiliary_loss_clip": 0.01165186, "auxiliary_loss_mlp": 0.01030691, "balance_loss_clip": 1.05300975, "balance_loss_mlp": 1.02302909, "epoch": 0.4696687308362893, "flos": 22016545165440.0, "grad_norm": 1.7684303629614024, "language_loss": 0.8133927, "learning_rate": 2.2926181436849974e-06, "loss": 0.83535147, "num_input_tokens_seen": 83999430, "step": 3906, "time_per_iteration": 2.4747769832611084 }, { "auxiliary_loss_clip": 0.01166751, "auxiliary_loss_mlp": 0.01033423, "balance_loss_clip": 1.05305803, "balance_loss_mlp": 1.02475095, "epoch": 0.4697889737269284, "flos": 21613663244160.0, "grad_norm": 1.7270848221075503, "language_loss": 0.72615135, "learning_rate": 2.2918475324326478e-06, "loss": 0.74815309, "num_input_tokens_seen": 84019150, "step": 3907, "time_per_iteration": 2.4720046520233154 }, { "auxiliary_loss_clip": 0.01172444, "auxiliary_loss_mlp": 0.00762847, "balance_loss_clip": 1.05482447, "balance_loss_mlp": 1.00018561, "epoch": 0.46990921661756746, "flos": 25228323665280.0, "grad_norm": 2.414298547109381, "language_loss": 0.9137364, "learning_rate": 2.2910768769072603e-06, "loss": 0.93308932, "num_input_tokens_seen": 84037930, "step": 3908, "time_per_iteration": 2.494840145111084 }, { "auxiliary_loss_clip": 0.011628, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.05139899, "balance_loss_mlp": 1.02514374, "epoch": 0.47002945950820657, "flos": 13844031045120.0, "grad_norm": 1.8640808578300783, "language_loss": 0.75583982, "learning_rate": 2.2903061772257417e-06, "loss": 0.77779669, "num_input_tokens_seen": 84055915, "step": 3909, "time_per_iteration": 2.4327032566070557 }, { "auxiliary_loss_clip": 0.01166474, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.05212092, "balance_loss_mlp": 1.02214789, "epoch": 0.4701497023988457, "flos": 26247001374720.0, "grad_norm": 1.4465889587670249, "language_loss": 0.78676677, "learning_rate": 2.289535433505007e-06, "loss": 0.80873364, "num_input_tokens_seen": 84077270, "step": 3910, "time_per_iteration": 3.34236478805542 }, { "auxiliary_loss_clip": 0.01154239, "auxiliary_loss_mlp": 0.01027731, "balance_loss_clip": 1.04852319, "balance_loss_mlp": 1.01959467, "epoch": 0.47026994528948474, "flos": 25629517647360.0, "grad_norm": 3.3325974196098884, "language_loss": 0.63712442, "learning_rate": 2.2887646458619767e-06, "loss": 0.65894413, "num_input_tokens_seen": 84098635, "step": 3911, "time_per_iteration": 2.5121309757232666 }, { "auxiliary_loss_clip": 0.0114579, "auxiliary_loss_mlp": 0.01033416, "balance_loss_clip": 1.04975271, "balance_loss_mlp": 1.02493477, "epoch": 0.47039018818012385, "flos": 20554406144640.0, "grad_norm": 1.8168180917927572, "language_loss": 0.76332533, "learning_rate": 2.2879938144135797e-06, "loss": 0.78511739, "num_input_tokens_seen": 84114740, "step": 3912, "time_per_iteration": 3.391508102416992 }, { "auxiliary_loss_clip": 0.01137282, "auxiliary_loss_mlp": 0.00762165, "balance_loss_clip": 1.04772663, "balance_loss_mlp": 1.0001483, "epoch": 0.47051043107076296, "flos": 21577249831680.0, "grad_norm": 1.566237535722384, "language_loss": 0.74709392, "learning_rate": 2.2872229392767496e-06, "loss": 0.76608837, "num_input_tokens_seen": 84134845, "step": 3913, "time_per_iteration": 2.554697036743164 }, { "auxiliary_loss_clip": 0.01173013, "auxiliary_loss_mlp": 0.01030438, "balance_loss_clip": 1.05492735, "balance_loss_mlp": 1.02301705, "epoch": 0.470630673961402, "flos": 18953185662720.0, "grad_norm": 1.6615889273849738, "language_loss": 0.74864447, "learning_rate": 2.286452020568428e-06, "loss": 0.770679, "num_input_tokens_seen": 84152920, "step": 3914, "time_per_iteration": 2.4670512676239014 }, { "auxiliary_loss_clip": 0.01187811, "auxiliary_loss_mlp": 0.01029877, "balance_loss_clip": 1.05464709, "balance_loss_mlp": 1.02084088, "epoch": 0.4707509168520411, "flos": 19938969492480.0, "grad_norm": 1.7767280007631059, "language_loss": 0.73110145, "learning_rate": 2.2856810584055637e-06, "loss": 0.75327832, "num_input_tokens_seen": 84170455, "step": 3915, "time_per_iteration": 3.1501214504241943 }, { "auxiliary_loss_clip": 0.01169901, "auxiliary_loss_mlp": 0.01026995, "balance_loss_clip": 1.05301285, "balance_loss_mlp": 1.01920438, "epoch": 0.47087115974268023, "flos": 40118754741120.0, "grad_norm": 1.4843282628754895, "language_loss": 0.67796385, "learning_rate": 2.2849100529051085e-06, "loss": 0.69993281, "num_input_tokens_seen": 84197390, "step": 3916, "time_per_iteration": 2.6483914852142334 }, { "auxiliary_loss_clip": 0.01181751, "auxiliary_loss_mlp": 0.01028849, "balance_loss_clip": 1.05486059, "balance_loss_mlp": 1.0206002, "epoch": 0.4709914026333193, "flos": 13552723745280.0, "grad_norm": 2.6199059526797646, "language_loss": 0.80272996, "learning_rate": 2.284139004184026e-06, "loss": 0.8248359, "num_input_tokens_seen": 84214620, "step": 3917, "time_per_iteration": 3.078840732574463 }, { "auxiliary_loss_clip": 0.01184876, "auxiliary_loss_mlp": 0.01025919, "balance_loss_clip": 1.05471694, "balance_loss_mlp": 1.01772952, "epoch": 0.4711116455239584, "flos": 19974628719360.0, "grad_norm": 1.9085208601447747, "language_loss": 0.74340433, "learning_rate": 2.2833679123592814e-06, "loss": 0.76551229, "num_input_tokens_seen": 84231880, "step": 3918, "time_per_iteration": 2.4177355766296387 }, { "auxiliary_loss_clip": 0.01152112, "auxiliary_loss_mlp": 0.01029914, "balance_loss_clip": 1.05053329, "balance_loss_mlp": 1.02140808, "epoch": 0.4712318884145975, "flos": 32124824064000.0, "grad_norm": 1.8913302106108993, "language_loss": 0.63578331, "learning_rate": 2.2825967775478508e-06, "loss": 0.65760362, "num_input_tokens_seen": 84252980, "step": 3919, "time_per_iteration": 2.5593221187591553 }, { "auxiliary_loss_clip": 0.01181308, "auxiliary_loss_mlp": 0.01029988, "balance_loss_clip": 1.05168343, "balance_loss_mlp": 1.0218457, "epoch": 0.47135213130523657, "flos": 20047850593920.0, "grad_norm": 1.9486405646148768, "language_loss": 0.83488429, "learning_rate": 2.2818255998667135e-06, "loss": 0.85699725, "num_input_tokens_seen": 84271490, "step": 3920, "time_per_iteration": 2.407737970352173 }, { "auxiliary_loss_clip": 0.01168589, "auxiliary_loss_mlp": 0.01023757, "balance_loss_clip": 1.05431485, "balance_loss_mlp": 1.01621699, "epoch": 0.4714723741958757, "flos": 19426990988160.0, "grad_norm": 1.67337167104691, "language_loss": 0.79162836, "learning_rate": 2.2810543794328566e-06, "loss": 0.81355184, "num_input_tokens_seen": 84290525, "step": 3921, "time_per_iteration": 2.436403751373291 }, { "auxiliary_loss_clip": 0.01171074, "auxiliary_loss_mlp": 0.010297, "balance_loss_clip": 1.0525527, "balance_loss_mlp": 1.02193356, "epoch": 0.4715926170865148, "flos": 20373883367040.0, "grad_norm": 1.6731921078568863, "language_loss": 0.82470763, "learning_rate": 2.2802831163632735e-06, "loss": 0.84671533, "num_input_tokens_seen": 84309245, "step": 3922, "time_per_iteration": 2.5063672065734863 }, { "auxiliary_loss_clip": 0.01114736, "auxiliary_loss_mlp": 0.01029042, "balance_loss_clip": 1.04751801, "balance_loss_mlp": 1.0205543, "epoch": 0.47171285997715384, "flos": 22672884430080.0, "grad_norm": 1.656064427367903, "language_loss": 0.74663508, "learning_rate": 2.279511810774965e-06, "loss": 0.76807296, "num_input_tokens_seen": 84330775, "step": 3923, "time_per_iteration": 2.5902345180511475 }, { "auxiliary_loss_clip": 0.01184181, "auxiliary_loss_mlp": 0.01027104, "balance_loss_clip": 1.05463362, "balance_loss_mlp": 1.01911116, "epoch": 0.47183310286779295, "flos": 21105419754240.0, "grad_norm": 1.9027523324417548, "language_loss": 0.71974021, "learning_rate": 2.2787404627849364e-06, "loss": 0.74185306, "num_input_tokens_seen": 84349985, "step": 3924, "time_per_iteration": 2.416029691696167 }, { "auxiliary_loss_clip": 0.01151481, "auxiliary_loss_mlp": 0.01027267, "balance_loss_clip": 1.04859531, "balance_loss_mlp": 1.01920867, "epoch": 0.471953345758432, "flos": 21726566668800.0, "grad_norm": 1.9920244565664502, "language_loss": 0.79076815, "learning_rate": 2.277969072510202e-06, "loss": 0.81255561, "num_input_tokens_seen": 84368965, "step": 3925, "time_per_iteration": 2.4915759563446045 }, { "auxiliary_loss_clip": 0.01154154, "auxiliary_loss_mlp": 0.01026082, "balance_loss_clip": 1.05168402, "balance_loss_mlp": 1.01827955, "epoch": 0.4720735886490711, "flos": 19861078849920.0, "grad_norm": 1.6432953144551201, "language_loss": 0.81646621, "learning_rate": 2.2771976400677803e-06, "loss": 0.83826864, "num_input_tokens_seen": 84387795, "step": 3926, "time_per_iteration": 2.491424083709717 }, { "auxiliary_loss_clip": 0.01115938, "auxiliary_loss_mlp": 0.01024031, "balance_loss_clip": 1.04512691, "balance_loss_mlp": 1.0161097, "epoch": 0.47219383153971023, "flos": 19171809792000.0, "grad_norm": 1.757102901117818, "language_loss": 0.78484219, "learning_rate": 2.2764261655746965e-06, "loss": 0.80624187, "num_input_tokens_seen": 84405290, "step": 3927, "time_per_iteration": 2.5622406005859375 }, { "auxiliary_loss_clip": 0.01137155, "auxiliary_loss_mlp": 0.01023608, "balance_loss_clip": 1.04924273, "balance_loss_mlp": 1.01570392, "epoch": 0.4723140744303493, "flos": 23224005780480.0, "grad_norm": 1.6525497716001576, "language_loss": 0.75936806, "learning_rate": 2.2756546491479832e-06, "loss": 0.78097564, "num_input_tokens_seen": 84426205, "step": 3928, "time_per_iteration": 2.594552993774414 }, { "auxiliary_loss_clip": 0.01183145, "auxiliary_loss_mlp": 0.00762574, "balance_loss_clip": 1.05210543, "balance_loss_mlp": 1.00017548, "epoch": 0.4724343173209884, "flos": 18223265387520.0, "grad_norm": 4.575438564586592, "language_loss": 0.80469501, "learning_rate": 2.274883090904679e-06, "loss": 0.82415217, "num_input_tokens_seen": 84443970, "step": 3929, "time_per_iteration": 2.482083320617676 }, { "auxiliary_loss_clip": 0.01185622, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.05644011, "balance_loss_mlp": 1.02007627, "epoch": 0.4725545602116275, "flos": 21251037490560.0, "grad_norm": 2.3492833442457774, "language_loss": 0.68133175, "learning_rate": 2.2741114909618283e-06, "loss": 0.70346868, "num_input_tokens_seen": 84459865, "step": 3930, "time_per_iteration": 2.428098678588867 }, { "auxiliary_loss_clip": 0.01141188, "auxiliary_loss_mlp": 0.01024984, "balance_loss_clip": 1.04971039, "balance_loss_mlp": 1.01655602, "epoch": 0.47267480310226656, "flos": 21434002392960.0, "grad_norm": 1.9010653593579911, "language_loss": 0.71740603, "learning_rate": 2.2733398494364828e-06, "loss": 0.73906779, "num_input_tokens_seen": 84479110, "step": 3931, "time_per_iteration": 2.6437652111053467 }, { "auxiliary_loss_clip": 0.01151119, "auxiliary_loss_mlp": 0.01027011, "balance_loss_clip": 1.05338371, "balance_loss_mlp": 1.01914883, "epoch": 0.47279504599290567, "flos": 18770508069120.0, "grad_norm": 1.8301577529125357, "language_loss": 0.84467971, "learning_rate": 2.272568166445699e-06, "loss": 0.86646104, "num_input_tokens_seen": 84497675, "step": 3932, "time_per_iteration": 2.4781692028045654 }, { "auxiliary_loss_clip": 0.01167848, "auxiliary_loss_mlp": 0.01021889, "balance_loss_clip": 1.05021477, "balance_loss_mlp": 1.01361609, "epoch": 0.4729152888835448, "flos": 21105742976640.0, "grad_norm": 1.9076541834854692, "language_loss": 0.64131486, "learning_rate": 2.271796442106541e-06, "loss": 0.6632123, "num_input_tokens_seen": 84517030, "step": 3933, "time_per_iteration": 2.4738810062408447 }, { "auxiliary_loss_clip": 0.01048427, "auxiliary_loss_mlp": 0.01004574, "balance_loss_clip": 1.01725721, "balance_loss_mlp": 1.00341153, "epoch": 0.47303553177418384, "flos": 70201877840640.0, "grad_norm": 0.805979241650793, "language_loss": 0.56495887, "learning_rate": 2.271024676536079e-06, "loss": 0.58548892, "num_input_tokens_seen": 84577290, "step": 3934, "time_per_iteration": 3.096956491470337 }, { "auxiliary_loss_clip": 0.01161425, "auxiliary_loss_mlp": 0.01029103, "balance_loss_clip": 1.05564463, "balance_loss_mlp": 1.02038836, "epoch": 0.47315577466482295, "flos": 22455122227200.0, "grad_norm": 2.1328800265111583, "language_loss": 0.73396826, "learning_rate": 2.2702528698513894e-06, "loss": 0.7558735, "num_input_tokens_seen": 84598415, "step": 3935, "time_per_iteration": 2.5184085369110107 }, { "auxiliary_loss_clip": 0.01153546, "auxiliary_loss_mlp": 0.01028118, "balance_loss_clip": 1.0472182, "balance_loss_mlp": 1.0199579, "epoch": 0.47327601755546206, "flos": 24352857480960.0, "grad_norm": 2.103317361248548, "language_loss": 0.78858531, "learning_rate": 2.269481022169554e-06, "loss": 0.81040192, "num_input_tokens_seen": 84617010, "step": 3936, "time_per_iteration": 2.5378787517547607 }, { "auxiliary_loss_clip": 0.01159156, "auxiliary_loss_mlp": 0.01026561, "balance_loss_clip": 1.04911327, "balance_loss_mlp": 1.01791883, "epoch": 0.4733962604461011, "flos": 22926772736640.0, "grad_norm": 1.537479449737051, "language_loss": 0.80446088, "learning_rate": 2.2687091336076614e-06, "loss": 0.82631803, "num_input_tokens_seen": 84636350, "step": 3937, "time_per_iteration": 3.367605686187744 }, { "auxiliary_loss_clip": 0.01167021, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 1.05243397, "balance_loss_mlp": 1.0260694, "epoch": 0.4735165033367402, "flos": 18327369980160.0, "grad_norm": 1.7593878940941947, "language_loss": 0.79858375, "learning_rate": 2.267937204282807e-06, "loss": 0.82059515, "num_input_tokens_seen": 84653490, "step": 3938, "time_per_iteration": 2.447352886199951 }, { "auxiliary_loss_clip": 0.01175472, "auxiliary_loss_mlp": 0.01026577, "balance_loss_clip": 1.05418169, "balance_loss_mlp": 1.01808929, "epoch": 0.4736367462273793, "flos": 23037018554880.0, "grad_norm": 1.8830531469526242, "language_loss": 0.79087877, "learning_rate": 2.2671652343120926e-06, "loss": 0.81289923, "num_input_tokens_seen": 84673965, "step": 3939, "time_per_iteration": 3.3205392360687256 }, { "auxiliary_loss_clip": 0.01182226, "auxiliary_loss_mlp": 0.01026962, "balance_loss_clip": 1.05430079, "balance_loss_mlp": 1.01934767, "epoch": 0.4737569891180184, "flos": 25374336451200.0, "grad_norm": 1.695863902800287, "language_loss": 0.80493605, "learning_rate": 2.2663932238126236e-06, "loss": 0.82702792, "num_input_tokens_seen": 84692525, "step": 3940, "time_per_iteration": 2.4844329357147217 }, { "auxiliary_loss_clip": 0.01168258, "auxiliary_loss_mlp": 0.01024126, "balance_loss_clip": 1.05049801, "balance_loss_mlp": 1.01565027, "epoch": 0.4738772320086575, "flos": 25849326925440.0, "grad_norm": 1.4025689413624673, "language_loss": 0.80538106, "learning_rate": 2.265621172901515e-06, "loss": 0.82730484, "num_input_tokens_seen": 84715640, "step": 3941, "time_per_iteration": 3.304386615753174 }, { "auxiliary_loss_clip": 0.01186283, "auxiliary_loss_mlp": 0.01032524, "balance_loss_clip": 1.05710828, "balance_loss_mlp": 1.02434611, "epoch": 0.47399747489929656, "flos": 27564420499200.0, "grad_norm": 2.213206429423058, "language_loss": 0.71631354, "learning_rate": 2.2648490816958854e-06, "loss": 0.73850167, "num_input_tokens_seen": 84736635, "step": 3942, "time_per_iteration": 2.50063157081604 }, { "auxiliary_loss_clip": 0.01168264, "auxiliary_loss_mlp": 0.01031198, "balance_loss_clip": 1.05015123, "balance_loss_mlp": 1.02203131, "epoch": 0.47411771778993567, "flos": 24863650836480.0, "grad_norm": 2.296572266514965, "language_loss": 0.73123455, "learning_rate": 2.264076950312861e-06, "loss": 0.75322914, "num_input_tokens_seen": 84755445, "step": 3943, "time_per_iteration": 2.4793343544006348 }, { "auxiliary_loss_clip": 0.01159549, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.05114079, "balance_loss_mlp": 1.02481544, "epoch": 0.4742379606805748, "flos": 22748009725440.0, "grad_norm": 1.9875040261026784, "language_loss": 0.82636118, "learning_rate": 2.2633047788695727e-06, "loss": 0.84829378, "num_input_tokens_seen": 84775750, "step": 3944, "time_per_iteration": 3.277961254119873 }, { "auxiliary_loss_clip": 0.01153429, "auxiliary_loss_mlp": 0.01028802, "balance_loss_clip": 1.05159068, "balance_loss_mlp": 1.02140808, "epoch": 0.47435820357121383, "flos": 19681130689920.0, "grad_norm": 1.8106678933938967, "language_loss": 0.64074922, "learning_rate": 2.262532567483159e-06, "loss": 0.66257155, "num_input_tokens_seen": 84794310, "step": 3945, "time_per_iteration": 2.4848594665527344 }, { "auxiliary_loss_clip": 0.01187705, "auxiliary_loss_mlp": 0.00762947, "balance_loss_clip": 1.05689502, "balance_loss_mlp": 1.00022018, "epoch": 0.47447844646185294, "flos": 25228718714880.0, "grad_norm": 1.953219185358619, "language_loss": 0.8039096, "learning_rate": 2.2617603162707635e-06, "loss": 0.82341611, "num_input_tokens_seen": 84814720, "step": 3946, "time_per_iteration": 2.459381341934204 }, { "auxiliary_loss_clip": 0.01182574, "auxiliary_loss_mlp": 0.0102756, "balance_loss_clip": 1.05416536, "balance_loss_mlp": 1.01962066, "epoch": 0.47459868935249205, "flos": 24570619683840.0, "grad_norm": 1.7823515575382127, "language_loss": 0.82610595, "learning_rate": 2.2609880253495363e-06, "loss": 0.84820735, "num_input_tokens_seen": 84834355, "step": 3947, "time_per_iteration": 2.469965696334839 }, { "auxiliary_loss_clip": 0.01149464, "auxiliary_loss_mlp": 0.01032572, "balance_loss_clip": 1.04830933, "balance_loss_mlp": 1.02450728, "epoch": 0.4747189322431311, "flos": 20558500295040.0, "grad_norm": 1.9899514402168406, "language_loss": 0.86516255, "learning_rate": 2.260215694836633e-06, "loss": 0.88698292, "num_input_tokens_seen": 84853530, "step": 3948, "time_per_iteration": 2.5163052082061768 }, { "auxiliary_loss_clip": 0.01127495, "auxiliary_loss_mlp": 0.00762548, "balance_loss_clip": 1.04538691, "balance_loss_mlp": 1.00017416, "epoch": 0.4748391751337702, "flos": 25995231970560.0, "grad_norm": 2.1232131002539476, "language_loss": 0.65029842, "learning_rate": 2.2594433248492157e-06, "loss": 0.66919887, "num_input_tokens_seen": 84872505, "step": 3949, "time_per_iteration": 2.6297905445098877 }, { "auxiliary_loss_clip": 0.01172745, "auxiliary_loss_mlp": 0.01031224, "balance_loss_clip": 1.05143785, "balance_loss_mlp": 1.02367806, "epoch": 0.47495941802440933, "flos": 22821052032000.0, "grad_norm": 1.7018345763394311, "language_loss": 0.80039042, "learning_rate": 2.2586709155044527e-06, "loss": 0.82243013, "num_input_tokens_seen": 84893105, "step": 3950, "time_per_iteration": 2.485506057739258 }, { "auxiliary_loss_clip": 0.01184137, "auxiliary_loss_mlp": 0.0102735, "balance_loss_clip": 1.05480194, "balance_loss_mlp": 1.01877904, "epoch": 0.4750796609150484, "flos": 27891782075520.0, "grad_norm": 1.535165517223909, "language_loss": 0.75993085, "learning_rate": 2.2578984669195167e-06, "loss": 0.78204578, "num_input_tokens_seen": 84914070, "step": 3951, "time_per_iteration": 2.4928951263427734 }, { "auxiliary_loss_clip": 0.01164282, "auxiliary_loss_mlp": 0.01027623, "balance_loss_clip": 1.04787517, "balance_loss_mlp": 1.01997006, "epoch": 0.4751999038056875, "flos": 35660085471360.0, "grad_norm": 1.7078249981280178, "language_loss": 0.68011773, "learning_rate": 2.2571259792115887e-06, "loss": 0.7020368, "num_input_tokens_seen": 84935290, "step": 3952, "time_per_iteration": 2.6143741607666016 }, { "auxiliary_loss_clip": 0.01162092, "auxiliary_loss_mlp": 0.01027994, "balance_loss_clip": 1.04994595, "balance_loss_mlp": 1.02075469, "epoch": 0.4753201466963266, "flos": 22090880361600.0, "grad_norm": 2.010348380700964, "language_loss": 0.79343057, "learning_rate": 2.2563534524978544e-06, "loss": 0.81533134, "num_input_tokens_seen": 84952760, "step": 3953, "time_per_iteration": 2.4709224700927734 }, { "auxiliary_loss_clip": 0.0113746, "auxiliary_loss_mlp": 0.01025891, "balance_loss_clip": 1.05321193, "balance_loss_mlp": 1.01849985, "epoch": 0.47544038958696566, "flos": 30190854965760.0, "grad_norm": 3.4154588355327014, "language_loss": 0.70507705, "learning_rate": 2.2555808868955052e-06, "loss": 0.72671056, "num_input_tokens_seen": 84974890, "step": 3954, "time_per_iteration": 2.616994619369507 }, { "auxiliary_loss_clip": 0.01125724, "auxiliary_loss_mlp": 0.01027405, "balance_loss_clip": 1.04717135, "balance_loss_mlp": 1.01856577, "epoch": 0.47556063247760477, "flos": 23472219738240.0, "grad_norm": 2.242850593187867, "language_loss": 0.73732722, "learning_rate": 2.254808282521738e-06, "loss": 0.7588585, "num_input_tokens_seen": 84993640, "step": 3955, "time_per_iteration": 2.5972847938537598 }, { "auxiliary_loss_clip": 0.01142271, "auxiliary_loss_mlp": 0.00762986, "balance_loss_clip": 1.04797673, "balance_loss_mlp": 1.00015473, "epoch": 0.4756808753682438, "flos": 25155209531520.0, "grad_norm": 1.688456891365973, "language_loss": 0.80921721, "learning_rate": 2.2540356394937573e-06, "loss": 0.82826972, "num_input_tokens_seen": 85012340, "step": 3956, "time_per_iteration": 2.560209274291992 }, { "auxiliary_loss_clip": 0.01145335, "auxiliary_loss_mlp": 0.01026819, "balance_loss_clip": 1.04867911, "balance_loss_mlp": 1.01842046, "epoch": 0.47580111825888294, "flos": 15669729573120.0, "grad_norm": 2.5840713815426923, "language_loss": 0.83982933, "learning_rate": 2.253262957928772e-06, "loss": 0.86155081, "num_input_tokens_seen": 85029225, "step": 3957, "time_per_iteration": 2.498357057571411 }, { "auxiliary_loss_clip": 0.01146383, "auxiliary_loss_mlp": 0.01027263, "balance_loss_clip": 1.0465858, "balance_loss_mlp": 1.01910937, "epoch": 0.47592136114952205, "flos": 17636556637440.0, "grad_norm": 1.6481076950228488, "language_loss": 0.71979707, "learning_rate": 2.2524902379439976e-06, "loss": 0.74153352, "num_input_tokens_seen": 85047895, "step": 3958, "time_per_iteration": 2.4685184955596924 }, { "auxiliary_loss_clip": 0.01036937, "auxiliary_loss_mlp": 0.01015124, "balance_loss_clip": 1.03191209, "balance_loss_mlp": 1.01313341, "epoch": 0.4760416040401611, "flos": 61417159292160.0, "grad_norm": 0.7416526265400596, "language_loss": 0.63709635, "learning_rate": 2.251717479656655e-06, "loss": 0.65761691, "num_input_tokens_seen": 85112690, "step": 3959, "time_per_iteration": 3.199294328689575 }, { "auxiliary_loss_clip": 0.01183416, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.05317748, "balance_loss_mlp": 1.02187753, "epoch": 0.4761618469308002, "flos": 18405871153920.0, "grad_norm": 2.5720710813834837, "language_loss": 0.76329547, "learning_rate": 2.2509446831839704e-06, "loss": 0.78543437, "num_input_tokens_seen": 85132130, "step": 3960, "time_per_iteration": 2.4176218509674072 }, { "auxiliary_loss_clip": 0.01156009, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.04874122, "balance_loss_mlp": 1.02462065, "epoch": 0.4762820898214393, "flos": 18040911016320.0, "grad_norm": 2.3312981433200153, "language_loss": 0.82654268, "learning_rate": 2.250171848643177e-06, "loss": 0.84843439, "num_input_tokens_seen": 85149420, "step": 3961, "time_per_iteration": 2.463630199432373 }, { "auxiliary_loss_clip": 0.01150583, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.05071545, "balance_loss_mlp": 1.02666855, "epoch": 0.4764023327120784, "flos": 19318253541120.0, "grad_norm": 1.7757762507366792, "language_loss": 0.86164963, "learning_rate": 2.249398976151513e-06, "loss": 0.88349473, "num_input_tokens_seen": 85166970, "step": 3962, "time_per_iteration": 2.5673716068267822 }, { "auxiliary_loss_clip": 0.0118074, "auxiliary_loss_mlp": 0.01031746, "balance_loss_clip": 1.05287385, "balance_loss_mlp": 1.02404189, "epoch": 0.4765225756027175, "flos": 22747255539840.0, "grad_norm": 2.269308122831583, "language_loss": 0.78704858, "learning_rate": 2.248626065826223e-06, "loss": 0.80917341, "num_input_tokens_seen": 85185175, "step": 3963, "time_per_iteration": 3.270092248916626 }, { "auxiliary_loss_clip": 0.01085144, "auxiliary_loss_mlp": 0.01003451, "balance_loss_clip": 1.02123535, "balance_loss_mlp": 1.00227058, "epoch": 0.4766428184933566, "flos": 65933392106880.0, "grad_norm": 0.7628748756633362, "language_loss": 0.62553471, "learning_rate": 2.2478531177845564e-06, "loss": 0.64642066, "num_input_tokens_seen": 85246170, "step": 3964, "time_per_iteration": 2.9979188442230225 }, { "auxiliary_loss_clip": 0.01154793, "auxiliary_loss_mlp": 0.01023169, "balance_loss_clip": 1.04986763, "balance_loss_mlp": 1.01571274, "epoch": 0.47676306138399566, "flos": 24136495908480.0, "grad_norm": 2.2676695959364475, "language_loss": 0.84941137, "learning_rate": 2.247080132143769e-06, "loss": 0.87119102, "num_input_tokens_seen": 85268525, "step": 3965, "time_per_iteration": 2.5312819480895996 }, { "auxiliary_loss_clip": 0.01137259, "auxiliary_loss_mlp": 0.01029202, "balance_loss_clip": 1.04306388, "balance_loss_mlp": 1.0206604, "epoch": 0.47688330427463477, "flos": 12604322995200.0, "grad_norm": 1.946599524660357, "language_loss": 0.69220626, "learning_rate": 2.246307109021121e-06, "loss": 0.71387088, "num_input_tokens_seen": 85285930, "step": 3966, "time_per_iteration": 3.349525213241577 }, { "auxiliary_loss_clip": 0.01150135, "auxiliary_loss_mlp": 0.0102937, "balance_loss_clip": 1.04738307, "balance_loss_mlp": 1.02122211, "epoch": 0.4770035471652739, "flos": 21390585828480.0, "grad_norm": 1.8210990834071763, "language_loss": 0.82020861, "learning_rate": 2.2455340485338817e-06, "loss": 0.8420037, "num_input_tokens_seen": 85303565, "step": 3967, "time_per_iteration": 2.5074305534362793 }, { "auxiliary_loss_clip": 0.01167821, "auxiliary_loss_mlp": 0.01027496, "balance_loss_clip": 1.05049324, "balance_loss_mlp": 1.019449, "epoch": 0.47712379005591293, "flos": 25156251025920.0, "grad_norm": 2.2546258739980893, "language_loss": 0.67999876, "learning_rate": 2.244760950799322e-06, "loss": 0.70195192, "num_input_tokens_seen": 85321835, "step": 3968, "time_per_iteration": 3.3435447216033936 }, { "auxiliary_loss_clip": 0.01124292, "auxiliary_loss_mlp": 0.01029178, "balance_loss_clip": 1.04623389, "balance_loss_mlp": 1.02103591, "epoch": 0.47724403294655204, "flos": 22054323294720.0, "grad_norm": 1.7402937587528589, "language_loss": 0.72403592, "learning_rate": 2.2439878159347203e-06, "loss": 0.74557066, "num_input_tokens_seen": 85341260, "step": 3969, "time_per_iteration": 2.5409762859344482 }, { "auxiliary_loss_clip": 0.01084001, "auxiliary_loss_mlp": 0.01001324, "balance_loss_clip": 1.01954353, "balance_loss_mlp": 1.00013757, "epoch": 0.4773642758371911, "flos": 70229387658240.0, "grad_norm": 0.7336429076439863, "language_loss": 0.55572265, "learning_rate": 2.2432146440573616e-06, "loss": 0.57657593, "num_input_tokens_seen": 85407220, "step": 3970, "time_per_iteration": 3.126549243927002 }, { "auxiliary_loss_clip": 0.0115302, "auxiliary_loss_mlp": 0.01027214, "balance_loss_clip": 1.05038774, "balance_loss_mlp": 1.01932216, "epoch": 0.4774845187278302, "flos": 23548602009600.0, "grad_norm": 1.8888335085994998, "language_loss": 0.66403818, "learning_rate": 2.242441435284534e-06, "loss": 0.68584049, "num_input_tokens_seen": 85426095, "step": 3971, "time_per_iteration": 3.262711524963379 }, { "auxiliary_loss_clip": 0.01171728, "auxiliary_loss_mlp": 0.01032068, "balance_loss_clip": 1.05407298, "balance_loss_mlp": 1.02297819, "epoch": 0.4776047616184693, "flos": 23075371301760.0, "grad_norm": 2.1976847190672597, "language_loss": 0.85757387, "learning_rate": 2.2416681897335337e-06, "loss": 0.87961185, "num_input_tokens_seen": 85444245, "step": 3972, "time_per_iteration": 2.455953598022461 }, { "auxiliary_loss_clip": 0.0112803, "auxiliary_loss_mlp": 0.01034323, "balance_loss_clip": 1.05024052, "balance_loss_mlp": 1.02555537, "epoch": 0.4777250045091084, "flos": 31898119374720.0, "grad_norm": 1.776389658067525, "language_loss": 0.67022085, "learning_rate": 2.240894907521661e-06, "loss": 0.69184434, "num_input_tokens_seen": 85463325, "step": 3973, "time_per_iteration": 2.6557347774505615 }, { "auxiliary_loss_clip": 0.01151655, "auxiliary_loss_mlp": 0.01027543, "balance_loss_clip": 1.04840493, "balance_loss_mlp": 1.01963377, "epoch": 0.4778452473997475, "flos": 24278163148800.0, "grad_norm": 1.761736449021304, "language_loss": 0.64108646, "learning_rate": 2.240121588766223e-06, "loss": 0.66287845, "num_input_tokens_seen": 85483375, "step": 3974, "time_per_iteration": 2.5389761924743652 }, { "auxiliary_loss_clip": 0.01146781, "auxiliary_loss_mlp": 0.01027696, "balance_loss_clip": 1.04763746, "balance_loss_mlp": 1.01957822, "epoch": 0.4779654902903866, "flos": 31575031516800.0, "grad_norm": 1.632525483683521, "language_loss": 0.71173179, "learning_rate": 2.239348233584531e-06, "loss": 0.73347658, "num_input_tokens_seen": 85504230, "step": 3975, "time_per_iteration": 2.5691754817962646 }, { "auxiliary_loss_clip": 0.01168319, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.05092502, "balance_loss_mlp": 1.02532268, "epoch": 0.47808573318102565, "flos": 19500428344320.0, "grad_norm": 1.9025432427835756, "language_loss": 0.81269556, "learning_rate": 2.2385748420939013e-06, "loss": 0.83471906, "num_input_tokens_seen": 85523425, "step": 3976, "time_per_iteration": 2.4381492137908936 }, { "auxiliary_loss_clip": 0.0118344, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.05749369, "balance_loss_mlp": 1.02167475, "epoch": 0.47820597607166476, "flos": 22601135013120.0, "grad_norm": 1.8277971858326207, "language_loss": 0.7250663, "learning_rate": 2.2378014144116583e-06, "loss": 0.74719793, "num_input_tokens_seen": 85542235, "step": 3977, "time_per_iteration": 2.418173313140869 }, { "auxiliary_loss_clip": 0.01186584, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.0548079, "balance_loss_mlp": 1.02673841, "epoch": 0.4783262189623039, "flos": 23003011353600.0, "grad_norm": 1.858051524971096, "language_loss": 0.79761928, "learning_rate": 2.23702795065513e-06, "loss": 0.81982934, "num_input_tokens_seen": 85561815, "step": 3978, "time_per_iteration": 2.4411394596099854 }, { "auxiliary_loss_clip": 0.01074899, "auxiliary_loss_mlp": 0.01001818, "balance_loss_clip": 1.02020633, "balance_loss_mlp": 1.00062561, "epoch": 0.47844646185294293, "flos": 49772801226240.0, "grad_norm": 0.9939924251529546, "language_loss": 0.67503238, "learning_rate": 2.2362544509416493e-06, "loss": 0.69579953, "num_input_tokens_seen": 85613930, "step": 3979, "time_per_iteration": 2.9299569129943848 }, { "auxiliary_loss_clip": 0.01144554, "auxiliary_loss_mlp": 0.01034241, "balance_loss_clip": 1.04686737, "balance_loss_mlp": 1.02612305, "epoch": 0.47856670474358204, "flos": 20229558520320.0, "grad_norm": 2.1673715058317016, "language_loss": 0.82476485, "learning_rate": 2.2354809153885572e-06, "loss": 0.84655285, "num_input_tokens_seen": 85631000, "step": 3980, "time_per_iteration": 2.48453688621521 }, { "auxiliary_loss_clip": 0.01166358, "auxiliary_loss_mlp": 0.0103113, "balance_loss_clip": 1.04986358, "balance_loss_mlp": 1.02284467, "epoch": 0.47868694763422115, "flos": 20990936131200.0, "grad_norm": 1.6820024842968728, "language_loss": 0.83080387, "learning_rate": 2.234707344113197e-06, "loss": 0.85277867, "num_input_tokens_seen": 85649095, "step": 3981, "time_per_iteration": 2.435882329940796 }, { "auxiliary_loss_clip": 0.01177533, "auxiliary_loss_mlp": 0.01025285, "balance_loss_clip": 1.05177999, "balance_loss_mlp": 1.01770902, "epoch": 0.4788071905248602, "flos": 19026551191680.0, "grad_norm": 2.2898127098641003, "language_loss": 0.77545059, "learning_rate": 2.233933737232919e-06, "loss": 0.7974788, "num_input_tokens_seen": 85666875, "step": 3982, "time_per_iteration": 2.4090487957000732 }, { "auxiliary_loss_clip": 0.01116574, "auxiliary_loss_mlp": 0.00762428, "balance_loss_clip": 1.04493928, "balance_loss_mlp": 1.00005317, "epoch": 0.4789274334154993, "flos": 23002221254400.0, "grad_norm": 1.8953805438065205, "language_loss": 0.78105259, "learning_rate": 2.2331600948650793e-06, "loss": 0.7998426, "num_input_tokens_seen": 85687020, "step": 3983, "time_per_iteration": 2.545619487762451 }, { "auxiliary_loss_clip": 0.01129713, "auxiliary_loss_mlp": 0.00763618, "balance_loss_clip": 1.04945779, "balance_loss_mlp": 1.00006223, "epoch": 0.4790476763061384, "flos": 23075586783360.0, "grad_norm": 1.4386954195740431, "language_loss": 0.80384129, "learning_rate": 2.2323864171270386e-06, "loss": 0.82277459, "num_input_tokens_seen": 85708290, "step": 3984, "time_per_iteration": 2.549447536468506 }, { "auxiliary_loss_clip": 0.01141761, "auxiliary_loss_mlp": 0.01029303, "balance_loss_clip": 1.04670978, "balance_loss_mlp": 1.0204339, "epoch": 0.4791679191967775, "flos": 21179288073600.0, "grad_norm": 1.7739503564224635, "language_loss": 0.72467792, "learning_rate": 2.231612704136164e-06, "loss": 0.74638855, "num_input_tokens_seen": 85728660, "step": 3985, "time_per_iteration": 2.5680487155914307 }, { "auxiliary_loss_clip": 0.01163455, "auxiliary_loss_mlp": 0.01029451, "balance_loss_clip": 1.04987931, "balance_loss_mlp": 1.02105272, "epoch": 0.4792881620874166, "flos": 22301495758080.0, "grad_norm": 2.430686904060973, "language_loss": 0.74837899, "learning_rate": 2.2308389560098253e-06, "loss": 0.77030802, "num_input_tokens_seen": 85745035, "step": 3986, "time_per_iteration": 2.4403202533721924 }, { "auxiliary_loss_clip": 0.01146213, "auxiliary_loss_mlp": 0.01028561, "balance_loss_clip": 1.05261624, "balance_loss_mlp": 1.01976955, "epoch": 0.47940840497805565, "flos": 17420877423360.0, "grad_norm": 3.8204621774113923, "language_loss": 0.77177101, "learning_rate": 2.2300651728654008e-06, "loss": 0.79351878, "num_input_tokens_seen": 85760295, "step": 3987, "time_per_iteration": 2.511549949645996 }, { "auxiliary_loss_clip": 0.01065503, "auxiliary_loss_mlp": 0.00753057, "balance_loss_clip": 1.0169965, "balance_loss_mlp": 0.9999547, "epoch": 0.47952864786869476, "flos": 65358175708800.0, "grad_norm": 0.7327198027798575, "language_loss": 0.60208541, "learning_rate": 2.229291354820272e-06, "loss": 0.62027103, "num_input_tokens_seen": 85821305, "step": 3988, "time_per_iteration": 3.0782713890075684 }, { "auxiliary_loss_clip": 0.01164959, "auxiliary_loss_mlp": 0.0103044, "balance_loss_clip": 1.04941332, "balance_loss_mlp": 1.02195215, "epoch": 0.47964889075933387, "flos": 16799802336000.0, "grad_norm": 1.9265582463770996, "language_loss": 0.76056862, "learning_rate": 2.228517501991828e-06, "loss": 0.78252256, "num_input_tokens_seen": 85840105, "step": 3989, "time_per_iteration": 2.4439468383789062 }, { "auxiliary_loss_clip": 0.01057014, "auxiliary_loss_mlp": 0.01003261, "balance_loss_clip": 1.01804996, "balance_loss_mlp": 1.0021286, "epoch": 0.4797691336499729, "flos": 70079244808320.0, "grad_norm": 0.8269018489069816, "language_loss": 0.6105842, "learning_rate": 2.22774361449746e-06, "loss": 0.6311869, "num_input_tokens_seen": 85896585, "step": 3990, "time_per_iteration": 3.928361654281616 }, { "auxiliary_loss_clip": 0.0111311, "auxiliary_loss_mlp": 0.01027864, "balance_loss_clip": 1.04853439, "balance_loss_mlp": 1.01973951, "epoch": 0.47988937654061203, "flos": 18953329317120.0, "grad_norm": 2.865706420917079, "language_loss": 0.70823878, "learning_rate": 2.2269696924545668e-06, "loss": 0.72964853, "num_input_tokens_seen": 85914415, "step": 3991, "time_per_iteration": 2.58437442779541 }, { "auxiliary_loss_clip": 0.01142927, "auxiliary_loss_mlp": 0.01029191, "balance_loss_clip": 1.05286992, "balance_loss_mlp": 1.02127528, "epoch": 0.48000961943125114, "flos": 14461981649280.0, "grad_norm": 2.3733513305385134, "language_loss": 0.78378046, "learning_rate": 2.2261957359805523e-06, "loss": 0.80550164, "num_input_tokens_seen": 85931650, "step": 3992, "time_per_iteration": 2.5114989280700684 }, { "auxiliary_loss_clip": 0.01182526, "auxiliary_loss_mlp": 0.01023449, "balance_loss_clip": 1.05332077, "balance_loss_mlp": 1.01557255, "epoch": 0.4801298623218902, "flos": 27051149105280.0, "grad_norm": 2.7940978832993286, "language_loss": 0.74187064, "learning_rate": 2.225421745192823e-06, "loss": 0.76393032, "num_input_tokens_seen": 85951805, "step": 3993, "time_per_iteration": 3.326186180114746 }, { "auxiliary_loss_clip": 0.01166701, "auxiliary_loss_mlp": 0.01029029, "balance_loss_clip": 1.05206537, "balance_loss_mlp": 1.02068472, "epoch": 0.4802501052125293, "flos": 26355236031360.0, "grad_norm": 2.1713753448950857, "language_loss": 0.78271371, "learning_rate": 2.2246477202087955e-06, "loss": 0.80467105, "num_input_tokens_seen": 85972485, "step": 3994, "time_per_iteration": 2.505716562271118 }, { "auxiliary_loss_clip": 0.01155824, "auxiliary_loss_mlp": 0.01031157, "balance_loss_clip": 1.05045068, "balance_loss_mlp": 1.02380204, "epoch": 0.4803703481031684, "flos": 20993916960000.0, "grad_norm": 1.5563073175802402, "language_loss": 0.8277812, "learning_rate": 2.223873661145887e-06, "loss": 0.84965098, "num_input_tokens_seen": 85992540, "step": 3995, "time_per_iteration": 3.3509137630462646 }, { "auxiliary_loss_clip": 0.01155156, "auxiliary_loss_mlp": 0.00762901, "balance_loss_clip": 1.05538583, "balance_loss_mlp": 1.00001454, "epoch": 0.4804905909938075, "flos": 20703722981760.0, "grad_norm": 1.5493262984513207, "language_loss": 0.71310496, "learning_rate": 2.2230995681215226e-06, "loss": 0.73228556, "num_input_tokens_seen": 86012065, "step": 3996, "time_per_iteration": 2.5059707164764404 }, { "auxiliary_loss_clip": 0.01138284, "auxiliary_loss_mlp": 0.0102645, "balance_loss_clip": 1.05118155, "balance_loss_mlp": 1.01886284, "epoch": 0.4806108338844466, "flos": 16654831044480.0, "grad_norm": 2.3737159898476636, "language_loss": 0.77964944, "learning_rate": 2.2223254412531305e-06, "loss": 0.80129671, "num_input_tokens_seen": 86029435, "step": 3997, "time_per_iteration": 3.2671427726745605 }, { "auxiliary_loss_clip": 0.01138616, "auxiliary_loss_mlp": 0.01025982, "balance_loss_clip": 1.04543066, "balance_loss_mlp": 1.01820016, "epoch": 0.4807310767750857, "flos": 20011329440640.0, "grad_norm": 1.8537446652298992, "language_loss": 0.82521695, "learning_rate": 2.221551280658146e-06, "loss": 0.84686291, "num_input_tokens_seen": 86048495, "step": 3998, "time_per_iteration": 2.47847056388855 }, { "auxiliary_loss_clip": 0.01122415, "auxiliary_loss_mlp": 0.01025565, "balance_loss_clip": 1.0481379, "balance_loss_mlp": 1.01785791, "epoch": 0.48085131966572475, "flos": 23185257984000.0, "grad_norm": 1.6348183843005455, "language_loss": 0.7434231, "learning_rate": 2.2207770864540085e-06, "loss": 0.76490295, "num_input_tokens_seen": 86067470, "step": 3999, "time_per_iteration": 2.564797878265381 }, { "auxiliary_loss_clip": 0.01146218, "auxiliary_loss_mlp": 0.01029356, "balance_loss_clip": 1.0497179, "balance_loss_mlp": 1.02116036, "epoch": 0.48097156255636386, "flos": 20558643949440.0, "grad_norm": 2.026055341939582, "language_loss": 0.73038745, "learning_rate": 2.220002858758162e-06, "loss": 0.75214314, "num_input_tokens_seen": 86085460, "step": 4000, "time_per_iteration": 2.4935972690582275 }, { "auxiliary_loss_clip": 0.01071435, "auxiliary_loss_mlp": 0.0100087, "balance_loss_clip": 1.01699233, "balance_loss_mlp": 0.99974364, "epoch": 0.481091805447003, "flos": 70511608817280.0, "grad_norm": 0.9024897626198788, "language_loss": 0.60884935, "learning_rate": 2.2192285976880573e-06, "loss": 0.62957239, "num_input_tokens_seen": 86149715, "step": 4001, "time_per_iteration": 3.0582528114318848 }, { "auxiliary_loss_clip": 0.01143968, "auxiliary_loss_mlp": 0.00762087, "balance_loss_clip": 1.04902232, "balance_loss_mlp": 1.00003994, "epoch": 0.48121204833764203, "flos": 36428214839040.0, "grad_norm": 1.5436701532378354, "language_loss": 0.80716383, "learning_rate": 2.2184543033611485e-06, "loss": 0.82622439, "num_input_tokens_seen": 86170795, "step": 4002, "time_per_iteration": 2.66536283493042 }, { "auxiliary_loss_clip": 0.01169806, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.05230081, "balance_loss_mlp": 1.02324176, "epoch": 0.48133229122828114, "flos": 27490264871040.0, "grad_norm": 2.255906294504358, "language_loss": 0.8202607, "learning_rate": 2.2176799758948957e-06, "loss": 0.84226793, "num_input_tokens_seen": 86190955, "step": 4003, "time_per_iteration": 2.516401529312134 }, { "auxiliary_loss_clip": 0.01149143, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 1.05103505, "balance_loss_mlp": 1.02409852, "epoch": 0.4814525341189202, "flos": 43072802179200.0, "grad_norm": 1.8202217279820037, "language_loss": 0.73234135, "learning_rate": 2.2169056154067635e-06, "loss": 0.7541517, "num_input_tokens_seen": 86214875, "step": 4004, "time_per_iteration": 2.7078208923339844 }, { "auxiliary_loss_clip": 0.01171497, "auxiliary_loss_mlp": 0.00762869, "balance_loss_clip": 1.05524826, "balance_loss_mlp": 1.00001323, "epoch": 0.4815727770095593, "flos": 24236901400320.0, "grad_norm": 1.8220416222679847, "language_loss": 0.82599258, "learning_rate": 2.216131222014222e-06, "loss": 0.84533632, "num_input_tokens_seen": 86232950, "step": 4005, "time_per_iteration": 2.5012128353118896 }, { "auxiliary_loss_clip": 0.01135469, "auxiliary_loss_mlp": 0.01032463, "balance_loss_clip": 1.0492537, "balance_loss_mlp": 1.02397561, "epoch": 0.4816930199001984, "flos": 18113630100480.0, "grad_norm": 2.1702239270510395, "language_loss": 0.79992437, "learning_rate": 2.2153567958347455e-06, "loss": 0.82160366, "num_input_tokens_seen": 86249160, "step": 4006, "time_per_iteration": 2.4960286617279053 }, { "auxiliary_loss_clip": 0.0115401, "auxiliary_loss_mlp": 0.0102627, "balance_loss_clip": 1.05311728, "balance_loss_mlp": 1.01799643, "epoch": 0.48181326279083747, "flos": 17274720983040.0, "grad_norm": 2.036974211627786, "language_loss": 0.80058861, "learning_rate": 2.214582336985815e-06, "loss": 0.82239139, "num_input_tokens_seen": 86267060, "step": 4007, "time_per_iteration": 2.470130205154419 }, { "auxiliary_loss_clip": 0.01144592, "auxiliary_loss_mlp": 0.01029109, "balance_loss_clip": 1.04900789, "balance_loss_mlp": 1.02032948, "epoch": 0.4819335056814766, "flos": 14903252231040.0, "grad_norm": 2.197255688775241, "language_loss": 0.66410673, "learning_rate": 2.2138078455849142e-06, "loss": 0.68584371, "num_input_tokens_seen": 86285055, "step": 4008, "time_per_iteration": 2.46886944770813 }, { "auxiliary_loss_clip": 0.01175495, "auxiliary_loss_mlp": 0.01029232, "balance_loss_clip": 1.05457318, "balance_loss_mlp": 1.02145362, "epoch": 0.4820537485721157, "flos": 19244888012160.0, "grad_norm": 2.0553792454781026, "language_loss": 0.78387129, "learning_rate": 2.2130333217495334e-06, "loss": 0.80591857, "num_input_tokens_seen": 86304225, "step": 4009, "time_per_iteration": 2.4619975090026855 }, { "auxiliary_loss_clip": 0.01150172, "auxiliary_loss_mlp": 0.01027808, "balance_loss_clip": 1.05039942, "balance_loss_mlp": 1.01979709, "epoch": 0.48217399146275475, "flos": 16033791870720.0, "grad_norm": 3.1064920092466664, "language_loss": 0.67348146, "learning_rate": 2.2122587655971665e-06, "loss": 0.69526124, "num_input_tokens_seen": 86319170, "step": 4010, "time_per_iteration": 2.5164031982421875 }, { "auxiliary_loss_clip": 0.01154258, "auxiliary_loss_mlp": 0.01028917, "balance_loss_clip": 1.05053461, "balance_loss_mlp": 1.02138257, "epoch": 0.48229423435339386, "flos": 24134197438080.0, "grad_norm": 1.6605789856705067, "language_loss": 0.6407125, "learning_rate": 2.211484177245314e-06, "loss": 0.66254425, "num_input_tokens_seen": 86338760, "step": 4011, "time_per_iteration": 2.512725353240967 }, { "auxiliary_loss_clip": 0.01185934, "auxiliary_loss_mlp": 0.01032464, "balance_loss_clip": 1.05541587, "balance_loss_mlp": 1.02424407, "epoch": 0.48241447724403297, "flos": 23805435231360.0, "grad_norm": 1.9295960021644893, "language_loss": 0.71834379, "learning_rate": 2.21070955681148e-06, "loss": 0.74052781, "num_input_tokens_seen": 86357865, "step": 4012, "time_per_iteration": 2.449613332748413 }, { "auxiliary_loss_clip": 0.01132562, "auxiliary_loss_mlp": 0.0102813, "balance_loss_clip": 1.04869962, "balance_loss_mlp": 1.02030432, "epoch": 0.482534720134672, "flos": 23110312256640.0, "grad_norm": 1.5838853825164083, "language_loss": 0.77872318, "learning_rate": 2.209934904413174e-06, "loss": 0.8003301, "num_input_tokens_seen": 86379470, "step": 4013, "time_per_iteration": 2.5532708168029785 }, { "auxiliary_loss_clip": 0.01104976, "auxiliary_loss_mlp": 0.0102861, "balance_loss_clip": 1.03781271, "balance_loss_mlp": 1.01998472, "epoch": 0.48265496302531113, "flos": 20923819568640.0, "grad_norm": 1.8990438432396708, "language_loss": 0.71644902, "learning_rate": 2.2091602201679095e-06, "loss": 0.73778486, "num_input_tokens_seen": 86399080, "step": 4014, "time_per_iteration": 2.612545967102051 }, { "auxiliary_loss_clip": 0.01146133, "auxiliary_loss_mlp": 0.01025539, "balance_loss_clip": 1.05163813, "balance_loss_mlp": 1.01771331, "epoch": 0.48277520591595025, "flos": 15231152511360.0, "grad_norm": 2.346585112815046, "language_loss": 0.83471477, "learning_rate": 2.208385504193206e-06, "loss": 0.85643142, "num_input_tokens_seen": 86416580, "step": 4015, "time_per_iteration": 2.585113763809204 }, { "auxiliary_loss_clip": 0.0118359, "auxiliary_loss_mlp": 0.01020524, "balance_loss_clip": 1.0540421, "balance_loss_mlp": 1.01265633, "epoch": 0.4828954488065893, "flos": 17858664385920.0, "grad_norm": 1.8973272530851735, "language_loss": 0.81183368, "learning_rate": 2.2076107566065873e-06, "loss": 0.83387482, "num_input_tokens_seen": 86434365, "step": 4016, "time_per_iteration": 2.4063713550567627 }, { "auxiliary_loss_clip": 0.01174544, "auxiliary_loss_mlp": 0.01031973, "balance_loss_clip": 1.05557704, "balance_loss_mlp": 1.02471304, "epoch": 0.4830156916972284, "flos": 32087405070720.0, "grad_norm": 2.1320415127706593, "language_loss": 0.75556278, "learning_rate": 2.2068359775255816e-06, "loss": 0.77762794, "num_input_tokens_seen": 86452675, "step": 4017, "time_per_iteration": 3.383261203765869 }, { "auxiliary_loss_clip": 0.01121257, "auxiliary_loss_mlp": 0.01030258, "balance_loss_clip": 1.04743946, "balance_loss_mlp": 1.02215123, "epoch": 0.48313593458786747, "flos": 21871717528320.0, "grad_norm": 2.789894383423264, "language_loss": 0.77914602, "learning_rate": 2.206061167067723e-06, "loss": 0.80066115, "num_input_tokens_seen": 86470785, "step": 4018, "time_per_iteration": 2.5599255561828613 }, { "auxiliary_loss_clip": 0.01137514, "auxiliary_loss_mlp": 0.01026552, "balance_loss_clip": 1.04612303, "balance_loss_mlp": 1.01775455, "epoch": 0.4832561774785066, "flos": 22601206840320.0, "grad_norm": 1.9769122285139347, "language_loss": 0.79428267, "learning_rate": 2.205286325350549e-06, "loss": 0.81592333, "num_input_tokens_seen": 86489850, "step": 4019, "time_per_iteration": 3.403686761856079 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01028771, "balance_loss_clip": 1.04873157, "balance_loss_mlp": 1.0213325, "epoch": 0.4833764203691457, "flos": 13437342282240.0, "grad_norm": 4.350399558320729, "language_loss": 0.72452468, "learning_rate": 2.204511452491603e-06, "loss": 0.74608022, "num_input_tokens_seen": 86506475, "step": 4020, "time_per_iteration": 2.5112051963806152 }, { "auxiliary_loss_clip": 0.01182201, "auxiliary_loss_mlp": 0.0103085, "balance_loss_clip": 1.05754459, "balance_loss_mlp": 1.02304745, "epoch": 0.48349666325978474, "flos": 44128036955520.0, "grad_norm": 1.7958611125963242, "language_loss": 0.74959695, "learning_rate": 2.2037365486084316e-06, "loss": 0.77172744, "num_input_tokens_seen": 86529715, "step": 4021, "time_per_iteration": 3.460017442703247 }, { "auxiliary_loss_clip": 0.01150004, "auxiliary_loss_mlp": 0.01030309, "balance_loss_clip": 1.04870439, "balance_loss_mlp": 1.02190423, "epoch": 0.48361690615042385, "flos": 26028377245440.0, "grad_norm": 2.422226814400363, "language_loss": 0.78060877, "learning_rate": 2.2029616138185886e-06, "loss": 0.80241197, "num_input_tokens_seen": 86548715, "step": 4022, "time_per_iteration": 2.5632619857788086 }, { "auxiliary_loss_clip": 0.01143809, "auxiliary_loss_mlp": 0.01030575, "balance_loss_clip": 1.05586326, "balance_loss_mlp": 1.02272809, "epoch": 0.48373714904106296, "flos": 22273306560000.0, "grad_norm": 1.6361565300408787, "language_loss": 0.82443738, "learning_rate": 2.202186648239629e-06, "loss": 0.84618121, "num_input_tokens_seen": 86568650, "step": 4023, "time_per_iteration": 2.533316135406494 }, { "auxiliary_loss_clip": 0.01169299, "auxiliary_loss_mlp": 0.01027875, "balance_loss_clip": 1.05483937, "balance_loss_mlp": 1.02019751, "epoch": 0.483857391931702, "flos": 28292293699200.0, "grad_norm": 1.7159135744511294, "language_loss": 0.71730506, "learning_rate": 2.201411651989117e-06, "loss": 0.73927683, "num_input_tokens_seen": 86590630, "step": 4024, "time_per_iteration": 3.3014490604400635 }, { "auxiliary_loss_clip": 0.01157861, "auxiliary_loss_mlp": 0.00762423, "balance_loss_clip": 1.05555797, "balance_loss_mlp": 1.00001144, "epoch": 0.48397763482234113, "flos": 27418048577280.0, "grad_norm": 1.9426368383318275, "language_loss": 0.7784096, "learning_rate": 2.2006366251846167e-06, "loss": 0.79761243, "num_input_tokens_seen": 86611270, "step": 4025, "time_per_iteration": 2.5484061241149902 }, { "auxiliary_loss_clip": 0.01156138, "auxiliary_loss_mlp": 0.0103026, "balance_loss_clip": 1.05429506, "balance_loss_mlp": 1.02299428, "epoch": 0.48409787771298024, "flos": 16797252470400.0, "grad_norm": 1.7070869882399438, "language_loss": 0.75565135, "learning_rate": 2.1998615679436997e-06, "loss": 0.77751541, "num_input_tokens_seen": 86628810, "step": 4026, "time_per_iteration": 2.4667627811431885 }, { "auxiliary_loss_clip": 0.01165313, "auxiliary_loss_mlp": 0.01030019, "balance_loss_clip": 1.05495644, "balance_loss_mlp": 1.0214715, "epoch": 0.4842181206036193, "flos": 25083496028160.0, "grad_norm": 2.0833561273461214, "language_loss": 0.77035522, "learning_rate": 2.199086480383942e-06, "loss": 0.79230851, "num_input_tokens_seen": 86648185, "step": 4027, "time_per_iteration": 2.5388615131378174 }, { "auxiliary_loss_clip": 0.01169003, "auxiliary_loss_mlp": 0.0103441, "balance_loss_clip": 1.05447793, "balance_loss_mlp": 1.02546382, "epoch": 0.4843383634942584, "flos": 30372311496960.0, "grad_norm": 3.164689330864456, "language_loss": 0.67496449, "learning_rate": 2.1983113626229234e-06, "loss": 0.6969986, "num_input_tokens_seen": 86667435, "step": 4028, "time_per_iteration": 2.5717720985412598 }, { "auxiliary_loss_clip": 0.01137424, "auxiliary_loss_mlp": 0.0076264, "balance_loss_clip": 1.0501976, "balance_loss_mlp": 1.00003588, "epoch": 0.4844586063848975, "flos": 20413564917120.0, "grad_norm": 1.791005216862981, "language_loss": 0.78418124, "learning_rate": 2.1975362147782293e-06, "loss": 0.80318189, "num_input_tokens_seen": 86686630, "step": 4029, "time_per_iteration": 2.5636374950408936 }, { "auxiliary_loss_clip": 0.010733, "auxiliary_loss_mlp": 0.01002223, "balance_loss_clip": 1.03510737, "balance_loss_mlp": 1.00051832, "epoch": 0.48457884927553657, "flos": 70303722854400.0, "grad_norm": 0.693168743604907, "language_loss": 0.54143679, "learning_rate": 2.196761036967448e-06, "loss": 0.56219196, "num_input_tokens_seen": 86754595, "step": 4030, "time_per_iteration": 3.2325029373168945 }, { "auxiliary_loss_clip": 0.01165098, "auxiliary_loss_mlp": 0.0102481, "balance_loss_clip": 1.05240369, "balance_loss_mlp": 1.01714468, "epoch": 0.4846990921661757, "flos": 19934516206080.0, "grad_norm": 1.615163937148822, "language_loss": 0.77574557, "learning_rate": 2.1959858293081743e-06, "loss": 0.79764467, "num_input_tokens_seen": 86773730, "step": 4031, "time_per_iteration": 2.475933790206909 }, { "auxiliary_loss_clip": 0.01139956, "auxiliary_loss_mlp": 0.0102794, "balance_loss_clip": 1.05192852, "balance_loss_mlp": 1.01986337, "epoch": 0.4848193350568148, "flos": 23075945919360.0, "grad_norm": 1.6198280526942903, "language_loss": 0.76113212, "learning_rate": 2.1952105919180056e-06, "loss": 0.78281111, "num_input_tokens_seen": 86792985, "step": 4032, "time_per_iteration": 2.5688636302948 }, { "auxiliary_loss_clip": 0.01156093, "auxiliary_loss_mlp": 0.0102073, "balance_loss_clip": 1.05360687, "balance_loss_mlp": 1.01236784, "epoch": 0.48493957794745385, "flos": 22455481363200.0, "grad_norm": 2.257211362095846, "language_loss": 0.67835373, "learning_rate": 2.1944353249145456e-06, "loss": 0.700122, "num_input_tokens_seen": 86812095, "step": 4033, "time_per_iteration": 2.550600528717041 }, { "auxiliary_loss_clip": 0.01185952, "auxiliary_loss_mlp": 0.01030916, "balance_loss_clip": 1.05857873, "balance_loss_mlp": 1.02329826, "epoch": 0.48505982083809296, "flos": 25046112948480.0, "grad_norm": 1.453879475665238, "language_loss": 0.74531084, "learning_rate": 2.193660028415401e-06, "loss": 0.76747954, "num_input_tokens_seen": 86832875, "step": 4034, "time_per_iteration": 2.4884185791015625 }, { "auxiliary_loss_clip": 0.01147805, "auxiliary_loss_mlp": 0.01023538, "balance_loss_clip": 1.05176902, "balance_loss_mlp": 1.01554513, "epoch": 0.485180063728732, "flos": 26761386090240.0, "grad_norm": 2.288873035078123, "language_loss": 0.82473969, "learning_rate": 2.1928847025381852e-06, "loss": 0.84645307, "num_input_tokens_seen": 86853480, "step": 4035, "time_per_iteration": 2.567490577697754 }, { "auxiliary_loss_clip": 0.01168067, "auxiliary_loss_mlp": 0.01027018, "balance_loss_clip": 1.05062187, "balance_loss_mlp": 1.01819611, "epoch": 0.4853003066193711, "flos": 24059143969920.0, "grad_norm": 1.647676469350279, "language_loss": 0.83809036, "learning_rate": 2.192109347400512e-06, "loss": 0.8600412, "num_input_tokens_seen": 86873695, "step": 4036, "time_per_iteration": 2.5026042461395264 }, { "auxiliary_loss_clip": 0.01158305, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.05186319, "balance_loss_mlp": 1.02411163, "epoch": 0.48542054951001024, "flos": 23076376882560.0, "grad_norm": 1.6434095893706393, "language_loss": 0.79051471, "learning_rate": 2.191333963120004e-06, "loss": 0.81242764, "num_input_tokens_seen": 86892675, "step": 4037, "time_per_iteration": 2.514356851577759 }, { "auxiliary_loss_clip": 0.01158109, "auxiliary_loss_mlp": 0.01032314, "balance_loss_clip": 1.05376327, "balance_loss_mlp": 1.02360606, "epoch": 0.4855407924006493, "flos": 25664889565440.0, "grad_norm": 2.4253763732347147, "language_loss": 0.70460498, "learning_rate": 2.190558549814286e-06, "loss": 0.72650921, "num_input_tokens_seen": 86912835, "step": 4038, "time_per_iteration": 2.537055492401123 }, { "auxiliary_loss_clip": 0.01154999, "auxiliary_loss_mlp": 0.01026686, "balance_loss_clip": 1.05221272, "balance_loss_mlp": 1.01881254, "epoch": 0.4856610352912884, "flos": 23987933256960.0, "grad_norm": 1.7292565221305087, "language_loss": 0.79308236, "learning_rate": 2.1897831076009872e-06, "loss": 0.81489921, "num_input_tokens_seen": 86932475, "step": 4039, "time_per_iteration": 2.512850761413574 }, { "auxiliary_loss_clip": 0.01170373, "auxiliary_loss_mlp": 0.01025949, "balance_loss_clip": 1.05404365, "balance_loss_mlp": 1.01807523, "epoch": 0.4857812781819275, "flos": 24096814358400.0, "grad_norm": 1.613813208040834, "language_loss": 0.80333507, "learning_rate": 2.1890076365977426e-06, "loss": 0.82529831, "num_input_tokens_seen": 86952300, "step": 4040, "time_per_iteration": 2.4895386695861816 }, { "auxiliary_loss_clip": 0.01056137, "auxiliary_loss_mlp": 0.01007984, "balance_loss_clip": 1.01821899, "balance_loss_mlp": 1.00675011, "epoch": 0.48590152107256657, "flos": 56266635185280.0, "grad_norm": 0.8551880458968137, "language_loss": 0.52817249, "learning_rate": 2.188232136922189e-06, "loss": 0.5488137, "num_input_tokens_seen": 87010420, "step": 4041, "time_per_iteration": 3.0015077590942383 }, { "auxiliary_loss_clip": 0.01111436, "auxiliary_loss_mlp": 0.01030993, "balance_loss_clip": 1.04953575, "balance_loss_mlp": 1.0224216, "epoch": 0.4860217639632057, "flos": 20046988667520.0, "grad_norm": 1.843162427098352, "language_loss": 0.75431663, "learning_rate": 2.187456608691971e-06, "loss": 0.77574092, "num_input_tokens_seen": 87029295, "step": 4042, "time_per_iteration": 2.5970029830932617 }, { "auxiliary_loss_clip": 0.01149956, "auxiliary_loss_mlp": 0.01032229, "balance_loss_clip": 1.05697215, "balance_loss_mlp": 1.02371144, "epoch": 0.4861420068538448, "flos": 17822143232640.0, "grad_norm": 1.8753014925101452, "language_loss": 0.87677664, "learning_rate": 2.1866810520247334e-06, "loss": 0.89859855, "num_input_tokens_seen": 87048165, "step": 4043, "time_per_iteration": 2.5197627544403076 }, { "auxiliary_loss_clip": 0.01174997, "auxiliary_loss_mlp": 0.01024559, "balance_loss_clip": 1.05332828, "balance_loss_mlp": 1.01584458, "epoch": 0.48626224974448384, "flos": 26250125857920.0, "grad_norm": 2.3554491165112736, "language_loss": 0.64988488, "learning_rate": 2.185905467038129e-06, "loss": 0.67188048, "num_input_tokens_seen": 87067070, "step": 4044, "time_per_iteration": 3.413688898086548 }, { "auxiliary_loss_clip": 0.01183121, "auxiliary_loss_mlp": 0.01025501, "balance_loss_clip": 1.05732, "balance_loss_mlp": 1.01768053, "epoch": 0.48638249263512295, "flos": 22054502862720.0, "grad_norm": 2.0463745882145945, "language_loss": 0.77482259, "learning_rate": 2.1851298538498127e-06, "loss": 0.7969088, "num_input_tokens_seen": 87086785, "step": 4045, "time_per_iteration": 2.4491310119628906 }, { "auxiliary_loss_clip": 0.01177968, "auxiliary_loss_mlp": 0.00763345, "balance_loss_clip": 1.05739164, "balance_loss_mlp": 1.00016522, "epoch": 0.48650273552576206, "flos": 25119945354240.0, "grad_norm": 1.8469257868097886, "language_loss": 0.79962122, "learning_rate": 2.184354212577446e-06, "loss": 0.81903434, "num_input_tokens_seen": 87107090, "step": 4046, "time_per_iteration": 3.3428757190704346 }, { "auxiliary_loss_clip": 0.01187616, "auxiliary_loss_mlp": 0.01026972, "balance_loss_clip": 1.05630851, "balance_loss_mlp": 1.01869869, "epoch": 0.4866229784164011, "flos": 17456931699840.0, "grad_norm": 4.218166042504802, "language_loss": 0.62811887, "learning_rate": 2.1835785433386907e-06, "loss": 0.65026474, "num_input_tokens_seen": 87125905, "step": 4047, "time_per_iteration": 3.2227940559387207 }, { "auxiliary_loss_clip": 0.01132377, "auxiliary_loss_mlp": 0.01031397, "balance_loss_clip": 1.05056989, "balance_loss_mlp": 1.02305818, "epoch": 0.48674322130704023, "flos": 23331127115520.0, "grad_norm": 1.8853798710415162, "language_loss": 0.65365708, "learning_rate": 2.182802846251216e-06, "loss": 0.67529476, "num_input_tokens_seen": 87146175, "step": 4048, "time_per_iteration": 2.538496494293213 }, { "auxiliary_loss_clip": 0.01147126, "auxiliary_loss_mlp": 0.0102621, "balance_loss_clip": 1.05112743, "balance_loss_mlp": 1.01804423, "epoch": 0.4868634641976793, "flos": 28804344030720.0, "grad_norm": 2.2938470278100827, "language_loss": 0.72392857, "learning_rate": 2.182027121432696e-06, "loss": 0.74566191, "num_input_tokens_seen": 87166800, "step": 4049, "time_per_iteration": 2.5950021743774414 }, { "auxiliary_loss_clip": 0.01188154, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.05713403, "balance_loss_mlp": 1.02131259, "epoch": 0.4869837070883184, "flos": 19025976574080.0, "grad_norm": 1.7345402287166836, "language_loss": 0.82433403, "learning_rate": 2.1812513690008054e-06, "loss": 0.84652013, "num_input_tokens_seen": 87185920, "step": 4050, "time_per_iteration": 2.4374701976776123 }, { "auxiliary_loss_clip": 0.01178251, "auxiliary_loss_mlp": 0.01028711, "balance_loss_clip": 1.05663967, "balance_loss_mlp": 1.01998472, "epoch": 0.4871039499789575, "flos": 15121409483520.0, "grad_norm": 2.0933243825728116, "language_loss": 0.79741073, "learning_rate": 2.180475589073227e-06, "loss": 0.8194803, "num_input_tokens_seen": 87203620, "step": 4051, "time_per_iteration": 3.2058520317077637 }, { "auxiliary_loss_clip": 0.0116048, "auxiliary_loss_mlp": 0.01024475, "balance_loss_clip": 1.05248952, "balance_loss_mlp": 1.01639235, "epoch": 0.48722419286959656, "flos": 26174066808960.0, "grad_norm": 1.59038909181476, "language_loss": 0.73389429, "learning_rate": 2.1796997817676456e-06, "loss": 0.75574374, "num_input_tokens_seen": 87224630, "step": 4052, "time_per_iteration": 2.5167155265808105 }, { "auxiliary_loss_clip": 0.01172781, "auxiliary_loss_mlp": 0.00762206, "balance_loss_clip": 1.0552392, "balance_loss_mlp": 1.00008011, "epoch": 0.4873444357602357, "flos": 24026142349440.0, "grad_norm": 1.5389986628754269, "language_loss": 0.67266291, "learning_rate": 2.1789239472017494e-06, "loss": 0.69201279, "num_input_tokens_seen": 87246280, "step": 4053, "time_per_iteration": 2.5113210678100586 }, { "auxiliary_loss_clip": 0.0114349, "auxiliary_loss_mlp": 0.01033206, "balance_loss_clip": 1.05210698, "balance_loss_mlp": 1.02468824, "epoch": 0.4874646786508748, "flos": 22820441500800.0, "grad_norm": 1.9762733860961847, "language_loss": 0.7297858, "learning_rate": 2.1781480854932326e-06, "loss": 0.75155276, "num_input_tokens_seen": 87266045, "step": 4054, "time_per_iteration": 2.5399374961853027 }, { "auxiliary_loss_clip": 0.01126496, "auxiliary_loss_mlp": 0.0103157, "balance_loss_clip": 1.05019116, "balance_loss_mlp": 1.02375579, "epoch": 0.48758492154151384, "flos": 21287594557440.0, "grad_norm": 1.8853190002157212, "language_loss": 0.79476541, "learning_rate": 2.1773721967597933e-06, "loss": 0.81634605, "num_input_tokens_seen": 87284495, "step": 4055, "time_per_iteration": 2.597148895263672 }, { "auxiliary_loss_clip": 0.01049061, "auxiliary_loss_mlp": 0.01001406, "balance_loss_clip": 1.01509893, "balance_loss_mlp": 1.00019598, "epoch": 0.48770516443215295, "flos": 62244109180800.0, "grad_norm": 0.8504286983227286, "language_loss": 0.57372952, "learning_rate": 2.1765962811191322e-06, "loss": 0.59423423, "num_input_tokens_seen": 87338960, "step": 4056, "time_per_iteration": 2.999025344848633 }, { "auxiliary_loss_clip": 0.01037147, "auxiliary_loss_mlp": 0.0100351, "balance_loss_clip": 1.02121603, "balance_loss_mlp": 1.00238371, "epoch": 0.48782540732279206, "flos": 66133451882880.0, "grad_norm": 0.8236176291206496, "language_loss": 0.62008166, "learning_rate": 2.1758203386889566e-06, "loss": 0.64048827, "num_input_tokens_seen": 87401730, "step": 4057, "time_per_iteration": 3.179783821105957 }, { "auxiliary_loss_clip": 0.01140356, "auxiliary_loss_mlp": 0.00762801, "balance_loss_clip": 1.04939401, "balance_loss_mlp": 1.00007892, "epoch": 0.4879456502134311, "flos": 14607922608000.0, "grad_norm": 2.185603282625709, "language_loss": 0.84332621, "learning_rate": 2.1750443695869746e-06, "loss": 0.8623578, "num_input_tokens_seen": 87417300, "step": 4058, "time_per_iteration": 2.5328805446624756 }, { "auxiliary_loss_clip": 0.01172297, "auxiliary_loss_mlp": 0.01029766, "balance_loss_clip": 1.05412626, "balance_loss_mlp": 1.02167809, "epoch": 0.4880658931040702, "flos": 19500464257920.0, "grad_norm": 2.0124907170253756, "language_loss": 0.85885882, "learning_rate": 2.174268373930901e-06, "loss": 0.88087946, "num_input_tokens_seen": 87434815, "step": 4059, "time_per_iteration": 2.495298385620117 }, { "auxiliary_loss_clip": 0.01137662, "auxiliary_loss_mlp": 0.00763332, "balance_loss_clip": 1.05353796, "balance_loss_mlp": 1.00006366, "epoch": 0.48818613599470934, "flos": 16723060928640.0, "grad_norm": 2.077818884417373, "language_loss": 0.79981762, "learning_rate": 2.1734923518384537e-06, "loss": 0.81882757, "num_input_tokens_seen": 87451420, "step": 4060, "time_per_iteration": 2.490832805633545 }, { "auxiliary_loss_clip": 0.01127164, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.05108762, "balance_loss_mlp": 1.02489567, "epoch": 0.4883063788853484, "flos": 26756932803840.0, "grad_norm": 1.7850121293091334, "language_loss": 0.82266873, "learning_rate": 2.1727163034273547e-06, "loss": 0.84426749, "num_input_tokens_seen": 87469585, "step": 4061, "time_per_iteration": 2.587679624557495 }, { "auxiliary_loss_clip": 0.01172929, "auxiliary_loss_mlp": 0.01024303, "balance_loss_clip": 1.05345392, "balance_loss_mlp": 1.01585746, "epoch": 0.4884266217759875, "flos": 16763388923520.0, "grad_norm": 2.4614690520949862, "language_loss": 0.79285216, "learning_rate": 2.17194022881533e-06, "loss": 0.81482452, "num_input_tokens_seen": 87485675, "step": 4062, "time_per_iteration": 2.4265897274017334 }, { "auxiliary_loss_clip": 0.01158876, "auxiliary_loss_mlp": 0.01032421, "balance_loss_clip": 1.05140376, "balance_loss_mlp": 1.02390909, "epoch": 0.4885468646666266, "flos": 24207132003840.0, "grad_norm": 6.143432545327328, "language_loss": 0.67689884, "learning_rate": 2.1711641281201092e-06, "loss": 0.69881183, "num_input_tokens_seen": 87505605, "step": 4063, "time_per_iteration": 2.533766508102417 }, { "auxiliary_loss_clip": 0.01168889, "auxiliary_loss_mlp": 0.01026654, "balance_loss_clip": 1.05443549, "balance_loss_mlp": 1.01848829, "epoch": 0.48866710755726567, "flos": 14610795696000.0, "grad_norm": 2.0452451851030835, "language_loss": 0.79497457, "learning_rate": 2.1703880014594264e-06, "loss": 0.81693, "num_input_tokens_seen": 87523195, "step": 4064, "time_per_iteration": 2.443293333053589 }, { "auxiliary_loss_clip": 0.01124908, "auxiliary_loss_mlp": 0.01031485, "balance_loss_clip": 1.05432439, "balance_loss_mlp": 1.02397454, "epoch": 0.4887873504479048, "flos": 28804451771520.0, "grad_norm": 2.367919336367914, "language_loss": 0.73787099, "learning_rate": 2.1696118489510182e-06, "loss": 0.75943494, "num_input_tokens_seen": 87544125, "step": 4065, "time_per_iteration": 2.6290082931518555 }, { "auxiliary_loss_clip": 0.01147638, "auxiliary_loss_mlp": 0.00763423, "balance_loss_clip": 1.0521009, "balance_loss_mlp": 1.00004148, "epoch": 0.48890759333854383, "flos": 22784387224320.0, "grad_norm": 1.7240674376886689, "language_loss": 0.72531772, "learning_rate": 2.1688356707126286e-06, "loss": 0.74442828, "num_input_tokens_seen": 87563745, "step": 4066, "time_per_iteration": 2.549661159515381 }, { "auxiliary_loss_clip": 0.01137688, "auxiliary_loss_mlp": 0.01028774, "balance_loss_clip": 1.04970789, "balance_loss_mlp": 1.02037561, "epoch": 0.48902783622918294, "flos": 17786088956160.0, "grad_norm": 3.0225175160789726, "language_loss": 0.70128107, "learning_rate": 2.168059466862001e-06, "loss": 0.72294563, "num_input_tokens_seen": 87581895, "step": 4067, "time_per_iteration": 2.527517318725586 }, { "auxiliary_loss_clip": 0.01155495, "auxiliary_loss_mlp": 0.01027418, "balance_loss_clip": 1.04899478, "balance_loss_mlp": 1.0198009, "epoch": 0.48914807911982205, "flos": 22310294590080.0, "grad_norm": 1.9855676841262162, "language_loss": 0.8183217, "learning_rate": 2.167283237516887e-06, "loss": 0.84015083, "num_input_tokens_seen": 87600170, "step": 4068, "time_per_iteration": 2.5178065299987793 }, { "auxiliary_loss_clip": 0.01159729, "auxiliary_loss_mlp": 0.01033758, "balance_loss_clip": 1.05256808, "balance_loss_mlp": 1.02556229, "epoch": 0.4892683220104611, "flos": 16363020954240.0, "grad_norm": 1.7547126209766728, "language_loss": 0.74762404, "learning_rate": 2.1665069827950383e-06, "loss": 0.76955891, "num_input_tokens_seen": 87617455, "step": 4069, "time_per_iteration": 2.485710859298706 }, { "auxiliary_loss_clip": 0.01157013, "auxiliary_loss_mlp": 0.01026805, "balance_loss_clip": 1.05280459, "balance_loss_mlp": 1.01913369, "epoch": 0.4893885649011002, "flos": 15739144606080.0, "grad_norm": 2.3863527753022313, "language_loss": 0.86746252, "learning_rate": 2.1657307028142126e-06, "loss": 0.88930064, "num_input_tokens_seen": 87634995, "step": 4070, "time_per_iteration": 3.367102861404419 }, { "auxiliary_loss_clip": 0.01156453, "auxiliary_loss_mlp": 0.01034344, "balance_loss_clip": 1.05322647, "balance_loss_mlp": 1.02535522, "epoch": 0.48950880779173933, "flos": 28581984887040.0, "grad_norm": 3.149404413788562, "language_loss": 0.67488945, "learning_rate": 2.164954397692171e-06, "loss": 0.69679743, "num_input_tokens_seen": 87654420, "step": 4071, "time_per_iteration": 2.571049690246582 }, { "auxiliary_loss_clip": 0.0106145, "auxiliary_loss_mlp": 0.01003267, "balance_loss_clip": 1.01723826, "balance_loss_mlp": 1.00205135, "epoch": 0.4896290506823784, "flos": 66186310746240.0, "grad_norm": 1.0803737716027464, "language_loss": 0.77365255, "learning_rate": 2.164178067546678e-06, "loss": 0.79429972, "num_input_tokens_seen": 87713585, "step": 4072, "time_per_iteration": 3.135568857192993 }, { "auxiliary_loss_clip": 0.01159518, "auxiliary_loss_mlp": 0.01030447, "balance_loss_clip": 1.04995441, "balance_loss_mlp": 1.02207875, "epoch": 0.4897492935730175, "flos": 12531065207040.0, "grad_norm": 1.9436197279462866, "language_loss": 0.90988368, "learning_rate": 2.163401712495504e-06, "loss": 0.93178332, "num_input_tokens_seen": 87731280, "step": 4073, "time_per_iteration": 3.352949380874634 }, { "auxiliary_loss_clip": 0.01135533, "auxiliary_loss_mlp": 0.01036582, "balance_loss_clip": 1.05201387, "balance_loss_mlp": 1.02802885, "epoch": 0.4898695364636566, "flos": 23476816679040.0, "grad_norm": 1.509612098462676, "language_loss": 0.79023528, "learning_rate": 2.1626253326564194e-06, "loss": 0.81195647, "num_input_tokens_seen": 87750230, "step": 4074, "time_per_iteration": 3.3941211700439453 }, { "auxiliary_loss_clip": 0.01153414, "auxiliary_loss_mlp": 0.01030634, "balance_loss_clip": 1.04936397, "balance_loss_mlp": 1.02222383, "epoch": 0.48998977935429566, "flos": 27160209774720.0, "grad_norm": 3.837295918962431, "language_loss": 0.76680166, "learning_rate": 2.161848928147201e-06, "loss": 0.78864217, "num_input_tokens_seen": 87770500, "step": 4075, "time_per_iteration": 2.560128927230835 }, { "auxiliary_loss_clip": 0.01171607, "auxiliary_loss_mlp": 0.01026177, "balance_loss_clip": 1.05549455, "balance_loss_mlp": 1.01795769, "epoch": 0.4901100222449348, "flos": 20339588856960.0, "grad_norm": 1.970392440756129, "language_loss": 0.8081584, "learning_rate": 2.161072499085629e-06, "loss": 0.8301363, "num_input_tokens_seen": 87789495, "step": 4076, "time_per_iteration": 2.478710651397705 }, { "auxiliary_loss_clip": 0.01147646, "auxiliary_loss_mlp": 0.01026849, "balance_loss_clip": 1.05219328, "balance_loss_mlp": 1.01894557, "epoch": 0.4902302651355739, "flos": 30446359384320.0, "grad_norm": 1.8004402661580634, "language_loss": 0.83366829, "learning_rate": 2.160296045589487e-06, "loss": 0.8554132, "num_input_tokens_seen": 87812955, "step": 4077, "time_per_iteration": 2.6417996883392334 }, { "auxiliary_loss_clip": 0.01168822, "auxiliary_loss_mlp": 0.01024558, "balance_loss_clip": 1.05364525, "balance_loss_mlp": 1.01617169, "epoch": 0.49035050802621294, "flos": 19174180089600.0, "grad_norm": 2.377398819649, "language_loss": 0.69271755, "learning_rate": 2.159519567776562e-06, "loss": 0.71465135, "num_input_tokens_seen": 87832605, "step": 4078, "time_per_iteration": 3.205803632736206 }, { "auxiliary_loss_clip": 0.01129338, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.04453564, "balance_loss_mlp": 1.01866043, "epoch": 0.49047075091685205, "flos": 22228489365120.0, "grad_norm": 2.6030239595589744, "language_loss": 0.70642829, "learning_rate": 2.1587430657646463e-06, "loss": 0.72799218, "num_input_tokens_seen": 87846040, "step": 4079, "time_per_iteration": 2.547797679901123 }, { "auxiliary_loss_clip": 0.01154888, "auxiliary_loss_mlp": 0.01024846, "balance_loss_clip": 1.05300307, "balance_loss_mlp": 1.01694822, "epoch": 0.4905909938074911, "flos": 20156516213760.0, "grad_norm": 1.9767102803922114, "language_loss": 0.77868295, "learning_rate": 2.157966539671533e-06, "loss": 0.80048025, "num_input_tokens_seen": 87865680, "step": 4080, "time_per_iteration": 2.5252723693847656 }, { "auxiliary_loss_clip": 0.0114174, "auxiliary_loss_mlp": 0.01025557, "balance_loss_clip": 1.04926026, "balance_loss_mlp": 1.01799011, "epoch": 0.4907112366981302, "flos": 17202217380480.0, "grad_norm": 2.0091646715340072, "language_loss": 0.6726476, "learning_rate": 2.157189989615021e-06, "loss": 0.69432056, "num_input_tokens_seen": 87884270, "step": 4081, "time_per_iteration": 2.5656538009643555 }, { "auxiliary_loss_clip": 0.01170981, "auxiliary_loss_mlp": 0.00763178, "balance_loss_clip": 1.0515815, "balance_loss_mlp": 1.00011992, "epoch": 0.4908314795887693, "flos": 21688968107520.0, "grad_norm": 2.211378102152233, "language_loss": 0.75354463, "learning_rate": 2.156413415712913e-06, "loss": 0.77288622, "num_input_tokens_seen": 87906320, "step": 4082, "time_per_iteration": 2.5558478832244873 }, { "auxiliary_loss_clip": 0.01163424, "auxiliary_loss_mlp": 0.00763654, "balance_loss_clip": 1.05446339, "balance_loss_mlp": 1.00014174, "epoch": 0.4909517224794084, "flos": 26213676531840.0, "grad_norm": 1.6783046842293747, "language_loss": 0.78429568, "learning_rate": 2.155636818083014e-06, "loss": 0.80356646, "num_input_tokens_seen": 87927690, "step": 4083, "time_per_iteration": 2.538339853286743 }, { "auxiliary_loss_clip": 0.0115417, "auxiliary_loss_mlp": 0.01024173, "balance_loss_clip": 1.05390143, "balance_loss_mlp": 1.01689482, "epoch": 0.4910719653700475, "flos": 23148377694720.0, "grad_norm": 1.8961320654349467, "language_loss": 0.84105182, "learning_rate": 2.154860196843134e-06, "loss": 0.86283529, "num_input_tokens_seen": 87946885, "step": 4084, "time_per_iteration": 2.513349771499634 }, { "auxiliary_loss_clip": 0.0118527, "auxiliary_loss_mlp": 0.01028536, "balance_loss_clip": 1.05567265, "balance_loss_mlp": 1.02063835, "epoch": 0.4911922082606866, "flos": 23331845387520.0, "grad_norm": 1.6832848412475891, "language_loss": 0.76682121, "learning_rate": 2.154083552111085e-06, "loss": 0.78895926, "num_input_tokens_seen": 87966055, "step": 4085, "time_per_iteration": 2.453343152999878 }, { "auxiliary_loss_clip": 0.01185093, "auxiliary_loss_mlp": 0.01027101, "balance_loss_clip": 1.05306721, "balance_loss_mlp": 1.01885223, "epoch": 0.49131245115132566, "flos": 29203239542400.0, "grad_norm": 1.943740418708593, "language_loss": 0.81703532, "learning_rate": 2.1533068840046834e-06, "loss": 0.83915728, "num_input_tokens_seen": 87986320, "step": 4086, "time_per_iteration": 2.488363742828369 }, { "auxiliary_loss_clip": 0.01149195, "auxiliary_loss_mlp": 0.00763245, "balance_loss_clip": 1.05005085, "balance_loss_mlp": 1.00013399, "epoch": 0.49143269404196477, "flos": 20147465986560.0, "grad_norm": 2.328205497385505, "language_loss": 0.61776161, "learning_rate": 2.152530192641749e-06, "loss": 0.636886, "num_input_tokens_seen": 88001230, "step": 4087, "time_per_iteration": 2.4580159187316895 }, { "auxiliary_loss_clip": 0.01172731, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.05240154, "balance_loss_mlp": 1.0245502, "epoch": 0.4915529369326039, "flos": 24389809597440.0, "grad_norm": 1.8741599549456955, "language_loss": 0.72298652, "learning_rate": 2.1517534781401068e-06, "loss": 0.74504244, "num_input_tokens_seen": 88019110, "step": 4088, "time_per_iteration": 2.491710901260376 }, { "auxiliary_loss_clip": 0.01169037, "auxiliary_loss_mlp": 0.01024429, "balance_loss_clip": 1.0533483, "balance_loss_mlp": 1.01623356, "epoch": 0.49167317982324293, "flos": 10524305197440.0, "grad_norm": 2.5346007002992805, "language_loss": 0.69590163, "learning_rate": 2.150976740617581e-06, "loss": 0.71783626, "num_input_tokens_seen": 88035670, "step": 4089, "time_per_iteration": 2.45009708404541 }, { "auxiliary_loss_clip": 0.01161298, "auxiliary_loss_mlp": 0.01030254, "balance_loss_clip": 1.05377293, "balance_loss_mlp": 1.02201641, "epoch": 0.49179342271388204, "flos": 25593427457280.0, "grad_norm": 1.8633110249632234, "language_loss": 0.71191609, "learning_rate": 2.150199980192006e-06, "loss": 0.73383164, "num_input_tokens_seen": 88054790, "step": 4090, "time_per_iteration": 2.5462887287139893 }, { "auxiliary_loss_clip": 0.0114873, "auxiliary_loss_mlp": 0.01024441, "balance_loss_clip": 1.04979455, "balance_loss_mlp": 1.0166862, "epoch": 0.49191366560452116, "flos": 21102043875840.0, "grad_norm": 1.6342879001809376, "language_loss": 0.80829811, "learning_rate": 2.1494231969812114e-06, "loss": 0.83002979, "num_input_tokens_seen": 88073780, "step": 4091, "time_per_iteration": 2.471823215484619 }, { "auxiliary_loss_clip": 0.01148388, "auxiliary_loss_mlp": 0.01028542, "balance_loss_clip": 1.05338955, "balance_loss_mlp": 1.02001905, "epoch": 0.4920339084951602, "flos": 26067520091520.0, "grad_norm": 2.4557446123295015, "language_loss": 0.81295007, "learning_rate": 2.1486463911030372e-06, "loss": 0.83471942, "num_input_tokens_seen": 88094430, "step": 4092, "time_per_iteration": 2.581411361694336 }, { "auxiliary_loss_clip": 0.01151775, "auxiliary_loss_mlp": 0.01031163, "balance_loss_clip": 1.04779994, "balance_loss_mlp": 1.02317047, "epoch": 0.4921541513857993, "flos": 25081269384960.0, "grad_norm": 1.8172882285606748, "language_loss": 0.74467409, "learning_rate": 2.147869562675324e-06, "loss": 0.76650345, "num_input_tokens_seen": 88113400, "step": 4093, "time_per_iteration": 2.5447893142700195 }, { "auxiliary_loss_clip": 0.01169473, "auxiliary_loss_mlp": 0.01027965, "balance_loss_clip": 1.05332804, "balance_loss_mlp": 1.01990676, "epoch": 0.49227439427643843, "flos": 24389809597440.0, "grad_norm": 1.7643488611528046, "language_loss": 0.72301012, "learning_rate": 2.147092711815915e-06, "loss": 0.74498451, "num_input_tokens_seen": 88132750, "step": 4094, "time_per_iteration": 2.4886093139648438 }, { "auxiliary_loss_clip": 0.01141163, "auxiliary_loss_mlp": 0.01022179, "balance_loss_clip": 1.05231571, "balance_loss_mlp": 1.01460588, "epoch": 0.4923946371670775, "flos": 11363753018880.0, "grad_norm": 2.443281012314946, "language_loss": 0.8587079, "learning_rate": 2.1463158386426593e-06, "loss": 0.88034129, "num_input_tokens_seen": 88150560, "step": 4095, "time_per_iteration": 2.4847512245178223 }, { "auxiliary_loss_clip": 0.01163391, "auxiliary_loss_mlp": 0.01029957, "balance_loss_clip": 1.05332351, "balance_loss_mlp": 1.02148676, "epoch": 0.4925148800577166, "flos": 30445964334720.0, "grad_norm": 1.9821149252294032, "language_loss": 0.7748816, "learning_rate": 2.145538943273407e-06, "loss": 0.7968151, "num_input_tokens_seen": 88170835, "step": 4096, "time_per_iteration": 2.5843424797058105 }, { "auxiliary_loss_clip": 0.01186129, "auxiliary_loss_mlp": 0.01030119, "balance_loss_clip": 1.05723906, "balance_loss_mlp": 1.02219796, "epoch": 0.49263512294835565, "flos": 20850454039680.0, "grad_norm": 2.932051404567651, "language_loss": 0.71612585, "learning_rate": 2.144762025826013e-06, "loss": 0.73828828, "num_input_tokens_seen": 88189925, "step": 4097, "time_per_iteration": 3.275627374649048 }, { "auxiliary_loss_clip": 0.01174187, "auxiliary_loss_mlp": 0.01030341, "balance_loss_clip": 1.05285192, "balance_loss_mlp": 1.02218711, "epoch": 0.49275536583899476, "flos": 23767477534080.0, "grad_norm": 2.255771670245462, "language_loss": 0.86852419, "learning_rate": 2.143985086418334e-06, "loss": 0.89056945, "num_input_tokens_seen": 88205105, "step": 4098, "time_per_iteration": 2.450673818588257 }, { "auxiliary_loss_clip": 0.0115596, "auxiliary_loss_mlp": 0.01022018, "balance_loss_clip": 1.05104613, "balance_loss_mlp": 1.01432848, "epoch": 0.4928756087296339, "flos": 22273522041600.0, "grad_norm": 1.4335540950301706, "language_loss": 0.76434034, "learning_rate": 2.1432081251682324e-06, "loss": 0.78612012, "num_input_tokens_seen": 88225475, "step": 4099, "time_per_iteration": 3.343742847442627 }, { "auxiliary_loss_clip": 0.01176135, "auxiliary_loss_mlp": 0.01026985, "balance_loss_clip": 1.06125724, "balance_loss_mlp": 1.0181222, "epoch": 0.49299585162027293, "flos": 19645471463040.0, "grad_norm": 2.216194006508457, "language_loss": 0.87102968, "learning_rate": 2.142431142193572e-06, "loss": 0.89306086, "num_input_tokens_seen": 88243255, "step": 4100, "time_per_iteration": 3.276048183441162 }, { "auxiliary_loss_clip": 0.011841, "auxiliary_loss_mlp": 0.01029629, "balance_loss_clip": 1.0565002, "balance_loss_mlp": 1.02131426, "epoch": 0.49311609451091204, "flos": 38837138497920.0, "grad_norm": 2.299635026533799, "language_loss": 0.7161544, "learning_rate": 2.1416541376122207e-06, "loss": 0.73829168, "num_input_tokens_seen": 88263435, "step": 4101, "time_per_iteration": 2.579281806945801 }, { "auxiliary_loss_clip": 0.01182973, "auxiliary_loss_mlp": 0.01029403, "balance_loss_clip": 1.0530324, "balance_loss_mlp": 1.02055156, "epoch": 0.49323633740155115, "flos": 28329102161280.0, "grad_norm": 1.7028832097159055, "language_loss": 0.73146141, "learning_rate": 2.1408771115420496e-06, "loss": 0.75358522, "num_input_tokens_seen": 88283295, "step": 4102, "time_per_iteration": 2.47206711769104 }, { "auxiliary_loss_clip": 0.01130977, "auxiliary_loss_mlp": 0.01029036, "balance_loss_clip": 1.05464578, "balance_loss_mlp": 1.02138281, "epoch": 0.4933565802921902, "flos": 21135584200320.0, "grad_norm": 2.000212652467221, "language_loss": 0.64809251, "learning_rate": 2.140100064100932e-06, "loss": 0.66969264, "num_input_tokens_seen": 88299270, "step": 4103, "time_per_iteration": 2.577216386795044 }, { "auxiliary_loss_clip": 0.01166449, "auxiliary_loss_mlp": 0.01023223, "balance_loss_clip": 1.05257654, "balance_loss_mlp": 1.01554906, "epoch": 0.4934768231828293, "flos": 18039007595520.0, "grad_norm": 1.821748772083299, "language_loss": 0.75829285, "learning_rate": 2.139322995406746e-06, "loss": 0.78018957, "num_input_tokens_seen": 88316905, "step": 4104, "time_per_iteration": 3.143646478652954 }, { "auxiliary_loss_clip": 0.01187829, "auxiliary_loss_mlp": 0.01032771, "balance_loss_clip": 1.05776906, "balance_loss_mlp": 1.02409816, "epoch": 0.4935970660734684, "flos": 23469957181440.0, "grad_norm": 1.8561560074483614, "language_loss": 0.79598165, "learning_rate": 2.1385459055773727e-06, "loss": 0.81818759, "num_input_tokens_seen": 88335095, "step": 4105, "time_per_iteration": 2.4261133670806885 }, { "auxiliary_loss_clip": 0.011128, "auxiliary_loss_mlp": 0.00762647, "balance_loss_clip": 1.04559731, "balance_loss_mlp": 1.00013137, "epoch": 0.4937173089641075, "flos": 64479258840960.0, "grad_norm": 2.1145639537237235, "language_loss": 0.73897099, "learning_rate": 2.137768794730696e-06, "loss": 0.75772548, "num_input_tokens_seen": 88358545, "step": 4106, "time_per_iteration": 2.948566436767578 }, { "auxiliary_loss_clip": 0.01160698, "auxiliary_loss_mlp": 0.01034146, "balance_loss_clip": 1.0540297, "balance_loss_mlp": 1.02558064, "epoch": 0.4938375518547466, "flos": 22346025644160.0, "grad_norm": 1.7450999679347463, "language_loss": 0.80337608, "learning_rate": 2.1369916629846026e-06, "loss": 0.82532454, "num_input_tokens_seen": 88378295, "step": 4107, "time_per_iteration": 2.536890983581543 }, { "auxiliary_loss_clip": 0.0115413, "auxiliary_loss_mlp": 0.01024966, "balance_loss_clip": 1.04833758, "balance_loss_mlp": 1.01715827, "epoch": 0.4939577947453857, "flos": 17858700299520.0, "grad_norm": 1.8202031863284833, "language_loss": 0.75075233, "learning_rate": 2.136214510456983e-06, "loss": 0.77254331, "num_input_tokens_seen": 88396750, "step": 4108, "time_per_iteration": 2.6008787155151367 }, { "auxiliary_loss_clip": 0.01048722, "auxiliary_loss_mlp": 0.00753078, "balance_loss_clip": 1.02279603, "balance_loss_mlp": 1.00006735, "epoch": 0.49407803763602476, "flos": 70066746875520.0, "grad_norm": 0.8904927835101719, "language_loss": 0.63189888, "learning_rate": 2.1354373372657296e-06, "loss": 0.64991689, "num_input_tokens_seen": 88455190, "step": 4109, "time_per_iteration": 3.1587002277374268 }, { "auxiliary_loss_clip": 0.01183679, "auxiliary_loss_mlp": 0.01028884, "balance_loss_clip": 1.05637908, "balance_loss_mlp": 1.02154064, "epoch": 0.49419828052666387, "flos": 24317485562880.0, "grad_norm": 1.9446445796968632, "language_loss": 0.71029574, "learning_rate": 2.1346601435287404e-06, "loss": 0.7324214, "num_input_tokens_seen": 88477460, "step": 4110, "time_per_iteration": 2.5125043392181396 }, { "auxiliary_loss_clip": 0.0115357, "auxiliary_loss_mlp": 0.01026553, "balance_loss_clip": 1.04996765, "balance_loss_mlp": 1.01864958, "epoch": 0.494318523417303, "flos": 29386060790400.0, "grad_norm": 2.0056009138406687, "language_loss": 0.80229628, "learning_rate": 2.1338829293639144e-06, "loss": 0.82409751, "num_input_tokens_seen": 88497820, "step": 4111, "time_per_iteration": 2.5772926807403564 }, { "auxiliary_loss_clip": 0.01127871, "auxiliary_loss_mlp": 0.01033919, "balance_loss_clip": 1.05036068, "balance_loss_mlp": 1.02529407, "epoch": 0.49443876630794203, "flos": 15268284195840.0, "grad_norm": 1.9464909524123395, "language_loss": 0.82966781, "learning_rate": 2.1331056948891547e-06, "loss": 0.8512857, "num_input_tokens_seen": 88514920, "step": 4112, "time_per_iteration": 2.5548248291015625 }, { "auxiliary_loss_clip": 0.01151468, "auxiliary_loss_mlp": 0.01026444, "balance_loss_clip": 1.05139184, "balance_loss_mlp": 1.01839781, "epoch": 0.49455900919858115, "flos": 12347453859840.0, "grad_norm": 2.153547646896936, "language_loss": 0.76733214, "learning_rate": 2.1323284402223666e-06, "loss": 0.78911126, "num_input_tokens_seen": 88530910, "step": 4113, "time_per_iteration": 2.4448986053466797 }, { "auxiliary_loss_clip": 0.01182954, "auxiliary_loss_mlp": 0.00761664, "balance_loss_clip": 1.05851507, "balance_loss_mlp": 1.00008357, "epoch": 0.4946792520892202, "flos": 22779610715520.0, "grad_norm": 1.894227028815412, "language_loss": 0.88390982, "learning_rate": 2.1315511654814597e-06, "loss": 0.90335602, "num_input_tokens_seen": 88549320, "step": 4114, "time_per_iteration": 2.449101209640503 }, { "auxiliary_loss_clip": 0.0114949, "auxiliary_loss_mlp": 0.01024801, "balance_loss_clip": 1.05316114, "balance_loss_mlp": 1.01755619, "epoch": 0.4947994949798593, "flos": 23148126299520.0, "grad_norm": 1.8064484786574229, "language_loss": 0.78082669, "learning_rate": 2.1307738707843456e-06, "loss": 0.80256957, "num_input_tokens_seen": 88568985, "step": 4115, "time_per_iteration": 2.5029611587524414 }, { "auxiliary_loss_clip": 0.01175774, "auxiliary_loss_mlp": 0.01026601, "balance_loss_clip": 1.05641603, "balance_loss_mlp": 1.01887, "epoch": 0.4949197378704984, "flos": 23659997063040.0, "grad_norm": 1.9267952837255575, "language_loss": 0.68972141, "learning_rate": 2.1299965562489385e-06, "loss": 0.71174514, "num_input_tokens_seen": 88588790, "step": 4116, "time_per_iteration": 2.5004801750183105 }, { "auxiliary_loss_clip": 0.01167111, "auxiliary_loss_mlp": 0.01030432, "balance_loss_clip": 1.05162835, "balance_loss_mlp": 1.02229583, "epoch": 0.4950399807611375, "flos": 26911493026560.0, "grad_norm": 1.404968374885798, "language_loss": 0.79080629, "learning_rate": 2.129219221993158e-06, "loss": 0.81278175, "num_input_tokens_seen": 88613575, "step": 4117, "time_per_iteration": 2.54606032371521 }, { "auxiliary_loss_clip": 0.01057268, "auxiliary_loss_mlp": 0.01006629, "balance_loss_clip": 1.02741957, "balance_loss_mlp": 1.00551462, "epoch": 0.4951602236517766, "flos": 67315270187520.0, "grad_norm": 0.7990890443105685, "language_loss": 0.59967893, "learning_rate": 2.128441868134924e-06, "loss": 0.62031788, "num_input_tokens_seen": 88675510, "step": 4118, "time_per_iteration": 3.1416573524475098 }, { "auxiliary_loss_clip": 0.01142471, "auxiliary_loss_mlp": 0.01027405, "balance_loss_clip": 1.04829574, "balance_loss_mlp": 1.01922154, "epoch": 0.4952804665424157, "flos": 19901442758400.0, "grad_norm": 2.0912139166573, "language_loss": 0.82745439, "learning_rate": 2.1276644947921606e-06, "loss": 0.84915316, "num_input_tokens_seen": 88694425, "step": 4119, "time_per_iteration": 2.5093836784362793 }, { "auxiliary_loss_clip": 0.0116913, "auxiliary_loss_mlp": 0.01026062, "balance_loss_clip": 1.05257046, "balance_loss_mlp": 1.01715124, "epoch": 0.49540070943305475, "flos": 18806813740800.0, "grad_norm": 1.9239056445013087, "language_loss": 0.8263731, "learning_rate": 2.126887102082795e-06, "loss": 0.84832501, "num_input_tokens_seen": 88714450, "step": 4120, "time_per_iteration": 2.4562063217163086 }, { "auxiliary_loss_clip": 0.0113999, "auxiliary_loss_mlp": 0.0102735, "balance_loss_clip": 1.04816628, "balance_loss_mlp": 1.01959276, "epoch": 0.49552095232369386, "flos": 24934179191040.0, "grad_norm": 1.9105544846739133, "language_loss": 0.70732427, "learning_rate": 2.126109690124757e-06, "loss": 0.72899771, "num_input_tokens_seen": 88735265, "step": 4121, "time_per_iteration": 2.5715651512145996 }, { "auxiliary_loss_clip": 0.01126929, "auxiliary_loss_mlp": 0.01027542, "balance_loss_clip": 1.04761755, "balance_loss_mlp": 1.01981711, "epoch": 0.495641195214333, "flos": 22857249962880.0, "grad_norm": 1.6611266414500356, "language_loss": 0.71172798, "learning_rate": 2.1253322590359786e-06, "loss": 0.73327267, "num_input_tokens_seen": 88754600, "step": 4122, "time_per_iteration": 2.574528455734253 }, { "auxiliary_loss_clip": 0.01166389, "auxiliary_loss_mlp": 0.01032753, "balance_loss_clip": 1.05131733, "balance_loss_mlp": 1.02505851, "epoch": 0.49576143810497203, "flos": 25769748343680.0, "grad_norm": 1.7271996843557909, "language_loss": 0.73734379, "learning_rate": 2.124554808934397e-06, "loss": 0.75933522, "num_input_tokens_seen": 88775180, "step": 4123, "time_per_iteration": 2.5132992267608643 }, { "auxiliary_loss_clip": 0.01119559, "auxiliary_loss_mlp": 0.01028642, "balance_loss_clip": 1.04577088, "balance_loss_mlp": 1.02005935, "epoch": 0.49588168099561114, "flos": 22128838058880.0, "grad_norm": 1.8237209943990293, "language_loss": 0.72868013, "learning_rate": 2.1237773399379496e-06, "loss": 0.75016212, "num_input_tokens_seen": 88796145, "step": 4124, "time_per_iteration": 3.4027979373931885 }, { "auxiliary_loss_clip": 0.01157245, "auxiliary_loss_mlp": 0.01025202, "balance_loss_clip": 1.04608119, "balance_loss_mlp": 1.01640153, "epoch": 0.49600192388625025, "flos": 24387331559040.0, "grad_norm": 9.384741885532225, "language_loss": 0.8693943, "learning_rate": 2.122999852164578e-06, "loss": 0.89121872, "num_input_tokens_seen": 88816765, "step": 4125, "time_per_iteration": 2.534491539001465 }, { "auxiliary_loss_clip": 0.01123971, "auxiliary_loss_mlp": 0.01024233, "balance_loss_clip": 1.04789424, "balance_loss_mlp": 1.01600778, "epoch": 0.4961221667768893, "flos": 22857429530880.0, "grad_norm": 2.3943681056355444, "language_loss": 0.58202177, "learning_rate": 2.122222345732227e-06, "loss": 0.60350382, "num_input_tokens_seen": 88836680, "step": 4126, "time_per_iteration": 3.4016165733337402 }, { "auxiliary_loss_clip": 0.01141351, "auxiliary_loss_mlp": 0.01027726, "balance_loss_clip": 1.048908, "balance_loss_mlp": 1.01952422, "epoch": 0.4962424096675284, "flos": 17858089768320.0, "grad_norm": 2.678943307408686, "language_loss": 0.83160228, "learning_rate": 2.121444820758843e-06, "loss": 0.85329306, "num_input_tokens_seen": 88855320, "step": 4127, "time_per_iteration": 3.260526180267334 }, { "auxiliary_loss_clip": 0.01125578, "auxiliary_loss_mlp": 0.01034139, "balance_loss_clip": 1.04996705, "balance_loss_mlp": 1.02537668, "epoch": 0.49636265255816747, "flos": 21793611404160.0, "grad_norm": 2.081068333847806, "language_loss": 0.78742272, "learning_rate": 2.120667277362376e-06, "loss": 0.80901992, "num_input_tokens_seen": 88874035, "step": 4128, "time_per_iteration": 2.5433716773986816 }, { "auxiliary_loss_clip": 0.0118723, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.05742288, "balance_loss_mlp": 1.02771091, "epoch": 0.4964828954488066, "flos": 16358603581440.0, "grad_norm": 2.201434720448348, "language_loss": 0.85090858, "learning_rate": 2.1198897156607796e-06, "loss": 0.87314433, "num_input_tokens_seen": 88891390, "step": 4129, "time_per_iteration": 2.4152348041534424 }, { "auxiliary_loss_clip": 0.01173403, "auxiliary_loss_mlp": 0.01029267, "balance_loss_clip": 1.05186415, "balance_loss_mlp": 1.02113104, "epoch": 0.4966031383394457, "flos": 24711101775360.0, "grad_norm": 2.9455213774524034, "language_loss": 0.73813653, "learning_rate": 2.1191121357720085e-06, "loss": 0.76016319, "num_input_tokens_seen": 88909450, "step": 4130, "time_per_iteration": 3.2572743892669678 }, { "auxiliary_loss_clip": 0.01120327, "auxiliary_loss_mlp": 0.01029676, "balance_loss_clip": 1.04868948, "balance_loss_mlp": 1.02118862, "epoch": 0.49672338123008475, "flos": 22930615491840.0, "grad_norm": 1.6450711028103961, "language_loss": 0.7504428, "learning_rate": 2.1183345378140206e-06, "loss": 0.77194291, "num_input_tokens_seen": 88929195, "step": 4131, "time_per_iteration": 2.6694369316101074 }, { "auxiliary_loss_clip": 0.01072872, "auxiliary_loss_mlp": 0.01002259, "balance_loss_clip": 1.01930702, "balance_loss_mlp": 1.0008105, "epoch": 0.49684362412072386, "flos": 65976736844160.0, "grad_norm": 0.8539052808092114, "language_loss": 0.61984456, "learning_rate": 2.1175569219047783e-06, "loss": 0.64059579, "num_input_tokens_seen": 88990635, "step": 4132, "time_per_iteration": 3.160383701324463 }, { "auxiliary_loss_clip": 0.0118328, "auxiliary_loss_mlp": 0.01027755, "balance_loss_clip": 1.05444455, "balance_loss_mlp": 1.01999497, "epoch": 0.49696386701136297, "flos": 19971288754560.0, "grad_norm": 1.6719445703853393, "language_loss": 0.73208475, "learning_rate": 2.1167792881622437e-06, "loss": 0.75419515, "num_input_tokens_seen": 89009655, "step": 4133, "time_per_iteration": 2.4385061264038086 }, { "auxiliary_loss_clip": 0.01153971, "auxiliary_loss_mlp": 0.01034489, "balance_loss_clip": 1.0534575, "balance_loss_mlp": 1.02666879, "epoch": 0.497084109902002, "flos": 24750819239040.0, "grad_norm": 1.7693898649399644, "language_loss": 0.81001109, "learning_rate": 2.116001636704384e-06, "loss": 0.83189565, "num_input_tokens_seen": 89030040, "step": 4134, "time_per_iteration": 2.5290114879608154 }, { "auxiliary_loss_clip": 0.01138425, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.05056131, "balance_loss_mlp": 1.02447033, "epoch": 0.49720435279264114, "flos": 21871825269120.0, "grad_norm": 1.8235047463180554, "language_loss": 0.80288315, "learning_rate": 2.1152239676491685e-06, "loss": 0.82459593, "num_input_tokens_seen": 89048145, "step": 4135, "time_per_iteration": 2.5922958850860596 }, { "auxiliary_loss_clip": 0.01159992, "auxiliary_loss_mlp": 0.01025529, "balance_loss_clip": 1.05013418, "balance_loss_mlp": 1.0177151, "epoch": 0.49732459568328025, "flos": 23805794367360.0, "grad_norm": 1.7542176573297985, "language_loss": 0.73184544, "learning_rate": 2.114446281114569e-06, "loss": 0.75370061, "num_input_tokens_seen": 89067165, "step": 4136, "time_per_iteration": 2.5260047912597656 }, { "auxiliary_loss_clip": 0.01147251, "auxiliary_loss_mlp": 0.01026575, "balance_loss_clip": 1.05117249, "balance_loss_mlp": 1.01786077, "epoch": 0.4974448385739193, "flos": 20047742853120.0, "grad_norm": 1.9369203888230022, "language_loss": 0.76220119, "learning_rate": 2.1136685772185587e-06, "loss": 0.78393948, "num_input_tokens_seen": 89086190, "step": 4137, "time_per_iteration": 2.489786386489868 }, { "auxiliary_loss_clip": 0.01152505, "auxiliary_loss_mlp": 0.00763387, "balance_loss_clip": 1.04498053, "balance_loss_mlp": 1.0000608, "epoch": 0.4975650814645584, "flos": 24821347593600.0, "grad_norm": 1.600728129345119, "language_loss": 0.78087401, "learning_rate": 2.1128908560791163e-06, "loss": 0.80003291, "num_input_tokens_seen": 89106020, "step": 4138, "time_per_iteration": 2.541379690170288 }, { "auxiliary_loss_clip": 0.01184245, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.05584574, "balance_loss_mlp": 1.02143252, "epoch": 0.4976853243551975, "flos": 19829477859840.0, "grad_norm": 2.1474548819042285, "language_loss": 0.78336895, "learning_rate": 2.1121131178142203e-06, "loss": 0.80550939, "num_input_tokens_seen": 89125385, "step": 4139, "time_per_iteration": 2.4247288703918457 }, { "auxiliary_loss_clip": 0.01156934, "auxiliary_loss_mlp": 0.01025583, "balance_loss_clip": 1.05096173, "balance_loss_mlp": 1.01781678, "epoch": 0.4978055672458366, "flos": 23142990654720.0, "grad_norm": 1.5999293712498972, "language_loss": 0.82329381, "learning_rate": 2.1113353625418544e-06, "loss": 0.845119, "num_input_tokens_seen": 89143935, "step": 4140, "time_per_iteration": 2.505744457244873 }, { "auxiliary_loss_clip": 0.01162455, "auxiliary_loss_mlp": 0.01029005, "balance_loss_clip": 1.05359507, "balance_loss_mlp": 1.02156019, "epoch": 0.4979258101364757, "flos": 15559914718080.0, "grad_norm": 1.754613911575738, "language_loss": 0.79090488, "learning_rate": 2.1105575903800017e-06, "loss": 0.81281954, "num_input_tokens_seen": 89162655, "step": 4141, "time_per_iteration": 2.427483081817627 }, { "auxiliary_loss_clip": 0.01173402, "auxiliary_loss_mlp": 0.01026574, "balance_loss_clip": 1.05192101, "balance_loss_mlp": 1.01840806, "epoch": 0.4980460530271148, "flos": 26356169784960.0, "grad_norm": 1.8894631223960248, "language_loss": 0.85268092, "learning_rate": 2.1097798014466502e-06, "loss": 0.8746807, "num_input_tokens_seen": 89182255, "step": 4142, "time_per_iteration": 2.484276533126831 }, { "auxiliary_loss_clip": 0.01173964, "auxiliary_loss_mlp": 0.01028953, "balance_loss_clip": 1.05452919, "balance_loss_mlp": 1.01988733, "epoch": 0.49816629591775385, "flos": 17274541415040.0, "grad_norm": 2.074310481810634, "language_loss": 0.5847829, "learning_rate": 2.109001995859791e-06, "loss": 0.606812, "num_input_tokens_seen": 89201155, "step": 4143, "time_per_iteration": 2.4370336532592773 }, { "auxiliary_loss_clip": 0.01061754, "auxiliary_loss_mlp": 0.01003484, "balance_loss_clip": 1.02219915, "balance_loss_mlp": 1.0021069, "epoch": 0.49828653880839296, "flos": 64930947344640.0, "grad_norm": 0.8104183401912067, "language_loss": 0.60123158, "learning_rate": 2.108224173737415e-06, "loss": 0.62188399, "num_input_tokens_seen": 89264455, "step": 4144, "time_per_iteration": 3.064518451690674 }, { "auxiliary_loss_clip": 0.01150527, "auxiliary_loss_mlp": 0.01030286, "balance_loss_clip": 1.04804778, "balance_loss_mlp": 1.02130914, "epoch": 0.498406781699032, "flos": 27484806003840.0, "grad_norm": 3.0373829126736895, "language_loss": 0.76386493, "learning_rate": 2.1074463351975183e-06, "loss": 0.78567308, "num_input_tokens_seen": 89283340, "step": 4145, "time_per_iteration": 2.5412256717681885 }, { "auxiliary_loss_clip": 0.01144585, "auxiliary_loss_mlp": 0.0102251, "balance_loss_clip": 1.04964185, "balance_loss_mlp": 1.0148474, "epoch": 0.49852702458967113, "flos": 31499870307840.0, "grad_norm": 1.6063259693606657, "language_loss": 0.71589231, "learning_rate": 2.106668480358098e-06, "loss": 0.73756331, "num_input_tokens_seen": 89303565, "step": 4146, "time_per_iteration": 2.6004159450531006 }, { "auxiliary_loss_clip": 0.01150463, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.04723525, "balance_loss_mlp": 1.01814604, "epoch": 0.49864726748031024, "flos": 22852868503680.0, "grad_norm": 1.567746630007518, "language_loss": 0.70865643, "learning_rate": 2.105890609337154e-06, "loss": 0.73043001, "num_input_tokens_seen": 89322080, "step": 4147, "time_per_iteration": 2.5667285919189453 }, { "auxiliary_loss_clip": 0.01083031, "auxiliary_loss_mlp": 0.01001448, "balance_loss_clip": 1.02080953, "balance_loss_mlp": 1.00014901, "epoch": 0.4987675103709493, "flos": 70405708544640.0, "grad_norm": 0.6878898786990562, "language_loss": 0.63853502, "learning_rate": 2.1051127222526883e-06, "loss": 0.65937972, "num_input_tokens_seen": 89394195, "step": 4148, "time_per_iteration": 3.1408419609069824 }, { "auxiliary_loss_clip": 0.01168398, "auxiliary_loss_mlp": 0.01023651, "balance_loss_clip": 1.05565691, "balance_loss_mlp": 1.01578307, "epoch": 0.4988877532615884, "flos": 28767571482240.0, "grad_norm": 1.5412398137947618, "language_loss": 0.8084116, "learning_rate": 2.1043348192227067e-06, "loss": 0.83033204, "num_input_tokens_seen": 89414565, "step": 4149, "time_per_iteration": 2.5256338119506836 }, { "auxiliary_loss_clip": 0.01129651, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.04869795, "balance_loss_mlp": 1.02454877, "epoch": 0.4990079961522275, "flos": 16872700988160.0, "grad_norm": 1.6977619615921062, "language_loss": 0.61629057, "learning_rate": 2.1035569003652156e-06, "loss": 0.63791585, "num_input_tokens_seen": 89433195, "step": 4150, "time_per_iteration": 3.3442342281341553 }, { "auxiliary_loss_clip": 0.01123107, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.04685307, "balance_loss_mlp": 1.02780664, "epoch": 0.4991282390428666, "flos": 13291042187520.0, "grad_norm": 1.9188090522970949, "language_loss": 0.81876165, "learning_rate": 2.1027789657982255e-06, "loss": 0.84036946, "num_input_tokens_seen": 89447410, "step": 4151, "time_per_iteration": 2.5223541259765625 }, { "auxiliary_loss_clip": 0.01127162, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.04864275, "balance_loss_mlp": 1.02450156, "epoch": 0.4992484819335057, "flos": 21537496454400.0, "grad_norm": 1.9915568983225955, "language_loss": 0.7703954, "learning_rate": 2.1020010156397482e-06, "loss": 0.79199147, "num_input_tokens_seen": 89464630, "step": 4152, "time_per_iteration": 2.558551788330078 }, { "auxiliary_loss_clip": 0.01169351, "auxiliary_loss_mlp": 0.01033487, "balance_loss_clip": 1.05259895, "balance_loss_mlp": 1.02545202, "epoch": 0.4993687248241448, "flos": 24860095390080.0, "grad_norm": 1.4786463022889875, "language_loss": 0.7766124, "learning_rate": 2.101223050007797e-06, "loss": 0.79864079, "num_input_tokens_seen": 89483180, "step": 4153, "time_per_iteration": 3.3580849170684814 }, { "auxiliary_loss_clip": 0.0108163, "auxiliary_loss_mlp": 0.01001306, "balance_loss_clip": 1.01904166, "balance_loss_mlp": 0.99999505, "epoch": 0.49948896771478385, "flos": 62941602453120.0, "grad_norm": 0.8207227722631733, "language_loss": 0.53765684, "learning_rate": 2.1004450690203904e-06, "loss": 0.55848616, "num_input_tokens_seen": 89539260, "step": 4154, "time_per_iteration": 3.8548424243927 }, { "auxiliary_loss_clip": 0.01081326, "auxiliary_loss_mlp": 0.01001137, "balance_loss_clip": 1.01902056, "balance_loss_mlp": 0.99988502, "epoch": 0.49960921060542296, "flos": 68284213516800.0, "grad_norm": 0.8536652669784582, "language_loss": 0.63358331, "learning_rate": 2.099667072795546e-06, "loss": 0.65440792, "num_input_tokens_seen": 89601380, "step": 4155, "time_per_iteration": 3.084392547607422 }, { "auxiliary_loss_clip": 0.01166483, "auxiliary_loss_mlp": 0.01030603, "balance_loss_clip": 1.05074334, "balance_loss_mlp": 1.02206814, "epoch": 0.49972945349606207, "flos": 23659350618240.0, "grad_norm": 2.3055516959853253, "language_loss": 0.79888976, "learning_rate": 2.0988890614512864e-06, "loss": 0.82086068, "num_input_tokens_seen": 89621270, "step": 4156, "time_per_iteration": 2.5225977897644043 }, { "auxiliary_loss_clip": 0.01159267, "auxiliary_loss_mlp": 0.01029429, "balance_loss_clip": 1.05502033, "balance_loss_mlp": 1.02136457, "epoch": 0.4998496963867011, "flos": 19755825022080.0, "grad_norm": 1.6344915540691656, "language_loss": 0.84120947, "learning_rate": 2.098111035105635e-06, "loss": 0.86309636, "num_input_tokens_seen": 89639695, "step": 4157, "time_per_iteration": 3.2427964210510254 }, { "auxiliary_loss_clip": 0.01126301, "auxiliary_loss_mlp": 0.01029097, "balance_loss_clip": 1.05217063, "balance_loss_mlp": 1.02112246, "epoch": 0.49996993927734024, "flos": 22265728790400.0, "grad_norm": 1.7160573327998834, "language_loss": 0.73278666, "learning_rate": 2.0973329938766176e-06, "loss": 0.75434065, "num_input_tokens_seen": 89657125, "step": 4158, "time_per_iteration": 2.596588134765625 }, { "auxiliary_loss_clip": 0.01165641, "auxiliary_loss_mlp": 0.01033458, "balance_loss_clip": 1.05397642, "balance_loss_mlp": 1.02487493, "epoch": 0.5000901821679793, "flos": 23327212533120.0, "grad_norm": 1.9087083097330289, "language_loss": 0.78850073, "learning_rate": 2.0965549378822618e-06, "loss": 0.81049168, "num_input_tokens_seen": 89678415, "step": 4159, "time_per_iteration": 2.580095052719116 }, { "auxiliary_loss_clip": 0.01091393, "auxiliary_loss_mlp": 0.01026239, "balance_loss_clip": 1.04507875, "balance_loss_mlp": 1.01816249, "epoch": 0.5002104250586185, "flos": 20339014239360.0, "grad_norm": 1.9060454826206714, "language_loss": 0.83911598, "learning_rate": 2.095776867240599e-06, "loss": 0.8602922, "num_input_tokens_seen": 89695405, "step": 4160, "time_per_iteration": 2.6216256618499756 }, { "auxiliary_loss_clip": 0.01133653, "auxiliary_loss_mlp": 0.01029621, "balance_loss_clip": 1.0473839, "balance_loss_mlp": 1.02227426, "epoch": 0.5003306679492575, "flos": 13991372634240.0, "grad_norm": 1.845389233110712, "language_loss": 0.82811105, "learning_rate": 2.094998782069661e-06, "loss": 0.84974384, "num_input_tokens_seen": 89713110, "step": 4161, "time_per_iteration": 2.5136258602142334 }, { "auxiliary_loss_clip": 0.01183639, "auxiliary_loss_mlp": 0.01027165, "balance_loss_clip": 1.05594206, "balance_loss_mlp": 1.01898742, "epoch": 0.5004509108398966, "flos": 27672762896640.0, "grad_norm": 1.7248011203696096, "language_loss": 0.75443238, "learning_rate": 2.0942206824874845e-06, "loss": 0.77654046, "num_input_tokens_seen": 89735885, "step": 4162, "time_per_iteration": 2.490328550338745 }, { "auxiliary_loss_clip": 0.01170035, "auxiliary_loss_mlp": 0.01027891, "balance_loss_clip": 1.05594885, "balance_loss_mlp": 1.0193975, "epoch": 0.5005711537305357, "flos": 14976186796800.0, "grad_norm": 2.1625795840130104, "language_loss": 0.79006231, "learning_rate": 2.093442568612105e-06, "loss": 0.81204152, "num_input_tokens_seen": 89753690, "step": 4163, "time_per_iteration": 2.4314751625061035 }, { "auxiliary_loss_clip": 0.01182263, "auxiliary_loss_mlp": 0.01025346, "balance_loss_clip": 1.05275989, "balance_loss_mlp": 1.0171802, "epoch": 0.5006913966211748, "flos": 26503259978880.0, "grad_norm": 1.4703431591048903, "language_loss": 0.85319805, "learning_rate": 2.0926644405615613e-06, "loss": 0.87527406, "num_input_tokens_seen": 89774590, "step": 4164, "time_per_iteration": 2.4616031646728516 }, { "auxiliary_loss_clip": 0.01133387, "auxiliary_loss_mlp": 0.01028397, "balance_loss_clip": 1.04901135, "balance_loss_mlp": 1.02048099, "epoch": 0.5008116395118138, "flos": 20449295971200.0, "grad_norm": 1.8117700571286253, "language_loss": 0.81258881, "learning_rate": 2.091886298453897e-06, "loss": 0.83420658, "num_input_tokens_seen": 89792775, "step": 4165, "time_per_iteration": 2.5090725421905518 }, { "auxiliary_loss_clip": 0.01166732, "auxiliary_loss_mlp": 0.01024058, "balance_loss_clip": 1.05182672, "balance_loss_mlp": 1.01641965, "epoch": 0.500931882402453, "flos": 21579871524480.0, "grad_norm": 2.1610045685585133, "language_loss": 0.73066109, "learning_rate": 2.091108142407153e-06, "loss": 0.75256902, "num_input_tokens_seen": 89811515, "step": 4166, "time_per_iteration": 2.4603617191314697 }, { "auxiliary_loss_clip": 0.01058418, "auxiliary_loss_mlp": 0.01003957, "balance_loss_clip": 1.029953, "balance_loss_mlp": 1.00227606, "epoch": 0.5010521252930921, "flos": 57785011925760.0, "grad_norm": 0.8383630926993236, "language_loss": 0.62405729, "learning_rate": 2.090329972539377e-06, "loss": 0.64468098, "num_input_tokens_seen": 89870080, "step": 4167, "time_per_iteration": 3.124333143234253 }, { "auxiliary_loss_clip": 0.01092424, "auxiliary_loss_mlp": 0.01030355, "balance_loss_clip": 1.04495597, "balance_loss_mlp": 1.02255905, "epoch": 0.5011723681837311, "flos": 18625500864000.0, "grad_norm": 1.7024647686693317, "language_loss": 0.68343723, "learning_rate": 2.089551788968616e-06, "loss": 0.70466501, "num_input_tokens_seen": 89888045, "step": 4168, "time_per_iteration": 2.5765953063964844 }, { "auxiliary_loss_clip": 0.0107902, "auxiliary_loss_mlp": 0.01003842, "balance_loss_clip": 1.01689529, "balance_loss_mlp": 1.00264382, "epoch": 0.5012926110743702, "flos": 55883146608000.0, "grad_norm": 0.834716188739814, "language_loss": 0.60811985, "learning_rate": 2.08877359181292e-06, "loss": 0.62894845, "num_input_tokens_seen": 89944610, "step": 4169, "time_per_iteration": 2.985989809036255 }, { "auxiliary_loss_clip": 0.01142338, "auxiliary_loss_mlp": 0.01026952, "balance_loss_clip": 1.04551339, "balance_loss_mlp": 1.01925445, "epoch": 0.5014128539650093, "flos": 24238266117120.0, "grad_norm": 2.4386148547423683, "language_loss": 0.85715073, "learning_rate": 2.0879953811903396e-06, "loss": 0.87884367, "num_input_tokens_seen": 89959495, "step": 4170, "time_per_iteration": 2.5322375297546387 }, { "auxiliary_loss_clip": 0.01167103, "auxiliary_loss_mlp": 0.0103121, "balance_loss_clip": 1.0531826, "balance_loss_mlp": 1.02312195, "epoch": 0.5015330968556484, "flos": 27527468382720.0, "grad_norm": 1.7939722933105977, "language_loss": 0.7837528, "learning_rate": 2.08721715721893e-06, "loss": 0.80573595, "num_input_tokens_seen": 89978820, "step": 4171, "time_per_iteration": 2.5158846378326416 }, { "auxiliary_loss_clip": 0.01167839, "auxiliary_loss_mlp": 0.01030489, "balance_loss_clip": 1.05332685, "balance_loss_mlp": 1.0225141, "epoch": 0.5016533397462875, "flos": 23800802376960.0, "grad_norm": 1.747416038381673, "language_loss": 0.76668239, "learning_rate": 2.0864389200167477e-06, "loss": 0.78866565, "num_input_tokens_seen": 89997075, "step": 4172, "time_per_iteration": 2.47259521484375 }, { "auxiliary_loss_clip": 0.01170802, "auxiliary_loss_mlp": 0.00762675, "balance_loss_clip": 1.05285501, "balance_loss_mlp": 1.00014329, "epoch": 0.5017735826369266, "flos": 25295009264640.0, "grad_norm": 1.919798091280079, "language_loss": 0.78957605, "learning_rate": 2.0856606697018504e-06, "loss": 0.80891085, "num_input_tokens_seen": 90015085, "step": 4173, "time_per_iteration": 2.4933574199676514 }, { "auxiliary_loss_clip": 0.01149911, "auxiliary_loss_mlp": 0.01029961, "balance_loss_clip": 1.04977274, "balance_loss_mlp": 1.02165246, "epoch": 0.5018938255275657, "flos": 16873203778560.0, "grad_norm": 2.8500797342025574, "language_loss": 0.72968054, "learning_rate": 2.084882406392297e-06, "loss": 0.75147927, "num_input_tokens_seen": 90033045, "step": 4174, "time_per_iteration": 2.478384256362915 }, { "auxiliary_loss_clip": 0.011482, "auxiliary_loss_mlp": 0.01027174, "balance_loss_clip": 1.05123162, "balance_loss_mlp": 1.01927614, "epoch": 0.5020140684182047, "flos": 25515429073920.0, "grad_norm": 2.494079964511907, "language_loss": 0.70868623, "learning_rate": 2.0841041302061496e-06, "loss": 0.7304399, "num_input_tokens_seen": 90052505, "step": 4175, "time_per_iteration": 2.5475215911865234 }, { "auxiliary_loss_clip": 0.01142428, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.04672444, "balance_loss_mlp": 1.02592075, "epoch": 0.5021343113088439, "flos": 23659278791040.0, "grad_norm": 1.9479204045875718, "language_loss": 0.75687671, "learning_rate": 2.083325841261473e-06, "loss": 0.77864051, "num_input_tokens_seen": 90071565, "step": 4176, "time_per_iteration": 2.512023687362671 }, { "auxiliary_loss_clip": 0.01145399, "auxiliary_loss_mlp": 0.01025994, "balance_loss_clip": 1.04672837, "balance_loss_mlp": 1.01819158, "epoch": 0.502254554199483, "flos": 24534673148160.0, "grad_norm": 2.429520816426436, "language_loss": 0.66624248, "learning_rate": 2.0825475396763322e-06, "loss": 0.68795645, "num_input_tokens_seen": 90092215, "step": 4177, "time_per_iteration": 3.3415985107421875 }, { "auxiliary_loss_clip": 0.01098364, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.04720092, "balance_loss_mlp": 1.02111149, "epoch": 0.502374797090122, "flos": 34240285607040.0, "grad_norm": 1.3806383394855086, "language_loss": 0.65732622, "learning_rate": 2.081769225568796e-06, "loss": 0.67860615, "num_input_tokens_seen": 90114665, "step": 4178, "time_per_iteration": 2.730598211288452 }, { "auxiliary_loss_clip": 0.01168322, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.04987943, "balance_loss_mlp": 1.02526331, "epoch": 0.5024950399807612, "flos": 26031106679040.0, "grad_norm": 1.489354769754311, "language_loss": 0.75870955, "learning_rate": 2.0809908990569327e-06, "loss": 0.78073239, "num_input_tokens_seen": 90136445, "step": 4179, "time_per_iteration": 3.362760543823242 }, { "auxiliary_loss_clip": 0.01153864, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.05078077, "balance_loss_mlp": 1.02071762, "epoch": 0.5026152828714002, "flos": 21252438120960.0, "grad_norm": 1.6577920262841377, "language_loss": 0.78895926, "learning_rate": 2.0802125602588146e-06, "loss": 0.81078947, "num_input_tokens_seen": 90155710, "step": 4180, "time_per_iteration": 3.326996088027954 }, { "auxiliary_loss_clip": 0.0118223, "auxiliary_loss_mlp": 0.01033684, "balance_loss_clip": 1.05484164, "balance_loss_mlp": 1.02550054, "epoch": 0.5027355257620393, "flos": 30956111245440.0, "grad_norm": 1.8507698758535343, "language_loss": 0.66382253, "learning_rate": 2.0794342092925146e-06, "loss": 0.68598163, "num_input_tokens_seen": 90176845, "step": 4181, "time_per_iteration": 2.507242441177368 }, { "auxiliary_loss_clip": 0.01172468, "auxiliary_loss_mlp": 0.01031505, "balance_loss_clip": 1.05574775, "balance_loss_mlp": 1.02333903, "epoch": 0.5028557686526784, "flos": 24791147233920.0, "grad_norm": 1.928388142887104, "language_loss": 0.68008, "learning_rate": 2.078655846276108e-06, "loss": 0.70211977, "num_input_tokens_seen": 90197175, "step": 4182, "time_per_iteration": 2.4964892864227295 }, { "auxiliary_loss_clip": 0.01148994, "auxiliary_loss_mlp": 0.01028131, "balance_loss_clip": 1.05040455, "balance_loss_mlp": 1.01983976, "epoch": 0.5029760115433175, "flos": 22966992990720.0, "grad_norm": 2.137462344561256, "language_loss": 0.68710399, "learning_rate": 2.0778774713276727e-06, "loss": 0.70887524, "num_input_tokens_seen": 90216650, "step": 4183, "time_per_iteration": 3.2987024784088135 }, { "auxiliary_loss_clip": 0.01164778, "auxiliary_loss_mlp": 0.01029592, "balance_loss_clip": 1.04963803, "balance_loss_mlp": 1.02106857, "epoch": 0.5030962544339566, "flos": 15305164485120.0, "grad_norm": 2.1077179033705704, "language_loss": 0.68073726, "learning_rate": 2.077099084565287e-06, "loss": 0.70268095, "num_input_tokens_seen": 90234055, "step": 4184, "time_per_iteration": 2.4242732524871826 }, { "auxiliary_loss_clip": 0.01147657, "auxiliary_loss_mlp": 0.01028727, "balance_loss_clip": 1.0478301, "balance_loss_mlp": 1.02064419, "epoch": 0.5032164973245957, "flos": 24494847943680.0, "grad_norm": 2.3040838560828556, "language_loss": 0.65544063, "learning_rate": 2.0763206861070313e-06, "loss": 0.67720449, "num_input_tokens_seen": 90253115, "step": 4185, "time_per_iteration": 2.5097901821136475 }, { "auxiliary_loss_clip": 0.01185076, "auxiliary_loss_mlp": 0.01032256, "balance_loss_clip": 1.05550051, "balance_loss_mlp": 1.02372706, "epoch": 0.5033367402152348, "flos": 16213452721920.0, "grad_norm": 2.0072480334691063, "language_loss": 0.75391686, "learning_rate": 2.0755422760709876e-06, "loss": 0.77609015, "num_input_tokens_seen": 90270515, "step": 4186, "time_per_iteration": 2.399305820465088 }, { "auxiliary_loss_clip": 0.01119322, "auxiliary_loss_mlp": 0.01033237, "balance_loss_clip": 1.04549003, "balance_loss_mlp": 1.024845, "epoch": 0.5034569831058738, "flos": 21391375927680.0, "grad_norm": 1.9119030243796675, "language_loss": 0.76888931, "learning_rate": 2.0747638545752417e-06, "loss": 0.79041493, "num_input_tokens_seen": 90289075, "step": 4187, "time_per_iteration": 2.5306806564331055 }, { "auxiliary_loss_clip": 0.01154907, "auxiliary_loss_mlp": 0.01026302, "balance_loss_clip": 1.05417061, "balance_loss_mlp": 1.01818991, "epoch": 0.503577225996513, "flos": 20558751690240.0, "grad_norm": 2.0261212518530884, "language_loss": 0.83200598, "learning_rate": 2.073985421737878e-06, "loss": 0.85381806, "num_input_tokens_seen": 90306385, "step": 4188, "time_per_iteration": 2.4842543601989746 }, { "auxiliary_loss_clip": 0.01171887, "auxiliary_loss_mlp": 0.01026604, "balance_loss_clip": 1.05360103, "balance_loss_mlp": 1.01855719, "epoch": 0.5036974688871521, "flos": 27229157930880.0, "grad_norm": 2.016543792409451, "language_loss": 0.74444741, "learning_rate": 2.0732069776769844e-06, "loss": 0.76643234, "num_input_tokens_seen": 90323795, "step": 4189, "time_per_iteration": 2.517836570739746 }, { "auxiliary_loss_clip": 0.01184411, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.05654585, "balance_loss_mlp": 1.02113903, "epoch": 0.5038177117777911, "flos": 20412164286720.0, "grad_norm": 2.0890636437834433, "language_loss": 0.73028249, "learning_rate": 2.072428522510651e-06, "loss": 0.75242567, "num_input_tokens_seen": 90340360, "step": 4190, "time_per_iteration": 2.4159305095672607 }, { "auxiliary_loss_clip": 0.01133694, "auxiliary_loss_mlp": 0.01028127, "balance_loss_clip": 1.04952073, "balance_loss_mlp": 1.02043819, "epoch": 0.5039379546684303, "flos": 21907987286400.0, "grad_norm": 2.1990393357000757, "language_loss": 0.76394939, "learning_rate": 2.071650056356968e-06, "loss": 0.78556764, "num_input_tokens_seen": 90357900, "step": 4191, "time_per_iteration": 2.5060040950775146 }, { "auxiliary_loss_clip": 0.01181761, "auxiliary_loss_mlp": 0.01033053, "balance_loss_clip": 1.05405402, "balance_loss_mlp": 1.02518511, "epoch": 0.5040581975590693, "flos": 20010718909440.0, "grad_norm": 1.872558425835631, "language_loss": 0.79949474, "learning_rate": 2.070871579334028e-06, "loss": 0.82164288, "num_input_tokens_seen": 90377010, "step": 4192, "time_per_iteration": 2.428006887435913 }, { "auxiliary_loss_clip": 0.01180101, "auxiliary_loss_mlp": 0.01027685, "balance_loss_clip": 1.05277407, "balance_loss_mlp": 1.01966202, "epoch": 0.5041784404497084, "flos": 20959837931520.0, "grad_norm": 3.869194947901325, "language_loss": 0.71947128, "learning_rate": 2.0700930915599264e-06, "loss": 0.74154913, "num_input_tokens_seen": 90396740, "step": 4193, "time_per_iteration": 2.4468986988067627 }, { "auxiliary_loss_clip": 0.01182096, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.05380321, "balance_loss_mlp": 1.02240825, "epoch": 0.5042986833403476, "flos": 12495082757760.0, "grad_norm": 1.9991825328208133, "language_loss": 0.78426707, "learning_rate": 2.0693145931527583e-06, "loss": 0.80639184, "num_input_tokens_seen": 90413220, "step": 4194, "time_per_iteration": 2.3959057331085205 }, { "auxiliary_loss_clip": 0.01148845, "auxiliary_loss_mlp": 0.01031137, "balance_loss_clip": 1.0500226, "balance_loss_mlp": 1.0230068, "epoch": 0.5044189262309866, "flos": 29202305788800.0, "grad_norm": 1.5362573874502132, "language_loss": 0.7786479, "learning_rate": 2.068536084230622e-06, "loss": 0.8004477, "num_input_tokens_seen": 90435085, "step": 4195, "time_per_iteration": 2.545450210571289 }, { "auxiliary_loss_clip": 0.01168461, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 1.05380082, "balance_loss_mlp": 1.02468359, "epoch": 0.5045391691216257, "flos": 23873198238720.0, "grad_norm": 1.9581666163537965, "language_loss": 0.88449043, "learning_rate": 2.067757564911616e-06, "loss": 0.90651625, "num_input_tokens_seen": 90453660, "step": 4196, "time_per_iteration": 2.4743988513946533 }, { "auxiliary_loss_clip": 0.01160993, "auxiliary_loss_mlp": 0.00763363, "balance_loss_clip": 1.05141652, "balance_loss_mlp": 1.00016475, "epoch": 0.5046594120122648, "flos": 24644990793600.0, "grad_norm": 3.0464101752092607, "language_loss": 0.92837322, "learning_rate": 2.0669790353138407e-06, "loss": 0.9476167, "num_input_tokens_seen": 90472625, "step": 4197, "time_per_iteration": 2.523627281188965 }, { "auxiliary_loss_clip": 0.01136121, "auxiliary_loss_mlp": 0.00763012, "balance_loss_clip": 1.05122519, "balance_loss_mlp": 1.00025356, "epoch": 0.5047796549029039, "flos": 23362835846400.0, "grad_norm": 2.06997513878877, "language_loss": 0.73087668, "learning_rate": 2.0662004955553995e-06, "loss": 0.74986798, "num_input_tokens_seen": 90492325, "step": 4198, "time_per_iteration": 2.530907154083252 }, { "auxiliary_loss_clip": 0.01148733, "auxiliary_loss_mlp": 0.01023642, "balance_loss_clip": 1.04856849, "balance_loss_mlp": 1.01592374, "epoch": 0.5048998977935429, "flos": 17304095329920.0, "grad_norm": 1.9088304713640585, "language_loss": 0.76845455, "learning_rate": 2.065421945754395e-06, "loss": 0.7901783, "num_input_tokens_seen": 90510055, "step": 4199, "time_per_iteration": 2.4702606201171875 }, { "auxiliary_loss_clip": 0.01127468, "auxiliary_loss_mlp": 0.01027449, "balance_loss_clip": 1.05005312, "balance_loss_mlp": 1.01991773, "epoch": 0.505020140684182, "flos": 34856979235200.0, "grad_norm": 1.592849459414632, "language_loss": 0.78183734, "learning_rate": 2.0646433860289344e-06, "loss": 0.80338645, "num_input_tokens_seen": 90528980, "step": 4200, "time_per_iteration": 2.6616246700286865 }, { "auxiliary_loss_clip": 0.01172984, "auxiliary_loss_mlp": 0.00763655, "balance_loss_clip": 1.05288565, "balance_loss_mlp": 1.00022566, "epoch": 0.5051403835748212, "flos": 24863974058880.0, "grad_norm": 2.2449877647319285, "language_loss": 0.82668477, "learning_rate": 2.0638648164971233e-06, "loss": 0.84605116, "num_input_tokens_seen": 90547445, "step": 4201, "time_per_iteration": 2.486485242843628 }, { "auxiliary_loss_clip": 0.01152563, "auxiliary_loss_mlp": 0.0102952, "balance_loss_clip": 1.05217338, "balance_loss_mlp": 1.02192688, "epoch": 0.5052606264654602, "flos": 20959694277120.0, "grad_norm": 1.8321090215994578, "language_loss": 0.88836199, "learning_rate": 2.06308623727707e-06, "loss": 0.91018283, "num_input_tokens_seen": 90567545, "step": 4202, "time_per_iteration": 2.488079071044922 }, { "auxiliary_loss_clip": 0.01162169, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.0506134, "balance_loss_mlp": 1.01898205, "epoch": 0.5053808693560993, "flos": 19642382893440.0, "grad_norm": 2.260185856820954, "language_loss": 0.76533687, "learning_rate": 2.0623076484868846e-06, "loss": 0.78723413, "num_input_tokens_seen": 90585000, "step": 4203, "time_per_iteration": 3.2754476070404053 }, { "auxiliary_loss_clip": 0.01057443, "auxiliary_loss_mlp": 0.01004305, "balance_loss_clip": 1.02130198, "balance_loss_mlp": 1.00317848, "epoch": 0.5055011122467384, "flos": 67504915019520.0, "grad_norm": 0.8469472341861977, "language_loss": 0.60696357, "learning_rate": 2.061529050244679e-06, "loss": 0.627581, "num_input_tokens_seen": 90644745, "step": 4204, "time_per_iteration": 3.034193515777588 }, { "auxiliary_loss_clip": 0.0114668, "auxiliary_loss_mlp": 0.01022801, "balance_loss_clip": 1.05068588, "balance_loss_mlp": 1.01427186, "epoch": 0.5056213551373775, "flos": 16872952383360.0, "grad_norm": 2.2513044406990224, "language_loss": 0.74369597, "learning_rate": 2.060750442668565e-06, "loss": 0.76539075, "num_input_tokens_seen": 90662500, "step": 4205, "time_per_iteration": 2.5891470909118652 }, { "auxiliary_loss_clip": 0.01169881, "auxiliary_loss_mlp": 0.01030287, "balance_loss_clip": 1.05556941, "balance_loss_mlp": 1.02191257, "epoch": 0.5057415980280165, "flos": 15334179696000.0, "grad_norm": 2.49031724293243, "language_loss": 0.64106423, "learning_rate": 2.059971825876657e-06, "loss": 0.66306591, "num_input_tokens_seen": 90677010, "step": 4206, "time_per_iteration": 2.414470672607422 }, { "auxiliary_loss_clip": 0.01170493, "auxiliary_loss_mlp": 0.01026891, "balance_loss_clip": 1.05423498, "balance_loss_mlp": 1.01861501, "epoch": 0.5058618409186557, "flos": 19025976574080.0, "grad_norm": 1.9791528682941184, "language_loss": 0.7663753, "learning_rate": 2.0591931999870713e-06, "loss": 0.78834915, "num_input_tokens_seen": 90695935, "step": 4207, "time_per_iteration": 4.025703191757202 }, { "auxiliary_loss_clip": 0.01065987, "auxiliary_loss_mlp": 0.01001798, "balance_loss_clip": 1.0204407, "balance_loss_mlp": 1.00072539, "epoch": 0.5059820838092948, "flos": 63453114080640.0, "grad_norm": 0.8289903686563834, "language_loss": 0.57591641, "learning_rate": 2.0584145651179234e-06, "loss": 0.59659427, "num_input_tokens_seen": 90751645, "step": 4208, "time_per_iteration": 3.039763927459717 }, { "auxiliary_loss_clip": 0.01155303, "auxiliary_loss_mlp": 0.00762425, "balance_loss_clip": 1.05463147, "balance_loss_mlp": 1.00016546, "epoch": 0.5061023266999338, "flos": 15441803821440.0, "grad_norm": 2.5622089982981513, "language_loss": 0.80391634, "learning_rate": 2.0576359213873327e-06, "loss": 0.82309365, "num_input_tokens_seen": 90766795, "step": 4209, "time_per_iteration": 2.4564402103424072 }, { "auxiliary_loss_clip": 0.01160104, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.04809022, "balance_loss_mlp": 1.02016318, "epoch": 0.506222569590573, "flos": 22451063990400.0, "grad_norm": 4.534793872075395, "language_loss": 0.70908451, "learning_rate": 2.056857268913419e-06, "loss": 0.73097014, "num_input_tokens_seen": 90786845, "step": 4210, "time_per_iteration": 3.2950143814086914 }, { "auxiliary_loss_clip": 0.01169104, "auxiliary_loss_mlp": 0.01027517, "balance_loss_clip": 1.05562043, "balance_loss_mlp": 1.0197382, "epoch": 0.506342812481212, "flos": 17558665994880.0, "grad_norm": 2.105991022313912, "language_loss": 0.83813787, "learning_rate": 2.056078607814303e-06, "loss": 0.86010408, "num_input_tokens_seen": 90802630, "step": 4211, "time_per_iteration": 2.411654233932495 }, { "auxiliary_loss_clip": 0.01168489, "auxiliary_loss_mlp": 0.01023847, "balance_loss_clip": 1.05369461, "balance_loss_mlp": 1.01562178, "epoch": 0.5064630553718511, "flos": 23402050519680.0, "grad_norm": 3.9351652022734696, "language_loss": 0.78508145, "learning_rate": 2.055299938208106e-06, "loss": 0.80700481, "num_input_tokens_seen": 90823620, "step": 4212, "time_per_iteration": 2.4986555576324463 }, { "auxiliary_loss_clip": 0.01174589, "auxiliary_loss_mlp": 0.01031573, "balance_loss_clip": 1.05630445, "balance_loss_mlp": 1.02319312, "epoch": 0.5065832982624903, "flos": 23987035416960.0, "grad_norm": 1.5746707234394322, "language_loss": 0.86405993, "learning_rate": 2.0545212602129526e-06, "loss": 0.88612157, "num_input_tokens_seen": 90843475, "step": 4213, "time_per_iteration": 2.480971574783325 }, { "auxiliary_loss_clip": 0.01143534, "auxiliary_loss_mlp": 0.01032386, "balance_loss_clip": 1.04665661, "balance_loss_mlp": 1.02359986, "epoch": 0.5067035411531293, "flos": 21503058289920.0, "grad_norm": 2.0019384238611893, "language_loss": 0.66305214, "learning_rate": 2.0537425739469673e-06, "loss": 0.68481135, "num_input_tokens_seen": 90862410, "step": 4214, "time_per_iteration": 2.48228120803833 }, { "auxiliary_loss_clip": 0.01074146, "auxiliary_loss_mlp": 0.01002184, "balance_loss_clip": 1.02130651, "balance_loss_mlp": 1.00107503, "epoch": 0.5068237840437684, "flos": 65934397687680.0, "grad_norm": 0.8419990211827253, "language_loss": 0.59471959, "learning_rate": 2.052963879528276e-06, "loss": 0.61548293, "num_input_tokens_seen": 90922280, "step": 4215, "time_per_iteration": 3.0256857872009277 }, { "auxiliary_loss_clip": 0.01169888, "auxiliary_loss_mlp": 0.0102373, "balance_loss_clip": 1.05455613, "balance_loss_mlp": 1.0157249, "epoch": 0.5069440269344075, "flos": 27264206626560.0, "grad_norm": 2.0111282731624147, "language_loss": 0.76598889, "learning_rate": 2.052185177075007e-06, "loss": 0.787925, "num_input_tokens_seen": 90941850, "step": 4216, "time_per_iteration": 2.5299932956695557 }, { "auxiliary_loss_clip": 0.0117132, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.05420351, "balance_loss_mlp": 1.02381301, "epoch": 0.5070642698250466, "flos": 23366319465600.0, "grad_norm": 1.6919714070192025, "language_loss": 0.8306703, "learning_rate": 2.051406466705288e-06, "loss": 0.85269845, "num_input_tokens_seen": 90961390, "step": 4217, "time_per_iteration": 2.4793636798858643 }, { "auxiliary_loss_clip": 0.01180918, "auxiliary_loss_mlp": 0.01027469, "balance_loss_clip": 1.05301011, "balance_loss_mlp": 1.01961899, "epoch": 0.5071845127156857, "flos": 20340127560960.0, "grad_norm": 1.829978519391727, "language_loss": 0.81053251, "learning_rate": 2.0506277485372486e-06, "loss": 0.83261639, "num_input_tokens_seen": 90980215, "step": 4218, "time_per_iteration": 2.4224464893341064 }, { "auxiliary_loss_clip": 0.01164191, "auxiliary_loss_mlp": 0.01032584, "balance_loss_clip": 1.05398405, "balance_loss_mlp": 1.02469254, "epoch": 0.5073047556063248, "flos": 12092955022080.0, "grad_norm": 1.8871532714754766, "language_loss": 0.67037475, "learning_rate": 2.04984902268902e-06, "loss": 0.69234252, "num_input_tokens_seen": 90997415, "step": 4219, "time_per_iteration": 2.4345171451568604 }, { "auxiliary_loss_clip": 0.01172974, "auxiliary_loss_mlp": 0.01029291, "balance_loss_clip": 1.05083847, "balance_loss_mlp": 1.02015352, "epoch": 0.5074249984969639, "flos": 19682854542720.0, "grad_norm": 2.601554207508566, "language_loss": 0.75460339, "learning_rate": 2.0490702892787345e-06, "loss": 0.77662605, "num_input_tokens_seen": 91016475, "step": 4220, "time_per_iteration": 2.4609060287475586 }, { "auxiliary_loss_clip": 0.01160354, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.05006528, "balance_loss_mlp": 1.02389121, "epoch": 0.5075452413876029, "flos": 28765703975040.0, "grad_norm": 2.192762152780997, "language_loss": 0.62366754, "learning_rate": 2.0482915484245246e-06, "loss": 0.64559019, "num_input_tokens_seen": 91038095, "step": 4221, "time_per_iteration": 2.505283832550049 }, { "auxiliary_loss_clip": 0.01117755, "auxiliary_loss_mlp": 0.01031162, "balance_loss_clip": 1.04777181, "balance_loss_mlp": 1.02243543, "epoch": 0.5076654842782421, "flos": 20339445202560.0, "grad_norm": 2.369725711986272, "language_loss": 0.84143519, "learning_rate": 2.047512800244526e-06, "loss": 0.86292434, "num_input_tokens_seen": 91053360, "step": 4222, "time_per_iteration": 2.531452178955078 }, { "auxiliary_loss_clip": 0.01167282, "auxiliary_loss_mlp": 0.01025243, "balance_loss_clip": 1.05313158, "balance_loss_mlp": 1.01694572, "epoch": 0.5077857271688812, "flos": 26359653404160.0, "grad_norm": 3.7636683653919003, "language_loss": 0.78961837, "learning_rate": 2.046734044856873e-06, "loss": 0.81154358, "num_input_tokens_seen": 91072770, "step": 4223, "time_per_iteration": 2.49385142326355 }, { "auxiliary_loss_clip": 0.01167541, "auxiliary_loss_mlp": 0.01029025, "balance_loss_clip": 1.05351055, "balance_loss_mlp": 1.02124691, "epoch": 0.5079059700595202, "flos": 21798962530560.0, "grad_norm": 1.897913170136811, "language_loss": 0.81250632, "learning_rate": 2.045955282379702e-06, "loss": 0.83447206, "num_input_tokens_seen": 91091430, "step": 4224, "time_per_iteration": 2.4464786052703857 }, { "auxiliary_loss_clip": 0.01164505, "auxiliary_loss_mlp": 0.01030386, "balance_loss_clip": 1.04941392, "balance_loss_mlp": 1.02194023, "epoch": 0.5080262129501594, "flos": 13187943175680.0, "grad_norm": 2.6944983267968388, "language_loss": 0.76109529, "learning_rate": 2.045176512931152e-06, "loss": 0.78304422, "num_input_tokens_seen": 91106060, "step": 4225, "time_per_iteration": 2.4173262119293213 }, { "auxiliary_loss_clip": 0.01143044, "auxiliary_loss_mlp": 0.01024754, "balance_loss_clip": 1.04974222, "balance_loss_mlp": 1.01703537, "epoch": 0.5081464558407984, "flos": 25301473712640.0, "grad_norm": 2.337237949388475, "language_loss": 0.76053709, "learning_rate": 2.0443977366293604e-06, "loss": 0.78221506, "num_input_tokens_seen": 91124100, "step": 4226, "time_per_iteration": 2.541816473007202 }, { "auxiliary_loss_clip": 0.01111162, "auxiliary_loss_mlp": 0.01032203, "balance_loss_clip": 1.04539144, "balance_loss_mlp": 1.02336955, "epoch": 0.5082666987314375, "flos": 30951226995840.0, "grad_norm": 1.6492625686937972, "language_loss": 0.76927078, "learning_rate": 2.043618953592468e-06, "loss": 0.79070437, "num_input_tokens_seen": 91146555, "step": 4227, "time_per_iteration": 2.661424160003662 }, { "auxiliary_loss_clip": 0.01155349, "auxiliary_loss_mlp": 0.01030147, "balance_loss_clip": 1.05310082, "balance_loss_mlp": 1.02155852, "epoch": 0.5083869416220766, "flos": 19682495406720.0, "grad_norm": 1.5038381746403708, "language_loss": 0.81146085, "learning_rate": 2.0428401639386144e-06, "loss": 0.83331573, "num_input_tokens_seen": 91167120, "step": 4228, "time_per_iteration": 2.524024724960327 }, { "auxiliary_loss_clip": 0.0105483, "auxiliary_loss_mlp": 0.01003571, "balance_loss_clip": 1.02039552, "balance_loss_mlp": 1.00253987, "epoch": 0.5085071845127157, "flos": 71817535589760.0, "grad_norm": 0.8183674946183168, "language_loss": 0.58113456, "learning_rate": 2.042061367785943e-06, "loss": 0.60171854, "num_input_tokens_seen": 91220260, "step": 4229, "time_per_iteration": 3.0499186515808105 }, { "auxiliary_loss_clip": 0.01143538, "auxiliary_loss_mlp": 0.0103079, "balance_loss_clip": 1.04867339, "balance_loss_mlp": 1.022493, "epoch": 0.5086274274033548, "flos": 35951608252800.0, "grad_norm": 2.068272594458992, "language_loss": 0.74709278, "learning_rate": 2.041282565252594e-06, "loss": 0.76883602, "num_input_tokens_seen": 91240425, "step": 4230, "time_per_iteration": 3.4399161338806152 }, { "auxiliary_loss_clip": 0.01140722, "auxiliary_loss_mlp": 0.01027647, "balance_loss_clip": 1.04955876, "balance_loss_mlp": 1.01965976, "epoch": 0.5087476702939938, "flos": 23513732881920.0, "grad_norm": 1.9248725173440195, "language_loss": 0.77212548, "learning_rate": 2.040503756456714e-06, "loss": 0.79380912, "num_input_tokens_seen": 91259635, "step": 4231, "time_per_iteration": 2.5535709857940674 }, { "auxiliary_loss_clip": 0.01160407, "auxiliary_loss_mlp": 0.0103112, "balance_loss_clip": 1.04937959, "balance_loss_mlp": 1.02263844, "epoch": 0.508867913184633, "flos": 15122091841920.0, "grad_norm": 1.8720259744479968, "language_loss": 0.78788388, "learning_rate": 2.0397249415164456e-06, "loss": 0.80979919, "num_input_tokens_seen": 91276990, "step": 4232, "time_per_iteration": 2.434131622314453 }, { "auxiliary_loss_clip": 0.01145754, "auxiliary_loss_mlp": 0.01027276, "balance_loss_clip": 1.04710853, "balance_loss_mlp": 1.01895523, "epoch": 0.508988156075272, "flos": 25885309374720.0, "grad_norm": 1.7991617762802905, "language_loss": 0.80021429, "learning_rate": 2.0389461205499354e-06, "loss": 0.82194459, "num_input_tokens_seen": 91296125, "step": 4233, "time_per_iteration": 3.3895139694213867 }, { "auxiliary_loss_clip": 0.01141641, "auxiliary_loss_mlp": 0.01029267, "balance_loss_clip": 1.04964304, "balance_loss_mlp": 1.02150595, "epoch": 0.5091083989659111, "flos": 13844857057920.0, "grad_norm": 2.59532224720696, "language_loss": 0.73436964, "learning_rate": 2.03816729367533e-06, "loss": 0.75607872, "num_input_tokens_seen": 91314280, "step": 4234, "time_per_iteration": 3.333515167236328 }, { "auxiliary_loss_clip": 0.01158594, "auxiliary_loss_mlp": 0.0103558, "balance_loss_clip": 1.05407727, "balance_loss_mlp": 1.02714014, "epoch": 0.5092286418565503, "flos": 21104881050240.0, "grad_norm": 2.296637304685272, "language_loss": 0.71788311, "learning_rate": 2.0373884610107765e-06, "loss": 0.73982489, "num_input_tokens_seen": 91334595, "step": 4235, "time_per_iteration": 2.520890235900879 }, { "auxiliary_loss_clip": 0.01168491, "auxiliary_loss_mlp": 0.01027052, "balance_loss_clip": 1.04858708, "balance_loss_mlp": 1.01885009, "epoch": 0.5093488847471893, "flos": 18621298972800.0, "grad_norm": 3.6892371380204545, "language_loss": 0.69298148, "learning_rate": 2.0366096226744225e-06, "loss": 0.71493697, "num_input_tokens_seen": 91349790, "step": 4236, "time_per_iteration": 2.4222216606140137 }, { "auxiliary_loss_clip": 0.01157311, "auxiliary_loss_mlp": 0.01035518, "balance_loss_clip": 1.05011976, "balance_loss_mlp": 1.02727509, "epoch": 0.5094691276378284, "flos": 23803783205760.0, "grad_norm": 1.780245047229767, "language_loss": 0.7679432, "learning_rate": 2.035830778784418e-06, "loss": 0.78987145, "num_input_tokens_seen": 91370465, "step": 4237, "time_per_iteration": 3.2091763019561768 }, { "auxiliary_loss_clip": 0.01156599, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 1.05279922, "balance_loss_mlp": 1.01682496, "epoch": 0.5095893705284675, "flos": 17420410546560.0, "grad_norm": 1.8942234409410152, "language_loss": 0.79986608, "learning_rate": 2.0350519294589134e-06, "loss": 0.82168573, "num_input_tokens_seen": 91388505, "step": 4238, "time_per_iteration": 2.4703798294067383 }, { "auxiliary_loss_clip": 0.01118546, "auxiliary_loss_mlp": 0.01023291, "balance_loss_clip": 1.04449272, "balance_loss_mlp": 1.01473761, "epoch": 0.5097096134191066, "flos": 25849362839040.0, "grad_norm": 1.6666450294808899, "language_loss": 0.82938766, "learning_rate": 2.0342730748160588e-06, "loss": 0.850806, "num_input_tokens_seen": 91408970, "step": 4239, "time_per_iteration": 2.5937604904174805 }, { "auxiliary_loss_clip": 0.01151448, "auxiliary_loss_mlp": 0.01028605, "balance_loss_clip": 1.04785407, "balance_loss_mlp": 1.02061224, "epoch": 0.5098298563097456, "flos": 27745122844800.0, "grad_norm": 2.869188560325743, "language_loss": 0.706734, "learning_rate": 2.033494214974006e-06, "loss": 0.72853452, "num_input_tokens_seen": 91430115, "step": 4240, "time_per_iteration": 2.5369811058044434 }, { "auxiliary_loss_clip": 0.01141895, "auxiliary_loss_mlp": 0.01029812, "balance_loss_clip": 1.04915464, "balance_loss_mlp": 1.02198005, "epoch": 0.5099500992003848, "flos": 21358913011200.0, "grad_norm": 1.703418559764768, "language_loss": 0.83784044, "learning_rate": 2.0327153500509067e-06, "loss": 0.85955751, "num_input_tokens_seen": 91449140, "step": 4241, "time_per_iteration": 2.4743709564208984 }, { "auxiliary_loss_clip": 0.01155068, "auxiliary_loss_mlp": 0.01030003, "balance_loss_clip": 1.05198455, "balance_loss_mlp": 1.02207017, "epoch": 0.5100703420910239, "flos": 19865999013120.0, "grad_norm": 2.0116333807045126, "language_loss": 0.84781206, "learning_rate": 2.031936480164916e-06, "loss": 0.86966276, "num_input_tokens_seen": 91466880, "step": 4242, "time_per_iteration": 2.4793429374694824 }, { "auxiliary_loss_clip": 0.01149133, "auxiliary_loss_mlp": 0.01028188, "balance_loss_clip": 1.05286491, "balance_loss_mlp": 1.02001047, "epoch": 0.5101905849816629, "flos": 24648797635200.0, "grad_norm": 4.389376168812962, "language_loss": 0.79635608, "learning_rate": 2.0311576054341857e-06, "loss": 0.8181293, "num_input_tokens_seen": 91487495, "step": 4243, "time_per_iteration": 2.5181028842926025 }, { "auxiliary_loss_clip": 0.01183291, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 1.05620539, "balance_loss_mlp": 1.01855731, "epoch": 0.5103108278723021, "flos": 22930076787840.0, "grad_norm": 1.7259891666790357, "language_loss": 0.62566996, "learning_rate": 2.0303787259768715e-06, "loss": 0.64777017, "num_input_tokens_seen": 91508395, "step": 4244, "time_per_iteration": 2.4243245124816895 }, { "auxiliary_loss_clip": 0.0115381, "auxiliary_loss_mlp": 0.01029591, "balance_loss_clip": 1.05220366, "balance_loss_mlp": 1.0216701, "epoch": 0.5104310707629411, "flos": 21506613736320.0, "grad_norm": 2.111999253410287, "language_loss": 0.6946708, "learning_rate": 2.0295998419111294e-06, "loss": 0.71650481, "num_input_tokens_seen": 91525685, "step": 4245, "time_per_iteration": 2.466531991958618 }, { "auxiliary_loss_clip": 0.01111886, "auxiliary_loss_mlp": 0.01031936, "balance_loss_clip": 1.04429841, "balance_loss_mlp": 1.02376997, "epoch": 0.5105513136535802, "flos": 14903180403840.0, "grad_norm": 3.5010966158179557, "language_loss": 0.733778, "learning_rate": 2.028820953355115e-06, "loss": 0.75521624, "num_input_tokens_seen": 91543785, "step": 4246, "time_per_iteration": 2.544123888015747 }, { "auxiliary_loss_clip": 0.01158315, "auxiliary_loss_mlp": 0.01029634, "balance_loss_clip": 1.04906607, "balance_loss_mlp": 1.02098274, "epoch": 0.5106715565442194, "flos": 22602212421120.0, "grad_norm": 1.6737099940698206, "language_loss": 0.7864427, "learning_rate": 2.0280420604269834e-06, "loss": 0.80832219, "num_input_tokens_seen": 91563325, "step": 4247, "time_per_iteration": 2.4977834224700928 }, { "auxiliary_loss_clip": 0.01067746, "auxiliary_loss_mlp": 0.0100102, "balance_loss_clip": 1.01934361, "balance_loss_mlp": 1.00002432, "epoch": 0.5107917994348584, "flos": 71027645558400.0, "grad_norm": 0.7001215243755687, "language_loss": 0.58915651, "learning_rate": 2.027263163244895e-06, "loss": 0.60984409, "num_input_tokens_seen": 91632450, "step": 4248, "time_per_iteration": 3.191516160964966 }, { "auxiliary_loss_clip": 0.01166075, "auxiliary_loss_mlp": 0.01028669, "balance_loss_clip": 1.05384707, "balance_loss_mlp": 1.02083135, "epoch": 0.5109120423254975, "flos": 24827416992000.0, "grad_norm": 1.6677924342726513, "language_loss": 0.74236333, "learning_rate": 2.026484261927005e-06, "loss": 0.76431084, "num_input_tokens_seen": 91651945, "step": 4249, "time_per_iteration": 2.4837357997894287 }, { "auxiliary_loss_clip": 0.01174665, "auxiliary_loss_mlp": 0.01027265, "balance_loss_clip": 1.05639207, "balance_loss_mlp": 1.01845551, "epoch": 0.5110322852161366, "flos": 21247661612160.0, "grad_norm": 2.009971883001086, "language_loss": 0.74127877, "learning_rate": 2.025705356591475e-06, "loss": 0.76329809, "num_input_tokens_seen": 91669635, "step": 4250, "time_per_iteration": 2.4449050426483154 }, { "auxiliary_loss_clip": 0.01045319, "auxiliary_loss_mlp": 0.00753165, "balance_loss_clip": 1.01980639, "balance_loss_mlp": 0.99992067, "epoch": 0.5111525281067757, "flos": 66457114358400.0, "grad_norm": 0.755468175708005, "language_loss": 0.57979679, "learning_rate": 2.024926447356462e-06, "loss": 0.59778166, "num_input_tokens_seen": 91731920, "step": 4251, "time_per_iteration": 3.0303187370300293 }, { "auxiliary_loss_clip": 0.01165466, "auxiliary_loss_mlp": 0.01033915, "balance_loss_clip": 1.05213022, "balance_loss_mlp": 1.02505744, "epoch": 0.5112727709974147, "flos": 14866731077760.0, "grad_norm": 2.3387553783665362, "language_loss": 0.78711414, "learning_rate": 2.024147534340127e-06, "loss": 0.8091079, "num_input_tokens_seen": 91749780, "step": 4252, "time_per_iteration": 2.4719812870025635 }, { "auxiliary_loss_clip": 0.01148648, "auxiliary_loss_mlp": 0.01026464, "balance_loss_clip": 1.04688764, "balance_loss_mlp": 1.01856589, "epoch": 0.5113930138880539, "flos": 21177600134400.0, "grad_norm": 1.7655880521304388, "language_loss": 0.79851246, "learning_rate": 2.02336861766063e-06, "loss": 0.82026362, "num_input_tokens_seen": 91768840, "step": 4253, "time_per_iteration": 2.5480082035064697 }, { "auxiliary_loss_clip": 0.01174237, "auxiliary_loss_mlp": 0.01027646, "balance_loss_clip": 1.05411017, "balance_loss_mlp": 1.01945055, "epoch": 0.511513256778693, "flos": 20409111630720.0, "grad_norm": 1.7485085200015165, "language_loss": 0.78985453, "learning_rate": 2.0225896974361327e-06, "loss": 0.81187338, "num_input_tokens_seen": 91788945, "step": 4254, "time_per_iteration": 2.5228259563446045 }, { "auxiliary_loss_clip": 0.01052463, "auxiliary_loss_mlp": 0.01003924, "balance_loss_clip": 1.02267146, "balance_loss_mlp": 1.0027678, "epoch": 0.511633499669332, "flos": 69879975131520.0, "grad_norm": 0.8621654082889129, "language_loss": 0.59980214, "learning_rate": 2.0218107737847962e-06, "loss": 0.62036598, "num_input_tokens_seen": 91850990, "step": 4255, "time_per_iteration": 3.131788730621338 }, { "auxiliary_loss_clip": 0.01181696, "auxiliary_loss_mlp": 0.01028919, "balance_loss_clip": 1.05496347, "balance_loss_mlp": 1.02102172, "epoch": 0.5117537425599712, "flos": 24097855852800.0, "grad_norm": 1.8905596839496766, "language_loss": 0.7469821, "learning_rate": 2.0210318468247826e-06, "loss": 0.76908821, "num_input_tokens_seen": 91869960, "step": 4256, "time_per_iteration": 3.301748275756836 }, { "auxiliary_loss_clip": 0.01152331, "auxiliary_loss_mlp": 0.01025683, "balance_loss_clip": 1.04946208, "balance_loss_mlp": 1.01835787, "epoch": 0.5118739854506102, "flos": 20959550622720.0, "grad_norm": 1.766194154064807, "language_loss": 0.81982529, "learning_rate": 2.020252916674255e-06, "loss": 0.84160548, "num_input_tokens_seen": 91889075, "step": 4257, "time_per_iteration": 2.518026113510132 }, { "auxiliary_loss_clip": 0.01167079, "auxiliary_loss_mlp": 0.01026932, "balance_loss_clip": 1.05014861, "balance_loss_mlp": 1.01886749, "epoch": 0.5119942283412493, "flos": 17457326749440.0, "grad_norm": 1.8069203388510389, "language_loss": 0.81198859, "learning_rate": 2.019473983451375e-06, "loss": 0.8339287, "num_input_tokens_seen": 91907495, "step": 4258, "time_per_iteration": 2.4647676944732666 }, { "auxiliary_loss_clip": 0.01144511, "auxiliary_loss_mlp": 0.01027322, "balance_loss_clip": 1.04867578, "balance_loss_mlp": 1.01919746, "epoch": 0.5121144712318885, "flos": 21066743784960.0, "grad_norm": 2.0220925179159495, "language_loss": 0.71626347, "learning_rate": 2.0186950472743076e-06, "loss": 0.7379818, "num_input_tokens_seen": 91927400, "step": 4259, "time_per_iteration": 2.5598652362823486 }, { "auxiliary_loss_clip": 0.01181697, "auxiliary_loss_mlp": 0.01027339, "balance_loss_clip": 1.0537957, "balance_loss_mlp": 1.01917958, "epoch": 0.5122347141225275, "flos": 19860791541120.0, "grad_norm": 1.6777330566778843, "language_loss": 0.74287784, "learning_rate": 2.0179161082612162e-06, "loss": 0.76496816, "num_input_tokens_seen": 91946790, "step": 4260, "time_per_iteration": 3.3007566928863525 }, { "auxiliary_loss_clip": 0.01147798, "auxiliary_loss_mlp": 0.01027528, "balance_loss_clip": 1.04789376, "balance_loss_mlp": 1.01999366, "epoch": 0.5123549570131666, "flos": 22528487756160.0, "grad_norm": 1.9901296992834492, "language_loss": 0.7300002, "learning_rate": 2.017137166530266e-06, "loss": 0.75175345, "num_input_tokens_seen": 91966325, "step": 4261, "time_per_iteration": 3.289705514907837 }, { "auxiliary_loss_clip": 0.01154071, "auxiliary_loss_mlp": 0.01029635, "balance_loss_clip": 1.04925203, "balance_loss_mlp": 1.02190161, "epoch": 0.5124751999038056, "flos": 20333375804160.0, "grad_norm": 2.0714490609464873, "language_loss": 0.79990327, "learning_rate": 2.0163582221996213e-06, "loss": 0.82174033, "num_input_tokens_seen": 91984700, "step": 4262, "time_per_iteration": 2.6133012771606445 }, { "auxiliary_loss_clip": 0.01153555, "auxiliary_loss_mlp": 0.01027758, "balance_loss_clip": 1.05056953, "balance_loss_mlp": 1.0198065, "epoch": 0.5125954427944448, "flos": 39785970211200.0, "grad_norm": 2.4268200798706934, "language_loss": 0.68229413, "learning_rate": 2.015579275387446e-06, "loss": 0.70410722, "num_input_tokens_seen": 92010020, "step": 4263, "time_per_iteration": 3.3962647914886475 }, { "auxiliary_loss_clip": 0.01144284, "auxiliary_loss_mlp": 0.01027971, "balance_loss_clip": 1.05089891, "balance_loss_mlp": 1.02004635, "epoch": 0.5127156856850839, "flos": 29205394358400.0, "grad_norm": 1.9084650485949497, "language_loss": 0.68326336, "learning_rate": 2.0148003262119085e-06, "loss": 0.70498592, "num_input_tokens_seen": 92030990, "step": 4264, "time_per_iteration": 2.5450198650360107 }, { "auxiliary_loss_clip": 0.01137384, "auxiliary_loss_mlp": 0.01028729, "balance_loss_clip": 1.04927349, "balance_loss_mlp": 1.0205214, "epoch": 0.5128359285757229, "flos": 13553693412480.0, "grad_norm": 1.8918325748929428, "language_loss": 0.76884156, "learning_rate": 2.0140213747911728e-06, "loss": 0.79050267, "num_input_tokens_seen": 92049525, "step": 4265, "time_per_iteration": 2.5075795650482178 }, { "auxiliary_loss_clip": 0.01135409, "auxiliary_loss_mlp": 0.01030036, "balance_loss_clip": 1.04993486, "balance_loss_mlp": 1.02166152, "epoch": 0.5129561714663621, "flos": 25192089820800.0, "grad_norm": 1.9729055609448587, "language_loss": 0.80425715, "learning_rate": 2.013242421243406e-06, "loss": 0.82591158, "num_input_tokens_seen": 92068430, "step": 4266, "time_per_iteration": 2.5770020484924316 }, { "auxiliary_loss_clip": 0.01125901, "auxiliary_loss_mlp": 0.01021837, "balance_loss_clip": 1.05133057, "balance_loss_mlp": 1.01396894, "epoch": 0.5130764143570011, "flos": 18150223080960.0, "grad_norm": 1.4989578344089376, "language_loss": 0.7892133, "learning_rate": 2.012463465686774e-06, "loss": 0.81069064, "num_input_tokens_seen": 92088180, "step": 4267, "time_per_iteration": 2.601551055908203 }, { "auxiliary_loss_clip": 0.01050355, "auxiliary_loss_mlp": 0.01004367, "balance_loss_clip": 1.03345513, "balance_loss_mlp": 1.00287092, "epoch": 0.5131966572476402, "flos": 59794896418560.0, "grad_norm": 0.7681886659106236, "language_loss": 0.54740709, "learning_rate": 2.0116845082394446e-06, "loss": 0.56795424, "num_input_tokens_seen": 92153015, "step": 4268, "time_per_iteration": 3.1891331672668457 }, { "auxiliary_loss_clip": 0.01172765, "auxiliary_loss_mlp": 0.01024595, "balance_loss_clip": 1.05345535, "balance_loss_mlp": 1.01644742, "epoch": 0.5133169001382794, "flos": 18515219132160.0, "grad_norm": 1.853034544470862, "language_loss": 0.78480369, "learning_rate": 2.0109055490195836e-06, "loss": 0.80677724, "num_input_tokens_seen": 92171470, "step": 4269, "time_per_iteration": 2.454500913619995 }, { "auxiliary_loss_clip": 0.01113876, "auxiliary_loss_mlp": 0.01025866, "balance_loss_clip": 1.04113901, "balance_loss_mlp": 1.01811767, "epoch": 0.5134371430289184, "flos": 15523537219200.0, "grad_norm": 2.2869226140867918, "language_loss": 0.64670295, "learning_rate": 2.0101265881453605e-06, "loss": 0.66810036, "num_input_tokens_seen": 92189945, "step": 4270, "time_per_iteration": 2.58471941947937 }, { "auxiliary_loss_clip": 0.01149587, "auxiliary_loss_mlp": 0.01031538, "balance_loss_clip": 1.05353785, "balance_loss_mlp": 1.02380776, "epoch": 0.5135573859195575, "flos": 21433786911360.0, "grad_norm": 2.088858675280562, "language_loss": 0.78387833, "learning_rate": 2.009347625734941e-06, "loss": 0.80568963, "num_input_tokens_seen": 92209855, "step": 4271, "time_per_iteration": 2.5243873596191406 }, { "auxiliary_loss_clip": 0.01187983, "auxiliary_loss_mlp": 0.01027625, "balance_loss_clip": 1.0594821, "balance_loss_mlp": 1.01976943, "epoch": 0.5136776288101966, "flos": 17712651600000.0, "grad_norm": 2.1007574975095444, "language_loss": 0.74510866, "learning_rate": 2.0085686619064954e-06, "loss": 0.76726472, "num_input_tokens_seen": 92226295, "step": 4272, "time_per_iteration": 2.4277310371398926 }, { "auxiliary_loss_clip": 0.01173347, "auxiliary_loss_mlp": 0.01030694, "balance_loss_clip": 1.056095, "balance_loss_mlp": 1.02272475, "epoch": 0.5137978717008357, "flos": 16581680997120.0, "grad_norm": 3.1822277116267355, "language_loss": 0.82629323, "learning_rate": 2.00778969677819e-06, "loss": 0.84833366, "num_input_tokens_seen": 92243330, "step": 4273, "time_per_iteration": 2.473097324371338 }, { "auxiliary_loss_clip": 0.0115095, "auxiliary_loss_mlp": 0.01024046, "balance_loss_clip": 1.05058813, "balance_loss_mlp": 1.01648211, "epoch": 0.5139181145914747, "flos": 20668243322880.0, "grad_norm": 1.9084091067323563, "language_loss": 0.63895261, "learning_rate": 2.0070107304681934e-06, "loss": 0.66070259, "num_input_tokens_seen": 92262285, "step": 4274, "time_per_iteration": 2.5414175987243652 }, { "auxiliary_loss_clip": 0.01139716, "auxiliary_loss_mlp": 0.01027266, "balance_loss_clip": 1.05352378, "balance_loss_mlp": 1.01905286, "epoch": 0.5140383574821139, "flos": 32926996546560.0, "grad_norm": 1.8882609542090956, "language_loss": 0.78203666, "learning_rate": 2.006231763094675e-06, "loss": 0.80370641, "num_input_tokens_seen": 92283305, "step": 4275, "time_per_iteration": 2.6354713439941406 }, { "auxiliary_loss_clip": 0.01149281, "auxiliary_loss_mlp": 0.0102304, "balance_loss_clip": 1.05418932, "balance_loss_mlp": 1.01544595, "epoch": 0.514158600372753, "flos": 19537093152000.0, "grad_norm": 2.4021131350950373, "language_loss": 0.87360674, "learning_rate": 2.0054527947758027e-06, "loss": 0.89532995, "num_input_tokens_seen": 92302105, "step": 4276, "time_per_iteration": 2.600318670272827 }, { "auxiliary_loss_clip": 0.01071156, "auxiliary_loss_mlp": 0.01003584, "balance_loss_clip": 1.02186275, "balance_loss_mlp": 1.00257671, "epoch": 0.514278843263392, "flos": 62523855279360.0, "grad_norm": 0.7206748185050432, "language_loss": 0.55898213, "learning_rate": 2.004673825629746e-06, "loss": 0.57972956, "num_input_tokens_seen": 92362885, "step": 4277, "time_per_iteration": 3.039154291152954 }, { "auxiliary_loss_clip": 0.0114906, "auxiliary_loss_mlp": 0.01028679, "balance_loss_clip": 1.05074024, "balance_loss_mlp": 1.02107334, "epoch": 0.5143990861540312, "flos": 25882328545920.0, "grad_norm": 1.5859270425814573, "language_loss": 0.72312629, "learning_rate": 2.0038948557746744e-06, "loss": 0.74490374, "num_input_tokens_seen": 92384740, "step": 4278, "time_per_iteration": 2.527926206588745 }, { "auxiliary_loss_clip": 0.01162618, "auxiliary_loss_mlp": 0.01026902, "balance_loss_clip": 1.05229259, "balance_loss_mlp": 1.01935351, "epoch": 0.5145193290446702, "flos": 23330660238720.0, "grad_norm": 1.7345291500477011, "language_loss": 0.75129551, "learning_rate": 2.0031158853287558e-06, "loss": 0.77319074, "num_input_tokens_seen": 92405175, "step": 4279, "time_per_iteration": 2.477201461791992 }, { "auxiliary_loss_clip": 0.011537, "auxiliary_loss_mlp": 0.01030551, "balance_loss_clip": 1.05419528, "balance_loss_mlp": 1.02298689, "epoch": 0.5146395719353093, "flos": 22856603518080.0, "grad_norm": 2.1843884344457845, "language_loss": 0.70376295, "learning_rate": 2.0023369144101593e-06, "loss": 0.72560543, "num_input_tokens_seen": 92423345, "step": 4280, "time_per_iteration": 2.5090935230255127 }, { "auxiliary_loss_clip": 0.01142817, "auxiliary_loss_mlp": 0.01027925, "balance_loss_clip": 1.04797149, "balance_loss_mlp": 1.02022433, "epoch": 0.5147598148259485, "flos": 26391577616640.0, "grad_norm": 1.6592812695804138, "language_loss": 0.76773685, "learning_rate": 2.0015579431370555e-06, "loss": 0.78944433, "num_input_tokens_seen": 92445025, "step": 4281, "time_per_iteration": 2.5510001182556152 }, { "auxiliary_loss_clip": 0.01164272, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.05373812, "balance_loss_mlp": 1.01799083, "epoch": 0.5148800577165875, "flos": 29965694561280.0, "grad_norm": 2.785029311071165, "language_loss": 0.69829583, "learning_rate": 2.000778971627612e-06, "loss": 0.72019702, "num_input_tokens_seen": 92464490, "step": 4282, "time_per_iteration": 2.52500057220459 }, { "auxiliary_loss_clip": 0.0114497, "auxiliary_loss_mlp": 0.01032913, "balance_loss_clip": 1.0489862, "balance_loss_mlp": 1.02492595, "epoch": 0.5150003006072266, "flos": 17931383470080.0, "grad_norm": 1.9897955407156322, "language_loss": 0.90266895, "learning_rate": 2e-06, "loss": 0.92444777, "num_input_tokens_seen": 92482085, "step": 4283, "time_per_iteration": 3.3161795139312744 }, { "auxiliary_loss_clip": 0.01179945, "auxiliary_loss_mlp": 0.01028807, "balance_loss_clip": 1.05495954, "balance_loss_mlp": 1.02117157, "epoch": 0.5151205434978657, "flos": 18478733892480.0, "grad_norm": 1.699246641932537, "language_loss": 0.85659552, "learning_rate": 1.9992210283723878e-06, "loss": 0.87868309, "num_input_tokens_seen": 92499325, "step": 4284, "time_per_iteration": 2.410841226577759 }, { "auxiliary_loss_clip": 0.01180323, "auxiliary_loss_mlp": 0.01027353, "balance_loss_clip": 1.0563767, "balance_loss_mlp": 1.01995599, "epoch": 0.5152407863885048, "flos": 25341263003520.0, "grad_norm": 1.6415695985877476, "language_loss": 0.79565156, "learning_rate": 1.9984420568629448e-06, "loss": 0.81772828, "num_input_tokens_seen": 92522090, "step": 4285, "time_per_iteration": 2.527308940887451 }, { "auxiliary_loss_clip": 0.01167141, "auxiliary_loss_mlp": 0.0102579, "balance_loss_clip": 1.05316257, "balance_loss_mlp": 1.0186851, "epoch": 0.5153610292791438, "flos": 18329740277760.0, "grad_norm": 2.0560583649139854, "language_loss": 0.78259861, "learning_rate": 1.9976630855898405e-06, "loss": 0.80452788, "num_input_tokens_seen": 92539845, "step": 4286, "time_per_iteration": 2.4286062717437744 }, { "auxiliary_loss_clip": 0.01146903, "auxiliary_loss_mlp": 0.01024541, "balance_loss_clip": 1.04628944, "balance_loss_mlp": 1.01700091, "epoch": 0.515481272169783, "flos": 30409945971840.0, "grad_norm": 2.008342408011769, "language_loss": 0.74458647, "learning_rate": 1.9968841146712445e-06, "loss": 0.76630092, "num_input_tokens_seen": 92559460, "step": 4287, "time_per_iteration": 3.4090259075164795 }, { "auxiliary_loss_clip": 0.01109598, "auxiliary_loss_mlp": 0.00762022, "balance_loss_clip": 1.04694295, "balance_loss_mlp": 1.00020337, "epoch": 0.5156015150604221, "flos": 23037305863680.0, "grad_norm": 1.6638910783958147, "language_loss": 0.71525955, "learning_rate": 1.996105144225326e-06, "loss": 0.73397571, "num_input_tokens_seen": 92579695, "step": 4288, "time_per_iteration": 3.406270742416382 }, { "auxiliary_loss_clip": 0.01166683, "auxiliary_loss_mlp": 0.01029889, "balance_loss_clip": 1.05421185, "balance_loss_mlp": 1.02243233, "epoch": 0.5157217579510611, "flos": 17858556645120.0, "grad_norm": 1.7709147666094829, "language_loss": 0.78625512, "learning_rate": 1.995326174370254e-06, "loss": 0.80822086, "num_input_tokens_seen": 92598795, "step": 4289, "time_per_iteration": 2.4571619033813477 }, { "auxiliary_loss_clip": 0.01163046, "auxiliary_loss_mlp": 0.00761795, "balance_loss_clip": 1.05121469, "balance_loss_mlp": 1.00013304, "epoch": 0.5158420008417003, "flos": 19171486569600.0, "grad_norm": 1.557681297857005, "language_loss": 0.73106652, "learning_rate": 1.994547205224197e-06, "loss": 0.75031495, "num_input_tokens_seen": 92617700, "step": 4290, "time_per_iteration": 3.2195868492126465 }, { "auxiliary_loss_clip": 0.01145769, "auxiliary_loss_mlp": 0.0102658, "balance_loss_clip": 1.05079579, "balance_loss_mlp": 1.01839888, "epoch": 0.5159622437323393, "flos": 22419534827520.0, "grad_norm": 1.9993505968268424, "language_loss": 0.67429018, "learning_rate": 1.993768236905325e-06, "loss": 0.69601369, "num_input_tokens_seen": 92638370, "step": 4291, "time_per_iteration": 2.5067594051361084 }, { "auxiliary_loss_clip": 0.01147966, "auxiliary_loss_mlp": 0.01025063, "balance_loss_clip": 1.05018187, "balance_loss_mlp": 1.01724887, "epoch": 0.5160824866229784, "flos": 24603010773120.0, "grad_norm": 2.5267436636036757, "language_loss": 0.65778154, "learning_rate": 1.992989269531807e-06, "loss": 0.67951179, "num_input_tokens_seen": 92657180, "step": 4292, "time_per_iteration": 2.5515949726104736 }, { "auxiliary_loss_clip": 0.01152177, "auxiliary_loss_mlp": 0.01026493, "balance_loss_clip": 1.04967022, "balance_loss_mlp": 1.01865482, "epoch": 0.5162027295136175, "flos": 18002737837440.0, "grad_norm": 2.4856415194819648, "language_loss": 0.68858957, "learning_rate": 1.99221030322181e-06, "loss": 0.71037626, "num_input_tokens_seen": 92673985, "step": 4293, "time_per_iteration": 2.4755516052246094 }, { "auxiliary_loss_clip": 0.01155531, "auxiliary_loss_mlp": 0.01027698, "balance_loss_clip": 1.05152452, "balance_loss_mlp": 1.02004457, "epoch": 0.5163229724042566, "flos": 27344611221120.0, "grad_norm": 1.6513786060270488, "language_loss": 0.80928516, "learning_rate": 1.991431338093505e-06, "loss": 0.83111745, "num_input_tokens_seen": 92696340, "step": 4294, "time_per_iteration": 2.544196605682373 }, { "auxiliary_loss_clip": 0.01151707, "auxiliary_loss_mlp": 0.01026931, "balance_loss_clip": 1.0545224, "balance_loss_mlp": 1.0196445, "epoch": 0.5164432152948957, "flos": 21762764599680.0, "grad_norm": 2.8668974819848865, "language_loss": 0.79280865, "learning_rate": 1.9906523742650587e-06, "loss": 0.8145951, "num_input_tokens_seen": 92715200, "step": 4295, "time_per_iteration": 2.4912047386169434 }, { "auxiliary_loss_clip": 0.01180272, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.05210805, "balance_loss_mlp": 1.02317548, "epoch": 0.5165634581855347, "flos": 25550334115200.0, "grad_norm": 2.05186771821765, "language_loss": 0.77610034, "learning_rate": 1.9898734118546397e-06, "loss": 0.79821783, "num_input_tokens_seen": 92735150, "step": 4296, "time_per_iteration": 2.498521327972412 }, { "auxiliary_loss_clip": 0.01100435, "auxiliary_loss_mlp": 0.01025664, "balance_loss_clip": 1.04560828, "balance_loss_mlp": 1.01712275, "epoch": 0.5166837010761739, "flos": 19901191363200.0, "grad_norm": 1.5975486388966356, "language_loss": 0.80442047, "learning_rate": 1.989094450980416e-06, "loss": 0.82568151, "num_input_tokens_seen": 92755250, "step": 4297, "time_per_iteration": 2.6359899044036865 }, { "auxiliary_loss_clip": 0.01164353, "auxiliary_loss_mlp": 0.01023905, "balance_loss_clip": 1.05279386, "balance_loss_mlp": 1.01618052, "epoch": 0.516803943966813, "flos": 26646076454400.0, "grad_norm": 2.10310171739443, "language_loss": 0.76911175, "learning_rate": 1.9883154917605556e-06, "loss": 0.79099429, "num_input_tokens_seen": 92774460, "step": 4298, "time_per_iteration": 2.5876362323760986 }, { "auxiliary_loss_clip": 0.01178971, "auxiliary_loss_mlp": 0.01022804, "balance_loss_clip": 1.05338526, "balance_loss_mlp": 1.01564538, "epoch": 0.516924186857452, "flos": 19682854542720.0, "grad_norm": 1.9547209466611781, "language_loss": 0.83364004, "learning_rate": 1.9875365343132262e-06, "loss": 0.85565782, "num_input_tokens_seen": 92791580, "step": 4299, "time_per_iteration": 2.420444965362549 }, { "auxiliary_loss_clip": 0.01165741, "auxiliary_loss_mlp": 0.00762273, "balance_loss_clip": 1.05377007, "balance_loss_mlp": 1.00012457, "epoch": 0.5170444297480912, "flos": 15956583586560.0, "grad_norm": 3.028424899236101, "language_loss": 0.84598947, "learning_rate": 1.9867575787565946e-06, "loss": 0.86526966, "num_input_tokens_seen": 92806240, "step": 4300, "time_per_iteration": 2.4310429096221924 }, { "auxiliary_loss_clip": 0.01167281, "auxiliary_loss_mlp": 0.01025512, "balance_loss_clip": 1.05279255, "balance_loss_mlp": 1.01728082, "epoch": 0.5171646726387302, "flos": 14174157968640.0, "grad_norm": 2.4605083133120766, "language_loss": 0.86404359, "learning_rate": 1.9859786252088275e-06, "loss": 0.88597155, "num_input_tokens_seen": 92823420, "step": 4301, "time_per_iteration": 2.463771104812622 }, { "auxiliary_loss_clip": 0.01141744, "auxiliary_loss_mlp": 0.01031076, "balance_loss_clip": 1.05039752, "balance_loss_mlp": 1.02247512, "epoch": 0.5172849155293693, "flos": 23578550974080.0, "grad_norm": 2.897042630731926, "language_loss": 0.66240561, "learning_rate": 1.9851996737880914e-06, "loss": 0.68413377, "num_input_tokens_seen": 92838605, "step": 4302, "time_per_iteration": 2.521503448486328 }, { "auxiliary_loss_clip": 0.01171635, "auxiliary_loss_mlp": 0.01031522, "balance_loss_clip": 1.05365467, "balance_loss_mlp": 1.02348089, "epoch": 0.5174051584200084, "flos": 14283541860480.0, "grad_norm": 2.123928898927342, "language_loss": 0.74533945, "learning_rate": 1.9844207246125537e-06, "loss": 0.767371, "num_input_tokens_seen": 92855185, "step": 4303, "time_per_iteration": 2.484696865081787 }, { "auxiliary_loss_clip": 0.01148577, "auxiliary_loss_mlp": 0.01024082, "balance_loss_clip": 1.04949856, "balance_loss_mlp": 1.01698649, "epoch": 0.5175254013106475, "flos": 37889384192640.0, "grad_norm": 1.7874384254251288, "language_loss": 0.68360448, "learning_rate": 1.983641777800379e-06, "loss": 0.70533103, "num_input_tokens_seen": 92877830, "step": 4304, "time_per_iteration": 2.6318607330322266 }, { "auxiliary_loss_clip": 0.01062865, "auxiliary_loss_mlp": 0.01005569, "balance_loss_clip": 1.01982236, "balance_loss_mlp": 1.00444841, "epoch": 0.5176456442012866, "flos": 68549737829760.0, "grad_norm": 0.8793157424073644, "language_loss": 0.58793604, "learning_rate": 1.9828628334697343e-06, "loss": 0.60862041, "num_input_tokens_seen": 92945040, "step": 4305, "time_per_iteration": 3.240938901901245 }, { "auxiliary_loss_clip": 0.01065804, "auxiliary_loss_mlp": 0.01004215, "balance_loss_clip": 1.0218854, "balance_loss_mlp": 1.00299871, "epoch": 0.5177658870919257, "flos": 64084137235200.0, "grad_norm": 0.7698451296104786, "language_loss": 0.54749763, "learning_rate": 1.982083891738784e-06, "loss": 0.56819779, "num_input_tokens_seen": 93005910, "step": 4306, "time_per_iteration": 3.0985476970672607 }, { "auxiliary_loss_clip": 0.01147166, "auxiliary_loss_mlp": 0.01026133, "balance_loss_clip": 1.05288994, "balance_loss_mlp": 1.01856327, "epoch": 0.5178861299825648, "flos": 26651248012800.0, "grad_norm": 1.5003269605184575, "language_loss": 0.82572871, "learning_rate": 1.9813049527256923e-06, "loss": 0.8474617, "num_input_tokens_seen": 93026305, "step": 4307, "time_per_iteration": 2.54730224609375 }, { "auxiliary_loss_clip": 0.01135141, "auxiliary_loss_mlp": 0.01028255, "balance_loss_clip": 1.04671049, "balance_loss_mlp": 1.02073932, "epoch": 0.5180063728732038, "flos": 17931886260480.0, "grad_norm": 2.4991705447756334, "language_loss": 0.81758183, "learning_rate": 1.9805260165486252e-06, "loss": 0.83921587, "num_input_tokens_seen": 93045675, "step": 4308, "time_per_iteration": 2.5303757190704346 }, { "auxiliary_loss_clip": 0.01165803, "auxiliary_loss_mlp": 0.01022941, "balance_loss_clip": 1.05437827, "balance_loss_mlp": 1.01526403, "epoch": 0.518126615763843, "flos": 19500895221120.0, "grad_norm": 2.2166133050093895, "language_loss": 0.86462986, "learning_rate": 1.9797470833257457e-06, "loss": 0.88651729, "num_input_tokens_seen": 93065375, "step": 4309, "time_per_iteration": 3.323747158050537 }, { "auxiliary_loss_clip": 0.01167648, "auxiliary_loss_mlp": 0.01027318, "balance_loss_clip": 1.05438471, "balance_loss_mlp": 1.01899672, "epoch": 0.5182468586544821, "flos": 20704082117760.0, "grad_norm": 2.261365916362636, "language_loss": 0.77514726, "learning_rate": 1.9789681531752177e-06, "loss": 0.79709697, "num_input_tokens_seen": 93085595, "step": 4310, "time_per_iteration": 2.465726613998413 }, { "auxiliary_loss_clip": 0.01119335, "auxiliary_loss_mlp": 0.01025563, "balance_loss_clip": 1.04865932, "balance_loss_mlp": 1.01830888, "epoch": 0.5183671015451211, "flos": 23112107936640.0, "grad_norm": 1.5614632562759347, "language_loss": 0.72529924, "learning_rate": 1.978189226215204e-06, "loss": 0.74674821, "num_input_tokens_seen": 93106140, "step": 4311, "time_per_iteration": 2.572096586227417 }, { "auxiliary_loss_clip": 0.01179527, "auxiliary_loss_mlp": 0.01027023, "balance_loss_clip": 1.0537281, "balance_loss_mlp": 1.01897609, "epoch": 0.5184873444357603, "flos": 17597090568960.0, "grad_norm": 2.070529119022739, "language_loss": 0.76982635, "learning_rate": 1.9774103025638675e-06, "loss": 0.79189187, "num_input_tokens_seen": 93124265, "step": 4312, "time_per_iteration": 2.4041223526000977 }, { "auxiliary_loss_clip": 0.01127932, "auxiliary_loss_mlp": 0.01022869, "balance_loss_clip": 1.05382943, "balance_loss_mlp": 1.01555252, "epoch": 0.5186075873263993, "flos": 24936800883840.0, "grad_norm": 1.5604120131675898, "language_loss": 0.76241541, "learning_rate": 1.9766313823393696e-06, "loss": 0.78392339, "num_input_tokens_seen": 93145130, "step": 4313, "time_per_iteration": 2.588914632797241 }, { "auxiliary_loss_clip": 0.01117137, "auxiliary_loss_mlp": 0.01028235, "balance_loss_clip": 1.04318452, "balance_loss_mlp": 1.02031946, "epoch": 0.5187278302170384, "flos": 15190106244480.0, "grad_norm": 2.02884406468073, "language_loss": 0.68983698, "learning_rate": 1.975852465659873e-06, "loss": 0.71129072, "num_input_tokens_seen": 93161110, "step": 4314, "time_per_iteration": 3.3884284496307373 }, { "auxiliary_loss_clip": 0.01169039, "auxiliary_loss_mlp": 0.01030757, "balance_loss_clip": 1.05429661, "balance_loss_mlp": 1.02309155, "epoch": 0.5188480731076776, "flos": 25009412227200.0, "grad_norm": 2.0468833794700854, "language_loss": 0.69994009, "learning_rate": 1.9750735526435377e-06, "loss": 0.72193801, "num_input_tokens_seen": 93178055, "step": 4315, "time_per_iteration": 3.3257477283477783 }, { "auxiliary_loss_clip": 0.01153357, "auxiliary_loss_mlp": 0.01025859, "balance_loss_clip": 1.05405092, "balance_loss_mlp": 1.01817012, "epoch": 0.5189683159983166, "flos": 24790141653120.0, "grad_norm": 2.150479590116937, "language_loss": 0.78674585, "learning_rate": 1.974294643408525e-06, "loss": 0.80853796, "num_input_tokens_seen": 93195850, "step": 4316, "time_per_iteration": 3.349114418029785 }, { "auxiliary_loss_clip": 0.01167859, "auxiliary_loss_mlp": 0.01029102, "balance_loss_clip": 1.05047369, "balance_loss_mlp": 1.02165174, "epoch": 0.5190885588889557, "flos": 24754266944640.0, "grad_norm": 4.668884775065209, "language_loss": 0.67125702, "learning_rate": 1.9735157380729947e-06, "loss": 0.69322664, "num_input_tokens_seen": 93216260, "step": 4317, "time_per_iteration": 2.485344409942627 }, { "auxiliary_loss_clip": 0.01153076, "auxiliary_loss_mlp": 0.01022755, "balance_loss_clip": 1.04970074, "balance_loss_mlp": 1.01567125, "epoch": 0.5192088017795948, "flos": 24712646060160.0, "grad_norm": 1.8294509732764275, "language_loss": 0.84534085, "learning_rate": 1.9727368367551053e-06, "loss": 0.86709917, "num_input_tokens_seen": 93234810, "step": 4318, "time_per_iteration": 2.523404121398926 }, { "auxiliary_loss_clip": 0.01138333, "auxiliary_loss_mlp": 0.01025726, "balance_loss_clip": 1.04777312, "balance_loss_mlp": 1.01807272, "epoch": 0.5193290446702339, "flos": 27229588894080.0, "grad_norm": 1.9492075566846674, "language_loss": 0.68504083, "learning_rate": 1.9719579395730164e-06, "loss": 0.70668137, "num_input_tokens_seen": 93254185, "step": 4319, "time_per_iteration": 2.528857946395874 }, { "auxiliary_loss_clip": 0.01181693, "auxiliary_loss_mlp": 0.01023542, "balance_loss_clip": 1.05628467, "balance_loss_mlp": 1.01593351, "epoch": 0.5194492875608729, "flos": 11473352392320.0, "grad_norm": 2.370936817676572, "language_loss": 0.93126476, "learning_rate": 1.9711790466448854e-06, "loss": 0.95331705, "num_input_tokens_seen": 93268205, "step": 4320, "time_per_iteration": 2.37650990486145 }, { "auxiliary_loss_clip": 0.01128511, "auxiliary_loss_mlp": 0.01033846, "balance_loss_clip": 1.04931116, "balance_loss_mlp": 1.02584112, "epoch": 0.5195695304515121, "flos": 20338906498560.0, "grad_norm": 2.0874242588894107, "language_loss": 0.71438462, "learning_rate": 1.9704001580888704e-06, "loss": 0.73600817, "num_input_tokens_seen": 93286945, "step": 4321, "time_per_iteration": 2.5591723918914795 }, { "auxiliary_loss_clip": 0.0114406, "auxiliary_loss_mlp": 0.00762592, "balance_loss_clip": 1.04641712, "balance_loss_mlp": 1.00028682, "epoch": 0.5196897733421512, "flos": 20048317470720.0, "grad_norm": 3.0046490005712054, "language_loss": 0.8701604, "learning_rate": 1.9696212740231283e-06, "loss": 0.88922691, "num_input_tokens_seen": 93305595, "step": 4322, "time_per_iteration": 2.4885165691375732 }, { "auxiliary_loss_clip": 0.01172126, "auxiliary_loss_mlp": 0.01024591, "balance_loss_clip": 1.0511024, "balance_loss_mlp": 1.01629353, "epoch": 0.5198100162327902, "flos": 23805507058560.0, "grad_norm": 2.139084722899328, "language_loss": 0.82582247, "learning_rate": 1.9688423945658146e-06, "loss": 0.84778965, "num_input_tokens_seen": 93326460, "step": 4323, "time_per_iteration": 2.487976312637329 }, { "auxiliary_loss_clip": 0.01113229, "auxiliary_loss_mlp": 0.0102657, "balance_loss_clip": 1.0415988, "balance_loss_mlp": 1.01821399, "epoch": 0.5199302591234293, "flos": 24023951619840.0, "grad_norm": 1.985345452166628, "language_loss": 0.7154538, "learning_rate": 1.9680635198350845e-06, "loss": 0.73685181, "num_input_tokens_seen": 93346170, "step": 4324, "time_per_iteration": 2.576594829559326 }, { "auxiliary_loss_clip": 0.01165487, "auxiliary_loss_mlp": 0.01032973, "balance_loss_clip": 1.04954374, "balance_loss_mlp": 1.02458119, "epoch": 0.5200505020140684, "flos": 26359366095360.0, "grad_norm": 2.012401828540083, "language_loss": 0.72556275, "learning_rate": 1.967284649949093e-06, "loss": 0.74754739, "num_input_tokens_seen": 93365380, "step": 4325, "time_per_iteration": 2.512523651123047 }, { "auxiliary_loss_clip": 0.01135533, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.04761183, "balance_loss_mlp": 1.02251267, "epoch": 0.5201707449047075, "flos": 39604262284800.0, "grad_norm": 1.7634997804006274, "language_loss": 0.72408974, "learning_rate": 1.966505785025994e-06, "loss": 0.74575114, "num_input_tokens_seen": 93387285, "step": 4326, "time_per_iteration": 2.67545485496521 }, { "auxiliary_loss_clip": 0.0113476, "auxiliary_loss_mlp": 0.01027572, "balance_loss_clip": 1.04808664, "balance_loss_mlp": 1.01947165, "epoch": 0.5202909877953465, "flos": 53682788292480.0, "grad_norm": 1.791571356702319, "language_loss": 0.76148891, "learning_rate": 1.965726925183941e-06, "loss": 0.78311223, "num_input_tokens_seen": 93410390, "step": 4327, "time_per_iteration": 2.8161420822143555 }, { "auxiliary_loss_clip": 0.0118042, "auxiliary_loss_mlp": 0.01022503, "balance_loss_clip": 1.05524254, "balance_loss_mlp": 1.01525545, "epoch": 0.5204112306859857, "flos": 19537021324800.0, "grad_norm": 1.7586306918358665, "language_loss": 0.84700787, "learning_rate": 1.964948070541087e-06, "loss": 0.86903709, "num_input_tokens_seen": 93429050, "step": 4328, "time_per_iteration": 2.4287045001983643 }, { "auxiliary_loss_clip": 0.01153016, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.04685843, "balance_loss_mlp": 1.02280903, "epoch": 0.5205314735766248, "flos": 15304697608320.0, "grad_norm": 2.4499090544097184, "language_loss": 0.69810385, "learning_rate": 1.9641692212155816e-06, "loss": 0.71994138, "num_input_tokens_seen": 93446815, "step": 4329, "time_per_iteration": 2.451185464859009 }, { "auxiliary_loss_clip": 0.01123511, "auxiliary_loss_mlp": 0.01031179, "balance_loss_clip": 1.05195141, "balance_loss_mlp": 1.02306128, "epoch": 0.5206517164672638, "flos": 59263701160320.0, "grad_norm": 1.9996389057800341, "language_loss": 0.72494018, "learning_rate": 1.9633903773255777e-06, "loss": 0.74648714, "num_input_tokens_seen": 93469130, "step": 4330, "time_per_iteration": 2.8791377544403076 }, { "auxiliary_loss_clip": 0.01176565, "auxiliary_loss_mlp": 0.0102619, "balance_loss_clip": 1.05012131, "balance_loss_mlp": 1.01829851, "epoch": 0.520771959357903, "flos": 26871129118080.0, "grad_norm": 1.7086111102781905, "language_loss": 0.7495327, "learning_rate": 1.9626115389892237e-06, "loss": 0.77156025, "num_input_tokens_seen": 93489920, "step": 4331, "time_per_iteration": 2.498196601867676 }, { "auxiliary_loss_clip": 0.01143066, "auxiliary_loss_mlp": 0.01024783, "balance_loss_clip": 1.04956031, "balance_loss_mlp": 1.01695704, "epoch": 0.520892202248542, "flos": 26907075653760.0, "grad_norm": 2.7606361747193757, "language_loss": 0.85726553, "learning_rate": 1.96183270632467e-06, "loss": 0.87894404, "num_input_tokens_seen": 93509770, "step": 4332, "time_per_iteration": 2.6455507278442383 }, { "auxiliary_loss_clip": 0.0112943, "auxiliary_loss_mlp": 0.00762793, "balance_loss_clip": 1.04587126, "balance_loss_mlp": 1.0001626, "epoch": 0.5210124451391811, "flos": 25849434666240.0, "grad_norm": 1.7626853461122032, "language_loss": 0.79265553, "learning_rate": 1.9610538794500644e-06, "loss": 0.81157768, "num_input_tokens_seen": 93529320, "step": 4333, "time_per_iteration": 2.5401036739349365 }, { "auxiliary_loss_clip": 0.01051956, "auxiliary_loss_mlp": 0.01003295, "balance_loss_clip": 1.01970232, "balance_loss_mlp": 1.00224614, "epoch": 0.5211326880298203, "flos": 70553804319360.0, "grad_norm": 0.7727918858617991, "language_loss": 0.59494221, "learning_rate": 1.9602750584835542e-06, "loss": 0.61549473, "num_input_tokens_seen": 93595255, "step": 4334, "time_per_iteration": 3.2483842372894287 }, { "auxiliary_loss_clip": 0.01147472, "auxiliary_loss_mlp": 0.0102291, "balance_loss_clip": 1.04837263, "balance_loss_mlp": 1.01527476, "epoch": 0.5212529309204593, "flos": 15628898787840.0, "grad_norm": 1.9299286669331854, "language_loss": 0.82301176, "learning_rate": 1.959496243543286e-06, "loss": 0.84471554, "num_input_tokens_seen": 93613135, "step": 4335, "time_per_iteration": 3.2852776050567627 }, { "auxiliary_loss_clip": 0.01169145, "auxiliary_loss_mlp": 0.01036039, "balance_loss_clip": 1.05653191, "balance_loss_mlp": 1.02782249, "epoch": 0.5213731738110984, "flos": 26242655829120.0, "grad_norm": 2.013150200395761, "language_loss": 0.79077005, "learning_rate": 1.9587174347474057e-06, "loss": 0.81282187, "num_input_tokens_seen": 93629645, "step": 4336, "time_per_iteration": 2.4735939502716064 }, { "auxiliary_loss_clip": 0.011091, "auxiliary_loss_mlp": 0.01029109, "balance_loss_clip": 1.04402423, "balance_loss_mlp": 1.02120566, "epoch": 0.5214934167017375, "flos": 19418407637760.0, "grad_norm": 2.116526222975386, "language_loss": 0.82434964, "learning_rate": 1.9579386322140574e-06, "loss": 0.84573174, "num_input_tokens_seen": 93645325, "step": 4337, "time_per_iteration": 2.5194547176361084 }, { "auxiliary_loss_clip": 0.0118239, "auxiliary_loss_mlp": 0.00762601, "balance_loss_clip": 1.05463576, "balance_loss_mlp": 1.00025547, "epoch": 0.5216136595923766, "flos": 30955788023040.0, "grad_norm": 1.7550947998859048, "language_loss": 0.80700946, "learning_rate": 1.9571598360613854e-06, "loss": 0.82645935, "num_input_tokens_seen": 93668200, "step": 4338, "time_per_iteration": 2.488835334777832 }, { "auxiliary_loss_clip": 0.01134789, "auxiliary_loss_mlp": 0.01026559, "balance_loss_clip": 1.04458201, "balance_loss_mlp": 1.01879263, "epoch": 0.5217339024830157, "flos": 21945047143680.0, "grad_norm": 2.0776521547677693, "language_loss": 0.6973902, "learning_rate": 1.956381046407532e-06, "loss": 0.71900368, "num_input_tokens_seen": 93688495, "step": 4339, "time_per_iteration": 2.475827932357788 }, { "auxiliary_loss_clip": 0.01132451, "auxiliary_loss_mlp": 0.01031366, "balance_loss_clip": 1.04680729, "balance_loss_mlp": 1.02343869, "epoch": 0.5218541453736548, "flos": 20923209037440.0, "grad_norm": 1.7810824338663827, "language_loss": 0.86097682, "learning_rate": 1.9556022633706394e-06, "loss": 0.88261497, "num_input_tokens_seen": 93707285, "step": 4340, "time_per_iteration": 3.3773446083068848 }, { "auxiliary_loss_clip": 0.01143484, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.04903436, "balance_loss_mlp": 1.02143347, "epoch": 0.5219743882642939, "flos": 23951663498880.0, "grad_norm": 1.6106960482397852, "language_loss": 0.79703474, "learning_rate": 1.954823487068848e-06, "loss": 0.8187623, "num_input_tokens_seen": 93727495, "step": 4341, "time_per_iteration": 3.3566136360168457 }, { "auxiliary_loss_clip": 0.01165518, "auxiliary_loss_mlp": 0.01028864, "balance_loss_clip": 1.05290639, "balance_loss_mlp": 1.02135944, "epoch": 0.5220946311549329, "flos": 28799280213120.0, "grad_norm": 1.672869638190347, "language_loss": 0.80807132, "learning_rate": 1.9540447176202976e-06, "loss": 0.83001512, "num_input_tokens_seen": 93748740, "step": 4342, "time_per_iteration": 2.521245241165161 }, { "auxiliary_loss_clip": 0.01067541, "auxiliary_loss_mlp": 0.01001622, "balance_loss_clip": 1.01848412, "balance_loss_mlp": 1.00064456, "epoch": 0.5222148740455721, "flos": 67189369017600.0, "grad_norm": 0.8879401036442628, "language_loss": 0.60693657, "learning_rate": 1.9532659551431272e-06, "loss": 0.62762821, "num_input_tokens_seen": 93815770, "step": 4343, "time_per_iteration": 3.2200825214385986 }, { "auxiliary_loss_clip": 0.01166344, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.05193317, "balance_loss_mlp": 1.01924598, "epoch": 0.5223351169362112, "flos": 61856164339200.0, "grad_norm": 1.5510103453136361, "language_loss": 0.67752343, "learning_rate": 1.9524871997554744e-06, "loss": 0.69945323, "num_input_tokens_seen": 93843530, "step": 4344, "time_per_iteration": 3.5065371990203857 }, { "auxiliary_loss_clip": 0.01165432, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 1.05167866, "balance_loss_mlp": 1.0191927, "epoch": 0.5224553598268502, "flos": 14647388676480.0, "grad_norm": 5.207205442187628, "language_loss": 0.81162918, "learning_rate": 1.951708451575475e-06, "loss": 0.8335526, "num_input_tokens_seen": 93860595, "step": 4345, "time_per_iteration": 2.4215919971466064 }, { "auxiliary_loss_clip": 0.01143969, "auxiliary_loss_mlp": 0.01030587, "balance_loss_clip": 1.04691207, "balance_loss_mlp": 1.02298737, "epoch": 0.5225756027174894, "flos": 14826043946880.0, "grad_norm": 2.2067932277279616, "language_loss": 0.82118744, "learning_rate": 1.9509297107212657e-06, "loss": 0.84293306, "num_input_tokens_seen": 93877365, "step": 4346, "time_per_iteration": 2.481962203979492 }, { "auxiliary_loss_clip": 0.01175944, "auxiliary_loss_mlp": 0.01025648, "balance_loss_clip": 1.05192351, "balance_loss_mlp": 1.01818001, "epoch": 0.5226958456081284, "flos": 23512009029120.0, "grad_norm": 1.7004833271668003, "language_loss": 0.79269499, "learning_rate": 1.95015097731098e-06, "loss": 0.81471092, "num_input_tokens_seen": 93896855, "step": 4347, "time_per_iteration": 2.435988664627075 }, { "auxiliary_loss_clip": 0.01177566, "auxiliary_loss_mlp": 0.01025499, "balance_loss_clip": 1.05310202, "balance_loss_mlp": 1.0179503, "epoch": 0.5228160884987675, "flos": 19062928690560.0, "grad_norm": 4.298942695852673, "language_loss": 0.81880325, "learning_rate": 1.949372251462751e-06, "loss": 0.8408339, "num_input_tokens_seen": 93914270, "step": 4348, "time_per_iteration": 2.410475492477417 }, { "auxiliary_loss_clip": 0.01136318, "auxiliary_loss_mlp": 0.00761949, "balance_loss_clip": 1.04916906, "balance_loss_mlp": 1.00028086, "epoch": 0.5229363313894067, "flos": 21063224252160.0, "grad_norm": 1.9589558864326888, "language_loss": 0.82839519, "learning_rate": 1.9485935332947124e-06, "loss": 0.84737784, "num_input_tokens_seen": 93932180, "step": 4349, "time_per_iteration": 2.55025577545166 }, { "auxiliary_loss_clip": 0.01143372, "auxiliary_loss_mlp": 0.01023202, "balance_loss_clip": 1.04954123, "balance_loss_mlp": 1.01607358, "epoch": 0.5230565742800457, "flos": 14830389492480.0, "grad_norm": 2.524751867110256, "language_loss": 0.84168446, "learning_rate": 1.947814822924993e-06, "loss": 0.86335015, "num_input_tokens_seen": 93949690, "step": 4350, "time_per_iteration": 2.4523684978485107 }, { "auxiliary_loss_clip": 0.01177895, "auxiliary_loss_mlp": 0.01030901, "balance_loss_clip": 1.05479002, "balance_loss_mlp": 1.02355731, "epoch": 0.5231768171706848, "flos": 25813021253760.0, "grad_norm": 1.8300090896213734, "language_loss": 0.82848084, "learning_rate": 1.9470361204717236e-06, "loss": 0.85056877, "num_input_tokens_seen": 93968830, "step": 4351, "time_per_iteration": 2.571258068084717 }, { "auxiliary_loss_clip": 0.01139389, "auxiliary_loss_mlp": 0.00762291, "balance_loss_clip": 1.0495981, "balance_loss_mlp": 1.00025225, "epoch": 0.5232970600613239, "flos": 22743807834240.0, "grad_norm": 1.693903220217893, "language_loss": 0.80695319, "learning_rate": 1.9462574260530326e-06, "loss": 0.82596999, "num_input_tokens_seen": 93989110, "step": 4352, "time_per_iteration": 2.5304713249206543 }, { "auxiliary_loss_clip": 0.01154177, "auxiliary_loss_mlp": 0.0102684, "balance_loss_clip": 1.04898679, "balance_loss_mlp": 1.01900768, "epoch": 0.523417302951963, "flos": 17310703432320.0, "grad_norm": 1.7964250712837848, "language_loss": 0.80760342, "learning_rate": 1.9454787397870472e-06, "loss": 0.82941359, "num_input_tokens_seen": 94006430, "step": 4353, "time_per_iteration": 2.4470584392547607 }, { "auxiliary_loss_clip": 0.01103339, "auxiliary_loss_mlp": 0.01028481, "balance_loss_clip": 1.04832923, "balance_loss_mlp": 1.02044952, "epoch": 0.523537545842602, "flos": 18551740285440.0, "grad_norm": 1.770964479083237, "language_loss": 0.71991873, "learning_rate": 1.944700061791894e-06, "loss": 0.74123693, "num_input_tokens_seen": 94024825, "step": 4354, "time_per_iteration": 2.6146578788757324 }, { "auxiliary_loss_clip": 0.01162298, "auxiliary_loss_mlp": 0.01030004, "balance_loss_clip": 1.05246806, "balance_loss_mlp": 1.02224326, "epoch": 0.5236577887332411, "flos": 19719267955200.0, "grad_norm": 2.612381994926738, "language_loss": 0.65407026, "learning_rate": 1.943921392185698e-06, "loss": 0.67599326, "num_input_tokens_seen": 94043450, "step": 4355, "time_per_iteration": 2.5120255947113037 }, { "auxiliary_loss_clip": 0.01152573, "auxiliary_loss_mlp": 0.01026797, "balance_loss_clip": 1.0486176, "balance_loss_mlp": 1.01908731, "epoch": 0.5237780316238803, "flos": 23550218121600.0, "grad_norm": 2.1571417274222675, "language_loss": 0.77540857, "learning_rate": 1.9431427310865814e-06, "loss": 0.79720229, "num_input_tokens_seen": 94063055, "step": 4356, "time_per_iteration": 2.5682945251464844 }, { "auxiliary_loss_clip": 0.01123109, "auxiliary_loss_mlp": 0.0102904, "balance_loss_clip": 1.04862618, "balance_loss_mlp": 1.02081442, "epoch": 0.5238982745145193, "flos": 22491894775680.0, "grad_norm": 1.6814132975443963, "language_loss": 0.78262562, "learning_rate": 1.942364078612667e-06, "loss": 0.80414712, "num_input_tokens_seen": 94081785, "step": 4357, "time_per_iteration": 2.5321431159973145 }, { "auxiliary_loss_clip": 0.0114305, "auxiliary_loss_mlp": 0.01024575, "balance_loss_clip": 1.04884946, "balance_loss_mlp": 1.01734471, "epoch": 0.5240185174051584, "flos": 27088927234560.0, "grad_norm": 2.023984857391796, "language_loss": 0.75546193, "learning_rate": 1.9415854348820765e-06, "loss": 0.77713811, "num_input_tokens_seen": 94101635, "step": 4358, "time_per_iteration": 2.5493717193603516 }, { "auxiliary_loss_clip": 0.01168851, "auxiliary_loss_mlp": 0.01027763, "balance_loss_clip": 1.05036139, "balance_loss_mlp": 1.01940095, "epoch": 0.5241387602957975, "flos": 22674680110080.0, "grad_norm": 2.0470814232962895, "language_loss": 0.67465502, "learning_rate": 1.940806800012929e-06, "loss": 0.69662118, "num_input_tokens_seen": 94121705, "step": 4359, "time_per_iteration": 2.460353374481201 }, { "auxiliary_loss_clip": 0.01117597, "auxiliary_loss_mlp": 0.00762406, "balance_loss_clip": 1.04751372, "balance_loss_mlp": 1.00024033, "epoch": 0.5242590031864366, "flos": 40553453134080.0, "grad_norm": 1.6004649444778154, "language_loss": 0.63456345, "learning_rate": 1.9400281741233432e-06, "loss": 0.65336347, "num_input_tokens_seen": 94146595, "step": 4360, "time_per_iteration": 2.725044012069702 }, { "auxiliary_loss_clip": 0.01039033, "auxiliary_loss_mlp": 0.01008526, "balance_loss_clip": 1.01704049, "balance_loss_mlp": 1.00757241, "epoch": 0.5243792460770756, "flos": 66676313105280.0, "grad_norm": 0.6555360742272462, "language_loss": 0.52552974, "learning_rate": 1.939249557331435e-06, "loss": 0.54600537, "num_input_tokens_seen": 94212410, "step": 4361, "time_per_iteration": 3.1416094303131104 }, { "auxiliary_loss_clip": 0.01144121, "auxiliary_loss_mlp": 0.01029342, "balance_loss_clip": 1.05019045, "balance_loss_mlp": 1.02224588, "epoch": 0.5244994889677148, "flos": 28183663992960.0, "grad_norm": 1.913655570960772, "language_loss": 0.72607535, "learning_rate": 1.938470949755321e-06, "loss": 0.74781001, "num_input_tokens_seen": 94232290, "step": 4362, "time_per_iteration": 3.411875009536743 }, { "auxiliary_loss_clip": 0.01047553, "auxiliary_loss_mlp": 0.01006545, "balance_loss_clip": 1.01656938, "balance_loss_mlp": 1.00555539, "epoch": 0.5246197318583539, "flos": 65950379239680.0, "grad_norm": 0.81850348095091, "language_loss": 0.55699503, "learning_rate": 1.937692351513115e-06, "loss": 0.57753599, "num_input_tokens_seen": 94291285, "step": 4363, "time_per_iteration": 3.0562753677368164 }, { "auxiliary_loss_clip": 0.01168707, "auxiliary_loss_mlp": 0.01026146, "balance_loss_clip": 1.05165184, "balance_loss_mlp": 1.0186832, "epoch": 0.5247399747489929, "flos": 21033490769280.0, "grad_norm": 1.541235835049601, "language_loss": 0.80282021, "learning_rate": 1.9369137627229297e-06, "loss": 0.82476878, "num_input_tokens_seen": 94309685, "step": 4364, "time_per_iteration": 2.455899477005005 }, { "auxiliary_loss_clip": 0.01163184, "auxiliary_loss_mlp": 0.0102686, "balance_loss_clip": 1.05228126, "balance_loss_mlp": 1.0191952, "epoch": 0.5248602176396321, "flos": 19025940660480.0, "grad_norm": 2.280632731269297, "language_loss": 0.88028109, "learning_rate": 1.936135183502877e-06, "loss": 0.90218151, "num_input_tokens_seen": 94326985, "step": 4365, "time_per_iteration": 2.469602108001709 }, { "auxiliary_loss_clip": 0.01139628, "auxiliary_loss_mlp": 0.01024871, "balance_loss_clip": 1.04844642, "balance_loss_mlp": 1.01735473, "epoch": 0.5249804605302711, "flos": 22200084685440.0, "grad_norm": 2.1876929504225076, "language_loss": 0.79830396, "learning_rate": 1.935356613971066e-06, "loss": 0.81994891, "num_input_tokens_seen": 94347645, "step": 4366, "time_per_iteration": 3.388249397277832 }, { "auxiliary_loss_clip": 0.01146819, "auxiliary_loss_mlp": 0.00761961, "balance_loss_clip": 1.04911792, "balance_loss_mlp": 1.0002172, "epoch": 0.5251007034209102, "flos": 23805686626560.0, "grad_norm": 1.7929640130798927, "language_loss": 0.76504159, "learning_rate": 1.9345780542456047e-06, "loss": 0.78412938, "num_input_tokens_seen": 94367020, "step": 4367, "time_per_iteration": 2.51507568359375 }, { "auxiliary_loss_clip": 0.0115283, "auxiliary_loss_mlp": 0.01028503, "balance_loss_clip": 1.04873621, "balance_loss_mlp": 1.02101707, "epoch": 0.5252209463115494, "flos": 23294605962240.0, "grad_norm": 2.4416101440400015, "language_loss": 0.71907568, "learning_rate": 1.9337995044446007e-06, "loss": 0.74088907, "num_input_tokens_seen": 94385860, "step": 4368, "time_per_iteration": 3.278944492340088 }, { "auxiliary_loss_clip": 0.01168199, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.05153382, "balance_loss_mlp": 1.0203433, "epoch": 0.5253411892021884, "flos": 19828687760640.0, "grad_norm": 2.044737898576299, "language_loss": 0.79873681, "learning_rate": 1.9330209646861596e-06, "loss": 0.82070065, "num_input_tokens_seen": 94405010, "step": 4369, "time_per_iteration": 2.4421024322509766 }, { "auxiliary_loss_clip": 0.0114528, "auxiliary_loss_mlp": 0.01029998, "balance_loss_clip": 1.04863429, "balance_loss_mlp": 1.02292848, "epoch": 0.5254614320928275, "flos": 24133730561280.0, "grad_norm": 1.9552026212971447, "language_loss": 0.77821803, "learning_rate": 1.9322424350883843e-06, "loss": 0.79997075, "num_input_tokens_seen": 94426845, "step": 4370, "time_per_iteration": 2.5154197216033936 }, { "auxiliary_loss_clip": 0.01150655, "auxiliary_loss_mlp": 0.01028791, "balance_loss_clip": 1.04898167, "balance_loss_mlp": 1.02157307, "epoch": 0.5255816749834666, "flos": 24644954880000.0, "grad_norm": 2.5500733863189264, "language_loss": 0.78570747, "learning_rate": 1.931463915769379e-06, "loss": 0.80750191, "num_input_tokens_seen": 94446960, "step": 4371, "time_per_iteration": 3.2991833686828613 }, { "auxiliary_loss_clip": 0.01118164, "auxiliary_loss_mlp": 0.01024478, "balance_loss_clip": 1.04520321, "balance_loss_mlp": 1.01684523, "epoch": 0.5257019178741057, "flos": 14136595320960.0, "grad_norm": 2.242927013176805, "language_loss": 0.74336219, "learning_rate": 1.930685406847242e-06, "loss": 0.76478863, "num_input_tokens_seen": 94461535, "step": 4372, "time_per_iteration": 2.5128729343414307 }, { "auxiliary_loss_clip": 0.01144863, "auxiliary_loss_mlp": 0.01027176, "balance_loss_clip": 1.04809499, "balance_loss_mlp": 1.01987433, "epoch": 0.5258221607647448, "flos": 23548961145600.0, "grad_norm": 1.4621954235803154, "language_loss": 0.81530154, "learning_rate": 1.9299069084400734e-06, "loss": 0.83702195, "num_input_tokens_seen": 94482395, "step": 4373, "time_per_iteration": 2.51153564453125 }, { "auxiliary_loss_clip": 0.01133198, "auxiliary_loss_mlp": 0.01026445, "balance_loss_clip": 1.05137515, "balance_loss_mlp": 1.01860726, "epoch": 0.5259424036553839, "flos": 24966103403520.0, "grad_norm": 1.9663180526975068, "language_loss": 0.69542682, "learning_rate": 1.9291284206659717e-06, "loss": 0.71702325, "num_input_tokens_seen": 94500580, "step": 4374, "time_per_iteration": 2.5352210998535156 }, { "auxiliary_loss_clip": 0.01179488, "auxiliary_loss_mlp": 0.01021644, "balance_loss_clip": 1.05401397, "balance_loss_mlp": 1.01370513, "epoch": 0.526062646546023, "flos": 28763908295040.0, "grad_norm": 2.2382752326694977, "language_loss": 0.7157954, "learning_rate": 1.928349943643032e-06, "loss": 0.73780674, "num_input_tokens_seen": 94519680, "step": 4375, "time_per_iteration": 2.4739067554473877 }, { "auxiliary_loss_clip": 0.01157843, "auxiliary_loss_mlp": 0.01028292, "balance_loss_clip": 1.05037689, "balance_loss_mlp": 1.02052212, "epoch": 0.526182889436662, "flos": 22821375254400.0, "grad_norm": 1.8576834313800896, "language_loss": 0.81786835, "learning_rate": 1.9275714774893493e-06, "loss": 0.83972967, "num_input_tokens_seen": 94539135, "step": 4376, "time_per_iteration": 2.4604790210723877 }, { "auxiliary_loss_clip": 0.01123334, "auxiliary_loss_mlp": 0.01028427, "balance_loss_clip": 1.04401994, "balance_loss_mlp": 1.02013004, "epoch": 0.5263031323273012, "flos": 22929466256640.0, "grad_norm": 2.182211073509432, "language_loss": 0.73040879, "learning_rate": 1.9267930223230154e-06, "loss": 0.75192642, "num_input_tokens_seen": 94557610, "step": 4377, "time_per_iteration": 2.5353589057922363 }, { "auxiliary_loss_clip": 0.01152057, "auxiliary_loss_mlp": 0.01027588, "balance_loss_clip": 1.05085623, "balance_loss_mlp": 1.0202837, "epoch": 0.5264233752179402, "flos": 17748634049280.0, "grad_norm": 2.4095937786058936, "language_loss": 0.779791, "learning_rate": 1.9260145782621224e-06, "loss": 0.80158746, "num_input_tokens_seen": 94575390, "step": 4378, "time_per_iteration": 2.4543614387512207 }, { "auxiliary_loss_clip": 0.01147088, "auxiliary_loss_mlp": 0.01026594, "balance_loss_clip": 1.05152142, "balance_loss_mlp": 1.01914334, "epoch": 0.5265436181085793, "flos": 24421626069120.0, "grad_norm": 1.947542888374729, "language_loss": 0.87916994, "learning_rate": 1.925236145424758e-06, "loss": 0.9009068, "num_input_tokens_seen": 94594210, "step": 4379, "time_per_iteration": 2.508543014526367 }, { "auxiliary_loss_clip": 0.0106866, "auxiliary_loss_mlp": 0.01002682, "balance_loss_clip": 1.01650429, "balance_loss_mlp": 1.00184727, "epoch": 0.5266638609992185, "flos": 69207298156800.0, "grad_norm": 0.697470827288729, "language_loss": 0.57586503, "learning_rate": 1.924457723929012e-06, "loss": 0.59657848, "num_input_tokens_seen": 94665020, "step": 4380, "time_per_iteration": 3.174663782119751 }, { "auxiliary_loss_clip": 0.01162604, "auxiliary_loss_mlp": 0.01024759, "balance_loss_clip": 1.05070293, "balance_loss_mlp": 1.01727295, "epoch": 0.5267841038898575, "flos": 20738699850240.0, "grad_norm": 2.304291087725399, "language_loss": 0.83023971, "learning_rate": 1.9236793138929685e-06, "loss": 0.85211337, "num_input_tokens_seen": 94684290, "step": 4381, "time_per_iteration": 2.45731258392334 }, { "auxiliary_loss_clip": 0.01167551, "auxiliary_loss_mlp": 0.01025076, "balance_loss_clip": 1.05054665, "balance_loss_mlp": 1.01741672, "epoch": 0.5269043467804966, "flos": 17234392988160.0, "grad_norm": 1.9429832661019535, "language_loss": 0.81147134, "learning_rate": 1.9229009154347133e-06, "loss": 0.83339763, "num_input_tokens_seen": 94701880, "step": 4382, "time_per_iteration": 2.4214773178100586 }, { "auxiliary_loss_clip": 0.01106013, "auxiliary_loss_mlp": 0.00761869, "balance_loss_clip": 1.04324234, "balance_loss_mlp": 1.00023746, "epoch": 0.5270245896711357, "flos": 18223157646720.0, "grad_norm": 2.029056836800655, "language_loss": 0.80564171, "learning_rate": 1.922122528672327e-06, "loss": 0.82432055, "num_input_tokens_seen": 94720545, "step": 4383, "time_per_iteration": 2.529179096221924 }, { "auxiliary_loss_clip": 0.0117295, "auxiliary_loss_mlp": 0.01023731, "balance_loss_clip": 1.05089211, "balance_loss_mlp": 1.01635194, "epoch": 0.5271448325617748, "flos": 21287558643840.0, "grad_norm": 2.386289761773235, "language_loss": 0.780711, "learning_rate": 1.9213441537238914e-06, "loss": 0.80267781, "num_input_tokens_seen": 94737420, "step": 4384, "time_per_iteration": 2.4164559841156006 }, { "auxiliary_loss_clip": 0.01030032, "auxiliary_loss_mlp": 0.01001423, "balance_loss_clip": 1.02040553, "balance_loss_mlp": 1.00047517, "epoch": 0.5272650754524139, "flos": 65495497403520.0, "grad_norm": 0.841077346623064, "language_loss": 0.57383412, "learning_rate": 1.920565790707485e-06, "loss": 0.5941487, "num_input_tokens_seen": 94802810, "step": 4385, "time_per_iteration": 3.2633578777313232 }, { "auxiliary_loss_clip": 0.01129808, "auxiliary_loss_mlp": 0.01027898, "balance_loss_clip": 1.04693341, "balance_loss_mlp": 1.0197506, "epoch": 0.527385318343053, "flos": 19676426008320.0, "grad_norm": 1.8964439843109433, "language_loss": 0.65567815, "learning_rate": 1.9197874397411853e-06, "loss": 0.67725527, "num_input_tokens_seen": 94819440, "step": 4386, "time_per_iteration": 2.5258615016937256 }, { "auxiliary_loss_clip": 0.01134758, "auxiliary_loss_mlp": 0.01033394, "balance_loss_clip": 1.04434276, "balance_loss_mlp": 1.02490032, "epoch": 0.5275055612336921, "flos": 12712018947840.0, "grad_norm": 3.2809255173526526, "language_loss": 0.66693175, "learning_rate": 1.919009100943067e-06, "loss": 0.6886133, "num_input_tokens_seen": 94835130, "step": 4387, "time_per_iteration": 2.471652030944824 }, { "auxiliary_loss_clip": 0.01132024, "auxiliary_loss_mlp": 0.010246, "balance_loss_clip": 1.0480566, "balance_loss_mlp": 1.01664257, "epoch": 0.5276258041243311, "flos": 17749029098880.0, "grad_norm": 1.8054861978937125, "language_loss": 0.65553266, "learning_rate": 1.9182307744312043e-06, "loss": 0.67709893, "num_input_tokens_seen": 94852235, "step": 4388, "time_per_iteration": 3.363699436187744 }, { "auxiliary_loss_clip": 0.01150688, "auxiliary_loss_mlp": 0.0103115, "balance_loss_clip": 1.04755569, "balance_loss_mlp": 1.02365184, "epoch": 0.5277460470149702, "flos": 22710447077760.0, "grad_norm": 2.4880963675929513, "language_loss": 0.76714778, "learning_rate": 1.9174524603236676e-06, "loss": 0.78896612, "num_input_tokens_seen": 94871185, "step": 4389, "time_per_iteration": 2.585955858230591 }, { "auxiliary_loss_clip": 0.01151704, "auxiliary_loss_mlp": 0.01025511, "balance_loss_clip": 1.05048561, "balance_loss_mlp": 1.01736891, "epoch": 0.5278662899056094, "flos": 19902699734400.0, "grad_norm": 2.0773213747718327, "language_loss": 0.75944352, "learning_rate": 1.916674158738527e-06, "loss": 0.78121567, "num_input_tokens_seen": 94890090, "step": 4390, "time_per_iteration": 2.494558572769165 }, { "auxiliary_loss_clip": 0.01130051, "auxiliary_loss_mlp": 0.00763286, "balance_loss_clip": 1.04871428, "balance_loss_mlp": 1.00020909, "epoch": 0.5279865327962484, "flos": 18005215875840.0, "grad_norm": 1.8242723600729802, "language_loss": 0.60296273, "learning_rate": 1.9158958697938506e-06, "loss": 0.62189615, "num_input_tokens_seen": 94908470, "step": 4391, "time_per_iteration": 2.5011210441589355 }, { "auxiliary_loss_clip": 0.01142681, "auxiliary_loss_mlp": 0.01028389, "balance_loss_clip": 1.0460099, "balance_loss_mlp": 1.0204463, "epoch": 0.5281067756868875, "flos": 15924443892480.0, "grad_norm": 2.660231527652515, "language_loss": 0.85814232, "learning_rate": 1.9151175936077032e-06, "loss": 0.87985301, "num_input_tokens_seen": 94923440, "step": 4392, "time_per_iteration": 3.3119781017303467 }, { "auxiliary_loss_clip": 0.01159666, "auxiliary_loss_mlp": 0.0102813, "balance_loss_clip": 1.05074072, "balance_loss_mlp": 1.02007723, "epoch": 0.5282270185775266, "flos": 19426488197760.0, "grad_norm": 1.6654065150361346, "language_loss": 0.79473078, "learning_rate": 1.9143393302981507e-06, "loss": 0.81660873, "num_input_tokens_seen": 94941125, "step": 4393, "time_per_iteration": 2.439785957336426 }, { "auxiliary_loss_clip": 0.01150025, "auxiliary_loss_mlp": 0.01024902, "balance_loss_clip": 1.04806554, "balance_loss_mlp": 1.01733208, "epoch": 0.5283472614681657, "flos": 16399613934720.0, "grad_norm": 1.705385640586101, "language_loss": 0.83006084, "learning_rate": 1.913561079983252e-06, "loss": 0.8518101, "num_input_tokens_seen": 94959950, "step": 4394, "time_per_iteration": 3.2972543239593506 }, { "auxiliary_loss_clip": 0.01153413, "auxiliary_loss_mlp": 0.01035054, "balance_loss_clip": 1.04833102, "balance_loss_mlp": 1.02601802, "epoch": 0.5284675043588047, "flos": 26760524163840.0, "grad_norm": 2.0889060258322156, "language_loss": 0.74984896, "learning_rate": 1.9127828427810693e-06, "loss": 0.77173364, "num_input_tokens_seen": 94980515, "step": 4395, "time_per_iteration": 2.5200607776641846 }, { "auxiliary_loss_clip": 0.01142611, "auxiliary_loss_mlp": 0.01028513, "balance_loss_clip": 1.04810238, "balance_loss_mlp": 1.02055538, "epoch": 0.5285877472494439, "flos": 19899898473600.0, "grad_norm": 2.9597928023796944, "language_loss": 0.81171829, "learning_rate": 1.9120046188096607e-06, "loss": 0.83342957, "num_input_tokens_seen": 94998560, "step": 4396, "time_per_iteration": 2.5485148429870605 }, { "auxiliary_loss_clip": 0.01151945, "auxiliary_loss_mlp": 0.01036373, "balance_loss_clip": 1.05396557, "balance_loss_mlp": 1.02854121, "epoch": 0.528707990140083, "flos": 20011257613440.0, "grad_norm": 1.9496307115972396, "language_loss": 0.741799, "learning_rate": 1.9112264081870804e-06, "loss": 0.76368219, "num_input_tokens_seen": 95016950, "step": 4397, "time_per_iteration": 2.489238739013672 }, { "auxiliary_loss_clip": 0.01137155, "auxiliary_loss_mlp": 0.01030932, "balance_loss_clip": 1.05250216, "balance_loss_mlp": 1.02244473, "epoch": 0.528828233030722, "flos": 20667956014080.0, "grad_norm": 2.504448705693279, "language_loss": 0.75749385, "learning_rate": 1.9104482110313843e-06, "loss": 0.77917469, "num_input_tokens_seen": 95036540, "step": 4398, "time_per_iteration": 3.306952953338623 }, { "auxiliary_loss_clip": 0.01161345, "auxiliary_loss_mlp": 0.01025704, "balance_loss_clip": 1.05063009, "balance_loss_mlp": 1.01820564, "epoch": 0.5289484759213612, "flos": 25192448956800.0, "grad_norm": 1.8535938708892494, "language_loss": 0.73992157, "learning_rate": 1.909670027460623e-06, "loss": 0.76179206, "num_input_tokens_seen": 95053840, "step": 4399, "time_per_iteration": 2.4948277473449707 }, { "auxiliary_loss_clip": 0.01162441, "auxiliary_loss_mlp": 0.01028153, "balance_loss_clip": 1.05072093, "balance_loss_mlp": 1.02026725, "epoch": 0.5290687188120002, "flos": 31139255715840.0, "grad_norm": 1.7473423374003851, "language_loss": 0.71797311, "learning_rate": 1.908891857592847e-06, "loss": 0.73987901, "num_input_tokens_seen": 95074910, "step": 4400, "time_per_iteration": 2.608015298843384 }, { "auxiliary_loss_clip": 0.01131708, "auxiliary_loss_mlp": 0.01026469, "balance_loss_clip": 1.0517652, "balance_loss_mlp": 1.01796985, "epoch": 0.5291889617026393, "flos": 20119851406080.0, "grad_norm": 2.2243282920012564, "language_loss": 0.90379614, "learning_rate": 1.9081137015461034e-06, "loss": 0.92537796, "num_input_tokens_seen": 95090985, "step": 4401, "time_per_iteration": 2.4960179328918457 }, { "auxiliary_loss_clip": 0.01116025, "auxiliary_loss_mlp": 0.0102671, "balance_loss_clip": 1.04762197, "balance_loss_mlp": 1.01910758, "epoch": 0.5293092045932785, "flos": 19643747610240.0, "grad_norm": 2.049691496636775, "language_loss": 0.90376449, "learning_rate": 1.9073355594384383e-06, "loss": 0.92519188, "num_input_tokens_seen": 95109225, "step": 4402, "time_per_iteration": 2.527320146560669 }, { "auxiliary_loss_clip": 0.01130218, "auxiliary_loss_mlp": 0.01033117, "balance_loss_clip": 1.05050611, "balance_loss_mlp": 1.0251714, "epoch": 0.5294294474839175, "flos": 24317736958080.0, "grad_norm": 1.9783832381434578, "language_loss": 0.8036176, "learning_rate": 1.906557431387895e-06, "loss": 0.82525098, "num_input_tokens_seen": 95128215, "step": 4403, "time_per_iteration": 2.5449538230895996 }, { "auxiliary_loss_clip": 0.01135048, "auxiliary_loss_mlp": 0.01028913, "balance_loss_clip": 1.0544498, "balance_loss_mlp": 1.02008843, "epoch": 0.5295496903745566, "flos": 18875941464960.0, "grad_norm": 1.9139444703641642, "language_loss": 0.78705049, "learning_rate": 1.905779317512516e-06, "loss": 0.80869007, "num_input_tokens_seen": 95145760, "step": 4404, "time_per_iteration": 2.4876649379730225 }, { "auxiliary_loss_clip": 0.01162042, "auxiliary_loss_mlp": 0.0102849, "balance_loss_clip": 1.05089378, "balance_loss_mlp": 1.02075028, "epoch": 0.5296699332651957, "flos": 20923101296640.0, "grad_norm": 3.3230698633396605, "language_loss": 0.80456257, "learning_rate": 1.9050012179303385e-06, "loss": 0.82646787, "num_input_tokens_seen": 95164270, "step": 4405, "time_per_iteration": 2.4593703746795654 }, { "auxiliary_loss_clip": 0.01164464, "auxiliary_loss_mlp": 0.01025555, "balance_loss_clip": 1.04972339, "balance_loss_mlp": 1.01764584, "epoch": 0.5297901761558348, "flos": 22046745525120.0, "grad_norm": 2.572535409535337, "language_loss": 0.68693566, "learning_rate": 1.904223132759401e-06, "loss": 0.70883584, "num_input_tokens_seen": 95182870, "step": 4406, "time_per_iteration": 2.452558755874634 }, { "auxiliary_loss_clip": 0.01166124, "auxiliary_loss_mlp": 0.01027852, "balance_loss_clip": 1.05229115, "balance_loss_mlp": 1.01939964, "epoch": 0.5299104190464738, "flos": 21798495653760.0, "grad_norm": 4.087952031227363, "language_loss": 0.68952179, "learning_rate": 1.9034450621177383e-06, "loss": 0.71146154, "num_input_tokens_seen": 95201190, "step": 4407, "time_per_iteration": 2.446545362472534 }, { "auxiliary_loss_clip": 0.01162265, "auxiliary_loss_mlp": 0.01038402, "balance_loss_clip": 1.05196393, "balance_loss_mlp": 1.02988458, "epoch": 0.530030661937113, "flos": 14720790119040.0, "grad_norm": 2.0279941268381005, "language_loss": 0.70698452, "learning_rate": 1.9026670061233824e-06, "loss": 0.72899115, "num_input_tokens_seen": 95218625, "step": 4408, "time_per_iteration": 2.41420578956604 }, { "auxiliary_loss_clip": 0.01144505, "auxiliary_loss_mlp": 0.01025791, "balance_loss_clip": 1.05098414, "balance_loss_mlp": 1.01783323, "epoch": 0.5301509048277521, "flos": 21251504367360.0, "grad_norm": 1.7368374521502858, "language_loss": 0.80731225, "learning_rate": 1.901888964894365e-06, "loss": 0.82901514, "num_input_tokens_seen": 95237665, "step": 4409, "time_per_iteration": 2.4782917499542236 }, { "auxiliary_loss_clip": 0.01178522, "auxiliary_loss_mlp": 0.01028883, "balance_loss_clip": 1.0519352, "balance_loss_mlp": 1.02119994, "epoch": 0.5302711477183911, "flos": 25957058791680.0, "grad_norm": 1.7546288430541206, "language_loss": 0.67922366, "learning_rate": 1.9011109385487134e-06, "loss": 0.70129764, "num_input_tokens_seen": 95258915, "step": 4410, "time_per_iteration": 2.444899320602417 }, { "auxiliary_loss_clip": 0.01178073, "auxiliary_loss_mlp": 0.01027917, "balance_loss_clip": 1.05082726, "balance_loss_mlp": 1.01964402, "epoch": 0.5303913906090303, "flos": 22273126992000.0, "grad_norm": 2.7118171127523176, "language_loss": 0.66615164, "learning_rate": 1.900332927204454e-06, "loss": 0.68821156, "num_input_tokens_seen": 95277365, "step": 4411, "time_per_iteration": 2.4106783866882324 }, { "auxiliary_loss_clip": 0.0115675, "auxiliary_loss_mlp": 0.01023546, "balance_loss_clip": 1.05109882, "balance_loss_mlp": 1.01582718, "epoch": 0.5305116334996693, "flos": 24936010784640.0, "grad_norm": 1.895793965911426, "language_loss": 0.76777911, "learning_rate": 1.8995549309796097e-06, "loss": 0.78958213, "num_input_tokens_seen": 95296670, "step": 4412, "time_per_iteration": 2.529984712600708 }, { "auxiliary_loss_clip": 0.01171853, "auxiliary_loss_mlp": 0.01028034, "balance_loss_clip": 1.05387521, "balance_loss_mlp": 1.02017188, "epoch": 0.5306318763903084, "flos": 20189338266240.0, "grad_norm": 2.0649350378005105, "language_loss": 0.76648295, "learning_rate": 1.8987769499922028e-06, "loss": 0.78848183, "num_input_tokens_seen": 95315640, "step": 4413, "time_per_iteration": 2.4816269874572754 }, { "auxiliary_loss_clip": 0.01162868, "auxiliary_loss_mlp": 0.00762269, "balance_loss_clip": 1.05118632, "balance_loss_mlp": 1.00029254, "epoch": 0.5307521192809476, "flos": 20266366982400.0, "grad_norm": 2.431334215031439, "language_loss": 0.7075935, "learning_rate": 1.897998984360252e-06, "loss": 0.72684491, "num_input_tokens_seen": 95334610, "step": 4414, "time_per_iteration": 2.458519220352173 }, { "auxiliary_loss_clip": 0.01147173, "auxiliary_loss_mlp": 0.0102515, "balance_loss_clip": 1.05042887, "balance_loss_mlp": 1.01741982, "epoch": 0.5308723621715866, "flos": 28844276976000.0, "grad_norm": 1.653642362517932, "language_loss": 0.78462231, "learning_rate": 1.897221034201775e-06, "loss": 0.80634552, "num_input_tokens_seen": 95358350, "step": 4415, "time_per_iteration": 3.3944060802459717 }, { "auxiliary_loss_clip": 0.01135287, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 1.04646444, "balance_loss_mlp": 1.02037251, "epoch": 0.5309926050622257, "flos": 27457766040960.0, "grad_norm": 1.5624555616937665, "language_loss": 0.6680097, "learning_rate": 1.8964430996347842e-06, "loss": 0.68963814, "num_input_tokens_seen": 95379900, "step": 4416, "time_per_iteration": 2.588054656982422 }, { "auxiliary_loss_clip": 0.01148879, "auxiliary_loss_mlp": 0.01025753, "balance_loss_clip": 1.04925323, "balance_loss_mlp": 1.01759923, "epoch": 0.5311128479528648, "flos": 20514545026560.0, "grad_norm": 3.7600022749501645, "language_loss": 0.82547688, "learning_rate": 1.8956651807772931e-06, "loss": 0.84722322, "num_input_tokens_seen": 95397935, "step": 4417, "time_per_iteration": 2.478142261505127 }, { "auxiliary_loss_clip": 0.01160405, "auxiliary_loss_mlp": 0.01022453, "balance_loss_clip": 1.05012894, "balance_loss_mlp": 1.01545537, "epoch": 0.5312330908435039, "flos": 21397660807680.0, "grad_norm": 2.1443934624516383, "language_loss": 0.83948964, "learning_rate": 1.8948872777473115e-06, "loss": 0.86131823, "num_input_tokens_seen": 95415890, "step": 4418, "time_per_iteration": 2.4721696376800537 }, { "auxiliary_loss_clip": 0.0114691, "auxiliary_loss_mlp": 0.01027225, "balance_loss_clip": 1.04902434, "balance_loss_mlp": 1.02008104, "epoch": 0.531353333734143, "flos": 24717350741760.0, "grad_norm": 1.8927535149723085, "language_loss": 0.63758755, "learning_rate": 1.8941093906628458e-06, "loss": 0.65932882, "num_input_tokens_seen": 95433675, "step": 4419, "time_per_iteration": 3.3488175868988037 }, { "auxiliary_loss_clip": 0.01140728, "auxiliary_loss_mlp": 0.01023722, "balance_loss_clip": 1.04614508, "balance_loss_mlp": 1.01649237, "epoch": 0.531473576624782, "flos": 30480689808000.0, "grad_norm": 1.7407172442118777, "language_loss": 0.70690119, "learning_rate": 1.893331519641902e-06, "loss": 0.72854573, "num_input_tokens_seen": 95455820, "step": 4420, "time_per_iteration": 2.550868034362793 }, { "auxiliary_loss_clip": 0.01122979, "auxiliary_loss_mlp": 0.01027267, "balance_loss_clip": 1.04337502, "balance_loss_mlp": 1.01942933, "epoch": 0.5315938195154212, "flos": 23002975440000.0, "grad_norm": 2.741893329887903, "language_loss": 0.7380811, "learning_rate": 1.8925536648024815e-06, "loss": 0.75958359, "num_input_tokens_seen": 95473240, "step": 4421, "time_per_iteration": 3.335223913192749 }, { "auxiliary_loss_clip": 0.01177398, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 1.05210996, "balance_loss_mlp": 1.01724613, "epoch": 0.5317140624060602, "flos": 22748584343040.0, "grad_norm": 2.3155794764712474, "language_loss": 0.75978935, "learning_rate": 1.8917758262625849e-06, "loss": 0.78181434, "num_input_tokens_seen": 95493480, "step": 4422, "time_per_iteration": 2.426776885986328 }, { "auxiliary_loss_clip": 0.01140854, "auxiliary_loss_mlp": 0.01029249, "balance_loss_clip": 1.04850554, "balance_loss_mlp": 1.02175641, "epoch": 0.5318343052966993, "flos": 22821087945600.0, "grad_norm": 1.6146765656286153, "language_loss": 0.8065474, "learning_rate": 1.8909980041402089e-06, "loss": 0.82824844, "num_input_tokens_seen": 95512075, "step": 4423, "time_per_iteration": 2.4910166263580322 }, { "auxiliary_loss_clip": 0.01157005, "auxiliary_loss_mlp": 0.0102723, "balance_loss_clip": 1.04844546, "balance_loss_mlp": 1.01927853, "epoch": 0.5319545481873384, "flos": 13626089274240.0, "grad_norm": 2.1907451519042183, "language_loss": 0.65487504, "learning_rate": 1.8902201985533494e-06, "loss": 0.67671734, "num_input_tokens_seen": 95529340, "step": 4424, "time_per_iteration": 2.4017388820648193 }, { "auxiliary_loss_clip": 0.01146654, "auxiliary_loss_mlp": 0.01022627, "balance_loss_clip": 1.04792094, "balance_loss_mlp": 1.0155158, "epoch": 0.5320747910779775, "flos": 22162522037760.0, "grad_norm": 1.8611239151710266, "language_loss": 0.74934757, "learning_rate": 1.8894424096199983e-06, "loss": 0.77104038, "num_input_tokens_seen": 95548545, "step": 4425, "time_per_iteration": 3.307497501373291 }, { "auxiliary_loss_clip": 0.01165348, "auxiliary_loss_mlp": 0.01026803, "balance_loss_clip": 1.0534761, "balance_loss_mlp": 1.01850557, "epoch": 0.5321950339686166, "flos": 18588081870720.0, "grad_norm": 1.9229733687298485, "language_loss": 0.85934412, "learning_rate": 1.8886646374581463e-06, "loss": 0.88126564, "num_input_tokens_seen": 95567770, "step": 4426, "time_per_iteration": 2.428509473800659 }, { "auxiliary_loss_clip": 0.01160504, "auxiliary_loss_mlp": 0.01026745, "balance_loss_clip": 1.04919577, "balance_loss_mlp": 1.01886821, "epoch": 0.5323152768592557, "flos": 22856818999680.0, "grad_norm": 1.8387797003995645, "language_loss": 0.71122217, "learning_rate": 1.8878868821857795e-06, "loss": 0.73309469, "num_input_tokens_seen": 95587420, "step": 4427, "time_per_iteration": 2.4547226428985596 }, { "auxiliary_loss_clip": 0.01117063, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.04383039, "balance_loss_mlp": 1.02111292, "epoch": 0.5324355197498948, "flos": 33948690998400.0, "grad_norm": 2.541021035417326, "language_loss": 0.751773, "learning_rate": 1.8871091439208838e-06, "loss": 0.77323759, "num_input_tokens_seen": 95609030, "step": 4428, "time_per_iteration": 2.6807432174682617 }, { "auxiliary_loss_clip": 0.01120159, "auxiliary_loss_mlp": 0.01030063, "balance_loss_clip": 1.04662943, "balance_loss_mlp": 1.02164638, "epoch": 0.5325557626405338, "flos": 23256720092160.0, "grad_norm": 2.148824894659783, "language_loss": 0.77255893, "learning_rate": 1.8863314227814414e-06, "loss": 0.79406106, "num_input_tokens_seen": 95627340, "step": 4429, "time_per_iteration": 2.5454797744750977 }, { "auxiliary_loss_clip": 0.01169561, "auxiliary_loss_mlp": 0.01027903, "balance_loss_clip": 1.05370152, "balance_loss_mlp": 1.01973116, "epoch": 0.532676005531173, "flos": 26718687797760.0, "grad_norm": 2.203540722026896, "language_loss": 0.48745137, "learning_rate": 1.8855537188854313e-06, "loss": 0.50942606, "num_input_tokens_seen": 95646315, "step": 4430, "time_per_iteration": 2.4973628520965576 }, { "auxiliary_loss_clip": 0.01163668, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.04846144, "balance_loss_mlp": 1.02065754, "epoch": 0.5327962484218121, "flos": 17894610921600.0, "grad_norm": 2.4094500990062464, "language_loss": 0.78297079, "learning_rate": 1.8847760323508315e-06, "loss": 0.80489039, "num_input_tokens_seen": 95665220, "step": 4431, "time_per_iteration": 2.4343512058258057 }, { "auxiliary_loss_clip": 0.01143125, "auxiliary_loss_mlp": 0.01026057, "balance_loss_clip": 1.04928517, "balance_loss_mlp": 1.01900911, "epoch": 0.5329164913124511, "flos": 17925385898880.0, "grad_norm": 1.9279859207312136, "language_loss": 0.7563687, "learning_rate": 1.883998363295616e-06, "loss": 0.77806056, "num_input_tokens_seen": 95682700, "step": 4432, "time_per_iteration": 2.4565107822418213 }, { "auxiliary_loss_clip": 0.01054434, "auxiliary_loss_mlp": 0.0100109, "balance_loss_clip": 1.01677632, "balance_loss_mlp": 1.00011253, "epoch": 0.5330367342030903, "flos": 57254178781440.0, "grad_norm": 0.87809699125139, "language_loss": 0.62535334, "learning_rate": 1.8832207118377565e-06, "loss": 0.64590859, "num_input_tokens_seen": 95738070, "step": 4433, "time_per_iteration": 2.9914746284484863 }, { "auxiliary_loss_clip": 0.01173744, "auxiliary_loss_mlp": 0.01024413, "balance_loss_clip": 1.05066645, "balance_loss_mlp": 1.01730776, "epoch": 0.5331569770937293, "flos": 17420518287360.0, "grad_norm": 2.126230536835848, "language_loss": 0.69381464, "learning_rate": 1.882443078095222e-06, "loss": 0.71579623, "num_input_tokens_seen": 95756950, "step": 4434, "time_per_iteration": 2.394221067428589 }, { "auxiliary_loss_clip": 0.01043321, "auxiliary_loss_mlp": 0.01001025, "balance_loss_clip": 1.02038908, "balance_loss_mlp": 1.00010157, "epoch": 0.5332772199843684, "flos": 56750783627520.0, "grad_norm": 1.1329709108492303, "language_loss": 0.6684711, "learning_rate": 1.8816654621859794e-06, "loss": 0.68891454, "num_input_tokens_seen": 95816615, "step": 4435, "time_per_iteration": 3.0088424682617188 }, { "auxiliary_loss_clip": 0.01174566, "auxiliary_loss_mlp": 0.0102574, "balance_loss_clip": 1.05182791, "balance_loss_mlp": 1.01775885, "epoch": 0.5333974628750076, "flos": 18697753071360.0, "grad_norm": 2.1951282833747707, "language_loss": 0.72302198, "learning_rate": 1.8808878642279915e-06, "loss": 0.74502504, "num_input_tokens_seen": 95832020, "step": 4436, "time_per_iteration": 2.408475399017334 }, { "auxiliary_loss_clip": 0.0113391, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.0433085, "balance_loss_mlp": 1.02476478, "epoch": 0.5335177057656466, "flos": 23805507058560.0, "grad_norm": 2.454058446815523, "language_loss": 0.6479131, "learning_rate": 1.8801102843392209e-06, "loss": 0.6695807, "num_input_tokens_seen": 95851425, "step": 4437, "time_per_iteration": 2.5382144451141357 }, { "auxiliary_loss_clip": 0.01132168, "auxiliary_loss_mlp": 0.01027486, "balance_loss_clip": 1.0454185, "balance_loss_mlp": 1.02004147, "epoch": 0.5336379486562857, "flos": 25078683605760.0, "grad_norm": 1.611145090353668, "language_loss": 0.8530699, "learning_rate": 1.8793327226376238e-06, "loss": 0.87466645, "num_input_tokens_seen": 95870745, "step": 4438, "time_per_iteration": 2.5607376098632812 }, { "auxiliary_loss_clip": 0.01155704, "auxiliary_loss_mlp": 0.0102699, "balance_loss_clip": 1.04951477, "balance_loss_mlp": 1.0193429, "epoch": 0.5337581915469248, "flos": 21396691140480.0, "grad_norm": 1.7876418179343254, "language_loss": 0.80082142, "learning_rate": 1.8785551792411569e-06, "loss": 0.82264841, "num_input_tokens_seen": 95889755, "step": 4439, "time_per_iteration": 2.4819371700286865 }, { "auxiliary_loss_clip": 0.01147436, "auxiliary_loss_mlp": 0.01028686, "balance_loss_clip": 1.04863691, "balance_loss_mlp": 1.0216918, "epoch": 0.5338784344375639, "flos": 14865905064960.0, "grad_norm": 2.0018118718720026, "language_loss": 0.82300633, "learning_rate": 1.8777776542677733e-06, "loss": 0.84476757, "num_input_tokens_seen": 95907805, "step": 4440, "time_per_iteration": 2.4573745727539062 }, { "auxiliary_loss_clip": 0.01132546, "auxiliary_loss_mlp": 0.01023086, "balance_loss_clip": 1.0441972, "balance_loss_mlp": 1.01518857, "epoch": 0.5339986773282029, "flos": 20813501923200.0, "grad_norm": 1.9381061106172475, "language_loss": 0.73208892, "learning_rate": 1.8770001478354216e-06, "loss": 0.7536453, "num_input_tokens_seen": 95927480, "step": 4441, "time_per_iteration": 2.5188307762145996 }, { "auxiliary_loss_clip": 0.01157611, "auxiliary_loss_mlp": 0.0103235, "balance_loss_clip": 1.04856658, "balance_loss_mlp": 1.02410674, "epoch": 0.5341189202188421, "flos": 17969089772160.0, "grad_norm": 2.334046273550115, "language_loss": 0.84375846, "learning_rate": 1.8762226600620504e-06, "loss": 0.86565804, "num_input_tokens_seen": 95946095, "step": 4442, "time_per_iteration": 3.258622407913208 }, { "auxiliary_loss_clip": 0.0115465, "auxiliary_loss_mlp": 0.01026939, "balance_loss_clip": 1.04857564, "balance_loss_mlp": 1.01852918, "epoch": 0.5342391631094812, "flos": 11031866328960.0, "grad_norm": 3.895016297064507, "language_loss": 0.58954144, "learning_rate": 1.8754451910656031e-06, "loss": 0.61135733, "num_input_tokens_seen": 95959995, "step": 4443, "time_per_iteration": 2.4188201427459717 }, { "auxiliary_loss_clip": 0.01128019, "auxiliary_loss_mlp": 0.0102669, "balance_loss_clip": 1.04714465, "balance_loss_mlp": 1.0186522, "epoch": 0.5343594060001202, "flos": 15339135772800.0, "grad_norm": 2.0982498329638704, "language_loss": 0.82721788, "learning_rate": 1.8746677409640212e-06, "loss": 0.84876502, "num_input_tokens_seen": 95977095, "step": 4444, "time_per_iteration": 2.5219223499298096 }, { "auxiliary_loss_clip": 0.01167921, "auxiliary_loss_mlp": 0.01028401, "balance_loss_clip": 1.05392075, "balance_loss_mlp": 1.02075338, "epoch": 0.5344796488907594, "flos": 26900898514560.0, "grad_norm": 1.7665490761804004, "language_loss": 0.84337252, "learning_rate": 1.8738903098752432e-06, "loss": 0.86533576, "num_input_tokens_seen": 95996225, "step": 4445, "time_per_iteration": 2.4985456466674805 }, { "auxiliary_loss_clip": 0.01148807, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.04945087, "balance_loss_mlp": 1.02342033, "epoch": 0.5345998917813984, "flos": 25411216740480.0, "grad_norm": 2.271288353522816, "language_loss": 0.73147124, "learning_rate": 1.8731128979172052e-06, "loss": 0.75326788, "num_input_tokens_seen": 96015425, "step": 4446, "time_per_iteration": 3.371224880218506 }, { "auxiliary_loss_clip": 0.01145833, "auxiliary_loss_mlp": 0.01022238, "balance_loss_clip": 1.04989493, "balance_loss_mlp": 1.01518083, "epoch": 0.5347201346720375, "flos": 32853379622400.0, "grad_norm": 2.1882611576014437, "language_loss": 0.66941226, "learning_rate": 1.8723355052078394e-06, "loss": 0.69109297, "num_input_tokens_seen": 96035460, "step": 4447, "time_per_iteration": 2.618471384048462 }, { "auxiliary_loss_clip": 0.01159026, "auxiliary_loss_mlp": 0.01035384, "balance_loss_clip": 1.04825974, "balance_loss_mlp": 1.02736092, "epoch": 0.5348403775626767, "flos": 17967940536960.0, "grad_norm": 2.674220123911979, "language_loss": 0.77289617, "learning_rate": 1.8715581318650765e-06, "loss": 0.79484028, "num_input_tokens_seen": 96054515, "step": 4448, "time_per_iteration": 3.2570197582244873 }, { "auxiliary_loss_clip": 0.011453, "auxiliary_loss_mlp": 0.01029161, "balance_loss_clip": 1.05004001, "balance_loss_mlp": 1.02024376, "epoch": 0.5349606204533157, "flos": 17603339535360.0, "grad_norm": 2.2044857888588596, "language_loss": 0.81117606, "learning_rate": 1.8707807780068422e-06, "loss": 0.83292067, "num_input_tokens_seen": 96072330, "step": 4449, "time_per_iteration": 2.5061838626861572 }, { "auxiliary_loss_clip": 0.01144826, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 1.04815948, "balance_loss_mlp": 1.01901841, "epoch": 0.5350808633439548, "flos": 29167831710720.0, "grad_norm": 2.551763531400763, "language_loss": 0.66170347, "learning_rate": 1.8700034437510611e-06, "loss": 0.68341517, "num_input_tokens_seen": 96092425, "step": 4450, "time_per_iteration": 2.5877599716186523 }, { "auxiliary_loss_clip": 0.01125631, "auxiliary_loss_mlp": 0.01025652, "balance_loss_clip": 1.0466001, "balance_loss_mlp": 1.01777864, "epoch": 0.5352011062345938, "flos": 19499997381120.0, "grad_norm": 2.2407233072706485, "language_loss": 0.81731796, "learning_rate": 1.8692261292156549e-06, "loss": 0.83883083, "num_input_tokens_seen": 96111660, "step": 4451, "time_per_iteration": 2.520409345626831 }, { "auxiliary_loss_clip": 0.01176956, "auxiliary_loss_mlp": 0.01024901, "balance_loss_clip": 1.054739, "balance_loss_mlp": 1.01735544, "epoch": 0.535321349125233, "flos": 23477642691840.0, "grad_norm": 1.9449492563437474, "language_loss": 0.81255102, "learning_rate": 1.8684488345185401e-06, "loss": 0.83456957, "num_input_tokens_seen": 96131835, "step": 4452, "time_per_iteration": 3.1671388149261475 }, { "auxiliary_loss_clip": 0.01178818, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 1.05379868, "balance_loss_mlp": 1.02142, "epoch": 0.535441592015872, "flos": 20478059786880.0, "grad_norm": 2.289697788667961, "language_loss": 0.78501272, "learning_rate": 1.8676715597776332e-06, "loss": 0.80709332, "num_input_tokens_seen": 96150180, "step": 4453, "time_per_iteration": 2.398439645767212 }, { "auxiliary_loss_clip": 0.01110919, "auxiliary_loss_mlp": 0.0102319, "balance_loss_clip": 1.04223621, "balance_loss_mlp": 1.01565957, "epoch": 0.5355618349065111, "flos": 19573147428480.0, "grad_norm": 1.807117462478234, "language_loss": 0.76476395, "learning_rate": 1.8668943051108455e-06, "loss": 0.78610504, "num_input_tokens_seen": 96167485, "step": 4454, "time_per_iteration": 2.528853416442871 }, { "auxiliary_loss_clip": 0.01146749, "auxiliary_loss_mlp": 0.01030506, "balance_loss_clip": 1.04812694, "balance_loss_mlp": 1.0226028, "epoch": 0.5356820777971503, "flos": 24024633978240.0, "grad_norm": 1.8669990840015993, "language_loss": 0.76142764, "learning_rate": 1.8661170706360856e-06, "loss": 0.7832002, "num_input_tokens_seen": 96186650, "step": 4455, "time_per_iteration": 2.507431745529175 }, { "auxiliary_loss_clip": 0.0116157, "auxiliary_loss_mlp": 0.01022897, "balance_loss_clip": 1.0519228, "balance_loss_mlp": 1.01577687, "epoch": 0.5358023206877893, "flos": 20884676722560.0, "grad_norm": 1.6291379145055414, "language_loss": 0.81508356, "learning_rate": 1.8653398564712594e-06, "loss": 0.83692825, "num_input_tokens_seen": 96205595, "step": 4456, "time_per_iteration": 2.446989059448242 }, { "auxiliary_loss_clip": 0.01160343, "auxiliary_loss_mlp": 0.01024168, "balance_loss_clip": 1.05177546, "balance_loss_mlp": 1.01643705, "epoch": 0.5359225635784284, "flos": 22418996123520.0, "grad_norm": 1.5592992000963437, "language_loss": 0.82007861, "learning_rate": 1.8645626627342704e-06, "loss": 0.84192371, "num_input_tokens_seen": 96226360, "step": 4457, "time_per_iteration": 2.4673218727111816 }, { "auxiliary_loss_clip": 0.01165079, "auxiliary_loss_mlp": 0.01028229, "balance_loss_clip": 1.05072236, "balance_loss_mlp": 1.02073061, "epoch": 0.5360428064690675, "flos": 24097784025600.0, "grad_norm": 2.2760354984773654, "language_loss": 0.80978495, "learning_rate": 1.8637854895430172e-06, "loss": 0.83171803, "num_input_tokens_seen": 96245625, "step": 4458, "time_per_iteration": 2.476638078689575 }, { "auxiliary_loss_clip": 0.01127204, "auxiliary_loss_mlp": 0.0102944, "balance_loss_clip": 1.04690289, "balance_loss_mlp": 1.02123857, "epoch": 0.5361630493597066, "flos": 21434505183360.0, "grad_norm": 2.137333711046165, "language_loss": 0.69852883, "learning_rate": 1.8630083370153978e-06, "loss": 0.72009528, "num_input_tokens_seen": 96265265, "step": 4459, "time_per_iteration": 2.5222668647766113 }, { "auxiliary_loss_clip": 0.01028763, "auxiliary_loss_mlp": 0.01001179, "balance_loss_clip": 1.01792812, "balance_loss_mlp": 1.00026083, "epoch": 0.5362832922503457, "flos": 68888696520960.0, "grad_norm": 0.7515503345002652, "language_loss": 0.55431956, "learning_rate": 1.8622312052693041e-06, "loss": 0.57461894, "num_input_tokens_seen": 96326445, "step": 4460, "time_per_iteration": 3.224353075027466 }, { "auxiliary_loss_clip": 0.01152905, "auxiliary_loss_mlp": 0.01026892, "balance_loss_clip": 1.04578114, "balance_loss_mlp": 1.01929271, "epoch": 0.5364035351409848, "flos": 9793702563840.0, "grad_norm": 2.3747290463549957, "language_loss": 0.72064066, "learning_rate": 1.8614540944226267e-06, "loss": 0.74243867, "num_input_tokens_seen": 96343115, "step": 4461, "time_per_iteration": 2.4199695587158203 }, { "auxiliary_loss_clip": 0.01143205, "auxiliary_loss_mlp": 0.01024452, "balance_loss_clip": 1.04959953, "balance_loss_mlp": 1.01750493, "epoch": 0.5365237780316239, "flos": 23290080848640.0, "grad_norm": 1.8957466065967739, "language_loss": 0.67709768, "learning_rate": 1.8606770045932537e-06, "loss": 0.69877422, "num_input_tokens_seen": 96362230, "step": 4462, "time_per_iteration": 2.5026862621307373 }, { "auxiliary_loss_clip": 0.01126294, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.04264879, "balance_loss_mlp": 1.02093208, "epoch": 0.5366440209222629, "flos": 26578133879040.0, "grad_norm": 2.385155724961322, "language_loss": 0.81893027, "learning_rate": 1.859899935899068e-06, "loss": 0.84048671, "num_input_tokens_seen": 96382085, "step": 4463, "time_per_iteration": 2.5726125240325928 }, { "auxiliary_loss_clip": 0.01148911, "auxiliary_loss_mlp": 0.01027171, "balance_loss_clip": 1.05319619, "balance_loss_mlp": 1.01915979, "epoch": 0.5367642638129021, "flos": 19608052469760.0, "grad_norm": 1.5633399524107081, "language_loss": 0.7887969, "learning_rate": 1.8591228884579506e-06, "loss": 0.81055772, "num_input_tokens_seen": 96400580, "step": 4464, "time_per_iteration": 2.474151849746704 }, { "auxiliary_loss_clip": 0.01137231, "auxiliary_loss_mlp": 0.01026211, "balance_loss_clip": 1.04786599, "balance_loss_mlp": 1.01870084, "epoch": 0.5368845067035412, "flos": 23915214172800.0, "grad_norm": 2.001505398609328, "language_loss": 0.82025689, "learning_rate": 1.8583458623877795e-06, "loss": 0.84189129, "num_input_tokens_seen": 96419680, "step": 4465, "time_per_iteration": 2.533148765563965 }, { "auxiliary_loss_clip": 0.01163519, "auxiliary_loss_mlp": 0.0102476, "balance_loss_clip": 1.05115592, "balance_loss_mlp": 1.01730335, "epoch": 0.5370047495941802, "flos": 16873131951360.0, "grad_norm": 1.7100307378037336, "language_loss": 0.74166769, "learning_rate": 1.8575688578064281e-06, "loss": 0.76355052, "num_input_tokens_seen": 96437805, "step": 4466, "time_per_iteration": 2.419611692428589 }, { "auxiliary_loss_clip": 0.0116479, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.05205464, "balance_loss_mlp": 1.02023792, "epoch": 0.5371249924848194, "flos": 20740926493440.0, "grad_norm": 1.6992476666578011, "language_loss": 0.76564705, "learning_rate": 1.8567918748317674e-06, "loss": 0.78757358, "num_input_tokens_seen": 96457155, "step": 4467, "time_per_iteration": 2.457407236099243 }, { "auxiliary_loss_clip": 0.01131867, "auxiliary_loss_mlp": 0.01027422, "balance_loss_clip": 1.04425859, "balance_loss_mlp": 1.0196259, "epoch": 0.5372452353754584, "flos": 17968120104960.0, "grad_norm": 2.164692405242359, "language_loss": 0.82604158, "learning_rate": 1.8560149135816659e-06, "loss": 0.84763443, "num_input_tokens_seen": 96473990, "step": 4468, "time_per_iteration": 3.3103084564208984 }, { "auxiliary_loss_clip": 0.01156866, "auxiliary_loss_mlp": 0.01021639, "balance_loss_clip": 1.04751825, "balance_loss_mlp": 1.01446533, "epoch": 0.5373654782660975, "flos": 15377021642880.0, "grad_norm": 2.1594839537498522, "language_loss": 0.84159189, "learning_rate": 1.8552379741739873e-06, "loss": 0.86337698, "num_input_tokens_seen": 96491335, "step": 4469, "time_per_iteration": 2.421666383743286 }, { "auxiliary_loss_clip": 0.01045487, "auxiliary_loss_mlp": 0.00752661, "balance_loss_clip": 1.01706469, "balance_loss_mlp": 0.99971449, "epoch": 0.5374857211567367, "flos": 69000091574400.0, "grad_norm": 0.8942068885640814, "language_loss": 0.55684155, "learning_rate": 1.8544610567265935e-06, "loss": 0.57482308, "num_input_tokens_seen": 96545275, "step": 4470, "time_per_iteration": 3.03814959526062 }, { "auxiliary_loss_clip": 0.0114831, "auxiliary_loss_mlp": 0.00761958, "balance_loss_clip": 1.05115497, "balance_loss_mlp": 1.00026214, "epoch": 0.5376059640473757, "flos": 15085355207040.0, "grad_norm": 1.9626685817636675, "language_loss": 0.83195674, "learning_rate": 1.853684161357341e-06, "loss": 0.85105944, "num_input_tokens_seen": 96562935, "step": 4471, "time_per_iteration": 2.464038133621216 }, { "auxiliary_loss_clip": 0.01159627, "auxiliary_loss_mlp": 0.0076238, "balance_loss_clip": 1.05127382, "balance_loss_mlp": 1.00029433, "epoch": 0.5377262069380148, "flos": 19792597570560.0, "grad_norm": 1.7383531331988016, "language_loss": 0.76809514, "learning_rate": 1.852907288184085e-06, "loss": 0.78731519, "num_input_tokens_seen": 96581820, "step": 4472, "time_per_iteration": 2.4673213958740234 }, { "auxiliary_loss_clip": 0.01123999, "auxiliary_loss_mlp": 0.01027439, "balance_loss_clip": 1.04639482, "balance_loss_mlp": 1.01884377, "epoch": 0.5378464498286539, "flos": 30003077640960.0, "grad_norm": 1.8400786012806205, "language_loss": 0.70284921, "learning_rate": 1.8521304373246762e-06, "loss": 0.72436363, "num_input_tokens_seen": 96602865, "step": 4473, "time_per_iteration": 3.4835758209228516 }, { "auxiliary_loss_clip": 0.01165097, "auxiliary_loss_mlp": 0.01025631, "balance_loss_clip": 1.04973698, "balance_loss_mlp": 1.01779938, "epoch": 0.537966692719293, "flos": 21251217058560.0, "grad_norm": 2.506134031203014, "language_loss": 0.88833678, "learning_rate": 1.8513536088969626e-06, "loss": 0.91024411, "num_input_tokens_seen": 96620530, "step": 4474, "time_per_iteration": 3.2373931407928467 }, { "auxiliary_loss_clip": 0.01164577, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.05295444, "balance_loss_mlp": 1.02890062, "epoch": 0.538086935609932, "flos": 21543170803200.0, "grad_norm": 2.1503063923741546, "language_loss": 0.80407178, "learning_rate": 1.8505768030187884e-06, "loss": 0.82608879, "num_input_tokens_seen": 96640660, "step": 4475, "time_per_iteration": 2.4650754928588867 }, { "auxiliary_loss_clip": 0.01144412, "auxiliary_loss_mlp": 0.01025949, "balance_loss_clip": 1.05049706, "balance_loss_mlp": 1.01882017, "epoch": 0.5382071785005712, "flos": 22747219626240.0, "grad_norm": 1.7119117056191293, "language_loss": 0.8012085, "learning_rate": 1.849800019807995e-06, "loss": 0.8229121, "num_input_tokens_seen": 96661885, "step": 4476, "time_per_iteration": 2.526031970977783 }, { "auxiliary_loss_clip": 0.0113173, "auxiliary_loss_mlp": 0.0102614, "balance_loss_clip": 1.04729128, "balance_loss_mlp": 1.01858521, "epoch": 0.5383274213912103, "flos": 24934574240640.0, "grad_norm": 2.210722727540462, "language_loss": 0.70869958, "learning_rate": 1.8490232593824186e-06, "loss": 0.73027825, "num_input_tokens_seen": 96678340, "step": 4477, "time_per_iteration": 2.5267958641052246 }, { "auxiliary_loss_clip": 0.01147608, "auxiliary_loss_mlp": 0.01026257, "balance_loss_clip": 1.05252624, "balance_loss_mlp": 1.01952422, "epoch": 0.5384476642818493, "flos": 22310186849280.0, "grad_norm": 1.763550013622752, "language_loss": 0.84696341, "learning_rate": 1.8482465218598935e-06, "loss": 0.86870199, "num_input_tokens_seen": 96698285, "step": 4478, "time_per_iteration": 3.2263917922973633 }, { "auxiliary_loss_clip": 0.01133589, "auxiliary_loss_mlp": 0.01027434, "balance_loss_clip": 1.04629946, "balance_loss_mlp": 1.01925039, "epoch": 0.5385679071724885, "flos": 22711021695360.0, "grad_norm": 1.8017459716605533, "language_loss": 0.83167791, "learning_rate": 1.8474698073582508e-06, "loss": 0.85328817, "num_input_tokens_seen": 96719655, "step": 4479, "time_per_iteration": 2.5472662448883057 }, { "auxiliary_loss_clip": 0.01138837, "auxiliary_loss_mlp": 0.01021095, "balance_loss_clip": 1.04722977, "balance_loss_mlp": 1.01362062, "epoch": 0.5386881500631275, "flos": 15953746412160.0, "grad_norm": 1.9907662895863352, "language_loss": 0.87314087, "learning_rate": 1.8466931159953166e-06, "loss": 0.89474022, "num_input_tokens_seen": 96736290, "step": 4480, "time_per_iteration": 2.486788034439087 }, { "auxiliary_loss_clip": 0.01153111, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.05212665, "balance_loss_mlp": 1.02114248, "epoch": 0.5388083929537666, "flos": 24060041809920.0, "grad_norm": 2.394991686298086, "language_loss": 0.84316683, "learning_rate": 1.8459164478889158e-06, "loss": 0.86498547, "num_input_tokens_seen": 96757685, "step": 4481, "time_per_iteration": 2.5128936767578125 }, { "auxiliary_loss_clip": 0.01127108, "auxiliary_loss_mlp": 0.01023799, "balance_loss_clip": 1.04517388, "balance_loss_mlp": 1.01647949, "epoch": 0.5389286358444056, "flos": 22236893147520.0, "grad_norm": 1.875004365145341, "language_loss": 0.76047671, "learning_rate": 1.8451398031568663e-06, "loss": 0.78198576, "num_input_tokens_seen": 96777310, "step": 4482, "time_per_iteration": 2.516167640686035 }, { "auxiliary_loss_clip": 0.01134465, "auxiliary_loss_mlp": 0.01024904, "balance_loss_clip": 1.04816866, "balance_loss_mlp": 1.01709616, "epoch": 0.5390488787350448, "flos": 24281718595200.0, "grad_norm": 1.7031201848368782, "language_loss": 0.74754083, "learning_rate": 1.844363181916986e-06, "loss": 0.76913452, "num_input_tokens_seen": 96798035, "step": 4483, "time_per_iteration": 2.5668463706970215 }, { "auxiliary_loss_clip": 0.01159778, "auxiliary_loss_mlp": 0.01029984, "balance_loss_clip": 1.04962683, "balance_loss_mlp": 1.02264643, "epoch": 0.5391691216256839, "flos": 16581393688320.0, "grad_norm": 2.250537525079556, "language_loss": 0.83371311, "learning_rate": 1.8435865842870868e-06, "loss": 0.85561067, "num_input_tokens_seen": 96815975, "step": 4484, "time_per_iteration": 2.437711477279663 }, { "auxiliary_loss_clip": 0.01138163, "auxiliary_loss_mlp": 0.0076231, "balance_loss_clip": 1.0449276, "balance_loss_mlp": 1.00037479, "epoch": 0.5392893645163229, "flos": 23330049707520.0, "grad_norm": 2.0537061908818086, "language_loss": 0.71873277, "learning_rate": 1.8428100103849787e-06, "loss": 0.73773748, "num_input_tokens_seen": 96835770, "step": 4485, "time_per_iteration": 2.5080618858337402 }, { "auxiliary_loss_clip": 0.01150014, "auxiliary_loss_mlp": 0.01031514, "balance_loss_clip": 1.05330825, "balance_loss_mlp": 1.02337241, "epoch": 0.5394096074069621, "flos": 15669801400320.0, "grad_norm": 2.4248632676515145, "language_loss": 0.73307145, "learning_rate": 1.842033460328467e-06, "loss": 0.75488675, "num_input_tokens_seen": 96854490, "step": 4486, "time_per_iteration": 2.455778121948242 }, { "auxiliary_loss_clip": 0.01150211, "auxiliary_loss_mlp": 0.00761969, "balance_loss_clip": 1.04753304, "balance_loss_mlp": 1.0003593, "epoch": 0.5395298502976011, "flos": 22893447893760.0, "grad_norm": 1.6631617719100145, "language_loss": 0.75173771, "learning_rate": 1.8412569342353541e-06, "loss": 0.77085948, "num_input_tokens_seen": 96874645, "step": 4487, "time_per_iteration": 2.5174357891082764 }, { "auxiliary_loss_clip": 0.01153241, "auxiliary_loss_mlp": 0.01032649, "balance_loss_clip": 1.05108953, "balance_loss_mlp": 1.02436388, "epoch": 0.5396500931882402, "flos": 23842135952640.0, "grad_norm": 1.9005491259605878, "language_loss": 0.84796524, "learning_rate": 1.840480432223438e-06, "loss": 0.86982417, "num_input_tokens_seen": 96893650, "step": 4488, "time_per_iteration": 2.4916694164276123 }, { "auxiliary_loss_clip": 0.01148741, "auxiliary_loss_mlp": 0.01030293, "balance_loss_clip": 1.04751098, "balance_loss_mlp": 1.02270222, "epoch": 0.5397703360788794, "flos": 26322988596480.0, "grad_norm": 2.205160208525085, "language_loss": 0.77600622, "learning_rate": 1.8397039544105131e-06, "loss": 0.79779655, "num_input_tokens_seen": 96912735, "step": 4489, "time_per_iteration": 2.534141778945923 }, { "auxiliary_loss_clip": 0.01142902, "auxiliary_loss_mlp": 0.01024235, "balance_loss_clip": 1.04639983, "balance_loss_mlp": 1.01648653, "epoch": 0.5398905789695184, "flos": 21214588164480.0, "grad_norm": 2.154275094202878, "language_loss": 0.69819635, "learning_rate": 1.8389275009143711e-06, "loss": 0.71986771, "num_input_tokens_seen": 96932475, "step": 4490, "time_per_iteration": 2.4854838848114014 }, { "auxiliary_loss_clip": 0.01172929, "auxiliary_loss_mlp": 0.0102558, "balance_loss_clip": 1.05064297, "balance_loss_mlp": 1.01840913, "epoch": 0.5400108218601575, "flos": 25080335631360.0, "grad_norm": 1.8663062078000634, "language_loss": 0.73665971, "learning_rate": 1.8381510718527988e-06, "loss": 0.75864476, "num_input_tokens_seen": 96952085, "step": 4491, "time_per_iteration": 2.4466052055358887 }, { "auxiliary_loss_clip": 0.01151174, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.04742694, "balance_loss_mlp": 1.02110505, "epoch": 0.5401310647507966, "flos": 26357498588160.0, "grad_norm": 2.5472658261863463, "language_loss": 0.63529658, "learning_rate": 1.8373746673435812e-06, "loss": 0.65709984, "num_input_tokens_seen": 96973110, "step": 4492, "time_per_iteration": 2.532409191131592 }, { "auxiliary_loss_clip": 0.01179352, "auxiliary_loss_mlp": 0.0102842, "balance_loss_clip": 1.05430782, "balance_loss_mlp": 1.02051091, "epoch": 0.5402513076414357, "flos": 27855332749440.0, "grad_norm": 1.661301080479555, "language_loss": 0.79172289, "learning_rate": 1.8365982875044964e-06, "loss": 0.81380063, "num_input_tokens_seen": 96993420, "step": 4493, "time_per_iteration": 2.4797379970550537 }, { "auxiliary_loss_clip": 0.01169397, "auxiliary_loss_mlp": 0.00762604, "balance_loss_clip": 1.05331743, "balance_loss_mlp": 1.00028503, "epoch": 0.5403715505320748, "flos": 22893771116160.0, "grad_norm": 1.9877258192457343, "language_loss": 0.7570402, "learning_rate": 1.8358219324533217e-06, "loss": 0.77636015, "num_input_tokens_seen": 97013685, "step": 4494, "time_per_iteration": 2.471734046936035 }, { "auxiliary_loss_clip": 0.01143205, "auxiliary_loss_mlp": 0.01025245, "balance_loss_clip": 1.04836488, "balance_loss_mlp": 1.01842606, "epoch": 0.5404917934227139, "flos": 30224143895040.0, "grad_norm": 1.875463548352243, "language_loss": 0.70485508, "learning_rate": 1.8350456023078292e-06, "loss": 0.72653961, "num_input_tokens_seen": 97036060, "step": 4495, "time_per_iteration": 3.3996193408966064 }, { "auxiliary_loss_clip": 0.01180751, "auxiliary_loss_mlp": 0.01033635, "balance_loss_clip": 1.05342209, "balance_loss_mlp": 1.02522516, "epoch": 0.540612036313353, "flos": 19938502615680.0, "grad_norm": 2.780571036390765, "language_loss": 0.78138185, "learning_rate": 1.8342692971857874e-06, "loss": 0.80352569, "num_input_tokens_seen": 97055260, "step": 4496, "time_per_iteration": 2.4275875091552734 }, { "auxiliary_loss_clip": 0.01147921, "auxiliary_loss_mlp": 0.01028973, "balance_loss_clip": 1.05072308, "balance_loss_mlp": 1.02148688, "epoch": 0.540732279203992, "flos": 24279599692800.0, "grad_norm": 2.1974390425763413, "language_loss": 0.71137136, "learning_rate": 1.833493017204962e-06, "loss": 0.73314023, "num_input_tokens_seen": 97075365, "step": 4497, "time_per_iteration": 2.558467149734497 }, { "auxiliary_loss_clip": 0.01176532, "auxiliary_loss_mlp": 0.01033103, "balance_loss_clip": 1.05226088, "balance_loss_mlp": 1.02559316, "epoch": 0.5408525220946312, "flos": 20193216935040.0, "grad_norm": 1.8664053892363894, "language_loss": 0.78216189, "learning_rate": 1.8327167624831134e-06, "loss": 0.80425823, "num_input_tokens_seen": 97093095, "step": 4498, "time_per_iteration": 2.4242374897003174 }, { "auxiliary_loss_clip": 0.01174873, "auxiliary_loss_mlp": 0.01027145, "balance_loss_clip": 1.05251908, "balance_loss_mlp": 1.01994157, "epoch": 0.5409727649852702, "flos": 24134448833280.0, "grad_norm": 1.5840676542609582, "language_loss": 0.71054977, "learning_rate": 1.831940533137999e-06, "loss": 0.73256993, "num_input_tokens_seen": 97112000, "step": 4499, "time_per_iteration": 3.3257219791412354 }, { "auxiliary_loss_clip": 0.01161919, "auxiliary_loss_mlp": 0.01026878, "balance_loss_clip": 1.05461526, "balance_loss_mlp": 1.01951051, "epoch": 0.5410930078759093, "flos": 23912700220800.0, "grad_norm": 1.6962871266940485, "language_loss": 0.72297829, "learning_rate": 1.8311643292873718e-06, "loss": 0.74486625, "num_input_tokens_seen": 97130820, "step": 4500, "time_per_iteration": 2.4935522079467773 }, { "auxiliary_loss_clip": 0.01158953, "auxiliary_loss_mlp": 0.01027909, "balance_loss_clip": 1.05154514, "balance_loss_mlp": 1.02086663, "epoch": 0.5412132507665485, "flos": 21105132445440.0, "grad_norm": 2.0953335547982097, "language_loss": 0.87982357, "learning_rate": 1.8303881510489818e-06, "loss": 0.90169221, "num_input_tokens_seen": 97149210, "step": 4501, "time_per_iteration": 3.255511522293091 }, { "auxiliary_loss_clip": 0.01150309, "auxiliary_loss_mlp": 0.01027774, "balance_loss_clip": 1.05197513, "balance_loss_mlp": 1.01922679, "epoch": 0.5413334936571875, "flos": 30227340205440.0, "grad_norm": 2.464181921881129, "language_loss": 0.69331312, "learning_rate": 1.829611998540574e-06, "loss": 0.71509397, "num_input_tokens_seen": 97170415, "step": 4502, "time_per_iteration": 2.5475172996520996 }, { "auxiliary_loss_clip": 0.01163828, "auxiliary_loss_mlp": 0.0076218, "balance_loss_clip": 1.05030084, "balance_loss_mlp": 1.00030172, "epoch": 0.5414537365478266, "flos": 24279635606400.0, "grad_norm": 2.1929211885137927, "language_loss": 0.79975057, "learning_rate": 1.8288358718798914e-06, "loss": 0.81901073, "num_input_tokens_seen": 97189605, "step": 4503, "time_per_iteration": 2.4653518199920654 }, { "auxiliary_loss_clip": 0.01155891, "auxiliary_loss_mlp": 0.00761946, "balance_loss_clip": 1.04989314, "balance_loss_mlp": 1.00035596, "epoch": 0.5415739794384657, "flos": 16654543735680.0, "grad_norm": 1.7906156978222616, "language_loss": 0.72438109, "learning_rate": 1.8280597711846703e-06, "loss": 0.74355948, "num_input_tokens_seen": 97207845, "step": 4504, "time_per_iteration": 2.4137439727783203 }, { "auxiliary_loss_clip": 0.01161612, "auxiliary_loss_mlp": 0.01022528, "balance_loss_clip": 1.05430222, "balance_loss_mlp": 1.01501155, "epoch": 0.5416942223291048, "flos": 23185724860800.0, "grad_norm": 1.7553121924385462, "language_loss": 0.83786285, "learning_rate": 1.8272836965726455e-06, "loss": 0.8597042, "num_input_tokens_seen": 97226780, "step": 4505, "time_per_iteration": 3.207233190536499 }, { "auxiliary_loss_clip": 0.01104327, "auxiliary_loss_mlp": 0.01027141, "balance_loss_clip": 1.04199433, "balance_loss_mlp": 1.01915944, "epoch": 0.5418144652197439, "flos": 20303247271680.0, "grad_norm": 1.7384649819730214, "language_loss": 0.78313279, "learning_rate": 1.8265076481615461e-06, "loss": 0.80444747, "num_input_tokens_seen": 97246695, "step": 4506, "time_per_iteration": 2.5814051628112793 }, { "auxiliary_loss_clip": 0.01147569, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.05069005, "balance_loss_mlp": 1.02223718, "epoch": 0.541934708110383, "flos": 12458633431680.0, "grad_norm": 2.161994067558727, "language_loss": 0.87442434, "learning_rate": 1.8257316260690987e-06, "loss": 0.89620566, "num_input_tokens_seen": 97264480, "step": 4507, "time_per_iteration": 2.5089099407196045 }, { "auxiliary_loss_clip": 0.01161769, "auxiliary_loss_mlp": 0.01017635, "balance_loss_clip": 1.05010569, "balance_loss_mlp": 1.01073325, "epoch": 0.5420549510010221, "flos": 21253802837760.0, "grad_norm": 1.4342178975487165, "language_loss": 0.75699228, "learning_rate": 1.8249556304130254e-06, "loss": 0.7787863, "num_input_tokens_seen": 97285760, "step": 4508, "time_per_iteration": 2.461242198944092 }, { "auxiliary_loss_clip": 0.01138354, "auxiliary_loss_mlp": 0.01029662, "balance_loss_clip": 1.04738355, "balance_loss_mlp": 1.02181256, "epoch": 0.5421751938916611, "flos": 29490524519040.0, "grad_norm": 1.9113721024560573, "language_loss": 0.68916261, "learning_rate": 1.824179661311044e-06, "loss": 0.71084273, "num_input_tokens_seen": 97304510, "step": 4509, "time_per_iteration": 2.5242104530334473 }, { "auxiliary_loss_clip": 0.01116441, "auxiliary_loss_mlp": 0.01024462, "balance_loss_clip": 1.04052949, "balance_loss_mlp": 1.01673722, "epoch": 0.5422954367823003, "flos": 18734238311040.0, "grad_norm": 1.9054684177628225, "language_loss": 0.79972827, "learning_rate": 1.823403718880868e-06, "loss": 0.82113731, "num_input_tokens_seen": 97323270, "step": 4510, "time_per_iteration": 2.5528883934020996 }, { "auxiliary_loss_clip": 0.01146336, "auxiliary_loss_mlp": 0.01029674, "balance_loss_clip": 1.04513681, "balance_loss_mlp": 1.02189541, "epoch": 0.5424156796729394, "flos": 39969006940800.0, "grad_norm": 1.584981514043345, "language_loss": 0.66644168, "learning_rate": 1.822627803240207e-06, "loss": 0.68820179, "num_input_tokens_seen": 97345600, "step": 4511, "time_per_iteration": 2.646287441253662 }, { "auxiliary_loss_clip": 0.01136932, "auxiliary_loss_mlp": 0.01029103, "balance_loss_clip": 1.04874229, "balance_loss_mlp": 1.02182531, "epoch": 0.5425359225635784, "flos": 11546538353280.0, "grad_norm": 2.1479674754870053, "language_loss": 0.84785694, "learning_rate": 1.8218519145067675e-06, "loss": 0.86951733, "num_input_tokens_seen": 97361220, "step": 4512, "time_per_iteration": 2.4993834495544434 }, { "auxiliary_loss_clip": 0.01126343, "auxiliary_loss_mlp": 0.01028071, "balance_loss_clip": 1.04443717, "balance_loss_mlp": 1.02036405, "epoch": 0.5426561654542175, "flos": 20229702174720.0, "grad_norm": 2.1856135700902257, "language_loss": 0.89349431, "learning_rate": 1.8210760527982508e-06, "loss": 0.91503847, "num_input_tokens_seen": 97381505, "step": 4513, "time_per_iteration": 2.519304037094116 }, { "auxiliary_loss_clip": 0.01149375, "auxiliary_loss_mlp": 0.00762314, "balance_loss_clip": 1.05110645, "balance_loss_mlp": 1.00032353, "epoch": 0.5427764083448566, "flos": 21871681614720.0, "grad_norm": 1.7386745694831183, "language_loss": 0.75077903, "learning_rate": 1.8203002182323552e-06, "loss": 0.76989591, "num_input_tokens_seen": 97399060, "step": 4514, "time_per_iteration": 2.5139713287353516 }, { "auxiliary_loss_clip": 0.01152836, "auxiliary_loss_mlp": 0.01025624, "balance_loss_clip": 1.05245519, "balance_loss_mlp": 1.01754737, "epoch": 0.5428966512354957, "flos": 19640946349440.0, "grad_norm": 1.9764111876447796, "language_loss": 0.7580114, "learning_rate": 1.819524410926773e-06, "loss": 0.779796, "num_input_tokens_seen": 97416740, "step": 4515, "time_per_iteration": 2.479501247406006 }, { "auxiliary_loss_clip": 0.01103346, "auxiliary_loss_mlp": 0.01027456, "balance_loss_clip": 1.04651427, "balance_loss_mlp": 1.01951098, "epoch": 0.5430168941261347, "flos": 22382187661440.0, "grad_norm": 1.4678912406432258, "language_loss": 0.77089119, "learning_rate": 1.8187486309991944e-06, "loss": 0.79219925, "num_input_tokens_seen": 97437620, "step": 4516, "time_per_iteration": 2.607738733291626 }, { "auxiliary_loss_clip": 0.01166328, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.0521723, "balance_loss_mlp": 1.0227294, "epoch": 0.5431371370167739, "flos": 18764187275520.0, "grad_norm": 1.649220096398923, "language_loss": 0.7728231, "learning_rate": 1.817972878567304e-06, "loss": 0.7947849, "num_input_tokens_seen": 97456275, "step": 4517, "time_per_iteration": 2.4406261444091797 }, { "auxiliary_loss_clip": 0.01151926, "auxiliary_loss_mlp": 0.01027294, "balance_loss_clip": 1.04838395, "balance_loss_mlp": 1.02008462, "epoch": 0.543257379907413, "flos": 18806023641600.0, "grad_norm": 1.7383614917474701, "language_loss": 0.76066166, "learning_rate": 1.8171971537487834e-06, "loss": 0.78245389, "num_input_tokens_seen": 97474925, "step": 4518, "time_per_iteration": 2.49815034866333 }, { "auxiliary_loss_clip": 0.01174344, "auxiliary_loss_mlp": 0.01031816, "balance_loss_clip": 1.05001843, "balance_loss_mlp": 1.02399635, "epoch": 0.543377622798052, "flos": 17493381025920.0, "grad_norm": 1.830820324531856, "language_loss": 0.80420929, "learning_rate": 1.8164214566613093e-06, "loss": 0.82627088, "num_input_tokens_seen": 97493550, "step": 4519, "time_per_iteration": 2.411087989807129 }, { "auxiliary_loss_clip": 0.01172077, "auxiliary_loss_mlp": 0.01023134, "balance_loss_clip": 1.04954863, "balance_loss_mlp": 1.01589847, "epoch": 0.5434978656886912, "flos": 18989311766400.0, "grad_norm": 4.114227920719771, "language_loss": 0.65828776, "learning_rate": 1.8156457874225547e-06, "loss": 0.68023992, "num_input_tokens_seen": 97512010, "step": 4520, "time_per_iteration": 2.4073808193206787 }, { "auxiliary_loss_clip": 0.01139216, "auxiliary_loss_mlp": 0.01023683, "balance_loss_clip": 1.04899454, "balance_loss_mlp": 1.01642895, "epoch": 0.5436181085793302, "flos": 17274936464640.0, "grad_norm": 2.035172728444253, "language_loss": 0.80638015, "learning_rate": 1.814870146150187e-06, "loss": 0.82800913, "num_input_tokens_seen": 97530120, "step": 4521, "time_per_iteration": 2.4470407962799072 }, { "auxiliary_loss_clip": 0.01152884, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.04701042, "balance_loss_mlp": 1.02381146, "epoch": 0.5437383514699693, "flos": 19098587917440.0, "grad_norm": 1.91983004723685, "language_loss": 0.78594959, "learning_rate": 1.814094532961871e-06, "loss": 0.80779231, "num_input_tokens_seen": 97548695, "step": 4522, "time_per_iteration": 3.2551491260528564 }, { "auxiliary_loss_clip": 0.01120395, "auxiliary_loss_mlp": 0.01028786, "balance_loss_clip": 1.04434252, "balance_loss_mlp": 1.02104342, "epoch": 0.5438585943606085, "flos": 22602715211520.0, "grad_norm": 1.846615871248621, "language_loss": 0.83428198, "learning_rate": 1.8133189479752666e-06, "loss": 0.85577381, "num_input_tokens_seen": 97567625, "step": 4523, "time_per_iteration": 2.554713249206543 }, { "auxiliary_loss_clip": 0.01139024, "auxiliary_loss_mlp": 0.01027661, "balance_loss_clip": 1.04557085, "balance_loss_mlp": 1.02073824, "epoch": 0.5439788372512475, "flos": 21798495653760.0, "grad_norm": 1.8881763630042188, "language_loss": 0.81947517, "learning_rate": 1.8125433913080292e-06, "loss": 0.84114206, "num_input_tokens_seen": 97585325, "step": 4524, "time_per_iteration": 2.479999303817749 }, { "auxiliary_loss_clip": 0.01089927, "auxiliary_loss_mlp": 0.01025363, "balance_loss_clip": 1.0459249, "balance_loss_mlp": 1.01899123, "epoch": 0.5440990801418866, "flos": 16399362539520.0, "grad_norm": 2.0509675596091728, "language_loss": 0.82522714, "learning_rate": 1.811767863077811e-06, "loss": 0.84638011, "num_input_tokens_seen": 97604275, "step": 4525, "time_per_iteration": 2.579888105392456 }, { "auxiliary_loss_clip": 0.01097783, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.04573405, "balance_loss_mlp": 1.02042603, "epoch": 0.5442193230325257, "flos": 21615638492160.0, "grad_norm": 1.6406692432042242, "language_loss": 0.78299081, "learning_rate": 1.8109923634022577e-06, "loss": 0.80424678, "num_input_tokens_seen": 97624300, "step": 4526, "time_per_iteration": 3.4092373847961426 }, { "auxiliary_loss_clip": 0.01177091, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.05138004, "balance_loss_mlp": 1.0180887, "epoch": 0.5443395659231648, "flos": 15481198062720.0, "grad_norm": 1.9315214886879715, "language_loss": 0.86531496, "learning_rate": 1.8102168923990128e-06, "loss": 0.88734448, "num_input_tokens_seen": 97637845, "step": 4527, "time_per_iteration": 2.3992555141448975 }, { "auxiliary_loss_clip": 0.01165053, "auxiliary_loss_mlp": 0.00761583, "balance_loss_clip": 1.05297613, "balance_loss_mlp": 1.00039887, "epoch": 0.5444598088138038, "flos": 18770436241920.0, "grad_norm": 1.792593614094289, "language_loss": 0.79970706, "learning_rate": 1.809441450185714e-06, "loss": 0.81897336, "num_input_tokens_seen": 97656330, "step": 4528, "time_per_iteration": 3.2790322303771973 }, { "auxiliary_loss_clip": 0.011512, "auxiliary_loss_mlp": 0.01024985, "balance_loss_clip": 1.04636526, "balance_loss_mlp": 1.01722479, "epoch": 0.544580051704443, "flos": 21142335957120.0, "grad_norm": 2.0519548152882843, "language_loss": 0.73393506, "learning_rate": 1.8086660368799958e-06, "loss": 0.75569689, "num_input_tokens_seen": 97674380, "step": 4529, "time_per_iteration": 2.4725565910339355 }, { "auxiliary_loss_clip": 0.01150644, "auxiliary_loss_mlp": 0.01020466, "balance_loss_clip": 1.05085957, "balance_loss_mlp": 1.01263428, "epoch": 0.5447002945950821, "flos": 32491508054400.0, "grad_norm": 1.7249744067724055, "language_loss": 0.77068698, "learning_rate": 1.807890652599488e-06, "loss": 0.7923981, "num_input_tokens_seen": 97698765, "step": 4530, "time_per_iteration": 2.585057258605957 }, { "auxiliary_loss_clip": 0.01172829, "auxiliary_loss_mlp": 0.0102648, "balance_loss_clip": 1.05118966, "balance_loss_mlp": 1.01974797, "epoch": 0.5448205374857211, "flos": 11798307757440.0, "grad_norm": 4.200980973443889, "language_loss": 0.82388544, "learning_rate": 1.8071152974618156e-06, "loss": 0.84587854, "num_input_tokens_seen": 97716565, "step": 4531, "time_per_iteration": 3.148620128631592 }, { "auxiliary_loss_clip": 0.01134952, "auxiliary_loss_mlp": 0.00761862, "balance_loss_clip": 1.04592037, "balance_loss_mlp": 1.00027609, "epoch": 0.5449407803763603, "flos": 24133766474880.0, "grad_norm": 2.3330085471769504, "language_loss": 0.78289199, "learning_rate": 1.806339971584599e-06, "loss": 0.80186015, "num_input_tokens_seen": 97733225, "step": 4532, "time_per_iteration": 2.5186805725097656 }, { "auxiliary_loss_clip": 0.01174376, "auxiliary_loss_mlp": 0.01025538, "balance_loss_clip": 1.05073059, "balance_loss_mlp": 1.01780176, "epoch": 0.5450610232669993, "flos": 23258551685760.0, "grad_norm": 2.000358025561131, "language_loss": 0.8523401, "learning_rate": 1.8055646750854546e-06, "loss": 0.87433922, "num_input_tokens_seen": 97752735, "step": 4533, "time_per_iteration": 2.430197238922119 }, { "auxiliary_loss_clip": 0.01149936, "auxiliary_loss_mlp": 0.01023404, "balance_loss_clip": 1.04878426, "balance_loss_mlp": 1.01578069, "epoch": 0.5451812661576384, "flos": 17785083375360.0, "grad_norm": 2.186783014702608, "language_loss": 0.81599176, "learning_rate": 1.8047894080819945e-06, "loss": 0.83772516, "num_input_tokens_seen": 97769985, "step": 4534, "time_per_iteration": 2.4424550533294678 }, { "auxiliary_loss_clip": 0.01077564, "auxiliary_loss_mlp": 0.01000203, "balance_loss_clip": 1.01713634, "balance_loss_mlp": 0.99932688, "epoch": 0.5453015090482776, "flos": 71062586513280.0, "grad_norm": 0.7222235935203467, "language_loss": 0.63152587, "learning_rate": 1.8040141706918258e-06, "loss": 0.65230346, "num_input_tokens_seen": 97831225, "step": 4535, "time_per_iteration": 3.1208040714263916 }, { "auxiliary_loss_clip": 0.01148813, "auxiliary_loss_mlp": 0.0102608, "balance_loss_clip": 1.05027533, "balance_loss_mlp": 1.01842642, "epoch": 0.5454217519389166, "flos": 25552201622400.0, "grad_norm": 1.7454166229967965, "language_loss": 0.76542628, "learning_rate": 1.8032389630325525e-06, "loss": 0.78717518, "num_input_tokens_seen": 97849975, "step": 4536, "time_per_iteration": 2.5153892040252686 }, { "auxiliary_loss_clip": 0.01144809, "auxiliary_loss_mlp": 0.01029455, "balance_loss_clip": 1.0441432, "balance_loss_mlp": 1.02153361, "epoch": 0.5455419948295557, "flos": 23658345037440.0, "grad_norm": 1.6372953379951058, "language_loss": 0.75886774, "learning_rate": 1.8024637852217707e-06, "loss": 0.78061038, "num_input_tokens_seen": 97869700, "step": 4537, "time_per_iteration": 2.5058796405792236 }, { "auxiliary_loss_clip": 0.01146537, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.04838705, "balance_loss_mlp": 1.02599454, "epoch": 0.5456622377201948, "flos": 23403989854080.0, "grad_norm": 1.8888518967027725, "language_loss": 0.84811908, "learning_rate": 1.8016886373770766e-06, "loss": 0.86992419, "num_input_tokens_seen": 97888215, "step": 4538, "time_per_iteration": 2.4896507263183594 }, { "auxiliary_loss_clip": 0.01147538, "auxiliary_loss_mlp": 0.0102518, "balance_loss_clip": 1.04890561, "balance_loss_mlp": 1.01766419, "epoch": 0.5457824806108339, "flos": 23988040997760.0, "grad_norm": 1.70619246750523, "language_loss": 0.78872037, "learning_rate": 1.8009135196160579e-06, "loss": 0.81044763, "num_input_tokens_seen": 97907090, "step": 4539, "time_per_iteration": 2.5665969848632812 }, { "auxiliary_loss_clip": 0.01129304, "auxiliary_loss_mlp": 0.01027329, "balance_loss_clip": 1.04558575, "balance_loss_mlp": 1.02077579, "epoch": 0.545902723501473, "flos": 22565870835840.0, "grad_norm": 1.6616023518629373, "language_loss": 0.84290826, "learning_rate": 1.8001384320563e-06, "loss": 0.86447465, "num_input_tokens_seen": 97927345, "step": 4540, "time_per_iteration": 2.5505895614624023 }, { "auxiliary_loss_clip": 0.01076303, "auxiliary_loss_mlp": 0.01000217, "balance_loss_clip": 1.01617897, "balance_loss_mlp": 0.99933493, "epoch": 0.5460229663921121, "flos": 55198399685760.0, "grad_norm": 0.7749301543654749, "language_loss": 0.57799071, "learning_rate": 1.7993633748153833e-06, "loss": 0.5987559, "num_input_tokens_seen": 97981950, "step": 4541, "time_per_iteration": 2.912912368774414 }, { "auxiliary_loss_clip": 0.01165002, "auxiliary_loss_mlp": 0.01028674, "balance_loss_clip": 1.05021369, "balance_loss_mlp": 1.02117836, "epoch": 0.5461432092827512, "flos": 15413866018560.0, "grad_norm": 2.0416428692083866, "language_loss": 0.73028022, "learning_rate": 1.7985883480108834e-06, "loss": 0.75221699, "num_input_tokens_seen": 97999585, "step": 4542, "time_per_iteration": 2.424006700515747 }, { "auxiliary_loss_clip": 0.01156192, "auxiliary_loss_mlp": 0.01030804, "balance_loss_clip": 1.04839957, "balance_loss_mlp": 1.02288282, "epoch": 0.5462634521733902, "flos": 24024921287040.0, "grad_norm": 1.7501600149856733, "language_loss": 0.71994466, "learning_rate": 1.797813351760371e-06, "loss": 0.74181461, "num_input_tokens_seen": 98021290, "step": 4543, "time_per_iteration": 2.481827735900879 }, { "auxiliary_loss_clip": 0.01176667, "auxiliary_loss_mlp": 0.0102296, "balance_loss_clip": 1.052508, "balance_loss_mlp": 1.01519907, "epoch": 0.5463836950640293, "flos": 22820944291200.0, "grad_norm": 1.836872192055815, "language_loss": 0.77999276, "learning_rate": 1.7970383861814116e-06, "loss": 0.80198902, "num_input_tokens_seen": 98041060, "step": 4544, "time_per_iteration": 2.439911127090454 }, { "auxiliary_loss_clip": 0.01160459, "auxiliary_loss_mlp": 0.01025248, "balance_loss_clip": 1.051036, "balance_loss_mlp": 1.01744556, "epoch": 0.5465039379546685, "flos": 20448290390400.0, "grad_norm": 1.8993314336524234, "language_loss": 0.74120528, "learning_rate": 1.7962634513915684e-06, "loss": 0.76306236, "num_input_tokens_seen": 98058410, "step": 4545, "time_per_iteration": 2.43542742729187 }, { "auxiliary_loss_clip": 0.01173234, "auxiliary_loss_mlp": 0.01021611, "balance_loss_clip": 1.0507493, "balance_loss_mlp": 1.01440763, "epoch": 0.5466241808453075, "flos": 17343310003200.0, "grad_norm": 1.8499934561082187, "language_loss": 0.79394221, "learning_rate": 1.7954885475083969e-06, "loss": 0.81589067, "num_input_tokens_seen": 98076080, "step": 4546, "time_per_iteration": 2.386650800704956 }, { "auxiliary_loss_clip": 0.01177076, "auxiliary_loss_mlp": 0.01030274, "balance_loss_clip": 1.0522691, "balance_loss_mlp": 1.02276933, "epoch": 0.5467444237359466, "flos": 21617039122560.0, "grad_norm": 2.683068920328305, "language_loss": 0.7231003, "learning_rate": 1.7947136746494513e-06, "loss": 0.74517381, "num_input_tokens_seen": 98096995, "step": 4547, "time_per_iteration": 2.4507923126220703 }, { "auxiliary_loss_clip": 0.01160442, "auxiliary_loss_mlp": 0.01028466, "balance_loss_clip": 1.05037403, "balance_loss_mlp": 1.02111673, "epoch": 0.5468646666265857, "flos": 24170467196160.0, "grad_norm": 2.0244713422737535, "language_loss": 0.87999213, "learning_rate": 1.793938832932277e-06, "loss": 0.90188122, "num_input_tokens_seen": 98115105, "step": 4548, "time_per_iteration": 2.543168544769287 }, { "auxiliary_loss_clip": 0.01175335, "auxiliary_loss_mlp": 0.01022658, "balance_loss_clip": 1.0509218, "balance_loss_mlp": 1.01533258, "epoch": 0.5469849095172248, "flos": 27527001505920.0, "grad_norm": 3.9813891389050253, "language_loss": 0.70256078, "learning_rate": 1.7931640224744185e-06, "loss": 0.72454071, "num_input_tokens_seen": 98135655, "step": 4549, "time_per_iteration": 3.3618717193603516 }, { "auxiliary_loss_clip": 0.0111812, "auxiliary_loss_mlp": 0.01024883, "balance_loss_clip": 1.04001212, "balance_loss_mlp": 1.01751029, "epoch": 0.5471051524078638, "flos": 27964680727680.0, "grad_norm": 1.6906578686327298, "language_loss": 0.73609936, "learning_rate": 1.7923892433934127e-06, "loss": 0.75752944, "num_input_tokens_seen": 98156730, "step": 4550, "time_per_iteration": 2.6062722206115723 }, { "auxiliary_loss_clip": 0.01149645, "auxiliary_loss_mlp": 0.00762469, "balance_loss_clip": 1.04947627, "balance_loss_mlp": 1.00027156, "epoch": 0.547225395298503, "flos": 18150510389760.0, "grad_norm": 1.7988293326444136, "language_loss": 0.78776646, "learning_rate": 1.7916144958067939e-06, "loss": 0.80688763, "num_input_tokens_seen": 98174590, "step": 4551, "time_per_iteration": 2.458024263381958 }, { "auxiliary_loss_clip": 0.01162338, "auxiliary_loss_mlp": 0.0102254, "balance_loss_clip": 1.04953206, "balance_loss_mlp": 1.01510155, "epoch": 0.5473456381891421, "flos": 21361498790400.0, "grad_norm": 1.6765500906810593, "language_loss": 0.78673708, "learning_rate": 1.7908397798320905e-06, "loss": 0.80858582, "num_input_tokens_seen": 98194325, "step": 4552, "time_per_iteration": 2.4699718952178955 }, { "auxiliary_loss_clip": 0.01160948, "auxiliary_loss_mlp": 0.00762508, "balance_loss_clip": 1.04991162, "balance_loss_mlp": 1.00033486, "epoch": 0.5474658810797811, "flos": 19932145908480.0, "grad_norm": 1.8051822545327347, "language_loss": 0.74956799, "learning_rate": 1.7900650955868265e-06, "loss": 0.76880252, "num_input_tokens_seen": 98213970, "step": 4553, "time_per_iteration": 3.324789524078369 }, { "auxiliary_loss_clip": 0.01160909, "auxiliary_loss_mlp": 0.0076161, "balance_loss_clip": 1.05158138, "balance_loss_mlp": 1.00024807, "epoch": 0.5475861239704203, "flos": 50476217264640.0, "grad_norm": 1.6981592132020786, "language_loss": 0.76457, "learning_rate": 1.7892904431885202e-06, "loss": 0.78379524, "num_input_tokens_seen": 98241145, "step": 4554, "time_per_iteration": 2.726081371307373 }, { "auxiliary_loss_clip": 0.01118066, "auxiliary_loss_mlp": 0.01027191, "balance_loss_clip": 1.04255903, "balance_loss_mlp": 1.02022946, "epoch": 0.5477063668610593, "flos": 20705123612160.0, "grad_norm": 2.6497862896537736, "language_loss": 0.75188208, "learning_rate": 1.788515822754686e-06, "loss": 0.77333462, "num_input_tokens_seen": 98261565, "step": 4555, "time_per_iteration": 3.524815082550049 }, { "auxiliary_loss_clip": 0.01133472, "auxiliary_loss_mlp": 0.01029459, "balance_loss_clip": 1.04474831, "balance_loss_mlp": 1.02174652, "epoch": 0.5478266097516984, "flos": 19609740408960.0, "grad_norm": 2.713399239387335, "language_loss": 0.78099608, "learning_rate": 1.7877412344028335e-06, "loss": 0.80262542, "num_input_tokens_seen": 98281370, "step": 4556, "time_per_iteration": 2.532320737838745 }, { "auxiliary_loss_clip": 0.01162166, "auxiliary_loss_mlp": 0.01021516, "balance_loss_clip": 1.04914927, "balance_loss_mlp": 1.0140655, "epoch": 0.5479468526423376, "flos": 12896599962240.0, "grad_norm": 2.202349931979879, "language_loss": 0.77485198, "learning_rate": 1.7869666782504668e-06, "loss": 0.7966888, "num_input_tokens_seen": 98297950, "step": 4557, "time_per_iteration": 2.463698387145996 }, { "auxiliary_loss_clip": 0.01132633, "auxiliary_loss_mlp": 0.01024082, "balance_loss_clip": 1.04283547, "balance_loss_mlp": 1.01650059, "epoch": 0.5480670955329766, "flos": 18588800142720.0, "grad_norm": 2.424999626467018, "language_loss": 0.68952286, "learning_rate": 1.7861921544150867e-06, "loss": 0.71109009, "num_input_tokens_seen": 98316800, "step": 4558, "time_per_iteration": 3.2716052532196045 }, { "auxiliary_loss_clip": 0.01091073, "auxiliary_loss_mlp": 0.00761889, "balance_loss_clip": 1.04247427, "balance_loss_mlp": 1.00020099, "epoch": 0.5481873384236157, "flos": 15954608338560.0, "grad_norm": 1.8777549783472427, "language_loss": 0.76459134, "learning_rate": 1.7854176630141856e-06, "loss": 0.78312099, "num_input_tokens_seen": 98333935, "step": 4559, "time_per_iteration": 2.6099140644073486 }, { "auxiliary_loss_clip": 0.01179481, "auxiliary_loss_mlp": 0.01036978, "balance_loss_clip": 1.05334163, "balance_loss_mlp": 1.02925944, "epoch": 0.5483075813142548, "flos": 22783812606720.0, "grad_norm": 2.117852096102336, "language_loss": 0.84697604, "learning_rate": 1.784643204165255e-06, "loss": 0.86914062, "num_input_tokens_seen": 98353255, "step": 4560, "time_per_iteration": 2.4737913608551025 }, { "auxiliary_loss_clip": 0.01155709, "auxiliary_loss_mlp": 0.01024309, "balance_loss_clip": 1.05070949, "balance_loss_mlp": 1.01701951, "epoch": 0.5484278242048939, "flos": 19317212046720.0, "grad_norm": 1.929608792232688, "language_loss": 0.77659905, "learning_rate": 1.7838687779857783e-06, "loss": 0.79839921, "num_input_tokens_seen": 98371130, "step": 4561, "time_per_iteration": 2.4686620235443115 }, { "auxiliary_loss_clip": 0.01138915, "auxiliary_loss_mlp": 0.01026644, "balance_loss_clip": 1.04524457, "balance_loss_mlp": 1.01859772, "epoch": 0.5485480670955329, "flos": 22816024128000.0, "grad_norm": 2.525157956433148, "language_loss": 0.63625896, "learning_rate": 1.7830943845932366e-06, "loss": 0.65791452, "num_input_tokens_seen": 98390455, "step": 4562, "time_per_iteration": 2.511744976043701 }, { "auxiliary_loss_clip": 0.01148393, "auxiliary_loss_mlp": 0.01027185, "balance_loss_clip": 1.04811239, "balance_loss_mlp": 1.02007735, "epoch": 0.5486683099861721, "flos": 22671304231680.0, "grad_norm": 1.7376921800225773, "language_loss": 0.7511009, "learning_rate": 1.7823200241051044e-06, "loss": 0.77285671, "num_input_tokens_seen": 98409370, "step": 4563, "time_per_iteration": 2.5301826000213623 }, { "auxiliary_loss_clip": 0.01174728, "auxiliary_loss_mlp": 0.01022712, "balance_loss_clip": 1.0509963, "balance_loss_mlp": 1.01518416, "epoch": 0.5487885528768112, "flos": 23149383275520.0, "grad_norm": 1.8530807796113584, "language_loss": 0.80451554, "learning_rate": 1.7815456966388513e-06, "loss": 0.82648993, "num_input_tokens_seen": 98428465, "step": 4564, "time_per_iteration": 2.4201343059539795 }, { "auxiliary_loss_clip": 0.01133188, "auxiliary_loss_mlp": 0.01030926, "balance_loss_clip": 1.04530263, "balance_loss_mlp": 1.02349341, "epoch": 0.5489087957674502, "flos": 22053928245120.0, "grad_norm": 2.242278662268403, "language_loss": 0.80838966, "learning_rate": 1.780771402311943e-06, "loss": 0.8300308, "num_input_tokens_seen": 98447300, "step": 4565, "time_per_iteration": 2.5529279708862305 }, { "auxiliary_loss_clip": 0.01145576, "auxiliary_loss_mlp": 0.01029005, "balance_loss_clip": 1.04783845, "balance_loss_mlp": 1.02119112, "epoch": 0.5490290386580894, "flos": 24315977191680.0, "grad_norm": 1.9985236507139452, "language_loss": 0.78857285, "learning_rate": 1.7799971412418374e-06, "loss": 0.81031871, "num_input_tokens_seen": 98468695, "step": 4566, "time_per_iteration": 2.515193462371826 }, { "auxiliary_loss_clip": 0.01135284, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.04826117, "balance_loss_mlp": 1.01838374, "epoch": 0.5491492815487284, "flos": 18294942977280.0, "grad_norm": 11.723893213270623, "language_loss": 0.73885345, "learning_rate": 1.7792229135459918e-06, "loss": 0.76046616, "num_input_tokens_seen": 98485345, "step": 4567, "time_per_iteration": 2.4846317768096924 }, { "auxiliary_loss_clip": 0.01045923, "auxiliary_loss_mlp": 0.01020286, "balance_loss_clip": 1.03527236, "balance_loss_mlp": 1.01878965, "epoch": 0.5492695244393675, "flos": 64550257050240.0, "grad_norm": 0.7975063242764259, "language_loss": 0.61580998, "learning_rate": 1.7784487193418538e-06, "loss": 0.63647211, "num_input_tokens_seen": 98543195, "step": 4568, "time_per_iteration": 3.056511878967285 }, { "auxiliary_loss_clip": 0.01117544, "auxiliary_loss_mlp": 0.01025708, "balance_loss_clip": 1.04118085, "balance_loss_mlp": 1.01760793, "epoch": 0.5493897673300067, "flos": 17379579761280.0, "grad_norm": 1.977251591286577, "language_loss": 0.61347437, "learning_rate": 1.7776745587468698e-06, "loss": 0.63490689, "num_input_tokens_seen": 98560620, "step": 4569, "time_per_iteration": 2.5322492122650146 }, { "auxiliary_loss_clip": 0.01172484, "auxiliary_loss_mlp": 0.01029719, "balance_loss_clip": 1.04877591, "balance_loss_mlp": 1.02223301, "epoch": 0.5495100102206457, "flos": 19901765980800.0, "grad_norm": 2.280690154085733, "language_loss": 0.81421161, "learning_rate": 1.7769004318784776e-06, "loss": 0.83623374, "num_input_tokens_seen": 98578265, "step": 4570, "time_per_iteration": 2.4282124042510986 }, { "auxiliary_loss_clip": 0.01162038, "auxiliary_loss_mlp": 0.01022539, "balance_loss_clip": 1.04976487, "balance_loss_mlp": 1.01530933, "epoch": 0.5496302531112848, "flos": 16727190992640.0, "grad_norm": 2.3230471837682507, "language_loss": 0.80471182, "learning_rate": 1.776126338854113e-06, "loss": 0.82655752, "num_input_tokens_seen": 98596055, "step": 4571, "time_per_iteration": 2.436657667160034 }, { "auxiliary_loss_clip": 0.01158951, "auxiliary_loss_mlp": 0.01026041, "balance_loss_clip": 1.05287945, "balance_loss_mlp": 1.01882625, "epoch": 0.5497504960019239, "flos": 24572343536640.0, "grad_norm": 1.6746441438583883, "language_loss": 0.84460211, "learning_rate": 1.7753522797912044e-06, "loss": 0.86645204, "num_input_tokens_seen": 98616140, "step": 4572, "time_per_iteration": 2.495299816131592 }, { "auxiliary_loss_clip": 0.01152074, "auxiliary_loss_mlp": 0.01024062, "balance_loss_clip": 1.04634762, "balance_loss_mlp": 1.01652193, "epoch": 0.549870738892563, "flos": 15450494912640.0, "grad_norm": 2.2534041290342057, "language_loss": 0.69912559, "learning_rate": 1.7745782548071765e-06, "loss": 0.72088695, "num_input_tokens_seen": 98633035, "step": 4573, "time_per_iteration": 2.463361978530884 }, { "auxiliary_loss_clip": 0.01131782, "auxiliary_loss_mlp": 0.01031766, "balance_loss_clip": 1.0534482, "balance_loss_mlp": 1.02428865, "epoch": 0.549990981783202, "flos": 21069114082560.0, "grad_norm": 1.6027613517811543, "language_loss": 0.74147856, "learning_rate": 1.7738042640194482e-06, "loss": 0.76311398, "num_input_tokens_seen": 98652700, "step": 4574, "time_per_iteration": 2.513716220855713 }, { "auxiliary_loss_clip": 0.01174342, "auxiliary_loss_mlp": 0.0102484, "balance_loss_clip": 1.05057776, "balance_loss_mlp": 1.01723456, "epoch": 0.5501112246738411, "flos": 21395901041280.0, "grad_norm": 4.283970291102364, "language_loss": 0.70605361, "learning_rate": 1.7730303075454335e-06, "loss": 0.72804546, "num_input_tokens_seen": 98671590, "step": 4575, "time_per_iteration": 2.417705535888672 }, { "auxiliary_loss_clip": 0.0113337, "auxiliary_loss_mlp": 0.01026899, "balance_loss_clip": 1.04462266, "balance_loss_mlp": 1.01873326, "epoch": 0.5502314675644803, "flos": 17456931699840.0, "grad_norm": 1.8406541595085508, "language_loss": 0.85018128, "learning_rate": 1.7722563855025402e-06, "loss": 0.87178397, "num_input_tokens_seen": 98689620, "step": 4576, "time_per_iteration": 3.239882707595825 }, { "auxiliary_loss_clip": 0.01146196, "auxiliary_loss_mlp": 0.01020372, "balance_loss_clip": 1.0441941, "balance_loss_mlp": 1.01271927, "epoch": 0.5503517104551193, "flos": 24310410583680.0, "grad_norm": 2.5152415230034992, "language_loss": 0.70804691, "learning_rate": 1.7714824980081721e-06, "loss": 0.72971261, "num_input_tokens_seen": 98708915, "step": 4577, "time_per_iteration": 2.530142068862915 }, { "auxiliary_loss_clip": 0.01158352, "auxiliary_loss_mlp": 0.01021886, "balance_loss_clip": 1.05130887, "balance_loss_mlp": 1.01495147, "epoch": 0.5504719533457584, "flos": 22419427086720.0, "grad_norm": 1.6876752239705388, "language_loss": 0.73926985, "learning_rate": 1.7707086451797276e-06, "loss": 0.76107216, "num_input_tokens_seen": 98729790, "step": 4578, "time_per_iteration": 2.4867489337921143 }, { "auxiliary_loss_clip": 0.01041873, "auxiliary_loss_mlp": 0.0100182, "balance_loss_clip": 1.01362205, "balance_loss_mlp": 1.00084198, "epoch": 0.5505921962363975, "flos": 67294155968640.0, "grad_norm": 0.6992956562704757, "language_loss": 0.52346671, "learning_rate": 1.7699348271345993e-06, "loss": 0.54390365, "num_input_tokens_seen": 98792415, "step": 4579, "time_per_iteration": 3.89481520652771 }, { "auxiliary_loss_clip": 0.01037737, "auxiliary_loss_mlp": 0.01003438, "balance_loss_clip": 1.01361394, "balance_loss_mlp": 1.00232971, "epoch": 0.5507124391270366, "flos": 45685125578880.0, "grad_norm": 3.1794472675558203, "language_loss": 0.54430765, "learning_rate": 1.7691610439901753e-06, "loss": 0.56471938, "num_input_tokens_seen": 98855350, "step": 4580, "time_per_iteration": 3.1726150512695312 }, { "auxiliary_loss_clip": 0.01163625, "auxiliary_loss_mlp": 0.01026128, "balance_loss_clip": 1.05114615, "balance_loss_mlp": 1.01914787, "epoch": 0.5508326820176757, "flos": 22273845264000.0, "grad_norm": 1.795442319083522, "language_loss": 0.75315332, "learning_rate": 1.7683872958638367e-06, "loss": 0.77505088, "num_input_tokens_seen": 98874230, "step": 4581, "time_per_iteration": 2.5027122497558594 }, { "auxiliary_loss_clip": 0.01142542, "auxiliary_loss_mlp": 0.01025634, "balance_loss_clip": 1.04526246, "balance_loss_mlp": 1.01792121, "epoch": 0.5509529249083148, "flos": 20012442762240.0, "grad_norm": 2.3341089321278923, "language_loss": 0.84708309, "learning_rate": 1.7676135828729614e-06, "loss": 0.86876488, "num_input_tokens_seen": 98893940, "step": 4582, "time_per_iteration": 3.3754777908325195 }, { "auxiliary_loss_clip": 0.0116154, "auxiliary_loss_mlp": 0.01026191, "balance_loss_clip": 1.05121076, "balance_loss_mlp": 1.01863003, "epoch": 0.5510731677989539, "flos": 21834801325440.0, "grad_norm": 2.546083563910847, "language_loss": 0.82716769, "learning_rate": 1.7668399051349205e-06, "loss": 0.84904498, "num_input_tokens_seen": 98913620, "step": 4583, "time_per_iteration": 2.481414794921875 }, { "auxiliary_loss_clip": 0.01129364, "auxiliary_loss_mlp": 0.01021433, "balance_loss_clip": 1.04554474, "balance_loss_mlp": 1.01402414, "epoch": 0.5511934106895929, "flos": 21467901853440.0, "grad_norm": 1.9774319854423994, "language_loss": 0.83366269, "learning_rate": 1.766066262767081e-06, "loss": 0.85517067, "num_input_tokens_seen": 98931460, "step": 4584, "time_per_iteration": 2.5133705139160156 }, { "auxiliary_loss_clip": 0.01143608, "auxiliary_loss_mlp": 0.01023765, "balance_loss_clip": 1.05011714, "balance_loss_mlp": 1.01636803, "epoch": 0.5513136535802321, "flos": 21068934514560.0, "grad_norm": 2.11975912492017, "language_loss": 0.77296448, "learning_rate": 1.765292655886803e-06, "loss": 0.79463822, "num_input_tokens_seen": 98950105, "step": 4585, "time_per_iteration": 3.266927719116211 }, { "auxiliary_loss_clip": 0.01137984, "auxiliary_loss_mlp": 0.01027223, "balance_loss_clip": 1.04614711, "balance_loss_mlp": 1.01968932, "epoch": 0.5514338964708712, "flos": 27815004754560.0, "grad_norm": 2.316459250751171, "language_loss": 0.70491791, "learning_rate": 1.764519084611443e-06, "loss": 0.72657001, "num_input_tokens_seen": 98970560, "step": 4586, "time_per_iteration": 2.554762840270996 }, { "auxiliary_loss_clip": 0.01144834, "auxiliary_loss_mlp": 0.01025652, "balance_loss_clip": 1.04485667, "balance_loss_mlp": 1.01716995, "epoch": 0.5515541393615102, "flos": 21908525990400.0, "grad_norm": 1.9169869126700863, "language_loss": 0.77876776, "learning_rate": 1.7637455490583505e-06, "loss": 0.80047262, "num_input_tokens_seen": 98989885, "step": 4587, "time_per_iteration": 2.5103631019592285 }, { "auxiliary_loss_clip": 0.01160474, "auxiliary_loss_mlp": 0.0102577, "balance_loss_clip": 1.05094504, "balance_loss_mlp": 1.01883531, "epoch": 0.5516743822521494, "flos": 20485422074880.0, "grad_norm": 2.1088689234778375, "language_loss": 0.77264249, "learning_rate": 1.7629720493448701e-06, "loss": 0.794505, "num_input_tokens_seen": 99007180, "step": 4588, "time_per_iteration": 2.4755096435546875 }, { "auxiliary_loss_clip": 0.01155645, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 1.04966128, "balance_loss_mlp": 1.02235818, "epoch": 0.5517946251427884, "flos": 14940383915520.0, "grad_norm": 2.221206655272309, "language_loss": 0.85483432, "learning_rate": 1.7621985855883418e-06, "loss": 0.87668824, "num_input_tokens_seen": 99023880, "step": 4589, "time_per_iteration": 2.4562203884124756 }, { "auxiliary_loss_clip": 0.01141417, "auxiliary_loss_mlp": 0.01022181, "balance_loss_clip": 1.0477711, "balance_loss_mlp": 1.0145998, "epoch": 0.5519148680334275, "flos": 18404865573120.0, "grad_norm": 1.9633910655858247, "language_loss": 0.72875792, "learning_rate": 1.7614251579060983e-06, "loss": 0.75039393, "num_input_tokens_seen": 99042475, "step": 4590, "time_per_iteration": 2.4668638706207275 }, { "auxiliary_loss_clip": 0.01135538, "auxiliary_loss_mlp": 0.01025429, "balance_loss_clip": 1.04711854, "balance_loss_mlp": 1.01762688, "epoch": 0.5520351109240667, "flos": 25113337251840.0, "grad_norm": 1.7528165984326978, "language_loss": 0.84657156, "learning_rate": 1.76065176641547e-06, "loss": 0.86818123, "num_input_tokens_seen": 99065185, "step": 4591, "time_per_iteration": 2.5800955295562744 }, { "auxiliary_loss_clip": 0.01159643, "auxiliary_loss_mlp": 0.01022772, "balance_loss_clip": 1.04646087, "balance_loss_mlp": 1.01492786, "epoch": 0.5521553538147057, "flos": 21069545045760.0, "grad_norm": 1.8201871631045887, "language_loss": 0.78194022, "learning_rate": 1.759878411233777e-06, "loss": 0.80376434, "num_input_tokens_seen": 99083645, "step": 4592, "time_per_iteration": 2.4513750076293945 }, { "auxiliary_loss_clip": 0.01158278, "auxiliary_loss_mlp": 0.01023502, "balance_loss_clip": 1.04787374, "balance_loss_mlp": 1.0156281, "epoch": 0.5522755967053448, "flos": 18879999701760.0, "grad_norm": 2.23111348106347, "language_loss": 0.75911099, "learning_rate": 1.7591050924783388e-06, "loss": 0.78092873, "num_input_tokens_seen": 99100835, "step": 4593, "time_per_iteration": 2.435822010040283 }, { "auxiliary_loss_clip": 0.01035245, "auxiliary_loss_mlp": 0.0100228, "balance_loss_clip": 1.01448512, "balance_loss_mlp": 1.0012188, "epoch": 0.5523958395959839, "flos": 64675622494080.0, "grad_norm": 0.8409316010093588, "language_loss": 0.57956284, "learning_rate": 1.7583318102664661e-06, "loss": 0.59993804, "num_input_tokens_seen": 99168400, "step": 4594, "time_per_iteration": 3.162726879119873 }, { "auxiliary_loss_clip": 0.01163231, "auxiliary_loss_mlp": 0.01027378, "balance_loss_clip": 1.04684472, "balance_loss_mlp": 1.01982021, "epoch": 0.552516082486623, "flos": 10889732211840.0, "grad_norm": 2.0834189121283617, "language_loss": 0.7913065, "learning_rate": 1.757558564715466e-06, "loss": 0.81321263, "num_input_tokens_seen": 99186475, "step": 4595, "time_per_iteration": 2.448741912841797 }, { "auxiliary_loss_clip": 0.01161826, "auxiliary_loss_mlp": 0.01026472, "balance_loss_clip": 1.04710138, "balance_loss_mlp": 1.01874757, "epoch": 0.552636325377262, "flos": 22199797376640.0, "grad_norm": 2.5948709360976223, "language_loss": 0.74100387, "learning_rate": 1.7567853559426386e-06, "loss": 0.76288682, "num_input_tokens_seen": 99203525, "step": 4596, "time_per_iteration": 2.4705755710601807 }, { "auxiliary_loss_clip": 0.01162587, "auxiliary_loss_mlp": 0.0102786, "balance_loss_clip": 1.04906225, "balance_loss_mlp": 1.02059984, "epoch": 0.5527565682679012, "flos": 23988184652160.0, "grad_norm": 1.954694638151164, "language_loss": 0.75518489, "learning_rate": 1.7560121840652797e-06, "loss": 0.77708936, "num_input_tokens_seen": 99222910, "step": 4597, "time_per_iteration": 2.4726083278656006 }, { "auxiliary_loss_clip": 0.01123301, "auxiliary_loss_mlp": 0.01021177, "balance_loss_clip": 1.04606032, "balance_loss_mlp": 1.01351833, "epoch": 0.5528768111585403, "flos": 19719267955200.0, "grad_norm": 1.7970635243716697, "language_loss": 0.69439077, "learning_rate": 1.7552390492006782e-06, "loss": 0.71583557, "num_input_tokens_seen": 99241230, "step": 4598, "time_per_iteration": 2.5285274982452393 }, { "auxiliary_loss_clip": 0.0112505, "auxiliary_loss_mlp": 0.00761991, "balance_loss_clip": 1.04282832, "balance_loss_mlp": 1.00033128, "epoch": 0.5529970540491793, "flos": 26215975002240.0, "grad_norm": 2.841268767851113, "language_loss": 0.65259719, "learning_rate": 1.7544659514661184e-06, "loss": 0.67146754, "num_input_tokens_seen": 99264320, "step": 4599, "time_per_iteration": 2.6457877159118652 }, { "auxiliary_loss_clip": 0.01142168, "auxiliary_loss_mlp": 0.01022629, "balance_loss_clip": 1.04485798, "balance_loss_mlp": 1.01534522, "epoch": 0.5531172969398185, "flos": 24425971614720.0, "grad_norm": 2.0676723586161323, "language_loss": 0.79505628, "learning_rate": 1.7536928909788786e-06, "loss": 0.81670427, "num_input_tokens_seen": 99283625, "step": 4600, "time_per_iteration": 2.519928455352783 }, { "auxiliary_loss_clip": 0.01038012, "auxiliary_loss_mlp": 0.01002878, "balance_loss_clip": 1.01439595, "balance_loss_mlp": 1.00188303, "epoch": 0.5532375398304575, "flos": 64907316195840.0, "grad_norm": 0.8867001722156724, "language_loss": 0.61995786, "learning_rate": 1.752919867856231e-06, "loss": 0.64036679, "num_input_tokens_seen": 99335270, "step": 4601, "time_per_iteration": 2.9552173614501953 }, { "auxiliary_loss_clip": 0.01137935, "auxiliary_loss_mlp": 0.01024868, "balance_loss_clip": 1.04410577, "balance_loss_mlp": 1.0175426, "epoch": 0.5533577827210966, "flos": 19683105937920.0, "grad_norm": 1.7535870211720843, "language_loss": 0.78933799, "learning_rate": 1.7521468822154436e-06, "loss": 0.81096601, "num_input_tokens_seen": 99354185, "step": 4602, "time_per_iteration": 2.4831244945526123 }, { "auxiliary_loss_clip": 0.01141231, "auxiliary_loss_mlp": 0.01026506, "balance_loss_clip": 1.04819393, "balance_loss_mlp": 1.01973748, "epoch": 0.5534780256117358, "flos": 32306496076800.0, "grad_norm": 1.82088858066226, "language_loss": 0.74731058, "learning_rate": 1.751373934173777e-06, "loss": 0.76898801, "num_input_tokens_seen": 99376930, "step": 4603, "time_per_iteration": 3.4246315956115723 }, { "auxiliary_loss_clip": 0.01175575, "auxiliary_loss_mlp": 0.01025871, "balance_loss_clip": 1.04968834, "balance_loss_mlp": 1.01816392, "epoch": 0.5535982685023748, "flos": 23222425582080.0, "grad_norm": 1.5675869527216764, "language_loss": 0.73273396, "learning_rate": 1.750601023848487e-06, "loss": 0.75474846, "num_input_tokens_seen": 99397655, "step": 4604, "time_per_iteration": 2.4424405097961426 }, { "auxiliary_loss_clip": 0.01174596, "auxiliary_loss_mlp": 0.00761752, "balance_loss_clip": 1.05228043, "balance_loss_mlp": 1.00026774, "epoch": 0.5537185113930139, "flos": 24352534258560.0, "grad_norm": 2.1650850015714878, "language_loss": 0.73828447, "learning_rate": 1.749828151356823e-06, "loss": 0.75764793, "num_input_tokens_seen": 99417850, "step": 4605, "time_per_iteration": 2.438225269317627 }, { "auxiliary_loss_clip": 0.0114486, "auxiliary_loss_mlp": 0.01029615, "balance_loss_clip": 1.04667068, "balance_loss_mlp": 1.02261186, "epoch": 0.553838754283653, "flos": 23549068886400.0, "grad_norm": 1.6494187455371543, "language_loss": 0.75877815, "learning_rate": 1.7490553168160297e-06, "loss": 0.78052294, "num_input_tokens_seen": 99438920, "step": 4606, "time_per_iteration": 3.3813464641571045 }, { "auxiliary_loss_clip": 0.01144062, "auxiliary_loss_mlp": 0.0102847, "balance_loss_clip": 1.0461936, "balance_loss_mlp": 1.02014923, "epoch": 0.5539589971742921, "flos": 17275044205440.0, "grad_norm": 1.8498956039126726, "language_loss": 0.76338255, "learning_rate": 1.748282520343345e-06, "loss": 0.78510791, "num_input_tokens_seen": 99457950, "step": 4607, "time_per_iteration": 2.4512076377868652 }, { "auxiliary_loss_clip": 0.01169421, "auxiliary_loss_mlp": 0.01022544, "balance_loss_clip": 1.05077076, "balance_loss_mlp": 1.01465845, "epoch": 0.5540792400649311, "flos": 27564169104000.0, "grad_norm": 1.968105000801093, "language_loss": 0.78958642, "learning_rate": 1.7475097620560023e-06, "loss": 0.81150603, "num_input_tokens_seen": 99478015, "step": 4608, "time_per_iteration": 3.328918933868408 }, { "auxiliary_loss_clip": 0.01174592, "auxiliary_loss_mlp": 0.01022635, "balance_loss_clip": 1.05170345, "balance_loss_mlp": 1.01548803, "epoch": 0.5541994829555702, "flos": 23878657105920.0, "grad_norm": 1.8116425445954205, "language_loss": 0.71257412, "learning_rate": 1.746737042071228e-06, "loss": 0.73454636, "num_input_tokens_seen": 99496520, "step": 4609, "time_per_iteration": 2.435483694076538 }, { "auxiliary_loss_clip": 0.01142669, "auxiliary_loss_mlp": 0.01022868, "balance_loss_clip": 1.0479964, "balance_loss_mlp": 1.01517344, "epoch": 0.5543197258462094, "flos": 20115721342080.0, "grad_norm": 5.749539878932833, "language_loss": 0.79233378, "learning_rate": 1.7459643605062424e-06, "loss": 0.81398916, "num_input_tokens_seen": 99513780, "step": 4610, "time_per_iteration": 2.4782073497772217 }, { "auxiliary_loss_clip": 0.01115797, "auxiliary_loss_mlp": 0.01022765, "balance_loss_clip": 1.0455904, "balance_loss_mlp": 1.01502824, "epoch": 0.5544399687368484, "flos": 20916565021440.0, "grad_norm": 1.6165793758894886, "language_loss": 0.80664867, "learning_rate": 1.745191717478262e-06, "loss": 0.82803428, "num_input_tokens_seen": 99532360, "step": 4611, "time_per_iteration": 2.5607051849365234 }, { "auxiliary_loss_clip": 0.01143397, "auxiliary_loss_mlp": 0.01031002, "balance_loss_clip": 1.0495584, "balance_loss_mlp": 1.02327096, "epoch": 0.5545602116274875, "flos": 25518661297920.0, "grad_norm": 1.9027095472513167, "language_loss": 0.79611576, "learning_rate": 1.7444191131044948e-06, "loss": 0.81785971, "num_input_tokens_seen": 99552635, "step": 4612, "time_per_iteration": 3.260999917984009 }, { "auxiliary_loss_clip": 0.01146826, "auxiliary_loss_mlp": 0.01028122, "balance_loss_clip": 1.04976153, "balance_loss_mlp": 1.01980758, "epoch": 0.5546804545181266, "flos": 20995568985600.0, "grad_norm": 1.7553601877609761, "language_loss": 0.73065877, "learning_rate": 1.7436465475021456e-06, "loss": 0.75240827, "num_input_tokens_seen": 99572685, "step": 4613, "time_per_iteration": 2.50041127204895 }, { "auxiliary_loss_clip": 0.01123572, "auxiliary_loss_mlp": 0.01023986, "balance_loss_clip": 1.04623449, "balance_loss_mlp": 1.01608241, "epoch": 0.5548006974087657, "flos": 26833638297600.0, "grad_norm": 1.8490591092707873, "language_loss": 0.71849304, "learning_rate": 1.7428740207884111e-06, "loss": 0.73996866, "num_input_tokens_seen": 99593565, "step": 4614, "time_per_iteration": 2.595573663711548 }, { "auxiliary_loss_clip": 0.01119956, "auxiliary_loss_mlp": 0.01026869, "balance_loss_clip": 1.04569817, "balance_loss_mlp": 1.01921582, "epoch": 0.5549209402994048, "flos": 33656414031360.0, "grad_norm": 1.7744247723909636, "language_loss": 0.61129081, "learning_rate": 1.7421015330804833e-06, "loss": 0.63275909, "num_input_tokens_seen": 99613485, "step": 4615, "time_per_iteration": 2.650078296661377 }, { "auxiliary_loss_clip": 0.01174529, "auxiliary_loss_mlp": 0.01027293, "balance_loss_clip": 1.0511719, "balance_loss_mlp": 1.01981258, "epoch": 0.5550411831900439, "flos": 23769524609280.0, "grad_norm": 1.7454185104715776, "language_loss": 0.72400331, "learning_rate": 1.7413290844955475e-06, "loss": 0.74602151, "num_input_tokens_seen": 99633515, "step": 4616, "time_per_iteration": 2.481126070022583 }, { "auxiliary_loss_clip": 0.01155513, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.05171251, "balance_loss_mlp": 1.02044129, "epoch": 0.555161426080683, "flos": 21651189978240.0, "grad_norm": 1.8683043543788287, "language_loss": 0.78068757, "learning_rate": 1.7405566751507843e-06, "loss": 0.80251938, "num_input_tokens_seen": 99651560, "step": 4617, "time_per_iteration": 2.460554838180542 }, { "auxiliary_loss_clip": 0.01131125, "auxiliary_loss_mlp": 0.01032996, "balance_loss_clip": 1.04484153, "balance_loss_mlp": 1.02599287, "epoch": 0.555281668971322, "flos": 49563116605440.0, "grad_norm": 1.451190297533999, "language_loss": 0.67608857, "learning_rate": 1.7397843051633668e-06, "loss": 0.69772977, "num_input_tokens_seen": 99674255, "step": 4618, "time_per_iteration": 2.8118202686309814 }, { "auxiliary_loss_clip": 0.01155112, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.04858947, "balance_loss_mlp": 1.01961231, "epoch": 0.5554019118619612, "flos": 20741608851840.0, "grad_norm": 1.6997013659868998, "language_loss": 0.71361554, "learning_rate": 1.739011974650464e-06, "loss": 0.73543972, "num_input_tokens_seen": 99693585, "step": 4619, "time_per_iteration": 2.458878517150879 }, { "auxiliary_loss_clip": 0.01122592, "auxiliary_loss_mlp": 0.01027172, "balance_loss_clip": 1.04540467, "balance_loss_mlp": 1.01944089, "epoch": 0.5555221547526003, "flos": 25483217552640.0, "grad_norm": 3.1959499183109434, "language_loss": 0.76647395, "learning_rate": 1.7382396837292365e-06, "loss": 0.78797162, "num_input_tokens_seen": 99714045, "step": 4620, "time_per_iteration": 2.6123523712158203 }, { "auxiliary_loss_clip": 0.01175055, "auxiliary_loss_mlp": 0.01022097, "balance_loss_clip": 1.05240893, "balance_loss_mlp": 1.01437187, "epoch": 0.5556423976432393, "flos": 21762513204480.0, "grad_norm": 1.8926166248520258, "language_loss": 0.73393691, "learning_rate": 1.737467432516841e-06, "loss": 0.75590849, "num_input_tokens_seen": 99734145, "step": 4621, "time_per_iteration": 2.434607982635498 }, { "auxiliary_loss_clip": 0.01144136, "auxiliary_loss_mlp": 0.01024403, "balance_loss_clip": 1.04441214, "balance_loss_mlp": 1.0169735, "epoch": 0.5557626405338785, "flos": 24900171989760.0, "grad_norm": 2.30868781223006, "language_loss": 0.74419034, "learning_rate": 1.7366952211304274e-06, "loss": 0.7658757, "num_input_tokens_seen": 99751990, "step": 4622, "time_per_iteration": 2.510425329208374 }, { "auxiliary_loss_clip": 0.01139126, "auxiliary_loss_mlp": 0.01025484, "balance_loss_clip": 1.04648924, "balance_loss_mlp": 1.01780653, "epoch": 0.5558828834245175, "flos": 18697501676160.0, "grad_norm": 2.0231673981919944, "language_loss": 0.83958733, "learning_rate": 1.735923049687139e-06, "loss": 0.86123341, "num_input_tokens_seen": 99768565, "step": 4623, "time_per_iteration": 2.466840982437134 }, { "auxiliary_loss_clip": 0.01141118, "auxiliary_loss_mlp": 0.01025308, "balance_loss_clip": 1.04664481, "balance_loss_mlp": 1.01798511, "epoch": 0.5560031263151566, "flos": 27272179445760.0, "grad_norm": 1.4503057712483887, "language_loss": 0.73727715, "learning_rate": 1.7351509183041144e-06, "loss": 0.75894141, "num_input_tokens_seen": 99788895, "step": 4624, "time_per_iteration": 2.5360922813415527 }, { "auxiliary_loss_clip": 0.01176778, "auxiliary_loss_mlp": 0.01023867, "balance_loss_clip": 1.05155659, "balance_loss_mlp": 1.01664829, "epoch": 0.5561233692057957, "flos": 23403738458880.0, "grad_norm": 1.6291736817877638, "language_loss": 0.71514523, "learning_rate": 1.7343788270984852e-06, "loss": 0.73715168, "num_input_tokens_seen": 99808035, "step": 4625, "time_per_iteration": 2.441911458969116 }, { "auxiliary_loss_clip": 0.01143773, "auxiliary_loss_mlp": 0.01023015, "balance_loss_clip": 1.0490911, "balance_loss_mlp": 1.01510882, "epoch": 0.5562436120964348, "flos": 37670867804160.0, "grad_norm": 1.8309422499479109, "language_loss": 0.74689662, "learning_rate": 1.7336067761873764e-06, "loss": 0.76856446, "num_input_tokens_seen": 99830460, "step": 4626, "time_per_iteration": 2.6328115463256836 }, { "auxiliary_loss_clip": 0.01169926, "auxiliary_loss_mlp": 0.01029226, "balance_loss_clip": 1.05162418, "balance_loss_mlp": 1.02109575, "epoch": 0.5563638549870739, "flos": 25155245445120.0, "grad_norm": 1.9123073967380806, "language_loss": 0.76350218, "learning_rate": 1.7328347656879076e-06, "loss": 0.78549373, "num_input_tokens_seen": 99850320, "step": 4627, "time_per_iteration": 2.504725694656372 }, { "auxiliary_loss_clip": 0.01132678, "auxiliary_loss_mlp": 0.01021946, "balance_loss_clip": 1.04657364, "balance_loss_mlp": 1.01391721, "epoch": 0.556484097877713, "flos": 13581810783360.0, "grad_norm": 2.270634774073174, "language_loss": 0.67934072, "learning_rate": 1.7320627957171927e-06, "loss": 0.70088696, "num_input_tokens_seen": 99864980, "step": 4628, "time_per_iteration": 2.4655189514160156 }, { "auxiliary_loss_clip": 0.01176321, "auxiliary_loss_mlp": 0.01024437, "balance_loss_clip": 1.05267274, "balance_loss_mlp": 1.01743913, "epoch": 0.5566043407683521, "flos": 24681368292480.0, "grad_norm": 4.885454474018997, "language_loss": 0.8158983, "learning_rate": 1.7312908663923382e-06, "loss": 0.83790582, "num_input_tokens_seen": 99881155, "step": 4629, "time_per_iteration": 2.449221134185791 }, { "auxiliary_loss_clip": 0.01155254, "auxiliary_loss_mlp": 0.0102506, "balance_loss_clip": 1.04774356, "balance_loss_mlp": 1.01722527, "epoch": 0.5567245836589911, "flos": 20588161950720.0, "grad_norm": 2.6351390872886213, "language_loss": 0.67132974, "learning_rate": 1.7305189778304463e-06, "loss": 0.69313288, "num_input_tokens_seen": 99899330, "step": 4630, "time_per_iteration": 3.199492931365967 }, { "auxiliary_loss_clip": 0.01149082, "auxiliary_loss_mlp": 0.01028249, "balance_loss_clip": 1.05212951, "balance_loss_mlp": 1.02062833, "epoch": 0.5568448265496303, "flos": 20704189858560.0, "grad_norm": 1.7926391479239319, "language_loss": 0.79410577, "learning_rate": 1.729747130148611e-06, "loss": 0.81587911, "num_input_tokens_seen": 99918525, "step": 4631, "time_per_iteration": 2.484035015106201 }, { "auxiliary_loss_clip": 0.01138729, "auxiliary_loss_mlp": 0.01026581, "balance_loss_clip": 1.0478282, "balance_loss_mlp": 1.01826048, "epoch": 0.5569650694402694, "flos": 25302910256640.0, "grad_norm": 1.8188745188738866, "language_loss": 0.76582205, "learning_rate": 1.7289753234639208e-06, "loss": 0.78747523, "num_input_tokens_seen": 99937500, "step": 4632, "time_per_iteration": 2.560176372528076 }, { "auxiliary_loss_clip": 0.01167092, "auxiliary_loss_mlp": 0.01026494, "balance_loss_clip": 1.05225813, "balance_loss_mlp": 1.01866245, "epoch": 0.5570853123309084, "flos": 19712623939200.0, "grad_norm": 3.534945863279641, "language_loss": 0.76477939, "learning_rate": 1.7282035578934592e-06, "loss": 0.78671527, "num_input_tokens_seen": 99955665, "step": 4633, "time_per_iteration": 3.321467161178589 }, { "auxiliary_loss_clip": 0.01140989, "auxiliary_loss_mlp": 0.01036001, "balance_loss_clip": 1.05089867, "balance_loss_mlp": 1.02868712, "epoch": 0.5572055552215476, "flos": 16108091153280.0, "grad_norm": 1.8059749404313463, "language_loss": 0.78944975, "learning_rate": 1.727431833554301e-06, "loss": 0.81121969, "num_input_tokens_seen": 99974140, "step": 4634, "time_per_iteration": 2.521886110305786 }, { "auxiliary_loss_clip": 0.01114021, "auxiliary_loss_mlp": 0.01024146, "balance_loss_clip": 1.04610968, "balance_loss_mlp": 1.01646328, "epoch": 0.5573257981121866, "flos": 17128815937920.0, "grad_norm": 1.791886050918992, "language_loss": 0.77460384, "learning_rate": 1.7266601505635175e-06, "loss": 0.79598558, "num_input_tokens_seen": 99991480, "step": 4635, "time_per_iteration": 3.4280803203582764 }, { "auxiliary_loss_clip": 0.01162694, "auxiliary_loss_mlp": 0.01026877, "balance_loss_clip": 1.05179167, "balance_loss_mlp": 1.01921773, "epoch": 0.5574460410028257, "flos": 18807029222400.0, "grad_norm": 2.4125204917444436, "language_loss": 0.75997543, "learning_rate": 1.7258885090381717e-06, "loss": 0.7818712, "num_input_tokens_seen": 100009520, "step": 4636, "time_per_iteration": 2.451374053955078 }, { "auxiliary_loss_clip": 0.01150358, "auxiliary_loss_mlp": 0.01028099, "balance_loss_clip": 1.04871297, "balance_loss_mlp": 1.02099407, "epoch": 0.5575662838934649, "flos": 29642678530560.0, "grad_norm": 1.8321429816259176, "language_loss": 0.7845186, "learning_rate": 1.7251169090953213e-06, "loss": 0.80630314, "num_input_tokens_seen": 100029995, "step": 4637, "time_per_iteration": 2.5829689502716064 }, { "auxiliary_loss_clip": 0.01161407, "auxiliary_loss_mlp": 0.01024314, "balance_loss_clip": 1.04985428, "balance_loss_mlp": 1.01630306, "epoch": 0.5576865267841039, "flos": 22054466949120.0, "grad_norm": 2.5592453917876905, "language_loss": 0.75951588, "learning_rate": 1.7243453508520168e-06, "loss": 0.78137308, "num_input_tokens_seen": 100046980, "step": 4638, "time_per_iteration": 3.157209634780884 }, { "auxiliary_loss_clip": 0.01147754, "auxiliary_loss_mlp": 0.01027883, "balance_loss_clip": 1.04773486, "balance_loss_mlp": 1.02013993, "epoch": 0.557806769674743, "flos": 17196040241280.0, "grad_norm": 1.8866592588179323, "language_loss": 0.8457588, "learning_rate": 1.7235738344253038e-06, "loss": 0.86751521, "num_input_tokens_seen": 100060610, "step": 4639, "time_per_iteration": 2.4657938480377197 }, { "auxiliary_loss_clip": 0.01164245, "auxiliary_loss_mlp": 0.01028033, "balance_loss_clip": 1.055089, "balance_loss_mlp": 1.01990926, "epoch": 0.557927012565382, "flos": 24712717887360.0, "grad_norm": 3.1801102619615538, "language_loss": 0.82909626, "learning_rate": 1.72280235993222e-06, "loss": 0.85101902, "num_input_tokens_seen": 100078915, "step": 4640, "time_per_iteration": 2.508254051208496 }, { "auxiliary_loss_clip": 0.01157918, "auxiliary_loss_mlp": 0.00762758, "balance_loss_clip": 1.04921222, "balance_loss_mlp": 1.00029778, "epoch": 0.5580472554560212, "flos": 16983090460800.0, "grad_norm": 2.2492420488409355, "language_loss": 0.69526637, "learning_rate": 1.722030927489798e-06, "loss": 0.71447319, "num_input_tokens_seen": 100096195, "step": 4641, "time_per_iteration": 2.440819501876831 }, { "auxiliary_loss_clip": 0.011362, "auxiliary_loss_mlp": 0.01022272, "balance_loss_clip": 1.05016398, "balance_loss_mlp": 1.01445508, "epoch": 0.5581674983466602, "flos": 23509100027520.0, "grad_norm": 1.6734723263534574, "language_loss": 0.74127352, "learning_rate": 1.7212595372150634e-06, "loss": 0.76285827, "num_input_tokens_seen": 100116175, "step": 4642, "time_per_iteration": 2.5422399044036865 }, { "auxiliary_loss_clip": 0.01178055, "auxiliary_loss_mlp": 0.01025011, "balance_loss_clip": 1.05357552, "balance_loss_mlp": 1.01781082, "epoch": 0.5582877412372993, "flos": 13480291969920.0, "grad_norm": 2.3068966582922714, "language_loss": 0.72841978, "learning_rate": 1.720488189225035e-06, "loss": 0.75045049, "num_input_tokens_seen": 100133875, "step": 4643, "time_per_iteration": 2.3839259147644043 }, { "auxiliary_loss_clip": 0.01163627, "auxiliary_loss_mlp": 0.010264, "balance_loss_clip": 1.05009913, "balance_loss_mlp": 1.01868665, "epoch": 0.5584079841279385, "flos": 21903605827200.0, "grad_norm": 2.1320329244769236, "language_loss": 0.79387081, "learning_rate": 1.7197168836367265e-06, "loss": 0.8157711, "num_input_tokens_seen": 100150685, "step": 4644, "time_per_iteration": 2.4428772926330566 }, { "auxiliary_loss_clip": 0.01157057, "auxiliary_loss_mlp": 0.00762141, "balance_loss_clip": 1.04720819, "balance_loss_mlp": 1.00025892, "epoch": 0.5585282270185775, "flos": 18843550375680.0, "grad_norm": 1.8273082612237137, "language_loss": 0.81958282, "learning_rate": 1.7189456205671433e-06, "loss": 0.8387748, "num_input_tokens_seen": 100169530, "step": 4645, "time_per_iteration": 2.4554295539855957 }, { "auxiliary_loss_clip": 0.01170389, "auxiliary_loss_mlp": 0.01026292, "balance_loss_clip": 1.05173147, "balance_loss_mlp": 1.01820397, "epoch": 0.5586484699092166, "flos": 21868449390720.0, "grad_norm": 1.7662071525388225, "language_loss": 0.82310963, "learning_rate": 1.7181744001332866e-06, "loss": 0.84507644, "num_input_tokens_seen": 100188140, "step": 4646, "time_per_iteration": 2.4654436111450195 }, { "auxiliary_loss_clip": 0.0117644, "auxiliary_loss_mlp": 0.01025342, "balance_loss_clip": 1.05408692, "balance_loss_mlp": 1.01763546, "epoch": 0.5587687127998557, "flos": 22893232412160.0, "grad_norm": 1.7128976657847303, "language_loss": 0.63498676, "learning_rate": 1.7174032224521493e-06, "loss": 0.65700454, "num_input_tokens_seen": 100206850, "step": 4647, "time_per_iteration": 2.4171664714813232 }, { "auxiliary_loss_clip": 0.01161559, "auxiliary_loss_mlp": 0.01026541, "balance_loss_clip": 1.05078363, "balance_loss_mlp": 1.01918876, "epoch": 0.5588889556904948, "flos": 20303067703680.0, "grad_norm": 1.649069331894175, "language_loss": 0.69753414, "learning_rate": 1.7166320876407184e-06, "loss": 0.71941519, "num_input_tokens_seen": 100226270, "step": 4648, "time_per_iteration": 2.5370969772338867 }, { "auxiliary_loss_clip": 0.01179676, "auxiliary_loss_mlp": 0.00762517, "balance_loss_clip": 1.0536679, "balance_loss_mlp": 1.00035954, "epoch": 0.5590091985811338, "flos": 16472153450880.0, "grad_norm": 2.00652214296276, "language_loss": 0.67454857, "learning_rate": 1.7158609958159742e-06, "loss": 0.6939705, "num_input_tokens_seen": 100243675, "step": 4649, "time_per_iteration": 2.410348653793335 }, { "auxiliary_loss_clip": 0.01118583, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.04787612, "balance_loss_mlp": 1.02455831, "epoch": 0.559129441471773, "flos": 14532186781440.0, "grad_norm": 5.473230571034753, "language_loss": 0.78052616, "learning_rate": 1.7150899470948911e-06, "loss": 0.80203521, "num_input_tokens_seen": 100258940, "step": 4650, "time_per_iteration": 2.5439555644989014 }, { "auxiliary_loss_clip": 0.01056703, "auxiliary_loss_mlp": 0.01001233, "balance_loss_clip": 1.0197506, "balance_loss_mlp": 1.00019574, "epoch": 0.5592496843624121, "flos": 60521009852160.0, "grad_norm": 0.7966790626454702, "language_loss": 0.56700563, "learning_rate": 1.7143189415944365e-06, "loss": 0.58758503, "num_input_tokens_seen": 100323400, "step": 4651, "time_per_iteration": 3.0973634719848633 }, { "auxiliary_loss_clip": 0.01162009, "auxiliary_loss_mlp": 0.01025623, "balance_loss_clip": 1.05181646, "balance_loss_mlp": 1.01738596, "epoch": 0.5593699272530511, "flos": 20886256920960.0, "grad_norm": 1.632918942786101, "language_loss": 0.7624889, "learning_rate": 1.7135479794315714e-06, "loss": 0.78436518, "num_input_tokens_seen": 100340355, "step": 4652, "time_per_iteration": 2.449916362762451 }, { "auxiliary_loss_clip": 0.01132379, "auxiliary_loss_mlp": 0.01022603, "balance_loss_clip": 1.04853582, "balance_loss_mlp": 1.01524162, "epoch": 0.5594901701436903, "flos": 12896743616640.0, "grad_norm": 1.918269431886607, "language_loss": 0.79292631, "learning_rate": 1.7127770607232502e-06, "loss": 0.81447613, "num_input_tokens_seen": 100358900, "step": 4653, "time_per_iteration": 2.5069708824157715 }, { "auxiliary_loss_clip": 0.01141167, "auxiliary_loss_mlp": 0.01024676, "balance_loss_clip": 1.04892516, "balance_loss_mlp": 1.01671898, "epoch": 0.5596104130343293, "flos": 23112107936640.0, "grad_norm": 20.5625506750041, "language_loss": 0.79580224, "learning_rate": 1.7120061855864204e-06, "loss": 0.81746072, "num_input_tokens_seen": 100378910, "step": 4654, "time_per_iteration": 2.54665470123291 }, { "auxiliary_loss_clip": 0.01164816, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.05463767, "balance_loss_mlp": 1.02052474, "epoch": 0.5597306559249684, "flos": 25957812977280.0, "grad_norm": 5.0736947359311095, "language_loss": 0.70961249, "learning_rate": 1.7112353541380233e-06, "loss": 0.73154372, "num_input_tokens_seen": 100398770, "step": 4655, "time_per_iteration": 2.4963390827178955 }, { "auxiliary_loss_clip": 0.0115177, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.05194855, "balance_loss_mlp": 1.02184474, "epoch": 0.5598508988156076, "flos": 22492289825280.0, "grad_norm": 1.487328339271208, "language_loss": 0.72375858, "learning_rate": 1.7104645664949931e-06, "loss": 0.74558198, "num_input_tokens_seen": 100421240, "step": 4656, "time_per_iteration": 3.320931911468506 }, { "auxiliary_loss_clip": 0.01151403, "auxiliary_loss_mlp": 0.01027767, "balance_loss_clip": 1.04784155, "balance_loss_mlp": 1.01947641, "epoch": 0.5599711417062466, "flos": 23112538899840.0, "grad_norm": 2.0556188410887994, "language_loss": 0.71332437, "learning_rate": 1.7096938227742584e-06, "loss": 0.73511606, "num_input_tokens_seen": 100442370, "step": 4657, "time_per_iteration": 2.521167039871216 }, { "auxiliary_loss_clip": 0.01178348, "auxiliary_loss_mlp": 0.01026276, "balance_loss_clip": 1.05337214, "balance_loss_mlp": 1.01842594, "epoch": 0.5600913845968857, "flos": 22339345714560.0, "grad_norm": 1.8397435219302765, "language_loss": 0.84366721, "learning_rate": 1.70892312309274e-06, "loss": 0.86571342, "num_input_tokens_seen": 100460260, "step": 4658, "time_per_iteration": 2.433173894882202 }, { "auxiliary_loss_clip": 0.01147959, "auxiliary_loss_mlp": 0.0102594, "balance_loss_clip": 1.04343128, "balance_loss_mlp": 1.01782179, "epoch": 0.5602116274875248, "flos": 17633791290240.0, "grad_norm": 2.975392959743797, "language_loss": 0.68112212, "learning_rate": 1.7081524675673523e-06, "loss": 0.70286107, "num_input_tokens_seen": 100475750, "step": 4659, "time_per_iteration": 2.4482812881469727 }, { "auxiliary_loss_clip": 0.01056263, "auxiliary_loss_mlp": 0.0100064, "balance_loss_clip": 1.01519918, "balance_loss_mlp": 0.99962026, "epoch": 0.5603318703781639, "flos": 70115945529600.0, "grad_norm": 0.7713027304735945, "language_loss": 0.5960052, "learning_rate": 1.7073818563150026e-06, "loss": 0.61657417, "num_input_tokens_seen": 100537830, "step": 4660, "time_per_iteration": 3.9954657554626465 }, { "auxiliary_loss_clip": 0.01159389, "auxiliary_loss_mlp": 0.01025404, "balance_loss_clip": 1.04952502, "balance_loss_mlp": 1.01765525, "epoch": 0.560452113268803, "flos": 18545850455040.0, "grad_norm": 2.1787942141850234, "language_loss": 0.86644322, "learning_rate": 1.7066112894525935e-06, "loss": 0.88829112, "num_input_tokens_seen": 100555910, "step": 4661, "time_per_iteration": 2.4842474460601807 }, { "auxiliary_loss_clip": 0.01141045, "auxiliary_loss_mlp": 0.01031546, "balance_loss_clip": 1.04810047, "balance_loss_mlp": 1.02371418, "epoch": 0.5605723561594421, "flos": 25264665250560.0, "grad_norm": 1.6296382630935142, "language_loss": 0.72843611, "learning_rate": 1.7058407670970177e-06, "loss": 0.75016201, "num_input_tokens_seen": 100577385, "step": 4662, "time_per_iteration": 3.357377529144287 }, { "auxiliary_loss_clip": 0.01167646, "auxiliary_loss_mlp": 0.01029488, "balance_loss_clip": 1.05021274, "balance_loss_mlp": 1.02133441, "epoch": 0.5606925990500812, "flos": 20594949621120.0, "grad_norm": 1.6277637897738084, "language_loss": 0.60955441, "learning_rate": 1.7050702893651643e-06, "loss": 0.63152575, "num_input_tokens_seen": 100596965, "step": 4663, "time_per_iteration": 2.4452908039093018 }, { "auxiliary_loss_clip": 0.01163357, "auxiliary_loss_mlp": 0.01026493, "balance_loss_clip": 1.05200791, "balance_loss_mlp": 1.01842904, "epoch": 0.5608128419407202, "flos": 35006044677120.0, "grad_norm": 2.5419460593517735, "language_loss": 0.76049721, "learning_rate": 1.7042998563739134e-06, "loss": 0.78239572, "num_input_tokens_seen": 100615315, "step": 4664, "time_per_iteration": 2.560274362564087 }, { "auxiliary_loss_clip": 0.01155389, "auxiliary_loss_mlp": 0.0103225, "balance_loss_clip": 1.04751158, "balance_loss_mlp": 1.0241617, "epoch": 0.5609330848313594, "flos": 24639819235200.0, "grad_norm": 2.256913655105567, "language_loss": 0.71309018, "learning_rate": 1.703529468240139e-06, "loss": 0.73496658, "num_input_tokens_seen": 100634185, "step": 4665, "time_per_iteration": 3.2564074993133545 }, { "auxiliary_loss_clip": 0.01142823, "auxiliary_loss_mlp": 0.01027203, "balance_loss_clip": 1.05012894, "balance_loss_mlp": 1.01922441, "epoch": 0.5610533277219985, "flos": 18762894385920.0, "grad_norm": 2.372843300588213, "language_loss": 0.74257457, "learning_rate": 1.7027591250807088e-06, "loss": 0.76427484, "num_input_tokens_seen": 100651360, "step": 4666, "time_per_iteration": 2.4598982334136963 }, { "auxiliary_loss_clip": 0.01181383, "auxiliary_loss_mlp": 0.01026381, "balance_loss_clip": 1.05544686, "balance_loss_mlp": 1.01853704, "epoch": 0.5611735706126375, "flos": 15012384727680.0, "grad_norm": 2.4330320735330004, "language_loss": 0.84826428, "learning_rate": 1.7019888270124825e-06, "loss": 0.8703419, "num_input_tokens_seen": 100668525, "step": 4667, "time_per_iteration": 2.3941941261291504 }, { "auxiliary_loss_clip": 0.01169649, "auxiliary_loss_mlp": 0.01029658, "balance_loss_clip": 1.05386961, "balance_loss_mlp": 1.02099109, "epoch": 0.5612938135032767, "flos": 16468167041280.0, "grad_norm": 1.7602220885835493, "language_loss": 0.82137924, "learning_rate": 1.7012185741523147e-06, "loss": 0.84337234, "num_input_tokens_seen": 100684850, "step": 4668, "time_per_iteration": 2.4194586277008057 }, { "auxiliary_loss_clip": 0.01180237, "auxiliary_loss_mlp": 0.0102903, "balance_loss_clip": 1.05491328, "balance_loss_mlp": 1.02130532, "epoch": 0.5614140563939157, "flos": 25666433850240.0, "grad_norm": 2.8924875264821504, "language_loss": 0.62510872, "learning_rate": 1.7004483666170514e-06, "loss": 0.64720142, "num_input_tokens_seen": 100705345, "step": 4669, "time_per_iteration": 2.4603211879730225 }, { "auxiliary_loss_clip": 0.01163957, "auxiliary_loss_mlp": 0.01026964, "balance_loss_clip": 1.05115509, "balance_loss_mlp": 1.01964474, "epoch": 0.5615342992845548, "flos": 24717566223360.0, "grad_norm": 2.130757627893791, "language_loss": 0.80699134, "learning_rate": 1.699678204523533e-06, "loss": 0.82890058, "num_input_tokens_seen": 100725210, "step": 4670, "time_per_iteration": 2.473994731903076 }, { "auxiliary_loss_clip": 0.01154442, "auxiliary_loss_mlp": 0.01029491, "balance_loss_clip": 1.05346835, "balance_loss_mlp": 1.02069354, "epoch": 0.5616545421751938, "flos": 22015934634240.0, "grad_norm": 3.787960458819434, "language_loss": 0.69138598, "learning_rate": 1.6989080879885918e-06, "loss": 0.71322536, "num_input_tokens_seen": 100743070, "step": 4671, "time_per_iteration": 2.504103660583496 }, { "auxiliary_loss_clip": 0.01042694, "auxiliary_loss_mlp": 0.0100182, "balance_loss_clip": 1.01499963, "balance_loss_mlp": 1.00078893, "epoch": 0.561774785065833, "flos": 53760358690560.0, "grad_norm": 0.9010901182194945, "language_loss": 0.61043864, "learning_rate": 1.6981380171290544e-06, "loss": 0.63088381, "num_input_tokens_seen": 100804095, "step": 4672, "time_per_iteration": 3.06972074508667 }, { "auxiliary_loss_clip": 0.01145017, "auxiliary_loss_mlp": 0.01028078, "balance_loss_clip": 1.04662192, "balance_loss_mlp": 1.02015662, "epoch": 0.5618950279564721, "flos": 19750007018880.0, "grad_norm": 1.9095044973685826, "language_loss": 0.74349374, "learning_rate": 1.6973679920617396e-06, "loss": 0.76522464, "num_input_tokens_seen": 100821630, "step": 4673, "time_per_iteration": 2.4905998706817627 }, { "auxiliary_loss_clip": 0.01147745, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.05116308, "balance_loss_mlp": 1.01955128, "epoch": 0.5620152708471111, "flos": 16800592435200.0, "grad_norm": 2.422210034430206, "language_loss": 0.85206586, "learning_rate": 1.6965980129034603e-06, "loss": 0.87381947, "num_input_tokens_seen": 100839015, "step": 4674, "time_per_iteration": 2.469625473022461 }, { "auxiliary_loss_clip": 0.01152973, "auxiliary_loss_mlp": 0.01025876, "balance_loss_clip": 1.0539403, "balance_loss_mlp": 1.01809764, "epoch": 0.5621355137377503, "flos": 26797799502720.0, "grad_norm": 1.5154359158780253, "language_loss": 0.76440513, "learning_rate": 1.6958280797710209e-06, "loss": 0.78619361, "num_input_tokens_seen": 100860940, "step": 4675, "time_per_iteration": 2.5471930503845215 }, { "auxiliary_loss_clip": 0.01053591, "auxiliary_loss_mlp": 0.01002585, "balance_loss_clip": 1.01535988, "balance_loss_mlp": 1.00156581, "epoch": 0.5622557566283893, "flos": 61207046686080.0, "grad_norm": 0.7154377444825417, "language_loss": 0.54814363, "learning_rate": 1.6950581927812198e-06, "loss": 0.56870538, "num_input_tokens_seen": 100920510, "step": 4676, "time_per_iteration": 2.943524122238159 }, { "auxiliary_loss_clip": 0.01164336, "auxiliary_loss_mlp": 0.01026777, "balance_loss_clip": 1.05168009, "balance_loss_mlp": 1.01879597, "epoch": 0.5623759995190284, "flos": 26468534505600.0, "grad_norm": 2.952573418583003, "language_loss": 0.79496479, "learning_rate": 1.6942883520508486e-06, "loss": 0.81687587, "num_input_tokens_seen": 100939245, "step": 4677, "time_per_iteration": 2.500335931777954 }, { "auxiliary_loss_clip": 0.01164734, "auxiliary_loss_mlp": 0.01025996, "balance_loss_clip": 1.05082297, "balance_loss_mlp": 1.01802731, "epoch": 0.5624962424096676, "flos": 19390900798080.0, "grad_norm": 2.0054358670043393, "language_loss": 0.77308154, "learning_rate": 1.693518557696691e-06, "loss": 0.79498887, "num_input_tokens_seen": 100958385, "step": 4678, "time_per_iteration": 2.434480667114258 }, { "auxiliary_loss_clip": 0.01159946, "auxiliary_loss_mlp": 0.01024886, "balance_loss_clip": 1.04851234, "balance_loss_mlp": 1.01710153, "epoch": 0.5626164853003066, "flos": 20667345482880.0, "grad_norm": 2.010738097474918, "language_loss": 0.88986361, "learning_rate": 1.6927488098355252e-06, "loss": 0.91171193, "num_input_tokens_seen": 100976015, "step": 4679, "time_per_iteration": 2.4582018852233887 }, { "auxiliary_loss_clip": 0.01038544, "auxiliary_loss_mlp": 0.01003137, "balance_loss_clip": 1.01413584, "balance_loss_mlp": 1.00193274, "epoch": 0.5627367281909457, "flos": 62766071665920.0, "grad_norm": 0.9094488422671037, "language_loss": 0.63175243, "learning_rate": 1.6919791085841201e-06, "loss": 0.65216923, "num_input_tokens_seen": 101033425, "step": 4680, "time_per_iteration": 3.0931854248046875 }, { "auxiliary_loss_clip": 0.0115805, "auxiliary_loss_mlp": 0.01030213, "balance_loss_clip": 1.04772329, "balance_loss_mlp": 1.0214572, "epoch": 0.5628569710815848, "flos": 12787144243200.0, "grad_norm": 2.2988181829739522, "language_loss": 0.78827047, "learning_rate": 1.6912094540592396e-06, "loss": 0.81015313, "num_input_tokens_seen": 101048945, "step": 4681, "time_per_iteration": 2.4543240070343018 }, { "auxiliary_loss_clip": 0.01162639, "auxiliary_loss_mlp": 0.01028973, "balance_loss_clip": 1.05067933, "balance_loss_mlp": 1.02122474, "epoch": 0.5629772139722239, "flos": 13762082165760.0, "grad_norm": 5.605968390907182, "language_loss": 0.81280792, "learning_rate": 1.6904398463776393e-06, "loss": 0.83472401, "num_input_tokens_seen": 101062745, "step": 4682, "time_per_iteration": 2.3975889682769775 }, { "auxiliary_loss_clip": 0.0116499, "auxiliary_loss_mlp": 0.01025936, "balance_loss_clip": 1.0495671, "balance_loss_mlp": 1.01804996, "epoch": 0.5630974568628629, "flos": 21467830026240.0, "grad_norm": 1.7068709996142475, "language_loss": 0.72629416, "learning_rate": 1.6896702856560683e-06, "loss": 0.7482034, "num_input_tokens_seen": 101081840, "step": 4683, "time_per_iteration": 3.284876585006714 }, { "auxiliary_loss_clip": 0.01133642, "auxiliary_loss_mlp": 0.01025367, "balance_loss_clip": 1.04509115, "balance_loss_mlp": 1.01748109, "epoch": 0.5632176997535021, "flos": 14245907385600.0, "grad_norm": 2.825214411476421, "language_loss": 0.69354212, "learning_rate": 1.6889007720112677e-06, "loss": 0.71513218, "num_input_tokens_seen": 101099585, "step": 4684, "time_per_iteration": 2.4920742511749268 }, { "auxiliary_loss_clip": 0.01167496, "auxiliary_loss_mlp": 0.01026046, "balance_loss_clip": 1.05389929, "balance_loss_mlp": 1.01875997, "epoch": 0.5633379426441412, "flos": 20812244947200.0, "grad_norm": 1.5469558476558942, "language_loss": 0.77391911, "learning_rate": 1.6881313055599734e-06, "loss": 0.79585457, "num_input_tokens_seen": 101119515, "step": 4685, "time_per_iteration": 2.456150531768799 }, { "auxiliary_loss_clip": 0.01136625, "auxiliary_loss_mlp": 0.01023181, "balance_loss_clip": 1.04566884, "balance_loss_mlp": 1.01486015, "epoch": 0.5634581855347802, "flos": 22600883617920.0, "grad_norm": 2.4028321410814883, "language_loss": 0.82260096, "learning_rate": 1.6873618864189117e-06, "loss": 0.84419894, "num_input_tokens_seen": 101135285, "step": 4686, "time_per_iteration": 3.324130058288574 }, { "auxiliary_loss_clip": 0.01163568, "auxiliary_loss_mlp": 0.01033185, "balance_loss_clip": 1.05002165, "balance_loss_mlp": 1.02490616, "epoch": 0.5635784284254194, "flos": 21506972872320.0, "grad_norm": 2.2035073149545683, "language_loss": 0.7750845, "learning_rate": 1.686592514704803e-06, "loss": 0.79705203, "num_input_tokens_seen": 101152680, "step": 4687, "time_per_iteration": 2.4573662281036377 }, { "auxiliary_loss_clip": 0.01149164, "auxiliary_loss_mlp": 0.01026419, "balance_loss_clip": 1.05315936, "balance_loss_mlp": 1.01918936, "epoch": 0.5636986713160584, "flos": 19827466698240.0, "grad_norm": 2.2018676842780556, "language_loss": 0.70977378, "learning_rate": 1.685823190534361e-06, "loss": 0.73152965, "num_input_tokens_seen": 101170920, "step": 4688, "time_per_iteration": 2.476706027984619 }, { "auxiliary_loss_clip": 0.01181615, "auxiliary_loss_mlp": 0.01024859, "balance_loss_clip": 1.05358446, "balance_loss_mlp": 1.016204, "epoch": 0.5638189142066975, "flos": 19792453916160.0, "grad_norm": 1.8715929235791309, "language_loss": 0.83937359, "learning_rate": 1.6850539140242907e-06, "loss": 0.86143827, "num_input_tokens_seen": 101190180, "step": 4689, "time_per_iteration": 3.2551026344299316 }, { "auxiliary_loss_clip": 0.01167918, "auxiliary_loss_mlp": 0.0103107, "balance_loss_clip": 1.05150843, "balance_loss_mlp": 1.0233779, "epoch": 0.5639391570973367, "flos": 22893771116160.0, "grad_norm": 1.882163472555844, "language_loss": 0.81961226, "learning_rate": 1.684284685291292e-06, "loss": 0.84160215, "num_input_tokens_seen": 101211825, "step": 4690, "time_per_iteration": 2.471123695373535 }, { "auxiliary_loss_clip": 0.01179296, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.05327809, "balance_loss_mlp": 1.02306604, "epoch": 0.5640593999879757, "flos": 23727077712000.0, "grad_norm": 1.8887818603786286, "language_loss": 0.81538939, "learning_rate": 1.683515504452055e-06, "loss": 0.83749378, "num_input_tokens_seen": 101229200, "step": 4691, "time_per_iteration": 3.263162612915039 }, { "auxiliary_loss_clip": 0.0112631, "auxiliary_loss_mlp": 0.01034121, "balance_loss_clip": 1.04475379, "balance_loss_mlp": 1.02512097, "epoch": 0.5641796428786148, "flos": 22710123855360.0, "grad_norm": 2.11691359539167, "language_loss": 0.66213882, "learning_rate": 1.6827463716232648e-06, "loss": 0.68374312, "num_input_tokens_seen": 101249860, "step": 4692, "time_per_iteration": 2.557039260864258 }, { "auxiliary_loss_clip": 0.01162309, "auxiliary_loss_mlp": 0.00762565, "balance_loss_clip": 1.05038953, "balance_loss_mlp": 1.00035405, "epoch": 0.5642998857692539, "flos": 19791987039360.0, "grad_norm": 2.8169886462108717, "language_loss": 0.75555813, "learning_rate": 1.6819772869215972e-06, "loss": 0.77480686, "num_input_tokens_seen": 101268940, "step": 4693, "time_per_iteration": 2.4532949924468994 }, { "auxiliary_loss_clip": 0.01155663, "auxiliary_loss_mlp": 0.01028177, "balance_loss_clip": 1.05228376, "balance_loss_mlp": 1.02092314, "epoch": 0.564420128659893, "flos": 23185904428800.0, "grad_norm": 1.8697942062197417, "language_loss": 0.82250357, "learning_rate": 1.6812082504637228e-06, "loss": 0.84434199, "num_input_tokens_seen": 101290260, "step": 4694, "time_per_iteration": 2.5473644733428955 }, { "auxiliary_loss_clip": 0.01161255, "auxiliary_loss_mlp": 0.01022309, "balance_loss_clip": 1.05277026, "balance_loss_mlp": 1.0144887, "epoch": 0.564540371550532, "flos": 23258264376960.0, "grad_norm": 1.6775129219736873, "language_loss": 0.74346316, "learning_rate": 1.6804392623663025e-06, "loss": 0.76529878, "num_input_tokens_seen": 101311465, "step": 4695, "time_per_iteration": 2.5016591548919678 }, { "auxiliary_loss_clip": 0.01156502, "auxiliary_loss_mlp": 0.01025042, "balance_loss_clip": 1.04980719, "balance_loss_mlp": 1.01694155, "epoch": 0.5646606144411712, "flos": 25010058672000.0, "grad_norm": 1.940493550732572, "language_loss": 0.78128088, "learning_rate": 1.6796703227459935e-06, "loss": 0.80309623, "num_input_tokens_seen": 101329420, "step": 4696, "time_per_iteration": 2.481167793273926 }, { "auxiliary_loss_clip": 0.01112191, "auxiliary_loss_mlp": 0.01024885, "balance_loss_clip": 1.04430425, "balance_loss_mlp": 1.01689839, "epoch": 0.5647808573318103, "flos": 36539645806080.0, "grad_norm": 1.7537009572840152, "language_loss": 0.7605378, "learning_rate": 1.6789014317194407e-06, "loss": 0.78190857, "num_input_tokens_seen": 101350900, "step": 4697, "time_per_iteration": 2.7260093688964844 }, { "auxiliary_loss_clip": 0.01159502, "auxiliary_loss_mlp": 0.01028812, "balance_loss_clip": 1.05382454, "balance_loss_mlp": 1.02039647, "epoch": 0.5649011002224493, "flos": 22528451842560.0, "grad_norm": 2.605412069969427, "language_loss": 0.72936368, "learning_rate": 1.6781325894032853e-06, "loss": 0.75124681, "num_input_tokens_seen": 101369860, "step": 4698, "time_per_iteration": 2.546757221221924 }, { "auxiliary_loss_clip": 0.01148021, "auxiliary_loss_mlp": 0.01032979, "balance_loss_clip": 1.05302024, "balance_loss_mlp": 1.02506351, "epoch": 0.5650213431130885, "flos": 18515147304960.0, "grad_norm": 1.9566633712524366, "language_loss": 0.92016459, "learning_rate": 1.6773637959141608e-06, "loss": 0.94197464, "num_input_tokens_seen": 101386835, "step": 4699, "time_per_iteration": 2.4633591175079346 }, { "auxiliary_loss_clip": 0.01139715, "auxiliary_loss_mlp": 0.01028008, "balance_loss_clip": 1.04763365, "balance_loss_mlp": 1.02000892, "epoch": 0.5651415860037275, "flos": 17526310819200.0, "grad_norm": 2.056013661694584, "language_loss": 0.66345894, "learning_rate": 1.6765950513686915e-06, "loss": 0.68513614, "num_input_tokens_seen": 101404945, "step": 4700, "time_per_iteration": 2.4759459495544434 }, { "auxiliary_loss_clip": 0.01121685, "auxiliary_loss_mlp": 0.01033106, "balance_loss_clip": 1.04298306, "balance_loss_mlp": 1.0247252, "epoch": 0.5652618288943666, "flos": 25520026014720.0, "grad_norm": 2.1644489644946385, "language_loss": 0.76034021, "learning_rate": 1.675826355883496e-06, "loss": 0.78188813, "num_input_tokens_seen": 101424160, "step": 4701, "time_per_iteration": 2.634275197982788 }, { "auxiliary_loss_clip": 0.01144623, "auxiliary_loss_mlp": 0.01031596, "balance_loss_clip": 1.05011082, "balance_loss_mlp": 1.02342439, "epoch": 0.5653820717850057, "flos": 19683105937920.0, "grad_norm": 2.4807865185467373, "language_loss": 0.78902924, "learning_rate": 1.6750577095751848e-06, "loss": 0.81079137, "num_input_tokens_seen": 101443270, "step": 4702, "time_per_iteration": 2.4767277240753174 }, { "auxiliary_loss_clip": 0.01175368, "auxiliary_loss_mlp": 0.01031682, "balance_loss_clip": 1.05147231, "balance_loss_mlp": 1.02386153, "epoch": 0.5655023146756448, "flos": 26979722910720.0, "grad_norm": 1.7288361627247184, "language_loss": 0.7252205, "learning_rate": 1.6742891125603605e-06, "loss": 0.74729097, "num_input_tokens_seen": 101464175, "step": 4703, "time_per_iteration": 2.464181661605835 }, { "auxiliary_loss_clip": 0.01161838, "auxiliary_loss_mlp": 0.01027017, "balance_loss_clip": 1.05123281, "balance_loss_mlp": 1.01860094, "epoch": 0.5656225575662839, "flos": 27669351104640.0, "grad_norm": 1.8079627743755122, "language_loss": 0.72152948, "learning_rate": 1.6735205649556185e-06, "loss": 0.74341804, "num_input_tokens_seen": 101484045, "step": 4704, "time_per_iteration": 2.515594959259033 }, { "auxiliary_loss_clip": 0.01138262, "auxiliary_loss_mlp": 0.01029459, "balance_loss_clip": 1.0479877, "balance_loss_mlp": 1.02154326, "epoch": 0.5657428004569229, "flos": 24349732997760.0, "grad_norm": 1.6270290300969787, "language_loss": 0.84739369, "learning_rate": 1.6727520668775476e-06, "loss": 0.86907089, "num_input_tokens_seen": 101504330, "step": 4705, "time_per_iteration": 2.556506633758545 }, { "auxiliary_loss_clip": 0.01179483, "auxiliary_loss_mlp": 0.01029435, "balance_loss_clip": 1.05179179, "balance_loss_mlp": 1.02099538, "epoch": 0.5658630433475621, "flos": 21944041562880.0, "grad_norm": 1.5995932013267553, "language_loss": 0.7536822, "learning_rate": 1.6719836184427275e-06, "loss": 0.77577138, "num_input_tokens_seen": 101524635, "step": 4706, "time_per_iteration": 2.4360644817352295 }, { "auxiliary_loss_clip": 0.0114712, "auxiliary_loss_mlp": 0.01026595, "balance_loss_clip": 1.04825616, "balance_loss_mlp": 1.01923943, "epoch": 0.5659832862382012, "flos": 30409012218240.0, "grad_norm": 2.0394749216234076, "language_loss": 0.64126742, "learning_rate": 1.671215219767733e-06, "loss": 0.66300452, "num_input_tokens_seen": 101544095, "step": 4707, "time_per_iteration": 2.5578556060791016 }, { "auxiliary_loss_clip": 0.01123661, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.04695809, "balance_loss_mlp": 1.02317405, "epoch": 0.5661035291288402, "flos": 13188194570880.0, "grad_norm": 2.291189978795589, "language_loss": 0.7618767, "learning_rate": 1.670446870969127e-06, "loss": 0.78342664, "num_input_tokens_seen": 101561760, "step": 4708, "time_per_iteration": 2.5221760272979736 }, { "auxiliary_loss_clip": 0.0115383, "auxiliary_loss_mlp": 0.01028261, "balance_loss_clip": 1.05117726, "balance_loss_mlp": 1.02042902, "epoch": 0.5662237720194794, "flos": 16143032108160.0, "grad_norm": 2.1385840302027335, "language_loss": 0.8010447, "learning_rate": 1.6696785721634685e-06, "loss": 0.82286561, "num_input_tokens_seen": 101576245, "step": 4709, "time_per_iteration": 3.2841742038726807 }, { "auxiliary_loss_clip": 0.01164776, "auxiliary_loss_mlp": 0.01033832, "balance_loss_clip": 1.0502218, "balance_loss_mlp": 1.02562487, "epoch": 0.5663440149101184, "flos": 17676848718720.0, "grad_norm": 2.0248618267950516, "language_loss": 0.73495239, "learning_rate": 1.6689103234673086e-06, "loss": 0.75693852, "num_input_tokens_seen": 101594565, "step": 4710, "time_per_iteration": 2.4507322311401367 }, { "auxiliary_loss_clip": 0.01148322, "auxiliary_loss_mlp": 0.01029706, "balance_loss_clip": 1.05052042, "balance_loss_mlp": 1.0216713, "epoch": 0.5664642578007575, "flos": 23368330627200.0, "grad_norm": 4.111201650649018, "language_loss": 0.76954913, "learning_rate": 1.668142124997189e-06, "loss": 0.79132938, "num_input_tokens_seen": 101614225, "step": 4711, "time_per_iteration": 2.5090441703796387 }, { "auxiliary_loss_clip": 0.01048453, "auxiliary_loss_mlp": 0.01004458, "balance_loss_clip": 1.01747072, "balance_loss_mlp": 1.0035342, "epoch": 0.5665845006913967, "flos": 65516470945920.0, "grad_norm": 0.7247073504934648, "language_loss": 0.59846222, "learning_rate": 1.6673739768696453e-06, "loss": 0.61899137, "num_input_tokens_seen": 101680795, "step": 4712, "time_per_iteration": 3.0884652137756348 }, { "auxiliary_loss_clip": 0.01156187, "auxiliary_loss_mlp": 0.01028721, "balance_loss_clip": 1.04947615, "balance_loss_mlp": 1.020105, "epoch": 0.5667047435820357, "flos": 26140885620480.0, "grad_norm": 2.56243291798559, "language_loss": 0.77386022, "learning_rate": 1.6666058792012052e-06, "loss": 0.79570925, "num_input_tokens_seen": 101701680, "step": 4713, "time_per_iteration": 3.3893685340881348 }, { "auxiliary_loss_clip": 0.01065614, "auxiliary_loss_mlp": 0.01001889, "balance_loss_clip": 1.01392603, "balance_loss_mlp": 1.00088787, "epoch": 0.5668249864726748, "flos": 71866949725440.0, "grad_norm": 0.881443577343635, "language_loss": 0.68800932, "learning_rate": 1.6658378321083878e-06, "loss": 0.70868433, "num_input_tokens_seen": 101766010, "step": 4714, "time_per_iteration": 3.082977056503296 }, { "auxiliary_loss_clip": 0.01111679, "auxiliary_loss_mlp": 0.01025905, "balance_loss_clip": 1.04485428, "balance_loss_mlp": 1.01834989, "epoch": 0.5669452293633139, "flos": 22195667312640.0, "grad_norm": 2.310161133539983, "language_loss": 0.82514113, "learning_rate": 1.6650698357077055e-06, "loss": 0.84651697, "num_input_tokens_seen": 101783055, "step": 4715, "time_per_iteration": 2.621511459350586 }, { "auxiliary_loss_clip": 0.01154776, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.04928899, "balance_loss_mlp": 1.02381158, "epoch": 0.567065472253953, "flos": 18223193560320.0, "grad_norm": 2.923296883424349, "language_loss": 0.80919373, "learning_rate": 1.6643018901156632e-06, "loss": 0.83106494, "num_input_tokens_seen": 101802150, "step": 4716, "time_per_iteration": 3.3085148334503174 }, { "auxiliary_loss_clip": 0.01155781, "auxiliary_loss_mlp": 0.01026654, "balance_loss_clip": 1.05090141, "balance_loss_mlp": 1.0188309, "epoch": 0.567185715144592, "flos": 20371548983040.0, "grad_norm": 3.0750420763824127, "language_loss": 0.79597688, "learning_rate": 1.6635339954487566e-06, "loss": 0.81780124, "num_input_tokens_seen": 101818025, "step": 4717, "time_per_iteration": 2.4891278743743896 }, { "auxiliary_loss_clip": 0.01154342, "auxiliary_loss_mlp": 0.01026684, "balance_loss_clip": 1.04996347, "balance_loss_mlp": 1.01880467, "epoch": 0.5673059580352312, "flos": 23221348174080.0, "grad_norm": 1.815736403216912, "language_loss": 0.82125729, "learning_rate": 1.6627661518234765e-06, "loss": 0.84306753, "num_input_tokens_seen": 101837280, "step": 4718, "time_per_iteration": 3.285160779953003 }, { "auxiliary_loss_clip": 0.01126167, "auxiliary_loss_mlp": 0.0102668, "balance_loss_clip": 1.04962659, "balance_loss_mlp": 1.01835918, "epoch": 0.5674262009258703, "flos": 21719599430400.0, "grad_norm": 1.5566206295639413, "language_loss": 0.85178322, "learning_rate": 1.661998359356302e-06, "loss": 0.87331164, "num_input_tokens_seen": 101856310, "step": 4719, "time_per_iteration": 2.6344470977783203 }, { "auxiliary_loss_clip": 0.01074504, "auxiliary_loss_mlp": 0.01003021, "balance_loss_clip": 1.01428938, "balance_loss_mlp": 1.00207376, "epoch": 0.5675464438165093, "flos": 67470369114240.0, "grad_norm": 0.7465739752672518, "language_loss": 0.55847228, "learning_rate": 1.6612306181637077e-06, "loss": 0.57924747, "num_input_tokens_seen": 101915635, "step": 4720, "time_per_iteration": 2.9846572875976562 }, { "auxiliary_loss_clip": 0.01132208, "auxiliary_loss_mlp": 0.01031774, "balance_loss_clip": 1.0465306, "balance_loss_mlp": 1.02393615, "epoch": 0.5676666867071485, "flos": 18879173688960.0, "grad_norm": 2.850211380728475, "language_loss": 0.65545309, "learning_rate": 1.6604629283621598e-06, "loss": 0.67709291, "num_input_tokens_seen": 101933565, "step": 4721, "time_per_iteration": 2.5548489093780518 }, { "auxiliary_loss_clip": 0.0118074, "auxiliary_loss_mlp": 0.01032694, "balance_loss_clip": 1.05319858, "balance_loss_mlp": 1.02409911, "epoch": 0.5677869295977875, "flos": 33546778744320.0, "grad_norm": 1.8710218947673942, "language_loss": 0.74162489, "learning_rate": 1.6596952900681152e-06, "loss": 0.76375926, "num_input_tokens_seen": 101954325, "step": 4722, "time_per_iteration": 2.5515873432159424 }, { "auxiliary_loss_clip": 0.01120108, "auxiliary_loss_mlp": 0.0103173, "balance_loss_clip": 1.053478, "balance_loss_mlp": 1.02299154, "epoch": 0.5679071724884266, "flos": 28037256157440.0, "grad_norm": 2.162052294578075, "language_loss": 0.81846702, "learning_rate": 1.658927703398025e-06, "loss": 0.83998537, "num_input_tokens_seen": 101974390, "step": 4723, "time_per_iteration": 2.6018714904785156 }, { "auxiliary_loss_clip": 0.01119953, "auxiliary_loss_mlp": 0.01026077, "balance_loss_clip": 1.04188263, "balance_loss_mlp": 1.01801205, "epoch": 0.5680274153790658, "flos": 23550110380800.0, "grad_norm": 2.5546751487288164, "language_loss": 0.7823928, "learning_rate": 1.6581601684683309e-06, "loss": 0.80385315, "num_input_tokens_seen": 101994815, "step": 4724, "time_per_iteration": 2.5922391414642334 }, { "auxiliary_loss_clip": 0.0116453, "auxiliary_loss_mlp": 0.0102915, "balance_loss_clip": 1.05205464, "balance_loss_mlp": 1.02184284, "epoch": 0.5681476582697048, "flos": 22455158140800.0, "grad_norm": 2.6734174527055248, "language_loss": 0.68262964, "learning_rate": 1.6573926853954674e-06, "loss": 0.70456648, "num_input_tokens_seen": 102012400, "step": 4725, "time_per_iteration": 2.4544126987457275 }, { "auxiliary_loss_clip": 0.01141419, "auxiliary_loss_mlp": 0.01025158, "balance_loss_clip": 1.0447166, "balance_loss_mlp": 1.01714468, "epoch": 0.5682679011603439, "flos": 19536913584000.0, "grad_norm": 1.8993748613005417, "language_loss": 0.8323307, "learning_rate": 1.6566252542958608e-06, "loss": 0.85399646, "num_input_tokens_seen": 102031900, "step": 4726, "time_per_iteration": 2.5696568489074707 }, { "auxiliary_loss_clip": 0.01131414, "auxiliary_loss_mlp": 0.01030684, "balance_loss_clip": 1.04947686, "balance_loss_mlp": 1.02264643, "epoch": 0.568388144050983, "flos": 28765488493440.0, "grad_norm": 1.9272812791611291, "language_loss": 0.78774786, "learning_rate": 1.6558578752859305e-06, "loss": 0.80936885, "num_input_tokens_seen": 102050860, "step": 4727, "time_per_iteration": 2.585646152496338 }, { "auxiliary_loss_clip": 0.01134925, "auxiliary_loss_mlp": 0.01025424, "balance_loss_clip": 1.04784811, "balance_loss_mlp": 1.01809299, "epoch": 0.5685083869416221, "flos": 21209452519680.0, "grad_norm": 3.4885767045964595, "language_loss": 0.78853911, "learning_rate": 1.6550905484820865e-06, "loss": 0.81014264, "num_input_tokens_seen": 102069320, "step": 4728, "time_per_iteration": 2.530128002166748 }, { "auxiliary_loss_clip": 0.01180409, "auxiliary_loss_mlp": 0.01028559, "balance_loss_clip": 1.05264819, "balance_loss_mlp": 1.02021992, "epoch": 0.5686286298322611, "flos": 24827021942400.0, "grad_norm": 2.1654557125574616, "language_loss": 0.79086977, "learning_rate": 1.6543232740007328e-06, "loss": 0.81295943, "num_input_tokens_seen": 102086435, "step": 4729, "time_per_iteration": 2.4443626403808594 }, { "auxiliary_loss_clip": 0.01167989, "auxiliary_loss_mlp": 0.01027817, "balance_loss_clip": 1.05272341, "balance_loss_mlp": 1.0197109, "epoch": 0.5687488727229003, "flos": 26615121909120.0, "grad_norm": 2.5494685832169597, "language_loss": 0.67172599, "learning_rate": 1.653556051958263e-06, "loss": 0.69368404, "num_input_tokens_seen": 102106115, "step": 4730, "time_per_iteration": 2.507558584213257 }, { "auxiliary_loss_clip": 0.0108906, "auxiliary_loss_mlp": 0.01026949, "balance_loss_clip": 1.04206955, "balance_loss_mlp": 1.01878262, "epoch": 0.5688691156135394, "flos": 20808725414400.0, "grad_norm": 2.099238030279477, "language_loss": 0.74115926, "learning_rate": 1.6527888824710642e-06, "loss": 0.76231939, "num_input_tokens_seen": 102125715, "step": 4731, "time_per_iteration": 2.610666036605835 }, { "auxiliary_loss_clip": 0.01128763, "auxiliary_loss_mlp": 0.0102978, "balance_loss_clip": 1.04457283, "balance_loss_mlp": 1.02127421, "epoch": 0.5689893585041784, "flos": 25880963829120.0, "grad_norm": 2.0808562393608097, "language_loss": 0.76409227, "learning_rate": 1.6520217656555166e-06, "loss": 0.78567767, "num_input_tokens_seen": 102145005, "step": 4732, "time_per_iteration": 2.553199529647827 }, { "auxiliary_loss_clip": 0.01138573, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.0479393, "balance_loss_mlp": 1.02349174, "epoch": 0.5691096013948175, "flos": 23477463123840.0, "grad_norm": 1.4975347240184178, "language_loss": 0.71006942, "learning_rate": 1.65125470162799e-06, "loss": 0.73176897, "num_input_tokens_seen": 102165360, "step": 4733, "time_per_iteration": 2.511319398880005 }, { "auxiliary_loss_clip": 0.01135727, "auxiliary_loss_mlp": 0.01030275, "balance_loss_clip": 1.04484749, "balance_loss_mlp": 1.02233577, "epoch": 0.5692298442854566, "flos": 18075600576000.0, "grad_norm": 1.9645453373813944, "language_loss": 0.69870341, "learning_rate": 1.6504876905048485e-06, "loss": 0.72036344, "num_input_tokens_seen": 102182320, "step": 4734, "time_per_iteration": 2.519029140472412 }, { "auxiliary_loss_clip": 0.01178031, "auxiliary_loss_mlp": 0.01025647, "balance_loss_clip": 1.05421352, "balance_loss_mlp": 1.01810122, "epoch": 0.5693500871760957, "flos": 23039317025280.0, "grad_norm": 1.855190993853517, "language_loss": 0.72103029, "learning_rate": 1.6497207324024464e-06, "loss": 0.74306709, "num_input_tokens_seen": 102201220, "step": 4735, "time_per_iteration": 2.4509618282318115 }, { "auxiliary_loss_clip": 0.01157478, "auxiliary_loss_mlp": 0.01029767, "balance_loss_clip": 1.04972386, "balance_loss_mlp": 1.02180362, "epoch": 0.5694703300667348, "flos": 18989670902400.0, "grad_norm": 13.772233856028635, "language_loss": 0.82644027, "learning_rate": 1.6489538274371305e-06, "loss": 0.84831274, "num_input_tokens_seen": 102219825, "step": 4736, "time_per_iteration": 3.3168396949768066 }, { "auxiliary_loss_clip": 0.01158419, "auxiliary_loss_mlp": 0.01028877, "balance_loss_clip": 1.05204082, "balance_loss_mlp": 1.02127445, "epoch": 0.5695905729573739, "flos": 21908705558400.0, "grad_norm": 1.871675628880176, "language_loss": 0.83240873, "learning_rate": 1.6481869757252396e-06, "loss": 0.85428166, "num_input_tokens_seen": 102238160, "step": 4737, "time_per_iteration": 2.4798386096954346 }, { "auxiliary_loss_clip": 0.01165729, "auxiliary_loss_mlp": 0.01030887, "balance_loss_clip": 1.05389452, "balance_loss_mlp": 1.02334714, "epoch": 0.569710815848013, "flos": 28476659232000.0, "grad_norm": 1.470773345299097, "language_loss": 0.71815145, "learning_rate": 1.647420177383105e-06, "loss": 0.74011767, "num_input_tokens_seen": 102261030, "step": 4738, "time_per_iteration": 2.5471749305725098 }, { "auxiliary_loss_clip": 0.01160067, "auxiliary_loss_mlp": 0.01024843, "balance_loss_clip": 1.05376577, "balance_loss_mlp": 1.01752019, "epoch": 0.569831058738652, "flos": 28366162018560.0, "grad_norm": 1.804050814597485, "language_loss": 0.72515523, "learning_rate": 1.646653432527049e-06, "loss": 0.74700433, "num_input_tokens_seen": 102281670, "step": 4739, "time_per_iteration": 3.3799397945404053 }, { "auxiliary_loss_clip": 0.01139071, "auxiliary_loss_mlp": 0.01025129, "balance_loss_clip": 1.05113959, "balance_loss_mlp": 1.01740456, "epoch": 0.5699513016292912, "flos": 25849973370240.0, "grad_norm": 1.5147571984855663, "language_loss": 0.74583161, "learning_rate": 1.645886741273387e-06, "loss": 0.76747358, "num_input_tokens_seen": 102303485, "step": 4740, "time_per_iteration": 2.582517623901367 }, { "auxiliary_loss_clip": 0.01134086, "auxiliary_loss_mlp": 0.01033202, "balance_loss_clip": 1.05308735, "balance_loss_mlp": 1.0250783, "epoch": 0.5700715445199303, "flos": 18037858360320.0, "grad_norm": 1.8819933248917702, "language_loss": 0.7388643, "learning_rate": 1.645120103738424e-06, "loss": 0.76053715, "num_input_tokens_seen": 102320995, "step": 4741, "time_per_iteration": 2.5203750133514404 }, { "auxiliary_loss_clip": 0.01151465, "auxiliary_loss_mlp": 0.00762225, "balance_loss_clip": 1.04902637, "balance_loss_mlp": 1.00041485, "epoch": 0.5701917874105693, "flos": 11473352392320.0, "grad_norm": 2.1662987553458968, "language_loss": 0.83098698, "learning_rate": 1.6443535200384591e-06, "loss": 0.85012382, "num_input_tokens_seen": 102339170, "step": 4742, "time_per_iteration": 3.3173928260803223 }, { "auxiliary_loss_clip": 0.01180091, "auxiliary_loss_mlp": 0.01032067, "balance_loss_clip": 1.05387616, "balance_loss_mlp": 1.02391887, "epoch": 0.5703120303012085, "flos": 21761759018880.0, "grad_norm": 1.7339324478765894, "language_loss": 0.70340449, "learning_rate": 1.6435869902897827e-06, "loss": 0.72552609, "num_input_tokens_seen": 102357750, "step": 4743, "time_per_iteration": 2.4915976524353027 }, { "auxiliary_loss_clip": 0.01042878, "auxiliary_loss_mlp": 0.01002232, "balance_loss_clip": 1.01622462, "balance_loss_mlp": 1.00115955, "epoch": 0.5704322731918475, "flos": 56746258513920.0, "grad_norm": 0.7916318192070402, "language_loss": 0.62012851, "learning_rate": 1.6428205146086764e-06, "loss": 0.6405797, "num_input_tokens_seen": 102419730, "step": 4744, "time_per_iteration": 3.156371593475342 }, { "auxiliary_loss_clip": 0.01155329, "auxiliary_loss_mlp": 0.01027641, "balance_loss_clip": 1.04986286, "balance_loss_mlp": 1.01962423, "epoch": 0.5705525160824866, "flos": 20741141975040.0, "grad_norm": 1.5881622900639671, "language_loss": 0.70522523, "learning_rate": 1.6420540931114142e-06, "loss": 0.72705495, "num_input_tokens_seen": 102440320, "step": 4745, "time_per_iteration": 3.334642171859741 }, { "auxiliary_loss_clip": 0.01150573, "auxiliary_loss_mlp": 0.01039313, "balance_loss_clip": 1.0490464, "balance_loss_mlp": 1.0315944, "epoch": 0.5706727589731257, "flos": 18771262254720.0, "grad_norm": 2.9952781073061967, "language_loss": 0.78852087, "learning_rate": 1.6412877259142616e-06, "loss": 0.81041968, "num_input_tokens_seen": 102460240, "step": 4746, "time_per_iteration": 2.5048701763153076 }, { "auxiliary_loss_clip": 0.01147025, "auxiliary_loss_mlp": 0.01026542, "balance_loss_clip": 1.05052066, "balance_loss_mlp": 1.0187099, "epoch": 0.5707930018637648, "flos": 27634733372160.0, "grad_norm": 3.0466808057231773, "language_loss": 0.73664892, "learning_rate": 1.6405214131334757e-06, "loss": 0.75838459, "num_input_tokens_seen": 102478765, "step": 4747, "time_per_iteration": 2.5273215770721436 }, { "auxiliary_loss_clip": 0.01119149, "auxiliary_loss_mlp": 0.0102502, "balance_loss_clip": 1.04923499, "balance_loss_mlp": 1.01717043, "epoch": 0.5709132447544039, "flos": 27597673514880.0, "grad_norm": 2.1510479692751954, "language_loss": 0.79685748, "learning_rate": 1.6397551548853052e-06, "loss": 0.81829917, "num_input_tokens_seen": 102496930, "step": 4748, "time_per_iteration": 2.60506534576416 }, { "auxiliary_loss_clip": 0.01150623, "auxiliary_loss_mlp": 0.01027717, "balance_loss_clip": 1.05115271, "balance_loss_mlp": 1.01989698, "epoch": 0.571033487645043, "flos": 21686095019520.0, "grad_norm": 1.707413932690761, "language_loss": 0.70669395, "learning_rate": 1.6389889512859917e-06, "loss": 0.72847736, "num_input_tokens_seen": 102516590, "step": 4749, "time_per_iteration": 2.495048999786377 }, { "auxiliary_loss_clip": 0.01051427, "auxiliary_loss_mlp": 0.01001945, "balance_loss_clip": 1.01452708, "balance_loss_mlp": 1.00094342, "epoch": 0.5711537305356821, "flos": 70181445980160.0, "grad_norm": 1.5239875771762392, "language_loss": 0.60341245, "learning_rate": 1.638222802451767e-06, "loss": 0.62394613, "num_input_tokens_seen": 102578070, "step": 4750, "time_per_iteration": 3.078207015991211 }, { "auxiliary_loss_clip": 0.01159788, "auxiliary_loss_mlp": 0.01025052, "balance_loss_clip": 1.05223393, "balance_loss_mlp": 1.01755369, "epoch": 0.5712739734263211, "flos": 24717494396160.0, "grad_norm": 3.5052093593161797, "language_loss": 0.75152946, "learning_rate": 1.6374567084988561e-06, "loss": 0.77337784, "num_input_tokens_seen": 102599255, "step": 4751, "time_per_iteration": 2.490955114364624 }, { "auxiliary_loss_clip": 0.01156749, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.05352747, "balance_loss_mlp": 1.01873159, "epoch": 0.5713942163169603, "flos": 26578169792640.0, "grad_norm": 1.9093210784622567, "language_loss": 0.76478082, "learning_rate": 1.6366906695434738e-06, "loss": 0.78662395, "num_input_tokens_seen": 102621775, "step": 4752, "time_per_iteration": 2.5401628017425537 }, { "auxiliary_loss_clip": 0.01167696, "auxiliary_loss_mlp": 0.01028785, "balance_loss_clip": 1.0564239, "balance_loss_mlp": 1.02113795, "epoch": 0.5715144592075994, "flos": 21142443697920.0, "grad_norm": 2.0891406145388656, "language_loss": 0.86240387, "learning_rate": 1.6359246857018275e-06, "loss": 0.88436872, "num_input_tokens_seen": 102639305, "step": 4753, "time_per_iteration": 2.456295967102051 }, { "auxiliary_loss_clip": 0.01121055, "auxiliary_loss_mlp": 0.01024696, "balance_loss_clip": 1.04586601, "balance_loss_mlp": 1.01672101, "epoch": 0.5716347020982384, "flos": 23330265189120.0, "grad_norm": 5.332455642797424, "language_loss": 0.7806741, "learning_rate": 1.6351587570901178e-06, "loss": 0.80213165, "num_input_tokens_seen": 102659430, "step": 4754, "time_per_iteration": 2.570239305496216 }, { "auxiliary_loss_clip": 0.0113945, "auxiliary_loss_mlp": 0.01030149, "balance_loss_clip": 1.05317783, "balance_loss_mlp": 1.02257323, "epoch": 0.5717549449888776, "flos": 17009555806080.0, "grad_norm": 3.52161720218778, "language_loss": 0.75976098, "learning_rate": 1.634392883824534e-06, "loss": 0.78145695, "num_input_tokens_seen": 102671430, "step": 4755, "time_per_iteration": 2.4697248935699463 }, { "auxiliary_loss_clip": 0.01124218, "auxiliary_loss_mlp": 0.0102753, "balance_loss_clip": 1.04681683, "balance_loss_mlp": 1.0193162, "epoch": 0.5718751878795166, "flos": 35518130922240.0, "grad_norm": 1.7099825837522165, "language_loss": 0.67718029, "learning_rate": 1.6336270660212595e-06, "loss": 0.69869781, "num_input_tokens_seen": 102693025, "step": 4756, "time_per_iteration": 2.679170608520508 }, { "auxiliary_loss_clip": 0.01151473, "auxiliary_loss_mlp": 0.01028719, "balance_loss_clip": 1.05642188, "balance_loss_mlp": 1.01994491, "epoch": 0.5719954307701557, "flos": 38613989255040.0, "grad_norm": 2.098109128742455, "language_loss": 0.65970272, "learning_rate": 1.6328613037964676e-06, "loss": 0.68150461, "num_input_tokens_seen": 102716090, "step": 4757, "time_per_iteration": 2.632429599761963 }, { "auxiliary_loss_clip": 0.0116407, "auxiliary_loss_mlp": 0.01024895, "balance_loss_clip": 1.05146694, "balance_loss_mlp": 1.01708078, "epoch": 0.5721156736607949, "flos": 20631111638400.0, "grad_norm": 2.2049568480010024, "language_loss": 0.67957032, "learning_rate": 1.6320955972663241e-06, "loss": 0.70145994, "num_input_tokens_seen": 102735685, "step": 4758, "time_per_iteration": 2.4628214836120605 }, { "auxiliary_loss_clip": 0.01165425, "auxiliary_loss_mlp": 0.01028752, "balance_loss_clip": 1.05173123, "balance_loss_mlp": 1.02092004, "epoch": 0.5722359165514339, "flos": 37415076076800.0, "grad_norm": 3.2790342293734995, "language_loss": 0.65478462, "learning_rate": 1.6313299465469857e-06, "loss": 0.67672646, "num_input_tokens_seen": 102758415, "step": 4759, "time_per_iteration": 2.6060216426849365 }, { "auxiliary_loss_clip": 0.0116235, "auxiliary_loss_mlp": 0.01029642, "balance_loss_clip": 1.05295944, "balance_loss_mlp": 1.02113032, "epoch": 0.572356159442073, "flos": 21972877205760.0, "grad_norm": 3.0941458673995004, "language_loss": 0.79842687, "learning_rate": 1.6305643517546014e-06, "loss": 0.82034677, "num_input_tokens_seen": 102773795, "step": 4760, "time_per_iteration": 2.4629385471343994 }, { "auxiliary_loss_clip": 0.01178226, "auxiliary_loss_mlp": 0.01036987, "balance_loss_clip": 1.05441713, "balance_loss_mlp": 1.02931631, "epoch": 0.5724764023327121, "flos": 19135540033920.0, "grad_norm": 2.205102241371106, "language_loss": 0.84799659, "learning_rate": 1.629798813005311e-06, "loss": 0.87014878, "num_input_tokens_seen": 102793515, "step": 4761, "time_per_iteration": 2.442936897277832 }, { "auxiliary_loss_clip": 0.01122685, "auxiliary_loss_mlp": 0.01028305, "balance_loss_clip": 1.04923034, "balance_loss_mlp": 1.02066636, "epoch": 0.5725966452233512, "flos": 22819759142400.0, "grad_norm": 2.060900104094086, "language_loss": 0.71421975, "learning_rate": 1.6290333304152473e-06, "loss": 0.73572958, "num_input_tokens_seen": 102813390, "step": 4762, "time_per_iteration": 3.454465866088867 }, { "auxiliary_loss_clip": 0.01150804, "auxiliary_loss_mlp": 0.01032457, "balance_loss_clip": 1.05635846, "balance_loss_mlp": 1.02433848, "epoch": 0.5727168881139902, "flos": 41496610498560.0, "grad_norm": 1.739227773846657, "language_loss": 0.56967151, "learning_rate": 1.6282679041005314e-06, "loss": 0.59150416, "num_input_tokens_seen": 102838980, "step": 4763, "time_per_iteration": 2.65539813041687 }, { "auxiliary_loss_clip": 0.01141807, "auxiliary_loss_mlp": 0.01023589, "balance_loss_clip": 1.04769182, "balance_loss_mlp": 1.01583421, "epoch": 0.5728371310046293, "flos": 14647675985280.0, "grad_norm": 4.273514517086232, "language_loss": 0.87447834, "learning_rate": 1.6275025341772789e-06, "loss": 0.89613229, "num_input_tokens_seen": 102855285, "step": 4764, "time_per_iteration": 2.451650857925415 }, { "auxiliary_loss_clip": 0.01150931, "auxiliary_loss_mlp": 0.01029313, "balance_loss_clip": 1.04970515, "balance_loss_mlp": 1.02058053, "epoch": 0.5729573738952685, "flos": 21506613736320.0, "grad_norm": 2.603330793803001, "language_loss": 0.81770635, "learning_rate": 1.626737220761596e-06, "loss": 0.83950877, "num_input_tokens_seen": 102872750, "step": 4765, "time_per_iteration": 2.4955108165740967 }, { "auxiliary_loss_clip": 0.01160917, "auxiliary_loss_mlp": 0.01028216, "balance_loss_clip": 1.0530231, "balance_loss_mlp": 1.02030087, "epoch": 0.5730776167859075, "flos": 23621680229760.0, "grad_norm": 2.3190990351956633, "language_loss": 0.7847755, "learning_rate": 1.62597196396958e-06, "loss": 0.80666685, "num_input_tokens_seen": 102890920, "step": 4766, "time_per_iteration": 3.3220200538635254 }, { "auxiliary_loss_clip": 0.01167287, "auxiliary_loss_mlp": 0.01026303, "balance_loss_clip": 1.05431175, "balance_loss_mlp": 1.01817298, "epoch": 0.5731978596765466, "flos": 25739224761600.0, "grad_norm": 1.854538852857469, "language_loss": 0.85461199, "learning_rate": 1.6252067639173197e-06, "loss": 0.87654787, "num_input_tokens_seen": 102912830, "step": 4767, "time_per_iteration": 2.495978832244873 }, { "auxiliary_loss_clip": 0.01165056, "auxiliary_loss_mlp": 0.01027951, "balance_loss_clip": 1.0521971, "balance_loss_mlp": 1.01991653, "epoch": 0.5733181025671857, "flos": 26359509749760.0, "grad_norm": 1.8074348970593332, "language_loss": 0.69824123, "learning_rate": 1.6244416207208956e-06, "loss": 0.72017127, "num_input_tokens_seen": 102933765, "step": 4768, "time_per_iteration": 3.297753095626831 }, { "auxiliary_loss_clip": 0.01138937, "auxiliary_loss_mlp": 0.01033193, "balance_loss_clip": 1.05142665, "balance_loss_mlp": 1.02530742, "epoch": 0.5734383454578248, "flos": 29423874833280.0, "grad_norm": 1.6160593431772499, "language_loss": 0.73711258, "learning_rate": 1.6236765344963787e-06, "loss": 0.75883389, "num_input_tokens_seen": 102955025, "step": 4769, "time_per_iteration": 2.586174964904785 }, { "auxiliary_loss_clip": 0.01151216, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 1.05109143, "balance_loss_mlp": 1.01734674, "epoch": 0.5735585883484638, "flos": 34969954487040.0, "grad_norm": 2.519054706805177, "language_loss": 0.68985176, "learning_rate": 1.6229115053598322e-06, "loss": 0.71161532, "num_input_tokens_seen": 102976780, "step": 4770, "time_per_iteration": 2.602721929550171 }, { "auxiliary_loss_clip": 0.01167753, "auxiliary_loss_mlp": 0.01030026, "balance_loss_clip": 1.05519235, "balance_loss_mlp": 1.02214599, "epoch": 0.573678831239103, "flos": 18770759464320.0, "grad_norm": 1.6986385436856126, "language_loss": 0.72035605, "learning_rate": 1.6221465334273108e-06, "loss": 0.74233383, "num_input_tokens_seen": 102995990, "step": 4771, "time_per_iteration": 3.1759603023529053 }, { "auxiliary_loss_clip": 0.01141856, "auxiliary_loss_mlp": 0.01024853, "balance_loss_clip": 1.04956734, "balance_loss_mlp": 1.01685977, "epoch": 0.5737990741297421, "flos": 25702883176320.0, "grad_norm": 2.1656902682139645, "language_loss": 0.61840439, "learning_rate": 1.6213816188148593e-06, "loss": 0.64007151, "num_input_tokens_seen": 103014695, "step": 4772, "time_per_iteration": 2.5560390949249268 }, { "auxiliary_loss_clip": 0.01145857, "auxiliary_loss_mlp": 0.01028749, "balance_loss_clip": 1.0553143, "balance_loss_mlp": 1.02075052, "epoch": 0.5739193170203811, "flos": 27269234530560.0, "grad_norm": 1.758216911477341, "language_loss": 0.77144563, "learning_rate": 1.6206167616385162e-06, "loss": 0.79319167, "num_input_tokens_seen": 103035760, "step": 4773, "time_per_iteration": 2.569058895111084 }, { "auxiliary_loss_clip": 0.0115817, "auxiliary_loss_mlp": 0.01028858, "balance_loss_clip": 1.05315411, "balance_loss_mlp": 1.02077007, "epoch": 0.5740395599110203, "flos": 12239721993600.0, "grad_norm": 3.3084818464407175, "language_loss": 0.73481417, "learning_rate": 1.6198519620143078e-06, "loss": 0.75668442, "num_input_tokens_seen": 103052915, "step": 4774, "time_per_iteration": 2.472527503967285 }, { "auxiliary_loss_clip": 0.01142225, "auxiliary_loss_mlp": 0.01032314, "balance_loss_clip": 1.0526861, "balance_loss_mlp": 1.02453017, "epoch": 0.5741598028016593, "flos": 25921399564800.0, "grad_norm": 1.5924289218745713, "language_loss": 0.78225183, "learning_rate": 1.6190872200582546e-06, "loss": 0.80399728, "num_input_tokens_seen": 103074655, "step": 4775, "time_per_iteration": 2.5884151458740234 }, { "auxiliary_loss_clip": 0.01146656, "auxiliary_loss_mlp": 0.00762394, "balance_loss_clip": 1.05057156, "balance_loss_mlp": 1.00039613, "epoch": 0.5742800456922984, "flos": 19244133826560.0, "grad_norm": 2.085643995961095, "language_loss": 0.7782892, "learning_rate": 1.6183225358863676e-06, "loss": 0.79737973, "num_input_tokens_seen": 103091550, "step": 4776, "time_per_iteration": 2.4914186000823975 }, { "auxiliary_loss_clip": 0.01141872, "auxiliary_loss_mlp": 0.01028435, "balance_loss_clip": 1.04854012, "balance_loss_mlp": 1.01994717, "epoch": 0.5744002885829376, "flos": 30920487932160.0, "grad_norm": 2.43199619117625, "language_loss": 0.71885306, "learning_rate": 1.617557909614648e-06, "loss": 0.74055612, "num_input_tokens_seen": 103110985, "step": 4777, "time_per_iteration": 2.5706887245178223 }, { "auxiliary_loss_clip": 0.01134175, "auxiliary_loss_mlp": 0.01027744, "balance_loss_clip": 1.04754949, "balance_loss_mlp": 1.02012062, "epoch": 0.5745205314735766, "flos": 23840017050240.0, "grad_norm": 2.1036283836514804, "language_loss": 0.86193657, "learning_rate": 1.6167933413590899e-06, "loss": 0.88355571, "num_input_tokens_seen": 103129890, "step": 4778, "time_per_iteration": 2.548201084136963 }, { "auxiliary_loss_clip": 0.01163781, "auxiliary_loss_mlp": 0.01031255, "balance_loss_clip": 1.05230212, "balance_loss_mlp": 1.02337563, "epoch": 0.5746407743642157, "flos": 12311902373760.0, "grad_norm": 7.07477411183321, "language_loss": 0.90704578, "learning_rate": 1.6160288312356773e-06, "loss": 0.92899609, "num_input_tokens_seen": 103147020, "step": 4779, "time_per_iteration": 2.458759307861328 }, { "auxiliary_loss_clip": 0.01168738, "auxiliary_loss_mlp": 0.01027353, "balance_loss_clip": 1.05215931, "balance_loss_mlp": 1.01888895, "epoch": 0.5747610172548548, "flos": 24133658734080.0, "grad_norm": 1.5443493866202302, "language_loss": 0.81743348, "learning_rate": 1.6152643793603857e-06, "loss": 0.83939433, "num_input_tokens_seen": 103167370, "step": 4780, "time_per_iteration": 2.4920387268066406 }, { "auxiliary_loss_clip": 0.01179624, "auxiliary_loss_mlp": 0.01027761, "balance_loss_clip": 1.05525708, "balance_loss_mlp": 1.02006555, "epoch": 0.5748812601454939, "flos": 25408451393280.0, "grad_norm": 1.7335684289408109, "language_loss": 0.87785637, "learning_rate": 1.6144999858491815e-06, "loss": 0.89993024, "num_input_tokens_seen": 103186000, "step": 4781, "time_per_iteration": 2.4937713146209717 }, { "auxiliary_loss_clip": 0.01156142, "auxiliary_loss_mlp": 0.01024267, "balance_loss_clip": 1.05169177, "balance_loss_mlp": 1.01584518, "epoch": 0.575001503036133, "flos": 30624942827520.0, "grad_norm": 1.595080321016195, "language_loss": 0.85633057, "learning_rate": 1.6137356508180232e-06, "loss": 0.87813461, "num_input_tokens_seen": 103207710, "step": 4782, "time_per_iteration": 2.57137131690979 }, { "auxiliary_loss_clip": 0.01180839, "auxiliary_loss_mlp": 0.00762296, "balance_loss_clip": 1.05387294, "balance_loss_mlp": 1.00040984, "epoch": 0.5751217459267721, "flos": 21726566668800.0, "grad_norm": 2.235736534573803, "language_loss": 0.81298852, "learning_rate": 1.6129713743828593e-06, "loss": 0.83241987, "num_input_tokens_seen": 103226720, "step": 4783, "time_per_iteration": 2.448772430419922 }, { "auxiliary_loss_clip": 0.01151621, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.04941893, "balance_loss_mlp": 1.02030456, "epoch": 0.5752419888174112, "flos": 21651620941440.0, "grad_norm": 1.523641375629479, "language_loss": 0.75432396, "learning_rate": 1.6122071566596306e-06, "loss": 0.77611828, "num_input_tokens_seen": 103246995, "step": 4784, "time_per_iteration": 2.5059280395507812 }, { "auxiliary_loss_clip": 0.01168195, "auxiliary_loss_mlp": 0.01029577, "balance_loss_clip": 1.05327964, "balance_loss_mlp": 1.02119637, "epoch": 0.5753622317080502, "flos": 17775997234560.0, "grad_norm": 2.744317140698451, "language_loss": 0.83043426, "learning_rate": 1.6114429977642674e-06, "loss": 0.85241199, "num_input_tokens_seen": 103261500, "step": 4785, "time_per_iteration": 2.424600601196289 }, { "auxiliary_loss_clip": 0.01166476, "auxiliary_loss_mlp": 0.0102614, "balance_loss_clip": 1.05516362, "balance_loss_mlp": 1.01873064, "epoch": 0.5754824745986894, "flos": 19789616741760.0, "grad_norm": 1.7930361664239713, "language_loss": 0.73213071, "learning_rate": 1.6106788978126926e-06, "loss": 0.75405687, "num_input_tokens_seen": 103280475, "step": 4786, "time_per_iteration": 2.4599227905273438 }, { "auxiliary_loss_clip": 0.01118287, "auxiliary_loss_mlp": 0.01032201, "balance_loss_clip": 1.04471731, "balance_loss_mlp": 1.02342129, "epoch": 0.5756027174893285, "flos": 30985665160320.0, "grad_norm": 5.146094576227733, "language_loss": 0.79100758, "learning_rate": 1.6099148569208196e-06, "loss": 0.81251246, "num_input_tokens_seen": 103297695, "step": 4787, "time_per_iteration": 2.624391794204712 }, { "auxiliary_loss_clip": 0.01154895, "auxiliary_loss_mlp": 0.01034565, "balance_loss_clip": 1.05490386, "balance_loss_mlp": 1.02555287, "epoch": 0.5757229603799675, "flos": 28546864364160.0, "grad_norm": 2.6128869188007537, "language_loss": 0.62864387, "learning_rate": 1.6091508752045523e-06, "loss": 0.6505385, "num_input_tokens_seen": 103318575, "step": 4788, "time_per_iteration": 2.5386650562286377 }, { "auxiliary_loss_clip": 0.01127626, "auxiliary_loss_mlp": 0.01025739, "balance_loss_clip": 1.04600096, "balance_loss_mlp": 1.01770473, "epoch": 0.5758432032706067, "flos": 22999024944000.0, "grad_norm": 2.168295974326995, "language_loss": 0.86428696, "learning_rate": 1.608386952779787e-06, "loss": 0.88582063, "num_input_tokens_seen": 103337945, "step": 4789, "time_per_iteration": 3.3623316287994385 }, { "auxiliary_loss_clip": 0.011567, "auxiliary_loss_mlp": 0.01026606, "balance_loss_clip": 1.05294979, "balance_loss_mlp": 1.01910138, "epoch": 0.5759634461612457, "flos": 25739727552000.0, "grad_norm": 1.6391115248064005, "language_loss": 0.7456978, "learning_rate": 1.6076230897624098e-06, "loss": 0.76753086, "num_input_tokens_seen": 103360150, "step": 4790, "time_per_iteration": 2.5459370613098145 }, { "auxiliary_loss_clip": 0.01165915, "auxiliary_loss_mlp": 0.01030199, "balance_loss_clip": 1.05057538, "balance_loss_mlp": 1.02169979, "epoch": 0.5760836890518848, "flos": 30591761639040.0, "grad_norm": 2.297407168241599, "language_loss": 0.77500349, "learning_rate": 1.6068592862682974e-06, "loss": 0.79696465, "num_input_tokens_seen": 103378305, "step": 4791, "time_per_iteration": 2.5299479961395264 }, { "auxiliary_loss_clip": 0.01153211, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 1.05213237, "balance_loss_mlp": 1.02332222, "epoch": 0.576203931942524, "flos": 36538963447680.0, "grad_norm": 1.773175782769849, "language_loss": 0.73385715, "learning_rate": 1.6060955424133187e-06, "loss": 0.7557013, "num_input_tokens_seen": 103399230, "step": 4792, "time_per_iteration": 3.5237619876861572 }, { "auxiliary_loss_clip": 0.01165923, "auxiliary_loss_mlp": 0.01026662, "balance_loss_clip": 1.05542064, "balance_loss_mlp": 1.01798964, "epoch": 0.576324174833163, "flos": 25516937445120.0, "grad_norm": 1.743717787566893, "language_loss": 0.89414316, "learning_rate": 1.6053318583133332e-06, "loss": 0.91606903, "num_input_tokens_seen": 103420100, "step": 4793, "time_per_iteration": 2.5145506858825684 }, { "auxiliary_loss_clip": 0.01164846, "auxiliary_loss_mlp": 0.01032138, "balance_loss_clip": 1.05392146, "balance_loss_mlp": 1.02375805, "epoch": 0.5764444177238021, "flos": 25119262995840.0, "grad_norm": 2.0376276274853775, "language_loss": 0.75491732, "learning_rate": 1.6045682340841907e-06, "loss": 0.77688712, "num_input_tokens_seen": 103439025, "step": 4794, "time_per_iteration": 2.5021474361419678 }, { "auxiliary_loss_clip": 0.0105585, "auxiliary_loss_mlp": 0.00752672, "balance_loss_clip": 1.02720571, "balance_loss_mlp": 0.99973947, "epoch": 0.5765646606144411, "flos": 62212687758720.0, "grad_norm": 0.752540202647363, "language_loss": 0.58022308, "learning_rate": 1.6038046698417336e-06, "loss": 0.59830832, "num_input_tokens_seen": 103499920, "step": 4795, "time_per_iteration": 3.94370436668396 }, { "auxiliary_loss_clip": 0.01164884, "auxiliary_loss_mlp": 0.01024995, "balance_loss_clip": 1.05216634, "balance_loss_mlp": 1.01725233, "epoch": 0.5766849035050803, "flos": 25118760205440.0, "grad_norm": 1.9003486803110985, "language_loss": 0.68532211, "learning_rate": 1.6030411657017919e-06, "loss": 0.70722091, "num_input_tokens_seen": 103519575, "step": 4796, "time_per_iteration": 2.5120344161987305 }, { "auxiliary_loss_clip": 0.01156231, "auxiliary_loss_mlp": 0.01025154, "balance_loss_clip": 1.05059648, "balance_loss_mlp": 1.0174408, "epoch": 0.5768051463957193, "flos": 15991093578240.0, "grad_norm": 1.761590501920671, "language_loss": 0.84264827, "learning_rate": 1.6022777217801903e-06, "loss": 0.86446214, "num_input_tokens_seen": 103536530, "step": 4797, "time_per_iteration": 3.161177158355713 }, { "auxiliary_loss_clip": 0.01136033, "auxiliary_loss_mlp": 0.01023375, "balance_loss_clip": 1.05145061, "balance_loss_mlp": 1.0154984, "epoch": 0.5769253892863584, "flos": 22163635359360.0, "grad_norm": 1.8800946414537691, "language_loss": 0.73611188, "learning_rate": 1.601514338192742e-06, "loss": 0.75770599, "num_input_tokens_seen": 103556460, "step": 4798, "time_per_iteration": 2.5353286266326904 }, { "auxiliary_loss_clip": 0.01175039, "auxiliary_loss_mlp": 0.01023674, "balance_loss_clip": 1.05287385, "balance_loss_mlp": 1.01635456, "epoch": 0.5770456321769976, "flos": 22856388036480.0, "grad_norm": 2.114391634578596, "language_loss": 0.71742213, "learning_rate": 1.6007510150552514e-06, "loss": 0.73940921, "num_input_tokens_seen": 103574520, "step": 4799, "time_per_iteration": 2.4333620071411133 }, { "auxiliary_loss_clip": 0.01168884, "auxiliary_loss_mlp": 0.01027255, "balance_loss_clip": 1.05176425, "balance_loss_mlp": 1.01856422, "epoch": 0.5771658750676366, "flos": 46353672489600.0, "grad_norm": 1.5141077532487481, "language_loss": 0.62086558, "learning_rate": 1.599987752483515e-06, "loss": 0.64282703, "num_input_tokens_seen": 103598965, "step": 4800, "time_per_iteration": 2.6748929023742676 }, { "auxiliary_loss_clip": 0.01130554, "auxiliary_loss_mlp": 0.01028828, "balance_loss_clip": 1.04643154, "balance_loss_mlp": 1.02079892, "epoch": 0.5772861179582757, "flos": 22159972172160.0, "grad_norm": 1.6100844400104255, "language_loss": 0.67872632, "learning_rate": 1.5992245505933184e-06, "loss": 0.70032012, "num_input_tokens_seen": 103618665, "step": 4801, "time_per_iteration": 2.5230495929718018 }, { "auxiliary_loss_clip": 0.01180662, "auxiliary_loss_mlp": 0.01028828, "balance_loss_clip": 1.05408692, "balance_loss_mlp": 1.02134156, "epoch": 0.5774063608489148, "flos": 31248926916480.0, "grad_norm": 2.002559568933912, "language_loss": 0.7088384, "learning_rate": 1.5984614095004388e-06, "loss": 0.73093331, "num_input_tokens_seen": 103639800, "step": 4802, "time_per_iteration": 2.4983036518096924 }, { "auxiliary_loss_clip": 0.01158881, "auxiliary_loss_mlp": 0.01030225, "balance_loss_clip": 1.05158973, "balance_loss_mlp": 1.02246118, "epoch": 0.5775266037395539, "flos": 22527123039360.0, "grad_norm": 2.80413296871043, "language_loss": 0.81024653, "learning_rate": 1.5976983293206438e-06, "loss": 0.83213758, "num_input_tokens_seen": 103655605, "step": 4803, "time_per_iteration": 2.449223518371582 }, { "auxiliary_loss_clip": 0.01145952, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.04780221, "balance_loss_mlp": 1.02193069, "epoch": 0.577646846630193, "flos": 21068790860160.0, "grad_norm": 1.8254404834279567, "language_loss": 0.71511841, "learning_rate": 1.5969353101696928e-06, "loss": 0.73687339, "num_input_tokens_seen": 103674045, "step": 4804, "time_per_iteration": 2.4780876636505127 }, { "auxiliary_loss_clip": 0.01164843, "auxiliary_loss_mlp": 0.01030267, "balance_loss_clip": 1.05211675, "balance_loss_mlp": 1.02312672, "epoch": 0.5777670895208321, "flos": 29714284293120.0, "grad_norm": 1.5720162114038148, "language_loss": 0.79401541, "learning_rate": 1.5961723521633341e-06, "loss": 0.81596649, "num_input_tokens_seen": 103695285, "step": 4805, "time_per_iteration": 2.527066230773926 }, { "auxiliary_loss_clip": 0.01145788, "auxiliary_loss_mlp": 0.01031655, "balance_loss_clip": 1.04825234, "balance_loss_mlp": 1.02364397, "epoch": 0.5778873324114712, "flos": 19500428344320.0, "grad_norm": 2.2742097060379605, "language_loss": 0.90755296, "learning_rate": 1.5954094554173097e-06, "loss": 0.92932737, "num_input_tokens_seen": 103713275, "step": 4806, "time_per_iteration": 2.4670984745025635 }, { "auxiliary_loss_clip": 0.01156293, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.05265927, "balance_loss_mlp": 1.01957774, "epoch": 0.5780075753021102, "flos": 14136846716160.0, "grad_norm": 2.099187690766042, "language_loss": 0.78771853, "learning_rate": 1.5946466200473482e-06, "loss": 0.80955154, "num_input_tokens_seen": 103731185, "step": 4807, "time_per_iteration": 2.4643871784210205 }, { "auxiliary_loss_clip": 0.011533, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.04941022, "balance_loss_mlp": 1.02274859, "epoch": 0.5781278181927494, "flos": 15262178883840.0, "grad_norm": 1.9045103062158144, "language_loss": 0.83419228, "learning_rate": 1.5938838461691723e-06, "loss": 0.85603213, "num_input_tokens_seen": 103748095, "step": 4808, "time_per_iteration": 2.4726104736328125 }, { "auxiliary_loss_clip": 0.01181256, "auxiliary_loss_mlp": 0.01027813, "balance_loss_clip": 1.05584574, "balance_loss_mlp": 1.02013552, "epoch": 0.5782480610833884, "flos": 16726831856640.0, "grad_norm": 2.6893097111012914, "language_loss": 0.82849038, "learning_rate": 1.593121133898494e-06, "loss": 0.85058105, "num_input_tokens_seen": 103765300, "step": 4809, "time_per_iteration": 2.404103994369507 }, { "auxiliary_loss_clip": 0.01170514, "auxiliary_loss_mlp": 0.0102681, "balance_loss_clip": 1.05332327, "balance_loss_mlp": 1.01931202, "epoch": 0.5783683039740275, "flos": 25482140144640.0, "grad_norm": 2.3154858738439033, "language_loss": 0.79078865, "learning_rate": 1.592358483351016e-06, "loss": 0.8127619, "num_input_tokens_seen": 103785475, "step": 4810, "time_per_iteration": 2.4880356788635254 }, { "auxiliary_loss_clip": 0.01159595, "auxiliary_loss_mlp": 0.01024556, "balance_loss_clip": 1.05054951, "balance_loss_mlp": 1.01746881, "epoch": 0.5784885468646667, "flos": 18405835240320.0, "grad_norm": 1.9558081568064312, "language_loss": 0.72437102, "learning_rate": 1.5915958946424326e-06, "loss": 0.74621248, "num_input_tokens_seen": 103804160, "step": 4811, "time_per_iteration": 2.433809518814087 }, { "auxiliary_loss_clip": 0.01138734, "auxiliary_loss_mlp": 0.00763159, "balance_loss_clip": 1.05050564, "balance_loss_mlp": 1.00041556, "epoch": 0.5786087897553057, "flos": 46100717936640.0, "grad_norm": 1.6131716401641405, "language_loss": 0.74637413, "learning_rate": 1.5908333678884271e-06, "loss": 0.76539308, "num_input_tokens_seen": 103830580, "step": 4812, "time_per_iteration": 2.761343002319336 }, { "auxiliary_loss_clip": 0.01162204, "auxiliary_loss_mlp": 0.01027746, "balance_loss_clip": 1.05179417, "balance_loss_mlp": 1.02013421, "epoch": 0.5787290326459448, "flos": 12385950261120.0, "grad_norm": 2.60731347072189, "language_loss": 0.73593378, "learning_rate": 1.5900709032046743e-06, "loss": 0.75783324, "num_input_tokens_seen": 103848655, "step": 4813, "time_per_iteration": 2.429922580718994 }, { "auxiliary_loss_clip": 0.01147975, "auxiliary_loss_mlp": 0.01027722, "balance_loss_clip": 1.0530237, "balance_loss_mlp": 1.01994109, "epoch": 0.5788492755365839, "flos": 23290332243840.0, "grad_norm": 2.2008917073525547, "language_loss": 0.78439766, "learning_rate": 1.5893085007068391e-06, "loss": 0.80615461, "num_input_tokens_seen": 103866215, "step": 4814, "time_per_iteration": 2.4926669597625732 }, { "auxiliary_loss_clip": 0.01138574, "auxiliary_loss_mlp": 0.01025892, "balance_loss_clip": 1.04767525, "balance_loss_mlp": 1.01736546, "epoch": 0.578969518427223, "flos": 24061047390720.0, "grad_norm": 1.7970503846266523, "language_loss": 0.70889968, "learning_rate": 1.5885461605105786e-06, "loss": 0.73054433, "num_input_tokens_seen": 103887815, "step": 4815, "time_per_iteration": 2.5430924892425537 }, { "auxiliary_loss_clip": 0.01154508, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.0535754, "balance_loss_mlp": 1.02012205, "epoch": 0.579089761317862, "flos": 21871825269120.0, "grad_norm": 1.792600951521911, "language_loss": 0.76843095, "learning_rate": 1.5877838827315375e-06, "loss": 0.79026026, "num_input_tokens_seen": 103906360, "step": 4816, "time_per_iteration": 3.320629358291626 }, { "auxiliary_loss_clip": 0.01178697, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.05510771, "balance_loss_mlp": 1.02273405, "epoch": 0.5792100042085012, "flos": 22929681738240.0, "grad_norm": 1.6887431972782252, "language_loss": 0.70308185, "learning_rate": 1.587021667485355e-06, "loss": 0.72517234, "num_input_tokens_seen": 103925730, "step": 4817, "time_per_iteration": 2.450834035873413 }, { "auxiliary_loss_clip": 0.01150962, "auxiliary_loss_mlp": 0.01022438, "balance_loss_clip": 1.04798841, "balance_loss_mlp": 1.0148263, "epoch": 0.5793302470991403, "flos": 21470056669440.0, "grad_norm": 1.7952666705475375, "language_loss": 0.78592384, "learning_rate": 1.5862595148876559e-06, "loss": 0.80765784, "num_input_tokens_seen": 103945835, "step": 4818, "time_per_iteration": 2.5076398849487305 }, { "auxiliary_loss_clip": 0.01125559, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.0495286, "balance_loss_mlp": 1.02084947, "epoch": 0.5794504899897793, "flos": 12711013367040.0, "grad_norm": 2.2706713707006374, "language_loss": 0.76332724, "learning_rate": 1.58549742505406e-06, "loss": 0.78487033, "num_input_tokens_seen": 103960580, "step": 4819, "time_per_iteration": 3.3719420433044434 }, { "auxiliary_loss_clip": 0.01178578, "auxiliary_loss_mlp": 0.01026739, "balance_loss_clip": 1.05333555, "balance_loss_mlp": 1.01927006, "epoch": 0.5795707328804185, "flos": 14867054300160.0, "grad_norm": 2.5759263894171474, "language_loss": 0.75158703, "learning_rate": 1.5847353981001747e-06, "loss": 0.77364022, "num_input_tokens_seen": 103977760, "step": 4820, "time_per_iteration": 2.443037271499634 }, { "auxiliary_loss_clip": 0.0114201, "auxiliary_loss_mlp": 0.01029647, "balance_loss_clip": 1.04654431, "balance_loss_mlp": 1.02160025, "epoch": 0.5796909757710575, "flos": 36430046432640.0, "grad_norm": 1.7062196958755085, "language_loss": 0.69907612, "learning_rate": 1.5839734341415993e-06, "loss": 0.72079271, "num_input_tokens_seen": 103999960, "step": 4821, "time_per_iteration": 3.46303653717041 }, { "auxiliary_loss_clip": 0.01158311, "auxiliary_loss_mlp": 0.01026743, "balance_loss_clip": 1.05479527, "balance_loss_mlp": 1.019418, "epoch": 0.5798112186616966, "flos": 23039891642880.0, "grad_norm": 7.62861136487302, "language_loss": 0.76747185, "learning_rate": 1.5832115332939238e-06, "loss": 0.78932238, "num_input_tokens_seen": 104018400, "step": 4822, "time_per_iteration": 2.4939842224121094 }, { "auxiliary_loss_clip": 0.01165624, "auxiliary_loss_mlp": 0.01031723, "balance_loss_clip": 1.05331111, "balance_loss_mlp": 1.0240159, "epoch": 0.5799314615523358, "flos": 16652604401280.0, "grad_norm": 2.0397647987742, "language_loss": 0.74556226, "learning_rate": 1.5824496956727272e-06, "loss": 0.76753575, "num_input_tokens_seen": 104035605, "step": 4823, "time_per_iteration": 2.455172538757324 }, { "auxiliary_loss_clip": 0.01150368, "auxiliary_loss_mlp": 0.01025473, "balance_loss_clip": 1.05048776, "balance_loss_mlp": 1.01774251, "epoch": 0.5800517044429748, "flos": 20485673470080.0, "grad_norm": 1.6543905942696686, "language_loss": 0.73162234, "learning_rate": 1.5816879213935797e-06, "loss": 0.75338078, "num_input_tokens_seen": 104054415, "step": 4824, "time_per_iteration": 3.2578377723693848 }, { "auxiliary_loss_clip": 0.01159574, "auxiliary_loss_mlp": 0.01029769, "balance_loss_clip": 1.05142653, "balance_loss_mlp": 1.0224911, "epoch": 0.5801719473336139, "flos": 31538258968320.0, "grad_norm": 1.6112439783685095, "language_loss": 0.79972327, "learning_rate": 1.5809262105720416e-06, "loss": 0.82161665, "num_input_tokens_seen": 104075455, "step": 4825, "time_per_iteration": 2.543463945388794 }, { "auxiliary_loss_clip": 0.01175541, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.05400872, "balance_loss_mlp": 1.02417207, "epoch": 0.580292190224253, "flos": 20375966355840.0, "grad_norm": 1.508661381725362, "language_loss": 0.79546809, "learning_rate": 1.5801645633236644e-06, "loss": 0.81753707, "num_input_tokens_seen": 104096440, "step": 4826, "time_per_iteration": 2.461289167404175 }, { "auxiliary_loss_clip": 0.01142754, "auxiliary_loss_mlp": 0.01030793, "balance_loss_clip": 1.04756069, "balance_loss_mlp": 1.02289271, "epoch": 0.5804124331148921, "flos": 26615373304320.0, "grad_norm": 1.9444833460267235, "language_loss": 0.77387327, "learning_rate": 1.579402979763989e-06, "loss": 0.79560876, "num_input_tokens_seen": 104116775, "step": 4827, "time_per_iteration": 2.5271782875061035 }, { "auxiliary_loss_clip": 0.01121406, "auxiliary_loss_mlp": 0.01024271, "balance_loss_clip": 1.05055141, "balance_loss_mlp": 1.0168587, "epoch": 0.5805326760055312, "flos": 13478496289920.0, "grad_norm": 2.1982238170008617, "language_loss": 0.81359017, "learning_rate": 1.578641460008548e-06, "loss": 0.83504689, "num_input_tokens_seen": 104134510, "step": 4828, "time_per_iteration": 2.5493481159210205 }, { "auxiliary_loss_clip": 0.01162447, "auxiliary_loss_mlp": 0.01026366, "balance_loss_clip": 1.05247688, "balance_loss_mlp": 1.01814032, "epoch": 0.5806529188961702, "flos": 12091374823680.0, "grad_norm": 2.1794946307205882, "language_loss": 0.67927456, "learning_rate": 1.5778800041728613e-06, "loss": 0.70116264, "num_input_tokens_seen": 104150800, "step": 4829, "time_per_iteration": 2.423433303833008 }, { "auxiliary_loss_clip": 0.01159302, "auxiliary_loss_mlp": 0.01021144, "balance_loss_clip": 1.05304146, "balance_loss_mlp": 1.01371145, "epoch": 0.5807731617868094, "flos": 26214107495040.0, "grad_norm": 1.4959524687369703, "language_loss": 0.66090465, "learning_rate": 1.577118612372443e-06, "loss": 0.6827091, "num_input_tokens_seen": 104172640, "step": 4830, "time_per_iteration": 2.5437004566192627 }, { "auxiliary_loss_clip": 0.01142239, "auxiliary_loss_mlp": 0.00762631, "balance_loss_clip": 1.04601967, "balance_loss_mlp": 1.0005002, "epoch": 0.5808934046774484, "flos": 37962139190400.0, "grad_norm": 1.6426443878281551, "language_loss": 0.70309186, "learning_rate": 1.5763572847227943e-06, "loss": 0.72214061, "num_input_tokens_seen": 104193525, "step": 4831, "time_per_iteration": 2.665447950363159 }, { "auxiliary_loss_clip": 0.01160254, "auxiliary_loss_mlp": 0.01025853, "balance_loss_clip": 1.0496273, "balance_loss_mlp": 1.01862288, "epoch": 0.5810136475680875, "flos": 20485853038080.0, "grad_norm": 1.9382805648751418, "language_loss": 0.81237346, "learning_rate": 1.5755960213394091e-06, "loss": 0.8342346, "num_input_tokens_seen": 104210625, "step": 4832, "time_per_iteration": 2.4658761024475098 }, { "auxiliary_loss_clip": 0.01136401, "auxiliary_loss_mlp": 0.01027521, "balance_loss_clip": 1.04745483, "balance_loss_mlp": 1.02035367, "epoch": 0.5811338904587267, "flos": 17530153574400.0, "grad_norm": 1.7978367172209813, "language_loss": 0.78827655, "learning_rate": 1.5748348223377703e-06, "loss": 0.80991578, "num_input_tokens_seen": 104228180, "step": 4833, "time_per_iteration": 2.5076098442077637 }, { "auxiliary_loss_clip": 0.01146458, "auxiliary_loss_mlp": 0.01027632, "balance_loss_clip": 1.05074251, "balance_loss_mlp": 1.02035737, "epoch": 0.5812541333493657, "flos": 19458017360640.0, "grad_norm": 1.6187735851716132, "language_loss": 0.77801555, "learning_rate": 1.5740736878333507e-06, "loss": 0.79975647, "num_input_tokens_seen": 104246020, "step": 4834, "time_per_iteration": 2.4869067668914795 }, { "auxiliary_loss_clip": 0.01152377, "auxiliary_loss_mlp": 0.01024147, "balance_loss_clip": 1.05032754, "balance_loss_mlp": 1.01639867, "epoch": 0.5813743762400048, "flos": 20594949621120.0, "grad_norm": 2.767879829741655, "language_loss": 0.77816725, "learning_rate": 1.5733126179416143e-06, "loss": 0.79993248, "num_input_tokens_seen": 104260505, "step": 4835, "time_per_iteration": 2.5152225494384766 }, { "auxiliary_loss_clip": 0.0116264, "auxiliary_loss_mlp": 0.01025491, "balance_loss_clip": 1.05145264, "balance_loss_mlp": 1.01820779, "epoch": 0.5814946191306439, "flos": 33178227246720.0, "grad_norm": 2.749408754252559, "language_loss": 0.72580385, "learning_rate": 1.5725516127780137e-06, "loss": 0.74768519, "num_input_tokens_seen": 104282640, "step": 4836, "time_per_iteration": 2.57684326171875 }, { "auxiliary_loss_clip": 0.01168099, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.04977918, "balance_loss_mlp": 1.0219171, "epoch": 0.581614862021283, "flos": 16143283503360.0, "grad_norm": 2.068136660915475, "language_loss": 0.88011634, "learning_rate": 1.5717906724579943e-06, "loss": 0.90209925, "num_input_tokens_seen": 104299700, "step": 4837, "time_per_iteration": 2.4465553760528564 }, { "auxiliary_loss_clip": 0.01144227, "auxiliary_loss_mlp": 0.01024046, "balance_loss_clip": 1.05093455, "balance_loss_mlp": 1.01672626, "epoch": 0.581735104911922, "flos": 33802642298880.0, "grad_norm": 2.022580939214022, "language_loss": 0.68180704, "learning_rate": 1.571029797096989e-06, "loss": 0.70348978, "num_input_tokens_seen": 104320805, "step": 4838, "time_per_iteration": 2.6637215614318848 }, { "auxiliary_loss_clip": 0.0117482, "auxiliary_loss_mlp": 0.01029864, "balance_loss_clip": 1.05176306, "balance_loss_mlp": 1.02255654, "epoch": 0.5818553478025612, "flos": 23331163029120.0, "grad_norm": 1.7913687619389325, "language_loss": 0.78718591, "learning_rate": 1.570268986810423e-06, "loss": 0.80923277, "num_input_tokens_seen": 104340700, "step": 4839, "time_per_iteration": 2.4550280570983887 }, { "auxiliary_loss_clip": 0.01145935, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 1.04870248, "balance_loss_mlp": 1.01996565, "epoch": 0.5819755906932003, "flos": 20996143603200.0, "grad_norm": 1.985929185702165, "language_loss": 0.74562323, "learning_rate": 1.5695082417137096e-06, "loss": 0.76735002, "num_input_tokens_seen": 104358575, "step": 4840, "time_per_iteration": 2.4990272521972656 }, { "auxiliary_loss_clip": 0.01144232, "auxiliary_loss_mlp": 0.01025733, "balance_loss_clip": 1.04634285, "balance_loss_mlp": 1.01869965, "epoch": 0.5820958335838393, "flos": 21431668008960.0, "grad_norm": 1.720241073364023, "language_loss": 0.75308704, "learning_rate": 1.5687475619222539e-06, "loss": 0.77478671, "num_input_tokens_seen": 104378530, "step": 4841, "time_per_iteration": 2.547492504119873 }, { "auxiliary_loss_clip": 0.01141072, "auxiliary_loss_mlp": 0.01025842, "balance_loss_clip": 1.04540944, "balance_loss_mlp": 1.01801324, "epoch": 0.5822160764744785, "flos": 17967473660160.0, "grad_norm": 2.3256871300788577, "language_loss": 0.73508853, "learning_rate": 1.5679869475514496e-06, "loss": 0.75675774, "num_input_tokens_seen": 104395465, "step": 4842, "time_per_iteration": 3.2962448596954346 }, { "auxiliary_loss_clip": 0.0116497, "auxiliary_loss_mlp": 0.01030106, "balance_loss_clip": 1.05276275, "balance_loss_mlp": 1.02196956, "epoch": 0.5823363193651175, "flos": 23033858158080.0, "grad_norm": 2.1838748952135205, "language_loss": 0.81499696, "learning_rate": 1.567226398716682e-06, "loss": 0.83694774, "num_input_tokens_seen": 104415380, "step": 4843, "time_per_iteration": 2.5237386226654053 }, { "auxiliary_loss_clip": 0.01154551, "auxiliary_loss_mlp": 0.01023428, "balance_loss_clip": 1.04970193, "balance_loss_mlp": 1.01489604, "epoch": 0.5824565622557566, "flos": 32891840110080.0, "grad_norm": 1.8471673957133083, "language_loss": 0.6191833, "learning_rate": 1.566465915533326e-06, "loss": 0.64096314, "num_input_tokens_seen": 104437410, "step": 4844, "time_per_iteration": 2.6162548065185547 }, { "auxiliary_loss_clip": 0.01160571, "auxiliary_loss_mlp": 0.01025233, "balance_loss_clip": 1.0516479, "balance_loss_mlp": 1.01764512, "epoch": 0.5825768051463958, "flos": 22229674513920.0, "grad_norm": 1.9538527415128801, "language_loss": 0.88267934, "learning_rate": 1.5657054981167458e-06, "loss": 0.90453744, "num_input_tokens_seen": 104456305, "step": 4845, "time_per_iteration": 3.3563575744628906 }, { "auxiliary_loss_clip": 0.01159165, "auxiliary_loss_mlp": 0.0102485, "balance_loss_clip": 1.0512867, "balance_loss_mlp": 1.01805472, "epoch": 0.5826970480370348, "flos": 28001561016960.0, "grad_norm": 2.000017001513208, "language_loss": 0.67855966, "learning_rate": 1.5649451465822965e-06, "loss": 0.70039982, "num_input_tokens_seen": 104477695, "step": 4846, "time_per_iteration": 2.5291800498962402 }, { "auxiliary_loss_clip": 0.01119701, "auxiliary_loss_mlp": 0.01028635, "balance_loss_clip": 1.04843366, "balance_loss_mlp": 1.02113056, "epoch": 0.5828172909276739, "flos": 17858053854720.0, "grad_norm": 1.7766887218450944, "language_loss": 0.83740079, "learning_rate": 1.5641848610453218e-06, "loss": 0.85888422, "num_input_tokens_seen": 104496355, "step": 4847, "time_per_iteration": 2.5439999103546143 }, { "auxiliary_loss_clip": 0.01159178, "auxiliary_loss_mlp": 0.01025862, "balance_loss_clip": 1.05213666, "balance_loss_mlp": 1.01814342, "epoch": 0.582937533818313, "flos": 19865244827520.0, "grad_norm": 2.107255172201198, "language_loss": 0.86087084, "learning_rate": 1.563424641621158e-06, "loss": 0.8827213, "num_input_tokens_seen": 104515535, "step": 4848, "time_per_iteration": 3.325834274291992 }, { "auxiliary_loss_clip": 0.01152505, "auxiliary_loss_mlp": 0.01025521, "balance_loss_clip": 1.04987502, "balance_loss_mlp": 1.01762342, "epoch": 0.5830577767089521, "flos": 26870734068480.0, "grad_norm": 1.94068994315908, "language_loss": 0.69884402, "learning_rate": 1.5626644884251282e-06, "loss": 0.72062427, "num_input_tokens_seen": 104535055, "step": 4849, "time_per_iteration": 2.5377447605133057 }, { "auxiliary_loss_clip": 0.01176486, "auxiliary_loss_mlp": 0.01023693, "balance_loss_clip": 1.05314493, "balance_loss_mlp": 1.01675177, "epoch": 0.5831780195995911, "flos": 25298205575040.0, "grad_norm": 1.5909733309391125, "language_loss": 0.88058019, "learning_rate": 1.5619044015725488e-06, "loss": 0.90258199, "num_input_tokens_seen": 104554745, "step": 4850, "time_per_iteration": 2.465125322341919 }, { "auxiliary_loss_clip": 0.01184149, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.05670619, "balance_loss_mlp": 1.02236032, "epoch": 0.5832982624902303, "flos": 14756988049920.0, "grad_norm": 3.1330635107007696, "language_loss": 0.86991805, "learning_rate": 1.5611443811787224e-06, "loss": 0.89206892, "num_input_tokens_seen": 104568870, "step": 4851, "time_per_iteration": 3.156158924102783 }, { "auxiliary_loss_clip": 0.01160862, "auxiliary_loss_mlp": 0.01023479, "balance_loss_clip": 1.05103397, "balance_loss_mlp": 1.01597786, "epoch": 0.5834185053808694, "flos": 20444555376000.0, "grad_norm": 2.2519021943582413, "language_loss": 0.69193029, "learning_rate": 1.560384427358945e-06, "loss": 0.71377373, "num_input_tokens_seen": 104588415, "step": 4852, "time_per_iteration": 2.4471395015716553 }, { "auxiliary_loss_clip": 0.01140894, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.04445791, "balance_loss_mlp": 1.02146542, "epoch": 0.5835387482715084, "flos": 27200394115200.0, "grad_norm": 2.463433953067411, "language_loss": 0.73123449, "learning_rate": 1.5596245402284998e-06, "loss": 0.75293338, "num_input_tokens_seen": 104611940, "step": 4853, "time_per_iteration": 2.5585215091705322 }, { "auxiliary_loss_clip": 0.01166485, "auxiliary_loss_mlp": 0.01027264, "balance_loss_clip": 1.05385637, "balance_loss_mlp": 1.01940215, "epoch": 0.5836589911621476, "flos": 16654615562880.0, "grad_norm": 1.6673030764860544, "language_loss": 0.8178246, "learning_rate": 1.5588647199026619e-06, "loss": 0.83976197, "num_input_tokens_seen": 104629675, "step": 4854, "time_per_iteration": 2.441371440887451 }, { "auxiliary_loss_clip": 0.01183148, "auxiliary_loss_mlp": 0.01024559, "balance_loss_clip": 1.05733562, "balance_loss_mlp": 1.01665592, "epoch": 0.5837792340527866, "flos": 20446817932800.0, "grad_norm": 2.0831662708254886, "language_loss": 0.87713569, "learning_rate": 1.5581049664966956e-06, "loss": 0.89921272, "num_input_tokens_seen": 104647435, "step": 4855, "time_per_iteration": 2.4172589778900146 }, { "auxiliary_loss_clip": 0.01032293, "auxiliary_loss_mlp": 0.01003257, "balance_loss_clip": 1.02152658, "balance_loss_mlp": 1.00216007, "epoch": 0.5838994769434257, "flos": 65995480765440.0, "grad_norm": 0.9961906891607405, "language_loss": 0.65095389, "learning_rate": 1.5573452801258545e-06, "loss": 0.67130935, "num_input_tokens_seen": 104694605, "step": 4856, "time_per_iteration": 2.949613332748413 }, { "auxiliary_loss_clip": 0.01168817, "auxiliary_loss_mlp": 0.01034306, "balance_loss_clip": 1.05254018, "balance_loss_mlp": 1.02656889, "epoch": 0.5840197198340649, "flos": 21470523546240.0, "grad_norm": 2.360438211936364, "language_loss": 0.63235027, "learning_rate": 1.5565856609053824e-06, "loss": 0.65438151, "num_input_tokens_seen": 104713400, "step": 4857, "time_per_iteration": 2.4557480812072754 }, { "auxiliary_loss_clip": 0.01177818, "auxiliary_loss_mlp": 0.01025229, "balance_loss_clip": 1.05423403, "balance_loss_mlp": 1.01728415, "epoch": 0.5841399627247039, "flos": 19135144984320.0, "grad_norm": 2.0485681925859223, "language_loss": 0.80327988, "learning_rate": 1.5558261089505127e-06, "loss": 0.82531041, "num_input_tokens_seen": 104732130, "step": 4858, "time_per_iteration": 2.3986682891845703 }, { "auxiliary_loss_clip": 0.01164291, "auxiliary_loss_mlp": 0.01026393, "balance_loss_clip": 1.05311632, "balance_loss_mlp": 1.01891589, "epoch": 0.584260205615343, "flos": 26425692558720.0, "grad_norm": 2.6940012537396596, "language_loss": 0.80158663, "learning_rate": 1.5550666243764697e-06, "loss": 0.82349348, "num_input_tokens_seen": 104750290, "step": 4859, "time_per_iteration": 2.508812427520752 }, { "auxiliary_loss_clip": 0.01162938, "auxiliary_loss_mlp": 0.01029103, "balance_loss_clip": 1.05108821, "balance_loss_mlp": 1.02162242, "epoch": 0.584380448505982, "flos": 13881809174400.0, "grad_norm": 2.038169763437529, "language_loss": 0.77519751, "learning_rate": 1.554307207298465e-06, "loss": 0.79711789, "num_input_tokens_seen": 104768550, "step": 4860, "time_per_iteration": 2.437427043914795 }, { "auxiliary_loss_clip": 0.01182664, "auxiliary_loss_mlp": 0.01031477, "balance_loss_clip": 1.05650914, "balance_loss_mlp": 1.02346015, "epoch": 0.5845006913966212, "flos": 21543709507200.0, "grad_norm": 1.860142245788919, "language_loss": 0.78734183, "learning_rate": 1.553547857831704e-06, "loss": 0.80948329, "num_input_tokens_seen": 104785060, "step": 4861, "time_per_iteration": 2.440162181854248 }, { "auxiliary_loss_clip": 0.0108054, "auxiliary_loss_mlp": 0.01000403, "balance_loss_clip": 1.02137733, "balance_loss_mlp": 0.99944383, "epoch": 0.5846209342872603, "flos": 58375452712320.0, "grad_norm": 0.8851175172939759, "language_loss": 0.64188695, "learning_rate": 1.5527885760913771e-06, "loss": 0.66269636, "num_input_tokens_seen": 104834950, "step": 4862, "time_per_iteration": 2.851132869720459 }, { "auxiliary_loss_clip": 0.01147899, "auxiliary_loss_mlp": 0.01026378, "balance_loss_clip": 1.05136418, "balance_loss_mlp": 1.01900458, "epoch": 0.5847411771778993, "flos": 18588045957120.0, "grad_norm": 1.5871219602991582, "language_loss": 0.76283348, "learning_rate": 1.552029362192668e-06, "loss": 0.78457618, "num_input_tokens_seen": 104854210, "step": 4863, "time_per_iteration": 2.48380184173584 }, { "auxiliary_loss_clip": 0.01129701, "auxiliary_loss_mlp": 0.01031475, "balance_loss_clip": 1.04677558, "balance_loss_mlp": 1.02391434, "epoch": 0.5848614200685385, "flos": 24240780069120.0, "grad_norm": 1.8966944328291402, "language_loss": 0.72216862, "learning_rate": 1.5512702162507478e-06, "loss": 0.74378037, "num_input_tokens_seen": 104874525, "step": 4864, "time_per_iteration": 2.5531554222106934 }, { "auxiliary_loss_clip": 0.01059597, "auxiliary_loss_mlp": 0.01001672, "balance_loss_clip": 1.02088821, "balance_loss_mlp": 1.00066471, "epoch": 0.5849816629591775, "flos": 71660245933440.0, "grad_norm": 1.1317534729688938, "language_loss": 0.55839682, "learning_rate": 1.5505111383807792e-06, "loss": 0.57900953, "num_input_tokens_seen": 104937195, "step": 4865, "time_per_iteration": 3.136577606201172 }, { "auxiliary_loss_clip": 0.01123058, "auxiliary_loss_mlp": 0.01024534, "balance_loss_clip": 1.0455687, "balance_loss_mlp": 1.01750982, "epoch": 0.5851019058498166, "flos": 23802095266560.0, "grad_norm": 1.7223347079823759, "language_loss": 0.80646664, "learning_rate": 1.5497521286979138e-06, "loss": 0.82794255, "num_input_tokens_seen": 104957435, "step": 4866, "time_per_iteration": 2.5836257934570312 }, { "auxiliary_loss_clip": 0.01138844, "auxiliary_loss_mlp": 0.01029247, "balance_loss_clip": 1.04933167, "balance_loss_mlp": 1.02107573, "epoch": 0.5852221487404557, "flos": 24388516707840.0, "grad_norm": 1.9035069135276117, "language_loss": 0.74253416, "learning_rate": 1.5489931873172927e-06, "loss": 0.76421505, "num_input_tokens_seen": 104978755, "step": 4867, "time_per_iteration": 2.568427085876465 }, { "auxiliary_loss_clip": 0.01087025, "auxiliary_loss_mlp": 0.01028749, "balance_loss_clip": 1.03933191, "balance_loss_mlp": 1.02133465, "epoch": 0.5853423916310948, "flos": 27271425260160.0, "grad_norm": 1.6779308275106, "language_loss": 0.79110068, "learning_rate": 1.5482343143540467e-06, "loss": 0.81225848, "num_input_tokens_seen": 105000020, "step": 4868, "time_per_iteration": 2.625805139541626 }, { "auxiliary_loss_clip": 0.01135247, "auxiliary_loss_mlp": 0.00761814, "balance_loss_clip": 1.04805756, "balance_loss_mlp": 1.00038552, "epoch": 0.5854626345217339, "flos": 11983786611840.0, "grad_norm": 1.926357160625032, "language_loss": 0.8255769, "learning_rate": 1.547475509923295e-06, "loss": 0.84454751, "num_input_tokens_seen": 105017060, "step": 4869, "time_per_iteration": 3.3666861057281494 }, { "auxiliary_loss_clip": 0.01040175, "auxiliary_loss_mlp": 0.01002552, "balance_loss_clip": 1.01994157, "balance_loss_mlp": 1.0015744, "epoch": 0.585582877412373, "flos": 64342335173760.0, "grad_norm": 0.7328504637338263, "language_loss": 0.56075072, "learning_rate": 1.5467167741401495e-06, "loss": 0.58117795, "num_input_tokens_seen": 105078540, "step": 4870, "time_per_iteration": 3.132795572280884 }, { "auxiliary_loss_clip": 0.01143404, "auxiliary_loss_mlp": 0.01028324, "balance_loss_clip": 1.04508471, "balance_loss_mlp": 1.02055144, "epoch": 0.5857031203030121, "flos": 17011926103680.0, "grad_norm": 2.673943306379921, "language_loss": 0.71054357, "learning_rate": 1.5459581071197083e-06, "loss": 0.73226088, "num_input_tokens_seen": 105094200, "step": 4871, "time_per_iteration": 2.476003646850586 }, { "auxiliary_loss_clip": 0.01168802, "auxiliary_loss_mlp": 0.01022287, "balance_loss_clip": 1.05585456, "balance_loss_mlp": 1.01464605, "epoch": 0.5858233631936511, "flos": 20885682303360.0, "grad_norm": 2.0639407047433993, "language_loss": 0.83156538, "learning_rate": 1.5451995089770624e-06, "loss": 0.85347629, "num_input_tokens_seen": 105113985, "step": 4872, "time_per_iteration": 3.325416088104248 }, { "auxiliary_loss_clip": 0.01175918, "auxiliary_loss_mlp": 0.01024246, "balance_loss_clip": 1.05331874, "balance_loss_mlp": 1.01719499, "epoch": 0.5859436060842903, "flos": 23191902000000.0, "grad_norm": 9.177022466088836, "language_loss": 0.7179935, "learning_rate": 1.5444409798272885e-06, "loss": 0.73999512, "num_input_tokens_seen": 105138075, "step": 4873, "time_per_iteration": 2.550305128097534 }, { "auxiliary_loss_clip": 0.01136295, "auxiliary_loss_mlp": 0.01031032, "balance_loss_clip": 1.04809999, "balance_loss_mlp": 1.02355802, "epoch": 0.5860638489749294, "flos": 22492648961280.0, "grad_norm": 2.7690078253871917, "language_loss": 0.80556107, "learning_rate": 1.543682519785456e-06, "loss": 0.82723439, "num_input_tokens_seen": 105156555, "step": 4874, "time_per_iteration": 2.590604066848755 }, { "auxiliary_loss_clip": 0.01148519, "auxiliary_loss_mlp": 0.01029442, "balance_loss_clip": 1.05019605, "balance_loss_mlp": 1.02231646, "epoch": 0.5861840918655684, "flos": 17566243764480.0, "grad_norm": 2.4416744322458652, "language_loss": 0.80483055, "learning_rate": 1.5429241289666219e-06, "loss": 0.82661021, "num_input_tokens_seen": 105174055, "step": 4875, "time_per_iteration": 3.315791130065918 }, { "auxiliary_loss_clip": 0.01140726, "auxiliary_loss_mlp": 0.01026856, "balance_loss_clip": 1.04901659, "balance_loss_mlp": 1.01955175, "epoch": 0.5863043347562076, "flos": 25556152118400.0, "grad_norm": 1.9229775416406538, "language_loss": 0.69987988, "learning_rate": 1.5421658074858342e-06, "loss": 0.72155571, "num_input_tokens_seen": 105192160, "step": 4876, "time_per_iteration": 2.535771131515503 }, { "auxiliary_loss_clip": 0.01145201, "auxiliary_loss_mlp": 0.01029946, "balance_loss_clip": 1.05000031, "balance_loss_mlp": 1.02225113, "epoch": 0.5864245776468466, "flos": 20667525050880.0, "grad_norm": 2.7927631739534875, "language_loss": 0.66187465, "learning_rate": 1.5414075554581298e-06, "loss": 0.68362617, "num_input_tokens_seen": 105210205, "step": 4877, "time_per_iteration": 3.263946771621704 }, { "auxiliary_loss_clip": 0.01178424, "auxiliary_loss_mlp": 0.01027163, "balance_loss_clip": 1.05255139, "balance_loss_mlp": 1.01991558, "epoch": 0.5865448205374857, "flos": 28913907490560.0, "grad_norm": 2.465698080662706, "language_loss": 0.78675997, "learning_rate": 1.5406493729985348e-06, "loss": 0.80881584, "num_input_tokens_seen": 105229400, "step": 4878, "time_per_iteration": 2.476013422012329 }, { "auxiliary_loss_clip": 0.01124357, "auxiliary_loss_mlp": 0.00762172, "balance_loss_clip": 1.04953027, "balance_loss_mlp": 1.00028741, "epoch": 0.5866650634281249, "flos": 25842575168640.0, "grad_norm": 2.697156231673269, "language_loss": 0.7248143, "learning_rate": 1.5398912602220644e-06, "loss": 0.74367952, "num_input_tokens_seen": 105248675, "step": 4879, "time_per_iteration": 2.583902359008789 }, { "auxiliary_loss_clip": 0.01133181, "auxiliary_loss_mlp": 0.01027374, "balance_loss_clip": 1.0483495, "balance_loss_mlp": 1.01992297, "epoch": 0.5867853063187639, "flos": 17052325925760.0, "grad_norm": 2.2094827278864617, "language_loss": 0.78712022, "learning_rate": 1.539133217243724e-06, "loss": 0.80872571, "num_input_tokens_seen": 105265695, "step": 4880, "time_per_iteration": 2.518782615661621 }, { "auxiliary_loss_clip": 0.01139508, "auxiliary_loss_mlp": 0.01027867, "balance_loss_clip": 1.04864419, "balance_loss_mlp": 1.01960599, "epoch": 0.586905549209403, "flos": 24645026707200.0, "grad_norm": 2.0988796510168375, "language_loss": 0.76015568, "learning_rate": 1.5383752441785081e-06, "loss": 0.78182948, "num_input_tokens_seen": 105284920, "step": 4881, "time_per_iteration": 2.5902581214904785 }, { "auxiliary_loss_clip": 0.01167349, "auxiliary_loss_mlp": 0.01035072, "balance_loss_clip": 1.05216777, "balance_loss_mlp": 1.02748466, "epoch": 0.5870257921000421, "flos": 14720538723840.0, "grad_norm": 2.12705509228889, "language_loss": 0.85114139, "learning_rate": 1.5376173411414003e-06, "loss": 0.87316561, "num_input_tokens_seen": 105302960, "step": 4882, "time_per_iteration": 2.4419870376586914 }, { "auxiliary_loss_clip": 0.01148606, "auxiliary_loss_mlp": 0.01030796, "balance_loss_clip": 1.04713047, "balance_loss_mlp": 1.02263618, "epoch": 0.5871460349906812, "flos": 23914998691200.0, "grad_norm": 2.286116162266255, "language_loss": 0.78641808, "learning_rate": 1.5368595082473753e-06, "loss": 0.80821216, "num_input_tokens_seen": 105321260, "step": 4883, "time_per_iteration": 2.5248355865478516 }, { "auxiliary_loss_clip": 0.01163735, "auxiliary_loss_mlp": 0.01020461, "balance_loss_clip": 1.0496695, "balance_loss_mlp": 1.01328456, "epoch": 0.5872662778813202, "flos": 22164174063360.0, "grad_norm": 1.6252387607282661, "language_loss": 0.77787477, "learning_rate": 1.5361017456113935e-06, "loss": 0.79971671, "num_input_tokens_seen": 105341610, "step": 4884, "time_per_iteration": 2.4605295658111572 }, { "auxiliary_loss_clip": 0.01165495, "auxiliary_loss_mlp": 0.01027055, "balance_loss_clip": 1.05140829, "balance_loss_mlp": 1.01914513, "epoch": 0.5873865207719594, "flos": 18441925430400.0, "grad_norm": 1.9039322470754632, "language_loss": 0.86321974, "learning_rate": 1.5353440533484085e-06, "loss": 0.88514519, "num_input_tokens_seen": 105360465, "step": 4885, "time_per_iteration": 2.44521427154541 }, { "auxiliary_loss_clip": 0.01151262, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 1.05050683, "balance_loss_mlp": 1.02125978, "epoch": 0.5875067636625985, "flos": 54015321427200.0, "grad_norm": 1.8363400797070706, "language_loss": 0.65927649, "learning_rate": 1.534586431573361e-06, "loss": 0.6810807, "num_input_tokens_seen": 105385405, "step": 4886, "time_per_iteration": 2.792919397354126 }, { "auxiliary_loss_clip": 0.01106488, "auxiliary_loss_mlp": 0.01025683, "balance_loss_clip": 1.04269648, "balance_loss_mlp": 1.01675391, "epoch": 0.5876270065532375, "flos": 27995707100160.0, "grad_norm": 1.8618837842434977, "language_loss": 0.78928101, "learning_rate": 1.5338288804011817e-06, "loss": 0.81060272, "num_input_tokens_seen": 105404905, "step": 4887, "time_per_iteration": 2.654822826385498 }, { "auxiliary_loss_clip": 0.0114405, "auxiliary_loss_mlp": 0.01030356, "balance_loss_clip": 1.04710913, "balance_loss_mlp": 1.02238119, "epoch": 0.5877472494438767, "flos": 21361462876800.0, "grad_norm": 1.862960841958464, "language_loss": 0.71203065, "learning_rate": 1.533071399946791e-06, "loss": 0.73377466, "num_input_tokens_seen": 105423650, "step": 4888, "time_per_iteration": 2.4955098628997803 }, { "auxiliary_loss_clip": 0.01150916, "auxiliary_loss_mlp": 0.01026224, "balance_loss_clip": 1.04857135, "balance_loss_mlp": 1.01899385, "epoch": 0.5878674923345157, "flos": 22383013674240.0, "grad_norm": 2.0136479059696826, "language_loss": 0.57435399, "learning_rate": 1.5323139903250977e-06, "loss": 0.59612542, "num_input_tokens_seen": 105444255, "step": 4889, "time_per_iteration": 2.533618688583374 }, { "auxiliary_loss_clip": 0.01151524, "auxiliary_loss_mlp": 0.01024734, "balance_loss_clip": 1.05260658, "balance_loss_mlp": 1.01745296, "epoch": 0.5879877352251548, "flos": 21868664872320.0, "grad_norm": 1.7949150722762295, "language_loss": 0.77045381, "learning_rate": 1.5315566516510002e-06, "loss": 0.79221642, "num_input_tokens_seen": 105462425, "step": 4890, "time_per_iteration": 2.495255708694458 }, { "auxiliary_loss_clip": 0.01177807, "auxiliary_loss_mlp": 0.01027324, "balance_loss_clip": 1.05443501, "balance_loss_mlp": 1.01940525, "epoch": 0.5881079781157939, "flos": 17493811989120.0, "grad_norm": 1.7239018469422087, "language_loss": 0.67731953, "learning_rate": 1.5307993840393857e-06, "loss": 0.69937086, "num_input_tokens_seen": 105480505, "step": 4891, "time_per_iteration": 2.4426653385162354 }, { "auxiliary_loss_clip": 0.0117616, "auxiliary_loss_mlp": 0.0102398, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.01665497, "epoch": 0.588228221006433, "flos": 22601853285120.0, "grad_norm": 1.846636804355916, "language_loss": 0.80427814, "learning_rate": 1.530042187605132e-06, "loss": 0.82627952, "num_input_tokens_seen": 105499760, "step": 4892, "time_per_iteration": 2.4386227130889893 }, { "auxiliary_loss_clip": 0.01162774, "auxiliary_loss_mlp": 0.00761443, "balance_loss_clip": 1.05201054, "balance_loss_mlp": 1.00024509, "epoch": 0.5883484638970721, "flos": 26176939896960.0, "grad_norm": 1.3633597318893786, "language_loss": 0.84165776, "learning_rate": 1.5292850624631044e-06, "loss": 0.86089993, "num_input_tokens_seen": 105521955, "step": 4893, "time_per_iteration": 2.5085768699645996 }, { "auxiliary_loss_clip": 0.01160156, "auxiliary_loss_mlp": 0.01028469, "balance_loss_clip": 1.05295575, "balance_loss_mlp": 1.02048826, "epoch": 0.5884687067877111, "flos": 30443737691520.0, "grad_norm": 1.909389229039017, "language_loss": 0.80411786, "learning_rate": 1.5285280087281593e-06, "loss": 0.82600415, "num_input_tokens_seen": 105542685, "step": 4894, "time_per_iteration": 2.5591704845428467 }, { "auxiliary_loss_clip": 0.01061128, "auxiliary_loss_mlp": 0.01001673, "balance_loss_clip": 1.02113819, "balance_loss_mlp": 1.0007019, "epoch": 0.5885889496783503, "flos": 70507550580480.0, "grad_norm": 0.6507384048749758, "language_loss": 0.56575453, "learning_rate": 1.5277710265151398e-06, "loss": 0.58638263, "num_input_tokens_seen": 105612165, "step": 4895, "time_per_iteration": 3.230508804321289 }, { "auxiliary_loss_clip": 0.0116331, "auxiliary_loss_mlp": 0.01023341, "balance_loss_clip": 1.05184925, "balance_loss_mlp": 1.01520443, "epoch": 0.5887091925689893, "flos": 19098767485440.0, "grad_norm": 2.3995892345281353, "language_loss": 0.77127254, "learning_rate": 1.5270141159388803e-06, "loss": 0.7931391, "num_input_tokens_seen": 105629185, "step": 4896, "time_per_iteration": 3.265119791030884 }, { "auxiliary_loss_clip": 0.01175512, "auxiliary_loss_mlp": 0.010242, "balance_loss_clip": 1.0510962, "balance_loss_mlp": 1.01646328, "epoch": 0.5888294354596284, "flos": 23294282739840.0, "grad_norm": 1.6262157937744788, "language_loss": 0.80495906, "learning_rate": 1.526257277114203e-06, "loss": 0.82695621, "num_input_tokens_seen": 105650260, "step": 4897, "time_per_iteration": 2.4233932495117188 }, { "auxiliary_loss_clip": 0.01143025, "auxiliary_loss_mlp": 0.01024012, "balance_loss_clip": 1.04937696, "balance_loss_mlp": 1.01666224, "epoch": 0.5889496783502676, "flos": 21981532383360.0, "grad_norm": 1.8312968996154904, "language_loss": 0.79475182, "learning_rate": 1.5255005101559201e-06, "loss": 0.81642222, "num_input_tokens_seen": 105667870, "step": 4898, "time_per_iteration": 2.489795684814453 }, { "auxiliary_loss_clip": 0.01166231, "auxiliary_loss_mlp": 0.01026366, "balance_loss_clip": 1.05195093, "balance_loss_mlp": 1.01932025, "epoch": 0.5890699212409066, "flos": 21685233093120.0, "grad_norm": 2.1078549330762324, "language_loss": 0.76870197, "learning_rate": 1.524743815178833e-06, "loss": 0.7906279, "num_input_tokens_seen": 105685830, "step": 4899, "time_per_iteration": 3.3152313232421875 }, { "auxiliary_loss_clip": 0.0114926, "auxiliary_loss_mlp": 0.01024229, "balance_loss_clip": 1.04817283, "balance_loss_mlp": 1.0168587, "epoch": 0.5891901641315457, "flos": 19464553635840.0, "grad_norm": 3.166271870474221, "language_loss": 0.8101716, "learning_rate": 1.5239871922977315e-06, "loss": 0.83190656, "num_input_tokens_seen": 105705745, "step": 4900, "time_per_iteration": 2.5020203590393066 }, { "auxiliary_loss_clip": 0.01145567, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.04646659, "balance_loss_mlp": 1.02446473, "epoch": 0.5893104070221848, "flos": 19609884063360.0, "grad_norm": 1.846445680777024, "language_loss": 0.8989383, "learning_rate": 1.523230641627394e-06, "loss": 0.92071652, "num_input_tokens_seen": 105724730, "step": 4901, "time_per_iteration": 3.3320086002349854 }, { "auxiliary_loss_clip": 0.01122301, "auxiliary_loss_mlp": 0.01026981, "balance_loss_clip": 1.04358888, "balance_loss_mlp": 1.01954246, "epoch": 0.5894306499128239, "flos": 29060063930880.0, "grad_norm": 1.9612376460470438, "language_loss": 0.72871953, "learning_rate": 1.5224741632825888e-06, "loss": 0.75021231, "num_input_tokens_seen": 105744920, "step": 4902, "time_per_iteration": 2.615467071533203 }, { "auxiliary_loss_clip": 0.01181289, "auxiliary_loss_mlp": 0.01024252, "balance_loss_clip": 1.05576873, "balance_loss_mlp": 1.01619053, "epoch": 0.589550892803463, "flos": 42298890721920.0, "grad_norm": 1.6752828781043458, "language_loss": 0.69171691, "learning_rate": 1.521717757378074e-06, "loss": 0.7137723, "num_input_tokens_seen": 105765465, "step": 4903, "time_per_iteration": 2.6308889389038086 }, { "auxiliary_loss_clip": 0.01168553, "auxiliary_loss_mlp": 0.01030049, "balance_loss_clip": 1.05305803, "balance_loss_mlp": 1.02199602, "epoch": 0.5896711356941021, "flos": 14137062197760.0, "grad_norm": 1.8413498162275888, "language_loss": 0.69518149, "learning_rate": 1.5209614240285943e-06, "loss": 0.7171675, "num_input_tokens_seen": 105783120, "step": 4904, "time_per_iteration": 3.2270171642303467 }, { "auxiliary_loss_clip": 0.01174072, "auxiliary_loss_mlp": 0.00761931, "balance_loss_clip": 1.04998147, "balance_loss_mlp": 1.00037193, "epoch": 0.5897913785847412, "flos": 17201355454080.0, "grad_norm": 2.593005582368182, "language_loss": 0.85191512, "learning_rate": 1.520205163348887e-06, "loss": 0.87127519, "num_input_tokens_seen": 105801055, "step": 4905, "time_per_iteration": 2.4040586948394775 }, { "auxiliary_loss_clip": 0.01051347, "auxiliary_loss_mlp": 0.010015, "balance_loss_clip": 1.01999092, "balance_loss_mlp": 1.0003916, "epoch": 0.5899116214753802, "flos": 48794164202880.0, "grad_norm": 0.7262631845310534, "language_loss": 0.57018745, "learning_rate": 1.519448975453674e-06, "loss": 0.590716, "num_input_tokens_seen": 105856155, "step": 4906, "time_per_iteration": 2.957052707672119 }, { "auxiliary_loss_clip": 0.01164717, "auxiliary_loss_mlp": 0.00762303, "balance_loss_clip": 1.05371881, "balance_loss_mlp": 1.00042319, "epoch": 0.5900318643660194, "flos": 21103659987840.0, "grad_norm": 2.197850812735842, "language_loss": 0.75720716, "learning_rate": 1.5186928604576696e-06, "loss": 0.77647734, "num_input_tokens_seen": 105873350, "step": 4907, "time_per_iteration": 2.457343101501465 }, { "auxiliary_loss_clip": 0.01147913, "auxiliary_loss_mlp": 0.01028378, "balance_loss_clip": 1.04785371, "balance_loss_mlp": 1.02092731, "epoch": 0.5901521072566585, "flos": 21178390233600.0, "grad_norm": 2.07140024042001, "language_loss": 0.76665926, "learning_rate": 1.5179368184755752e-06, "loss": 0.78842217, "num_input_tokens_seen": 105891435, "step": 4908, "time_per_iteration": 2.48539400100708 }, { "auxiliary_loss_clip": 0.01146949, "auxiliary_loss_mlp": 0.0102247, "balance_loss_clip": 1.05015516, "balance_loss_mlp": 1.01515639, "epoch": 0.5902723501472975, "flos": 20225967160320.0, "grad_norm": 1.5581074641132817, "language_loss": 0.82744849, "learning_rate": 1.5171808496220821e-06, "loss": 0.84914261, "num_input_tokens_seen": 105910190, "step": 4909, "time_per_iteration": 2.4843177795410156 }, { "auxiliary_loss_clip": 0.01153602, "auxiliary_loss_mlp": 0.01025007, "balance_loss_clip": 1.0498457, "balance_loss_mlp": 1.01782131, "epoch": 0.5903925930379367, "flos": 22964407211520.0, "grad_norm": 1.711347472887412, "language_loss": 0.81179714, "learning_rate": 1.5164249540118708e-06, "loss": 0.83358324, "num_input_tokens_seen": 105929315, "step": 4910, "time_per_iteration": 2.5136327743530273 }, { "auxiliary_loss_clip": 0.01108093, "auxiliary_loss_mlp": 0.01028028, "balance_loss_clip": 1.04377532, "balance_loss_mlp": 1.0202291, "epoch": 0.5905128359285757, "flos": 23367720096000.0, "grad_norm": 1.7194469707999254, "language_loss": 0.83124453, "learning_rate": 1.5156691317596093e-06, "loss": 0.8526057, "num_input_tokens_seen": 105950740, "step": 4911, "time_per_iteration": 2.5977957248687744 }, { "auxiliary_loss_clip": 0.01166619, "auxiliary_loss_mlp": 0.00762103, "balance_loss_clip": 1.05225146, "balance_loss_mlp": 1.00041032, "epoch": 0.5906330788192148, "flos": 28032335994240.0, "grad_norm": 2.351751233723427, "language_loss": 0.66862053, "learning_rate": 1.5149133829799556e-06, "loss": 0.6879077, "num_input_tokens_seen": 105968735, "step": 4912, "time_per_iteration": 2.5219383239746094 }, { "auxiliary_loss_clip": 0.01154809, "auxiliary_loss_mlp": 0.01034103, "balance_loss_clip": 1.04871333, "balance_loss_mlp": 1.02611589, "epoch": 0.590753321709854, "flos": 18477943793280.0, "grad_norm": 2.1764494683531366, "language_loss": 0.8045311, "learning_rate": 1.5141577077875556e-06, "loss": 0.82642019, "num_input_tokens_seen": 105986060, "step": 4913, "time_per_iteration": 2.460291624069214 }, { "auxiliary_loss_clip": 0.01166466, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 1.05300474, "balance_loss_mlp": 1.01912963, "epoch": 0.590873564600493, "flos": 16873706568960.0, "grad_norm": 1.7998377767036837, "language_loss": 0.72579432, "learning_rate": 1.5134021062970451e-06, "loss": 0.74772632, "num_input_tokens_seen": 106004440, "step": 4914, "time_per_iteration": 2.4363203048706055 }, { "auxiliary_loss_clip": 0.01130013, "auxiliary_loss_mlp": 0.01028509, "balance_loss_clip": 1.05197072, "balance_loss_mlp": 1.02078462, "epoch": 0.5909938074911321, "flos": 13516166678400.0, "grad_norm": 2.0512883741565373, "language_loss": 0.80964404, "learning_rate": 1.5126465786230483e-06, "loss": 0.83122927, "num_input_tokens_seen": 106021215, "step": 4915, "time_per_iteration": 2.478245258331299 }, { "auxiliary_loss_clip": 0.01175081, "auxiliary_loss_mlp": 0.01026926, "balance_loss_clip": 1.05142224, "balance_loss_mlp": 1.01912951, "epoch": 0.5911140503817712, "flos": 26024067613440.0, "grad_norm": 1.7527029326285188, "language_loss": 0.82141525, "learning_rate": 1.5118911248801787e-06, "loss": 0.84343535, "num_input_tokens_seen": 106039225, "step": 4916, "time_per_iteration": 2.444098949432373 }, { "auxiliary_loss_clip": 0.01159578, "auxiliary_loss_mlp": 0.01027247, "balance_loss_clip": 1.05018401, "balance_loss_mlp": 1.02036572, "epoch": 0.5912342932724103, "flos": 23258731253760.0, "grad_norm": 1.979301126389231, "language_loss": 0.79502845, "learning_rate": 1.5111357451830364e-06, "loss": 0.81689668, "num_input_tokens_seen": 106057920, "step": 4917, "time_per_iteration": 2.460336923599243 }, { "auxiliary_loss_clip": 0.01162486, "auxiliary_loss_mlp": 0.01027043, "balance_loss_clip": 1.05022955, "balance_loss_mlp": 1.01943111, "epoch": 0.5913545361630493, "flos": 19573039687680.0, "grad_norm": 1.9666009456757982, "language_loss": 0.71161103, "learning_rate": 1.5103804396462131e-06, "loss": 0.73350626, "num_input_tokens_seen": 106077855, "step": 4918, "time_per_iteration": 2.4605956077575684 }, { "auxiliary_loss_clip": 0.01165644, "auxiliary_loss_mlp": 0.01030637, "balance_loss_clip": 1.04921901, "balance_loss_mlp": 1.02252448, "epoch": 0.5914747790536885, "flos": 26213532877440.0, "grad_norm": 3.088505184687068, "language_loss": 0.79835898, "learning_rate": 1.5096252083842877e-06, "loss": 0.8203218, "num_input_tokens_seen": 106097065, "step": 4919, "time_per_iteration": 2.4927666187286377 }, { "auxiliary_loss_clip": 0.01158832, "auxiliary_loss_mlp": 0.01024323, "balance_loss_clip": 1.04779673, "balance_loss_mlp": 1.01669347, "epoch": 0.5915950219443276, "flos": 27417545786880.0, "grad_norm": 1.6976556706417651, "language_loss": 0.84898341, "learning_rate": 1.5088700515118285e-06, "loss": 0.87081504, "num_input_tokens_seen": 106116385, "step": 4920, "time_per_iteration": 2.503058910369873 }, { "auxiliary_loss_clip": 0.01130942, "auxiliary_loss_mlp": 0.01028448, "balance_loss_clip": 1.04954422, "balance_loss_mlp": 1.02028191, "epoch": 0.5917152648349666, "flos": 21907879545600.0, "grad_norm": 1.6698580532702783, "language_loss": 0.66167057, "learning_rate": 1.508114969143392e-06, "loss": 0.68326449, "num_input_tokens_seen": 106136370, "step": 4921, "time_per_iteration": 2.513958692550659 }, { "auxiliary_loss_clip": 0.01149561, "auxiliary_loss_mlp": 0.01029695, "balance_loss_clip": 1.04787827, "balance_loss_mlp": 1.0225482, "epoch": 0.5918355077256057, "flos": 28109185142400.0, "grad_norm": 1.783833279866674, "language_loss": 0.77339721, "learning_rate": 1.5073599613935238e-06, "loss": 0.7951898, "num_input_tokens_seen": 106158490, "step": 4922, "time_per_iteration": 3.415992259979248 }, { "auxiliary_loss_clip": 0.01149563, "auxiliary_loss_mlp": 0.01027678, "balance_loss_clip": 1.04928064, "balance_loss_mlp": 1.01992917, "epoch": 0.5919557506162448, "flos": 28183807647360.0, "grad_norm": 2.123727696147373, "language_loss": 0.57717514, "learning_rate": 1.5066050283767574e-06, "loss": 0.59894753, "num_input_tokens_seen": 106179170, "step": 4923, "time_per_iteration": 2.5488946437835693 }, { "auxiliary_loss_clip": 0.0114406, "auxiliary_loss_mlp": 0.01028344, "balance_loss_clip": 1.04959166, "balance_loss_mlp": 1.02095032, "epoch": 0.5920759935068839, "flos": 12094355652480.0, "grad_norm": 1.9618520512369764, "language_loss": 0.82455289, "learning_rate": 1.505850170207616e-06, "loss": 0.84627688, "num_input_tokens_seen": 106196035, "step": 4924, "time_per_iteration": 2.5083770751953125 }, { "auxiliary_loss_clip": 0.0114584, "auxiliary_loss_mlp": 0.01026718, "balance_loss_clip": 1.04682088, "balance_loss_mlp": 1.01937485, "epoch": 0.592196236397523, "flos": 29424772673280.0, "grad_norm": 2.189166603395096, "language_loss": 0.7800976, "learning_rate": 1.505095387000611e-06, "loss": 0.80182314, "num_input_tokens_seen": 106218335, "step": 4925, "time_per_iteration": 2.5520801544189453 }, { "auxiliary_loss_clip": 0.01138334, "auxiliary_loss_mlp": 0.01026449, "balance_loss_clip": 1.0482161, "balance_loss_mlp": 1.01918066, "epoch": 0.5923164792881621, "flos": 24384709866240.0, "grad_norm": 2.065637656285934, "language_loss": 0.74177647, "learning_rate": 1.504340678870242e-06, "loss": 0.76342428, "num_input_tokens_seen": 106236550, "step": 4926, "time_per_iteration": 3.3669991493225098 }, { "auxiliary_loss_clip": 0.01162038, "auxiliary_loss_mlp": 0.0103202, "balance_loss_clip": 1.05101717, "balance_loss_mlp": 1.02484989, "epoch": 0.5924367221788012, "flos": 24024238928640.0, "grad_norm": 1.9737458741265004, "language_loss": 0.89623493, "learning_rate": 1.5035860459309989e-06, "loss": 0.91817546, "num_input_tokens_seen": 106254265, "step": 4927, "time_per_iteration": 2.4698445796966553 }, { "auxiliary_loss_clip": 0.01143996, "auxiliary_loss_mlp": 0.01029548, "balance_loss_clip": 1.04748595, "balance_loss_mlp": 1.02116728, "epoch": 0.5925569650694402, "flos": 26870590414080.0, "grad_norm": 1.8080600223539682, "language_loss": 0.63587487, "learning_rate": 1.5028314882973568e-06, "loss": 0.6576103, "num_input_tokens_seen": 106274670, "step": 4928, "time_per_iteration": 3.3737740516662598 }, { "auxiliary_loss_clip": 0.01151784, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 1.05230355, "balance_loss_mlp": 1.02272153, "epoch": 0.5926772079600794, "flos": 22302788647680.0, "grad_norm": 2.645334717007061, "language_loss": 0.84577751, "learning_rate": 1.502077006083783e-06, "loss": 0.86760241, "num_input_tokens_seen": 106293330, "step": 4929, "time_per_iteration": 2.4817793369293213 }, { "auxiliary_loss_clip": 0.01167471, "auxiliary_loss_mlp": 0.00761328, "balance_loss_clip": 1.05173385, "balance_loss_mlp": 1.00029147, "epoch": 0.5927974508507184, "flos": 19865244827520.0, "grad_norm": 1.7331667546283522, "language_loss": 0.76869094, "learning_rate": 1.5013225994047315e-06, "loss": 0.78797901, "num_input_tokens_seen": 106310960, "step": 4930, "time_per_iteration": 2.4470012187957764 }, { "auxiliary_loss_clip": 0.01165454, "auxiliary_loss_mlp": 0.00761324, "balance_loss_clip": 1.05366147, "balance_loss_mlp": 1.00030732, "epoch": 0.5929176937413575, "flos": 15776743167360.0, "grad_norm": 1.5171864093309735, "language_loss": 0.80508769, "learning_rate": 1.5005682683746452e-06, "loss": 0.82435542, "num_input_tokens_seen": 106329475, "step": 4931, "time_per_iteration": 3.183997869491577 }, { "auxiliary_loss_clip": 0.01163788, "auxiliary_loss_mlp": 0.01027969, "balance_loss_clip": 1.05344498, "balance_loss_mlp": 1.02032518, "epoch": 0.5930379366319967, "flos": 17601472028160.0, "grad_norm": 2.2819889038742365, "language_loss": 0.72695696, "learning_rate": 1.4998140131079553e-06, "loss": 0.74887455, "num_input_tokens_seen": 106345565, "step": 4932, "time_per_iteration": 2.418487310409546 }, { "auxiliary_loss_clip": 0.01102791, "auxiliary_loss_mlp": 0.00761458, "balance_loss_clip": 1.04363, "balance_loss_mlp": 1.00026798, "epoch": 0.5931581795226357, "flos": 17704283731200.0, "grad_norm": 1.7946972982747897, "language_loss": 0.73921323, "learning_rate": 1.4990598337190821e-06, "loss": 0.75785571, "num_input_tokens_seen": 106361920, "step": 4933, "time_per_iteration": 2.531195640563965 }, { "auxiliary_loss_clip": 0.01175481, "auxiliary_loss_mlp": 0.00762408, "balance_loss_clip": 1.05142677, "balance_loss_mlp": 1.00027764, "epoch": 0.5932784224132748, "flos": 24280102483200.0, "grad_norm": 1.8443745540758933, "language_loss": 0.67920095, "learning_rate": 1.4983057303224338e-06, "loss": 0.69857979, "num_input_tokens_seen": 106381735, "step": 4934, "time_per_iteration": 2.4451987743377686 }, { "auxiliary_loss_clip": 0.01118944, "auxiliary_loss_mlp": 0.01029881, "balance_loss_clip": 1.04617429, "balance_loss_mlp": 1.0221796, "epoch": 0.5933986653039139, "flos": 22926700909440.0, "grad_norm": 1.6630106734573074, "language_loss": 0.87887239, "learning_rate": 1.4975517030324072e-06, "loss": 0.90036064, "num_input_tokens_seen": 106399745, "step": 4935, "time_per_iteration": 2.543041944503784 }, { "auxiliary_loss_clip": 0.01073839, "auxiliary_loss_mlp": 0.00752771, "balance_loss_clip": 1.01507378, "balance_loss_mlp": 0.9997347, "epoch": 0.593518908194553, "flos": 71121730256640.0, "grad_norm": 0.7858956376765502, "language_loss": 0.61843598, "learning_rate": 1.4967977519633882e-06, "loss": 0.63670206, "num_input_tokens_seen": 106457205, "step": 4936, "time_per_iteration": 3.118154525756836 }, { "auxiliary_loss_clip": 0.0113402, "auxiliary_loss_mlp": 0.01024581, "balance_loss_clip": 1.04871881, "balance_loss_mlp": 1.0169394, "epoch": 0.593639151085192, "flos": 20448649526400.0, "grad_norm": 1.9104499777932717, "language_loss": 0.78172088, "learning_rate": 1.4960438772297494e-06, "loss": 0.80330694, "num_input_tokens_seen": 106474250, "step": 4937, "time_per_iteration": 2.512204647064209 }, { "auxiliary_loss_clip": 0.01149633, "auxiliary_loss_mlp": 0.01027627, "balance_loss_clip": 1.04809988, "balance_loss_mlp": 1.02007508, "epoch": 0.5937593939758312, "flos": 30883428074880.0, "grad_norm": 2.2284603403364525, "language_loss": 0.73346782, "learning_rate": 1.495290078945855e-06, "loss": 0.75524038, "num_input_tokens_seen": 106494015, "step": 4938, "time_per_iteration": 2.5780420303344727 }, { "auxiliary_loss_clip": 0.01175281, "auxiliary_loss_mlp": 0.01029293, "balance_loss_clip": 1.05246115, "balance_loss_mlp": 1.02158952, "epoch": 0.5938796368664703, "flos": 36898069668480.0, "grad_norm": 2.3131837347137636, "language_loss": 0.73908448, "learning_rate": 1.4945363572260529e-06, "loss": 0.76113027, "num_input_tokens_seen": 106515010, "step": 4939, "time_per_iteration": 2.553023099899292 }, { "auxiliary_loss_clip": 0.01162091, "auxiliary_loss_mlp": 0.0102468, "balance_loss_clip": 1.04995251, "balance_loss_mlp": 1.01750398, "epoch": 0.5939998797571093, "flos": 23842926051840.0, "grad_norm": 2.280127823288347, "language_loss": 0.67816079, "learning_rate": 1.4937827121846845e-06, "loss": 0.70002848, "num_input_tokens_seen": 106535265, "step": 4940, "time_per_iteration": 2.52632474899292 }, { "auxiliary_loss_clip": 0.01133648, "auxiliary_loss_mlp": 0.01034691, "balance_loss_clip": 1.0529623, "balance_loss_mlp": 1.02721667, "epoch": 0.5941201226477485, "flos": 25191407462400.0, "grad_norm": 1.513079214174461, "language_loss": 0.73617417, "learning_rate": 1.4930291439360755e-06, "loss": 0.75785756, "num_input_tokens_seen": 106557830, "step": 4941, "time_per_iteration": 2.567063570022583 }, { "auxiliary_loss_clip": 0.0116539, "auxiliary_loss_mlp": 0.01029056, "balance_loss_clip": 1.05236495, "balance_loss_mlp": 1.02051449, "epoch": 0.5942403655383875, "flos": 22418996123520.0, "grad_norm": 1.7173236475614415, "language_loss": 0.79456139, "learning_rate": 1.4922756525945427e-06, "loss": 0.81650585, "num_input_tokens_seen": 106577140, "step": 4942, "time_per_iteration": 2.447849988937378 }, { "auxiliary_loss_clip": 0.01063626, "auxiliary_loss_mlp": 0.01001372, "balance_loss_clip": 1.01418185, "balance_loss_mlp": 1.0002749, "epoch": 0.5943606084290266, "flos": 67629310796160.0, "grad_norm": 1.176669132312266, "language_loss": 0.59675837, "learning_rate": 1.4915222382743894e-06, "loss": 0.61740828, "num_input_tokens_seen": 106635975, "step": 4943, "time_per_iteration": 3.1054978370666504 }, { "auxiliary_loss_clip": 0.01165791, "auxiliary_loss_mlp": 0.01027476, "balance_loss_clip": 1.05288613, "balance_loss_mlp": 1.01963186, "epoch": 0.5944808513196658, "flos": 18223157646720.0, "grad_norm": 2.511971399840203, "language_loss": 0.71873939, "learning_rate": 1.4907689010899085e-06, "loss": 0.74067211, "num_input_tokens_seen": 106653555, "step": 4944, "time_per_iteration": 2.431511640548706 }, { "auxiliary_loss_clip": 0.01148534, "auxiliary_loss_mlp": 0.01024146, "balance_loss_clip": 1.04858875, "balance_loss_mlp": 1.01636744, "epoch": 0.5946010942103048, "flos": 24790824011520.0, "grad_norm": 2.005001366906365, "language_loss": 0.6286431, "learning_rate": 1.4900156411553804e-06, "loss": 0.65036988, "num_input_tokens_seen": 106673385, "step": 4945, "time_per_iteration": 2.5283596515655518 }, { "auxiliary_loss_clip": 0.01153577, "auxiliary_loss_mlp": 0.01026671, "balance_loss_clip": 1.05174875, "balance_loss_mlp": 1.01916051, "epoch": 0.5947213371009439, "flos": 15231619388160.0, "grad_norm": 1.8912554429473638, "language_loss": 0.85471523, "learning_rate": 1.4892624585850739e-06, "loss": 0.87651777, "num_input_tokens_seen": 106691740, "step": 4946, "time_per_iteration": 2.4547576904296875 }, { "auxiliary_loss_clip": 0.01180647, "auxiliary_loss_mlp": 0.01029913, "balance_loss_clip": 1.05407166, "balance_loss_mlp": 1.02195621, "epoch": 0.594841579991583, "flos": 25848069949440.0, "grad_norm": 1.948354772301302, "language_loss": 0.79370081, "learning_rate": 1.4885093534932465e-06, "loss": 0.81580639, "num_input_tokens_seen": 106709705, "step": 4947, "time_per_iteration": 2.451784610748291 }, { "auxiliary_loss_clip": 0.01149525, "auxiliary_loss_mlp": 0.01027932, "balance_loss_clip": 1.05171728, "balance_loss_mlp": 1.01985598, "epoch": 0.5949618228822221, "flos": 23981109672960.0, "grad_norm": 2.1113095811427676, "language_loss": 0.71349955, "learning_rate": 1.4877563259941433e-06, "loss": 0.7352742, "num_input_tokens_seen": 106727560, "step": 4948, "time_per_iteration": 2.5894393920898438 }, { "auxiliary_loss_clip": 0.01169289, "auxiliary_loss_mlp": 0.0102833, "balance_loss_clip": 1.05242014, "balance_loss_mlp": 1.02004457, "epoch": 0.5950820657728612, "flos": 40547491476480.0, "grad_norm": 1.9119158894215622, "language_loss": 0.67768896, "learning_rate": 1.4870033762019988e-06, "loss": 0.69966519, "num_input_tokens_seen": 106747725, "step": 4949, "time_per_iteration": 3.4546549320220947 }, { "auxiliary_loss_clip": 0.01148257, "auxiliary_loss_mlp": 0.01028029, "balance_loss_clip": 1.05020618, "balance_loss_mlp": 1.01979768, "epoch": 0.5952023086635003, "flos": 23184467884800.0, "grad_norm": 1.8718595646818428, "language_loss": 0.73295224, "learning_rate": 1.4862505042310334e-06, "loss": 0.75471509, "num_input_tokens_seen": 106767010, "step": 4950, "time_per_iteration": 2.4868078231811523 }, { "auxiliary_loss_clip": 0.01140951, "auxiliary_loss_mlp": 0.01032313, "balance_loss_clip": 1.04924345, "balance_loss_mlp": 1.02516675, "epoch": 0.5953225515541394, "flos": 33653289548160.0, "grad_norm": 2.7747454963618057, "language_loss": 0.69862986, "learning_rate": 1.4854977101954587e-06, "loss": 0.72036242, "num_input_tokens_seen": 106789230, "step": 4951, "time_per_iteration": 2.602067470550537 }, { "auxiliary_loss_clip": 0.01163366, "auxiliary_loss_mlp": 0.01028298, "balance_loss_clip": 1.04802966, "balance_loss_mlp": 1.02034664, "epoch": 0.5954427944447784, "flos": 24459619680000.0, "grad_norm": 1.8566278373746716, "language_loss": 0.86410409, "learning_rate": 1.4847449942094716e-06, "loss": 0.88602066, "num_input_tokens_seen": 106808110, "step": 4952, "time_per_iteration": 2.5694382190704346 }, { "auxiliary_loss_clip": 0.01145011, "auxiliary_loss_mlp": 0.01026784, "balance_loss_clip": 1.0497334, "balance_loss_mlp": 1.01909447, "epoch": 0.5955630373354175, "flos": 18551848026240.0, "grad_norm": 2.1879984913400503, "language_loss": 0.8626225, "learning_rate": 1.4839923563872598e-06, "loss": 0.88434047, "num_input_tokens_seen": 106826650, "step": 4953, "time_per_iteration": 3.299710750579834 }, { "auxiliary_loss_clip": 0.01138145, "auxiliary_loss_mlp": 0.01028121, "balance_loss_clip": 1.05129373, "balance_loss_mlp": 1.01990759, "epoch": 0.5956832802260567, "flos": 19791699730560.0, "grad_norm": 2.0111740021682514, "language_loss": 0.76011419, "learning_rate": 1.483239796842997e-06, "loss": 0.78177691, "num_input_tokens_seen": 106844680, "step": 4954, "time_per_iteration": 2.4914398193359375 }, { "auxiliary_loss_clip": 0.01135083, "auxiliary_loss_mlp": 0.01028849, "balance_loss_clip": 1.04863405, "balance_loss_mlp": 1.02138078, "epoch": 0.5958035231166957, "flos": 19750868945280.0, "grad_norm": 2.391598636700139, "language_loss": 0.83933699, "learning_rate": 1.4824873156908462e-06, "loss": 0.86097634, "num_input_tokens_seen": 106862605, "step": 4955, "time_per_iteration": 3.3377203941345215 }, { "auxiliary_loss_clip": 0.01163082, "auxiliary_loss_mlp": 0.00763065, "balance_loss_clip": 1.0517801, "balance_loss_mlp": 1.0003221, "epoch": 0.5959237660073348, "flos": 21652806090240.0, "grad_norm": 1.6410390504623167, "language_loss": 0.7558533, "learning_rate": 1.4817349130449584e-06, "loss": 0.77511477, "num_input_tokens_seen": 106882325, "step": 4956, "time_per_iteration": 2.455779552459717 }, { "auxiliary_loss_clip": 0.01160479, "auxiliary_loss_mlp": 0.01024626, "balance_loss_clip": 1.05075622, "balance_loss_mlp": 1.01708627, "epoch": 0.5960440088979739, "flos": 21171207513600.0, "grad_norm": 1.7988731833587812, "language_loss": 0.83128268, "learning_rate": 1.4809825890194717e-06, "loss": 0.8531338, "num_input_tokens_seen": 106900995, "step": 4957, "time_per_iteration": 3.168745756149292 }, { "auxiliary_loss_clip": 0.01143143, "auxiliary_loss_mlp": 0.01023334, "balance_loss_clip": 1.0467366, "balance_loss_mlp": 1.01525807, "epoch": 0.596164251788613, "flos": 14757526753920.0, "grad_norm": 1.8143343458085588, "language_loss": 0.77339411, "learning_rate": 1.4802303437285139e-06, "loss": 0.79505885, "num_input_tokens_seen": 106918265, "step": 4958, "time_per_iteration": 2.451044797897339 }, { "auxiliary_loss_clip": 0.01145635, "auxiliary_loss_mlp": 0.01030751, "balance_loss_clip": 1.0466224, "balance_loss_mlp": 1.02278733, "epoch": 0.596284494679252, "flos": 20485924865280.0, "grad_norm": 2.2989540994973168, "language_loss": 0.8101123, "learning_rate": 1.4794781772861994e-06, "loss": 0.83187616, "num_input_tokens_seen": 106934760, "step": 4959, "time_per_iteration": 2.4854226112365723 }, { "auxiliary_loss_clip": 0.01146987, "auxiliary_loss_mlp": 0.00762551, "balance_loss_clip": 1.04849386, "balance_loss_mlp": 1.00034475, "epoch": 0.5964047375698912, "flos": 31212262108800.0, "grad_norm": 2.0492302027120926, "language_loss": 0.66953945, "learning_rate": 1.4787260898066324e-06, "loss": 0.68863487, "num_input_tokens_seen": 106954760, "step": 4960, "time_per_iteration": 2.557249069213867 }, { "auxiliary_loss_clip": 0.01176359, "auxiliary_loss_mlp": 0.0102893, "balance_loss_clip": 1.05356097, "balance_loss_mlp": 1.02102649, "epoch": 0.5965249804605303, "flos": 27483620855040.0, "grad_norm": 1.9676453578294528, "language_loss": 0.85881627, "learning_rate": 1.4779740814039023e-06, "loss": 0.88086915, "num_input_tokens_seen": 106974845, "step": 4961, "time_per_iteration": 2.4634146690368652 }, { "auxiliary_loss_clip": 0.01175868, "auxiliary_loss_mlp": 0.0102955, "balance_loss_clip": 1.05139518, "balance_loss_mlp": 1.02110434, "epoch": 0.5966452233511693, "flos": 30773936442240.0, "grad_norm": 2.139889861102823, "language_loss": 0.68651414, "learning_rate": 1.4772221521920894e-06, "loss": 0.70856833, "num_input_tokens_seen": 106994870, "step": 4962, "time_per_iteration": 2.473191976547241 }, { "auxiliary_loss_clip": 0.01152617, "auxiliary_loss_mlp": 0.01025369, "balance_loss_clip": 1.05465364, "balance_loss_mlp": 1.01762605, "epoch": 0.5967654662418085, "flos": 25481170477440.0, "grad_norm": 1.9149100296852453, "language_loss": 0.74028611, "learning_rate": 1.4764703022852598e-06, "loss": 0.76206601, "num_input_tokens_seen": 107015390, "step": 4963, "time_per_iteration": 2.5281455516815186 }, { "auxiliary_loss_clip": 0.01096019, "auxiliary_loss_mlp": 0.01023436, "balance_loss_clip": 1.04384995, "balance_loss_mlp": 1.01601553, "epoch": 0.5968857091324475, "flos": 19099126621440.0, "grad_norm": 1.839949121094876, "language_loss": 0.77111888, "learning_rate": 1.4757185317974696e-06, "loss": 0.79231346, "num_input_tokens_seen": 107033775, "step": 4964, "time_per_iteration": 2.5622169971466064 }, { "auxiliary_loss_clip": 0.01164558, "auxiliary_loss_mlp": 0.01028244, "balance_loss_clip": 1.05113244, "balance_loss_mlp": 1.02014971, "epoch": 0.5970059520230866, "flos": 23692711374720.0, "grad_norm": 2.331835218485058, "language_loss": 0.70733988, "learning_rate": 1.474966840842761e-06, "loss": 0.72926784, "num_input_tokens_seen": 107053355, "step": 4965, "time_per_iteration": 2.462054491043091 }, { "auxiliary_loss_clip": 0.01167546, "auxiliary_loss_mlp": 0.01027751, "balance_loss_clip": 1.0524137, "balance_loss_mlp": 1.01996338, "epoch": 0.5971261949137258, "flos": 23185545292800.0, "grad_norm": 1.7218439076553258, "language_loss": 0.86805224, "learning_rate": 1.4742152295351655e-06, "loss": 0.89000523, "num_input_tokens_seen": 107072510, "step": 4966, "time_per_iteration": 2.4483320713043213 }, { "auxiliary_loss_clip": 0.01163634, "auxiliary_loss_mlp": 0.00762949, "balance_loss_clip": 1.05017376, "balance_loss_mlp": 1.00041175, "epoch": 0.5972464378043648, "flos": 20557710195840.0, "grad_norm": 2.4383486074991283, "language_loss": 0.64000165, "learning_rate": 1.4734636979887016e-06, "loss": 0.65926743, "num_input_tokens_seen": 107089970, "step": 4967, "time_per_iteration": 2.4462952613830566 }, { "auxiliary_loss_clip": 0.01138416, "auxiliary_loss_mlp": 0.01032135, "balance_loss_clip": 1.04806232, "balance_loss_mlp": 1.02404666, "epoch": 0.5973666806950039, "flos": 29387030457600.0, "grad_norm": 2.0295601572211126, "language_loss": 0.90170622, "learning_rate": 1.4727122463173755e-06, "loss": 0.92341173, "num_input_tokens_seen": 107108500, "step": 4968, "time_per_iteration": 2.5615103244781494 }, { "auxiliary_loss_clip": 0.0114958, "auxiliary_loss_mlp": 0.01025528, "balance_loss_clip": 1.05158031, "balance_loss_mlp": 1.01778495, "epoch": 0.597486923585643, "flos": 22273522041600.0, "grad_norm": 1.7394379454645812, "language_loss": 0.64354867, "learning_rate": 1.471960874635183e-06, "loss": 0.66529977, "num_input_tokens_seen": 107128060, "step": 4969, "time_per_iteration": 2.4700706005096436 }, { "auxiliary_loss_clip": 0.01145599, "auxiliary_loss_mlp": 0.01030358, "balance_loss_clip": 1.04819679, "balance_loss_mlp": 1.02221036, "epoch": 0.5976071664762821, "flos": 13772461196160.0, "grad_norm": 2.187790786830356, "language_loss": 0.70690691, "learning_rate": 1.4712095830561055e-06, "loss": 0.72866654, "num_input_tokens_seen": 107146550, "step": 4970, "time_per_iteration": 2.4548113346099854 }, { "auxiliary_loss_clip": 0.01147186, "auxiliary_loss_mlp": 0.01028012, "balance_loss_clip": 1.0471102, "balance_loss_mlp": 1.02043581, "epoch": 0.5977274093669211, "flos": 19098623831040.0, "grad_norm": 1.8106387547572447, "language_loss": 0.80656016, "learning_rate": 1.4704583716941147e-06, "loss": 0.8283121, "num_input_tokens_seen": 107165415, "step": 4971, "time_per_iteration": 2.4699997901916504 }, { "auxiliary_loss_clip": 0.01156526, "auxiliary_loss_mlp": 0.01033366, "balance_loss_clip": 1.05182981, "balance_loss_mlp": 1.02576685, "epoch": 0.5978476522575603, "flos": 20376002269440.0, "grad_norm": 2.1702936540877924, "language_loss": 0.72501111, "learning_rate": 1.4697072406631672e-06, "loss": 0.74691004, "num_input_tokens_seen": 107185320, "step": 4972, "time_per_iteration": 2.4371893405914307 }, { "auxiliary_loss_clip": 0.01125817, "auxiliary_loss_mlp": 0.01034691, "balance_loss_clip": 1.05018544, "balance_loss_mlp": 1.02584517, "epoch": 0.5979678951481994, "flos": 29023147728000.0, "grad_norm": 1.588791497033447, "language_loss": 0.72839105, "learning_rate": 1.4689561900772097e-06, "loss": 0.74999613, "num_input_tokens_seen": 107205380, "step": 4973, "time_per_iteration": 2.6014435291290283 }, { "auxiliary_loss_clip": 0.01146675, "auxiliary_loss_mlp": 0.01029572, "balance_loss_clip": 1.04700828, "balance_loss_mlp": 1.02213907, "epoch": 0.5980881380388384, "flos": 17967689141760.0, "grad_norm": 2.0716523326837435, "language_loss": 0.72641158, "learning_rate": 1.4682052200501758e-06, "loss": 0.74817407, "num_input_tokens_seen": 107222585, "step": 4974, "time_per_iteration": 2.4566664695739746 }, { "auxiliary_loss_clip": 0.01176278, "auxiliary_loss_mlp": 0.01027127, "balance_loss_clip": 1.05194044, "balance_loss_mlp": 1.01902044, "epoch": 0.5982083809294776, "flos": 22962827013120.0, "grad_norm": 1.8513017249803914, "language_loss": 0.79629666, "learning_rate": 1.4674543306959876e-06, "loss": 0.81833076, "num_input_tokens_seen": 107242055, "step": 4975, "time_per_iteration": 2.4262027740478516 }, { "auxiliary_loss_clip": 0.01156405, "auxiliary_loss_mlp": 0.01028812, "balance_loss_clip": 1.05264425, "balance_loss_mlp": 1.02031815, "epoch": 0.5983286238201166, "flos": 20991941712000.0, "grad_norm": 2.3764702166331664, "language_loss": 0.84969616, "learning_rate": 1.4667035221285535e-06, "loss": 0.87154835, "num_input_tokens_seen": 107259695, "step": 4976, "time_per_iteration": 3.314951181411743 }, { "auxiliary_loss_clip": 0.01161551, "auxiliary_loss_mlp": 0.01027003, "balance_loss_clip": 1.05235291, "balance_loss_mlp": 1.0192008, "epoch": 0.5984488667107557, "flos": 28183448511360.0, "grad_norm": 1.9596747127444862, "language_loss": 0.74235994, "learning_rate": 1.4659527944617715e-06, "loss": 0.76424545, "num_input_tokens_seen": 107279640, "step": 4977, "time_per_iteration": 2.49161958694458 }, { "auxiliary_loss_clip": 0.01101519, "auxiliary_loss_mlp": 0.01026815, "balance_loss_clip": 1.04230893, "balance_loss_mlp": 1.018929, "epoch": 0.5985691096013949, "flos": 16471794314880.0, "grad_norm": 1.8085265572665044, "language_loss": 0.75897062, "learning_rate": 1.465202147809526e-06, "loss": 0.78025389, "num_input_tokens_seen": 107298135, "step": 4978, "time_per_iteration": 2.5493552684783936 }, { "auxiliary_loss_clip": 0.01177732, "auxiliary_loss_mlp": 0.01026295, "balance_loss_clip": 1.05335546, "balance_loss_mlp": 1.01910388, "epoch": 0.5986893524920339, "flos": 26719046933760.0, "grad_norm": 2.0609474560671837, "language_loss": 0.76020217, "learning_rate": 1.4644515822856888e-06, "loss": 0.78224242, "num_input_tokens_seen": 107316570, "step": 4979, "time_per_iteration": 3.2906670570373535 }, { "auxiliary_loss_clip": 0.01042761, "auxiliary_loss_mlp": 0.01002341, "balance_loss_clip": 1.01489282, "balance_loss_mlp": 1.00126767, "epoch": 0.598809595382673, "flos": 61608061100160.0, "grad_norm": 0.9209046664704985, "language_loss": 0.5651983, "learning_rate": 1.4637010980041215e-06, "loss": 0.58564937, "num_input_tokens_seen": 107378680, "step": 4980, "time_per_iteration": 3.1087265014648438 }, { "auxiliary_loss_clip": 0.01178441, "auxiliary_loss_mlp": 0.01029064, "balance_loss_clip": 1.05274665, "balance_loss_mlp": 1.02085042, "epoch": 0.5989298382733121, "flos": 11801719549440.0, "grad_norm": 2.3279182921558594, "language_loss": 0.89423263, "learning_rate": 1.4629506950786707e-06, "loss": 0.91630769, "num_input_tokens_seen": 107394860, "step": 4981, "time_per_iteration": 3.2508251667022705 }, { "auxiliary_loss_clip": 0.01073633, "auxiliary_loss_mlp": 0.01000602, "balance_loss_clip": 1.01515174, "balance_loss_mlp": 0.99955332, "epoch": 0.5990500811639512, "flos": 60025800021120.0, "grad_norm": 0.8080336682060275, "language_loss": 0.56101525, "learning_rate": 1.4622003736231733e-06, "loss": 0.58175755, "num_input_tokens_seen": 107453850, "step": 4982, "time_per_iteration": 3.0760984420776367 }, { "auxiliary_loss_clip": 0.01163708, "auxiliary_loss_mlp": 0.01027783, "balance_loss_clip": 1.05302715, "balance_loss_mlp": 1.01946187, "epoch": 0.5991703240545903, "flos": 18222726683520.0, "grad_norm": 2.3769213074901163, "language_loss": 0.804515, "learning_rate": 1.461450133751451e-06, "loss": 0.8264299, "num_input_tokens_seen": 107471920, "step": 4983, "time_per_iteration": 2.456864833831787 }, { "auxiliary_loss_clip": 0.01167607, "auxiliary_loss_mlp": 0.01029591, "balance_loss_clip": 1.05290306, "balance_loss_mlp": 1.02141893, "epoch": 0.5992905669452293, "flos": 27709894581120.0, "grad_norm": 1.8922090337392568, "language_loss": 0.76022387, "learning_rate": 1.4606999755773153e-06, "loss": 0.78219581, "num_input_tokens_seen": 107493125, "step": 4984, "time_per_iteration": 3.260646343231201 }, { "auxiliary_loss_clip": 0.01176605, "auxiliary_loss_mlp": 0.01028571, "balance_loss_clip": 1.05327678, "balance_loss_mlp": 1.02060187, "epoch": 0.5994108098358685, "flos": 20449008662400.0, "grad_norm": 1.97051652426096, "language_loss": 0.821702, "learning_rate": 1.4599498992145643e-06, "loss": 0.8437537, "num_input_tokens_seen": 107513150, "step": 4985, "time_per_iteration": 2.4697866439819336 }, { "auxiliary_loss_clip": 0.01155634, "auxiliary_loss_mlp": 0.00762361, "balance_loss_clip": 1.05148196, "balance_loss_mlp": 1.00047624, "epoch": 0.5995310527265075, "flos": 22269966595200.0, "grad_norm": 1.8964918959056367, "language_loss": 0.70976949, "learning_rate": 1.4591999047769846e-06, "loss": 0.72894943, "num_input_tokens_seen": 107532005, "step": 4986, "time_per_iteration": 2.499343156814575 }, { "auxiliary_loss_clip": 0.01102257, "auxiliary_loss_mlp": 0.0103256, "balance_loss_clip": 1.04158103, "balance_loss_mlp": 1.0239768, "epoch": 0.5996512956171466, "flos": 18916951818240.0, "grad_norm": 1.6569863288346862, "language_loss": 0.75477016, "learning_rate": 1.4584499923783486e-06, "loss": 0.77611834, "num_input_tokens_seen": 107550585, "step": 4987, "time_per_iteration": 2.5597124099731445 }, { "auxiliary_loss_clip": 0.01146626, "auxiliary_loss_mlp": 0.01022737, "balance_loss_clip": 1.04883158, "balance_loss_mlp": 1.01527464, "epoch": 0.5997715385077858, "flos": 15370916330880.0, "grad_norm": 1.7460331297603642, "language_loss": 0.76018357, "learning_rate": 1.457700162132419e-06, "loss": 0.78187728, "num_input_tokens_seen": 107567575, "step": 4988, "time_per_iteration": 2.455214262008667 }, { "auxiliary_loss_clip": 0.01116324, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 1.04639053, "balance_loss_mlp": 1.01710546, "epoch": 0.5998917813984248, "flos": 25264844818560.0, "grad_norm": 2.5868413614548573, "language_loss": 0.72516936, "learning_rate": 1.4569504141529433e-06, "loss": 0.74657941, "num_input_tokens_seen": 107585410, "step": 4989, "time_per_iteration": 2.582700252532959 }, { "auxiliary_loss_clip": 0.01165248, "auxiliary_loss_mlp": 0.01032739, "balance_loss_clip": 1.05479109, "balance_loss_mlp": 1.02441216, "epoch": 0.6000120242890639, "flos": 22054502862720.0, "grad_norm": 1.9744354764354737, "language_loss": 0.72019613, "learning_rate": 1.456200748553658e-06, "loss": 0.74217606, "num_input_tokens_seen": 107603405, "step": 4990, "time_per_iteration": 2.449049472808838 }, { "auxiliary_loss_clip": 0.01179614, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.0533433, "balance_loss_mlp": 1.02327228, "epoch": 0.600132267179703, "flos": 29863421562240.0, "grad_norm": 1.774715116444949, "language_loss": 0.78607428, "learning_rate": 1.455451165448287e-06, "loss": 0.80818862, "num_input_tokens_seen": 107626060, "step": 4991, "time_per_iteration": 2.491271734237671 }, { "auxiliary_loss_clip": 0.01146083, "auxiliary_loss_mlp": 0.01027813, "balance_loss_clip": 1.05040836, "balance_loss_mlp": 1.01975465, "epoch": 0.6002525100703421, "flos": 25045358762880.0, "grad_norm": 2.1315986861374623, "language_loss": 0.73357773, "learning_rate": 1.4547016649505407e-06, "loss": 0.75531662, "num_input_tokens_seen": 107644070, "step": 4992, "time_per_iteration": 2.5270144939422607 }, { "auxiliary_loss_clip": 0.01133068, "auxiliary_loss_mlp": 0.01023193, "balance_loss_clip": 1.04659677, "balance_loss_mlp": 1.01486278, "epoch": 0.6003727529609811, "flos": 20849592113280.0, "grad_norm": 2.0925473028567425, "language_loss": 0.84769738, "learning_rate": 1.4539522471741193e-06, "loss": 0.86925995, "num_input_tokens_seen": 107661495, "step": 4993, "time_per_iteration": 2.575850248336792 }, { "auxiliary_loss_clip": 0.01165259, "auxiliary_loss_mlp": 0.01029749, "balance_loss_clip": 1.04939103, "balance_loss_mlp": 1.02135623, "epoch": 0.6004929958516203, "flos": 15594604277760.0, "grad_norm": 2.0274479076688436, "language_loss": 0.71188104, "learning_rate": 1.4532029122327067e-06, "loss": 0.73383111, "num_input_tokens_seen": 107678280, "step": 4994, "time_per_iteration": 2.423466682434082 }, { "auxiliary_loss_clip": 0.01132062, "auxiliary_loss_mlp": 0.01029756, "balance_loss_clip": 1.05140328, "balance_loss_mlp": 1.02209139, "epoch": 0.6006132387422594, "flos": 21763267390080.0, "grad_norm": 1.9483110586840606, "language_loss": 0.75176615, "learning_rate": 1.4524536602399783e-06, "loss": 0.77338433, "num_input_tokens_seen": 107697370, "step": 4995, "time_per_iteration": 2.5061392784118652 }, { "auxiliary_loss_clip": 0.01143593, "auxiliary_loss_mlp": 0.01033968, "balance_loss_clip": 1.04965007, "balance_loss_mlp": 1.02633905, "epoch": 0.6007334816328984, "flos": 22858542852480.0, "grad_norm": 1.575291398430298, "language_loss": 0.77695823, "learning_rate": 1.4517044913095938e-06, "loss": 0.79873389, "num_input_tokens_seen": 107717790, "step": 4996, "time_per_iteration": 2.5041141510009766 }, { "auxiliary_loss_clip": 0.01163615, "auxiliary_loss_mlp": 0.01026292, "balance_loss_clip": 1.05148876, "balance_loss_mlp": 1.0180068, "epoch": 0.6008537245235376, "flos": 28324577047680.0, "grad_norm": 2.175484317456789, "language_loss": 0.81437993, "learning_rate": 1.4509554055552022e-06, "loss": 0.83627903, "num_input_tokens_seen": 107738020, "step": 4997, "time_per_iteration": 2.5001513957977295 }, { "auxiliary_loss_clip": 0.0114588, "auxiliary_loss_mlp": 0.01029076, "balance_loss_clip": 1.04908776, "balance_loss_mlp": 1.02102661, "epoch": 0.6009739674141766, "flos": 20886113266560.0, "grad_norm": 3.6005132846392973, "language_loss": 0.83950353, "learning_rate": 1.450206403090439e-06, "loss": 0.86125308, "num_input_tokens_seen": 107756215, "step": 4998, "time_per_iteration": 2.4637374877929688 }, { "auxiliary_loss_clip": 0.01163042, "auxiliary_loss_mlp": 0.01024263, "balance_loss_clip": 1.05340576, "balance_loss_mlp": 1.01642537, "epoch": 0.6010942103048157, "flos": 20481004702080.0, "grad_norm": 2.753614681859002, "language_loss": 0.86094439, "learning_rate": 1.4494574840289274e-06, "loss": 0.88281751, "num_input_tokens_seen": 107773330, "step": 4999, "time_per_iteration": 2.4368371963500977 }, { "auxiliary_loss_clip": 0.01166374, "auxiliary_loss_mlp": 0.01033472, "balance_loss_clip": 1.04971433, "balance_loss_mlp": 1.02477598, "epoch": 0.6012144531954549, "flos": 23805973935360.0, "grad_norm": 5.324664305094547, "language_loss": 0.73814762, "learning_rate": 1.4487086484842782e-06, "loss": 0.76014602, "num_input_tokens_seen": 107791975, "step": 5000, "time_per_iteration": 2.4726297855377197 }, { "auxiliary_loss_clip": 0.01171895, "auxiliary_loss_mlp": 0.0102602, "balance_loss_clip": 1.04896939, "balance_loss_mlp": 1.01844716, "epoch": 0.6013346960860939, "flos": 18988378012800.0, "grad_norm": 2.252309271280477, "language_loss": 0.60172224, "learning_rate": 1.4479598965700878e-06, "loss": 0.62370133, "num_input_tokens_seen": 107809240, "step": 5001, "time_per_iteration": 2.3883323669433594 }, { "auxiliary_loss_clip": 0.01132745, "auxiliary_loss_mlp": 0.01027517, "balance_loss_clip": 1.04567719, "balance_loss_mlp": 1.01950622, "epoch": 0.601454938976733, "flos": 24025316336640.0, "grad_norm": 2.272489443577085, "language_loss": 0.68891734, "learning_rate": 1.4472112283999427e-06, "loss": 0.71051997, "num_input_tokens_seen": 107827895, "step": 5002, "time_per_iteration": 3.3464436531066895 }, { "auxiliary_loss_clip": 0.01160421, "auxiliary_loss_mlp": 0.01032569, "balance_loss_clip": 1.05323637, "balance_loss_mlp": 1.0249815, "epoch": 0.6015751818673721, "flos": 26427129102720.0, "grad_norm": 2.0363594406857852, "language_loss": 0.6939925, "learning_rate": 1.4464626440874143e-06, "loss": 0.71592242, "num_input_tokens_seen": 107847010, "step": 5003, "time_per_iteration": 2.494530439376831 }, { "auxiliary_loss_clip": 0.01126397, "auxiliary_loss_mlp": 0.01028353, "balance_loss_clip": 1.04351139, "balance_loss_mlp": 1.02006209, "epoch": 0.6016954247580112, "flos": 13115260005120.0, "grad_norm": 2.402330950953752, "language_loss": 0.74155611, "learning_rate": 1.4457141437460636e-06, "loss": 0.7631036, "num_input_tokens_seen": 107864235, "step": 5004, "time_per_iteration": 2.50895619392395 }, { "auxiliary_loss_clip": 0.01149278, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.04833245, "balance_loss_mlp": 1.02110839, "epoch": 0.6018156676486502, "flos": 23768447201280.0, "grad_norm": 1.7299193761026348, "language_loss": 0.73381329, "learning_rate": 1.444965727489436e-06, "loss": 0.7556017, "num_input_tokens_seen": 107883680, "step": 5005, "time_per_iteration": 3.347539186477661 }, { "auxiliary_loss_clip": 0.01130165, "auxiliary_loss_mlp": 0.01029062, "balance_loss_clip": 1.04347682, "balance_loss_mlp": 1.02115822, "epoch": 0.6019359105392894, "flos": 26469360518400.0, "grad_norm": 9.282858128044536, "language_loss": 0.63305753, "learning_rate": 1.444217395431066e-06, "loss": 0.65464979, "num_input_tokens_seen": 107906220, "step": 5006, "time_per_iteration": 2.5798964500427246 }, { "auxiliary_loss_clip": 0.01038194, "auxiliary_loss_mlp": 0.01005458, "balance_loss_clip": 1.01464319, "balance_loss_mlp": 1.00442088, "epoch": 0.6020561534299285, "flos": 69190849728000.0, "grad_norm": 0.8039692586969128, "language_loss": 0.55863523, "learning_rate": 1.4434691476844755e-06, "loss": 0.57907176, "num_input_tokens_seen": 107967195, "step": 5007, "time_per_iteration": 3.0605690479278564 }, { "auxiliary_loss_clip": 0.01143008, "auxiliary_loss_mlp": 0.01026873, "balance_loss_clip": 1.04916382, "balance_loss_mlp": 1.0197922, "epoch": 0.6021763963205675, "flos": 21835304115840.0, "grad_norm": 2.123492861379607, "language_loss": 0.6677469, "learning_rate": 1.4427209843631729e-06, "loss": 0.68944573, "num_input_tokens_seen": 107984245, "step": 5008, "time_per_iteration": 3.344230890274048 }, { "auxiliary_loss_clip": 0.01175014, "auxiliary_loss_mlp": 0.00762186, "balance_loss_clip": 1.05259132, "balance_loss_mlp": 1.0005331, "epoch": 0.6022966392112067, "flos": 26578636669440.0, "grad_norm": 1.83676117665767, "language_loss": 0.81273806, "learning_rate": 1.4419729055806534e-06, "loss": 0.83211005, "num_input_tokens_seen": 108003680, "step": 5009, "time_per_iteration": 2.489734172821045 }, { "auxiliary_loss_clip": 0.01144772, "auxiliary_loss_mlp": 0.00762106, "balance_loss_clip": 1.05130374, "balance_loss_mlp": 1.0005089, "epoch": 0.6024168821018457, "flos": 20703722981760.0, "grad_norm": 2.1334655888156724, "language_loss": 0.82075316, "learning_rate": 1.441224911450401e-06, "loss": 0.83982193, "num_input_tokens_seen": 108019635, "step": 5010, "time_per_iteration": 3.2493977546691895 }, { "auxiliary_loss_clip": 0.01166597, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.05022347, "balance_loss_mlp": 1.02731037, "epoch": 0.6025371249924848, "flos": 24680973242880.0, "grad_norm": 2.7722133494753143, "language_loss": 0.82099724, "learning_rate": 1.4404770020858851e-06, "loss": 0.84301722, "num_input_tokens_seen": 108039120, "step": 5011, "time_per_iteration": 2.47942852973938 }, { "auxiliary_loss_clip": 0.0115398, "auxiliary_loss_mlp": 0.01027947, "balance_loss_clip": 1.0478797, "balance_loss_mlp": 1.02033556, "epoch": 0.602657367883124, "flos": 25955801815680.0, "grad_norm": 1.59993587240379, "language_loss": 0.85915565, "learning_rate": 1.439729177600563e-06, "loss": 0.88097489, "num_input_tokens_seen": 108059615, "step": 5012, "time_per_iteration": 2.4997735023498535 }, { "auxiliary_loss_clip": 0.01137371, "auxiliary_loss_mlp": 0.01026793, "balance_loss_clip": 1.04735422, "balance_loss_mlp": 1.01887453, "epoch": 0.602777610773763, "flos": 16690633925760.0, "grad_norm": 2.0473062333553855, "language_loss": 0.73011851, "learning_rate": 1.4389814381078793e-06, "loss": 0.75176024, "num_input_tokens_seen": 108078855, "step": 5013, "time_per_iteration": 2.539184331893921 }, { "auxiliary_loss_clip": 0.01087009, "auxiliary_loss_mlp": 0.01034419, "balance_loss_clip": 1.0457232, "balance_loss_mlp": 1.02707553, "epoch": 0.6028978536644021, "flos": 13334243270400.0, "grad_norm": 3.7071607608245536, "language_loss": 0.8010776, "learning_rate": 1.438233783721265e-06, "loss": 0.82229185, "num_input_tokens_seen": 108095020, "step": 5014, "time_per_iteration": 2.580188035964966 }, { "auxiliary_loss_clip": 0.01148489, "auxiliary_loss_mlp": 0.01026996, "balance_loss_clip": 1.05526173, "balance_loss_mlp": 1.01934278, "epoch": 0.6030180965550412, "flos": 19644825018240.0, "grad_norm": 1.9904394804667211, "language_loss": 0.77557641, "learning_rate": 1.43748621455414e-06, "loss": 0.79733133, "num_input_tokens_seen": 108111455, "step": 5015, "time_per_iteration": 2.4913110733032227 }, { "auxiliary_loss_clip": 0.01144282, "auxiliary_loss_mlp": 0.01029333, "balance_loss_clip": 1.04798412, "balance_loss_mlp": 1.02138174, "epoch": 0.6031383394456803, "flos": 14458390289280.0, "grad_norm": 2.5235556294668817, "language_loss": 0.80881631, "learning_rate": 1.4367387307199082e-06, "loss": 0.83055246, "num_input_tokens_seen": 108128305, "step": 5016, "time_per_iteration": 2.4549930095672607 }, { "auxiliary_loss_clip": 0.01156434, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 1.0478915, "balance_loss_mlp": 1.02269471, "epoch": 0.6032585823363193, "flos": 13917791623680.0, "grad_norm": 2.02468675669203, "language_loss": 0.82572615, "learning_rate": 1.4359913323319632e-06, "loss": 0.84759456, "num_input_tokens_seen": 108145475, "step": 5017, "time_per_iteration": 2.4183881282806396 }, { "auxiliary_loss_clip": 0.01092142, "auxiliary_loss_mlp": 0.0102733, "balance_loss_clip": 1.04136455, "balance_loss_mlp": 1.01943231, "epoch": 0.6033788252269584, "flos": 24353252530560.0, "grad_norm": 1.5656940464598688, "language_loss": 0.77422225, "learning_rate": 1.4352440195036847e-06, "loss": 0.79541701, "num_input_tokens_seen": 108165650, "step": 5018, "time_per_iteration": 2.7103166580200195 }, { "auxiliary_loss_clip": 0.01096191, "auxiliary_loss_mlp": 0.01024006, "balance_loss_clip": 1.04015517, "balance_loss_mlp": 1.01623929, "epoch": 0.6034990681175976, "flos": 25521247077120.0, "grad_norm": 2.2470337703071093, "language_loss": 0.79701352, "learning_rate": 1.4344967923484395e-06, "loss": 0.81821549, "num_input_tokens_seen": 108187620, "step": 5019, "time_per_iteration": 2.645002603530884 }, { "auxiliary_loss_clip": 0.01157419, "auxiliary_loss_mlp": 0.01028951, "balance_loss_clip": 1.0487802, "balance_loss_mlp": 1.02129126, "epoch": 0.6036193110082366, "flos": 25958387594880.0, "grad_norm": 2.1305302587415147, "language_loss": 0.72435927, "learning_rate": 1.433749650979581e-06, "loss": 0.74622297, "num_input_tokens_seen": 108207605, "step": 5020, "time_per_iteration": 2.5045955181121826 }, { "auxiliary_loss_clip": 0.01137612, "auxiliary_loss_mlp": 0.01027896, "balance_loss_clip": 1.04668975, "balance_loss_mlp": 1.02015305, "epoch": 0.6037395538988757, "flos": 25593427457280.0, "grad_norm": 1.7935822766322709, "language_loss": 0.67929846, "learning_rate": 1.433002595510451e-06, "loss": 0.70095348, "num_input_tokens_seen": 108226385, "step": 5021, "time_per_iteration": 2.550509452819824 }, { "auxiliary_loss_clip": 0.01142626, "auxiliary_loss_mlp": 0.00763046, "balance_loss_clip": 1.04645419, "balance_loss_mlp": 1.00054348, "epoch": 0.6038597967895148, "flos": 17816253402240.0, "grad_norm": 1.8841670330695506, "language_loss": 0.72065091, "learning_rate": 1.4322556260543757e-06, "loss": 0.73970765, "num_input_tokens_seen": 108242960, "step": 5022, "time_per_iteration": 2.4696199893951416 }, { "auxiliary_loss_clip": 0.01042647, "auxiliary_loss_mlp": 0.01004339, "balance_loss_clip": 1.01380968, "balance_loss_mlp": 1.00323629, "epoch": 0.6039800396801539, "flos": 65169213235200.0, "grad_norm": 0.9004752528451603, "language_loss": 0.62714118, "learning_rate": 1.4315087427246703e-06, "loss": 0.64761102, "num_input_tokens_seen": 108296785, "step": 5023, "time_per_iteration": 2.988929271697998 }, { "auxiliary_loss_clip": 0.01074203, "auxiliary_loss_mlp": 0.01001746, "balance_loss_clip": 1.01607776, "balance_loss_mlp": 1.00070286, "epoch": 0.604100282570793, "flos": 67386409073280.0, "grad_norm": 0.8659566953079477, "language_loss": 0.58467013, "learning_rate": 1.4307619456346372e-06, "loss": 0.60542959, "num_input_tokens_seen": 108341090, "step": 5024, "time_per_iteration": 2.72853422164917 }, { "auxiliary_loss_clip": 0.01162675, "auxiliary_loss_mlp": 0.01027537, "balance_loss_clip": 1.04793739, "balance_loss_mlp": 1.01937151, "epoch": 0.6042205254614321, "flos": 35297495631360.0, "grad_norm": 2.4484646796305656, "language_loss": 0.74530387, "learning_rate": 1.430015234897564e-06, "loss": 0.76720595, "num_input_tokens_seen": 108364370, "step": 5025, "time_per_iteration": 2.5868020057678223 }, { "auxiliary_loss_clip": 0.01175461, "auxiliary_loss_mlp": 0.00762733, "balance_loss_clip": 1.05122185, "balance_loss_mlp": 1.00058103, "epoch": 0.6043407683520712, "flos": 45658262206080.0, "grad_norm": 1.9333595655394096, "language_loss": 0.66328311, "learning_rate": 1.4292686106267274e-06, "loss": 0.68266499, "num_input_tokens_seen": 108387220, "step": 5026, "time_per_iteration": 2.66455340385437 }, { "auxiliary_loss_clip": 0.01165658, "auxiliary_loss_mlp": 0.01031724, "balance_loss_clip": 1.050825, "balance_loss_mlp": 1.02364135, "epoch": 0.6044610112427102, "flos": 16180020138240.0, "grad_norm": 1.893756904251137, "language_loss": 0.77115452, "learning_rate": 1.4285220729353876e-06, "loss": 0.79312837, "num_input_tokens_seen": 108405760, "step": 5027, "time_per_iteration": 2.4467709064483643 }, { "auxiliary_loss_clip": 0.01143584, "auxiliary_loss_mlp": 0.01026576, "balance_loss_clip": 1.04542017, "balance_loss_mlp": 1.01868415, "epoch": 0.6045812541333494, "flos": 13804062186240.0, "grad_norm": 3.590940819406141, "language_loss": 0.78105903, "learning_rate": 1.4277756219367957e-06, "loss": 0.8027606, "num_input_tokens_seen": 108422785, "step": 5028, "time_per_iteration": 2.458397626876831 }, { "auxiliary_loss_clip": 0.01141256, "auxiliary_loss_mlp": 0.01025115, "balance_loss_clip": 1.04836535, "balance_loss_mlp": 1.01712227, "epoch": 0.6047014970239885, "flos": 19975059682560.0, "grad_norm": 2.1083199103529418, "language_loss": 0.79953861, "learning_rate": 1.4270292577441864e-06, "loss": 0.82120228, "num_input_tokens_seen": 108442290, "step": 5029, "time_per_iteration": 3.335294008255005 }, { "auxiliary_loss_clip": 0.01163883, "auxiliary_loss_mlp": 0.01026311, "balance_loss_clip": 1.04761863, "balance_loss_mlp": 1.01827013, "epoch": 0.6048217399146275, "flos": 25337097025920.0, "grad_norm": 1.554436458152638, "language_loss": 0.71815109, "learning_rate": 1.4262829804707836e-06, "loss": 0.74005306, "num_input_tokens_seen": 108464280, "step": 5030, "time_per_iteration": 2.478785514831543 }, { "auxiliary_loss_clip": 0.01162765, "auxiliary_loss_mlp": 0.010281, "balance_loss_clip": 1.04754078, "balance_loss_mlp": 1.01990485, "epoch": 0.6049419828052667, "flos": 26030819370240.0, "grad_norm": 1.5380958489320469, "language_loss": 0.69692987, "learning_rate": 1.4255367902297958e-06, "loss": 0.71883845, "num_input_tokens_seen": 108485610, "step": 5031, "time_per_iteration": 2.4898414611816406 }, { "auxiliary_loss_clip": 0.01172529, "auxiliary_loss_mlp": 0.01028998, "balance_loss_clip": 1.05104756, "balance_loss_mlp": 1.0216012, "epoch": 0.6050622256959057, "flos": 14648106948480.0, "grad_norm": 2.0205071211404957, "language_loss": 0.78258532, "learning_rate": 1.4247906871344215e-06, "loss": 0.8046006, "num_input_tokens_seen": 108501005, "step": 5032, "time_per_iteration": 3.2430365085601807 }, { "auxiliary_loss_clip": 0.01138409, "auxiliary_loss_mlp": 0.01021865, "balance_loss_clip": 1.04324579, "balance_loss_mlp": 1.01417613, "epoch": 0.6051824685865448, "flos": 23331450337920.0, "grad_norm": 1.9961447986959235, "language_loss": 0.75440133, "learning_rate": 1.4240446712978415e-06, "loss": 0.77600408, "num_input_tokens_seen": 108519990, "step": 5033, "time_per_iteration": 2.5093581676483154 }, { "auxiliary_loss_clip": 0.01166141, "auxiliary_loss_mlp": 0.01023443, "balance_loss_clip": 1.05149388, "balance_loss_mlp": 1.01497316, "epoch": 0.605302711477184, "flos": 27563307177600.0, "grad_norm": 1.944082708811842, "language_loss": 0.74390268, "learning_rate": 1.423298742833227e-06, "loss": 0.76579857, "num_input_tokens_seen": 108538650, "step": 5034, "time_per_iteration": 3.369309902191162 }, { "auxiliary_loss_clip": 0.011362, "auxiliary_loss_mlp": 0.01028501, "balance_loss_clip": 1.04437637, "balance_loss_mlp": 1.02083015, "epoch": 0.605422954367823, "flos": 15154698412800.0, "grad_norm": 1.9796431019349092, "language_loss": 0.71518171, "learning_rate": 1.4225529018537352e-06, "loss": 0.73682868, "num_input_tokens_seen": 108554155, "step": 5035, "time_per_iteration": 2.485997438430786 }, { "auxiliary_loss_clip": 0.01174463, "auxiliary_loss_mlp": 0.01027055, "balance_loss_clip": 1.05160427, "balance_loss_mlp": 1.0192349, "epoch": 0.6055431972584621, "flos": 27673912131840.0, "grad_norm": 1.5391614634909567, "language_loss": 0.77641916, "learning_rate": 1.4218071484725082e-06, "loss": 0.79843432, "num_input_tokens_seen": 108576275, "step": 5036, "time_per_iteration": 3.2165653705596924 }, { "auxiliary_loss_clip": 0.01143802, "auxiliary_loss_mlp": 0.01033921, "balance_loss_clip": 1.0498203, "balance_loss_mlp": 1.0262202, "epoch": 0.6056634401491012, "flos": 19387489006080.0, "grad_norm": 1.939184240632925, "language_loss": 0.76289034, "learning_rate": 1.4210614828026786e-06, "loss": 0.78466761, "num_input_tokens_seen": 108594125, "step": 5037, "time_per_iteration": 2.4745450019836426 }, { "auxiliary_loss_clip": 0.01173516, "auxiliary_loss_mlp": 0.01021558, "balance_loss_clip": 1.05024421, "balance_loss_mlp": 1.01408923, "epoch": 0.6057836830397403, "flos": 24789459294720.0, "grad_norm": 1.7997925081321913, "language_loss": 0.74395478, "learning_rate": 1.4203159049573605e-06, "loss": 0.7659055, "num_input_tokens_seen": 108615360, "step": 5038, "time_per_iteration": 2.4586880207061768 }, { "auxiliary_loss_clip": 0.01153919, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 1.0476861, "balance_loss_mlp": 1.02069545, "epoch": 0.6059039259303793, "flos": 20558248899840.0, "grad_norm": 2.0111601639072414, "language_loss": 0.86556637, "learning_rate": 1.4195704150496593e-06, "loss": 0.88739151, "num_input_tokens_seen": 108633075, "step": 5039, "time_per_iteration": 2.4744415283203125 }, { "auxiliary_loss_clip": 0.01146098, "auxiliary_loss_mlp": 0.01025455, "balance_loss_clip": 1.04911399, "balance_loss_mlp": 1.0177238, "epoch": 0.6060241688210185, "flos": 21069724613760.0, "grad_norm": 1.6540863857803458, "language_loss": 0.73883128, "learning_rate": 1.4188250131926639e-06, "loss": 0.7605468, "num_input_tokens_seen": 108651875, "step": 5040, "time_per_iteration": 2.480175018310547 }, { "auxiliary_loss_clip": 0.01148466, "auxiliary_loss_mlp": 0.01029292, "balance_loss_clip": 1.04756987, "balance_loss_mlp": 1.02094769, "epoch": 0.6061444117116576, "flos": 16361081619840.0, "grad_norm": 1.9325676538841574, "language_loss": 0.80827814, "learning_rate": 1.4180796994994525e-06, "loss": 0.83005571, "num_input_tokens_seen": 108669290, "step": 5041, "time_per_iteration": 2.4469096660614014 }, { "auxiliary_loss_clip": 0.01142959, "auxiliary_loss_mlp": 0.01021449, "balance_loss_clip": 1.04533744, "balance_loss_mlp": 1.01383805, "epoch": 0.6062646546022966, "flos": 21507296094720.0, "grad_norm": 1.7463168202069994, "language_loss": 0.72617656, "learning_rate": 1.4173344740830877e-06, "loss": 0.74782062, "num_input_tokens_seen": 108688420, "step": 5042, "time_per_iteration": 2.480771780014038 }, { "auxiliary_loss_clip": 0.01145253, "auxiliary_loss_mlp": 0.01031683, "balance_loss_clip": 1.05302942, "balance_loss_mlp": 1.0237143, "epoch": 0.6063848974929358, "flos": 38983151283840.0, "grad_norm": 1.9515909128149715, "language_loss": 0.70632088, "learning_rate": 1.4165893370566206e-06, "loss": 0.72809023, "num_input_tokens_seen": 108712175, "step": 5043, "time_per_iteration": 2.6273722648620605 }, { "auxiliary_loss_clip": 0.01154709, "auxiliary_loss_mlp": 0.01029925, "balance_loss_clip": 1.04581594, "balance_loss_mlp": 1.02180684, "epoch": 0.6065051403835748, "flos": 19646584784640.0, "grad_norm": 1.6974091260036241, "language_loss": 0.77468133, "learning_rate": 1.4158442885330865e-06, "loss": 0.79652762, "num_input_tokens_seen": 108730745, "step": 5044, "time_per_iteration": 2.4389431476593018 }, { "auxiliary_loss_clip": 0.01153892, "auxiliary_loss_mlp": 0.01031203, "balance_loss_clip": 1.04661655, "balance_loss_mlp": 1.02289975, "epoch": 0.6066253832742139, "flos": 23513086437120.0, "grad_norm": 1.9480080935240414, "language_loss": 0.78942889, "learning_rate": 1.4150993286255094e-06, "loss": 0.81127989, "num_input_tokens_seen": 108749995, "step": 5045, "time_per_iteration": 2.467611074447632 }, { "auxiliary_loss_clip": 0.0117245, "auxiliary_loss_mlp": 0.01025729, "balance_loss_clip": 1.04905248, "balance_loss_mlp": 1.01823974, "epoch": 0.6067456261648531, "flos": 19133708440320.0, "grad_norm": 2.471559180399533, "language_loss": 0.79588759, "learning_rate": 1.4143544574468993e-06, "loss": 0.81786942, "num_input_tokens_seen": 108768355, "step": 5046, "time_per_iteration": 2.4121909141540527 }, { "auxiliary_loss_clip": 0.01158391, "auxiliary_loss_mlp": 0.0102485, "balance_loss_clip": 1.04983783, "balance_loss_mlp": 1.01658869, "epoch": 0.6068658690554921, "flos": 20520614424960.0, "grad_norm": 1.722803665476089, "language_loss": 0.82395798, "learning_rate": 1.4136096751102523e-06, "loss": 0.84579033, "num_input_tokens_seen": 108786685, "step": 5047, "time_per_iteration": 2.447263479232788 }, { "auxiliary_loss_clip": 0.01149321, "auxiliary_loss_mlp": 0.0102625, "balance_loss_clip": 1.0499053, "balance_loss_mlp": 1.01854324, "epoch": 0.6069861119461312, "flos": 27374560185600.0, "grad_norm": 2.05435511508815, "language_loss": 0.8265512, "learning_rate": 1.4128649817285516e-06, "loss": 0.84830701, "num_input_tokens_seen": 108804820, "step": 5048, "time_per_iteration": 2.5240516662597656 }, { "auxiliary_loss_clip": 0.01149415, "auxiliary_loss_mlp": 0.01039071, "balance_loss_clip": 1.04676509, "balance_loss_mlp": 1.03107178, "epoch": 0.6071063548367702, "flos": 25626500904960.0, "grad_norm": 2.6059429650678747, "language_loss": 0.63015944, "learning_rate": 1.412120377414766e-06, "loss": 0.6520443, "num_input_tokens_seen": 108825010, "step": 5049, "time_per_iteration": 2.5339953899383545 }, { "auxiliary_loss_clip": 0.01176174, "auxiliary_loss_mlp": 0.01029646, "balance_loss_clip": 1.05325937, "balance_loss_mlp": 1.02182603, "epoch": 0.6072265977274094, "flos": 24460517520000.0, "grad_norm": 1.5043191827552223, "language_loss": 0.71129543, "learning_rate": 1.4113758622818522e-06, "loss": 0.73335361, "num_input_tokens_seen": 108845075, "step": 5050, "time_per_iteration": 2.453789710998535 }, { "auxiliary_loss_clip": 0.01151449, "auxiliary_loss_mlp": 0.00762229, "balance_loss_clip": 1.05008698, "balance_loss_mlp": 1.00051451, "epoch": 0.6073468406180484, "flos": 18149253413760.0, "grad_norm": 1.7414512563671498, "language_loss": 0.82665896, "learning_rate": 1.410631436442751e-06, "loss": 0.84579575, "num_input_tokens_seen": 108863870, "step": 5051, "time_per_iteration": 2.470334529876709 }, { "auxiliary_loss_clip": 0.011641, "auxiliary_loss_mlp": 0.01025167, "balance_loss_clip": 1.05019867, "balance_loss_mlp": 1.01738906, "epoch": 0.6074670835086875, "flos": 20697617669760.0, "grad_norm": 2.377530585480949, "language_loss": 0.86479461, "learning_rate": 1.4098871000103936e-06, "loss": 0.88668728, "num_input_tokens_seen": 108882470, "step": 5052, "time_per_iteration": 2.4396162033081055 }, { "auxiliary_loss_clip": 0.01145703, "auxiliary_loss_mlp": 0.01023482, "balance_loss_clip": 1.04607844, "balance_loss_mlp": 1.01618075, "epoch": 0.6075873263993267, "flos": 23769955572480.0, "grad_norm": 1.6721580359970072, "language_loss": 0.82650232, "learning_rate": 1.409142853097693e-06, "loss": 0.84819412, "num_input_tokens_seen": 108902710, "step": 5053, "time_per_iteration": 2.4990317821502686 }, { "auxiliary_loss_clip": 0.01147906, "auxiliary_loss_mlp": 0.01025782, "balance_loss_clip": 1.04817462, "balance_loss_mlp": 1.01827765, "epoch": 0.6077075692899657, "flos": 24454484035200.0, "grad_norm": 2.023040004575697, "language_loss": 0.79450166, "learning_rate": 1.408398695817553e-06, "loss": 0.81623852, "num_input_tokens_seen": 108919935, "step": 5054, "time_per_iteration": 2.5154056549072266 }, { "auxiliary_loss_clip": 0.01144591, "auxiliary_loss_mlp": 0.0103648, "balance_loss_clip": 1.04607213, "balance_loss_mlp": 1.02765226, "epoch": 0.6078278121806048, "flos": 27382102041600.0, "grad_norm": 1.5839432614312505, "language_loss": 0.70070207, "learning_rate": 1.4076546282828593e-06, "loss": 0.72251284, "num_input_tokens_seen": 108942790, "step": 5055, "time_per_iteration": 2.54927134513855 }, { "auxiliary_loss_clip": 0.0114762, "auxiliary_loss_mlp": 0.0102684, "balance_loss_clip": 1.04312754, "balance_loss_mlp": 1.01954412, "epoch": 0.6079480550712439, "flos": 38436447306240.0, "grad_norm": 2.9606739596897453, "language_loss": 0.65790534, "learning_rate": 1.4069106506064874e-06, "loss": 0.67964995, "num_input_tokens_seen": 108964215, "step": 5056, "time_per_iteration": 3.3528690338134766 }, { "auxiliary_loss_clip": 0.01141754, "auxiliary_loss_mlp": 0.01027874, "balance_loss_clip": 1.04757166, "balance_loss_mlp": 1.0200361, "epoch": 0.608068297961883, "flos": 25336271013120.0, "grad_norm": 2.5495524593936567, "language_loss": 0.78132248, "learning_rate": 1.4061667629012989e-06, "loss": 0.80301875, "num_input_tokens_seen": 108984885, "step": 5057, "time_per_iteration": 2.508213996887207 }, { "auxiliary_loss_clip": 0.01137808, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 1.04803705, "balance_loss_mlp": 1.01888633, "epoch": 0.608188540852522, "flos": 24202463235840.0, "grad_norm": 3.958121331552012, "language_loss": 0.83207607, "learning_rate": 1.40542296528014e-06, "loss": 0.85371894, "num_input_tokens_seen": 109004545, "step": 5058, "time_per_iteration": 2.5215721130371094 }, { "auxiliary_loss_clip": 0.01159327, "auxiliary_loss_mlp": 0.01032129, "balance_loss_clip": 1.04754245, "balance_loss_mlp": 1.02409744, "epoch": 0.6083087837431612, "flos": 21284146851840.0, "grad_norm": 2.121510698521352, "language_loss": 0.75962627, "learning_rate": 1.4046792578558452e-06, "loss": 0.78154087, "num_input_tokens_seen": 109022440, "step": 5059, "time_per_iteration": 3.316783905029297 }, { "auxiliary_loss_clip": 0.01141027, "auxiliary_loss_mlp": 0.0102892, "balance_loss_clip": 1.04591596, "balance_loss_mlp": 1.02095699, "epoch": 0.6084290266338003, "flos": 16471435178880.0, "grad_norm": 2.205578236021955, "language_loss": 0.76122385, "learning_rate": 1.4039356407412325e-06, "loss": 0.78292328, "num_input_tokens_seen": 109035680, "step": 5060, "time_per_iteration": 2.4452011585235596 }, { "auxiliary_loss_clip": 0.01066352, "auxiliary_loss_mlp": 0.0100103, "balance_loss_clip": 1.01757431, "balance_loss_mlp": 1.00011778, "epoch": 0.6085492695244393, "flos": 66443574931200.0, "grad_norm": 0.7881127065018196, "language_loss": 0.57123005, "learning_rate": 1.40319211404911e-06, "loss": 0.59190392, "num_input_tokens_seen": 109090680, "step": 5061, "time_per_iteration": 3.8978376388549805 }, { "auxiliary_loss_clip": 0.01174727, "auxiliary_loss_mlp": 0.01027046, "balance_loss_clip": 1.05089557, "balance_loss_mlp": 1.01908255, "epoch": 0.6086695124150785, "flos": 23618986709760.0, "grad_norm": 1.795289184050645, "language_loss": 0.9066298, "learning_rate": 1.4024486778922691e-06, "loss": 0.92864752, "num_input_tokens_seen": 109108995, "step": 5062, "time_per_iteration": 2.4421961307525635 }, { "auxiliary_loss_clip": 0.01149667, "auxiliary_loss_mlp": 0.01029942, "balance_loss_clip": 1.045959, "balance_loss_mlp": 1.02214026, "epoch": 0.6087897553057176, "flos": 20157054917760.0, "grad_norm": 1.8913959386274457, "language_loss": 0.77492893, "learning_rate": 1.4017053323834884e-06, "loss": 0.79672498, "num_input_tokens_seen": 109128825, "step": 5063, "time_per_iteration": 2.488363265991211 }, { "auxiliary_loss_clip": 0.01148366, "auxiliary_loss_mlp": 0.01025676, "balance_loss_clip": 1.04640567, "balance_loss_mlp": 1.01812387, "epoch": 0.6089099981963566, "flos": 25482535194240.0, "grad_norm": 1.933533855319585, "language_loss": 0.75814879, "learning_rate": 1.4009620776355333e-06, "loss": 0.77988923, "num_input_tokens_seen": 109150425, "step": 5064, "time_per_iteration": 3.2686140537261963 }, { "auxiliary_loss_clip": 0.01157879, "auxiliary_loss_mlp": 0.01022043, "balance_loss_clip": 1.04796982, "balance_loss_mlp": 1.01452076, "epoch": 0.6090302410869958, "flos": 25332895134720.0, "grad_norm": 1.763732955852719, "language_loss": 0.79360533, "learning_rate": 1.4002189137611553e-06, "loss": 0.81540453, "num_input_tokens_seen": 109169765, "step": 5065, "time_per_iteration": 2.4963266849517822 }, { "auxiliary_loss_clip": 0.01158074, "auxiliary_loss_mlp": 0.01024063, "balance_loss_clip": 1.04803658, "balance_loss_mlp": 1.01673794, "epoch": 0.6091504839776348, "flos": 23987358639360.0, "grad_norm": 1.6654030639640247, "language_loss": 0.69669372, "learning_rate": 1.3994758408730901e-06, "loss": 0.7185151, "num_input_tokens_seen": 109188950, "step": 5066, "time_per_iteration": 2.472099781036377 }, { "auxiliary_loss_clip": 0.01148095, "auxiliary_loss_mlp": 0.01024653, "balance_loss_clip": 1.0494163, "balance_loss_mlp": 1.0162847, "epoch": 0.6092707268682739, "flos": 29643037666560.0, "grad_norm": 2.7554103288725544, "language_loss": 0.76339543, "learning_rate": 1.3987328590840629e-06, "loss": 0.78512293, "num_input_tokens_seen": 109209895, "step": 5067, "time_per_iteration": 2.6054129600524902 }, { "auxiliary_loss_clip": 0.01156096, "auxiliary_loss_mlp": 0.01028028, "balance_loss_clip": 1.04747033, "balance_loss_mlp": 1.0207144, "epoch": 0.609390969758913, "flos": 24024957200640.0, "grad_norm": 2.075536064425432, "language_loss": 0.86368924, "learning_rate": 1.397989968506783e-06, "loss": 0.88553053, "num_input_tokens_seen": 109228905, "step": 5068, "time_per_iteration": 2.4662129878997803 }, { "auxiliary_loss_clip": 0.01179347, "auxiliary_loss_mlp": 0.01034572, "balance_loss_clip": 1.05269992, "balance_loss_mlp": 1.02661729, "epoch": 0.6095112126495521, "flos": 11102143288320.0, "grad_norm": 2.228408867935824, "language_loss": 0.72331738, "learning_rate": 1.3972471692539458e-06, "loss": 0.74545658, "num_input_tokens_seen": 109243620, "step": 5069, "time_per_iteration": 2.3966426849365234 }, { "auxiliary_loss_clip": 0.01142669, "auxiliary_loss_mlp": 0.01023374, "balance_loss_clip": 1.04760742, "balance_loss_mlp": 1.01561379, "epoch": 0.6096314555401912, "flos": 17265491187840.0, "grad_norm": 2.8268495863944274, "language_loss": 0.75498974, "learning_rate": 1.3965044614382348e-06, "loss": 0.77665019, "num_input_tokens_seen": 109259070, "step": 5070, "time_per_iteration": 2.445420742034912 }, { "auxiliary_loss_clip": 0.01177421, "auxiliary_loss_mlp": 0.0102623, "balance_loss_clip": 1.05192733, "balance_loss_mlp": 1.01843357, "epoch": 0.6097516984308303, "flos": 21645910679040.0, "grad_norm": 2.1283286011042972, "language_loss": 0.75576556, "learning_rate": 1.3957618451723162e-06, "loss": 0.77780211, "num_input_tokens_seen": 109275100, "step": 5071, "time_per_iteration": 2.426062822341919 }, { "auxiliary_loss_clip": 0.01146785, "auxiliary_loss_mlp": 0.01028045, "balance_loss_clip": 1.04809344, "balance_loss_mlp": 1.02062118, "epoch": 0.6098719413214694, "flos": 27199208966400.0, "grad_norm": 1.9461103588496604, "language_loss": 0.71213925, "learning_rate": 1.3950193205688457e-06, "loss": 0.73388755, "num_input_tokens_seen": 109294825, "step": 5072, "time_per_iteration": 2.5500707626342773 }, { "auxiliary_loss_clip": 0.01143449, "auxiliary_loss_mlp": 0.01025237, "balance_loss_clip": 1.04828525, "balance_loss_mlp": 1.01748812, "epoch": 0.6099921842121084, "flos": 20412954385920.0, "grad_norm": 1.839722369053823, "language_loss": 0.83545953, "learning_rate": 1.3942768877404627e-06, "loss": 0.85714644, "num_input_tokens_seen": 109313790, "step": 5073, "time_per_iteration": 2.476454973220825 }, { "auxiliary_loss_clip": 0.01172301, "auxiliary_loss_mlp": 0.01026507, "balance_loss_clip": 1.04937267, "balance_loss_mlp": 1.01946735, "epoch": 0.6101124271027476, "flos": 23366139897600.0, "grad_norm": 1.548529755196695, "language_loss": 0.73547733, "learning_rate": 1.393534546799795e-06, "loss": 0.75746548, "num_input_tokens_seen": 109333490, "step": 5074, "time_per_iteration": 2.4650776386260986 }, { "auxiliary_loss_clip": 0.01138827, "auxiliary_loss_mlp": 0.01030173, "balance_loss_clip": 1.04760671, "balance_loss_mlp": 1.0218401, "epoch": 0.6102326699933867, "flos": 26687840993280.0, "grad_norm": 1.7553216402234515, "language_loss": 0.67627013, "learning_rate": 1.3927922978594536e-06, "loss": 0.69796014, "num_input_tokens_seen": 109354575, "step": 5075, "time_per_iteration": 2.534072160720825 }, { "auxiliary_loss_clip": 0.01060701, "auxiliary_loss_mlp": 0.01001463, "balance_loss_clip": 1.01683462, "balance_loss_mlp": 1.00058126, "epoch": 0.6103529128840257, "flos": 60644612551680.0, "grad_norm": 0.771954237713556, "language_loss": 0.57446122, "learning_rate": 1.3920501410320387e-06, "loss": 0.59508288, "num_input_tokens_seen": 109410690, "step": 5076, "time_per_iteration": 2.9833619594573975 }, { "auxiliary_loss_clip": 0.01146146, "auxiliary_loss_mlp": 0.01026915, "balance_loss_clip": 1.04586792, "balance_loss_mlp": 1.01887441, "epoch": 0.6104731557746649, "flos": 19021307806080.0, "grad_norm": 2.046806922183719, "language_loss": 0.75950503, "learning_rate": 1.3913080764301333e-06, "loss": 0.78123569, "num_input_tokens_seen": 109427650, "step": 5077, "time_per_iteration": 2.460766553878784 }, { "auxiliary_loss_clip": 0.01126712, "auxiliary_loss_mlp": 0.01035234, "balance_loss_clip": 1.04381859, "balance_loss_mlp": 1.02750337, "epoch": 0.6105933986653039, "flos": 23366894083200.0, "grad_norm": 1.9358012460082523, "language_loss": 0.7128607, "learning_rate": 1.3905661041663085e-06, "loss": 0.73448014, "num_input_tokens_seen": 109448835, "step": 5078, "time_per_iteration": 2.560464859008789 }, { "auxiliary_loss_clip": 0.01161571, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.05051482, "balance_loss_mlp": 1.02599907, "epoch": 0.610713641555943, "flos": 34637565006720.0, "grad_norm": 2.2979424204514616, "language_loss": 0.65208471, "learning_rate": 1.389824224353122e-06, "loss": 0.67404014, "num_input_tokens_seen": 109470425, "step": 5079, "time_per_iteration": 2.5610344409942627 }, { "auxiliary_loss_clip": 0.0115995, "auxiliary_loss_mlp": 0.01023647, "balance_loss_clip": 1.05076504, "balance_loss_mlp": 1.0160296, "epoch": 0.610833884446582, "flos": 26646471504000.0, "grad_norm": 1.644420073547241, "language_loss": 0.76982391, "learning_rate": 1.389082437103115e-06, "loss": 0.79165983, "num_input_tokens_seen": 109489695, "step": 5080, "time_per_iteration": 2.4779303073883057 }, { "auxiliary_loss_clip": 0.01129828, "auxiliary_loss_mlp": 0.01025592, "balance_loss_clip": 1.04391813, "balance_loss_mlp": 1.01730132, "epoch": 0.6109541273372212, "flos": 21215126868480.0, "grad_norm": 2.9723896096586984, "language_loss": 0.77760744, "learning_rate": 1.3883407425288172e-06, "loss": 0.79916167, "num_input_tokens_seen": 109510030, "step": 5081, "time_per_iteration": 2.553284168243408 }, { "auxiliary_loss_clip": 0.01142196, "auxiliary_loss_mlp": 0.01027912, "balance_loss_clip": 1.04558182, "balance_loss_mlp": 1.02015734, "epoch": 0.6110743702278603, "flos": 20084084438400.0, "grad_norm": 2.133305995274337, "language_loss": 0.79975712, "learning_rate": 1.3875991407427417e-06, "loss": 0.82145822, "num_input_tokens_seen": 109528255, "step": 5082, "time_per_iteration": 3.2947733402252197 }, { "auxiliary_loss_clip": 0.01046629, "auxiliary_loss_mlp": 0.01002104, "balance_loss_clip": 1.01670015, "balance_loss_mlp": 1.00110853, "epoch": 0.6111946131184993, "flos": 68302957438080.0, "grad_norm": 0.7707187304039955, "language_loss": 0.58226377, "learning_rate": 1.38685763185739e-06, "loss": 0.60275108, "num_input_tokens_seen": 109581915, "step": 5083, "time_per_iteration": 3.105914831161499 }, { "auxiliary_loss_clip": 0.01173656, "auxiliary_loss_mlp": 0.01026081, "balance_loss_clip": 1.05030227, "balance_loss_mlp": 1.01804602, "epoch": 0.6113148560091385, "flos": 19937676602880.0, "grad_norm": 2.415949856770844, "language_loss": 0.67725444, "learning_rate": 1.3861162159852476e-06, "loss": 0.69925183, "num_input_tokens_seen": 109600050, "step": 5084, "time_per_iteration": 2.410493850708008 }, { "auxiliary_loss_clip": 0.01151265, "auxiliary_loss_mlp": 0.01026259, "balance_loss_clip": 1.04899645, "balance_loss_mlp": 1.01778293, "epoch": 0.6114350988997775, "flos": 23731854220800.0, "grad_norm": 2.6658033470219458, "language_loss": 0.80039108, "learning_rate": 1.3853748932387875e-06, "loss": 0.82216632, "num_input_tokens_seen": 109620690, "step": 5085, "time_per_iteration": 3.3655753135681152 }, { "auxiliary_loss_clip": 0.0113418, "auxiliary_loss_mlp": 0.01021389, "balance_loss_clip": 1.04547048, "balance_loss_mlp": 1.01365805, "epoch": 0.6115553417904166, "flos": 24023700224640.0, "grad_norm": 2.353728573836011, "language_loss": 0.75099081, "learning_rate": 1.3846336637304671e-06, "loss": 0.77254653, "num_input_tokens_seen": 109638960, "step": 5086, "time_per_iteration": 2.484530210494995 }, { "auxiliary_loss_clip": 0.01141938, "auxiliary_loss_mlp": 0.01023628, "balance_loss_clip": 1.04962564, "balance_loss_mlp": 1.01587987, "epoch": 0.6116755846810558, "flos": 23733542160000.0, "grad_norm": 1.8829038915104588, "language_loss": 0.83109522, "learning_rate": 1.3838925275727316e-06, "loss": 0.8527509, "num_input_tokens_seen": 109659700, "step": 5087, "time_per_iteration": 2.5030009746551514 }, { "auxiliary_loss_clip": 0.01175185, "auxiliary_loss_mlp": 0.01024154, "balance_loss_clip": 1.05197108, "balance_loss_mlp": 1.01677525, "epoch": 0.6117958275716948, "flos": 18661626967680.0, "grad_norm": 2.269638300784814, "language_loss": 0.79136729, "learning_rate": 1.3831514848780089e-06, "loss": 0.81336069, "num_input_tokens_seen": 109679275, "step": 5088, "time_per_iteration": 2.4074864387512207 }, { "auxiliary_loss_clip": 0.01154594, "auxiliary_loss_mlp": 0.01027564, "balance_loss_clip": 1.04809511, "balance_loss_mlp": 1.02032161, "epoch": 0.6119160704623339, "flos": 16471183783680.0, "grad_norm": 2.675359491424576, "language_loss": 0.91493845, "learning_rate": 1.3824105357587152e-06, "loss": 0.93676007, "num_input_tokens_seen": 109696380, "step": 5089, "time_per_iteration": 3.2116165161132812 }, { "auxiliary_loss_clip": 0.01140288, "auxiliary_loss_mlp": 0.01026302, "balance_loss_clip": 1.04482126, "balance_loss_mlp": 1.01857138, "epoch": 0.612036313352973, "flos": 23915465568000.0, "grad_norm": 1.5189165546542074, "language_loss": 0.82676542, "learning_rate": 1.381669680327253e-06, "loss": 0.84843129, "num_input_tokens_seen": 109718060, "step": 5090, "time_per_iteration": 3.291295051574707 }, { "auxiliary_loss_clip": 0.01141033, "auxiliary_loss_mlp": 0.0102603, "balance_loss_clip": 1.04912925, "balance_loss_mlp": 1.01788795, "epoch": 0.6121565562436121, "flos": 26974766833920.0, "grad_norm": 1.9626741935955292, "language_loss": 0.7062344, "learning_rate": 1.380928918696008e-06, "loss": 0.72790504, "num_input_tokens_seen": 109736830, "step": 5091, "time_per_iteration": 2.530230760574341 }, { "auxiliary_loss_clip": 0.011591, "auxiliary_loss_mlp": 0.01025017, "balance_loss_clip": 1.04774833, "balance_loss_mlp": 1.01716101, "epoch": 0.6122767991342511, "flos": 15668867646720.0, "grad_norm": 3.1379894187401, "language_loss": 0.71554208, "learning_rate": 1.3801882509773548e-06, "loss": 0.73738325, "num_input_tokens_seen": 109754690, "step": 5092, "time_per_iteration": 2.4204964637756348 }, { "auxiliary_loss_clip": 0.01154851, "auxiliary_loss_mlp": 0.01025703, "balance_loss_clip": 1.04695797, "balance_loss_mlp": 1.01766789, "epoch": 0.6123970420248903, "flos": 27964321591680.0, "grad_norm": 4.2475396185683, "language_loss": 0.81701678, "learning_rate": 1.3794476772836503e-06, "loss": 0.83882236, "num_input_tokens_seen": 109775790, "step": 5093, "time_per_iteration": 2.506866693496704 }, { "auxiliary_loss_clip": 0.01126149, "auxiliary_loss_mlp": 0.01029923, "balance_loss_clip": 1.04717636, "balance_loss_mlp": 1.02154219, "epoch": 0.6125172849155294, "flos": 21468727866240.0, "grad_norm": 1.6608983218580216, "language_loss": 0.84496176, "learning_rate": 1.3787071977272402e-06, "loss": 0.86652255, "num_input_tokens_seen": 109795050, "step": 5094, "time_per_iteration": 2.5703883171081543 }, { "auxiliary_loss_clip": 0.01112877, "auxiliary_loss_mlp": 0.01030748, "balance_loss_clip": 1.0462805, "balance_loss_mlp": 1.02293372, "epoch": 0.6126375278061684, "flos": 16248321849600.0, "grad_norm": 3.09254998674022, "language_loss": 0.71797681, "learning_rate": 1.3779668124204535e-06, "loss": 0.73941302, "num_input_tokens_seen": 109811465, "step": 5095, "time_per_iteration": 2.509732484817505 }, { "auxiliary_loss_clip": 0.01141317, "auxiliary_loss_mlp": 0.01027106, "balance_loss_clip": 1.04918694, "balance_loss_mlp": 1.01908934, "epoch": 0.6127577706968076, "flos": 20448865008000.0, "grad_norm": 1.501077716527644, "language_loss": 0.80910319, "learning_rate": 1.3772265214756074e-06, "loss": 0.83078742, "num_input_tokens_seen": 109831225, "step": 5096, "time_per_iteration": 2.4700565338134766 }, { "auxiliary_loss_clip": 0.01161292, "auxiliary_loss_mlp": 0.01028791, "balance_loss_clip": 1.04705644, "balance_loss_mlp": 1.0211854, "epoch": 0.6128780135874466, "flos": 18260397072000.0, "grad_norm": 1.8857198553529981, "language_loss": 0.75419867, "learning_rate": 1.3764863250050025e-06, "loss": 0.77609956, "num_input_tokens_seen": 109849465, "step": 5097, "time_per_iteration": 2.4436445236206055 }, { "auxiliary_loss_clip": 0.01132261, "auxiliary_loss_mlp": 0.01028296, "balance_loss_clip": 1.04572415, "balance_loss_mlp": 1.02087569, "epoch": 0.6129982564780857, "flos": 24937088192640.0, "grad_norm": 1.792018576268544, "language_loss": 0.80602854, "learning_rate": 1.3757462231209272e-06, "loss": 0.8276341, "num_input_tokens_seen": 109869770, "step": 5098, "time_per_iteration": 2.5587480068206787 }, { "auxiliary_loss_clip": 0.01139473, "auxiliary_loss_mlp": 0.01024067, "balance_loss_clip": 1.04609632, "balance_loss_mlp": 1.01593113, "epoch": 0.6131184993687249, "flos": 22492038430080.0, "grad_norm": 1.983903909425592, "language_loss": 0.88618672, "learning_rate": 1.3750062159356525e-06, "loss": 0.90782213, "num_input_tokens_seen": 109889120, "step": 5099, "time_per_iteration": 2.504131555557251 }, { "auxiliary_loss_clip": 0.0112095, "auxiliary_loss_mlp": 0.01027434, "balance_loss_clip": 1.04404151, "balance_loss_mlp": 1.01998353, "epoch": 0.6132387422593639, "flos": 15885839750400.0, "grad_norm": 1.6749461859328214, "language_loss": 0.83296037, "learning_rate": 1.3742663035614382e-06, "loss": 0.85444415, "num_input_tokens_seen": 109906490, "step": 5100, "time_per_iteration": 2.482245683670044 }, { "auxiliary_loss_clip": 0.01175817, "auxiliary_loss_mlp": 0.01030878, "balance_loss_clip": 1.05095983, "balance_loss_mlp": 1.0227834, "epoch": 0.613358985150003, "flos": 25411539962880.0, "grad_norm": 3.079416876949993, "language_loss": 0.80385959, "learning_rate": 1.3735264861105283e-06, "loss": 0.82592654, "num_input_tokens_seen": 109927130, "step": 5101, "time_per_iteration": 2.4548351764678955 }, { "auxiliary_loss_clip": 0.01133169, "auxiliary_loss_mlp": 0.01026517, "balance_loss_clip": 1.04508257, "balance_loss_mlp": 1.01901579, "epoch": 0.6134792280406421, "flos": 21361283308800.0, "grad_norm": 2.121434392117287, "language_loss": 0.78416133, "learning_rate": 1.372786763695152e-06, "loss": 0.80575818, "num_input_tokens_seen": 109945890, "step": 5102, "time_per_iteration": 2.516022205352783 }, { "auxiliary_loss_clip": 0.01160849, "auxiliary_loss_mlp": 0.01031982, "balance_loss_clip": 1.04775882, "balance_loss_mlp": 1.024001, "epoch": 0.6135994709312812, "flos": 21211248199680.0, "grad_norm": 3.0606831251367246, "language_loss": 0.77118242, "learning_rate": 1.3720471364275257e-06, "loss": 0.79311073, "num_input_tokens_seen": 109965535, "step": 5103, "time_per_iteration": 2.4676713943481445 }, { "auxiliary_loss_clip": 0.0112764, "auxiliary_loss_mlp": 0.00762904, "balance_loss_clip": 1.04507315, "balance_loss_mlp": 1.0007174, "epoch": 0.6137197138219203, "flos": 14794047907200.0, "grad_norm": 2.074752507963412, "language_loss": 0.78232431, "learning_rate": 1.3713076044198486e-06, "loss": 0.80122972, "num_input_tokens_seen": 109982345, "step": 5104, "time_per_iteration": 2.4856326580047607 }, { "auxiliary_loss_clip": 0.01140532, "auxiliary_loss_mlp": 0.01031838, "balance_loss_clip": 1.04648161, "balance_loss_mlp": 1.02377343, "epoch": 0.6138399567125594, "flos": 20084515401600.0, "grad_norm": 2.6463185183969746, "language_loss": 0.81177032, "learning_rate": 1.3705681677843086e-06, "loss": 0.83349407, "num_input_tokens_seen": 110000940, "step": 5105, "time_per_iteration": 2.4687111377716064 }, { "auxiliary_loss_clip": 0.01073599, "auxiliary_loss_mlp": 0.01001425, "balance_loss_clip": 1.01608825, "balance_loss_mlp": 1.00047779, "epoch": 0.6139601996031985, "flos": 60123838193280.0, "grad_norm": 0.8062958448978689, "language_loss": 0.60640794, "learning_rate": 1.3698288266330768e-06, "loss": 0.62715822, "num_input_tokens_seen": 110061565, "step": 5106, "time_per_iteration": 3.095419406890869 }, { "auxiliary_loss_clip": 0.01143363, "auxiliary_loss_mlp": 0.01023012, "balance_loss_clip": 1.05189347, "balance_loss_mlp": 1.01555526, "epoch": 0.6140804424938375, "flos": 23586703361280.0, "grad_norm": 3.761532254880022, "language_loss": 0.72768337, "learning_rate": 1.3690895810783113e-06, "loss": 0.74934709, "num_input_tokens_seen": 110080360, "step": 5107, "time_per_iteration": 2.5079920291900635 }, { "auxiliary_loss_clip": 0.01111024, "auxiliary_loss_mlp": 0.00762779, "balance_loss_clip": 1.0410949, "balance_loss_mlp": 1.00069571, "epoch": 0.6142006853844767, "flos": 21398199511680.0, "grad_norm": 2.859343515641965, "language_loss": 0.71429312, "learning_rate": 1.3683504312321543e-06, "loss": 0.73303109, "num_input_tokens_seen": 110100695, "step": 5108, "time_per_iteration": 3.3253512382507324 }, { "auxiliary_loss_clip": 0.01164364, "auxiliary_loss_mlp": 0.01027314, "balance_loss_clip": 1.04948795, "balance_loss_mlp": 1.01930869, "epoch": 0.6143209282751158, "flos": 12057367622400.0, "grad_norm": 2.2641694690977325, "language_loss": 0.80268037, "learning_rate": 1.3676113772067355e-06, "loss": 0.82459706, "num_input_tokens_seen": 110117750, "step": 5109, "time_per_iteration": 2.4397342205047607 }, { "auxiliary_loss_clip": 0.01122927, "auxiliary_loss_mlp": 0.01024662, "balance_loss_clip": 1.04479313, "balance_loss_mlp": 1.01677656, "epoch": 0.6144411711657548, "flos": 25082274965760.0, "grad_norm": 1.7993815904853432, "language_loss": 0.72608173, "learning_rate": 1.3668724191141671e-06, "loss": 0.74755764, "num_input_tokens_seen": 110137020, "step": 5110, "time_per_iteration": 2.574392557144165 }, { "auxiliary_loss_clip": 0.01131348, "auxiliary_loss_mlp": 0.01033588, "balance_loss_clip": 1.05282533, "balance_loss_mlp": 1.02523756, "epoch": 0.6145614140563939, "flos": 20114069316480.0, "grad_norm": 2.168741696093098, "language_loss": 0.6672402, "learning_rate": 1.3661335570665493e-06, "loss": 0.68888962, "num_input_tokens_seen": 110154930, "step": 5111, "time_per_iteration": 2.508439302444458 }, { "auxiliary_loss_clip": 0.01151295, "auxiliary_loss_mlp": 0.01028893, "balance_loss_clip": 1.05130553, "balance_loss_mlp": 1.02132928, "epoch": 0.614681656947033, "flos": 16800376953600.0, "grad_norm": 2.4082095880795484, "language_loss": 0.70047927, "learning_rate": 1.3653947911759676e-06, "loss": 0.72228116, "num_input_tokens_seen": 110172480, "step": 5112, "time_per_iteration": 3.3108434677124023 }, { "auxiliary_loss_clip": 0.01110389, "auxiliary_loss_mlp": 0.01032534, "balance_loss_clip": 1.04393744, "balance_loss_mlp": 1.0240581, "epoch": 0.6148018998376721, "flos": 38801587011840.0, "grad_norm": 1.9050274812869563, "language_loss": 0.74706751, "learning_rate": 1.3646561215544904e-06, "loss": 0.76849675, "num_input_tokens_seen": 110197120, "step": 5113, "time_per_iteration": 2.7064969539642334 }, { "auxiliary_loss_clip": 0.01161413, "auxiliary_loss_mlp": 0.01024166, "balance_loss_clip": 1.05038035, "balance_loss_mlp": 1.01647115, "epoch": 0.6149221427283111, "flos": 23327032965120.0, "grad_norm": 2.196896747359998, "language_loss": 0.79495418, "learning_rate": 1.363917548314176e-06, "loss": 0.81681001, "num_input_tokens_seen": 110216385, "step": 5114, "time_per_iteration": 2.507715940475464 }, { "auxiliary_loss_clip": 0.01167551, "auxiliary_loss_mlp": 0.01028041, "balance_loss_clip": 1.04987669, "balance_loss_mlp": 1.02024519, "epoch": 0.6150423856189503, "flos": 22379494141440.0, "grad_norm": 1.6859480971957332, "language_loss": 0.73069608, "learning_rate": 1.3631790715670626e-06, "loss": 0.75265205, "num_input_tokens_seen": 110234790, "step": 5115, "time_per_iteration": 3.320070266723633 }, { "auxiliary_loss_clip": 0.01080253, "auxiliary_loss_mlp": 0.01024134, "balance_loss_clip": 1.04433727, "balance_loss_mlp": 1.01688588, "epoch": 0.6151626285095894, "flos": 18692078722560.0, "grad_norm": 1.8132990081572578, "language_loss": 0.85411805, "learning_rate": 1.3624406914251783e-06, "loss": 0.87516189, "num_input_tokens_seen": 110251910, "step": 5116, "time_per_iteration": 2.614272356033325 }, { "auxiliary_loss_clip": 0.011616, "auxiliary_loss_mlp": 0.01028105, "balance_loss_clip": 1.04844487, "balance_loss_mlp": 1.0209291, "epoch": 0.6152828714002284, "flos": 15851688894720.0, "grad_norm": 1.875999404706754, "language_loss": 0.87926996, "learning_rate": 1.3617024080005335e-06, "loss": 0.90116704, "num_input_tokens_seen": 110268810, "step": 5117, "time_per_iteration": 3.205634593963623 }, { "auxiliary_loss_clip": 0.01148334, "auxiliary_loss_mlp": 0.00762388, "balance_loss_clip": 1.04618704, "balance_loss_mlp": 1.00068903, "epoch": 0.6154031142908676, "flos": 24869792062080.0, "grad_norm": 1.5118296874236248, "language_loss": 0.74606073, "learning_rate": 1.3609642214051266e-06, "loss": 0.76516789, "num_input_tokens_seen": 110293035, "step": 5118, "time_per_iteration": 2.586886405944824 }, { "auxiliary_loss_clip": 0.01143076, "auxiliary_loss_mlp": 0.01031343, "balance_loss_clip": 1.05111957, "balance_loss_mlp": 1.02277803, "epoch": 0.6155233571815066, "flos": 19244744357760.0, "grad_norm": 1.8796245593984653, "language_loss": 0.66211236, "learning_rate": 1.3602261317509385e-06, "loss": 0.68385661, "num_input_tokens_seen": 110309695, "step": 5119, "time_per_iteration": 2.4761598110198975 }, { "auxiliary_loss_clip": 0.0116176, "auxiliary_loss_mlp": 0.01025005, "balance_loss_clip": 1.04947948, "balance_loss_mlp": 1.01654756, "epoch": 0.6156436000721457, "flos": 18770077105920.0, "grad_norm": 2.692538032462891, "language_loss": 0.82466644, "learning_rate": 1.3594881391499387e-06, "loss": 0.84653413, "num_input_tokens_seen": 110328610, "step": 5120, "time_per_iteration": 2.4384398460388184 }, { "auxiliary_loss_clip": 0.01149388, "auxiliary_loss_mlp": 0.01026951, "balance_loss_clip": 1.04891694, "balance_loss_mlp": 1.0190537, "epoch": 0.6157638429627849, "flos": 18041198325120.0, "grad_norm": 2.0166243919535636, "language_loss": 0.79189581, "learning_rate": 1.3587502437140778e-06, "loss": 0.81365919, "num_input_tokens_seen": 110346775, "step": 5121, "time_per_iteration": 2.4741883277893066 }, { "auxiliary_loss_clip": 0.01149189, "auxiliary_loss_mlp": 0.01029567, "balance_loss_clip": 1.04741859, "balance_loss_mlp": 1.02150834, "epoch": 0.6158840858534239, "flos": 25556726736000.0, "grad_norm": 2.2288144063697803, "language_loss": 0.85067892, "learning_rate": 1.3580124455552952e-06, "loss": 0.87246656, "num_input_tokens_seen": 110366140, "step": 5122, "time_per_iteration": 2.5107157230377197 }, { "auxiliary_loss_clip": 0.01161623, "auxiliary_loss_mlp": 0.00761794, "balance_loss_clip": 1.0498805, "balance_loss_mlp": 1.00073385, "epoch": 0.616004328744063, "flos": 24640788902400.0, "grad_norm": 1.6461498777306336, "language_loss": 0.87382662, "learning_rate": 1.3572747447855148e-06, "loss": 0.8930608, "num_input_tokens_seen": 110386550, "step": 5123, "time_per_iteration": 2.4893503189086914 }, { "auxiliary_loss_clip": 0.01178223, "auxiliary_loss_mlp": 0.0102792, "balance_loss_clip": 1.05366778, "balance_loss_mlp": 1.01996326, "epoch": 0.6161245716347021, "flos": 21689686379520.0, "grad_norm": 2.2205169980477075, "language_loss": 0.69211447, "learning_rate": 1.356537141516644e-06, "loss": 0.71417594, "num_input_tokens_seen": 110403970, "step": 5124, "time_per_iteration": 2.4197683334350586 }, { "auxiliary_loss_clip": 0.01162462, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.05254543, "balance_loss_mlp": 1.01904678, "epoch": 0.6162448145253412, "flos": 35189225061120.0, "grad_norm": 2.1713747739713054, "language_loss": 0.61966944, "learning_rate": 1.3557996358605775e-06, "loss": 0.6415633, "num_input_tokens_seen": 110423890, "step": 5125, "time_per_iteration": 2.5521202087402344 }, { "auxiliary_loss_clip": 0.01159368, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.04893374, "balance_loss_mlp": 1.02369475, "epoch": 0.6163650574159802, "flos": 21615279356160.0, "grad_norm": 2.137293296134422, "language_loss": 0.69733453, "learning_rate": 1.3550622279291941e-06, "loss": 0.71924269, "num_input_tokens_seen": 110442035, "step": 5126, "time_per_iteration": 2.4518954753875732 }, { "auxiliary_loss_clip": 0.01108061, "auxiliary_loss_mlp": 0.01026371, "balance_loss_clip": 1.04277468, "balance_loss_mlp": 1.01847303, "epoch": 0.6164853003066194, "flos": 24572163968640.0, "grad_norm": 1.4525271136953573, "language_loss": 0.83256698, "learning_rate": 1.354324917834358e-06, "loss": 0.85391128, "num_input_tokens_seen": 110463280, "step": 5127, "time_per_iteration": 2.5621418952941895 }, { "auxiliary_loss_clip": 0.01102617, "auxiliary_loss_mlp": 0.00762848, "balance_loss_clip": 1.04464948, "balance_loss_mlp": 1.00073767, "epoch": 0.6166055431972585, "flos": 21835986474240.0, "grad_norm": 1.740837685526935, "language_loss": 0.7687853, "learning_rate": 1.353587705687918e-06, "loss": 0.78744, "num_input_tokens_seen": 110481455, "step": 5128, "time_per_iteration": 2.589176654815674 }, { "auxiliary_loss_clip": 0.01152411, "auxiliary_loss_mlp": 0.01026174, "balance_loss_clip": 1.05061007, "balance_loss_mlp": 1.01831222, "epoch": 0.6167257860878975, "flos": 17785262943360.0, "grad_norm": 18.805756662928104, "language_loss": 0.71780175, "learning_rate": 1.3528505916017096e-06, "loss": 0.73958755, "num_input_tokens_seen": 110499155, "step": 5129, "time_per_iteration": 2.4418556690216064 }, { "auxiliary_loss_clip": 0.01160673, "auxiliary_loss_mlp": 0.01030876, "balance_loss_clip": 1.04801965, "balance_loss_mlp": 1.02291274, "epoch": 0.6168460289785367, "flos": 23214811898880.0, "grad_norm": 1.9956938296100017, "language_loss": 0.88457108, "learning_rate": 1.3521135756875514e-06, "loss": 0.90648663, "num_input_tokens_seen": 110515470, "step": 5130, "time_per_iteration": 2.4279496669769287 }, { "auxiliary_loss_clip": 0.01096143, "auxiliary_loss_mlp": 0.01027447, "balance_loss_clip": 1.04391456, "balance_loss_mlp": 1.02001441, "epoch": 0.6169662718691757, "flos": 26213281482240.0, "grad_norm": 1.5922104159147648, "language_loss": 0.8646912, "learning_rate": 1.3513766580572496e-06, "loss": 0.88592708, "num_input_tokens_seen": 110538290, "step": 5131, "time_per_iteration": 2.64207124710083 }, { "auxiliary_loss_clip": 0.01158315, "auxiliary_loss_mlp": 0.01025482, "balance_loss_clip": 1.0488596, "balance_loss_mlp": 1.01830602, "epoch": 0.6170865147598148, "flos": 19026120228480.0, "grad_norm": 2.001301953903268, "language_loss": 0.77359492, "learning_rate": 1.3506398388225924e-06, "loss": 0.79543287, "num_input_tokens_seen": 110555610, "step": 5132, "time_per_iteration": 2.4383535385131836 }, { "auxiliary_loss_clip": 0.0117407, "auxiliary_loss_mlp": 0.0102423, "balance_loss_clip": 1.05286646, "balance_loss_mlp": 1.01690412, "epoch": 0.617206757650454, "flos": 18260361158400.0, "grad_norm": 1.8237794699146703, "language_loss": 0.71642679, "learning_rate": 1.349903118095355e-06, "loss": 0.73840976, "num_input_tokens_seen": 110574745, "step": 5133, "time_per_iteration": 2.411007881164551 }, { "auxiliary_loss_clip": 0.0116385, "auxiliary_loss_mlp": 0.01029153, "balance_loss_clip": 1.04898012, "balance_loss_mlp": 1.02131474, "epoch": 0.617327000541093, "flos": 18186959715840.0, "grad_norm": 1.634229230515609, "language_loss": 0.73205948, "learning_rate": 1.349166495987298e-06, "loss": 0.75398946, "num_input_tokens_seen": 110593310, "step": 5134, "time_per_iteration": 2.4359846115112305 }, { "auxiliary_loss_clip": 0.01061904, "auxiliary_loss_mlp": 0.01021128, "balance_loss_clip": 1.02895141, "balance_loss_mlp": 1.02000737, "epoch": 0.6174472434317321, "flos": 61833796122240.0, "grad_norm": 0.8233008660393084, "language_loss": 0.60909635, "learning_rate": 1.348429972610166e-06, "loss": 0.62992668, "num_input_tokens_seen": 110657615, "step": 5135, "time_per_iteration": 3.934006929397583 }, { "auxiliary_loss_clip": 0.01037221, "auxiliary_loss_mlp": 0.01008393, "balance_loss_clip": 1.03129709, "balance_loss_mlp": 1.00718272, "epoch": 0.6175674863223712, "flos": 71230970494080.0, "grad_norm": 0.8490553402599413, "language_loss": 0.57908899, "learning_rate": 1.3476935480756897e-06, "loss": 0.59954512, "num_input_tokens_seen": 110714365, "step": 5136, "time_per_iteration": 3.005431890487671 }, { "auxiliary_loss_clip": 0.01121309, "auxiliary_loss_mlp": 0.01033753, "balance_loss_clip": 1.0434444, "balance_loss_mlp": 1.0258677, "epoch": 0.6176877292130103, "flos": 21835447770240.0, "grad_norm": 2.03787729210879, "language_loss": 0.756091, "learning_rate": 1.346957222495583e-06, "loss": 0.77764165, "num_input_tokens_seen": 110732160, "step": 5137, "time_per_iteration": 2.5312812328338623 }, { "auxiliary_loss_clip": 0.01151661, "auxiliary_loss_mlp": 0.00762518, "balance_loss_clip": 1.04998922, "balance_loss_mlp": 1.00070214, "epoch": 0.6178079721036493, "flos": 17741738638080.0, "grad_norm": 3.0990128599154794, "language_loss": 0.71393383, "learning_rate": 1.3462209959815466e-06, "loss": 0.73307556, "num_input_tokens_seen": 110746900, "step": 5138, "time_per_iteration": 2.4329543113708496 }, { "auxiliary_loss_clip": 0.01151018, "auxiliary_loss_mlp": 0.01027787, "balance_loss_clip": 1.05056465, "balance_loss_mlp": 1.01985025, "epoch": 0.6179282149942885, "flos": 22633131052800.0, "grad_norm": 1.7254070638110282, "language_loss": 0.74007356, "learning_rate": 1.345484868645265e-06, "loss": 0.76186162, "num_input_tokens_seen": 110765710, "step": 5139, "time_per_iteration": 3.324608087539673 }, { "auxiliary_loss_clip": 0.01140696, "auxiliary_loss_mlp": 0.01030782, "balance_loss_clip": 1.04719186, "balance_loss_mlp": 1.02263427, "epoch": 0.6180484578849276, "flos": 22310330503680.0, "grad_norm": 1.93353122568057, "language_loss": 0.78708959, "learning_rate": 1.3447488405984088e-06, "loss": 0.80880433, "num_input_tokens_seen": 110783970, "step": 5140, "time_per_iteration": 2.5369367599487305 }, { "auxiliary_loss_clip": 0.01144588, "auxiliary_loss_mlp": 0.01024222, "balance_loss_clip": 1.04718387, "balance_loss_mlp": 1.01670551, "epoch": 0.6181687007755666, "flos": 35225458905600.0, "grad_norm": 2.1289467633489254, "language_loss": 0.69880098, "learning_rate": 1.3440129119526322e-06, "loss": 0.72048903, "num_input_tokens_seen": 110806395, "step": 5141, "time_per_iteration": 3.3777740001678467 }, { "auxiliary_loss_clip": 0.0107402, "auxiliary_loss_mlp": 0.01004253, "balance_loss_clip": 1.01679921, "balance_loss_mlp": 1.003335, "epoch": 0.6182889436662057, "flos": 61547370094080.0, "grad_norm": 0.8001682396038746, "language_loss": 0.51233035, "learning_rate": 1.3432770828195762e-06, "loss": 0.53311312, "num_input_tokens_seen": 110867380, "step": 5142, "time_per_iteration": 3.17407488822937 }, { "auxiliary_loss_clip": 0.01120848, "auxiliary_loss_mlp": 0.01022626, "balance_loss_clip": 1.04271865, "balance_loss_mlp": 1.01443088, "epoch": 0.6184091865568448, "flos": 19609991804160.0, "grad_norm": 2.3179526312180276, "language_loss": 0.70428991, "learning_rate": 1.3425413533108635e-06, "loss": 0.72572464, "num_input_tokens_seen": 110885980, "step": 5143, "time_per_iteration": 3.317586898803711 }, { "auxiliary_loss_clip": 0.01120033, "auxiliary_loss_mlp": 0.01026638, "balance_loss_clip": 1.04843521, "balance_loss_mlp": 1.01929438, "epoch": 0.6185294294474839, "flos": 23586882929280.0, "grad_norm": 2.0891966194110334, "language_loss": 0.70644093, "learning_rate": 1.341805723538105e-06, "loss": 0.72790766, "num_input_tokens_seen": 110906085, "step": 5144, "time_per_iteration": 2.6144511699676514 }, { "auxiliary_loss_clip": 0.01155755, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 1.05108666, "balance_loss_mlp": 1.02363276, "epoch": 0.618649672338123, "flos": 26762032535040.0, "grad_norm": 1.6604391015252626, "language_loss": 0.77731955, "learning_rate": 1.3410701936128948e-06, "loss": 0.79919589, "num_input_tokens_seen": 110928865, "step": 5145, "time_per_iteration": 2.5302317142486572 }, { "auxiliary_loss_clip": 0.01163699, "auxiliary_loss_mlp": 0.01027024, "balance_loss_clip": 1.05389094, "balance_loss_mlp": 1.01953161, "epoch": 0.6187699152287621, "flos": 14456630522880.0, "grad_norm": 2.5091876659540846, "language_loss": 0.84897012, "learning_rate": 1.340334763646812e-06, "loss": 0.87087739, "num_input_tokens_seen": 110943000, "step": 5146, "time_per_iteration": 2.3902246952056885 }, { "auxiliary_loss_clip": 0.01177569, "auxiliary_loss_mlp": 0.01030709, "balance_loss_clip": 1.05235517, "balance_loss_mlp": 1.02191734, "epoch": 0.6188901581194012, "flos": 20084766796800.0, "grad_norm": 1.8313339041881738, "language_loss": 0.74298638, "learning_rate": 1.3395994337514218e-06, "loss": 0.76506919, "num_input_tokens_seen": 110963170, "step": 5147, "time_per_iteration": 2.4604387283325195 }, { "auxiliary_loss_clip": 0.01153079, "auxiliary_loss_mlp": 0.01028197, "balance_loss_clip": 1.04697967, "balance_loss_mlp": 1.02035332, "epoch": 0.6190104010100402, "flos": 25700728360320.0, "grad_norm": 1.6864189167042607, "language_loss": 0.78778219, "learning_rate": 1.3388642040382725e-06, "loss": 0.80959487, "num_input_tokens_seen": 110983595, "step": 5148, "time_per_iteration": 2.4791977405548096 }, { "auxiliary_loss_clip": 0.01133196, "auxiliary_loss_mlp": 0.01025399, "balance_loss_clip": 1.04170585, "balance_loss_mlp": 1.01743031, "epoch": 0.6191306439006794, "flos": 30442372974720.0, "grad_norm": 1.7344759551081834, "language_loss": 0.84180474, "learning_rate": 1.3381290746188975e-06, "loss": 0.86339074, "num_input_tokens_seen": 111002965, "step": 5149, "time_per_iteration": 2.581310272216797 }, { "auxiliary_loss_clip": 0.01162987, "auxiliary_loss_mlp": 0.01032321, "balance_loss_clip": 1.05254984, "balance_loss_mlp": 1.02409577, "epoch": 0.6192508867913185, "flos": 26685793918080.0, "grad_norm": 1.6903521483416424, "language_loss": 0.67075741, "learning_rate": 1.3373940456048152e-06, "loss": 0.69271052, "num_input_tokens_seen": 111022990, "step": 5150, "time_per_iteration": 2.4894511699676514 }, { "auxiliary_loss_clip": 0.01174787, "auxiliary_loss_mlp": 0.01024873, "balance_loss_clip": 1.05245173, "balance_loss_mlp": 1.01749969, "epoch": 0.6193711296819575, "flos": 36722036090880.0, "grad_norm": 1.8015590074361678, "language_loss": 0.5909031, "learning_rate": 1.3366591171075299e-06, "loss": 0.61289972, "num_input_tokens_seen": 111046495, "step": 5151, "time_per_iteration": 2.550067186355591 }, { "auxiliary_loss_clip": 0.01145976, "auxiliary_loss_mlp": 0.0102376, "balance_loss_clip": 1.04841852, "balance_loss_mlp": 1.01637459, "epoch": 0.6194913725725967, "flos": 25192556697600.0, "grad_norm": 1.9518248568930159, "language_loss": 0.91008627, "learning_rate": 1.335924289238529e-06, "loss": 0.93178362, "num_input_tokens_seen": 111065705, "step": 5152, "time_per_iteration": 2.516597270965576 }, { "auxiliary_loss_clip": 0.0114438, "auxiliary_loss_mlp": 0.00763011, "balance_loss_clip": 1.05101037, "balance_loss_mlp": 1.00079656, "epoch": 0.6196116154632357, "flos": 21178821196800.0, "grad_norm": 1.5269647474180734, "language_loss": 0.76932395, "learning_rate": 1.3351895621092859e-06, "loss": 0.78839779, "num_input_tokens_seen": 111086050, "step": 5153, "time_per_iteration": 2.5211715698242188 }, { "auxiliary_loss_clip": 0.01081732, "auxiliary_loss_mlp": 0.01029174, "balance_loss_clip": 1.03751802, "balance_loss_mlp": 1.02149105, "epoch": 0.6197318583538748, "flos": 16253744803200.0, "grad_norm": 1.8713889016813556, "language_loss": 0.76723945, "learning_rate": 1.3344549358312567e-06, "loss": 0.7883485, "num_input_tokens_seen": 111104450, "step": 5154, "time_per_iteration": 2.616731643676758 }, { "auxiliary_loss_clip": 0.01165772, "auxiliary_loss_mlp": 0.01023722, "balance_loss_clip": 1.05164266, "balance_loss_mlp": 1.01562738, "epoch": 0.619852101244514, "flos": 24425612478720.0, "grad_norm": 1.8889001754231425, "language_loss": 0.78362107, "learning_rate": 1.3337204105158852e-06, "loss": 0.805516, "num_input_tokens_seen": 111123320, "step": 5155, "time_per_iteration": 2.4758620262145996 }, { "auxiliary_loss_clip": 0.01115153, "auxiliary_loss_mlp": 0.01027215, "balance_loss_clip": 1.03721404, "balance_loss_mlp": 1.01937151, "epoch": 0.619972344135153, "flos": 16727298733440.0, "grad_norm": 1.8317029600533525, "language_loss": 0.72757632, "learning_rate": 1.332985986274597e-06, "loss": 0.74899995, "num_input_tokens_seen": 111140950, "step": 5156, "time_per_iteration": 2.491074323654175 }, { "auxiliary_loss_clip": 0.01095943, "auxiliary_loss_mlp": 0.00762421, "balance_loss_clip": 1.04548001, "balance_loss_mlp": 1.00068665, "epoch": 0.6200925870257921, "flos": 12495190498560.0, "grad_norm": 1.9978033146800827, "language_loss": 0.75580359, "learning_rate": 1.3322516632188047e-06, "loss": 0.77438724, "num_input_tokens_seen": 111157845, "step": 5157, "time_per_iteration": 2.5521092414855957 }, { "auxiliary_loss_clip": 0.01129246, "auxiliary_loss_mlp": 0.01027925, "balance_loss_clip": 1.0455339, "balance_loss_mlp": 1.01991391, "epoch": 0.6202128299164312, "flos": 26539350168960.0, "grad_norm": 1.7963965760158007, "language_loss": 0.6719082, "learning_rate": 1.3315174414599045e-06, "loss": 0.69347996, "num_input_tokens_seen": 111179165, "step": 5158, "time_per_iteration": 2.5751895904541016 }, { "auxiliary_loss_clip": 0.01156214, "auxiliary_loss_mlp": 0.01024233, "balance_loss_clip": 1.04734278, "balance_loss_mlp": 1.01568592, "epoch": 0.6203330728070703, "flos": 18770508069120.0, "grad_norm": 2.1245506726345535, "language_loss": 0.75202847, "learning_rate": 1.3307833211092768e-06, "loss": 0.77383298, "num_input_tokens_seen": 111197830, "step": 5159, "time_per_iteration": 2.4460315704345703 }, { "auxiliary_loss_clip": 0.01177647, "auxiliary_loss_mlp": 0.01031116, "balance_loss_clip": 1.05416012, "balance_loss_mlp": 1.02321851, "epoch": 0.6204533156977093, "flos": 20629782835200.0, "grad_norm": 1.6359629451403146, "language_loss": 0.75242364, "learning_rate": 1.3300493022782873e-06, "loss": 0.77451134, "num_input_tokens_seen": 111218400, "step": 5160, "time_per_iteration": 2.437129020690918 }, { "auxiliary_loss_clip": 0.01109571, "auxiliary_loss_mlp": 0.00763293, "balance_loss_clip": 1.04406416, "balance_loss_mlp": 1.00076747, "epoch": 0.6205735585883485, "flos": 17348050598400.0, "grad_norm": 1.789987078371122, "language_loss": 0.72415805, "learning_rate": 1.3293153850782855e-06, "loss": 0.74288672, "num_input_tokens_seen": 111236720, "step": 5161, "time_per_iteration": 2.529670000076294 }, { "auxiliary_loss_clip": 0.01123569, "auxiliary_loss_mlp": 0.01028339, "balance_loss_clip": 1.04506731, "balance_loss_mlp": 1.0192728, "epoch": 0.6206938014789876, "flos": 22965017742720.0, "grad_norm": 2.394727344070964, "language_loss": 0.71178114, "learning_rate": 1.3285815696206069e-06, "loss": 0.73330021, "num_input_tokens_seen": 111258265, "step": 5162, "time_per_iteration": 3.2981951236724854 }, { "auxiliary_loss_clip": 0.01133436, "auxiliary_loss_mlp": 0.01032915, "balance_loss_clip": 1.04450345, "balance_loss_mlp": 1.02477264, "epoch": 0.6208140443696266, "flos": 23983192661760.0, "grad_norm": 2.69615125241396, "language_loss": 0.7683441, "learning_rate": 1.32784785601657e-06, "loss": 0.79000771, "num_input_tokens_seen": 111277675, "step": 5163, "time_per_iteration": 2.548370599746704 }, { "auxiliary_loss_clip": 0.01148839, "auxiliary_loss_mlp": 0.01024809, "balance_loss_clip": 1.0469408, "balance_loss_mlp": 1.01693249, "epoch": 0.6209342872602658, "flos": 35077291303680.0, "grad_norm": 1.7211830320276993, "language_loss": 0.73686028, "learning_rate": 1.3271142443774798e-06, "loss": 0.75859672, "num_input_tokens_seen": 111299910, "step": 5164, "time_per_iteration": 2.629284143447876 }, { "auxiliary_loss_clip": 0.01144833, "auxiliary_loss_mlp": 0.01022473, "balance_loss_clip": 1.04928827, "balance_loss_mlp": 1.01459968, "epoch": 0.6210545301509048, "flos": 26979327861120.0, "grad_norm": 1.766327599393501, "language_loss": 0.81522989, "learning_rate": 1.3263807348146228e-06, "loss": 0.83690292, "num_input_tokens_seen": 111319765, "step": 5165, "time_per_iteration": 2.5535645484924316 }, { "auxiliary_loss_clip": 0.0114433, "auxiliary_loss_mlp": 0.01036337, "balance_loss_clip": 1.04550493, "balance_loss_mlp": 1.0276525, "epoch": 0.6211747730415439, "flos": 33618240852480.0, "grad_norm": 2.192007811063716, "language_loss": 0.73259747, "learning_rate": 1.3256473274392733e-06, "loss": 0.75440407, "num_input_tokens_seen": 111341110, "step": 5166, "time_per_iteration": 3.4649343490600586 }, { "auxiliary_loss_clip": 0.01174752, "auxiliary_loss_mlp": 0.01029226, "balance_loss_clip": 1.05107534, "balance_loss_mlp": 1.02116108, "epoch": 0.6212950159321831, "flos": 34167099646080.0, "grad_norm": 1.9716200427603439, "language_loss": 0.69928497, "learning_rate": 1.3249140223626873e-06, "loss": 0.72132474, "num_input_tokens_seen": 111362730, "step": 5167, "time_per_iteration": 3.3117260932922363 }, { "auxiliary_loss_clip": 0.01159025, "auxiliary_loss_mlp": 0.01025647, "balance_loss_clip": 1.05048549, "balance_loss_mlp": 1.01794565, "epoch": 0.6214152588228221, "flos": 27965758135680.0, "grad_norm": 1.8686629737140226, "language_loss": 0.75584412, "learning_rate": 1.3241808196961077e-06, "loss": 0.77769083, "num_input_tokens_seen": 111383855, "step": 5168, "time_per_iteration": 2.499868392944336 }, { "auxiliary_loss_clip": 0.01133113, "auxiliary_loss_mlp": 0.01024034, "balance_loss_clip": 1.04511786, "balance_loss_mlp": 1.01642799, "epoch": 0.6215355017134612, "flos": 20230204965120.0, "grad_norm": 1.7888571058056566, "language_loss": 0.71031201, "learning_rate": 1.3234477195507608e-06, "loss": 0.73188341, "num_input_tokens_seen": 111402685, "step": 5169, "time_per_iteration": 2.4794108867645264 }, { "auxiliary_loss_clip": 0.01132301, "auxiliary_loss_mlp": 0.01027922, "balance_loss_clip": 1.04677236, "balance_loss_mlp": 1.02076328, "epoch": 0.6216557446041003, "flos": 41428129219200.0, "grad_norm": 2.3187113633964214, "language_loss": 0.62189484, "learning_rate": 1.322714722037857e-06, "loss": 0.64349705, "num_input_tokens_seen": 111424130, "step": 5170, "time_per_iteration": 3.42864727973938 }, { "auxiliary_loss_clip": 0.01140212, "auxiliary_loss_mlp": 0.01030152, "balance_loss_clip": 1.04759312, "balance_loss_mlp": 1.02231979, "epoch": 0.6217759874947394, "flos": 27928770105600.0, "grad_norm": 2.0451137869749543, "language_loss": 0.77578139, "learning_rate": 1.321981827268591e-06, "loss": 0.79748499, "num_input_tokens_seen": 111444785, "step": 5171, "time_per_iteration": 2.5806195735931396 }, { "auxiliary_loss_clip": 0.01150253, "auxiliary_loss_mlp": 0.01028771, "balance_loss_clip": 1.04932022, "balance_loss_mlp": 1.02086735, "epoch": 0.6218962303853784, "flos": 21765673601280.0, "grad_norm": 1.697017083153132, "language_loss": 0.81457758, "learning_rate": 1.3212490353541426e-06, "loss": 0.83636785, "num_input_tokens_seen": 111467045, "step": 5172, "time_per_iteration": 2.5358362197875977 }, { "auxiliary_loss_clip": 0.01176365, "auxiliary_loss_mlp": 0.01024585, "balance_loss_clip": 1.05082786, "balance_loss_mlp": 1.01658583, "epoch": 0.6220164732760175, "flos": 21246260981760.0, "grad_norm": 1.9215630578600953, "language_loss": 0.80180526, "learning_rate": 1.3205163464056762e-06, "loss": 0.82381475, "num_input_tokens_seen": 111483650, "step": 5173, "time_per_iteration": 2.428027868270874 }, { "auxiliary_loss_clip": 0.01158613, "auxiliary_loss_mlp": 0.0102875, "balance_loss_clip": 1.04830134, "balance_loss_mlp": 1.0210371, "epoch": 0.6221367161666567, "flos": 26136360506880.0, "grad_norm": 1.883102375572845, "language_loss": 0.72920758, "learning_rate": 1.319783760534339e-06, "loss": 0.75108123, "num_input_tokens_seen": 111502895, "step": 5174, "time_per_iteration": 2.4963786602020264 }, { "auxiliary_loss_clip": 0.01161858, "auxiliary_loss_mlp": 0.01030575, "balance_loss_clip": 1.052176, "balance_loss_mlp": 1.02232587, "epoch": 0.6222569590572957, "flos": 16284196558080.0, "grad_norm": 2.248326752209355, "language_loss": 0.75213224, "learning_rate": 1.319051277851266e-06, "loss": 0.77405655, "num_input_tokens_seen": 111519180, "step": 5175, "time_per_iteration": 2.431265115737915 }, { "auxiliary_loss_clip": 0.01162926, "auxiliary_loss_mlp": 0.01030796, "balance_loss_clip": 1.04964113, "balance_loss_mlp": 1.02309537, "epoch": 0.6223772019479348, "flos": 18223840005120.0, "grad_norm": 1.8167662182747357, "language_loss": 0.842345, "learning_rate": 1.3183188984675716e-06, "loss": 0.86428225, "num_input_tokens_seen": 111537545, "step": 5176, "time_per_iteration": 2.429539203643799 }, { "auxiliary_loss_clip": 0.0114653, "auxiliary_loss_mlp": 0.01033007, "balance_loss_clip": 1.05012107, "balance_loss_mlp": 1.02555668, "epoch": 0.6224974448385739, "flos": 27489797994240.0, "grad_norm": 2.4275330939966455, "language_loss": 0.71225476, "learning_rate": 1.3175866224943586e-06, "loss": 0.73405015, "num_input_tokens_seen": 111556265, "step": 5177, "time_per_iteration": 2.5202741622924805 }, { "auxiliary_loss_clip": 0.01150551, "auxiliary_loss_mlp": 0.01029362, "balance_loss_clip": 1.04937506, "balance_loss_mlp": 1.02128005, "epoch": 0.622617687729213, "flos": 19791951125760.0, "grad_norm": 2.5902245639624097, "language_loss": 0.73468375, "learning_rate": 1.316854450042712e-06, "loss": 0.75648284, "num_input_tokens_seen": 111574205, "step": 5178, "time_per_iteration": 2.468618392944336 }, { "auxiliary_loss_clip": 0.01166919, "auxiliary_loss_mlp": 0.01025371, "balance_loss_clip": 1.05129933, "balance_loss_mlp": 1.01751196, "epoch": 0.622737930619852, "flos": 23038886062080.0, "grad_norm": 1.9047970956836242, "language_loss": 0.74362719, "learning_rate": 1.3161223812237024e-06, "loss": 0.76555008, "num_input_tokens_seen": 111593560, "step": 5179, "time_per_iteration": 2.4668078422546387 }, { "auxiliary_loss_clip": 0.01174002, "auxiliary_loss_mlp": 0.01031181, "balance_loss_clip": 1.04919124, "balance_loss_mlp": 1.02327764, "epoch": 0.6228581735104912, "flos": 12634271959680.0, "grad_norm": 2.937125254931495, "language_loss": 0.84914672, "learning_rate": 1.3153904161483842e-06, "loss": 0.87119859, "num_input_tokens_seen": 111608860, "step": 5180, "time_per_iteration": 2.3765153884887695 }, { "auxiliary_loss_clip": 0.01127725, "auxiliary_loss_mlp": 0.01026438, "balance_loss_clip": 1.04467797, "balance_loss_mlp": 1.01800966, "epoch": 0.6229784164011303, "flos": 23802813538560.0, "grad_norm": 2.0340228026532836, "language_loss": 0.85037589, "learning_rate": 1.3146585549277953e-06, "loss": 0.87191749, "num_input_tokens_seen": 111627500, "step": 5181, "time_per_iteration": 2.5239310264587402 }, { "auxiliary_loss_clip": 0.01157476, "auxiliary_loss_mlp": 0.01032754, "balance_loss_clip": 1.05071926, "balance_loss_mlp": 1.02495217, "epoch": 0.6230986592917693, "flos": 22414219614720.0, "grad_norm": 2.0717912476862907, "language_loss": 0.78470969, "learning_rate": 1.3139267976729591e-06, "loss": 0.80661196, "num_input_tokens_seen": 111647690, "step": 5182, "time_per_iteration": 2.4967501163482666 }, { "auxiliary_loss_clip": 0.01165023, "auxiliary_loss_mlp": 0.01029867, "balance_loss_clip": 1.05192065, "balance_loss_mlp": 1.02140307, "epoch": 0.6232189021824085, "flos": 34528217028480.0, "grad_norm": 1.642552819933219, "language_loss": 0.71445298, "learning_rate": 1.3131951444948815e-06, "loss": 0.73640186, "num_input_tokens_seen": 111667090, "step": 5183, "time_per_iteration": 2.5647177696228027 }, { "auxiliary_loss_clip": 0.01149273, "auxiliary_loss_mlp": 0.01032329, "balance_loss_clip": 1.04965508, "balance_loss_mlp": 1.02431846, "epoch": 0.6233391450730476, "flos": 22237000888320.0, "grad_norm": 2.1184798783130776, "language_loss": 0.76420403, "learning_rate": 1.3124635955045546e-06, "loss": 0.78602004, "num_input_tokens_seen": 111686905, "step": 5184, "time_per_iteration": 2.4844810962677 }, { "auxiliary_loss_clip": 0.01107052, "auxiliary_loss_mlp": 0.00763131, "balance_loss_clip": 1.04104447, "balance_loss_mlp": 1.00061691, "epoch": 0.6234593879636866, "flos": 20332693445760.0, "grad_norm": 1.832222907773644, "language_loss": 0.83983433, "learning_rate": 1.3117321508129537e-06, "loss": 0.85853612, "num_input_tokens_seen": 111704985, "step": 5185, "time_per_iteration": 2.5574467182159424 }, { "auxiliary_loss_clip": 0.0114947, "auxiliary_loss_mlp": 0.01022647, "balance_loss_clip": 1.04886055, "balance_loss_mlp": 1.01532722, "epoch": 0.6235796308543258, "flos": 20664903358080.0, "grad_norm": 1.5358608182006221, "language_loss": 0.76228631, "learning_rate": 1.3110008105310388e-06, "loss": 0.78400743, "num_input_tokens_seen": 111724805, "step": 5186, "time_per_iteration": 2.502291440963745 }, { "auxiliary_loss_clip": 0.01176002, "auxiliary_loss_mlp": 0.01033589, "balance_loss_clip": 1.0499531, "balance_loss_mlp": 1.02547073, "epoch": 0.6236998737449648, "flos": 26618641441920.0, "grad_norm": 1.620482699143074, "language_loss": 0.78040814, "learning_rate": 1.3102695747697526e-06, "loss": 0.80250406, "num_input_tokens_seen": 111747675, "step": 5187, "time_per_iteration": 2.491316318511963 }, { "auxiliary_loss_clip": 0.01104596, "auxiliary_loss_mlp": 0.01027368, "balance_loss_clip": 1.04617989, "balance_loss_mlp": 1.01916695, "epoch": 0.6238201166356039, "flos": 12674599954560.0, "grad_norm": 3.164098414209472, "language_loss": 0.90217841, "learning_rate": 1.3095384436400237e-06, "loss": 0.92349803, "num_input_tokens_seen": 111759205, "step": 5188, "time_per_iteration": 2.5018911361694336 }, { "auxiliary_loss_clip": 0.01153973, "auxiliary_loss_mlp": 0.01023299, "balance_loss_clip": 1.04810059, "balance_loss_mlp": 1.01557386, "epoch": 0.623940359526243, "flos": 10452160730880.0, "grad_norm": 2.1649389303604223, "language_loss": 0.81824613, "learning_rate": 1.3088074172527633e-06, "loss": 0.84001887, "num_input_tokens_seen": 111776335, "step": 5189, "time_per_iteration": 3.333324432373047 }, { "auxiliary_loss_clip": 0.01148948, "auxiliary_loss_mlp": 0.01023697, "balance_loss_clip": 1.04586041, "balance_loss_mlp": 1.0154295, "epoch": 0.6240606024168821, "flos": 29059525226880.0, "grad_norm": 1.7440212301044016, "language_loss": 0.71493834, "learning_rate": 1.3080764957188684e-06, "loss": 0.73666477, "num_input_tokens_seen": 111796580, "step": 5190, "time_per_iteration": 2.5504536628723145 }, { "auxiliary_loss_clip": 0.01122099, "auxiliary_loss_mlp": 0.0102464, "balance_loss_clip": 1.04511976, "balance_loss_mlp": 1.01633668, "epoch": 0.6241808453075212, "flos": 22018089450240.0, "grad_norm": 1.7427760892535424, "language_loss": 0.70954829, "learning_rate": 1.3073456791492192e-06, "loss": 0.73101568, "num_input_tokens_seen": 111816290, "step": 5191, "time_per_iteration": 2.557647943496704 }, { "auxiliary_loss_clip": 0.01148211, "auxiliary_loss_mlp": 0.01025333, "balance_loss_clip": 1.04606092, "balance_loss_mlp": 1.01770949, "epoch": 0.6243010881981603, "flos": 21138708683520.0, "grad_norm": 1.7562718852180146, "language_loss": 0.78606039, "learning_rate": 1.3066149676546801e-06, "loss": 0.80779582, "num_input_tokens_seen": 111834470, "step": 5192, "time_per_iteration": 2.4930386543273926 }, { "auxiliary_loss_clip": 0.01148658, "auxiliary_loss_mlp": 0.01028748, "balance_loss_clip": 1.05349004, "balance_loss_mlp": 1.02068949, "epoch": 0.6244213310887994, "flos": 22344948236160.0, "grad_norm": 1.6920099490156852, "language_loss": 0.6629023, "learning_rate": 1.3058843613460985e-06, "loss": 0.68467641, "num_input_tokens_seen": 111852410, "step": 5193, "time_per_iteration": 3.344407081604004 }, { "auxiliary_loss_clip": 0.01140214, "auxiliary_loss_mlp": 0.01027518, "balance_loss_clip": 1.04753375, "balance_loss_mlp": 1.01972127, "epoch": 0.6245415739794384, "flos": 15231978524160.0, "grad_norm": 1.9263940141314557, "language_loss": 0.74089432, "learning_rate": 1.3051538603343075e-06, "loss": 0.76257157, "num_input_tokens_seen": 111870340, "step": 5194, "time_per_iteration": 3.365662097930908 }, { "auxiliary_loss_clip": 0.01161555, "auxiliary_loss_mlp": 0.01036097, "balance_loss_clip": 1.0514853, "balance_loss_mlp": 1.02819633, "epoch": 0.6246618168700776, "flos": 18879891960960.0, "grad_norm": 1.9666510575706388, "language_loss": 0.67418522, "learning_rate": 1.3044234647301235e-06, "loss": 0.69616175, "num_input_tokens_seen": 111888365, "step": 5195, "time_per_iteration": 2.442220449447632 }, { "auxiliary_loss_clip": 0.01155336, "auxiliary_loss_mlp": 0.01026757, "balance_loss_clip": 1.04876482, "balance_loss_mlp": 1.01986647, "epoch": 0.6247820597607167, "flos": 14319201087360.0, "grad_norm": 1.9101932011054747, "language_loss": 0.72601497, "learning_rate": 1.303693174644347e-06, "loss": 0.74783587, "num_input_tokens_seen": 111905840, "step": 5196, "time_per_iteration": 3.16748046875 }, { "auxiliary_loss_clip": 0.0114268, "auxiliary_loss_mlp": 0.0103079, "balance_loss_clip": 1.04643488, "balance_loss_mlp": 1.02220666, "epoch": 0.6249023026513557, "flos": 22637979388800.0, "grad_norm": 2.0862632571542226, "language_loss": 0.80467904, "learning_rate": 1.3029629901877625e-06, "loss": 0.82641375, "num_input_tokens_seen": 111925215, "step": 5197, "time_per_iteration": 2.4859719276428223 }, { "auxiliary_loss_clip": 0.01168862, "auxiliary_loss_mlp": 0.01027572, "balance_loss_clip": 1.05253983, "balance_loss_mlp": 1.01925683, "epoch": 0.6250225455419949, "flos": 20266690204800.0, "grad_norm": 3.0725458616635564, "language_loss": 0.77077407, "learning_rate": 1.3022329114711376e-06, "loss": 0.79273844, "num_input_tokens_seen": 111943925, "step": 5198, "time_per_iteration": 2.4443161487579346 }, { "auxiliary_loss_clip": 0.01143222, "auxiliary_loss_mlp": 0.0102402, "balance_loss_clip": 1.04749584, "balance_loss_mlp": 1.016361, "epoch": 0.6251427884326339, "flos": 23437853400960.0, "grad_norm": 2.4790473496002225, "language_loss": 0.69548547, "learning_rate": 1.3015029386052256e-06, "loss": 0.71715784, "num_input_tokens_seen": 111964095, "step": 5199, "time_per_iteration": 2.512484550476074 }, { "auxiliary_loss_clip": 0.01144577, "auxiliary_loss_mlp": 0.01030605, "balance_loss_clip": 1.04931223, "balance_loss_mlp": 1.02269578, "epoch": 0.625263031323273, "flos": 31723055464320.0, "grad_norm": 2.127825540730026, "language_loss": 0.72924012, "learning_rate": 1.3007730717007622e-06, "loss": 0.75099194, "num_input_tokens_seen": 111984910, "step": 5200, "time_per_iteration": 2.603738784790039 }, { "auxiliary_loss_clip": 0.01177455, "auxiliary_loss_mlp": 0.01029186, "balance_loss_clip": 1.05234516, "balance_loss_mlp": 1.02088344, "epoch": 0.6253832742139122, "flos": 24134341092480.0, "grad_norm": 1.7274660524267338, "language_loss": 0.75600898, "learning_rate": 1.3000433108684676e-06, "loss": 0.77807534, "num_input_tokens_seen": 112005410, "step": 5201, "time_per_iteration": 2.450544834136963 }, { "auxiliary_loss_clip": 0.01157304, "auxiliary_loss_mlp": 0.01022565, "balance_loss_clip": 1.04908442, "balance_loss_mlp": 1.01484585, "epoch": 0.6255035171045512, "flos": 27668812400640.0, "grad_norm": 9.456925047273225, "language_loss": 0.8065123, "learning_rate": 1.2993136562190467e-06, "loss": 0.82831097, "num_input_tokens_seen": 112024530, "step": 5202, "time_per_iteration": 2.4952661991119385 }, { "auxiliary_loss_clip": 0.01150399, "auxiliary_loss_mlp": 0.01028074, "balance_loss_clip": 1.04889143, "balance_loss_mlp": 1.01988471, "epoch": 0.6256237599951903, "flos": 20227798753920.0, "grad_norm": 1.5608837175036179, "language_loss": 0.70270973, "learning_rate": 1.2985841078631871e-06, "loss": 0.72449446, "num_input_tokens_seen": 112043850, "step": 5203, "time_per_iteration": 2.4833059310913086 }, { "auxiliary_loss_clip": 0.01099335, "auxiliary_loss_mlp": 0.01031528, "balance_loss_clip": 1.03912997, "balance_loss_mlp": 1.02352262, "epoch": 0.6257440028858293, "flos": 24170574936960.0, "grad_norm": 1.8296164103154822, "language_loss": 0.78451347, "learning_rate": 1.2978546659115608e-06, "loss": 0.80582213, "num_input_tokens_seen": 112061930, "step": 5204, "time_per_iteration": 2.5853633880615234 }, { "auxiliary_loss_clip": 0.01149213, "auxiliary_loss_mlp": 0.01030503, "balance_loss_clip": 1.0490036, "balance_loss_mlp": 1.02269483, "epoch": 0.6258642457764685, "flos": 15851940289920.0, "grad_norm": 1.9241035156068131, "language_loss": 0.85477406, "learning_rate": 1.2971253304748228e-06, "loss": 0.87657118, "num_input_tokens_seen": 112079645, "step": 5205, "time_per_iteration": 2.4536120891571045 }, { "auxiliary_loss_clip": 0.01166588, "auxiliary_loss_mlp": 0.01029086, "balance_loss_clip": 1.05259645, "balance_loss_mlp": 1.02108145, "epoch": 0.6259844886671075, "flos": 11911354836480.0, "grad_norm": 1.588159424865464, "language_loss": 0.75170666, "learning_rate": 1.296396101663614e-06, "loss": 0.7736634, "num_input_tokens_seen": 112096205, "step": 5206, "time_per_iteration": 2.429076671600342 }, { "auxiliary_loss_clip": 0.01164154, "auxiliary_loss_mlp": 0.01026794, "balance_loss_clip": 1.05111194, "balance_loss_mlp": 1.01894987, "epoch": 0.6261047315577466, "flos": 15887958652800.0, "grad_norm": 3.574489457448891, "language_loss": 0.8394323, "learning_rate": 1.2956669795885565e-06, "loss": 0.86134183, "num_input_tokens_seen": 112112835, "step": 5207, "time_per_iteration": 2.4251773357391357 }, { "auxiliary_loss_clip": 0.01127823, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.04628062, "balance_loss_mlp": 1.0260694, "epoch": 0.6262249744483858, "flos": 31248926916480.0, "grad_norm": 2.1633989928415707, "language_loss": 0.6804316, "learning_rate": 1.294937964360259e-06, "loss": 0.702052, "num_input_tokens_seen": 112133105, "step": 5208, "time_per_iteration": 2.5997345447540283 }, { "auxiliary_loss_clip": 0.01152985, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.04851007, "balance_loss_mlp": 1.02177668, "epoch": 0.6263452173390248, "flos": 27198598435200.0, "grad_norm": 2.202788255060346, "language_loss": 0.71013522, "learning_rate": 1.2942090560893108e-06, "loss": 0.73197389, "num_input_tokens_seen": 112152510, "step": 5209, "time_per_iteration": 2.5261213779449463 }, { "auxiliary_loss_clip": 0.01173905, "auxiliary_loss_mlp": 0.01024677, "balance_loss_clip": 1.05083227, "balance_loss_mlp": 1.01739907, "epoch": 0.6264654602296639, "flos": 37342069683840.0, "grad_norm": 2.0449967530183635, "language_loss": 0.60759819, "learning_rate": 1.2934802548862882e-06, "loss": 0.62958395, "num_input_tokens_seen": 112175295, "step": 5210, "time_per_iteration": 2.5536608695983887 }, { "auxiliary_loss_clip": 0.01142308, "auxiliary_loss_mlp": 0.01029336, "balance_loss_clip": 1.04583013, "balance_loss_mlp": 1.02179551, "epoch": 0.626585703120303, "flos": 14756952136320.0, "grad_norm": 1.7741220645081541, "language_loss": 0.83058244, "learning_rate": 1.292751560861749e-06, "loss": 0.85229892, "num_input_tokens_seen": 112190200, "step": 5211, "time_per_iteration": 2.4346988201141357 }, { "auxiliary_loss_clip": 0.01175801, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.0510031, "balance_loss_mlp": 1.02000999, "epoch": 0.6267059460109421, "flos": 22347318533760.0, "grad_norm": 1.8700318560239715, "language_loss": 0.79694223, "learning_rate": 1.2920229741262354e-06, "loss": 0.81898469, "num_input_tokens_seen": 112208205, "step": 5212, "time_per_iteration": 2.4382312297821045 }, { "auxiliary_loss_clip": 0.01147328, "auxiliary_loss_mlp": 0.01026899, "balance_loss_clip": 1.04774451, "balance_loss_mlp": 1.01924598, "epoch": 0.6268261889015811, "flos": 17748813617280.0, "grad_norm": 2.7344753534121433, "language_loss": 0.75384033, "learning_rate": 1.2912944947902739e-06, "loss": 0.77558255, "num_input_tokens_seen": 112224690, "step": 5213, "time_per_iteration": 2.4599359035491943 }, { "auxiliary_loss_clip": 0.01152422, "auxiliary_loss_mlp": 0.01024836, "balance_loss_clip": 1.0487715, "balance_loss_mlp": 1.01639569, "epoch": 0.6269464317922203, "flos": 32846484211200.0, "grad_norm": 1.9549116769146608, "language_loss": 0.71654594, "learning_rate": 1.2905661229643742e-06, "loss": 0.73831856, "num_input_tokens_seen": 112244450, "step": 5214, "time_per_iteration": 2.630216360092163 }, { "auxiliary_loss_clip": 0.01176062, "auxiliary_loss_mlp": 0.01029642, "balance_loss_clip": 1.05081916, "balance_loss_mlp": 1.02159548, "epoch": 0.6270666746828594, "flos": 17929192740480.0, "grad_norm": 2.2214229842556663, "language_loss": 0.84309161, "learning_rate": 1.2898378587590299e-06, "loss": 0.86514866, "num_input_tokens_seen": 112261050, "step": 5215, "time_per_iteration": 3.1176860332489014 }, { "auxiliary_loss_clip": 0.0115838, "auxiliary_loss_mlp": 0.0102373, "balance_loss_clip": 1.0502367, "balance_loss_mlp": 1.01613069, "epoch": 0.6271869175734984, "flos": 17457326749440.0, "grad_norm": 1.8758552555966743, "language_loss": 0.87420404, "learning_rate": 1.2891097022847173e-06, "loss": 0.89602506, "num_input_tokens_seen": 112278395, "step": 5216, "time_per_iteration": 2.455867290496826 }, { "auxiliary_loss_clip": 0.01146082, "auxiliary_loss_mlp": 0.01033469, "balance_loss_clip": 1.0476253, "balance_loss_mlp": 1.0246172, "epoch": 0.6273071604641376, "flos": 26868615166080.0, "grad_norm": 2.3621541623132916, "language_loss": 0.66724181, "learning_rate": 1.2883816536518978e-06, "loss": 0.68903726, "num_input_tokens_seen": 112299535, "step": 5217, "time_per_iteration": 2.5225913524627686 }, { "auxiliary_loss_clip": 0.0115751, "auxiliary_loss_mlp": 0.01026678, "balance_loss_clip": 1.04755771, "balance_loss_mlp": 1.01952553, "epoch": 0.6274274033547766, "flos": 26062384446720.0, "grad_norm": 1.71003600763185, "language_loss": 0.81872934, "learning_rate": 1.2876537129710155e-06, "loss": 0.84057128, "num_input_tokens_seen": 112317265, "step": 5218, "time_per_iteration": 2.475831985473633 }, { "auxiliary_loss_clip": 0.0114661, "auxiliary_loss_mlp": 0.01031142, "balance_loss_clip": 1.05319691, "balance_loss_mlp": 1.02334619, "epoch": 0.6275476462454157, "flos": 20266259241600.0, "grad_norm": 1.970132464259165, "language_loss": 0.7524184, "learning_rate": 1.286925880352499e-06, "loss": 0.77419591, "num_input_tokens_seen": 112336125, "step": 5219, "time_per_iteration": 2.473417043685913 }, { "auxiliary_loss_clip": 0.01145305, "auxiliary_loss_mlp": 0.01021292, "balance_loss_clip": 1.04897714, "balance_loss_mlp": 1.0139488, "epoch": 0.6276678891360549, "flos": 26320402817280.0, "grad_norm": 1.7128010599842847, "language_loss": 0.71219057, "learning_rate": 1.2861981559067592e-06, "loss": 0.7338565, "num_input_tokens_seen": 112356730, "step": 5220, "time_per_iteration": 3.3524868488311768 }, { "auxiliary_loss_clip": 0.0110738, "auxiliary_loss_mlp": 0.01024401, "balance_loss_clip": 1.04274035, "balance_loss_mlp": 1.0168134, "epoch": 0.6277881320266939, "flos": 13912512324480.0, "grad_norm": 2.0379167900975714, "language_loss": 0.80072564, "learning_rate": 1.2854705397441917e-06, "loss": 0.82204348, "num_input_tokens_seen": 112372270, "step": 5221, "time_per_iteration": 3.384284734725952 }, { "auxiliary_loss_clip": 0.01125709, "auxiliary_loss_mlp": 0.01026377, "balance_loss_clip": 1.04305279, "balance_loss_mlp": 1.01877129, "epoch": 0.627908374917333, "flos": 27048922462080.0, "grad_norm": 2.2332600876185844, "language_loss": 0.7759856, "learning_rate": 1.2847430319751747e-06, "loss": 0.79750645, "num_input_tokens_seen": 112390365, "step": 5222, "time_per_iteration": 2.5690886974334717 }, { "auxiliary_loss_clip": 0.01155425, "auxiliary_loss_mlp": 0.01024738, "balance_loss_clip": 1.05053329, "balance_loss_mlp": 1.0174545, "epoch": 0.6280286178079721, "flos": 23769201386880.0, "grad_norm": 2.288177421563343, "language_loss": 0.67311609, "learning_rate": 1.2840156327100712e-06, "loss": 0.69491774, "num_input_tokens_seen": 112407490, "step": 5223, "time_per_iteration": 3.172185182571411 }, { "auxiliary_loss_clip": 0.01172709, "auxiliary_loss_mlp": 0.01023958, "balance_loss_clip": 1.05046487, "balance_loss_mlp": 1.01622117, "epoch": 0.6281488606986112, "flos": 26359150613760.0, "grad_norm": 1.8129402269948693, "language_loss": 0.72260547, "learning_rate": 1.2832883420592272e-06, "loss": 0.7445721, "num_input_tokens_seen": 112426385, "step": 5224, "time_per_iteration": 2.4578473567962646 }, { "auxiliary_loss_clip": 0.01142726, "auxiliary_loss_mlp": 0.01028501, "balance_loss_clip": 1.0477531, "balance_loss_mlp": 1.02057946, "epoch": 0.6282691035892503, "flos": 36137194848000.0, "grad_norm": 3.5637560289815746, "language_loss": 0.64467061, "learning_rate": 1.282561160132972e-06, "loss": 0.66638285, "num_input_tokens_seen": 112446905, "step": 5225, "time_per_iteration": 2.6068241596221924 }, { "auxiliary_loss_clip": 0.01151054, "auxiliary_loss_mlp": 0.01037466, "balance_loss_clip": 1.04585648, "balance_loss_mlp": 1.02968717, "epoch": 0.6283893464798894, "flos": 26537231266560.0, "grad_norm": 1.9507870436213395, "language_loss": 0.80939525, "learning_rate": 1.2818340870416186e-06, "loss": 0.83128047, "num_input_tokens_seen": 112468040, "step": 5226, "time_per_iteration": 2.5375113487243652 }, { "auxiliary_loss_clip": 0.01138921, "auxiliary_loss_mlp": 0.01029503, "balance_loss_clip": 1.04562402, "balance_loss_mlp": 1.02097321, "epoch": 0.6285095893705285, "flos": 22237216369920.0, "grad_norm": 2.411895185398057, "language_loss": 0.75849283, "learning_rate": 1.2811071228954626e-06, "loss": 0.780177, "num_input_tokens_seen": 112486675, "step": 5227, "time_per_iteration": 2.5346691608428955 }, { "auxiliary_loss_clip": 0.01146403, "auxiliary_loss_mlp": 0.01026043, "balance_loss_clip": 1.04913485, "balance_loss_mlp": 1.01816332, "epoch": 0.6286298322611675, "flos": 26542259170560.0, "grad_norm": 2.116746544027814, "language_loss": 0.81267941, "learning_rate": 1.2803802678047846e-06, "loss": 0.83440387, "num_input_tokens_seen": 112506825, "step": 5228, "time_per_iteration": 2.5236423015594482 }, { "auxiliary_loss_clip": 0.0115161, "auxiliary_loss_mlp": 0.01035118, "balance_loss_clip": 1.04968143, "balance_loss_mlp": 1.02688074, "epoch": 0.6287500751518067, "flos": 21795227516160.0, "grad_norm": 1.9220122988730943, "language_loss": 0.74154377, "learning_rate": 1.279653521879848e-06, "loss": 0.76341105, "num_input_tokens_seen": 112526890, "step": 5229, "time_per_iteration": 2.489473819732666 }, { "auxiliary_loss_clip": 0.01080833, "auxiliary_loss_mlp": 0.01026129, "balance_loss_clip": 1.03982747, "balance_loss_mlp": 1.01892579, "epoch": 0.6288703180424458, "flos": 20009605587840.0, "grad_norm": 1.9858378219753012, "language_loss": 0.83831459, "learning_rate": 1.2789268852308997e-06, "loss": 0.85938418, "num_input_tokens_seen": 112542100, "step": 5230, "time_per_iteration": 2.578523874282837 }, { "auxiliary_loss_clip": 0.01152263, "auxiliary_loss_mlp": 0.0103009, "balance_loss_clip": 1.04742026, "balance_loss_mlp": 1.02191186, "epoch": 0.6289905609330848, "flos": 22124923476480.0, "grad_norm": 2.1981195811225214, "language_loss": 0.71028095, "learning_rate": 1.2782003579681688e-06, "loss": 0.73210454, "num_input_tokens_seen": 112561630, "step": 5231, "time_per_iteration": 2.470848798751831 }, { "auxiliary_loss_clip": 0.01177175, "auxiliary_loss_mlp": 0.01029849, "balance_loss_clip": 1.05290663, "balance_loss_mlp": 1.02168894, "epoch": 0.629110803823724, "flos": 25518481729920.0, "grad_norm": 1.5388962269697406, "language_loss": 0.74302125, "learning_rate": 1.2774739402018701e-06, "loss": 0.76509148, "num_input_tokens_seen": 112582465, "step": 5232, "time_per_iteration": 2.4598615169525146 }, { "auxiliary_loss_clip": 0.01162717, "auxiliary_loss_mlp": 0.01030345, "balance_loss_clip": 1.05331588, "balance_loss_mlp": 1.02197623, "epoch": 0.629231046714363, "flos": 20886616056960.0, "grad_norm": 2.1229375345960295, "language_loss": 0.73529476, "learning_rate": 1.2767476320422002e-06, "loss": 0.75722539, "num_input_tokens_seen": 112602390, "step": 5233, "time_per_iteration": 2.467738628387451 }, { "auxiliary_loss_clip": 0.01048812, "auxiliary_loss_mlp": 0.01001011, "balance_loss_clip": 1.01820767, "balance_loss_mlp": 1.00014102, "epoch": 0.6293512896050021, "flos": 65050027908480.0, "grad_norm": 0.6788227917817856, "language_loss": 0.57197392, "learning_rate": 1.2760214335993392e-06, "loss": 0.59247214, "num_input_tokens_seen": 112669035, "step": 5234, "time_per_iteration": 3.1789321899414062 }, { "auxiliary_loss_clip": 0.01152627, "auxiliary_loss_mlp": 0.01025423, "balance_loss_clip": 1.04704154, "balance_loss_mlp": 1.01815128, "epoch": 0.6294715324956413, "flos": 34677857088000.0, "grad_norm": 1.9001739370799604, "language_loss": 0.59017414, "learning_rate": 1.2752953449834514e-06, "loss": 0.61195463, "num_input_tokens_seen": 112691485, "step": 5235, "time_per_iteration": 2.579751968383789 }, { "auxiliary_loss_clip": 0.01175512, "auxiliary_loss_mlp": 0.01027767, "balance_loss_clip": 1.05241036, "balance_loss_mlp": 1.02036381, "epoch": 0.6295917753862803, "flos": 22784207656320.0, "grad_norm": 1.5564747156146077, "language_loss": 0.7999087, "learning_rate": 1.2745693663046836e-06, "loss": 0.82194149, "num_input_tokens_seen": 112710555, "step": 5236, "time_per_iteration": 2.432197093963623 }, { "auxiliary_loss_clip": 0.01154967, "auxiliary_loss_mlp": 0.01023839, "balance_loss_clip": 1.04776597, "balance_loss_mlp": 1.01674283, "epoch": 0.6297120182769194, "flos": 20850454039680.0, "grad_norm": 2.415094296415789, "language_loss": 0.80901217, "learning_rate": 1.2738434976731662e-06, "loss": 0.83080018, "num_input_tokens_seen": 112728740, "step": 5237, "time_per_iteration": 2.446061611175537 }, { "auxiliary_loss_clip": 0.01147329, "auxiliary_loss_mlp": 0.01036582, "balance_loss_clip": 1.04914629, "balance_loss_mlp": 1.028023, "epoch": 0.6298322611675584, "flos": 19497662997120.0, "grad_norm": 1.511658108858409, "language_loss": 0.75341427, "learning_rate": 1.2731177391990125e-06, "loss": 0.77525342, "num_input_tokens_seen": 112748665, "step": 5238, "time_per_iteration": 2.495856285095215 }, { "auxiliary_loss_clip": 0.01146229, "auxiliary_loss_mlp": 0.01024142, "balance_loss_clip": 1.04514074, "balance_loss_mlp": 1.01671517, "epoch": 0.6299525040581976, "flos": 12604466649600.0, "grad_norm": 1.9377019127844985, "language_loss": 0.81650442, "learning_rate": 1.2723920909923203e-06, "loss": 0.83820814, "num_input_tokens_seen": 112764410, "step": 5239, "time_per_iteration": 2.4648613929748535 }, { "auxiliary_loss_clip": 0.01074429, "auxiliary_loss_mlp": 0.01004566, "balance_loss_clip": 1.0169102, "balance_loss_mlp": 1.00367188, "epoch": 0.6300727469488366, "flos": 57725685636480.0, "grad_norm": 0.8612994398266367, "language_loss": 0.60413402, "learning_rate": 1.2716665531631688e-06, "loss": 0.62492394, "num_input_tokens_seen": 112818695, "step": 5240, "time_per_iteration": 2.967538595199585 }, { "auxiliary_loss_clip": 0.01164447, "auxiliary_loss_mlp": 0.01023709, "balance_loss_clip": 1.04732883, "balance_loss_mlp": 1.01520991, "epoch": 0.6301929898394757, "flos": 22527302607360.0, "grad_norm": 1.7266647771960117, "language_loss": 0.77568197, "learning_rate": 1.270941125821623e-06, "loss": 0.79756355, "num_input_tokens_seen": 112839120, "step": 5241, "time_per_iteration": 2.4679481983184814 }, { "auxiliary_loss_clip": 0.01152948, "auxiliary_loss_mlp": 0.01030813, "balance_loss_clip": 1.0445534, "balance_loss_mlp": 1.02295089, "epoch": 0.6303132327301149, "flos": 28293550675200.0, "grad_norm": 1.7122368213958923, "language_loss": 0.75129187, "learning_rate": 1.2702158090777278e-06, "loss": 0.77312946, "num_input_tokens_seen": 112860210, "step": 5242, "time_per_iteration": 3.3576436042785645 }, { "auxiliary_loss_clip": 0.01127615, "auxiliary_loss_mlp": 0.01025928, "balance_loss_clip": 1.0440861, "balance_loss_mlp": 1.01821852, "epoch": 0.6304334756207539, "flos": 25264521596160.0, "grad_norm": 1.906634468510918, "language_loss": 0.74925578, "learning_rate": 1.2694906030415148e-06, "loss": 0.77079117, "num_input_tokens_seen": 112877955, "step": 5243, "time_per_iteration": 2.5397820472717285 }, { "auxiliary_loss_clip": 0.01154239, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 1.04855275, "balance_loss_mlp": 1.02098536, "epoch": 0.630553718511393, "flos": 18033548728320.0, "grad_norm": 2.7027013290048454, "language_loss": 0.82178998, "learning_rate": 1.2687655078229958e-06, "loss": 0.84362197, "num_input_tokens_seen": 112892285, "step": 5244, "time_per_iteration": 2.6105520725250244 }, { "auxiliary_loss_clip": 0.01143502, "auxiliary_loss_mlp": 0.01025547, "balance_loss_clip": 1.04837537, "balance_loss_mlp": 1.01791167, "epoch": 0.6306739614020321, "flos": 27304103658240.0, "grad_norm": 2.2809779005503863, "language_loss": 0.68951988, "learning_rate": 1.2680405235321678e-06, "loss": 0.71121031, "num_input_tokens_seen": 112913620, "step": 5245, "time_per_iteration": 2.5363271236419678 }, { "auxiliary_loss_clip": 0.01148288, "auxiliary_loss_mlp": 0.00762787, "balance_loss_clip": 1.05169559, "balance_loss_mlp": 1.0003823, "epoch": 0.6307942042926712, "flos": 15341434243200.0, "grad_norm": 2.0723166847212067, "language_loss": 0.78863728, "learning_rate": 1.267315650279011e-06, "loss": 0.80774796, "num_input_tokens_seen": 112932090, "step": 5246, "time_per_iteration": 3.3224740028381348 }, { "auxiliary_loss_clip": 0.01126687, "auxiliary_loss_mlp": 0.01024207, "balance_loss_clip": 1.04773426, "balance_loss_mlp": 1.01682222, "epoch": 0.6309144471833102, "flos": 19606400444160.0, "grad_norm": 1.7387648797215294, "language_loss": 0.73967999, "learning_rate": 1.2665908881734874e-06, "loss": 0.76118898, "num_input_tokens_seen": 112950925, "step": 5247, "time_per_iteration": 3.412727117538452 }, { "auxiliary_loss_clip": 0.01159651, "auxiliary_loss_mlp": 0.01025458, "balance_loss_clip": 1.04971337, "balance_loss_mlp": 1.01848137, "epoch": 0.6310346900739494, "flos": 17493345112320.0, "grad_norm": 2.1975669661306965, "language_loss": 0.84660661, "learning_rate": 1.2658662373255432e-06, "loss": 0.86845767, "num_input_tokens_seen": 112969315, "step": 5248, "time_per_iteration": 2.5256261825561523 }, { "auxiliary_loss_clip": 0.01053738, "auxiliary_loss_mlp": 0.01002163, "balance_loss_clip": 1.01614547, "balance_loss_mlp": 1.00128686, "epoch": 0.6311549329645885, "flos": 55070164131840.0, "grad_norm": 0.7103915134079878, "language_loss": 0.52294707, "learning_rate": 1.2651416978451063e-06, "loss": 0.54350609, "num_input_tokens_seen": 113034700, "step": 5249, "time_per_iteration": 3.1693356037139893 }, { "auxiliary_loss_clip": 0.01175721, "auxiliary_loss_mlp": 0.01024664, "balance_loss_clip": 1.05043006, "balance_loss_mlp": 1.01685023, "epoch": 0.6312751758552275, "flos": 41902545075840.0, "grad_norm": 2.2658316028768093, "language_loss": 0.65159422, "learning_rate": 1.2644172698420903e-06, "loss": 0.67359805, "num_input_tokens_seen": 113056805, "step": 5250, "time_per_iteration": 3.312739372253418 }, { "auxiliary_loss_clip": 0.01131324, "auxiliary_loss_mlp": 0.0103052, "balance_loss_clip": 1.04585195, "balance_loss_mlp": 1.02242541, "epoch": 0.6313954187458667, "flos": 19646800266240.0, "grad_norm": 1.796287760389685, "language_loss": 0.84842646, "learning_rate": 1.2636929534263892e-06, "loss": 0.87004495, "num_input_tokens_seen": 113075790, "step": 5251, "time_per_iteration": 2.5325424671173096 }, { "auxiliary_loss_clip": 0.01131442, "auxiliary_loss_mlp": 0.01023171, "balance_loss_clip": 1.04186857, "balance_loss_mlp": 1.01551795, "epoch": 0.6315156616365057, "flos": 22894273906560.0, "grad_norm": 1.7231176928805898, "language_loss": 0.77580416, "learning_rate": 1.2629687487078821e-06, "loss": 0.79735029, "num_input_tokens_seen": 113094600, "step": 5252, "time_per_iteration": 2.551456928253174 }, { "auxiliary_loss_clip": 0.01162681, "auxiliary_loss_mlp": 0.01028536, "balance_loss_clip": 1.04751658, "balance_loss_mlp": 1.02010226, "epoch": 0.6316359045271448, "flos": 23726251699200.0, "grad_norm": 2.164319552760073, "language_loss": 0.76402354, "learning_rate": 1.2622446557964293e-06, "loss": 0.78593576, "num_input_tokens_seen": 113112605, "step": 5253, "time_per_iteration": 2.5192368030548096 }, { "auxiliary_loss_clip": 0.01143661, "auxiliary_loss_mlp": 0.01027515, "balance_loss_clip": 1.04325616, "balance_loss_mlp": 1.02047586, "epoch": 0.631756147417784, "flos": 33108417164160.0, "grad_norm": 1.6950993714776457, "language_loss": 0.71356583, "learning_rate": 1.261520674801876e-06, "loss": 0.73527753, "num_input_tokens_seen": 113133200, "step": 5254, "time_per_iteration": 2.6268553733825684 }, { "auxiliary_loss_clip": 0.01146707, "auxiliary_loss_mlp": 0.01026586, "balance_loss_clip": 1.05171251, "balance_loss_mlp": 1.01870036, "epoch": 0.631876390308423, "flos": 31248424126080.0, "grad_norm": 1.892572863066902, "language_loss": 0.72500235, "learning_rate": 1.2607968058340488e-06, "loss": 0.74673533, "num_input_tokens_seen": 113152895, "step": 5255, "time_per_iteration": 2.5809309482574463 }, { "auxiliary_loss_clip": 0.01140263, "auxiliary_loss_mlp": 0.01029561, "balance_loss_clip": 1.04521251, "balance_loss_mlp": 1.0217886, "epoch": 0.6319966331990621, "flos": 24681152810880.0, "grad_norm": 1.7664565370931442, "language_loss": 0.73207307, "learning_rate": 1.2600730490027583e-06, "loss": 0.75377136, "num_input_tokens_seen": 113173135, "step": 5256, "time_per_iteration": 2.557288408279419 }, { "auxiliary_loss_clip": 0.01131318, "auxiliary_loss_mlp": 0.01027553, "balance_loss_clip": 1.04668927, "balance_loss_mlp": 1.0197866, "epoch": 0.6321168760897012, "flos": 17491764913920.0, "grad_norm": 1.584690559047317, "language_loss": 0.80507481, "learning_rate": 1.2593494044177984e-06, "loss": 0.82666349, "num_input_tokens_seen": 113191440, "step": 5257, "time_per_iteration": 2.5556273460388184 }, { "auxiliary_loss_clip": 0.011768, "auxiliary_loss_mlp": 0.01026408, "balance_loss_clip": 1.04890287, "balance_loss_mlp": 1.01830173, "epoch": 0.6322371189803403, "flos": 18295373940480.0, "grad_norm": 2.251341422288516, "language_loss": 0.80602717, "learning_rate": 1.2586258721889448e-06, "loss": 0.8280592, "num_input_tokens_seen": 113208790, "step": 5258, "time_per_iteration": 2.4100778102874756 }, { "auxiliary_loss_clip": 0.01111202, "auxiliary_loss_mlp": 0.0102647, "balance_loss_clip": 1.0454185, "balance_loss_mlp": 1.01876307, "epoch": 0.6323573618709794, "flos": 20157270399360.0, "grad_norm": 1.9190900136527766, "language_loss": 0.81956846, "learning_rate": 1.2579024524259573e-06, "loss": 0.84094512, "num_input_tokens_seen": 113225050, "step": 5259, "time_per_iteration": 2.5287811756134033 }, { "auxiliary_loss_clip": 0.0113967, "auxiliary_loss_mlp": 0.01025239, "balance_loss_clip": 1.04355669, "balance_loss_mlp": 1.01740742, "epoch": 0.6324776047616185, "flos": 20042391726720.0, "grad_norm": 2.081989733055051, "language_loss": 0.91321421, "learning_rate": 1.2571791452385768e-06, "loss": 0.93486321, "num_input_tokens_seen": 113242315, "step": 5260, "time_per_iteration": 2.480090618133545 }, { "auxiliary_loss_clip": 0.01146337, "auxiliary_loss_mlp": 0.01031223, "balance_loss_clip": 1.04780018, "balance_loss_mlp": 1.02333784, "epoch": 0.6325978476522576, "flos": 30848235724800.0, "grad_norm": 1.567618844513022, "language_loss": 0.77061307, "learning_rate": 1.2564559507365301e-06, "loss": 0.79238868, "num_input_tokens_seen": 113264720, "step": 5261, "time_per_iteration": 2.5631837844848633 }, { "auxiliary_loss_clip": 0.01145494, "auxiliary_loss_mlp": 0.01025971, "balance_loss_clip": 1.04648447, "balance_loss_mlp": 1.01751292, "epoch": 0.6327180905428966, "flos": 24535104111360.0, "grad_norm": 2.611555615167692, "language_loss": 0.78617263, "learning_rate": 1.2557328690295244e-06, "loss": 0.80788732, "num_input_tokens_seen": 113282910, "step": 5262, "time_per_iteration": 2.5441393852233887 }, { "auxiliary_loss_clip": 0.01137522, "auxiliary_loss_mlp": 0.01026415, "balance_loss_clip": 1.04886937, "balance_loss_mlp": 1.01872611, "epoch": 0.6328383334335358, "flos": 21575274583680.0, "grad_norm": 1.863550005022295, "language_loss": 0.76022011, "learning_rate": 1.255009900227251e-06, "loss": 0.78185952, "num_input_tokens_seen": 113301935, "step": 5263, "time_per_iteration": 2.5330796241760254 }, { "auxiliary_loss_clip": 0.01170232, "auxiliary_loss_mlp": 0.01024154, "balance_loss_clip": 1.05040181, "balance_loss_mlp": 1.01716805, "epoch": 0.6329585763241748, "flos": 22929861306240.0, "grad_norm": 1.7881547881459787, "language_loss": 0.79132324, "learning_rate": 1.254287044439383e-06, "loss": 0.81326705, "num_input_tokens_seen": 113321540, "step": 5264, "time_per_iteration": 2.437756299972534 }, { "auxiliary_loss_clip": 0.0107299, "auxiliary_loss_mlp": 0.01001186, "balance_loss_clip": 1.01553082, "balance_loss_mlp": 1.00032794, "epoch": 0.6330788192148139, "flos": 70936897847040.0, "grad_norm": 0.779024323654976, "language_loss": 0.54449677, "learning_rate": 1.2535643017755776e-06, "loss": 0.56523854, "num_input_tokens_seen": 113383730, "step": 5265, "time_per_iteration": 3.1355056762695312 }, { "auxiliary_loss_clip": 0.01130683, "auxiliary_loss_mlp": 0.01032311, "balance_loss_clip": 1.04431105, "balance_loss_mlp": 1.02402544, "epoch": 0.6331990621054531, "flos": 21244501215360.0, "grad_norm": 2.595743669949413, "language_loss": 0.72268641, "learning_rate": 1.2528416723454737e-06, "loss": 0.74431634, "num_input_tokens_seen": 113400400, "step": 5266, "time_per_iteration": 2.511258602142334 }, { "auxiliary_loss_clip": 0.01171116, "auxiliary_loss_mlp": 0.01023696, "balance_loss_clip": 1.05074954, "balance_loss_mlp": 1.01669884, "epoch": 0.6333193049960921, "flos": 34459412526720.0, "grad_norm": 1.4988909305434233, "language_loss": 0.70961404, "learning_rate": 1.2521191562586945e-06, "loss": 0.73156214, "num_input_tokens_seen": 113424050, "step": 5267, "time_per_iteration": 2.527148485183716 }, { "auxiliary_loss_clip": 0.01174584, "auxiliary_loss_mlp": 0.007623, "balance_loss_clip": 1.05160499, "balance_loss_mlp": 1.00047827, "epoch": 0.6334395478867312, "flos": 18329883932160.0, "grad_norm": 2.3017376266098637, "language_loss": 0.76833642, "learning_rate": 1.2513967536248445e-06, "loss": 0.78770524, "num_input_tokens_seen": 113440370, "step": 5268, "time_per_iteration": 2.4046616554260254 }, { "auxiliary_loss_clip": 0.01154731, "auxiliary_loss_mlp": 0.01028181, "balance_loss_clip": 1.04934096, "balance_loss_mlp": 1.02093577, "epoch": 0.6335597907773702, "flos": 23623152687360.0, "grad_norm": 1.5746507331262924, "language_loss": 0.80809134, "learning_rate": 1.2506744645535117e-06, "loss": 0.82992047, "num_input_tokens_seen": 113460800, "step": 5269, "time_per_iteration": 3.2198593616485596 }, { "auxiliary_loss_clip": 0.01136489, "auxiliary_loss_mlp": 0.01021628, "balance_loss_clip": 1.04161441, "balance_loss_mlp": 1.01389718, "epoch": 0.6336800336680094, "flos": 22710913954560.0, "grad_norm": 7.694002005438443, "language_loss": 0.604617, "learning_rate": 1.249952289154267e-06, "loss": 0.62619817, "num_input_tokens_seen": 113480840, "step": 5270, "time_per_iteration": 2.5100300312042236 }, { "auxiliary_loss_clip": 0.01090882, "auxiliary_loss_mlp": 0.01028785, "balance_loss_clip": 1.04044759, "balance_loss_mlp": 1.0215013, "epoch": 0.6338002765586485, "flos": 23622757637760.0, "grad_norm": 2.075392247291788, "language_loss": 0.76413989, "learning_rate": 1.2492302275366635e-06, "loss": 0.78533655, "num_input_tokens_seen": 113500515, "step": 5271, "time_per_iteration": 2.5960912704467773 }, { "auxiliary_loss_clip": 0.01152743, "auxiliary_loss_mlp": 0.01027776, "balance_loss_clip": 1.04660082, "balance_loss_mlp": 1.02003384, "epoch": 0.6339205194492875, "flos": 26505450708480.0, "grad_norm": 2.546027301746887, "language_loss": 0.65856743, "learning_rate": 1.2485082798102377e-06, "loss": 0.6803726, "num_input_tokens_seen": 113520930, "step": 5272, "time_per_iteration": 2.497006416320801 }, { "auxiliary_loss_clip": 0.01135857, "auxiliary_loss_mlp": 0.01024821, "balance_loss_clip": 1.04543376, "balance_loss_mlp": 1.01626766, "epoch": 0.6340407623399267, "flos": 18544306170240.0, "grad_norm": 2.237009596368795, "language_loss": 0.68699163, "learning_rate": 1.2477864460845084e-06, "loss": 0.70859838, "num_input_tokens_seen": 113537330, "step": 5273, "time_per_iteration": 3.34025239944458 }, { "auxiliary_loss_clip": 0.01143964, "auxiliary_loss_mlp": 0.0102555, "balance_loss_clip": 1.0454905, "balance_loss_mlp": 1.01764035, "epoch": 0.6341610052305657, "flos": 17712579772800.0, "grad_norm": 2.7675679732899767, "language_loss": 0.7315557, "learning_rate": 1.2470647264689776e-06, "loss": 0.75325084, "num_input_tokens_seen": 113555810, "step": 5274, "time_per_iteration": 3.3289694786071777 }, { "auxiliary_loss_clip": 0.01110216, "auxiliary_loss_mlp": 0.01026458, "balance_loss_clip": 1.04107213, "balance_loss_mlp": 1.01845288, "epoch": 0.6342812481212048, "flos": 23587026583680.0, "grad_norm": 15.072758610497422, "language_loss": 0.70952404, "learning_rate": 1.2463431210731282e-06, "loss": 0.73089075, "num_input_tokens_seen": 113575395, "step": 5275, "time_per_iteration": 2.6223363876342773 }, { "auxiliary_loss_clip": 0.01125579, "auxiliary_loss_mlp": 0.01022951, "balance_loss_clip": 1.04233372, "balance_loss_mlp": 1.01567364, "epoch": 0.634401491011844, "flos": 17821927751040.0, "grad_norm": 2.4653105526294388, "language_loss": 0.76252979, "learning_rate": 1.2456216300064289e-06, "loss": 0.78401506, "num_input_tokens_seen": 113592945, "step": 5276, "time_per_iteration": 3.3060522079467773 }, { "auxiliary_loss_clip": 0.01128194, "auxiliary_loss_mlp": 0.01025351, "balance_loss_clip": 1.04329205, "balance_loss_mlp": 1.01741791, "epoch": 0.634521733902483, "flos": 21358158825600.0, "grad_norm": 1.6074230619354897, "language_loss": 0.78452903, "learning_rate": 1.244900253378328e-06, "loss": 0.80606449, "num_input_tokens_seen": 113613000, "step": 5277, "time_per_iteration": 2.549111843109131 }, { "auxiliary_loss_clip": 0.01086331, "auxiliary_loss_mlp": 0.0102657, "balance_loss_clip": 1.04237294, "balance_loss_mlp": 1.01922965, "epoch": 0.6346419767931221, "flos": 16545052103040.0, "grad_norm": 1.8858344114320909, "language_loss": 0.69168562, "learning_rate": 1.2441789912982583e-06, "loss": 0.71281463, "num_input_tokens_seen": 113630085, "step": 5278, "time_per_iteration": 2.5837655067443848 }, { "auxiliary_loss_clip": 0.01162283, "auxiliary_loss_mlp": 0.01022699, "balance_loss_clip": 1.04936278, "balance_loss_mlp": 1.0149262, "epoch": 0.6347622196837612, "flos": 24350989973760.0, "grad_norm": 1.8158614743689814, "language_loss": 0.64959836, "learning_rate": 1.2434578438756346e-06, "loss": 0.67144823, "num_input_tokens_seen": 113650515, "step": 5279, "time_per_iteration": 2.507667064666748 }, { "auxiliary_loss_clip": 0.01157929, "auxiliary_loss_mlp": 0.01022055, "balance_loss_clip": 1.04682398, "balance_loss_mlp": 1.01469946, "epoch": 0.6348824625744003, "flos": 64523178195840.0, "grad_norm": 1.9204991286030741, "language_loss": 0.77853382, "learning_rate": 1.242736811219855e-06, "loss": 0.80033362, "num_input_tokens_seen": 113676475, "step": 5280, "time_per_iteration": 2.8520243167877197 }, { "auxiliary_loss_clip": 0.01154301, "auxiliary_loss_mlp": 0.01024287, "balance_loss_clip": 1.04774368, "balance_loss_mlp": 1.01656866, "epoch": 0.6350027054650393, "flos": 28622133313920.0, "grad_norm": 2.0580047871682092, "language_loss": 0.81554079, "learning_rate": 1.2420158934402988e-06, "loss": 0.83732677, "num_input_tokens_seen": 113697090, "step": 5281, "time_per_iteration": 2.518745183944702 }, { "auxiliary_loss_clip": 0.011162, "auxiliary_loss_mlp": 0.01024988, "balance_loss_clip": 1.04131126, "balance_loss_mlp": 1.01732886, "epoch": 0.6351229483556785, "flos": 23002544476800.0, "grad_norm": 1.8350585422268777, "language_loss": 0.84928542, "learning_rate": 1.2412950906463286e-06, "loss": 0.87069726, "num_input_tokens_seen": 113714395, "step": 5282, "time_per_iteration": 2.5169124603271484 }, { "auxiliary_loss_clip": 0.01115645, "auxiliary_loss_mlp": 0.01026344, "balance_loss_clip": 1.0443542, "balance_loss_mlp": 1.01885462, "epoch": 0.6352431912463176, "flos": 21939300967680.0, "grad_norm": 1.7217629415082754, "language_loss": 0.89771909, "learning_rate": 1.2405744029472902e-06, "loss": 0.91913891, "num_input_tokens_seen": 113733880, "step": 5283, "time_per_iteration": 2.570225715637207 }, { "auxiliary_loss_clip": 0.01141719, "auxiliary_loss_mlp": 0.01024448, "balance_loss_clip": 1.04499722, "balance_loss_mlp": 1.01726615, "epoch": 0.6353634341369566, "flos": 13735257684480.0, "grad_norm": 2.3437238379443532, "language_loss": 0.76385826, "learning_rate": 1.2398538304525108e-06, "loss": 0.78551996, "num_input_tokens_seen": 113752505, "step": 5284, "time_per_iteration": 2.468639612197876 }, { "auxiliary_loss_clip": 0.01117207, "auxiliary_loss_mlp": 0.01034064, "balance_loss_clip": 1.04538131, "balance_loss_mlp": 1.0256952, "epoch": 0.6354836770275958, "flos": 19316170552320.0, "grad_norm": 1.973459835833161, "language_loss": 0.75322002, "learning_rate": 1.2391333732713016e-06, "loss": 0.77473271, "num_input_tokens_seen": 113770310, "step": 5285, "time_per_iteration": 2.5357022285461426 }, { "auxiliary_loss_clip": 0.01121697, "auxiliary_loss_mlp": 0.01037953, "balance_loss_clip": 1.04391575, "balance_loss_mlp": 1.02874374, "epoch": 0.6356039199182348, "flos": 21613375935360.0, "grad_norm": 2.091643414321461, "language_loss": 0.78670728, "learning_rate": 1.2384130315129543e-06, "loss": 0.80830371, "num_input_tokens_seen": 113788635, "step": 5286, "time_per_iteration": 2.5537593364715576 }, { "auxiliary_loss_clip": 0.01091519, "auxiliary_loss_mlp": 0.0102581, "balance_loss_clip": 1.04289269, "balance_loss_mlp": 1.01763833, "epoch": 0.6357241628088739, "flos": 18111978074880.0, "grad_norm": 2.5820126966692167, "language_loss": 0.73408329, "learning_rate": 1.2376928052867447e-06, "loss": 0.75525659, "num_input_tokens_seen": 113807755, "step": 5287, "time_per_iteration": 2.60221266746521 }, { "auxiliary_loss_clip": 0.01145728, "auxiliary_loss_mlp": 0.01026239, "balance_loss_clip": 1.04870617, "balance_loss_mlp": 1.01874113, "epoch": 0.6358444056995131, "flos": 24935256599040.0, "grad_norm": 1.900953781513786, "language_loss": 0.77475965, "learning_rate": 1.2369726947019299e-06, "loss": 0.79647923, "num_input_tokens_seen": 113828230, "step": 5288, "time_per_iteration": 2.5208442211151123 }, { "auxiliary_loss_clip": 0.01156505, "auxiliary_loss_mlp": 0.01020329, "balance_loss_clip": 1.04632115, "balance_loss_mlp": 1.01276457, "epoch": 0.6359646485901521, "flos": 23293348986240.0, "grad_norm": 2.162948496529071, "language_loss": 0.67297655, "learning_rate": 1.2362526998677511e-06, "loss": 0.69474483, "num_input_tokens_seen": 113844595, "step": 5289, "time_per_iteration": 2.457869529724121 }, { "auxiliary_loss_clip": 0.01148012, "auxiliary_loss_mlp": 0.01027002, "balance_loss_clip": 1.04748273, "balance_loss_mlp": 1.01988482, "epoch": 0.6360848914807912, "flos": 20887442069760.0, "grad_norm": 1.8536267063473426, "language_loss": 0.84292012, "learning_rate": 1.2355328208934301e-06, "loss": 0.86467028, "num_input_tokens_seen": 113863470, "step": 5290, "time_per_iteration": 2.492821216583252 }, { "auxiliary_loss_clip": 0.01156999, "auxiliary_loss_mlp": 0.00762547, "balance_loss_clip": 1.04509509, "balance_loss_mlp": 1.00057101, "epoch": 0.6362051343714303, "flos": 18479775386880.0, "grad_norm": 1.6114396935860047, "language_loss": 0.7220962, "learning_rate": 1.2348130578881728e-06, "loss": 0.74129164, "num_input_tokens_seen": 113881690, "step": 5291, "time_per_iteration": 2.4520437717437744 }, { "auxiliary_loss_clip": 0.01174524, "auxiliary_loss_mlp": 0.0102957, "balance_loss_clip": 1.04988742, "balance_loss_mlp": 1.02157152, "epoch": 0.6363253772620694, "flos": 24389594115840.0, "grad_norm": 2.519250035934543, "language_loss": 0.76353866, "learning_rate": 1.2340934109611664e-06, "loss": 0.78557956, "num_input_tokens_seen": 113902450, "step": 5292, "time_per_iteration": 2.452216148376465 }, { "auxiliary_loss_clip": 0.01150948, "auxiliary_loss_mlp": 0.01026443, "balance_loss_clip": 1.04919553, "balance_loss_mlp": 1.01784182, "epoch": 0.6364456201527084, "flos": 25958243940480.0, "grad_norm": 1.9950303360599584, "language_loss": 0.68935609, "learning_rate": 1.2333738802215798e-06, "loss": 0.71112996, "num_input_tokens_seen": 113922670, "step": 5293, "time_per_iteration": 2.5406911373138428 }, { "auxiliary_loss_clip": 0.01108759, "auxiliary_loss_mlp": 0.01025938, "balance_loss_clip": 1.04078245, "balance_loss_mlp": 1.01828742, "epoch": 0.6365658630433476, "flos": 20740711011840.0, "grad_norm": 1.8296366015077776, "language_loss": 0.80890852, "learning_rate": 1.2326544657785668e-06, "loss": 0.83025551, "num_input_tokens_seen": 113942360, "step": 5294, "time_per_iteration": 2.544131278991699 }, { "auxiliary_loss_clip": 0.01122527, "auxiliary_loss_mlp": 0.01033007, "balance_loss_clip": 1.04422414, "balance_loss_mlp": 1.02450192, "epoch": 0.6366861059339867, "flos": 21434146047360.0, "grad_norm": 2.83681433898636, "language_loss": 0.74706316, "learning_rate": 1.2319351677412608e-06, "loss": 0.76861858, "num_input_tokens_seen": 113959405, "step": 5295, "time_per_iteration": 3.3298044204711914 }, { "auxiliary_loss_clip": 0.01140489, "auxiliary_loss_mlp": 0.01026381, "balance_loss_clip": 1.04899931, "balance_loss_mlp": 1.01842988, "epoch": 0.6368063488246257, "flos": 22267093507200.0, "grad_norm": 2.28397664650324, "language_loss": 0.74407566, "learning_rate": 1.2312159862187796e-06, "loss": 0.76574433, "num_input_tokens_seen": 113977815, "step": 5296, "time_per_iteration": 2.5113227367401123 }, { "auxiliary_loss_clip": 0.01177998, "auxiliary_loss_mlp": 0.01034236, "balance_loss_clip": 1.05291986, "balance_loss_mlp": 1.02645445, "epoch": 0.6369265917152649, "flos": 22420719976320.0, "grad_norm": 2.688298999957881, "language_loss": 0.75958943, "learning_rate": 1.2304969213202217e-06, "loss": 0.78171176, "num_input_tokens_seen": 113999075, "step": 5297, "time_per_iteration": 2.453690767288208 }, { "auxiliary_loss_clip": 0.01139792, "auxiliary_loss_mlp": 0.01024813, "balance_loss_clip": 1.0453496, "balance_loss_mlp": 1.01769006, "epoch": 0.6370468346059039, "flos": 24718176754560.0, "grad_norm": 2.2552513326488963, "language_loss": 0.79033393, "learning_rate": 1.2297779731546692e-06, "loss": 0.81198001, "num_input_tokens_seen": 114018170, "step": 5298, "time_per_iteration": 2.5117056369781494 }, { "auxiliary_loss_clip": 0.01143514, "auxiliary_loss_mlp": 0.01029705, "balance_loss_clip": 1.04846466, "balance_loss_mlp": 1.02180135, "epoch": 0.637167077496543, "flos": 25296589463040.0, "grad_norm": 1.9199288917648691, "language_loss": 0.78064179, "learning_rate": 1.2290591418311853e-06, "loss": 0.80237395, "num_input_tokens_seen": 114035565, "step": 5299, "time_per_iteration": 3.3526906967163086 }, { "auxiliary_loss_clip": 0.01161287, "auxiliary_loss_mlp": 0.01027273, "balance_loss_clip": 1.05219555, "balance_loss_mlp": 1.01969123, "epoch": 0.637287320387182, "flos": 27671110871040.0, "grad_norm": 1.6197946487048385, "language_loss": 0.72045326, "learning_rate": 1.2283404274588172e-06, "loss": 0.7423389, "num_input_tokens_seen": 114054510, "step": 5300, "time_per_iteration": 3.367604970932007 }, { "auxiliary_loss_clip": 0.00996388, "auxiliary_loss_mlp": 0.01004192, "balance_loss_clip": 1.01321888, "balance_loss_mlp": 1.00309527, "epoch": 0.6374075632778212, "flos": 63173406873600.0, "grad_norm": 0.7427944534174411, "language_loss": 0.52826333, "learning_rate": 1.227621830146592e-06, "loss": 0.54826909, "num_input_tokens_seen": 114109875, "step": 5301, "time_per_iteration": 3.1422998905181885 }, { "auxiliary_loss_clip": 0.01139577, "auxiliary_loss_mlp": 0.01030294, "balance_loss_clip": 1.05139172, "balance_loss_mlp": 1.02269101, "epoch": 0.6375278061684603, "flos": 25558127366400.0, "grad_norm": 1.9973624382908963, "language_loss": 0.79488832, "learning_rate": 1.2269033500035217e-06, "loss": 0.81658697, "num_input_tokens_seen": 114130010, "step": 5302, "time_per_iteration": 2.84183406829834 }, { "auxiliary_loss_clip": 0.011305, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 1.04673529, "balance_loss_mlp": 1.02270937, "epoch": 0.6376480490590993, "flos": 25666362023040.0, "grad_norm": 1.6599732459490695, "language_loss": 0.73446679, "learning_rate": 1.2261849871385988e-06, "loss": 0.7560724, "num_input_tokens_seen": 114151115, "step": 5303, "time_per_iteration": 3.293503522872925 }, { "auxiliary_loss_clip": 0.01174423, "auxiliary_loss_mlp": 0.01023777, "balance_loss_clip": 1.04949498, "balance_loss_mlp": 1.01572454, "epoch": 0.6377682919497385, "flos": 31537684350720.0, "grad_norm": 2.2285008761150786, "language_loss": 0.6242066, "learning_rate": 1.2254667416607972e-06, "loss": 0.64618862, "num_input_tokens_seen": 114172715, "step": 5304, "time_per_iteration": 2.506847858428955 }, { "auxiliary_loss_clip": 0.01157954, "auxiliary_loss_mlp": 0.01022167, "balance_loss_clip": 1.0504241, "balance_loss_mlp": 1.01438594, "epoch": 0.6378885348403776, "flos": 23039209284480.0, "grad_norm": 1.8430296024005839, "language_loss": 0.82953912, "learning_rate": 1.2247486136790756e-06, "loss": 0.85134029, "num_input_tokens_seen": 114192195, "step": 5305, "time_per_iteration": 2.461240291595459 }, { "auxiliary_loss_clip": 0.01163064, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.05182195, "balance_loss_mlp": 1.02541375, "epoch": 0.6380087777310166, "flos": 18697070712960.0, "grad_norm": 2.080045351034289, "language_loss": 0.80872786, "learning_rate": 1.2240306033023726e-06, "loss": 0.83068776, "num_input_tokens_seen": 114210020, "step": 5306, "time_per_iteration": 2.432070255279541 }, { "auxiliary_loss_clip": 0.01131885, "auxiliary_loss_mlp": 0.01028584, "balance_loss_clip": 1.04243898, "balance_loss_mlp": 1.02046561, "epoch": 0.6381290206216558, "flos": 23331558078720.0, "grad_norm": 1.8907804918699203, "language_loss": 0.72097391, "learning_rate": 1.223312710639611e-06, "loss": 0.74257857, "num_input_tokens_seen": 114228740, "step": 5307, "time_per_iteration": 2.52185320854187 }, { "auxiliary_loss_clip": 0.01144887, "auxiliary_loss_mlp": 0.01026909, "balance_loss_clip": 1.04798126, "balance_loss_mlp": 1.01893127, "epoch": 0.6382492635122948, "flos": 18880466578560.0, "grad_norm": 2.1403445603410773, "language_loss": 0.86983752, "learning_rate": 1.2225949357996928e-06, "loss": 0.89155549, "num_input_tokens_seen": 114246865, "step": 5308, "time_per_iteration": 2.453169107437134 }, { "auxiliary_loss_clip": 0.01156393, "auxiliary_loss_mlp": 0.01026432, "balance_loss_clip": 1.049945, "balance_loss_mlp": 1.01925528, "epoch": 0.6383695064029339, "flos": 27819134818560.0, "grad_norm": 1.4945758287424313, "language_loss": 0.80474615, "learning_rate": 1.221877278891505e-06, "loss": 0.82657439, "num_input_tokens_seen": 114266120, "step": 5309, "time_per_iteration": 2.493582248687744 }, { "auxiliary_loss_clip": 0.01166133, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 1.05068755, "balance_loss_mlp": 1.0235033, "epoch": 0.638489749293573, "flos": 26395635853440.0, "grad_norm": 1.9957006960726937, "language_loss": 0.71371841, "learning_rate": 1.221159740023915e-06, "loss": 0.7356984, "num_input_tokens_seen": 114285950, "step": 5310, "time_per_iteration": 2.477654218673706 }, { "auxiliary_loss_clip": 0.01140549, "auxiliary_loss_mlp": 0.00762534, "balance_loss_clip": 1.04776287, "balance_loss_mlp": 1.00055432, "epoch": 0.6386099921842121, "flos": 23988328306560.0, "grad_norm": 1.8415529101352488, "language_loss": 0.72419107, "learning_rate": 1.2204423193057735e-06, "loss": 0.74322188, "num_input_tokens_seen": 114304780, "step": 5311, "time_per_iteration": 2.5925655364990234 }, { "auxiliary_loss_clip": 0.01052044, "auxiliary_loss_mlp": 0.0100202, "balance_loss_clip": 1.01491857, "balance_loss_mlp": 1.00102472, "epoch": 0.6387302350748512, "flos": 71731169337600.0, "grad_norm": 0.8919463888173281, "language_loss": 0.63398248, "learning_rate": 1.2197250168459122e-06, "loss": 0.65452307, "num_input_tokens_seen": 114361180, "step": 5312, "time_per_iteration": 3.088937997817993 }, { "auxiliary_loss_clip": 0.01161212, "auxiliary_loss_mlp": 0.01025727, "balance_loss_clip": 1.04964519, "balance_loss_mlp": 1.01815724, "epoch": 0.6388504779654903, "flos": 14535778141440.0, "grad_norm": 1.9262403540987723, "language_loss": 0.74465376, "learning_rate": 1.2190078327531454e-06, "loss": 0.76652312, "num_input_tokens_seen": 114377425, "step": 5313, "time_per_iteration": 2.45031476020813 }, { "auxiliary_loss_clip": 0.01159214, "auxiliary_loss_mlp": 0.01029459, "balance_loss_clip": 1.04774022, "balance_loss_mlp": 1.02214575, "epoch": 0.6389707208561294, "flos": 22346133384960.0, "grad_norm": 1.4173791068018777, "language_loss": 0.7263236, "learning_rate": 1.2182907671362697e-06, "loss": 0.74821031, "num_input_tokens_seen": 114398120, "step": 5314, "time_per_iteration": 2.528268814086914 }, { "auxiliary_loss_clip": 0.0115957, "auxiliary_loss_mlp": 0.01025745, "balance_loss_clip": 1.05124092, "balance_loss_mlp": 1.01796103, "epoch": 0.6390909637467684, "flos": 19426883247360.0, "grad_norm": 1.7790856853570518, "language_loss": 0.78865641, "learning_rate": 1.2175738201040626e-06, "loss": 0.81050956, "num_input_tokens_seen": 114415160, "step": 5315, "time_per_iteration": 2.4518160820007324 }, { "auxiliary_loss_clip": 0.01157315, "auxiliary_loss_mlp": 0.01029092, "balance_loss_clip": 1.04750371, "balance_loss_mlp": 1.02150989, "epoch": 0.6392112066374076, "flos": 24090852700800.0, "grad_norm": 1.7995876809726286, "language_loss": 0.78570986, "learning_rate": 1.2168569917652855e-06, "loss": 0.80757391, "num_input_tokens_seen": 114435015, "step": 5316, "time_per_iteration": 2.4894604682922363 }, { "auxiliary_loss_clip": 0.01161029, "auxiliary_loss_mlp": 0.0102335, "balance_loss_clip": 1.05093455, "balance_loss_mlp": 1.01529157, "epoch": 0.6393314495280467, "flos": 26795141896320.0, "grad_norm": 1.5124650012420744, "language_loss": 0.63884765, "learning_rate": 1.2161402822286797e-06, "loss": 0.66069144, "num_input_tokens_seen": 114455700, "step": 5317, "time_per_iteration": 2.4946131706237793 }, { "auxiliary_loss_clip": 0.01127592, "auxiliary_loss_mlp": 0.01021914, "balance_loss_clip": 1.04453397, "balance_loss_mlp": 1.01433849, "epoch": 0.6394516924186857, "flos": 20260692633600.0, "grad_norm": 2.1216247270816617, "language_loss": 0.78773034, "learning_rate": 1.2154236916029703e-06, "loss": 0.80922544, "num_input_tokens_seen": 114473675, "step": 5318, "time_per_iteration": 2.497843027114868 }, { "auxiliary_loss_clip": 0.01115721, "auxiliary_loss_mlp": 0.01024901, "balance_loss_clip": 1.04071355, "balance_loss_mlp": 1.01742077, "epoch": 0.6395719353093249, "flos": 18368847210240.0, "grad_norm": 2.534705224720435, "language_loss": 0.73448962, "learning_rate": 1.2147072199968627e-06, "loss": 0.75589585, "num_input_tokens_seen": 114492310, "step": 5319, "time_per_iteration": 2.5192885398864746 }, { "auxiliary_loss_clip": 0.01156808, "auxiliary_loss_mlp": 0.01026179, "balance_loss_clip": 1.04859042, "balance_loss_mlp": 1.01899624, "epoch": 0.6396921781999639, "flos": 17566315591680.0, "grad_norm": 1.687268839794007, "language_loss": 0.71678048, "learning_rate": 1.2139908675190454e-06, "loss": 0.73861033, "num_input_tokens_seen": 114511520, "step": 5320, "time_per_iteration": 2.4200022220611572 }, { "auxiliary_loss_clip": 0.01091807, "auxiliary_loss_mlp": 0.01025279, "balance_loss_clip": 1.03890812, "balance_loss_mlp": 1.01778078, "epoch": 0.639812421090603, "flos": 21251252972160.0, "grad_norm": 2.1242980023188798, "language_loss": 0.75461257, "learning_rate": 1.2132746342781883e-06, "loss": 0.77578342, "num_input_tokens_seen": 114532680, "step": 5321, "time_per_iteration": 2.600229024887085 }, { "auxiliary_loss_clip": 0.0117499, "auxiliary_loss_mlp": 0.01026994, "balance_loss_clip": 1.05134273, "balance_loss_mlp": 1.0186435, "epoch": 0.6399326639812422, "flos": 11180967684480.0, "grad_norm": 2.4183223498876187, "language_loss": 0.79890311, "learning_rate": 1.2125585203829442e-06, "loss": 0.82092297, "num_input_tokens_seen": 114548320, "step": 5322, "time_per_iteration": 3.1619529724121094 }, { "auxiliary_loss_clip": 0.01118718, "auxiliary_loss_mlp": 0.01030233, "balance_loss_clip": 1.04556656, "balance_loss_mlp": 1.02222228, "epoch": 0.6400529068718812, "flos": 23911048195200.0, "grad_norm": 4.753229500658536, "language_loss": 0.74411368, "learning_rate": 1.211842525941946e-06, "loss": 0.76560318, "num_input_tokens_seen": 114568115, "step": 5323, "time_per_iteration": 2.5193850994110107 }, { "auxiliary_loss_clip": 0.0111287, "auxiliary_loss_mlp": 0.01025994, "balance_loss_clip": 1.04585505, "balance_loss_mlp": 1.01857877, "epoch": 0.6401731497625203, "flos": 44018724890880.0, "grad_norm": 1.8372634232212812, "language_loss": 0.78586113, "learning_rate": 1.2111266510638105e-06, "loss": 0.80724972, "num_input_tokens_seen": 114591040, "step": 5324, "time_per_iteration": 2.7600488662719727 }, { "auxiliary_loss_clip": 0.01094668, "auxiliary_loss_mlp": 0.01026398, "balance_loss_clip": 1.04182291, "balance_loss_mlp": 1.01886964, "epoch": 0.6402933926531594, "flos": 20662209838080.0, "grad_norm": 1.7788410479294903, "language_loss": 0.80138856, "learning_rate": 1.2104108958571346e-06, "loss": 0.82259917, "num_input_tokens_seen": 114609310, "step": 5325, "time_per_iteration": 2.6203114986419678 }, { "auxiliary_loss_clip": 0.01155702, "auxiliary_loss_mlp": 0.01028062, "balance_loss_clip": 1.04929554, "balance_loss_mlp": 1.02068889, "epoch": 0.6404136355437985, "flos": 24863327614080.0, "grad_norm": 1.4649566621947558, "language_loss": 0.75913548, "learning_rate": 1.2096952604304975e-06, "loss": 0.7809732, "num_input_tokens_seen": 114629740, "step": 5326, "time_per_iteration": 3.3330068588256836 }, { "auxiliary_loss_clip": 0.01160867, "auxiliary_loss_mlp": 0.01026281, "balance_loss_clip": 1.04821277, "balance_loss_mlp": 1.01840734, "epoch": 0.6405338784344375, "flos": 40479548901120.0, "grad_norm": 2.3478994769926786, "language_loss": 0.705194, "learning_rate": 1.2089797448924616e-06, "loss": 0.72706544, "num_input_tokens_seen": 114653615, "step": 5327, "time_per_iteration": 3.415560007095337 }, { "auxiliary_loss_clip": 0.01122666, "auxiliary_loss_mlp": 0.01026401, "balance_loss_clip": 1.0423646, "balance_loss_mlp": 1.01864362, "epoch": 0.6406541213250767, "flos": 20886041439360.0, "grad_norm": 2.0119672809734164, "language_loss": 0.65710902, "learning_rate": 1.2082643493515692e-06, "loss": 0.67859972, "num_input_tokens_seen": 114671935, "step": 5328, "time_per_iteration": 2.5431039333343506 }, { "auxiliary_loss_clip": 0.01157258, "auxiliary_loss_mlp": 0.01024726, "balance_loss_clip": 1.04804981, "balance_loss_mlp": 1.01707888, "epoch": 0.6407743642157158, "flos": 23295970679040.0, "grad_norm": 2.0107220421596996, "language_loss": 0.82066506, "learning_rate": 1.207549073916346e-06, "loss": 0.84248489, "num_input_tokens_seen": 114692870, "step": 5329, "time_per_iteration": 3.2290544509887695 }, { "auxiliary_loss_clip": 0.01134585, "auxiliary_loss_mlp": 0.01022905, "balance_loss_clip": 1.04556584, "balance_loss_mlp": 1.01579404, "epoch": 0.6408946071063548, "flos": 15012636122880.0, "grad_norm": 1.8958550359033812, "language_loss": 0.77789593, "learning_rate": 1.2068339186952976e-06, "loss": 0.7994709, "num_input_tokens_seen": 114710410, "step": 5330, "time_per_iteration": 2.5155410766601562 }, { "auxiliary_loss_clip": 0.011611, "auxiliary_loss_mlp": 0.01029178, "balance_loss_clip": 1.04946446, "balance_loss_mlp": 1.02127123, "epoch": 0.6410148499969939, "flos": 22528595496960.0, "grad_norm": 2.0442715750028824, "language_loss": 0.73243988, "learning_rate": 1.2061188837969136e-06, "loss": 0.75434262, "num_input_tokens_seen": 114730020, "step": 5331, "time_per_iteration": 2.4662926197052 }, { "auxiliary_loss_clip": 0.01122873, "auxiliary_loss_mlp": 0.01025999, "balance_loss_clip": 1.04150641, "balance_loss_mlp": 1.01804209, "epoch": 0.641135092887633, "flos": 12422004537600.0, "grad_norm": 2.480355896826845, "language_loss": 0.84041226, "learning_rate": 1.2054039693296631e-06, "loss": 0.86190093, "num_input_tokens_seen": 114748015, "step": 5332, "time_per_iteration": 2.5016486644744873 }, { "auxiliary_loss_clip": 0.01125519, "auxiliary_loss_mlp": 0.01029453, "balance_loss_clip": 1.0445025, "balance_loss_mlp": 1.02235389, "epoch": 0.6412553357782721, "flos": 22127329687680.0, "grad_norm": 1.8420310628330967, "language_loss": 0.81520379, "learning_rate": 1.2046891754019992e-06, "loss": 0.83675349, "num_input_tokens_seen": 114768625, "step": 5333, "time_per_iteration": 2.536783218383789 }, { "auxiliary_loss_clip": 0.01161449, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.04948533, "balance_loss_mlp": 1.0180223, "epoch": 0.6413755786689112, "flos": 15888605097600.0, "grad_norm": 1.9282467162316852, "language_loss": 0.82951975, "learning_rate": 1.2039745021223548e-06, "loss": 0.85139, "num_input_tokens_seen": 114786045, "step": 5334, "time_per_iteration": 2.4252753257751465 }, { "auxiliary_loss_clip": 0.01023782, "auxiliary_loss_mlp": 0.01004669, "balance_loss_clip": 1.0148356, "balance_loss_mlp": 1.0038408, "epoch": 0.6414958215595503, "flos": 68039159955840.0, "grad_norm": 0.7914621700395528, "language_loss": 0.57056165, "learning_rate": 1.2032599495991456e-06, "loss": 0.59084612, "num_input_tokens_seen": 114850785, "step": 5335, "time_per_iteration": 3.2243902683258057 }, { "auxiliary_loss_clip": 0.01157959, "auxiliary_loss_mlp": 0.01023383, "balance_loss_clip": 1.04883182, "balance_loss_mlp": 1.01552749, "epoch": 0.6416160644501894, "flos": 44091300320640.0, "grad_norm": 2.070151705950607, "language_loss": 0.70034564, "learning_rate": 1.2025455179407685e-06, "loss": 0.72215903, "num_input_tokens_seen": 114871945, "step": 5336, "time_per_iteration": 2.6606969833374023 }, { "auxiliary_loss_clip": 0.01155571, "auxiliary_loss_mlp": 0.0076235, "balance_loss_clip": 1.04840839, "balance_loss_mlp": 1.00055385, "epoch": 0.6417363073408284, "flos": 20959837931520.0, "grad_norm": 2.0420737735067864, "language_loss": 0.73590958, "learning_rate": 1.2018312072556022e-06, "loss": 0.75508875, "num_input_tokens_seen": 114890445, "step": 5337, "time_per_iteration": 2.4405548572540283 }, { "auxiliary_loss_clip": 0.01168738, "auxiliary_loss_mlp": 0.00762265, "balance_loss_clip": 1.04844379, "balance_loss_mlp": 1.00057149, "epoch": 0.6418565502314676, "flos": 22455122227200.0, "grad_norm": 1.669480341017247, "language_loss": 0.74216533, "learning_rate": 1.2011170176520077e-06, "loss": 0.76147532, "num_input_tokens_seen": 114911360, "step": 5338, "time_per_iteration": 2.431079387664795 }, { "auxiliary_loss_clip": 0.01084319, "auxiliary_loss_mlp": 0.01023944, "balance_loss_clip": 1.03932023, "balance_loss_mlp": 1.01627302, "epoch": 0.6419767931221066, "flos": 25045502417280.0, "grad_norm": 1.6519615999822461, "language_loss": 0.81233078, "learning_rate": 1.2004029492383256e-06, "loss": 0.83341342, "num_input_tokens_seen": 114932700, "step": 5339, "time_per_iteration": 2.6286911964416504 }, { "auxiliary_loss_clip": 0.01156269, "auxiliary_loss_mlp": 0.01026485, "balance_loss_clip": 1.04970431, "balance_loss_mlp": 1.01866508, "epoch": 0.6420970360127457, "flos": 19463691709440.0, "grad_norm": 1.9546883748002868, "language_loss": 0.73321879, "learning_rate": 1.1996890021228814e-06, "loss": 0.75504637, "num_input_tokens_seen": 114949475, "step": 5340, "time_per_iteration": 2.434872627258301 }, { "auxiliary_loss_clip": 0.01140382, "auxiliary_loss_mlp": 0.01024294, "balance_loss_clip": 1.04528308, "balance_loss_mlp": 1.01681995, "epoch": 0.6422172789033849, "flos": 40406147458560.0, "grad_norm": 1.5912915364760845, "language_loss": 0.69944096, "learning_rate": 1.1989751764139785e-06, "loss": 0.72108781, "num_input_tokens_seen": 114973125, "step": 5341, "time_per_iteration": 2.653404474258423 }, { "auxiliary_loss_clip": 0.01110681, "auxiliary_loss_mlp": 0.01023054, "balance_loss_clip": 1.03812551, "balance_loss_mlp": 1.01520967, "epoch": 0.6423375217940239, "flos": 27672870637440.0, "grad_norm": 1.62401096740937, "language_loss": 0.82967788, "learning_rate": 1.1982614722199044e-06, "loss": 0.85101521, "num_input_tokens_seen": 114994300, "step": 5342, "time_per_iteration": 2.591768503189087 }, { "auxiliary_loss_clip": 0.01147619, "auxiliary_loss_mlp": 0.01025007, "balance_loss_clip": 1.04608083, "balance_loss_mlp": 1.01809049, "epoch": 0.642457764684663, "flos": 18369242259840.0, "grad_norm": 2.039110277937119, "language_loss": 0.77809697, "learning_rate": 1.1975478896489276e-06, "loss": 0.79982316, "num_input_tokens_seen": 115012135, "step": 5343, "time_per_iteration": 2.470501184463501 }, { "auxiliary_loss_clip": 0.01168439, "auxiliary_loss_mlp": 0.01023038, "balance_loss_clip": 1.04824841, "balance_loss_mlp": 1.01601088, "epoch": 0.6425780075753021, "flos": 19750509809280.0, "grad_norm": 1.9409994790536031, "language_loss": 0.76477146, "learning_rate": 1.1968344288092981e-06, "loss": 0.78668618, "num_input_tokens_seen": 115028715, "step": 5344, "time_per_iteration": 2.40632700920105 }, { "auxiliary_loss_clip": 0.01157403, "auxiliary_loss_mlp": 0.00762356, "balance_loss_clip": 1.0491153, "balance_loss_mlp": 1.00062203, "epoch": 0.6426982504659412, "flos": 20558536208640.0, "grad_norm": 2.5298245518084315, "language_loss": 0.64473689, "learning_rate": 1.1961210898092468e-06, "loss": 0.66393447, "num_input_tokens_seen": 115047665, "step": 5345, "time_per_iteration": 2.4848885536193848 }, { "auxiliary_loss_clip": 0.01149248, "auxiliary_loss_mlp": 0.01027569, "balance_loss_clip": 1.04882598, "balance_loss_mlp": 1.0194633, "epoch": 0.6428184933565803, "flos": 17851984456320.0, "grad_norm": 4.701172722725984, "language_loss": 0.796799, "learning_rate": 1.1954078727569874e-06, "loss": 0.81856716, "num_input_tokens_seen": 115064965, "step": 5346, "time_per_iteration": 2.4554507732391357 }, { "auxiliary_loss_clip": 0.01132288, "auxiliary_loss_mlp": 0.0076226, "balance_loss_clip": 1.04337668, "balance_loss_mlp": 1.00056815, "epoch": 0.6429387362472194, "flos": 22456953820800.0, "grad_norm": 1.6386806484125886, "language_loss": 0.77965021, "learning_rate": 1.1946947777607141e-06, "loss": 0.79859573, "num_input_tokens_seen": 115086100, "step": 5347, "time_per_iteration": 2.5322957038879395 }, { "auxiliary_loss_clip": 0.01109016, "auxiliary_loss_mlp": 0.01024265, "balance_loss_clip": 1.04210496, "balance_loss_mlp": 1.01631975, "epoch": 0.6430589791378585, "flos": 24752579005440.0, "grad_norm": 1.8584480875619018, "language_loss": 0.79986107, "learning_rate": 1.1939818049286024e-06, "loss": 0.82119393, "num_input_tokens_seen": 115104260, "step": 5348, "time_per_iteration": 2.569805383682251 }, { "auxiliary_loss_clip": 0.0108661, "auxiliary_loss_mlp": 0.0102786, "balance_loss_clip": 1.03920484, "balance_loss_mlp": 1.0205524, "epoch": 0.6431792220284975, "flos": 24901249397760.0, "grad_norm": 1.519905985688268, "language_loss": 0.75588173, "learning_rate": 1.1932689543688101e-06, "loss": 0.77702647, "num_input_tokens_seen": 115125365, "step": 5349, "time_per_iteration": 3.385685920715332 }, { "auxiliary_loss_clip": 0.01143903, "auxiliary_loss_mlp": 0.01026624, "balance_loss_clip": 1.04815221, "balance_loss_mlp": 1.01902127, "epoch": 0.6432994649191367, "flos": 21032305620480.0, "grad_norm": 2.2725091381337816, "language_loss": 0.7260707, "learning_rate": 1.1925562261894756e-06, "loss": 0.74777591, "num_input_tokens_seen": 115144445, "step": 5350, "time_per_iteration": 2.4838509559631348 }, { "auxiliary_loss_clip": 0.01139656, "auxiliary_loss_mlp": 0.01027563, "balance_loss_clip": 1.04515898, "balance_loss_mlp": 1.02014184, "epoch": 0.6434197078097758, "flos": 30884433655680.0, "grad_norm": 3.0386379653774607, "language_loss": 0.77880096, "learning_rate": 1.1918436204987207e-06, "loss": 0.80047309, "num_input_tokens_seen": 115166305, "step": 5351, "time_per_iteration": 2.5686447620391846 }, { "auxiliary_loss_clip": 0.01155893, "auxiliary_loss_mlp": 0.01024305, "balance_loss_clip": 1.05091619, "balance_loss_mlp": 1.01693153, "epoch": 0.6435399507004148, "flos": 15012492468480.0, "grad_norm": 1.9502419607140733, "language_loss": 0.81479049, "learning_rate": 1.191131137404645e-06, "loss": 0.83659244, "num_input_tokens_seen": 115183045, "step": 5352, "time_per_iteration": 3.273716926574707 }, { "auxiliary_loss_clip": 0.01116893, "auxiliary_loss_mlp": 0.010283, "balance_loss_clip": 1.04344296, "balance_loss_mlp": 1.02114797, "epoch": 0.643660193591054, "flos": 19901981462400.0, "grad_norm": 2.0284984924712854, "language_loss": 0.76935744, "learning_rate": 1.190418777015333e-06, "loss": 0.79080939, "num_input_tokens_seen": 115201955, "step": 5353, "time_per_iteration": 2.5105373859405518 }, { "auxiliary_loss_clip": 0.01143691, "auxiliary_loss_mlp": 0.0102059, "balance_loss_clip": 1.04684758, "balance_loss_mlp": 1.01346767, "epoch": 0.643780436481693, "flos": 24133622820480.0, "grad_norm": 1.4747283207564812, "language_loss": 0.7382971, "learning_rate": 1.1897065394388487e-06, "loss": 0.75993991, "num_input_tokens_seen": 115222395, "step": 5354, "time_per_iteration": 3.409649610519409 }, { "auxiliary_loss_clip": 0.01143752, "auxiliary_loss_mlp": 0.01029314, "balance_loss_clip": 1.05037212, "balance_loss_mlp": 1.02169657, "epoch": 0.6439006793723321, "flos": 23148808657920.0, "grad_norm": 1.6125624525076288, "language_loss": 0.76621556, "learning_rate": 1.1889944247832385e-06, "loss": 0.78794616, "num_input_tokens_seen": 115242635, "step": 5355, "time_per_iteration": 3.2758243083953857 }, { "auxiliary_loss_clip": 0.01157667, "auxiliary_loss_mlp": 0.01028165, "balance_loss_clip": 1.04512191, "balance_loss_mlp": 1.02062535, "epoch": 0.6440209222629713, "flos": 23617909301760.0, "grad_norm": 1.9098594187834383, "language_loss": 0.70562905, "learning_rate": 1.1882824331565283e-06, "loss": 0.72748739, "num_input_tokens_seen": 115262095, "step": 5356, "time_per_iteration": 2.521042823791504 }, { "auxiliary_loss_clip": 0.01123631, "auxiliary_loss_mlp": 0.01027701, "balance_loss_clip": 1.04210019, "balance_loss_mlp": 1.02037013, "epoch": 0.6441411651536103, "flos": 16544872535040.0, "grad_norm": 2.014874191587914, "language_loss": 0.89210314, "learning_rate": 1.1875705646667287e-06, "loss": 0.91361648, "num_input_tokens_seen": 115279985, "step": 5357, "time_per_iteration": 2.487450122833252 }, { "auxiliary_loss_clip": 0.01153017, "auxiliary_loss_mlp": 0.01026817, "balance_loss_clip": 1.04431057, "balance_loss_mlp": 1.01861548, "epoch": 0.6442614080442494, "flos": 25410965345280.0, "grad_norm": 1.9162354228599496, "language_loss": 0.75268722, "learning_rate": 1.1868588194218282e-06, "loss": 0.77448559, "num_input_tokens_seen": 115300365, "step": 5358, "time_per_iteration": 2.4772651195526123 }, { "auxiliary_loss_clip": 0.01148005, "auxiliary_loss_mlp": 0.01030101, "balance_loss_clip": 1.04510999, "balance_loss_mlp": 1.02232909, "epoch": 0.6443816509348885, "flos": 28294017552000.0, "grad_norm": 1.6328647738082025, "language_loss": 0.73876476, "learning_rate": 1.1861471975297979e-06, "loss": 0.76054585, "num_input_tokens_seen": 115322060, "step": 5359, "time_per_iteration": 2.5494964122772217 }, { "auxiliary_loss_clip": 0.01126722, "auxiliary_loss_mlp": 0.01022555, "balance_loss_clip": 1.04684877, "balance_loss_mlp": 1.01459765, "epoch": 0.6445018938255276, "flos": 36690075964800.0, "grad_norm": 1.6199011382781163, "language_loss": 0.70958543, "learning_rate": 1.185435699098591e-06, "loss": 0.73107815, "num_input_tokens_seen": 115348255, "step": 5360, "time_per_iteration": 2.69227933883667 }, { "auxiliary_loss_clip": 0.01145888, "auxiliary_loss_mlp": 0.01023233, "balance_loss_clip": 1.04590225, "balance_loss_mlp": 1.0156008, "epoch": 0.6446221367161666, "flos": 14501411804160.0, "grad_norm": 7.536487055703115, "language_loss": 0.78403491, "learning_rate": 1.1847243242361403e-06, "loss": 0.80572605, "num_input_tokens_seen": 115366845, "step": 5361, "time_per_iteration": 2.450035810470581 }, { "auxiliary_loss_clip": 0.0114605, "auxiliary_loss_mlp": 0.01034156, "balance_loss_clip": 1.04786444, "balance_loss_mlp": 1.02666068, "epoch": 0.6447423796068057, "flos": 24609367480320.0, "grad_norm": 1.9725811768542592, "language_loss": 0.7779808, "learning_rate": 1.1840130730503624e-06, "loss": 0.79978287, "num_input_tokens_seen": 115388125, "step": 5362, "time_per_iteration": 2.5733940601348877 }, { "auxiliary_loss_clip": 0.01171787, "auxiliary_loss_mlp": 0.01026473, "balance_loss_clip": 1.0492866, "balance_loss_mlp": 1.01912951, "epoch": 0.6448626224974449, "flos": 25047298097280.0, "grad_norm": 1.7002227044297158, "language_loss": 0.74608403, "learning_rate": 1.1833019456491518e-06, "loss": 0.76806664, "num_input_tokens_seen": 115409655, "step": 5363, "time_per_iteration": 2.517425298690796 }, { "auxiliary_loss_clip": 0.01159534, "auxiliary_loss_mlp": 0.01025886, "balance_loss_clip": 1.04968572, "balance_loss_mlp": 1.01842976, "epoch": 0.6449828653880839, "flos": 22530355263360.0, "grad_norm": 2.1289093804855015, "language_loss": 0.79254144, "learning_rate": 1.1825909421403871e-06, "loss": 0.81439567, "num_input_tokens_seen": 115428750, "step": 5364, "time_per_iteration": 2.473006010055542 }, { "auxiliary_loss_clip": 0.01156489, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.04714763, "balance_loss_mlp": 1.02008557, "epoch": 0.645103108278723, "flos": 25695736369920.0, "grad_norm": 1.6775703708739282, "language_loss": 0.76189846, "learning_rate": 1.181880062631926e-06, "loss": 0.78373384, "num_input_tokens_seen": 115448085, "step": 5365, "time_per_iteration": 2.4765913486480713 }, { "auxiliary_loss_clip": 0.01138044, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.04679561, "balance_loss_mlp": 1.02295315, "epoch": 0.6452233511693621, "flos": 27450331925760.0, "grad_norm": 2.003137936354261, "language_loss": 0.84850895, "learning_rate": 1.1811693072316093e-06, "loss": 0.87020272, "num_input_tokens_seen": 115465765, "step": 5366, "time_per_iteration": 2.517852783203125 }, { "auxiliary_loss_clip": 0.01169989, "auxiliary_loss_mlp": 0.00762463, "balance_loss_clip": 1.0467217, "balance_loss_mlp": 1.00054502, "epoch": 0.6453435940600012, "flos": 19208618254080.0, "grad_norm": 2.174860207167397, "language_loss": 0.84323335, "learning_rate": 1.1804586760472574e-06, "loss": 0.86255783, "num_input_tokens_seen": 115482230, "step": 5367, "time_per_iteration": 2.4059057235717773 }, { "auxiliary_loss_clip": 0.01126319, "auxiliary_loss_mlp": 0.01023378, "balance_loss_clip": 1.0435679, "balance_loss_mlp": 1.01556945, "epoch": 0.6454638369506402, "flos": 25737680476800.0, "grad_norm": 2.2075010916483717, "language_loss": 0.80187517, "learning_rate": 1.1797481691866736e-06, "loss": 0.82337213, "num_input_tokens_seen": 115499455, "step": 5368, "time_per_iteration": 2.530470609664917 }, { "auxiliary_loss_clip": 0.01136434, "auxiliary_loss_mlp": 0.01032601, "balance_loss_clip": 1.04761469, "balance_loss_mlp": 1.02525771, "epoch": 0.6455840798412794, "flos": 20989176364800.0, "grad_norm": 2.000739103417135, "language_loss": 0.83167076, "learning_rate": 1.1790377867576393e-06, "loss": 0.85336113, "num_input_tokens_seen": 115517205, "step": 5369, "time_per_iteration": 2.4924209117889404 }, { "auxiliary_loss_clip": 0.01146224, "auxiliary_loss_mlp": 0.01026275, "balance_loss_clip": 1.04642403, "balance_loss_mlp": 1.01892543, "epoch": 0.6457043227319185, "flos": 26067556005120.0, "grad_norm": 2.1072539658293605, "language_loss": 0.76444286, "learning_rate": 1.1783275288679203e-06, "loss": 0.78616786, "num_input_tokens_seen": 115534370, "step": 5370, "time_per_iteration": 2.5288474559783936 }, { "auxiliary_loss_clip": 0.01064654, "auxiliary_loss_mlp": 0.01002359, "balance_loss_clip": 1.01657021, "balance_loss_mlp": 1.0013876, "epoch": 0.6458245656225575, "flos": 60370831088640.0, "grad_norm": 0.8475207745816336, "language_loss": 0.57112044, "learning_rate": 1.177617395625262e-06, "loss": 0.59179056, "num_input_tokens_seen": 115592345, "step": 5371, "time_per_iteration": 3.0110514163970947 }, { "auxiliary_loss_clip": 0.01156936, "auxiliary_loss_mlp": 0.01028401, "balance_loss_clip": 1.04955232, "balance_loss_mlp": 1.02099466, "epoch": 0.6459448085131967, "flos": 23076771932160.0, "grad_norm": 2.109194996184222, "language_loss": 0.75144202, "learning_rate": 1.1769073871373908e-06, "loss": 0.77329534, "num_input_tokens_seen": 115612550, "step": 5372, "time_per_iteration": 2.4809679985046387 }, { "auxiliary_loss_clip": 0.01124834, "auxiliary_loss_mlp": 0.01022894, "balance_loss_clip": 1.04220545, "balance_loss_mlp": 1.01571178, "epoch": 0.6460650514038357, "flos": 22598190097920.0, "grad_norm": 1.6962795854723085, "language_loss": 0.83681768, "learning_rate": 1.176197503512015e-06, "loss": 0.85829496, "num_input_tokens_seen": 115632265, "step": 5373, "time_per_iteration": 2.5287370681762695 }, { "auxiliary_loss_clip": 0.01140674, "auxiliary_loss_mlp": 0.01026543, "balance_loss_clip": 1.0466876, "balance_loss_mlp": 1.01932752, "epoch": 0.6461852942944748, "flos": 20266726118400.0, "grad_norm": 2.207626833863588, "language_loss": 0.82171977, "learning_rate": 1.1754877448568223e-06, "loss": 0.8433919, "num_input_tokens_seen": 115651720, "step": 5374, "time_per_iteration": 2.4975805282592773 }, { "auxiliary_loss_clip": 0.01141588, "auxiliary_loss_mlp": 0.01025345, "balance_loss_clip": 1.04442525, "balance_loss_mlp": 1.0178293, "epoch": 0.646305537185114, "flos": 23367109564800.0, "grad_norm": 2.7149867036172752, "language_loss": 0.89946866, "learning_rate": 1.1747781112794837e-06, "loss": 0.92113799, "num_input_tokens_seen": 115668215, "step": 5375, "time_per_iteration": 3.359842538833618 }, { "auxiliary_loss_clip": 0.01127006, "auxiliary_loss_mlp": 0.0102665, "balance_loss_clip": 1.04610848, "balance_loss_mlp": 1.01922929, "epoch": 0.646425780075753, "flos": 24277480790400.0, "grad_norm": 1.814659732872634, "language_loss": 0.82686841, "learning_rate": 1.1740686028876487e-06, "loss": 0.848405, "num_input_tokens_seen": 115687080, "step": 5376, "time_per_iteration": 2.5411336421966553 }, { "auxiliary_loss_clip": 0.01152356, "auxiliary_loss_mlp": 0.01023774, "balance_loss_clip": 1.0474751, "balance_loss_mlp": 1.01667261, "epoch": 0.6465460229663921, "flos": 20813968800000.0, "grad_norm": 3.232957618848814, "language_loss": 0.74605536, "learning_rate": 1.1733592197889507e-06, "loss": 0.76781666, "num_input_tokens_seen": 115703990, "step": 5377, "time_per_iteration": 2.4445273876190186 }, { "auxiliary_loss_clip": 0.01149306, "auxiliary_loss_mlp": 0.0102324, "balance_loss_clip": 1.04729044, "balance_loss_mlp": 1.01636434, "epoch": 0.6466662658570312, "flos": 22853299466880.0, "grad_norm": 1.902984090622666, "language_loss": 0.72670501, "learning_rate": 1.1726499620910014e-06, "loss": 0.74843043, "num_input_tokens_seen": 115724270, "step": 5378, "time_per_iteration": 2.4676947593688965 }, { "auxiliary_loss_clip": 0.01155216, "auxiliary_loss_mlp": 0.0102417, "balance_loss_clip": 1.04725885, "balance_loss_mlp": 1.01644814, "epoch": 0.6467865087476703, "flos": 15304553953920.0, "grad_norm": 1.9012010274018538, "language_loss": 0.77490056, "learning_rate": 1.1719408299013955e-06, "loss": 0.7966944, "num_input_tokens_seen": 115742995, "step": 5379, "time_per_iteration": 3.349086046218872 }, { "auxiliary_loss_clip": 0.01170034, "auxiliary_loss_mlp": 0.01028877, "balance_loss_clip": 1.05068803, "balance_loss_mlp": 1.0214268, "epoch": 0.6469067516383094, "flos": 19573650218880.0, "grad_norm": 2.210741846198485, "language_loss": 0.75815099, "learning_rate": 1.1712318233277067e-06, "loss": 0.7801401, "num_input_tokens_seen": 115762015, "step": 5380, "time_per_iteration": 3.2264480590820312 }, { "auxiliary_loss_clip": 0.01063531, "auxiliary_loss_mlp": 0.01001287, "balance_loss_clip": 1.01692963, "balance_loss_mlp": 1.00033379, "epoch": 0.6470269945289485, "flos": 65098002522240.0, "grad_norm": 0.7497049174802937, "language_loss": 0.57815784, "learning_rate": 1.1705229424774916e-06, "loss": 0.59880602, "num_input_tokens_seen": 115816285, "step": 5381, "time_per_iteration": 3.631687641143799 }, { "auxiliary_loss_clip": 0.01138542, "auxiliary_loss_mlp": 0.01028806, "balance_loss_clip": 1.04396987, "balance_loss_mlp": 1.02137303, "epoch": 0.6471472374195876, "flos": 30696943639680.0, "grad_norm": 1.6849529909741638, "language_loss": 0.64126766, "learning_rate": 1.1698141874582867e-06, "loss": 0.66294122, "num_input_tokens_seen": 115837330, "step": 5382, "time_per_iteration": 2.56235671043396 }, { "auxiliary_loss_clip": 0.01168809, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 1.04950714, "balance_loss_mlp": 1.02075112, "epoch": 0.6472674803102266, "flos": 20521835487360.0, "grad_norm": 2.069375863564844, "language_loss": 0.72142899, "learning_rate": 1.169105558377609e-06, "loss": 0.74339116, "num_input_tokens_seen": 115857420, "step": 5383, "time_per_iteration": 2.416995048522949 }, { "auxiliary_loss_clip": 0.01118094, "auxiliary_loss_mlp": 0.00762362, "balance_loss_clip": 1.04979205, "balance_loss_mlp": 1.00058675, "epoch": 0.6473877232008658, "flos": 24715447320960.0, "grad_norm": 1.6796468554461612, "language_loss": 0.78325164, "learning_rate": 1.1683970553429587e-06, "loss": 0.80205619, "num_input_tokens_seen": 115878875, "step": 5384, "time_per_iteration": 2.5850071907043457 }, { "auxiliary_loss_clip": 0.01130428, "auxiliary_loss_mlp": 0.01026937, "balance_loss_clip": 1.04514277, "balance_loss_mlp": 1.01936769, "epoch": 0.6475079660915048, "flos": 15885552441600.0, "grad_norm": 3.2797018940203744, "language_loss": 0.8169831, "learning_rate": 1.1676886784618128e-06, "loss": 0.83855677, "num_input_tokens_seen": 115895540, "step": 5385, "time_per_iteration": 2.4666833877563477 }, { "auxiliary_loss_clip": 0.01157426, "auxiliary_loss_mlp": 0.01025667, "balance_loss_clip": 1.04839325, "balance_loss_mlp": 1.01796043, "epoch": 0.6476282089821439, "flos": 17381590922880.0, "grad_norm": 2.3627084171020756, "language_loss": 0.83817947, "learning_rate": 1.1669804278416332e-06, "loss": 0.86001039, "num_input_tokens_seen": 115910265, "step": 5386, "time_per_iteration": 2.4138023853302 }, { "auxiliary_loss_clip": 0.01147047, "auxiliary_loss_mlp": 0.01026758, "balance_loss_clip": 1.04809904, "balance_loss_mlp": 1.01897407, "epoch": 0.6477484518727831, "flos": 20194078861440.0, "grad_norm": 1.9173957365869634, "language_loss": 0.71284294, "learning_rate": 1.1662723035898602e-06, "loss": 0.73458099, "num_input_tokens_seen": 115930025, "step": 5387, "time_per_iteration": 2.4842822551727295 }, { "auxiliary_loss_clip": 0.01155594, "auxiliary_loss_mlp": 0.01023982, "balance_loss_clip": 1.04855776, "balance_loss_mlp": 1.01643562, "epoch": 0.6478686947634221, "flos": 25410426641280.0, "grad_norm": 1.5864928088235273, "language_loss": 0.8176682, "learning_rate": 1.165564305813915e-06, "loss": 0.83946395, "num_input_tokens_seen": 115949025, "step": 5388, "time_per_iteration": 2.4786365032196045 }, { "auxiliary_loss_clip": 0.01156247, "auxiliary_loss_mlp": 0.01024858, "balance_loss_clip": 1.04869533, "balance_loss_mlp": 1.01760387, "epoch": 0.6479889376540612, "flos": 20083581648000.0, "grad_norm": 1.7424551227277802, "language_loss": 0.81379735, "learning_rate": 1.1648564346212019e-06, "loss": 0.83560842, "num_input_tokens_seen": 115968145, "step": 5389, "time_per_iteration": 2.4599738121032715 }, { "auxiliary_loss_clip": 0.01152876, "auxiliary_loss_mlp": 0.01027676, "balance_loss_clip": 1.04916835, "balance_loss_mlp": 1.02048755, "epoch": 0.6481091805447003, "flos": 26758082039040.0, "grad_norm": 2.0962209175181616, "language_loss": 0.76356912, "learning_rate": 1.164148690119104e-06, "loss": 0.78537464, "num_input_tokens_seen": 115989425, "step": 5390, "time_per_iteration": 2.4863195419311523 }, { "auxiliary_loss_clip": 0.01167658, "auxiliary_loss_mlp": 0.01025513, "balance_loss_clip": 1.04841769, "balance_loss_mlp": 1.01823568, "epoch": 0.6482294234353394, "flos": 23952094462080.0, "grad_norm": 2.0303993663255695, "language_loss": 0.74334908, "learning_rate": 1.163441072414985e-06, "loss": 0.76528084, "num_input_tokens_seen": 116009630, "step": 5391, "time_per_iteration": 2.442458391189575 }, { "auxiliary_loss_clip": 0.01156777, "auxiliary_loss_mlp": 0.01024132, "balance_loss_clip": 1.04952657, "balance_loss_mlp": 1.01689279, "epoch": 0.6483496663259785, "flos": 26209833776640.0, "grad_norm": 1.984044679603604, "language_loss": 0.69866383, "learning_rate": 1.16273358161619e-06, "loss": 0.72047293, "num_input_tokens_seen": 116029965, "step": 5392, "time_per_iteration": 2.493894577026367 }, { "auxiliary_loss_clip": 0.0115253, "auxiliary_loss_mlp": 0.01026518, "balance_loss_clip": 1.04963517, "balance_loss_mlp": 1.01916242, "epoch": 0.6484699092166175, "flos": 20922239370240.0, "grad_norm": 1.934568824094898, "language_loss": 0.83545816, "learning_rate": 1.1620262178300446e-06, "loss": 0.8572486, "num_input_tokens_seen": 116048580, "step": 5393, "time_per_iteration": 2.4673571586608887 }, { "auxiliary_loss_clip": 0.0112838, "auxiliary_loss_mlp": 0.01024479, "balance_loss_clip": 1.04354692, "balance_loss_mlp": 1.01718903, "epoch": 0.6485901521072567, "flos": 33072865678080.0, "grad_norm": 2.366200624095203, "language_loss": 0.75639564, "learning_rate": 1.1613189811638563e-06, "loss": 0.77792418, "num_input_tokens_seen": 116070305, "step": 5394, "time_per_iteration": 2.6324853897094727 }, { "auxiliary_loss_clip": 0.01158177, "auxiliary_loss_mlp": 0.01027217, "balance_loss_clip": 1.04957438, "balance_loss_mlp": 1.02020764, "epoch": 0.6487103949978957, "flos": 22274060745600.0, "grad_norm": 1.5125506975357599, "language_loss": 0.77831137, "learning_rate": 1.1606118717249117e-06, "loss": 0.80016541, "num_input_tokens_seen": 116090405, "step": 5395, "time_per_iteration": 2.4912984371185303 }, { "auxiliary_loss_clip": 0.01173863, "auxiliary_loss_mlp": 0.01024893, "balance_loss_clip": 1.04964268, "balance_loss_mlp": 1.0173229, "epoch": 0.6488306378885348, "flos": 22930400010240.0, "grad_norm": 1.8538992817476412, "language_loss": 0.67611563, "learning_rate": 1.1599048896204787e-06, "loss": 0.69810319, "num_input_tokens_seen": 116110285, "step": 5396, "time_per_iteration": 2.4224696159362793 }, { "auxiliary_loss_clip": 0.0113375, "auxiliary_loss_mlp": 0.01026279, "balance_loss_clip": 1.0474118, "balance_loss_mlp": 1.01890576, "epoch": 0.648950880779174, "flos": 20376110010240.0, "grad_norm": 1.86818198474614, "language_loss": 0.80856854, "learning_rate": 1.1591980349578061e-06, "loss": 0.83016884, "num_input_tokens_seen": 116128955, "step": 5397, "time_per_iteration": 2.5147154331207275 }, { "auxiliary_loss_clip": 0.01038576, "auxiliary_loss_mlp": 0.01001414, "balance_loss_clip": 1.01368618, "balance_loss_mlp": 1.00051403, "epoch": 0.649071123669813, "flos": 59930889310080.0, "grad_norm": 0.7370653548124485, "language_loss": 0.54303443, "learning_rate": 1.158491307844123e-06, "loss": 0.5634343, "num_input_tokens_seen": 116188875, "step": 5398, "time_per_iteration": 3.062973976135254 }, { "auxiliary_loss_clip": 0.01143135, "auxiliary_loss_mlp": 0.01024168, "balance_loss_clip": 1.04818606, "balance_loss_mlp": 1.0168488, "epoch": 0.6491913665604521, "flos": 20446566537600.0, "grad_norm": 1.7038105458207862, "language_loss": 0.83832473, "learning_rate": 1.1577847083866387e-06, "loss": 0.85999775, "num_input_tokens_seen": 116207910, "step": 5399, "time_per_iteration": 2.464677333831787 }, { "auxiliary_loss_clip": 0.01132292, "auxiliary_loss_mlp": 0.01026611, "balance_loss_clip": 1.04479098, "balance_loss_mlp": 1.0187726, "epoch": 0.6493116094510912, "flos": 16946820702720.0, "grad_norm": 1.855177009785709, "language_loss": 0.71696377, "learning_rate": 1.1570782366925453e-06, "loss": 0.73855281, "num_input_tokens_seen": 116226425, "step": 5400, "time_per_iteration": 2.452134847640991 }, { "auxiliary_loss_clip": 0.01142262, "auxiliary_loss_mlp": 0.01023291, "balance_loss_clip": 1.04279935, "balance_loss_mlp": 1.01570308, "epoch": 0.6494318523417303, "flos": 18802935072000.0, "grad_norm": 1.8018560466709699, "language_loss": 0.75824016, "learning_rate": 1.1563718928690132e-06, "loss": 0.77989572, "num_input_tokens_seen": 116243860, "step": 5401, "time_per_iteration": 2.458969831466675 }, { "auxiliary_loss_clip": 0.01125747, "auxiliary_loss_mlp": 0.01027501, "balance_loss_clip": 1.04444623, "balance_loss_mlp": 1.0197736, "epoch": 0.6495520952323693, "flos": 18982847318400.0, "grad_norm": 2.1628364599989522, "language_loss": 0.716272, "learning_rate": 1.1556656770231942e-06, "loss": 0.73780447, "num_input_tokens_seen": 116260055, "step": 5402, "time_per_iteration": 3.3266797065734863 }, { "auxiliary_loss_clip": 0.01158118, "auxiliary_loss_mlp": 0.01027369, "balance_loss_clip": 1.04811358, "balance_loss_mlp": 1.02047014, "epoch": 0.6496723381230085, "flos": 22745388032640.0, "grad_norm": 1.6160907475737125, "language_loss": 0.76413393, "learning_rate": 1.1549595892622207e-06, "loss": 0.78598881, "num_input_tokens_seen": 116278825, "step": 5403, "time_per_iteration": 2.4608218669891357 }, { "auxiliary_loss_clip": 0.01023799, "auxiliary_loss_mlp": 0.01003853, "balance_loss_clip": 1.01659656, "balance_loss_mlp": 1.00304842, "epoch": 0.6497925810136476, "flos": 62145283887360.0, "grad_norm": 0.8366120045218466, "language_loss": 0.59027404, "learning_rate": 1.1542536296932047e-06, "loss": 0.61055058, "num_input_tokens_seen": 116342360, "step": 5404, "time_per_iteration": 3.0789506435394287 }, { "auxiliary_loss_clip": 0.01133697, "auxiliary_loss_mlp": 0.01026499, "balance_loss_clip": 1.04367888, "balance_loss_mlp": 1.01855385, "epoch": 0.6499128239042866, "flos": 20156731695360.0, "grad_norm": 1.5949901860122606, "language_loss": 0.69981307, "learning_rate": 1.1535477984232414e-06, "loss": 0.72141504, "num_input_tokens_seen": 116362235, "step": 5405, "time_per_iteration": 2.505502223968506 }, { "auxiliary_loss_clip": 0.01116274, "auxiliary_loss_mlp": 0.01027173, "balance_loss_clip": 1.03943741, "balance_loss_mlp": 1.01974607, "epoch": 0.6500330667949258, "flos": 24462420940800.0, "grad_norm": 1.741601822144197, "language_loss": 0.77370203, "learning_rate": 1.152842095559404e-06, "loss": 0.79513645, "num_input_tokens_seen": 116382895, "step": 5406, "time_per_iteration": 4.2207725048065186 }, { "auxiliary_loss_clip": 0.01146971, "auxiliary_loss_mlp": 0.01025716, "balance_loss_clip": 1.04587376, "balance_loss_mlp": 1.0186286, "epoch": 0.6501533096855648, "flos": 25477399549440.0, "grad_norm": 1.6914510232454572, "language_loss": 0.76452059, "learning_rate": 1.1521365212087474e-06, "loss": 0.78624749, "num_input_tokens_seen": 116402880, "step": 5407, "time_per_iteration": 2.523043632507324 }, { "auxiliary_loss_clip": 0.01155994, "auxiliary_loss_mlp": 0.01024231, "balance_loss_clip": 1.04617023, "balance_loss_mlp": 1.0165782, "epoch": 0.6502735525762039, "flos": 44819245347840.0, "grad_norm": 1.5923291623948574, "language_loss": 0.70774889, "learning_rate": 1.1514310754783062e-06, "loss": 0.7295512, "num_input_tokens_seen": 116425830, "step": 5408, "time_per_iteration": 3.4084830284118652 }, { "auxiliary_loss_clip": 0.01145397, "auxiliary_loss_mlp": 0.01023532, "balance_loss_clip": 1.04765832, "balance_loss_mlp": 1.0156939, "epoch": 0.6503937954668431, "flos": 28658546726400.0, "grad_norm": 1.8707926288294439, "language_loss": 0.73297626, "learning_rate": 1.1507257584750964e-06, "loss": 0.75466549, "num_input_tokens_seen": 116446010, "step": 5409, "time_per_iteration": 2.589836597442627 }, { "auxiliary_loss_clip": 0.01171445, "auxiliary_loss_mlp": 0.01025641, "balance_loss_clip": 1.04959369, "balance_loss_mlp": 1.01791, "epoch": 0.6505140383574821, "flos": 20922562592640.0, "grad_norm": 1.9796961251777545, "language_loss": 0.77842784, "learning_rate": 1.150020570306113e-06, "loss": 0.80039865, "num_input_tokens_seen": 116465150, "step": 5410, "time_per_iteration": 2.410038471221924 }, { "auxiliary_loss_clip": 0.01135092, "auxiliary_loss_mlp": 0.0102527, "balance_loss_clip": 1.04149389, "balance_loss_mlp": 1.01749802, "epoch": 0.6506342812481212, "flos": 20595236929920.0, "grad_norm": 2.6853216891952445, "language_loss": 0.74958539, "learning_rate": 1.1493155110783338e-06, "loss": 0.77118903, "num_input_tokens_seen": 116483675, "step": 5411, "time_per_iteration": 2.4797093868255615 }, { "auxiliary_loss_clip": 0.01155657, "auxiliary_loss_mlp": 0.01024005, "balance_loss_clip": 1.04776502, "balance_loss_mlp": 1.01623249, "epoch": 0.6507545241387603, "flos": 30226478279040.0, "grad_norm": 2.6621805251068626, "language_loss": 0.70573229, "learning_rate": 1.1486105808987155e-06, "loss": 0.72752893, "num_input_tokens_seen": 116505165, "step": 5412, "time_per_iteration": 2.545302629470825 }, { "auxiliary_loss_clip": 0.01158894, "auxiliary_loss_mlp": 0.0102059, "balance_loss_clip": 1.04960871, "balance_loss_mlp": 1.01298499, "epoch": 0.6508747670293994, "flos": 17128241320320.0, "grad_norm": 1.825299864888889, "language_loss": 0.81571674, "learning_rate": 1.1479057798741947e-06, "loss": 0.8375116, "num_input_tokens_seen": 116523220, "step": 5413, "time_per_iteration": 2.437554121017456 }, { "auxiliary_loss_clip": 0.01054643, "auxiliary_loss_mlp": 0.01009326, "balance_loss_clip": 1.02318025, "balance_loss_mlp": 1.00810421, "epoch": 0.6509950099200384, "flos": 68559826573440.0, "grad_norm": 0.7909187214930529, "language_loss": 0.53340364, "learning_rate": 1.14720110811169e-06, "loss": 0.55404335, "num_input_tokens_seen": 116580450, "step": 5414, "time_per_iteration": 3.0831449031829834 }, { "auxiliary_loss_clip": 0.01161779, "auxiliary_loss_mlp": 0.01027469, "balance_loss_clip": 1.049137, "balance_loss_mlp": 1.01983345, "epoch": 0.6511152528106776, "flos": 22347462188160.0, "grad_norm": 2.1337486389609532, "language_loss": 0.76643193, "learning_rate": 1.146496565718098e-06, "loss": 0.78832436, "num_input_tokens_seen": 116601020, "step": 5415, "time_per_iteration": 2.465686559677124 }, { "auxiliary_loss_clip": 0.01147364, "auxiliary_loss_mlp": 0.01026344, "balance_loss_clip": 1.05141306, "balance_loss_mlp": 1.01833868, "epoch": 0.6512354957013167, "flos": 20522158709760.0, "grad_norm": 1.9173268352475563, "language_loss": 0.75976503, "learning_rate": 1.1457921528002996e-06, "loss": 0.78150207, "num_input_tokens_seen": 116619455, "step": 5416, "time_per_iteration": 2.4744672775268555 }, { "auxiliary_loss_clip": 0.01171561, "auxiliary_loss_mlp": 0.00762552, "balance_loss_clip": 1.05003142, "balance_loss_mlp": 1.00070596, "epoch": 0.6513557385919557, "flos": 32337342881280.0, "grad_norm": 2.2128519736824797, "language_loss": 0.72480196, "learning_rate": 1.1450878694651522e-06, "loss": 0.74414313, "num_input_tokens_seen": 116640020, "step": 5417, "time_per_iteration": 2.5107059478759766 }, { "auxiliary_loss_clip": 0.01114932, "auxiliary_loss_mlp": 0.01023737, "balance_loss_clip": 1.04161572, "balance_loss_mlp": 1.0161196, "epoch": 0.6514759814825949, "flos": 12093206417280.0, "grad_norm": 7.295446121662826, "language_loss": 0.62675995, "learning_rate": 1.1443837158194954e-06, "loss": 0.64814663, "num_input_tokens_seen": 116655165, "step": 5418, "time_per_iteration": 2.500206708908081 }, { "auxiliary_loss_clip": 0.01133119, "auxiliary_loss_mlp": 0.01028949, "balance_loss_clip": 1.05281436, "balance_loss_mlp": 1.02090216, "epoch": 0.651596224373234, "flos": 22526907557760.0, "grad_norm": 1.880254572797962, "language_loss": 0.74593651, "learning_rate": 1.1436796919701484e-06, "loss": 0.76755726, "num_input_tokens_seen": 116673880, "step": 5419, "time_per_iteration": 2.518559694290161 }, { "auxiliary_loss_clip": 0.01145284, "auxiliary_loss_mlp": 0.01023396, "balance_loss_clip": 1.05036473, "balance_loss_mlp": 1.01603508, "epoch": 0.651716467263873, "flos": 27818955250560.0, "grad_norm": 1.9222445285583931, "language_loss": 0.61628288, "learning_rate": 1.1429757980239115e-06, "loss": 0.63796973, "num_input_tokens_seen": 116694305, "step": 5420, "time_per_iteration": 2.5443460941314697 }, { "auxiliary_loss_clip": 0.01171686, "auxiliary_loss_mlp": 0.01032989, "balance_loss_clip": 1.049119, "balance_loss_mlp": 1.02485895, "epoch": 0.6518367101545122, "flos": 24316300414080.0, "grad_norm": 2.2601579156734855, "language_loss": 0.81650531, "learning_rate": 1.1422720340875636e-06, "loss": 0.838552, "num_input_tokens_seen": 116713055, "step": 5421, "time_per_iteration": 2.431621789932251 }, { "auxiliary_loss_clip": 0.01162808, "auxiliary_loss_mlp": 0.01026526, "balance_loss_clip": 1.04787552, "balance_loss_mlp": 1.01940084, "epoch": 0.6519569530451512, "flos": 20011939971840.0, "grad_norm": 1.9156298586850238, "language_loss": 0.79172671, "learning_rate": 1.1415684002678671e-06, "loss": 0.81362003, "num_input_tokens_seen": 116731815, "step": 5422, "time_per_iteration": 2.43428373336792 }, { "auxiliary_loss_clip": 0.01146, "auxiliary_loss_mlp": 0.01030684, "balance_loss_clip": 1.04561782, "balance_loss_mlp": 1.0227654, "epoch": 0.6520771959357903, "flos": 21576064682880.0, "grad_norm": 2.4552864604426814, "language_loss": 0.77722645, "learning_rate": 1.1408648966715617e-06, "loss": 0.79899329, "num_input_tokens_seen": 116749335, "step": 5423, "time_per_iteration": 2.48447585105896 }, { "auxiliary_loss_clip": 0.01142831, "auxiliary_loss_mlp": 0.01028542, "balance_loss_clip": 1.04315734, "balance_loss_mlp": 1.02101958, "epoch": 0.6521974388264293, "flos": 22711021695360.0, "grad_norm": 1.7686189723053143, "language_loss": 0.72463131, "learning_rate": 1.1401615234053683e-06, "loss": 0.74634498, "num_input_tokens_seen": 116768155, "step": 5424, "time_per_iteration": 2.475473403930664 }, { "auxiliary_loss_clip": 0.01144117, "auxiliary_loss_mlp": 0.01029075, "balance_loss_clip": 1.04617238, "balance_loss_mlp": 1.02150536, "epoch": 0.6523176817170685, "flos": 23002939526400.0, "grad_norm": 2.013088659868647, "language_loss": 0.75732207, "learning_rate": 1.1394582805759885e-06, "loss": 0.77905393, "num_input_tokens_seen": 116787435, "step": 5425, "time_per_iteration": 2.492194652557373 }, { "auxiliary_loss_clip": 0.01158779, "auxiliary_loss_mlp": 0.01029036, "balance_loss_clip": 1.04983258, "balance_loss_mlp": 1.02158237, "epoch": 0.6524379246077076, "flos": 21688249835520.0, "grad_norm": 1.6860771855964514, "language_loss": 0.75490665, "learning_rate": 1.1387551682901022e-06, "loss": 0.77678478, "num_input_tokens_seen": 116808040, "step": 5426, "time_per_iteration": 2.4582362174987793 }, { "auxiliary_loss_clip": 0.01126798, "auxiliary_loss_mlp": 0.01025834, "balance_loss_clip": 1.04531419, "balance_loss_mlp": 1.01857066, "epoch": 0.6525581674983466, "flos": 19390936711680.0, "grad_norm": 3.468677449449453, "language_loss": 0.70685619, "learning_rate": 1.138052186654373e-06, "loss": 0.72838247, "num_input_tokens_seen": 116825510, "step": 5427, "time_per_iteration": 2.504934310913086 }, { "auxiliary_loss_clip": 0.01146192, "auxiliary_loss_mlp": 0.01023189, "balance_loss_clip": 1.04741454, "balance_loss_mlp": 1.01493907, "epoch": 0.6526784103889858, "flos": 17165444832000.0, "grad_norm": 2.2006353470320477, "language_loss": 0.88082296, "learning_rate": 1.1373493357754417e-06, "loss": 0.90251678, "num_input_tokens_seen": 116844415, "step": 5428, "time_per_iteration": 2.456697940826416 }, { "auxiliary_loss_clip": 0.01169934, "auxiliary_loss_mlp": 0.01021271, "balance_loss_clip": 1.04770613, "balance_loss_mlp": 1.01436329, "epoch": 0.6527986532796248, "flos": 18989168112000.0, "grad_norm": 1.7605706431425618, "language_loss": 0.77212018, "learning_rate": 1.1366466157599303e-06, "loss": 0.79403222, "num_input_tokens_seen": 116863690, "step": 5429, "time_per_iteration": 3.1312551498413086 }, { "auxiliary_loss_clip": 0.01111853, "auxiliary_loss_mlp": 0.00762844, "balance_loss_clip": 1.04262304, "balance_loss_mlp": 1.00061369, "epoch": 0.6529188961702639, "flos": 14238581011200.0, "grad_norm": 2.2214907247739797, "language_loss": 0.76363444, "learning_rate": 1.1359440267144412e-06, "loss": 0.78238142, "num_input_tokens_seen": 116881145, "step": 5430, "time_per_iteration": 2.5234646797180176 }, { "auxiliary_loss_clip": 0.01159768, "auxiliary_loss_mlp": 0.01022424, "balance_loss_clip": 1.04864776, "balance_loss_mlp": 1.0154984, "epoch": 0.653039139060903, "flos": 36682929158400.0, "grad_norm": 1.9871571570191675, "language_loss": 0.74024165, "learning_rate": 1.1352415687455556e-06, "loss": 0.76206356, "num_input_tokens_seen": 116902405, "step": 5431, "time_per_iteration": 2.5878005027770996 }, { "auxiliary_loss_clip": 0.01158132, "auxiliary_loss_mlp": 0.01029207, "balance_loss_clip": 1.04976785, "balance_loss_mlp": 1.02188492, "epoch": 0.6531593819515421, "flos": 25376275785600.0, "grad_norm": 2.3825868980395932, "language_loss": 0.63941169, "learning_rate": 1.1345392419598362e-06, "loss": 0.66128504, "num_input_tokens_seen": 116921285, "step": 5432, "time_per_iteration": 3.373856782913208 }, { "auxiliary_loss_clip": 0.01151061, "auxiliary_loss_mlp": 0.01020679, "balance_loss_clip": 1.04601049, "balance_loss_mlp": 1.01269817, "epoch": 0.6532796248421812, "flos": 21178533888000.0, "grad_norm": 1.6477537244735991, "language_loss": 0.71806282, "learning_rate": 1.1338370464638263e-06, "loss": 0.73978025, "num_input_tokens_seen": 116940685, "step": 5433, "time_per_iteration": 3.294677495956421 }, { "auxiliary_loss_clip": 0.01169666, "auxiliary_loss_mlp": 0.01020712, "balance_loss_clip": 1.04741144, "balance_loss_mlp": 1.01335645, "epoch": 0.6533998677328203, "flos": 17675950878720.0, "grad_norm": 2.336478210524243, "language_loss": 0.64025503, "learning_rate": 1.1331349823640474e-06, "loss": 0.66215879, "num_input_tokens_seen": 116958115, "step": 5434, "time_per_iteration": 2.3974545001983643 }, { "auxiliary_loss_clip": 0.01157395, "auxiliary_loss_mlp": 0.00761877, "balance_loss_clip": 1.04690075, "balance_loss_mlp": 1.00064099, "epoch": 0.6535201106234594, "flos": 28400384701440.0, "grad_norm": 2.3189662681660304, "language_loss": 0.78374422, "learning_rate": 1.132433049767003e-06, "loss": 0.80293697, "num_input_tokens_seen": 116976030, "step": 5435, "time_per_iteration": 3.2574713230133057 }, { "auxiliary_loss_clip": 0.01141604, "auxiliary_loss_mlp": 0.01025959, "balance_loss_clip": 1.04689407, "balance_loss_mlp": 1.01935434, "epoch": 0.6536403535140984, "flos": 23586667447680.0, "grad_norm": 1.5918954219305315, "language_loss": 0.80939412, "learning_rate": 1.1317312487791748e-06, "loss": 0.83106983, "num_input_tokens_seen": 116997680, "step": 5436, "time_per_iteration": 2.504192352294922 }, { "auxiliary_loss_clip": 0.01151214, "auxiliary_loss_mlp": 0.01027757, "balance_loss_clip": 1.04646468, "balance_loss_mlp": 1.02033031, "epoch": 0.6537605964047376, "flos": 21579476474880.0, "grad_norm": 3.93432374522333, "language_loss": 0.73504812, "learning_rate": 1.1310295795070253e-06, "loss": 0.75683784, "num_input_tokens_seen": 117017620, "step": 5437, "time_per_iteration": 2.4524824619293213 }, { "auxiliary_loss_clip": 0.01119438, "auxiliary_loss_mlp": 0.01028518, "balance_loss_clip": 1.04329741, "balance_loss_mlp": 1.02091265, "epoch": 0.6538808392953767, "flos": 26833997433600.0, "grad_norm": 1.8605797673593854, "language_loss": 0.80882508, "learning_rate": 1.1303280420569982e-06, "loss": 0.83030462, "num_input_tokens_seen": 117039505, "step": 5438, "time_per_iteration": 2.5812337398529053 }, { "auxiliary_loss_clip": 0.01152315, "auxiliary_loss_mlp": 0.01023282, "balance_loss_clip": 1.04673338, "balance_loss_mlp": 1.01579022, "epoch": 0.6540010821860157, "flos": 30738241301760.0, "grad_norm": 1.6635113604138412, "language_loss": 0.77096975, "learning_rate": 1.1296266365355158e-06, "loss": 0.79272574, "num_input_tokens_seen": 117062890, "step": 5439, "time_per_iteration": 2.53440260887146 }, { "auxiliary_loss_clip": 0.01131821, "auxiliary_loss_mlp": 0.01022346, "balance_loss_clip": 1.0462091, "balance_loss_mlp": 1.01437128, "epoch": 0.6541213250766549, "flos": 26907147480960.0, "grad_norm": 1.8552373161488265, "language_loss": 0.73964345, "learning_rate": 1.1289253630489806e-06, "loss": 0.76118511, "num_input_tokens_seen": 117083940, "step": 5440, "time_per_iteration": 2.552569627761841 }, { "auxiliary_loss_clip": 0.01161062, "auxiliary_loss_mlp": 0.01029219, "balance_loss_clip": 1.04713321, "balance_loss_mlp": 1.02129793, "epoch": 0.6542415679672939, "flos": 19172384409600.0, "grad_norm": 2.275298212771805, "language_loss": 0.72369605, "learning_rate": 1.1282242217037753e-06, "loss": 0.74559891, "num_input_tokens_seen": 117101440, "step": 5441, "time_per_iteration": 2.428621530532837 }, { "auxiliary_loss_clip": 0.01110524, "auxiliary_loss_mlp": 0.01024917, "balance_loss_clip": 1.03989649, "balance_loss_mlp": 1.01717472, "epoch": 0.654361810857933, "flos": 48173517100800.0, "grad_norm": 3.4533814683365014, "language_loss": 0.61571622, "learning_rate": 1.127523212606262e-06, "loss": 0.63707066, "num_input_tokens_seen": 117124265, "step": 5442, "time_per_iteration": 2.7673609256744385 }, { "auxiliary_loss_clip": 0.01156203, "auxiliary_loss_mlp": 0.01026603, "balance_loss_clip": 1.04887748, "balance_loss_mlp": 1.01908684, "epoch": 0.6544820537485722, "flos": 26943165843840.0, "grad_norm": 1.5330886653022804, "language_loss": 0.72986728, "learning_rate": 1.1268223358627835e-06, "loss": 0.75169533, "num_input_tokens_seen": 117146755, "step": 5443, "time_per_iteration": 2.502192974090576 }, { "auxiliary_loss_clip": 0.01170755, "auxiliary_loss_mlp": 0.01025362, "balance_loss_clip": 1.04827738, "balance_loss_mlp": 1.01768541, "epoch": 0.6546022966392112, "flos": 20886328748160.0, "grad_norm": 1.7444612670351287, "language_loss": 0.72069454, "learning_rate": 1.126121591579663e-06, "loss": 0.74265575, "num_input_tokens_seen": 117165960, "step": 5444, "time_per_iteration": 2.4083096981048584 }, { "auxiliary_loss_clip": 0.01153383, "auxiliary_loss_mlp": 0.01026073, "balance_loss_clip": 1.04902899, "balance_loss_mlp": 1.01859808, "epoch": 0.6547225395298503, "flos": 24936693143040.0, "grad_norm": 1.52777908215882, "language_loss": 0.68855822, "learning_rate": 1.1254209798632018e-06, "loss": 0.71035278, "num_input_tokens_seen": 117186980, "step": 5445, "time_per_iteration": 2.481923818588257 }, { "auxiliary_loss_clip": 0.01088637, "auxiliary_loss_mlp": 0.01022439, "balance_loss_clip": 1.0399189, "balance_loss_mlp": 1.01489317, "epoch": 0.6548427824204894, "flos": 22565942663040.0, "grad_norm": 1.5885633187903272, "language_loss": 0.84326208, "learning_rate": 1.124720500819683e-06, "loss": 0.86437285, "num_input_tokens_seen": 117205135, "step": 5446, "time_per_iteration": 2.578939199447632 }, { "auxiliary_loss_clip": 0.01175364, "auxiliary_loss_mlp": 0.01029573, "balance_loss_clip": 1.05280805, "balance_loss_mlp": 1.02147877, "epoch": 0.6549630253111285, "flos": 18442500048000.0, "grad_norm": 1.8464784197272823, "language_loss": 0.82653767, "learning_rate": 1.1240201545553682e-06, "loss": 0.84858704, "num_input_tokens_seen": 117222935, "step": 5447, "time_per_iteration": 2.41939640045166 }, { "auxiliary_loss_clip": 0.01127309, "auxiliary_loss_mlp": 0.01024715, "balance_loss_clip": 1.04546309, "balance_loss_mlp": 1.01740122, "epoch": 0.6550832682017675, "flos": 25187313312000.0, "grad_norm": 1.7592498599445936, "language_loss": 0.73017538, "learning_rate": 1.1233199411764987e-06, "loss": 0.75169563, "num_input_tokens_seen": 117242370, "step": 5448, "time_per_iteration": 2.546719789505005 }, { "auxiliary_loss_clip": 0.01116508, "auxiliary_loss_mlp": 0.01026089, "balance_loss_clip": 1.0417757, "balance_loss_mlp": 1.01867139, "epoch": 0.6552035110924067, "flos": 22748153379840.0, "grad_norm": 1.7906602316990663, "language_loss": 0.68703872, "learning_rate": 1.1226198607892978e-06, "loss": 0.70846474, "num_input_tokens_seen": 117262930, "step": 5449, "time_per_iteration": 2.534228801727295 }, { "auxiliary_loss_clip": 0.01120039, "auxiliary_loss_mlp": 0.01025884, "balance_loss_clip": 1.04696107, "balance_loss_mlp": 1.01824594, "epoch": 0.6553237539830458, "flos": 21799178012160.0, "grad_norm": 1.7359555512365068, "language_loss": 0.79595459, "learning_rate": 1.1219199134999664e-06, "loss": 0.81741381, "num_input_tokens_seen": 117281430, "step": 5450, "time_per_iteration": 2.5475001335144043 }, { "auxiliary_loss_clip": 0.01145622, "auxiliary_loss_mlp": 0.0102828, "balance_loss_clip": 1.04718912, "balance_loss_mlp": 1.01979852, "epoch": 0.6554439968736848, "flos": 20887226588160.0, "grad_norm": 2.2396766975448985, "language_loss": 0.78496528, "learning_rate": 1.1212200994146863e-06, "loss": 0.80670428, "num_input_tokens_seen": 117299185, "step": 5451, "time_per_iteration": 2.4714713096618652 }, { "auxiliary_loss_clip": 0.01125096, "auxiliary_loss_mlp": 0.01029435, "balance_loss_clip": 1.04054236, "balance_loss_mlp": 1.02187109, "epoch": 0.655564239764324, "flos": 16139045698560.0, "grad_norm": 2.603008830866016, "language_loss": 0.75873518, "learning_rate": 1.120520418639618e-06, "loss": 0.78028047, "num_input_tokens_seen": 117317720, "step": 5452, "time_per_iteration": 2.5121986865997314 }, { "auxiliary_loss_clip": 0.01157432, "auxiliary_loss_mlp": 0.01028738, "balance_loss_clip": 1.05039299, "balance_loss_mlp": 1.02167535, "epoch": 0.655684482654963, "flos": 29570354496000.0, "grad_norm": 1.9105713941414475, "language_loss": 0.83696365, "learning_rate": 1.119820871280903e-06, "loss": 0.85882533, "num_input_tokens_seen": 117338795, "step": 5453, "time_per_iteration": 2.527688503265381 }, { "auxiliary_loss_clip": 0.01155859, "auxiliary_loss_mlp": 0.01024572, "balance_loss_clip": 1.04797316, "balance_loss_mlp": 1.01706171, "epoch": 0.6558047255456021, "flos": 29789409588480.0, "grad_norm": 4.816395132912529, "language_loss": 0.73438352, "learning_rate": 1.1191214574446614e-06, "loss": 0.75618786, "num_input_tokens_seen": 117359040, "step": 5454, "time_per_iteration": 2.5190632343292236 }, { "auxiliary_loss_clip": 0.01137683, "auxiliary_loss_mlp": 0.01026661, "balance_loss_clip": 1.04529715, "balance_loss_mlp": 1.01910353, "epoch": 0.6559249684362413, "flos": 29059166090880.0, "grad_norm": 2.3987333741800696, "language_loss": 0.79803085, "learning_rate": 1.118422177236995e-06, "loss": 0.81967425, "num_input_tokens_seen": 117380865, "step": 5455, "time_per_iteration": 2.5466363430023193 }, { "auxiliary_loss_clip": 0.0114459, "auxiliary_loss_mlp": 0.01031501, "balance_loss_clip": 1.04652262, "balance_loss_mlp": 1.02335334, "epoch": 0.6560452113268803, "flos": 20225464369920.0, "grad_norm": 1.943398448947495, "language_loss": 0.85624105, "learning_rate": 1.1177230307639835e-06, "loss": 0.87800193, "num_input_tokens_seen": 117398405, "step": 5456, "time_per_iteration": 3.2690234184265137 }, { "auxiliary_loss_clip": 0.01124994, "auxiliary_loss_mlp": 0.01025436, "balance_loss_clip": 1.04343605, "balance_loss_mlp": 1.01791942, "epoch": 0.6561654542175194, "flos": 25045538330880.0, "grad_norm": 1.663899665356716, "language_loss": 0.7848624, "learning_rate": 1.1170240181316865e-06, "loss": 0.80636674, "num_input_tokens_seen": 117419850, "step": 5457, "time_per_iteration": 2.5490152835845947 }, { "auxiliary_loss_clip": 0.01124641, "auxiliary_loss_mlp": 0.01028031, "balance_loss_clip": 1.04216897, "balance_loss_mlp": 1.0201993, "epoch": 0.6562856971081584, "flos": 22856711258880.0, "grad_norm": 2.726734622191998, "language_loss": 0.79474413, "learning_rate": 1.1163251394461442e-06, "loss": 0.81627089, "num_input_tokens_seen": 117438330, "step": 5458, "time_per_iteration": 2.506521701812744 }, { "auxiliary_loss_clip": 0.01154774, "auxiliary_loss_mlp": 0.01026847, "balance_loss_clip": 1.04819155, "balance_loss_mlp": 1.01946175, "epoch": 0.6564059399987976, "flos": 18872565586560.0, "grad_norm": 1.91155008434475, "language_loss": 0.82672256, "learning_rate": 1.1156263948133746e-06, "loss": 0.84853876, "num_input_tokens_seen": 117454985, "step": 5459, "time_per_iteration": 3.284507989883423 }, { "auxiliary_loss_clip": 0.01108252, "auxiliary_loss_mlp": 0.00762642, "balance_loss_clip": 1.04414308, "balance_loss_mlp": 1.00072885, "epoch": 0.6565261828894366, "flos": 25484187219840.0, "grad_norm": 2.238904793705515, "language_loss": 0.77235126, "learning_rate": 1.1149277843393787e-06, "loss": 0.79106021, "num_input_tokens_seen": 117476145, "step": 5460, "time_per_iteration": 3.3520314693450928 }, { "auxiliary_loss_clip": 0.01095583, "auxiliary_loss_mlp": 0.00762707, "balance_loss_clip": 1.0371139, "balance_loss_mlp": 1.00071025, "epoch": 0.6566464257800757, "flos": 19683500987520.0, "grad_norm": 2.47829113441089, "language_loss": 0.63581449, "learning_rate": 1.1142293081301342e-06, "loss": 0.65439737, "num_input_tokens_seen": 117494025, "step": 5461, "time_per_iteration": 2.589726686477661 }, { "auxiliary_loss_clip": 0.01138806, "auxiliary_loss_mlp": 0.01020857, "balance_loss_clip": 1.04576254, "balance_loss_mlp": 1.01400793, "epoch": 0.6567666686707149, "flos": 23514127931520.0, "grad_norm": 1.7557402581691772, "language_loss": 0.68078631, "learning_rate": 1.1135309662915995e-06, "loss": 0.70238292, "num_input_tokens_seen": 117514190, "step": 5462, "time_per_iteration": 3.3392550945281982 }, { "auxiliary_loss_clip": 0.01121384, "auxiliary_loss_mlp": 0.01025404, "balance_loss_clip": 1.04316413, "balance_loss_mlp": 1.01804876, "epoch": 0.6568869115613539, "flos": 32781342896640.0, "grad_norm": 2.0053983555877184, "language_loss": 0.60525131, "learning_rate": 1.112832758929712e-06, "loss": 0.62671912, "num_input_tokens_seen": 117536800, "step": 5463, "time_per_iteration": 2.649257183074951 }, { "auxiliary_loss_clip": 0.01155649, "auxiliary_loss_mlp": 0.01031179, "balance_loss_clip": 1.04928041, "balance_loss_mlp": 1.02368033, "epoch": 0.657007154451993, "flos": 18442428220800.0, "grad_norm": 1.8745029126214443, "language_loss": 0.74964637, "learning_rate": 1.11213468615039e-06, "loss": 0.77151465, "num_input_tokens_seen": 117556230, "step": 5464, "time_per_iteration": 2.450623035430908 }, { "auxiliary_loss_clip": 0.01100681, "auxiliary_loss_mlp": 0.01026109, "balance_loss_clip": 1.04264569, "balance_loss_mlp": 1.01894748, "epoch": 0.6571273973426321, "flos": 25156717902720.0, "grad_norm": 1.4900733433035265, "language_loss": 0.75171518, "learning_rate": 1.1114367480595292e-06, "loss": 0.77298307, "num_input_tokens_seen": 117577310, "step": 5465, "time_per_iteration": 2.615804433822632 }, { "auxiliary_loss_clip": 0.01101774, "auxiliary_loss_mlp": 0.01030364, "balance_loss_clip": 1.04615283, "balance_loss_mlp": 1.02239525, "epoch": 0.6572476402332712, "flos": 17529830352000.0, "grad_norm": 1.7735486969208059, "language_loss": 0.81244195, "learning_rate": 1.1107389447630086e-06, "loss": 0.8337633, "num_input_tokens_seen": 117596010, "step": 5466, "time_per_iteration": 2.557216167449951 }, { "auxiliary_loss_clip": 0.01137687, "auxiliary_loss_mlp": 0.00761851, "balance_loss_clip": 1.04379416, "balance_loss_mlp": 1.00078666, "epoch": 0.6573678831239103, "flos": 17014260487680.0, "grad_norm": 1.9922111222417023, "language_loss": 0.78405643, "learning_rate": 1.1100412763666818e-06, "loss": 0.80305183, "num_input_tokens_seen": 117611270, "step": 5467, "time_per_iteration": 2.4717767238616943 }, { "auxiliary_loss_clip": 0.01144888, "auxiliary_loss_mlp": 0.01023207, "balance_loss_clip": 1.04768229, "balance_loss_mlp": 1.01533329, "epoch": 0.6574881260145494, "flos": 23910078528000.0, "grad_norm": 1.4934403424944827, "language_loss": 0.80037481, "learning_rate": 1.1093437429763865e-06, "loss": 0.82205576, "num_input_tokens_seen": 117631535, "step": 5468, "time_per_iteration": 2.519512414932251 }, { "auxiliary_loss_clip": 0.01157556, "auxiliary_loss_mlp": 0.01019966, "balance_loss_clip": 1.0499711, "balance_loss_mlp": 1.01305163, "epoch": 0.6576083689051885, "flos": 11218458504960.0, "grad_norm": 2.173774977131815, "language_loss": 0.73243797, "learning_rate": 1.1086463446979361e-06, "loss": 0.75421321, "num_input_tokens_seen": 117649885, "step": 5469, "time_per_iteration": 2.473111629486084 }, { "auxiliary_loss_clip": 0.01161031, "auxiliary_loss_mlp": 0.01024323, "balance_loss_clip": 1.05172396, "balance_loss_mlp": 1.01703048, "epoch": 0.6577286117958275, "flos": 22455553190400.0, "grad_norm": 1.756797979653362, "language_loss": 0.77532804, "learning_rate": 1.1079490816371277e-06, "loss": 0.79718161, "num_input_tokens_seen": 117669650, "step": 5470, "time_per_iteration": 2.466010808944702 }, { "auxiliary_loss_clip": 0.01158691, "auxiliary_loss_mlp": 0.00762291, "balance_loss_clip": 1.04814124, "balance_loss_mlp": 1.00074315, "epoch": 0.6578488546864667, "flos": 21872184405120.0, "grad_norm": 2.3502520897543393, "language_loss": 0.75057077, "learning_rate": 1.1072519538997352e-06, "loss": 0.76978064, "num_input_tokens_seen": 117688790, "step": 5471, "time_per_iteration": 2.4627366065979004 }, { "auxiliary_loss_clip": 0.01144903, "auxiliary_loss_mlp": 0.01023788, "balance_loss_clip": 1.04452133, "balance_loss_mlp": 1.01660573, "epoch": 0.6579690975771058, "flos": 23543753673600.0, "grad_norm": 1.744828416464258, "language_loss": 0.82241738, "learning_rate": 1.1065549615915095e-06, "loss": 0.84410429, "num_input_tokens_seen": 117708620, "step": 5472, "time_per_iteration": 2.516986608505249 }, { "auxiliary_loss_clip": 0.01161106, "auxiliary_loss_mlp": 0.0102833, "balance_loss_clip": 1.0532831, "balance_loss_mlp": 1.0206176, "epoch": 0.6580893404677448, "flos": 32743995730560.0, "grad_norm": 2.1604951934973347, "language_loss": 0.78304708, "learning_rate": 1.105858104818187e-06, "loss": 0.80494142, "num_input_tokens_seen": 117729775, "step": 5473, "time_per_iteration": 2.5387651920318604 }, { "auxiliary_loss_clip": 0.01161532, "auxiliary_loss_mlp": 0.01025005, "balance_loss_clip": 1.04977083, "balance_loss_mlp": 1.01672578, "epoch": 0.658209583358384, "flos": 15888138220800.0, "grad_norm": 6.193159378752744, "language_loss": 0.74840915, "learning_rate": 1.105161383685478e-06, "loss": 0.77027452, "num_input_tokens_seen": 117746160, "step": 5474, "time_per_iteration": 2.4260239601135254 }, { "auxiliary_loss_clip": 0.01042022, "auxiliary_loss_mlp": 0.01001845, "balance_loss_clip": 1.01838005, "balance_loss_mlp": 1.00092697, "epoch": 0.658329826249023, "flos": 62695902447360.0, "grad_norm": 0.7297184140266282, "language_loss": 0.56311542, "learning_rate": 1.1044647982990771e-06, "loss": 0.58355409, "num_input_tokens_seen": 117808045, "step": 5475, "time_per_iteration": 3.0598082542419434 }, { "auxiliary_loss_clip": 0.01145704, "auxiliary_loss_mlp": 0.01026571, "balance_loss_clip": 1.04793811, "balance_loss_mlp": 1.01879227, "epoch": 0.6584500691396621, "flos": 31722624501120.0, "grad_norm": 2.511164672358488, "language_loss": 0.64645272, "learning_rate": 1.1037683487646536e-06, "loss": 0.66817558, "num_input_tokens_seen": 117828330, "step": 5476, "time_per_iteration": 2.5600411891937256 }, { "auxiliary_loss_clip": 0.01142695, "auxiliary_loss_mlp": 0.00762484, "balance_loss_clip": 1.04982102, "balance_loss_mlp": 1.00066936, "epoch": 0.6585703120303013, "flos": 18406086635520.0, "grad_norm": 2.104784881122586, "language_loss": 0.77099991, "learning_rate": 1.1030720351878583e-06, "loss": 0.79005164, "num_input_tokens_seen": 117846450, "step": 5477, "time_per_iteration": 2.44301700592041 }, { "auxiliary_loss_clip": 0.01054978, "auxiliary_loss_mlp": 0.01001845, "balance_loss_clip": 1.01835775, "balance_loss_mlp": 1.00098693, "epoch": 0.6586905549209403, "flos": 58309880434560.0, "grad_norm": 0.8136869411179377, "language_loss": 0.57718468, "learning_rate": 1.102375857674323e-06, "loss": 0.59775293, "num_input_tokens_seen": 117908365, "step": 5478, "time_per_iteration": 3.032545804977417 }, { "auxiliary_loss_clip": 0.01143578, "auxiliary_loss_mlp": 0.01024578, "balance_loss_clip": 1.04626369, "balance_loss_mlp": 1.01717186, "epoch": 0.6588107978115794, "flos": 22782627457920.0, "grad_norm": 1.7751988157288778, "language_loss": 0.9038468, "learning_rate": 1.1016798163296561e-06, "loss": 0.92552835, "num_input_tokens_seen": 117927565, "step": 5479, "time_per_iteration": 2.505192756652832 }, { "auxiliary_loss_clip": 0.01160699, "auxiliary_loss_mlp": 0.01021794, "balance_loss_clip": 1.0491817, "balance_loss_mlp": 1.01402748, "epoch": 0.6589310407022185, "flos": 20667525050880.0, "grad_norm": 1.8999939411733766, "language_loss": 0.66394627, "learning_rate": 1.1009839112594471e-06, "loss": 0.68577117, "num_input_tokens_seen": 117945590, "step": 5480, "time_per_iteration": 2.445777177810669 }, { "auxiliary_loss_clip": 0.01160084, "auxiliary_loss_mlp": 0.01029979, "balance_loss_clip": 1.04912531, "balance_loss_mlp": 1.02225971, "epoch": 0.6590512835928576, "flos": 25630595055360.0, "grad_norm": 2.1081587316969688, "language_loss": 0.71923256, "learning_rate": 1.1002881425692638e-06, "loss": 0.74113327, "num_input_tokens_seen": 117966020, "step": 5481, "time_per_iteration": 2.4979910850524902 }, { "auxiliary_loss_clip": 0.01151807, "auxiliary_loss_mlp": 0.0102442, "balance_loss_clip": 1.04621863, "balance_loss_mlp": 1.01660919, "epoch": 0.6591715264834966, "flos": 23726108044800.0, "grad_norm": 1.6499540453889339, "language_loss": 0.75149906, "learning_rate": 1.0995925103646532e-06, "loss": 0.77326131, "num_input_tokens_seen": 117984620, "step": 5482, "time_per_iteration": 3.2330353260040283 }, { "auxiliary_loss_clip": 0.01126224, "auxiliary_loss_mlp": 0.0102355, "balance_loss_clip": 1.0471046, "balance_loss_mlp": 1.01577759, "epoch": 0.6592917693741358, "flos": 35773850822400.0, "grad_norm": 1.7927925427408702, "language_loss": 0.66550684, "learning_rate": 1.0988970147511437e-06, "loss": 0.68700463, "num_input_tokens_seen": 118006500, "step": 5483, "time_per_iteration": 2.6555371284484863 }, { "auxiliary_loss_clip": 0.01144911, "auxiliary_loss_mlp": 0.01027313, "balance_loss_clip": 1.04939961, "balance_loss_mlp": 1.01949286, "epoch": 0.6594120122647749, "flos": 21396834794880.0, "grad_norm": 5.828404758710859, "language_loss": 0.80168962, "learning_rate": 1.0982016558342405e-06, "loss": 0.82341182, "num_input_tokens_seen": 118025470, "step": 5484, "time_per_iteration": 2.4823062419891357 }, { "auxiliary_loss_clip": 0.01173003, "auxiliary_loss_mlp": 0.01022017, "balance_loss_clip": 1.05158246, "balance_loss_mlp": 1.01477814, "epoch": 0.6595322551554139, "flos": 19351829779200.0, "grad_norm": 2.092667029019394, "language_loss": 0.71033549, "learning_rate": 1.0975064337194291e-06, "loss": 0.73228574, "num_input_tokens_seen": 118043515, "step": 5485, "time_per_iteration": 2.4008774757385254 }, { "auxiliary_loss_clip": 0.01127299, "auxiliary_loss_mlp": 0.01037349, "balance_loss_clip": 1.0471586, "balance_loss_mlp": 1.0298183, "epoch": 0.6596524980460531, "flos": 16837113588480.0, "grad_norm": 2.6069982098550684, "language_loss": 0.7033574, "learning_rate": 1.0968113485121743e-06, "loss": 0.72500384, "num_input_tokens_seen": 118063105, "step": 5486, "time_per_iteration": 3.3639791011810303 }, { "auxiliary_loss_clip": 0.01157822, "auxiliary_loss_mlp": 0.0076278, "balance_loss_clip": 1.04614782, "balance_loss_mlp": 1.00071931, "epoch": 0.6597727409366921, "flos": 21798567480960.0, "grad_norm": 1.727159501699965, "language_loss": 0.8022033, "learning_rate": 1.0961164003179185e-06, "loss": 0.82140934, "num_input_tokens_seen": 118081615, "step": 5487, "time_per_iteration": 3.3240482807159424 }, { "auxiliary_loss_clip": 0.01128229, "auxiliary_loss_mlp": 0.01026839, "balance_loss_clip": 1.04485822, "balance_loss_mlp": 1.01906073, "epoch": 0.6598929838273312, "flos": 23730704985600.0, "grad_norm": 1.7684741516972529, "language_loss": 0.84352738, "learning_rate": 1.0954215892420884e-06, "loss": 0.86507809, "num_input_tokens_seen": 118102315, "step": 5488, "time_per_iteration": 2.5419845581054688 }, { "auxiliary_loss_clip": 0.01134156, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.04790759, "balance_loss_mlp": 1.02376807, "epoch": 0.6600132267179702, "flos": 19974520978560.0, "grad_norm": 1.7655967211589083, "language_loss": 0.70552593, "learning_rate": 1.094726915390082e-06, "loss": 0.727189, "num_input_tokens_seen": 118120650, "step": 5489, "time_per_iteration": 3.272987127304077 }, { "auxiliary_loss_clip": 0.01159659, "auxiliary_loss_mlp": 0.01026516, "balance_loss_clip": 1.04994094, "balance_loss_mlp": 1.01894069, "epoch": 0.6601334696086094, "flos": 22342649765760.0, "grad_norm": 1.7514712984569456, "language_loss": 0.69936025, "learning_rate": 1.0940323788672836e-06, "loss": 0.72122204, "num_input_tokens_seen": 118139825, "step": 5490, "time_per_iteration": 2.4691755771636963 }, { "auxiliary_loss_clip": 0.01154527, "auxiliary_loss_mlp": 0.01023903, "balance_loss_clip": 1.04891181, "balance_loss_mlp": 1.0162468, "epoch": 0.6602537124992485, "flos": 25703098657920.0, "grad_norm": 1.5914370212075002, "language_loss": 0.73699164, "learning_rate": 1.0933379797790522e-06, "loss": 0.75877589, "num_input_tokens_seen": 118159240, "step": 5491, "time_per_iteration": 2.4835219383239746 }, { "auxiliary_loss_clip": 0.0117339, "auxiliary_loss_mlp": 0.01027736, "balance_loss_clip": 1.0516032, "balance_loss_mlp": 1.01980305, "epoch": 0.6603739553898875, "flos": 25848572739840.0, "grad_norm": 2.181969012948015, "language_loss": 0.7085371, "learning_rate": 1.0926437182307293e-06, "loss": 0.73054838, "num_input_tokens_seen": 118178050, "step": 5492, "time_per_iteration": 2.4552559852600098 }, { "auxiliary_loss_clip": 0.01147338, "auxiliary_loss_mlp": 0.01025934, "balance_loss_clip": 1.0467391, "balance_loss_mlp": 1.0182209, "epoch": 0.6604941982805267, "flos": 24570296461440.0, "grad_norm": 1.7549829047717922, "language_loss": 0.77862883, "learning_rate": 1.0919495943276338e-06, "loss": 0.80036157, "num_input_tokens_seen": 118199070, "step": 5493, "time_per_iteration": 2.5302581787109375 }, { "auxiliary_loss_clip": 0.01131722, "auxiliary_loss_mlp": 0.01025673, "balance_loss_clip": 1.04236674, "balance_loss_mlp": 1.01735234, "epoch": 0.6606144411711657, "flos": 13261775581440.0, "grad_norm": 3.1017645903864124, "language_loss": 0.7631433, "learning_rate": 1.0912556081750611e-06, "loss": 0.7847172, "num_input_tokens_seen": 118217000, "step": 5494, "time_per_iteration": 2.5310869216918945 }, { "auxiliary_loss_clip": 0.01142509, "auxiliary_loss_mlp": 0.01026035, "balance_loss_clip": 1.04902244, "balance_loss_mlp": 1.01875758, "epoch": 0.6607346840618048, "flos": 25155281358720.0, "grad_norm": 1.9716868875742357, "language_loss": 0.76578814, "learning_rate": 1.0905617598782909e-06, "loss": 0.78747356, "num_input_tokens_seen": 118237205, "step": 5495, "time_per_iteration": 2.502443313598633 }, { "auxiliary_loss_clip": 0.01108803, "auxiliary_loss_mlp": 0.01026434, "balance_loss_clip": 1.04394794, "balance_loss_mlp": 1.01927519, "epoch": 0.660854926952444, "flos": 17638029095040.0, "grad_norm": 3.1336762055953695, "language_loss": 0.80862933, "learning_rate": 1.0898680495425775e-06, "loss": 0.82998168, "num_input_tokens_seen": 118255495, "step": 5496, "time_per_iteration": 2.5191776752471924 }, { "auxiliary_loss_clip": 0.01147984, "auxiliary_loss_mlp": 0.01027016, "balance_loss_clip": 1.04920626, "balance_loss_mlp": 1.01960707, "epoch": 0.660975169843083, "flos": 16836000266880.0, "grad_norm": 1.627412866153545, "language_loss": 0.80131406, "learning_rate": 1.0891744772731594e-06, "loss": 0.82306409, "num_input_tokens_seen": 118273310, "step": 5497, "time_per_iteration": 2.4517807960510254 }, { "auxiliary_loss_clip": 0.01159899, "auxiliary_loss_mlp": 0.01030258, "balance_loss_clip": 1.0488503, "balance_loss_mlp": 1.02308798, "epoch": 0.6610954127337221, "flos": 26870410846080.0, "grad_norm": 1.5313410030880796, "language_loss": 0.66256523, "learning_rate": 1.088481043175248e-06, "loss": 0.68446678, "num_input_tokens_seen": 118293880, "step": 5498, "time_per_iteration": 2.50740385055542 }, { "auxiliary_loss_clip": 0.01130624, "auxiliary_loss_mlp": 0.01023767, "balance_loss_clip": 1.04173291, "balance_loss_mlp": 1.0161705, "epoch": 0.6612156556243612, "flos": 26465697331200.0, "grad_norm": 1.5800179556884986, "language_loss": 0.75429577, "learning_rate": 1.0877877473540368e-06, "loss": 0.77583969, "num_input_tokens_seen": 118314465, "step": 5499, "time_per_iteration": 2.5167829990386963 }, { "auxiliary_loss_clip": 0.01173366, "auxiliary_loss_mlp": 0.01021604, "balance_loss_clip": 1.04949057, "balance_loss_mlp": 1.01420736, "epoch": 0.6613358985150003, "flos": 19791915212160.0, "grad_norm": 1.695776739112023, "language_loss": 0.72491169, "learning_rate": 1.0870945899147002e-06, "loss": 0.74686146, "num_input_tokens_seen": 118331110, "step": 5500, "time_per_iteration": 2.3996667861938477 }, { "auxiliary_loss_clip": 0.01154867, "auxiliary_loss_mlp": 0.01027601, "balance_loss_clip": 1.04924369, "balance_loss_mlp": 1.02043664, "epoch": 0.6614561414056394, "flos": 26831627136000.0, "grad_norm": 1.782129191087351, "language_loss": 0.76254296, "learning_rate": 1.0864015709623879e-06, "loss": 0.78436768, "num_input_tokens_seen": 118351980, "step": 5501, "time_per_iteration": 2.4809956550598145 }, { "auxiliary_loss_clip": 0.01160495, "auxiliary_loss_mlp": 0.01025799, "balance_loss_clip": 1.04785395, "balance_loss_mlp": 1.01853371, "epoch": 0.6615763842962785, "flos": 22894597128960.0, "grad_norm": 2.2878075119431918, "language_loss": 0.80370712, "learning_rate": 1.0857086906022313e-06, "loss": 0.82557011, "num_input_tokens_seen": 118370315, "step": 5502, "time_per_iteration": 2.4603171348571777 }, { "auxiliary_loss_clip": 0.01092524, "auxiliary_loss_mlp": 0.01024673, "balance_loss_clip": 1.04411626, "balance_loss_mlp": 1.01672781, "epoch": 0.6616966271869176, "flos": 24790321221120.0, "grad_norm": 1.9567973686304647, "language_loss": 0.73015428, "learning_rate": 1.0850159489393388e-06, "loss": 0.75132626, "num_input_tokens_seen": 118389575, "step": 5503, "time_per_iteration": 2.591839551925659 }, { "auxiliary_loss_clip": 0.01120105, "auxiliary_loss_mlp": 0.01025006, "balance_loss_clip": 1.04040778, "balance_loss_mlp": 1.01731658, "epoch": 0.6618168700775566, "flos": 17202109639680.0, "grad_norm": 1.7726425937653567, "language_loss": 0.8221457, "learning_rate": 1.0843233460787992e-06, "loss": 0.84359682, "num_input_tokens_seen": 118406790, "step": 5504, "time_per_iteration": 2.5015242099761963 }, { "auxiliary_loss_clip": 0.01121837, "auxiliary_loss_mlp": 0.01025203, "balance_loss_clip": 1.04845929, "balance_loss_mlp": 1.01745415, "epoch": 0.6619371129681958, "flos": 25447091448960.0, "grad_norm": 1.817277690297216, "language_loss": 0.77930063, "learning_rate": 1.0836308821256805e-06, "loss": 0.800771, "num_input_tokens_seen": 118427590, "step": 5505, "time_per_iteration": 2.550518751144409 }, { "auxiliary_loss_clip": 0.01156609, "auxiliary_loss_mlp": 0.01026321, "balance_loss_clip": 1.04958797, "balance_loss_mlp": 1.01912665, "epoch": 0.6620573558588349, "flos": 18040444139520.0, "grad_norm": 2.0885954603458066, "language_loss": 0.78144073, "learning_rate": 1.0829385571850282e-06, "loss": 0.80327004, "num_input_tokens_seen": 118444570, "step": 5506, "time_per_iteration": 2.432097911834717 }, { "auxiliary_loss_clip": 0.01175803, "auxiliary_loss_mlp": 0.01022833, "balance_loss_clip": 1.05067146, "balance_loss_mlp": 1.014727, "epoch": 0.6621775987494739, "flos": 17785586165760.0, "grad_norm": 2.6374579240350045, "language_loss": 0.83635974, "learning_rate": 1.0822463713618679e-06, "loss": 0.85834622, "num_input_tokens_seen": 118461425, "step": 5507, "time_per_iteration": 2.390392541885376 }, { "auxiliary_loss_clip": 0.01129776, "auxiliary_loss_mlp": 0.01027962, "balance_loss_clip": 1.0454073, "balance_loss_mlp": 1.02062166, "epoch": 0.6622978416401131, "flos": 17492590926720.0, "grad_norm": 2.0372885234616733, "language_loss": 0.84817189, "learning_rate": 1.0815543247612034e-06, "loss": 0.86974925, "num_input_tokens_seen": 118478495, "step": 5508, "time_per_iteration": 2.498204231262207 }, { "auxiliary_loss_clip": 0.01139714, "auxiliary_loss_mlp": 0.01020393, "balance_loss_clip": 1.04204845, "balance_loss_mlp": 1.01306212, "epoch": 0.6624180845307521, "flos": 21648352803840.0, "grad_norm": 1.6316674313800148, "language_loss": 0.82571679, "learning_rate": 1.0808624174880168e-06, "loss": 0.84731787, "num_input_tokens_seen": 118499145, "step": 5509, "time_per_iteration": 3.2900233268737793 }, { "auxiliary_loss_clip": 0.01170201, "auxiliary_loss_mlp": 0.01022683, "balance_loss_clip": 1.05057073, "balance_loss_mlp": 1.01573944, "epoch": 0.6625383274213912, "flos": 23805902108160.0, "grad_norm": 1.681469688867829, "language_loss": 0.79641908, "learning_rate": 1.080170649647272e-06, "loss": 0.81834799, "num_input_tokens_seen": 118518950, "step": 5510, "time_per_iteration": 2.4409148693084717 }, { "auxiliary_loss_clip": 0.01169568, "auxiliary_loss_mlp": 0.01022581, "balance_loss_clip": 1.04933012, "balance_loss_mlp": 1.01496696, "epoch": 0.6626585703120303, "flos": 33262941473280.0, "grad_norm": 1.5850588106199062, "language_loss": 0.67071486, "learning_rate": 1.0794790213439068e-06, "loss": 0.69263631, "num_input_tokens_seen": 118545850, "step": 5511, "time_per_iteration": 2.583749294281006 }, { "auxiliary_loss_clip": 0.01117278, "auxiliary_loss_mlp": 0.01029025, "balance_loss_clip": 1.04536104, "balance_loss_mlp": 1.0209192, "epoch": 0.6627788132026694, "flos": 22085780630400.0, "grad_norm": 2.1677491242586373, "language_loss": 0.78728771, "learning_rate": 1.078787532682843e-06, "loss": 0.80875069, "num_input_tokens_seen": 118563325, "step": 5512, "time_per_iteration": 3.5486109256744385 }, { "auxiliary_loss_clip": 0.01153587, "auxiliary_loss_mlp": 0.01028332, "balance_loss_clip": 1.04797149, "balance_loss_mlp": 1.02104831, "epoch": 0.6628990560933085, "flos": 36173608260480.0, "grad_norm": 2.457118403339589, "language_loss": 0.7613343, "learning_rate": 1.0780961837689773e-06, "loss": 0.78315347, "num_input_tokens_seen": 118582835, "step": 5513, "time_per_iteration": 3.4394383430480957 }, { "auxiliary_loss_clip": 0.01137165, "auxiliary_loss_mlp": 0.010237, "balance_loss_clip": 1.0473299, "balance_loss_mlp": 1.0163449, "epoch": 0.6630192989839476, "flos": 18513567106560.0, "grad_norm": 1.8659128961843627, "language_loss": 0.69941652, "learning_rate": 1.0774049747071883e-06, "loss": 0.72102517, "num_input_tokens_seen": 118600715, "step": 5514, "time_per_iteration": 2.46176815032959 }, { "auxiliary_loss_clip": 0.0111058, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 1.04435849, "balance_loss_mlp": 1.02205515, "epoch": 0.6631395418745867, "flos": 35809510049280.0, "grad_norm": 1.786747177903583, "language_loss": 0.68000865, "learning_rate": 1.076713905602332e-06, "loss": 0.70141411, "num_input_tokens_seen": 118621290, "step": 5515, "time_per_iteration": 2.6638760566711426 }, { "auxiliary_loss_clip": 0.01160182, "auxiliary_loss_mlp": 0.01022697, "balance_loss_clip": 1.04950476, "balance_loss_mlp": 1.01544881, "epoch": 0.6632597847652257, "flos": 20047742853120.0, "grad_norm": 1.825025368132173, "language_loss": 0.81068814, "learning_rate": 1.07602297655924e-06, "loss": 0.83251691, "num_input_tokens_seen": 118639610, "step": 5516, "time_per_iteration": 3.2048749923706055 }, { "auxiliary_loss_clip": 0.01171914, "auxiliary_loss_mlp": 0.0102587, "balance_loss_clip": 1.05165792, "balance_loss_mlp": 1.01868165, "epoch": 0.6633800276558649, "flos": 21214480423680.0, "grad_norm": 1.8426631964134386, "language_loss": 0.81222248, "learning_rate": 1.0753321876827292e-06, "loss": 0.83420026, "num_input_tokens_seen": 118658895, "step": 5517, "time_per_iteration": 2.432898998260498 }, { "auxiliary_loss_clip": 0.01169314, "auxiliary_loss_mlp": 0.01023304, "balance_loss_clip": 1.0471499, "balance_loss_mlp": 1.01585925, "epoch": 0.663500270546504, "flos": 23987753688960.0, "grad_norm": 1.7610862714714586, "language_loss": 0.74105859, "learning_rate": 1.0746415390775893e-06, "loss": 0.76298475, "num_input_tokens_seen": 118677025, "step": 5518, "time_per_iteration": 2.430189609527588 }, { "auxiliary_loss_clip": 0.01170991, "auxiliary_loss_mlp": 0.01026131, "balance_loss_clip": 1.05153191, "balance_loss_mlp": 1.0189929, "epoch": 0.663620513437143, "flos": 17932389050880.0, "grad_norm": 2.193754933738702, "language_loss": 0.76538527, "learning_rate": 1.0739510308485939e-06, "loss": 0.7873565, "num_input_tokens_seen": 118694240, "step": 5519, "time_per_iteration": 2.3850111961364746 }, { "auxiliary_loss_clip": 0.01046035, "auxiliary_loss_mlp": 0.01002203, "balance_loss_clip": 1.01747537, "balance_loss_mlp": 1.00130308, "epoch": 0.6637407563277821, "flos": 57840241086720.0, "grad_norm": 0.805975172859145, "language_loss": 0.62494946, "learning_rate": 1.07326066310049e-06, "loss": 0.64543176, "num_input_tokens_seen": 118758365, "step": 5520, "time_per_iteration": 3.115778684616089 }, { "auxiliary_loss_clip": 0.01124287, "auxiliary_loss_mlp": 0.01025816, "balance_loss_clip": 1.04352069, "balance_loss_mlp": 1.01741815, "epoch": 0.6638609992184212, "flos": 27306007079040.0, "grad_norm": 2.9635734425357914, "language_loss": 0.7924149, "learning_rate": 1.0725704359380059e-06, "loss": 0.81391597, "num_input_tokens_seen": 118778220, "step": 5521, "time_per_iteration": 2.5515520572662354 }, { "auxiliary_loss_clip": 0.01160887, "auxiliary_loss_mlp": 0.01022939, "balance_loss_clip": 1.04841661, "balance_loss_mlp": 1.01603103, "epoch": 0.6639812421090603, "flos": 18624854419200.0, "grad_norm": 1.9282956814426322, "language_loss": 0.72169173, "learning_rate": 1.0718803494658497e-06, "loss": 0.74352998, "num_input_tokens_seen": 118797110, "step": 5522, "time_per_iteration": 2.4254212379455566 }, { "auxiliary_loss_clip": 0.01078893, "auxiliary_loss_mlp": 0.01031956, "balance_loss_clip": 1.04172552, "balance_loss_mlp": 1.02364695, "epoch": 0.6641014849996993, "flos": 15924479806080.0, "grad_norm": 2.014979946928305, "language_loss": 0.83849549, "learning_rate": 1.071190403788707e-06, "loss": 0.859604, "num_input_tokens_seen": 118812415, "step": 5523, "time_per_iteration": 2.5648577213287354 }, { "auxiliary_loss_clip": 0.01135727, "auxiliary_loss_mlp": 0.01026546, "balance_loss_clip": 1.04852486, "balance_loss_mlp": 1.01884484, "epoch": 0.6642217278903385, "flos": 26505486622080.0, "grad_norm": 1.732655601491865, "language_loss": 0.75329524, "learning_rate": 1.0705005990112415e-06, "loss": 0.77491796, "num_input_tokens_seen": 118832195, "step": 5524, "time_per_iteration": 2.5725104808807373 }, { "auxiliary_loss_clip": 0.01104152, "auxiliary_loss_mlp": 0.01030067, "balance_loss_clip": 1.04475117, "balance_loss_mlp": 1.02262878, "epoch": 0.6643419707809776, "flos": 15377308951680.0, "grad_norm": 3.315039872400583, "language_loss": 0.7430492, "learning_rate": 1.0698109352380957e-06, "loss": 0.76439142, "num_input_tokens_seen": 118849795, "step": 5525, "time_per_iteration": 2.5100502967834473 }, { "auxiliary_loss_clip": 0.01169231, "auxiliary_loss_mlp": 0.01024484, "balance_loss_clip": 1.04901254, "balance_loss_mlp": 1.01753092, "epoch": 0.6644622136716166, "flos": 25117610970240.0, "grad_norm": 2.391484785397507, "language_loss": 0.7815215, "learning_rate": 1.0691214125738909e-06, "loss": 0.80345863, "num_input_tokens_seen": 118870000, "step": 5526, "time_per_iteration": 2.450190305709839 }, { "auxiliary_loss_clip": 0.01070438, "auxiliary_loss_mlp": 0.01001672, "balance_loss_clip": 1.01508987, "balance_loss_mlp": 1.00081372, "epoch": 0.6645824565622558, "flos": 66201717680640.0, "grad_norm": 0.7918490834424107, "language_loss": 0.57526654, "learning_rate": 1.0684320311232287e-06, "loss": 0.59598768, "num_input_tokens_seen": 118932905, "step": 5527, "time_per_iteration": 3.0717484951019287 }, { "auxiliary_loss_clip": 0.01141057, "auxiliary_loss_mlp": 0.01025754, "balance_loss_clip": 1.04597092, "balance_loss_mlp": 1.01792765, "epoch": 0.6647026994528948, "flos": 25082131311360.0, "grad_norm": 1.8932580601366746, "language_loss": 0.81569481, "learning_rate": 1.0677427909906865e-06, "loss": 0.83736289, "num_input_tokens_seen": 118953355, "step": 5528, "time_per_iteration": 2.6524579524993896 }, { "auxiliary_loss_clip": 0.01175936, "auxiliary_loss_mlp": 0.01030558, "balance_loss_clip": 1.05256224, "balance_loss_mlp": 1.02252936, "epoch": 0.6648229423435339, "flos": 18222187979520.0, "grad_norm": 1.9618559863124314, "language_loss": 0.71879601, "learning_rate": 1.0670536922808216e-06, "loss": 0.74086094, "num_input_tokens_seen": 118973480, "step": 5529, "time_per_iteration": 2.5966434478759766 }, { "auxiliary_loss_clip": 0.01142242, "auxiliary_loss_mlp": 0.01025324, "balance_loss_clip": 1.04695535, "balance_loss_mlp": 1.01833844, "epoch": 0.6649431852341731, "flos": 18296882311680.0, "grad_norm": 4.6810062473447145, "language_loss": 0.71760809, "learning_rate": 1.06636473509817e-06, "loss": 0.73928374, "num_input_tokens_seen": 118989860, "step": 5530, "time_per_iteration": 2.50557804107666 }, { "auxiliary_loss_clip": 0.01137328, "auxiliary_loss_mlp": 0.00762909, "balance_loss_clip": 1.04518366, "balance_loss_mlp": 1.00060987, "epoch": 0.6650634281248121, "flos": 17019575700480.0, "grad_norm": 2.0045402465359885, "language_loss": 0.80801779, "learning_rate": 1.0656759195472447e-06, "loss": 0.82702017, "num_input_tokens_seen": 119007150, "step": 5531, "time_per_iteration": 2.468345880508423 }, { "auxiliary_loss_clip": 0.01048867, "auxiliary_loss_mlp": 0.01001513, "balance_loss_clip": 1.01583886, "balance_loss_mlp": 1.00061309, "epoch": 0.6651836710154512, "flos": 69294810666240.0, "grad_norm": 0.7688547156161302, "language_loss": 0.59770262, "learning_rate": 1.0649872457325414e-06, "loss": 0.61820644, "num_input_tokens_seen": 119068435, "step": 5532, "time_per_iteration": 3.0359108448028564 }, { "auxiliary_loss_clip": 0.01060317, "auxiliary_loss_mlp": 0.01001244, "balance_loss_clip": 1.01387858, "balance_loss_mlp": 1.00034988, "epoch": 0.6653039139060903, "flos": 66883444882560.0, "grad_norm": 0.8540739284539888, "language_loss": 0.55148345, "learning_rate": 1.0642987137585278e-06, "loss": 0.57209909, "num_input_tokens_seen": 119127960, "step": 5533, "time_per_iteration": 2.9857611656188965 }, { "auxiliary_loss_clip": 0.01141108, "auxiliary_loss_mlp": 0.01025743, "balance_loss_clip": 1.04616153, "balance_loss_mlp": 1.01859641, "epoch": 0.6654241567967294, "flos": 21470056669440.0, "grad_norm": 1.677822983000996, "language_loss": 0.82282197, "learning_rate": 1.0636103237296561e-06, "loss": 0.84449053, "num_input_tokens_seen": 119146885, "step": 5534, "time_per_iteration": 2.504901647567749 }, { "auxiliary_loss_clip": 0.01155336, "auxiliary_loss_mlp": 0.01027502, "balance_loss_clip": 1.05058169, "balance_loss_mlp": 1.02081406, "epoch": 0.6655443996873684, "flos": 25119514391040.0, "grad_norm": 1.7147288675738555, "language_loss": 0.84097803, "learning_rate": 1.062922075750353e-06, "loss": 0.86280644, "num_input_tokens_seen": 119166900, "step": 5535, "time_per_iteration": 3.223299741744995 }, { "auxiliary_loss_clip": 0.0113081, "auxiliary_loss_mlp": 0.01024059, "balance_loss_clip": 1.04607654, "balance_loss_mlp": 1.01684368, "epoch": 0.6656646425780076, "flos": 17457326749440.0, "grad_norm": 1.9615097527664374, "language_loss": 0.72253799, "learning_rate": 1.0622339699250267e-06, "loss": 0.74408662, "num_input_tokens_seen": 119184820, "step": 5536, "time_per_iteration": 2.5011141300201416 }, { "auxiliary_loss_clip": 0.01127225, "auxiliary_loss_mlp": 0.0102323, "balance_loss_clip": 1.04398274, "balance_loss_mlp": 1.01616645, "epoch": 0.6657848854686467, "flos": 23434190213760.0, "grad_norm": 1.742560073184748, "language_loss": 0.79410326, "learning_rate": 1.0615460063580624e-06, "loss": 0.81560779, "num_input_tokens_seen": 119203295, "step": 5537, "time_per_iteration": 2.5618491172790527 }, { "auxiliary_loss_clip": 0.01145145, "auxiliary_loss_mlp": 0.01024693, "balance_loss_clip": 1.04666209, "balance_loss_mlp": 1.01788604, "epoch": 0.6659051283592857, "flos": 11509909459200.0, "grad_norm": 1.7497521672685548, "language_loss": 0.73423094, "learning_rate": 1.060858185153821e-06, "loss": 0.75592935, "num_input_tokens_seen": 119221395, "step": 5538, "time_per_iteration": 2.4606072902679443 }, { "auxiliary_loss_clip": 0.01148398, "auxiliary_loss_mlp": 0.01024114, "balance_loss_clip": 1.04701769, "balance_loss_mlp": 1.01643658, "epoch": 0.6660253712499249, "flos": 20594554571520.0, "grad_norm": 2.1705685113249564, "language_loss": 0.76340932, "learning_rate": 1.0601705064166474e-06, "loss": 0.78513443, "num_input_tokens_seen": 119239790, "step": 5539, "time_per_iteration": 3.314908981323242 }, { "auxiliary_loss_clip": 0.01140862, "auxiliary_loss_mlp": 0.01027304, "balance_loss_clip": 1.04999018, "balance_loss_mlp": 1.01981151, "epoch": 0.666145614140564, "flos": 21251504367360.0, "grad_norm": 1.98334632172986, "language_loss": 0.7346797, "learning_rate": 1.0594829702508596e-06, "loss": 0.75636137, "num_input_tokens_seen": 119257505, "step": 5540, "time_per_iteration": 3.3076515197753906 }, { "auxiliary_loss_clip": 0.01129677, "auxiliary_loss_mlp": 0.01022363, "balance_loss_clip": 1.04428756, "balance_loss_mlp": 1.0152936, "epoch": 0.666265857031203, "flos": 33726188200320.0, "grad_norm": 3.951639745684329, "language_loss": 0.54971969, "learning_rate": 1.0587955767607592e-06, "loss": 0.57124007, "num_input_tokens_seen": 119279365, "step": 5541, "time_per_iteration": 2.6498446464538574 }, { "auxiliary_loss_clip": 0.01171984, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.05083442, "balance_loss_mlp": 1.01951885, "epoch": 0.6663860999218422, "flos": 17456644391040.0, "grad_norm": 2.1541790337222495, "language_loss": 0.76832557, "learning_rate": 1.0581083260506206e-06, "loss": 0.79031593, "num_input_tokens_seen": 119296150, "step": 5542, "time_per_iteration": 2.4024274349212646 }, { "auxiliary_loss_clip": 0.01140099, "auxiliary_loss_mlp": 0.01025213, "balance_loss_clip": 1.04537153, "balance_loss_mlp": 1.01805437, "epoch": 0.6665063428124812, "flos": 17676740977920.0, "grad_norm": 2.126645932810646, "language_loss": 0.76756954, "learning_rate": 1.0574212182246993e-06, "loss": 0.7892226, "num_input_tokens_seen": 119314845, "step": 5543, "time_per_iteration": 3.2622148990631104 }, { "auxiliary_loss_clip": 0.01146619, "auxiliary_loss_mlp": 0.01024303, "balance_loss_clip": 1.04625523, "balance_loss_mlp": 1.01603603, "epoch": 0.6666265857031203, "flos": 27673265687040.0, "grad_norm": 2.236547372071661, "language_loss": 0.76049602, "learning_rate": 1.0567342533872303e-06, "loss": 0.78220528, "num_input_tokens_seen": 119334875, "step": 5544, "time_per_iteration": 2.5519330501556396 }, { "auxiliary_loss_clip": 0.01144798, "auxiliary_loss_mlp": 0.01025208, "balance_loss_clip": 1.04801226, "balance_loss_mlp": 1.01758456, "epoch": 0.6667468285937594, "flos": 25046831220480.0, "grad_norm": 1.7298219727837763, "language_loss": 0.81157649, "learning_rate": 1.0560474316424255e-06, "loss": 0.83327657, "num_input_tokens_seen": 119354635, "step": 5545, "time_per_iteration": 2.52504563331604 }, { "auxiliary_loss_clip": 0.01142637, "auxiliary_loss_mlp": 0.01029472, "balance_loss_clip": 1.04534674, "balance_loss_mlp": 1.0210973, "epoch": 0.6668670714843985, "flos": 22780472641920.0, "grad_norm": 2.8473794769995115, "language_loss": 0.73465812, "learning_rate": 1.0553607530944746e-06, "loss": 0.75637919, "num_input_tokens_seen": 119372690, "step": 5546, "time_per_iteration": 2.498440980911255 }, { "auxiliary_loss_clip": 0.0112748, "auxiliary_loss_mlp": 0.01027001, "balance_loss_clip": 1.04397225, "balance_loss_mlp": 1.01971757, "epoch": 0.6669873143750376, "flos": 22163886754560.0, "grad_norm": 2.2452083971023606, "language_loss": 0.89550394, "learning_rate": 1.0546742178475463e-06, "loss": 0.91704875, "num_input_tokens_seen": 119391685, "step": 5547, "time_per_iteration": 2.5326223373413086 }, { "auxiliary_loss_clip": 0.0111963, "auxiliary_loss_mlp": 0.0102169, "balance_loss_clip": 1.04582059, "balance_loss_mlp": 1.0146569, "epoch": 0.6671075572656767, "flos": 20514832335360.0, "grad_norm": 1.714228263651133, "language_loss": 0.86919343, "learning_rate": 1.0539878260057868e-06, "loss": 0.89060658, "num_input_tokens_seen": 119410725, "step": 5548, "time_per_iteration": 2.643723487854004 }, { "auxiliary_loss_clip": 0.01161279, "auxiliary_loss_mlp": 0.01023832, "balance_loss_clip": 1.05214167, "balance_loss_mlp": 1.0153923, "epoch": 0.6672278001563158, "flos": 17931203902080.0, "grad_norm": 2.4429975224497356, "language_loss": 0.68255234, "learning_rate": 1.0533015776733226e-06, "loss": 0.70440346, "num_input_tokens_seen": 119426875, "step": 5549, "time_per_iteration": 2.5756430625915527 }, { "auxiliary_loss_clip": 0.01141113, "auxiliary_loss_mlp": 0.01024153, "balance_loss_clip": 1.04726541, "balance_loss_mlp": 1.0162375, "epoch": 0.6673480430469548, "flos": 22342146975360.0, "grad_norm": 2.388490199378825, "language_loss": 0.78299356, "learning_rate": 1.0526154729542566e-06, "loss": 0.80464613, "num_input_tokens_seen": 119446935, "step": 5550, "time_per_iteration": 2.6153078079223633 }, { "auxiliary_loss_clip": 0.01128532, "auxiliary_loss_mlp": 0.0102835, "balance_loss_clip": 1.04624462, "balance_loss_mlp": 1.02025509, "epoch": 0.6674682859375939, "flos": 20703830722560.0, "grad_norm": 2.2910147188210037, "language_loss": 0.79933047, "learning_rate": 1.0519295119526699e-06, "loss": 0.82089937, "num_input_tokens_seen": 119463240, "step": 5551, "time_per_iteration": 2.6724295616149902 }, { "auxiliary_loss_clip": 0.01145757, "auxiliary_loss_mlp": 0.01021979, "balance_loss_clip": 1.04754233, "balance_loss_mlp": 1.01433754, "epoch": 0.667588528828233, "flos": 26206673379840.0, "grad_norm": 1.5638411910386156, "language_loss": 0.83154821, "learning_rate": 1.0512436947726227e-06, "loss": 0.85322547, "num_input_tokens_seen": 119484655, "step": 5552, "time_per_iteration": 2.6712429523468018 }, { "auxiliary_loss_clip": 0.01127753, "auxiliary_loss_mlp": 0.01019472, "balance_loss_clip": 1.0423404, "balance_loss_mlp": 1.01151538, "epoch": 0.6677087717188721, "flos": 23071025756160.0, "grad_norm": 2.159858252587647, "language_loss": 0.65541106, "learning_rate": 1.0505580215181517e-06, "loss": 0.67688334, "num_input_tokens_seen": 119502895, "step": 5553, "time_per_iteration": 2.5411529541015625 }, { "auxiliary_loss_clip": 0.01028389, "auxiliary_loss_mlp": 0.01000971, "balance_loss_clip": 1.01356769, "balance_loss_mlp": 1.00015473, "epoch": 0.6678290146095112, "flos": 70941315219840.0, "grad_norm": 0.9283409737524261, "language_loss": 0.56688583, "learning_rate": 1.0498724922932753e-06, "loss": 0.58717942, "num_input_tokens_seen": 119561010, "step": 5554, "time_per_iteration": 3.0315301418304443 }, { "auxiliary_loss_clip": 0.01176606, "auxiliary_loss_mlp": 0.01025569, "balance_loss_clip": 1.05287588, "balance_loss_mlp": 1.0175705, "epoch": 0.6679492575001503, "flos": 18661088263680.0, "grad_norm": 2.0944287112077267, "language_loss": 0.86728042, "learning_rate": 1.0491871072019851e-06, "loss": 0.88930219, "num_input_tokens_seen": 119578900, "step": 5555, "time_per_iteration": 2.3964390754699707 }, { "auxiliary_loss_clip": 0.01133213, "auxiliary_loss_mlp": 0.01027566, "balance_loss_clip": 1.04447389, "balance_loss_mlp": 1.02036595, "epoch": 0.6680695003907894, "flos": 29711985822720.0, "grad_norm": 1.7539777547302515, "language_loss": 0.63716698, "learning_rate": 1.0485018663482555e-06, "loss": 0.65877473, "num_input_tokens_seen": 119598920, "step": 5556, "time_per_iteration": 2.5800023078918457 }, { "auxiliary_loss_clip": 0.01153377, "auxiliary_loss_mlp": 0.01021948, "balance_loss_clip": 1.04778492, "balance_loss_mlp": 1.01407397, "epoch": 0.6681897432814284, "flos": 28218964083840.0, "grad_norm": 2.540842597012571, "language_loss": 0.70446754, "learning_rate": 1.0478167698360354e-06, "loss": 0.72622073, "num_input_tokens_seen": 119618220, "step": 5557, "time_per_iteration": 2.4940547943115234 }, { "auxiliary_loss_clip": 0.0115044, "auxiliary_loss_mlp": 0.01024418, "balance_loss_clip": 1.04641485, "balance_loss_mlp": 1.01664209, "epoch": 0.6683099861720676, "flos": 25046543911680.0, "grad_norm": 2.6775474687521372, "language_loss": 0.70299375, "learning_rate": 1.0471318177692556e-06, "loss": 0.72474235, "num_input_tokens_seen": 119638520, "step": 5558, "time_per_iteration": 2.478252410888672 }, { "auxiliary_loss_clip": 0.0111925, "auxiliary_loss_mlp": 0.01027877, "balance_loss_clip": 1.04527569, "balance_loss_mlp": 1.02043796, "epoch": 0.6684302290627067, "flos": 22996977868800.0, "grad_norm": 3.2440330190469435, "language_loss": 0.76142788, "learning_rate": 1.046447010251821e-06, "loss": 0.78289914, "num_input_tokens_seen": 119655850, "step": 5559, "time_per_iteration": 2.5569262504577637 }, { "auxiliary_loss_clip": 0.01142883, "auxiliary_loss_mlp": 0.01025282, "balance_loss_clip": 1.04899716, "balance_loss_mlp": 1.01816535, "epoch": 0.6685504719533457, "flos": 26573824247040.0, "grad_norm": 2.138365777084513, "language_loss": 0.75626755, "learning_rate": 1.0457623473876157e-06, "loss": 0.77794921, "num_input_tokens_seen": 119675355, "step": 5560, "time_per_iteration": 2.578597068786621 }, { "auxiliary_loss_clip": 0.01169231, "auxiliary_loss_mlp": 0.01024956, "balance_loss_clip": 1.04872036, "balance_loss_mlp": 1.01811969, "epoch": 0.6686707148439849, "flos": 28986087870720.0, "grad_norm": 1.868925726003123, "language_loss": 0.71220756, "learning_rate": 1.0450778292805046e-06, "loss": 0.73414946, "num_input_tokens_seen": 119695340, "step": 5561, "time_per_iteration": 2.4805686473846436 }, { "auxiliary_loss_clip": 0.01159164, "auxiliary_loss_mlp": 0.01026473, "balance_loss_clip": 1.04660451, "balance_loss_mlp": 1.01919794, "epoch": 0.6687909577346239, "flos": 23623152687360.0, "grad_norm": 2.0247041898382103, "language_loss": 0.78722423, "learning_rate": 1.0443934560343267e-06, "loss": 0.8090806, "num_input_tokens_seen": 119716750, "step": 5562, "time_per_iteration": 3.313943862915039 }, { "auxiliary_loss_clip": 0.01115409, "auxiliary_loss_mlp": 0.01024399, "balance_loss_clip": 1.04246783, "balance_loss_mlp": 1.01709461, "epoch": 0.668911200625263, "flos": 23148593176320.0, "grad_norm": 2.0556870240121534, "language_loss": 0.78214395, "learning_rate": 1.0437092277529034e-06, "loss": 0.80354202, "num_input_tokens_seen": 119736005, "step": 5563, "time_per_iteration": 2.525938034057617 }, { "auxiliary_loss_clip": 0.01124505, "auxiliary_loss_mlp": 0.01025881, "balance_loss_clip": 1.04078555, "balance_loss_mlp": 1.01880646, "epoch": 0.6690314435159022, "flos": 18551919853440.0, "grad_norm": 2.0346624199803913, "language_loss": 0.73141664, "learning_rate": 1.0430251445400292e-06, "loss": 0.75292051, "num_input_tokens_seen": 119754050, "step": 5564, "time_per_iteration": 2.4879183769226074 }, { "auxiliary_loss_clip": 0.0108693, "auxiliary_loss_mlp": 0.01025199, "balance_loss_clip": 1.04578424, "balance_loss_mlp": 1.01773643, "epoch": 0.6691516864065412, "flos": 31759540704000.0, "grad_norm": 2.29490684102977, "language_loss": 0.62464035, "learning_rate": 1.0423412064994787e-06, "loss": 0.64576161, "num_input_tokens_seen": 119774820, "step": 5565, "time_per_iteration": 2.6734249591827393 }, { "auxiliary_loss_clip": 0.01130492, "auxiliary_loss_mlp": 0.01023299, "balance_loss_clip": 1.0444777, "balance_loss_mlp": 1.01587188, "epoch": 0.6692719292971803, "flos": 34933864296960.0, "grad_norm": 2.1299899439090906, "language_loss": 0.73788476, "learning_rate": 1.0416574137350064e-06, "loss": 0.75942266, "num_input_tokens_seen": 119795525, "step": 5566, "time_per_iteration": 3.489614963531494 }, { "auxiliary_loss_clip": 0.01149282, "auxiliary_loss_mlp": 0.01027406, "balance_loss_clip": 1.04733348, "balance_loss_mlp": 1.01975322, "epoch": 0.6693921721878194, "flos": 20449188230400.0, "grad_norm": 2.0446985089643475, "language_loss": 0.81325543, "learning_rate": 1.0409737663503428e-06, "loss": 0.83502233, "num_input_tokens_seen": 119813905, "step": 5567, "time_per_iteration": 3.256037473678589 }, { "auxiliary_loss_clip": 0.0115332, "auxiliary_loss_mlp": 0.0102903, "balance_loss_clip": 1.04498446, "balance_loss_mlp": 1.02097762, "epoch": 0.6695124150784585, "flos": 16614538963200.0, "grad_norm": 2.436174842291626, "language_loss": 0.82967198, "learning_rate": 1.040290264449196e-06, "loss": 0.8514955, "num_input_tokens_seen": 119832010, "step": 5568, "time_per_iteration": 2.4484620094299316 }, { "auxiliary_loss_clip": 0.01152272, "auxiliary_loss_mlp": 0.01028968, "balance_loss_clip": 1.0483197, "balance_loss_mlp": 1.02178299, "epoch": 0.6696326579690975, "flos": 26652145852800.0, "grad_norm": 3.8486687727426134, "language_loss": 0.63670647, "learning_rate": 1.0396069081352532e-06, "loss": 0.65851885, "num_input_tokens_seen": 119851165, "step": 5569, "time_per_iteration": 3.2229678630828857 }, { "auxiliary_loss_clip": 0.01068641, "auxiliary_loss_mlp": 0.01002517, "balance_loss_clip": 1.0130806, "balance_loss_mlp": 1.00153947, "epoch": 0.6697529008597367, "flos": 66964603662720.0, "grad_norm": 0.7728842107677465, "language_loss": 0.56076854, "learning_rate": 1.0389236975121782e-06, "loss": 0.58148009, "num_input_tokens_seen": 119906015, "step": 5570, "time_per_iteration": 2.9311363697052 }, { "auxiliary_loss_clip": 0.01172728, "auxiliary_loss_mlp": 0.01021099, "balance_loss_clip": 1.04959297, "balance_loss_mlp": 1.01331449, "epoch": 0.6698731437503758, "flos": 20886939279360.0, "grad_norm": 2.2919833931668183, "language_loss": 0.71307224, "learning_rate": 1.0382406326836147e-06, "loss": 0.7350105, "num_input_tokens_seen": 119925160, "step": 5571, "time_per_iteration": 2.4139564037323 }, { "auxiliary_loss_clip": 0.01163883, "auxiliary_loss_mlp": 0.01025774, "balance_loss_clip": 1.05001009, "balance_loss_mlp": 1.01755416, "epoch": 0.6699933866410148, "flos": 20409470766720.0, "grad_norm": 1.8400274426332732, "language_loss": 0.7611568, "learning_rate": 1.0375577137531828e-06, "loss": 0.7830534, "num_input_tokens_seen": 119943720, "step": 5572, "time_per_iteration": 2.4486207962036133 }, { "auxiliary_loss_clip": 0.01146559, "auxiliary_loss_mlp": 0.01025664, "balance_loss_clip": 1.0486697, "balance_loss_mlp": 1.01762354, "epoch": 0.670113629531654, "flos": 29023075900800.0, "grad_norm": 1.5204870415256866, "language_loss": 0.72181427, "learning_rate": 1.0368749408244802e-06, "loss": 0.74353653, "num_input_tokens_seen": 119966640, "step": 5573, "time_per_iteration": 2.5619585514068604 }, { "auxiliary_loss_clip": 0.0115276, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 1.04887307, "balance_loss_mlp": 1.0206275, "epoch": 0.670233872422293, "flos": 19791699730560.0, "grad_norm": 2.0509017342801608, "language_loss": 0.78862309, "learning_rate": 1.0361923140010836e-06, "loss": 0.81043077, "num_input_tokens_seen": 119985125, "step": 5574, "time_per_iteration": 2.4595985412597656 }, { "auxiliary_loss_clip": 0.01158599, "auxiliary_loss_mlp": 0.01021459, "balance_loss_clip": 1.04609215, "balance_loss_mlp": 1.01359069, "epoch": 0.6703541153129321, "flos": 24243689070720.0, "grad_norm": 2.4843856840743532, "language_loss": 0.63171947, "learning_rate": 1.0355098333865455e-06, "loss": 0.65351999, "num_input_tokens_seen": 120004355, "step": 5575, "time_per_iteration": 2.474578380584717 }, { "auxiliary_loss_clip": 0.01156168, "auxiliary_loss_mlp": 0.01028428, "balance_loss_clip": 1.05293179, "balance_loss_mlp": 1.02114725, "epoch": 0.6704743582035713, "flos": 26688523351680.0, "grad_norm": 1.5922564903622123, "language_loss": 0.69083208, "learning_rate": 1.0348274990844006e-06, "loss": 0.71267807, "num_input_tokens_seen": 120027115, "step": 5576, "time_per_iteration": 2.507566213607788 }, { "auxiliary_loss_clip": 0.0115701, "auxiliary_loss_mlp": 0.01026959, "balance_loss_clip": 1.04970455, "balance_loss_mlp": 1.01944304, "epoch": 0.6705946010942103, "flos": 23514379326720.0, "grad_norm": 1.7193449444515767, "language_loss": 0.73016864, "learning_rate": 1.034145311198155e-06, "loss": 0.75200832, "num_input_tokens_seen": 120047130, "step": 5577, "time_per_iteration": 2.457211494445801 }, { "auxiliary_loss_clip": 0.01166544, "auxiliary_loss_mlp": 0.01020109, "balance_loss_clip": 1.04788625, "balance_loss_mlp": 1.01320028, "epoch": 0.6707148439848494, "flos": 24061011477120.0, "grad_norm": 1.6692131200243168, "language_loss": 0.6354475, "learning_rate": 1.0334632698312989e-06, "loss": 0.65731406, "num_input_tokens_seen": 120067925, "step": 5578, "time_per_iteration": 2.4303925037384033 }, { "auxiliary_loss_clip": 0.01135283, "auxiliary_loss_mlp": 0.01029213, "balance_loss_clip": 1.04557943, "balance_loss_mlp": 1.02127075, "epoch": 0.6708350868754885, "flos": 22528667324160.0, "grad_norm": 2.0549815873472648, "language_loss": 0.75402439, "learning_rate": 1.032781375087295e-06, "loss": 0.77566946, "num_input_tokens_seen": 120087825, "step": 5579, "time_per_iteration": 2.4749789237976074 }, { "auxiliary_loss_clip": 0.01147297, "auxiliary_loss_mlp": 0.01024672, "balance_loss_clip": 1.05082631, "balance_loss_mlp": 1.01757944, "epoch": 0.6709553297661276, "flos": 25227749047680.0, "grad_norm": 1.795518310229102, "language_loss": 0.67651784, "learning_rate": 1.0320996270695891e-06, "loss": 0.69823748, "num_input_tokens_seen": 120108895, "step": 5580, "time_per_iteration": 2.531343936920166 }, { "auxiliary_loss_clip": 0.01127814, "auxiliary_loss_mlp": 0.01024539, "balance_loss_clip": 1.04389405, "balance_loss_mlp": 1.0167253, "epoch": 0.6710755726567667, "flos": 20448757267200.0, "grad_norm": 1.7611695924655835, "language_loss": 0.73460591, "learning_rate": 1.0314180258815998e-06, "loss": 0.75612944, "num_input_tokens_seen": 120127535, "step": 5581, "time_per_iteration": 2.5152859687805176 }, { "auxiliary_loss_clip": 0.01118559, "auxiliary_loss_mlp": 0.01022809, "balance_loss_clip": 1.04242873, "balance_loss_mlp": 1.01573372, "epoch": 0.6711958155474057, "flos": 25995411538560.0, "grad_norm": 1.5454506611844128, "language_loss": 0.7423296, "learning_rate": 1.0307365716267247e-06, "loss": 0.76374328, "num_input_tokens_seen": 120147980, "step": 5582, "time_per_iteration": 2.5693109035491943 }, { "auxiliary_loss_clip": 0.01156179, "auxiliary_loss_mlp": 0.01024722, "balance_loss_clip": 1.04835892, "balance_loss_mlp": 1.01736093, "epoch": 0.6713160584380449, "flos": 19937712516480.0, "grad_norm": 1.9203590595304838, "language_loss": 0.78199959, "learning_rate": 1.0300552644083423e-06, "loss": 0.80380857, "num_input_tokens_seen": 120166905, "step": 5583, "time_per_iteration": 2.4478225708007812 }, { "auxiliary_loss_clip": 0.0113581, "auxiliary_loss_mlp": 0.01022827, "balance_loss_clip": 1.04927218, "balance_loss_mlp": 1.01506698, "epoch": 0.6714363013286839, "flos": 18223373128320.0, "grad_norm": 2.2636071456574576, "language_loss": 0.72501814, "learning_rate": 1.0293741043298036e-06, "loss": 0.7466045, "num_input_tokens_seen": 120185255, "step": 5584, "time_per_iteration": 2.4972448348999023 }, { "auxiliary_loss_clip": 0.0113461, "auxiliary_loss_mlp": 0.01029248, "balance_loss_clip": 1.05077291, "balance_loss_mlp": 1.02131486, "epoch": 0.671556544219323, "flos": 25812374808960.0, "grad_norm": 4.039987393102375, "language_loss": 0.71544707, "learning_rate": 1.0286930914944436e-06, "loss": 0.7370857, "num_input_tokens_seen": 120205070, "step": 5585, "time_per_iteration": 2.544395923614502 }, { "auxiliary_loss_clip": 0.01170176, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 1.04692054, "balance_loss_mlp": 1.01794577, "epoch": 0.6716767871099621, "flos": 15850431918720.0, "grad_norm": 2.4592464930007387, "language_loss": 0.77125227, "learning_rate": 1.0280122260055684e-06, "loss": 0.7932055, "num_input_tokens_seen": 120220780, "step": 5586, "time_per_iteration": 2.398461103439331 }, { "auxiliary_loss_clip": 0.0117304, "auxiliary_loss_mlp": 0.01029076, "balance_loss_clip": 1.0507741, "balance_loss_mlp": 1.02094603, "epoch": 0.6717970300006012, "flos": 19756112330880.0, "grad_norm": 1.930092579841454, "language_loss": 0.820786, "learning_rate": 1.0273315079664652e-06, "loss": 0.84280711, "num_input_tokens_seen": 120238735, "step": 5587, "time_per_iteration": 2.4364752769470215 }, { "auxiliary_loss_clip": 0.01158978, "auxiliary_loss_mlp": 0.01022519, "balance_loss_clip": 1.04962456, "balance_loss_mlp": 1.01512241, "epoch": 0.6719172728912403, "flos": 25485049146240.0, "grad_norm": 2.1537941645484624, "language_loss": 0.74342352, "learning_rate": 1.0266509374803992e-06, "loss": 0.76523852, "num_input_tokens_seen": 120259895, "step": 5588, "time_per_iteration": 2.4993505477905273 }, { "auxiliary_loss_clip": 0.01170433, "auxiliary_loss_mlp": 0.00762336, "balance_loss_clip": 1.0483737, "balance_loss_mlp": 1.00059915, "epoch": 0.6720375157818794, "flos": 15880344969600.0, "grad_norm": 4.557218838866834, "language_loss": 0.84566152, "learning_rate": 1.0259705146506123e-06, "loss": 0.86498922, "num_input_tokens_seen": 120274790, "step": 5589, "time_per_iteration": 3.1294379234313965 }, { "auxiliary_loss_clip": 0.0115964, "auxiliary_loss_mlp": 0.01027452, "balance_loss_clip": 1.04859781, "balance_loss_mlp": 1.02023971, "epoch": 0.6721577586725185, "flos": 32010843231360.0, "grad_norm": 2.0961986089760716, "language_loss": 0.77502471, "learning_rate": 1.025290239580324e-06, "loss": 0.79689562, "num_input_tokens_seen": 120295460, "step": 5590, "time_per_iteration": 2.5450472831726074 }, { "auxiliary_loss_clip": 0.01114423, "auxiliary_loss_mlp": 0.01028646, "balance_loss_clip": 1.04314291, "balance_loss_mlp": 1.02130306, "epoch": 0.6722780015631575, "flos": 20737873837440.0, "grad_norm": 1.6192387201139165, "language_loss": 0.75764674, "learning_rate": 1.0246101123727313e-06, "loss": 0.77907741, "num_input_tokens_seen": 120314440, "step": 5591, "time_per_iteration": 2.5433592796325684 }, { "auxiliary_loss_clip": 0.01155405, "auxiliary_loss_mlp": 0.01030723, "balance_loss_clip": 1.04615891, "balance_loss_mlp": 1.023839, "epoch": 0.6723982444537967, "flos": 16909617191040.0, "grad_norm": 1.8983292224355133, "language_loss": 0.78469092, "learning_rate": 1.0239301331310085e-06, "loss": 0.80655217, "num_input_tokens_seen": 120332060, "step": 5592, "time_per_iteration": 3.2444515228271484 }, { "auxiliary_loss_clip": 0.01153206, "auxiliary_loss_mlp": 0.01025997, "balance_loss_clip": 1.04712093, "balance_loss_mlp": 1.01868939, "epoch": 0.6725184873444358, "flos": 20667812359680.0, "grad_norm": 1.6893222952765115, "language_loss": 0.88701761, "learning_rate": 1.0232503019583088e-06, "loss": 0.90880966, "num_input_tokens_seen": 120351670, "step": 5593, "time_per_iteration": 2.4609792232513428 }, { "auxiliary_loss_clip": 0.01151919, "auxiliary_loss_mlp": 0.01027316, "balance_loss_clip": 1.04745698, "balance_loss_mlp": 1.01968396, "epoch": 0.6726387302350748, "flos": 23727616416000.0, "grad_norm": 1.8391984003790942, "language_loss": 0.69723374, "learning_rate": 1.0225706189577619e-06, "loss": 0.71902609, "num_input_tokens_seen": 120370195, "step": 5594, "time_per_iteration": 3.295260190963745 }, { "auxiliary_loss_clip": 0.01158551, "auxiliary_loss_mlp": 0.01025529, "balance_loss_clip": 1.04849505, "balance_loss_mlp": 1.01766157, "epoch": 0.672758973125714, "flos": 15188274650880.0, "grad_norm": 2.4426036981054753, "language_loss": 0.7461617, "learning_rate": 1.021891084232475e-06, "loss": 0.76800251, "num_input_tokens_seen": 120388130, "step": 5595, "time_per_iteration": 2.416186571121216 }, { "auxiliary_loss_clip": 0.01154593, "auxiliary_loss_mlp": 0.01027123, "balance_loss_clip": 1.04541349, "balance_loss_mlp": 1.01931763, "epoch": 0.672879216016353, "flos": 18077252601600.0, "grad_norm": 3.0578557889030766, "language_loss": 0.80070502, "learning_rate": 1.0212116978855325e-06, "loss": 0.82252216, "num_input_tokens_seen": 120406145, "step": 5596, "time_per_iteration": 3.1221911907196045 }, { "auxiliary_loss_clip": 0.01126155, "auxiliary_loss_mlp": 0.01021038, "balance_loss_clip": 1.04460335, "balance_loss_mlp": 1.01410341, "epoch": 0.6729994589069921, "flos": 23476349802240.0, "grad_norm": 1.710851739438485, "language_loss": 0.78964907, "learning_rate": 1.020532460019997e-06, "loss": 0.81112099, "num_input_tokens_seen": 120425395, "step": 5597, "time_per_iteration": 2.524912118911743 }, { "auxiliary_loss_clip": 0.01091593, "auxiliary_loss_mlp": 0.01026481, "balance_loss_clip": 1.04280186, "balance_loss_mlp": 1.01897693, "epoch": 0.6731197017976313, "flos": 26322018929280.0, "grad_norm": 1.9175062700451448, "language_loss": 0.71083069, "learning_rate": 1.0198533707389096e-06, "loss": 0.73201144, "num_input_tokens_seen": 120446270, "step": 5598, "time_per_iteration": 2.6498665809631348 }, { "auxiliary_loss_clip": 0.01153994, "auxiliary_loss_mlp": 0.00762519, "balance_loss_clip": 1.04830921, "balance_loss_mlp": 1.00066662, "epoch": 0.6732399446882703, "flos": 21616428591360.0, "grad_norm": 4.479655399412542, "language_loss": 0.73157728, "learning_rate": 1.0191744301452853e-06, "loss": 0.75074244, "num_input_tokens_seen": 120465570, "step": 5599, "time_per_iteration": 2.437769889831543 }, { "auxiliary_loss_clip": 0.0116932, "auxiliary_loss_mlp": 0.01026874, "balance_loss_clip": 1.04865634, "balance_loss_mlp": 1.01938212, "epoch": 0.6733601875789094, "flos": 25880173729920.0, "grad_norm": 1.6607557564656918, "language_loss": 0.70416641, "learning_rate": 1.0184956383421208e-06, "loss": 0.72612834, "num_input_tokens_seen": 120484220, "step": 5600, "time_per_iteration": 2.4456331729888916 }, { "auxiliary_loss_clip": 0.01157701, "auxiliary_loss_mlp": 0.01025374, "balance_loss_clip": 1.04857826, "balance_loss_mlp": 1.01795614, "epoch": 0.6734804304695485, "flos": 22929573997440.0, "grad_norm": 1.9796694561357622, "language_loss": 0.65360671, "learning_rate": 1.017816995432387e-06, "loss": 0.67543745, "num_input_tokens_seen": 120503320, "step": 5601, "time_per_iteration": 2.464118719100952 }, { "auxiliary_loss_clip": 0.01142149, "auxiliary_loss_mlp": 0.01023438, "balance_loss_clip": 1.04594636, "balance_loss_mlp": 1.0158149, "epoch": 0.6736006733601876, "flos": 18697968552960.0, "grad_norm": 2.2448856128377646, "language_loss": 0.74459243, "learning_rate": 1.0171385015190353e-06, "loss": 0.76624835, "num_input_tokens_seen": 120523180, "step": 5602, "time_per_iteration": 2.466320276260376 }, { "auxiliary_loss_clip": 0.01140067, "auxiliary_loss_mlp": 0.00762745, "balance_loss_clip": 1.05057693, "balance_loss_mlp": 1.00071073, "epoch": 0.6737209162508266, "flos": 19427745173760.0, "grad_norm": 2.108869883709841, "language_loss": 0.7301839, "learning_rate": 1.0164601567049908e-06, "loss": 0.74921191, "num_input_tokens_seen": 120541710, "step": 5603, "time_per_iteration": 2.4771969318389893 }, { "auxiliary_loss_clip": 0.01142128, "auxiliary_loss_mlp": 0.01026764, "balance_loss_clip": 1.04705882, "balance_loss_mlp": 1.0188005, "epoch": 0.6738411591414658, "flos": 20158060498560.0, "grad_norm": 1.5791783503968118, "language_loss": 0.8043083, "learning_rate": 1.015781961093158e-06, "loss": 0.82599723, "num_input_tokens_seen": 120561030, "step": 5604, "time_per_iteration": 2.4761765003204346 }, { "auxiliary_loss_clip": 0.01143991, "auxiliary_loss_mlp": 0.01026062, "balance_loss_clip": 1.04299426, "balance_loss_mlp": 1.01859403, "epoch": 0.6739614020321049, "flos": 21653847584640.0, "grad_norm": 1.9417755597284516, "language_loss": 0.77187514, "learning_rate": 1.0151039147864197e-06, "loss": 0.79357564, "num_input_tokens_seen": 120581005, "step": 5605, "time_per_iteration": 2.4847517013549805 }, { "auxiliary_loss_clip": 0.010836, "auxiliary_loss_mlp": 0.01023915, "balance_loss_clip": 1.04823756, "balance_loss_mlp": 1.01589823, "epoch": 0.6740816449227439, "flos": 19171702051200.0, "grad_norm": 3.8051607042570157, "language_loss": 0.65878797, "learning_rate": 1.0144260178876336e-06, "loss": 0.6798631, "num_input_tokens_seen": 120600350, "step": 5606, "time_per_iteration": 2.6127288341522217 }, { "auxiliary_loss_clip": 0.01147329, "auxiliary_loss_mlp": 0.01020958, "balance_loss_clip": 1.04611468, "balance_loss_mlp": 1.01394498, "epoch": 0.6742018878133831, "flos": 21097015971840.0, "grad_norm": 2.5932104089263657, "language_loss": 0.67258775, "learning_rate": 1.0137482704996388e-06, "loss": 0.69427061, "num_input_tokens_seen": 120614700, "step": 5607, "time_per_iteration": 2.466639518737793 }, { "auxiliary_loss_clip": 0.01129851, "auxiliary_loss_mlp": 0.0102748, "balance_loss_clip": 1.04481328, "balance_loss_mlp": 1.01980269, "epoch": 0.6743221307040221, "flos": 23549966726400.0, "grad_norm": 4.481755641195638, "language_loss": 0.79232001, "learning_rate": 1.0130706727252461e-06, "loss": 0.81389338, "num_input_tokens_seen": 120631755, "step": 5608, "time_per_iteration": 2.5173239707946777 }, { "auxiliary_loss_clip": 0.0113129, "auxiliary_loss_mlp": 0.0102803, "balance_loss_clip": 1.04560113, "balance_loss_mlp": 1.02046633, "epoch": 0.6744423735946612, "flos": 16249542912000.0, "grad_norm": 2.7969143063846333, "language_loss": 0.68080229, "learning_rate": 1.0123932246672468e-06, "loss": 0.70239544, "num_input_tokens_seen": 120645900, "step": 5609, "time_per_iteration": 2.4734508991241455 }, { "auxiliary_loss_clip": 0.01026974, "auxiliary_loss_mlp": 0.00753028, "balance_loss_clip": 1.01471448, "balance_loss_mlp": 0.9998517, "epoch": 0.6745626164853004, "flos": 57843257829120.0, "grad_norm": 0.7804044379563917, "language_loss": 0.55836391, "learning_rate": 1.0117159264284114e-06, "loss": 0.57616389, "num_input_tokens_seen": 120709070, "step": 5610, "time_per_iteration": 3.112058162689209 }, { "auxiliary_loss_clip": 0.011472, "auxiliary_loss_mlp": 0.0102519, "balance_loss_clip": 1.04830813, "balance_loss_mlp": 1.0180583, "epoch": 0.6746828593759394, "flos": 20485027025280.0, "grad_norm": 1.8025215069265428, "language_loss": 0.77049118, "learning_rate": 1.0110387781114837e-06, "loss": 0.79221511, "num_input_tokens_seen": 120727685, "step": 5611, "time_per_iteration": 2.481444835662842 }, { "auxiliary_loss_clip": 0.01167564, "auxiliary_loss_mlp": 0.01026111, "balance_loss_clip": 1.0479244, "balance_loss_mlp": 1.01867855, "epoch": 0.6748031022665785, "flos": 19208223204480.0, "grad_norm": 2.1587504679037273, "language_loss": 0.77094889, "learning_rate": 1.0103617798191872e-06, "loss": 0.7928856, "num_input_tokens_seen": 120747160, "step": 5612, "time_per_iteration": 2.4058477878570557 }, { "auxiliary_loss_clip": 0.01141566, "auxiliary_loss_mlp": 0.01023379, "balance_loss_clip": 1.04869998, "balance_loss_mlp": 1.01558232, "epoch": 0.6749233451572175, "flos": 15195026407680.0, "grad_norm": 2.550253404705486, "language_loss": 0.82835215, "learning_rate": 1.0096849316542217e-06, "loss": 0.85000163, "num_input_tokens_seen": 120763710, "step": 5613, "time_per_iteration": 2.4668567180633545 }, { "auxiliary_loss_clip": 0.01073191, "auxiliary_loss_mlp": 0.01020718, "balance_loss_clip": 1.03747666, "balance_loss_mlp": 1.01296961, "epoch": 0.6750435880478567, "flos": 26499489050880.0, "grad_norm": 2.187206510248746, "language_loss": 0.74766231, "learning_rate": 1.0090082337192643e-06, "loss": 0.76860142, "num_input_tokens_seen": 120783355, "step": 5614, "time_per_iteration": 2.652745246887207 }, { "auxiliary_loss_clip": 0.01091801, "auxiliary_loss_mlp": 0.0102714, "balance_loss_clip": 1.03603482, "balance_loss_mlp": 1.01978159, "epoch": 0.6751638309384957, "flos": 23404313076480.0, "grad_norm": 1.9790381324417856, "language_loss": 0.78248245, "learning_rate": 1.0083316861169705e-06, "loss": 0.80367184, "num_input_tokens_seen": 120802090, "step": 5615, "time_per_iteration": 2.5952847003936768 }, { "auxiliary_loss_clip": 0.01131985, "auxiliary_loss_mlp": 0.01023283, "balance_loss_clip": 1.04292858, "balance_loss_mlp": 1.01494479, "epoch": 0.6752840738291348, "flos": 23441408847360.0, "grad_norm": 2.5462130476113334, "language_loss": 0.71312702, "learning_rate": 1.0076552889499713e-06, "loss": 0.73467976, "num_input_tokens_seen": 120822855, "step": 5616, "time_per_iteration": 3.2727737426757812 }, { "auxiliary_loss_clip": 0.01156204, "auxiliary_loss_mlp": 0.01025133, "balance_loss_clip": 1.0497458, "balance_loss_mlp": 1.01835907, "epoch": 0.675404316719774, "flos": 30335826257280.0, "grad_norm": 2.3543251133200545, "language_loss": 0.73736441, "learning_rate": 1.006979042320876e-06, "loss": 0.75917774, "num_input_tokens_seen": 120843070, "step": 5617, "time_per_iteration": 2.512648344039917 }, { "auxiliary_loss_clip": 0.01137699, "auxiliary_loss_mlp": 0.0102067, "balance_loss_clip": 1.04344237, "balance_loss_mlp": 1.01275742, "epoch": 0.675524559610413, "flos": 23622613983360.0, "grad_norm": 2.268019991433727, "language_loss": 0.62844241, "learning_rate": 1.0063029463322702e-06, "loss": 0.65002608, "num_input_tokens_seen": 120863345, "step": 5618, "time_per_iteration": 2.510601282119751 }, { "auxiliary_loss_clip": 0.01107286, "auxiliary_loss_mlp": 0.00762807, "balance_loss_clip": 1.04074073, "balance_loss_mlp": 1.00069857, "epoch": 0.6756448025010521, "flos": 21248631279360.0, "grad_norm": 2.3227387283972925, "language_loss": 0.752038, "learning_rate": 1.0056270010867164e-06, "loss": 0.7707389, "num_input_tokens_seen": 120880915, "step": 5619, "time_per_iteration": 3.3766841888427734 }, { "auxiliary_loss_clip": 0.01140258, "auxiliary_loss_mlp": 0.01027995, "balance_loss_clip": 1.04161584, "balance_loss_mlp": 1.02000773, "epoch": 0.6757650453916912, "flos": 21646521210240.0, "grad_norm": 2.4100007711594023, "language_loss": 0.77961367, "learning_rate": 1.004951206686758e-06, "loss": 0.80129623, "num_input_tokens_seen": 120899190, "step": 5620, "time_per_iteration": 3.291287660598755 }, { "auxiliary_loss_clip": 0.01150151, "auxiliary_loss_mlp": 0.01031102, "balance_loss_clip": 1.04589152, "balance_loss_mlp": 1.02357376, "epoch": 0.6758852882823303, "flos": 21795658479360.0, "grad_norm": 2.526588143775918, "language_loss": 0.71496648, "learning_rate": 1.0042755632349087e-06, "loss": 0.73677897, "num_input_tokens_seen": 120916080, "step": 5621, "time_per_iteration": 2.4356589317321777 }, { "auxiliary_loss_clip": 0.0112673, "auxiliary_loss_mlp": 0.01027917, "balance_loss_clip": 1.04364705, "balance_loss_mlp": 1.02014494, "epoch": 0.6760055311729694, "flos": 27088783580160.0, "grad_norm": 2.2547904810140884, "language_loss": 0.62818456, "learning_rate": 1.0036000708336653e-06, "loss": 0.64973104, "num_input_tokens_seen": 120935210, "step": 5622, "time_per_iteration": 3.3229434490203857 }, { "auxiliary_loss_clip": 0.01145278, "auxiliary_loss_mlp": 0.01028595, "balance_loss_clip": 1.04803848, "balance_loss_mlp": 1.02096009, "epoch": 0.6761257740636085, "flos": 17999792922240.0, "grad_norm": 2.546558619472363, "language_loss": 0.79283339, "learning_rate": 1.0029247295854984e-06, "loss": 0.8145721, "num_input_tokens_seen": 120951830, "step": 5623, "time_per_iteration": 2.4617576599121094 }, { "auxiliary_loss_clip": 0.01134581, "auxiliary_loss_mlp": 0.01025958, "balance_loss_clip": 1.04796314, "balance_loss_mlp": 1.0190202, "epoch": 0.6762460169542476, "flos": 15121912273920.0, "grad_norm": 2.0280833066145294, "language_loss": 0.71425265, "learning_rate": 1.0022495395928588e-06, "loss": 0.73585802, "num_input_tokens_seen": 120970310, "step": 5624, "time_per_iteration": 2.5078158378601074 }, { "auxiliary_loss_clip": 0.01070963, "auxiliary_loss_mlp": 0.01002205, "balance_loss_clip": 1.01545882, "balance_loss_mlp": 1.00110233, "epoch": 0.6763662598448866, "flos": 67886970030720.0, "grad_norm": 0.7908352360251991, "language_loss": 0.62347364, "learning_rate": 1.0015745009581697e-06, "loss": 0.64420533, "num_input_tokens_seen": 121031915, "step": 5625, "time_per_iteration": 3.0608749389648438 }, { "auxiliary_loss_clip": 0.01153176, "auxiliary_loss_mlp": 0.01023455, "balance_loss_clip": 1.04903865, "balance_loss_mlp": 1.01634443, "epoch": 0.6764865027355258, "flos": 20631829910400.0, "grad_norm": 1.8648441227093218, "language_loss": 0.66912121, "learning_rate": 1.0008996137838343e-06, "loss": 0.69088751, "num_input_tokens_seen": 121050890, "step": 5626, "time_per_iteration": 2.4541313648223877 }, { "auxiliary_loss_clip": 0.01174715, "auxiliary_loss_mlp": 0.01026978, "balance_loss_clip": 1.05032587, "balance_loss_mlp": 1.01912785, "epoch": 0.6766067456261649, "flos": 21215809226880.0, "grad_norm": 1.9934190685119748, "language_loss": 0.79698277, "learning_rate": 1.000224878172234e-06, "loss": 0.81899965, "num_input_tokens_seen": 121070015, "step": 5627, "time_per_iteration": 2.4292218685150146 }, { "auxiliary_loss_clip": 0.01158191, "auxiliary_loss_mlp": 0.01024436, "balance_loss_clip": 1.04711056, "balance_loss_mlp": 1.01689017, "epoch": 0.6767269885168039, "flos": 19938251220480.0, "grad_norm": 5.117672667238519, "language_loss": 0.72491854, "learning_rate": 9.99550294225724e-07, "loss": 0.74674487, "num_input_tokens_seen": 121089170, "step": 5628, "time_per_iteration": 2.4629600048065186 }, { "auxiliary_loss_clip": 0.01117135, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 1.04237509, "balance_loss_mlp": 1.02004123, "epoch": 0.6768472314074431, "flos": 20814076540800.0, "grad_norm": 2.005189705533757, "language_loss": 0.72329581, "learning_rate": 9.988758620466402e-07, "loss": 0.74474722, "num_input_tokens_seen": 121108040, "step": 5629, "time_per_iteration": 2.5583128929138184 }, { "auxiliary_loss_clip": 0.01103109, "auxiliary_loss_mlp": 0.01024231, "balance_loss_clip": 1.04010439, "balance_loss_mlp": 1.01713777, "epoch": 0.6769674742980821, "flos": 23186012169600.0, "grad_norm": 1.5534478381106351, "language_loss": 0.76141167, "learning_rate": 9.982015817372917e-07, "loss": 0.78268504, "num_input_tokens_seen": 121128480, "step": 5630, "time_per_iteration": 2.6117234230041504 }, { "auxiliary_loss_clip": 0.0110992, "auxiliary_loss_mlp": 0.01024894, "balance_loss_clip": 1.03948462, "balance_loss_mlp": 1.01715136, "epoch": 0.6770877171887212, "flos": 24242934885120.0, "grad_norm": 2.455341481587658, "language_loss": 0.81844622, "learning_rate": 9.975274533999657e-07, "loss": 0.83979434, "num_input_tokens_seen": 121148010, "step": 5631, "time_per_iteration": 2.5707125663757324 }, { "auxiliary_loss_clip": 0.01171228, "auxiliary_loss_mlp": 0.01028535, "balance_loss_clip": 1.04837656, "balance_loss_mlp": 1.02057219, "epoch": 0.6772079600793603, "flos": 18141567903360.0, "grad_norm": 2.783368541098781, "language_loss": 0.83714271, "learning_rate": 9.96853477136929e-07, "loss": 0.8591404, "num_input_tokens_seen": 121162755, "step": 5632, "time_per_iteration": 2.3982958793640137 }, { "auxiliary_loss_clip": 0.01119539, "auxiliary_loss_mlp": 0.01023011, "balance_loss_clip": 1.04142535, "balance_loss_mlp": 1.01550364, "epoch": 0.6773282029699994, "flos": 22452069571200.0, "grad_norm": 2.588647034896108, "language_loss": 0.75288522, "learning_rate": 9.96179653050422e-07, "loss": 0.77431077, "num_input_tokens_seen": 121182915, "step": 5633, "time_per_iteration": 2.650327682495117 }, { "auxiliary_loss_clip": 0.0112272, "auxiliary_loss_mlp": 0.01024975, "balance_loss_clip": 1.04434836, "balance_loss_mlp": 1.0171752, "epoch": 0.6774484458606385, "flos": 18693730748160.0, "grad_norm": 2.1216665205693612, "language_loss": 0.73863685, "learning_rate": 9.955059812426635e-07, "loss": 0.76011372, "num_input_tokens_seen": 121200445, "step": 5634, "time_per_iteration": 2.4857876300811768 }, { "auxiliary_loss_clip": 0.01172598, "auxiliary_loss_mlp": 0.01025694, "balance_loss_clip": 1.05231726, "balance_loss_mlp": 1.01793623, "epoch": 0.6775686887512776, "flos": 25994046821760.0, "grad_norm": 3.080649696603547, "language_loss": 0.82902557, "learning_rate": 9.948324618158493e-07, "loss": 0.85100847, "num_input_tokens_seen": 121220785, "step": 5635, "time_per_iteration": 2.453493118286133 }, { "auxiliary_loss_clip": 0.01156853, "auxiliary_loss_mlp": 0.01027071, "balance_loss_clip": 1.04499531, "balance_loss_mlp": 1.01935267, "epoch": 0.6776889316419167, "flos": 13587987922560.0, "grad_norm": 2.1913885316081774, "language_loss": 0.77460003, "learning_rate": 9.941590948721502e-07, "loss": 0.79643929, "num_input_tokens_seen": 121237985, "step": 5636, "time_per_iteration": 2.4147961139678955 }, { "auxiliary_loss_clip": 0.01136668, "auxiliary_loss_mlp": 0.01023797, "balance_loss_clip": 1.04621148, "balance_loss_mlp": 1.01712763, "epoch": 0.6778091745325557, "flos": 27601121220480.0, "grad_norm": 1.6560030482033634, "language_loss": 0.76536179, "learning_rate": 9.934858805137188e-07, "loss": 0.78696638, "num_input_tokens_seen": 121258635, "step": 5637, "time_per_iteration": 2.535013437271118 }, { "auxiliary_loss_clip": 0.01149996, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 1.04593265, "balance_loss_mlp": 1.01905704, "epoch": 0.6779294174231949, "flos": 18734058743040.0, "grad_norm": 1.6317033481784171, "language_loss": 0.80404937, "learning_rate": 9.92812818842677e-07, "loss": 0.82581282, "num_input_tokens_seen": 121277810, "step": 5638, "time_per_iteration": 2.4280025959014893 }, { "auxiliary_loss_clip": 0.01154159, "auxiliary_loss_mlp": 0.01024871, "balance_loss_clip": 1.04875386, "balance_loss_mlp": 1.01741755, "epoch": 0.678049660313834, "flos": 45873797765760.0, "grad_norm": 1.728243752163979, "language_loss": 0.64215094, "learning_rate": 9.921399099611306e-07, "loss": 0.66394126, "num_input_tokens_seen": 121298975, "step": 5639, "time_per_iteration": 2.7496955394744873 }, { "auxiliary_loss_clip": 0.01131616, "auxiliary_loss_mlp": 0.00762256, "balance_loss_clip": 1.04502487, "balance_loss_mlp": 1.00069189, "epoch": 0.678169903204473, "flos": 19974556892160.0, "grad_norm": 1.6785396115507052, "language_loss": 0.69072688, "learning_rate": 9.914671539711588e-07, "loss": 0.70966566, "num_input_tokens_seen": 121318495, "step": 5640, "time_per_iteration": 2.531480073928833 }, { "auxiliary_loss_clip": 0.01082379, "auxiliary_loss_mlp": 0.01023493, "balance_loss_clip": 1.04075789, "balance_loss_mlp": 1.01645339, "epoch": 0.6782901460951122, "flos": 21395613732480.0, "grad_norm": 2.152253398748627, "language_loss": 0.78218687, "learning_rate": 9.90794550974817e-07, "loss": 0.80324554, "num_input_tokens_seen": 121338890, "step": 5641, "time_per_iteration": 2.6066811084747314 }, { "auxiliary_loss_clip": 0.01122211, "auxiliary_loss_mlp": 0.01030058, "balance_loss_clip": 1.04400957, "balance_loss_mlp": 1.0220263, "epoch": 0.6784103889857512, "flos": 21434002392960.0, "grad_norm": 3.9983651578338155, "language_loss": 0.81072259, "learning_rate": 9.901221010741407e-07, "loss": 0.83224529, "num_input_tokens_seen": 121358210, "step": 5642, "time_per_iteration": 3.3362085819244385 }, { "auxiliary_loss_clip": 0.01159912, "auxiliary_loss_mlp": 0.01026965, "balance_loss_clip": 1.04770613, "balance_loss_mlp": 1.01973498, "epoch": 0.6785306318763903, "flos": 32671923091200.0, "grad_norm": 1.8751191017556763, "language_loss": 0.74953198, "learning_rate": 9.894498043711375e-07, "loss": 0.77140081, "num_input_tokens_seen": 121379955, "step": 5643, "time_per_iteration": 2.5505597591400146 }, { "auxiliary_loss_clip": 0.01139171, "auxiliary_loss_mlp": 0.01023712, "balance_loss_clip": 1.04471803, "balance_loss_mlp": 1.01592207, "epoch": 0.6786508747670293, "flos": 25632139340160.0, "grad_norm": 2.6386515211156576, "language_loss": 0.69506228, "learning_rate": 9.887776609677962e-07, "loss": 0.71669114, "num_input_tokens_seen": 121401325, "step": 5644, "time_per_iteration": 2.537651300430298 }, { "auxiliary_loss_clip": 0.01116392, "auxiliary_loss_mlp": 0.01023004, "balance_loss_clip": 1.03983223, "balance_loss_mlp": 1.01574421, "epoch": 0.6787711176576685, "flos": 19171881619200.0, "grad_norm": 1.6030564398697167, "language_loss": 0.72312951, "learning_rate": 9.88105670966079e-07, "loss": 0.74452347, "num_input_tokens_seen": 121419785, "step": 5645, "time_per_iteration": 3.385706663131714 }, { "auxiliary_loss_clip": 0.01100664, "auxiliary_loss_mlp": 0.01023297, "balance_loss_clip": 1.04292321, "balance_loss_mlp": 1.01652002, "epoch": 0.6788913605483076, "flos": 13985159581440.0, "grad_norm": 1.8973781655752906, "language_loss": 0.78938323, "learning_rate": 9.874338344679283e-07, "loss": 0.81062281, "num_input_tokens_seen": 121435630, "step": 5646, "time_per_iteration": 3.346597671508789 }, { "auxiliary_loss_clip": 0.01166025, "auxiliary_loss_mlp": 0.01025239, "balance_loss_clip": 1.0481391, "balance_loss_mlp": 1.01830745, "epoch": 0.6790116034389466, "flos": 22017586659840.0, "grad_norm": 1.6880262874068503, "language_loss": 0.73913878, "learning_rate": 9.86762151575259e-07, "loss": 0.76105142, "num_input_tokens_seen": 121455625, "step": 5647, "time_per_iteration": 2.4226245880126953 }, { "auxiliary_loss_clip": 0.0111374, "auxiliary_loss_mlp": 0.00761737, "balance_loss_clip": 1.04470301, "balance_loss_mlp": 1.00074291, "epoch": 0.6791318463295858, "flos": 20922454851840.0, "grad_norm": 1.489565348735229, "language_loss": 0.80265582, "learning_rate": 9.860906223899651e-07, "loss": 0.8214106, "num_input_tokens_seen": 121475020, "step": 5648, "time_per_iteration": 2.552877902984619 }, { "auxiliary_loss_clip": 0.01146673, "auxiliary_loss_mlp": 0.01027643, "balance_loss_clip": 1.04672384, "balance_loss_mlp": 1.02049041, "epoch": 0.6792520892202248, "flos": 28512749422080.0, "grad_norm": 1.736217289933732, "language_loss": 0.75607276, "learning_rate": 9.854192470139184e-07, "loss": 0.77781594, "num_input_tokens_seen": 121496500, "step": 5649, "time_per_iteration": 3.318429708480835 }, { "auxiliary_loss_clip": 0.01139207, "auxiliary_loss_mlp": 0.01027389, "balance_loss_clip": 1.04700947, "balance_loss_mlp": 1.02064776, "epoch": 0.6793723321108639, "flos": 20011904058240.0, "grad_norm": 2.1702929071846317, "language_loss": 0.71754479, "learning_rate": 9.847480255489645e-07, "loss": 0.73921072, "num_input_tokens_seen": 121515525, "step": 5650, "time_per_iteration": 2.4825146198272705 }, { "auxiliary_loss_clip": 0.01144409, "auxiliary_loss_mlp": 0.01023607, "balance_loss_clip": 1.04576802, "balance_loss_mlp": 1.01654983, "epoch": 0.6794925750015031, "flos": 26649488246400.0, "grad_norm": 2.8997197262430947, "language_loss": 0.69172394, "learning_rate": 9.840769580969295e-07, "loss": 0.71340412, "num_input_tokens_seen": 121535965, "step": 5651, "time_per_iteration": 2.545766830444336 }, { "auxiliary_loss_clip": 0.01147496, "auxiliary_loss_mlp": 0.01022858, "balance_loss_clip": 1.04616284, "balance_loss_mlp": 1.01576543, "epoch": 0.6796128178921421, "flos": 21580374314880.0, "grad_norm": 1.863022121432462, "language_loss": 0.79777551, "learning_rate": 9.834060447596114e-07, "loss": 0.81947905, "num_input_tokens_seen": 121555235, "step": 5652, "time_per_iteration": 2.531484603881836 }, { "auxiliary_loss_clip": 0.01156653, "auxiliary_loss_mlp": 0.01022512, "balance_loss_clip": 1.04559255, "balance_loss_mlp": 1.01466799, "epoch": 0.6797330607827812, "flos": 22492002516480.0, "grad_norm": 2.1144039437172575, "language_loss": 0.77824962, "learning_rate": 9.827352856387868e-07, "loss": 0.80004132, "num_input_tokens_seen": 121574945, "step": 5653, "time_per_iteration": 2.479072332382202 }, { "auxiliary_loss_clip": 0.01021827, "auxiliary_loss_mlp": 0.01002487, "balance_loss_clip": 1.01295829, "balance_loss_mlp": 1.00139046, "epoch": 0.6798533036734203, "flos": 66306648286080.0, "grad_norm": 0.7751288183603556, "language_loss": 0.64240062, "learning_rate": 9.820646808362118e-07, "loss": 0.66264367, "num_input_tokens_seen": 121641200, "step": 5654, "time_per_iteration": 3.2031667232513428 }, { "auxiliary_loss_clip": 0.01136489, "auxiliary_loss_mlp": 0.01026423, "balance_loss_clip": 1.04510152, "balance_loss_mlp": 1.01930022, "epoch": 0.6799735465640594, "flos": 16180163792640.0, "grad_norm": 2.072371955800868, "language_loss": 0.72879732, "learning_rate": 9.813942304536154e-07, "loss": 0.75042641, "num_input_tokens_seen": 121659170, "step": 5655, "time_per_iteration": 2.459920644760132 }, { "auxiliary_loss_clip": 0.01141359, "auxiliary_loss_mlp": 0.01028855, "balance_loss_clip": 1.04512191, "balance_loss_mlp": 1.02179813, "epoch": 0.6800937894546984, "flos": 22125749489280.0, "grad_norm": 1.7380389302859622, "language_loss": 0.63639247, "learning_rate": 9.807239345927043e-07, "loss": 0.65809464, "num_input_tokens_seen": 121679180, "step": 5656, "time_per_iteration": 2.489053249359131 }, { "auxiliary_loss_clip": 0.01142877, "auxiliary_loss_mlp": 0.0102627, "balance_loss_clip": 1.042503, "balance_loss_mlp": 1.01863456, "epoch": 0.6802140323453376, "flos": 31612953300480.0, "grad_norm": 2.528153415639979, "language_loss": 0.72309339, "learning_rate": 9.80053793355162e-07, "loss": 0.74478483, "num_input_tokens_seen": 121697875, "step": 5657, "time_per_iteration": 2.5537240505218506 }, { "auxiliary_loss_clip": 0.01109629, "auxiliary_loss_mlp": 0.0102752, "balance_loss_clip": 1.04368401, "balance_loss_mlp": 1.01950884, "epoch": 0.6803342752359767, "flos": 17712938908800.0, "grad_norm": 2.1152385311104633, "language_loss": 0.7483629, "learning_rate": 9.793838068426472e-07, "loss": 0.76973438, "num_input_tokens_seen": 121715570, "step": 5658, "time_per_iteration": 2.50657057762146 }, { "auxiliary_loss_clip": 0.01168706, "auxiliary_loss_mlp": 0.01025539, "balance_loss_clip": 1.04902971, "balance_loss_mlp": 1.01799059, "epoch": 0.6804545181266157, "flos": 11326800902400.0, "grad_norm": 2.6904177023535283, "language_loss": 0.60966849, "learning_rate": 9.78713975156799e-07, "loss": 0.63161087, "num_input_tokens_seen": 121731435, "step": 5659, "time_per_iteration": 2.4328441619873047 }, { "auxiliary_loss_clip": 0.01130212, "auxiliary_loss_mlp": 0.01024989, "balance_loss_clip": 1.04981244, "balance_loss_mlp": 1.01722813, "epoch": 0.6805747610172549, "flos": 29350976181120.0, "grad_norm": 1.6504029865941823, "language_loss": 0.71490681, "learning_rate": 9.780442983992273e-07, "loss": 0.73645884, "num_input_tokens_seen": 121749950, "step": 5660, "time_per_iteration": 2.57289719581604 }, { "auxiliary_loss_clip": 0.01135305, "auxiliary_loss_mlp": 0.01025417, "balance_loss_clip": 1.04475737, "balance_loss_mlp": 1.01796675, "epoch": 0.680695003907894, "flos": 37631868612480.0, "grad_norm": 1.707315684940208, "language_loss": 0.71813023, "learning_rate": 9.773747766715238e-07, "loss": 0.73973745, "num_input_tokens_seen": 121770770, "step": 5661, "time_per_iteration": 2.6178576946258545 }, { "auxiliary_loss_clip": 0.01141844, "auxiliary_loss_mlp": 0.01026165, "balance_loss_clip": 1.04405189, "balance_loss_mlp": 1.01876211, "epoch": 0.680815246798533, "flos": 22127365601280.0, "grad_norm": 1.9599068485474205, "language_loss": 0.80376077, "learning_rate": 9.767054100752536e-07, "loss": 0.82544082, "num_input_tokens_seen": 121790720, "step": 5662, "time_per_iteration": 2.475688934326172 }, { "auxiliary_loss_clip": 0.01128996, "auxiliary_loss_mlp": 0.01024863, "balance_loss_clip": 1.04531837, "balance_loss_mlp": 1.01746607, "epoch": 0.6809354896891722, "flos": 17201822330880.0, "grad_norm": 1.8482289044336226, "language_loss": 0.81643987, "learning_rate": 9.760361987119584e-07, "loss": 0.83797848, "num_input_tokens_seen": 121808455, "step": 5663, "time_per_iteration": 2.4713306427001953 }, { "auxiliary_loss_clip": 0.011397, "auxiliary_loss_mlp": 0.01027554, "balance_loss_clip": 1.04399204, "balance_loss_mlp": 1.01951385, "epoch": 0.6810557325798112, "flos": 12458166554880.0, "grad_norm": 1.8615498439985667, "language_loss": 0.67311943, "learning_rate": 9.753671426831592e-07, "loss": 0.69479203, "num_input_tokens_seen": 121824470, "step": 5664, "time_per_iteration": 2.43747878074646 }, { "auxiliary_loss_clip": 0.01148724, "auxiliary_loss_mlp": 0.01026749, "balance_loss_clip": 1.04486561, "balance_loss_mlp": 1.01945019, "epoch": 0.6811759754704503, "flos": 22156165330560.0, "grad_norm": 1.7824829079284061, "language_loss": 0.79699397, "learning_rate": 9.746982420903483e-07, "loss": 0.81874871, "num_input_tokens_seen": 121842665, "step": 5665, "time_per_iteration": 2.4514105319976807 }, { "auxiliary_loss_clip": 0.0115555, "auxiliary_loss_mlp": 0.0102469, "balance_loss_clip": 1.05116224, "balance_loss_mlp": 1.0178926, "epoch": 0.6812962183610894, "flos": 17525377065600.0, "grad_norm": 1.9972980751986613, "language_loss": 0.74940825, "learning_rate": 9.740294970349993e-07, "loss": 0.77121067, "num_input_tokens_seen": 121859080, "step": 5666, "time_per_iteration": 2.420055627822876 }, { "auxiliary_loss_clip": 0.01049221, "auxiliary_loss_mlp": 0.01001837, "balance_loss_clip": 1.01485109, "balance_loss_mlp": 1.00085402, "epoch": 0.6814164612517285, "flos": 60274480855680.0, "grad_norm": 0.8743419920956141, "language_loss": 0.60887706, "learning_rate": 9.733609076185594e-07, "loss": 0.62938762, "num_input_tokens_seen": 121915485, "step": 5667, "time_per_iteration": 2.9459009170532227 }, { "auxiliary_loss_clip": 0.01154799, "auxiliary_loss_mlp": 0.01030116, "balance_loss_clip": 1.04893303, "balance_loss_mlp": 1.02233171, "epoch": 0.6815367041423676, "flos": 19317750750720.0, "grad_norm": 1.7976735443505019, "language_loss": 0.83713353, "learning_rate": 9.72692473942455e-07, "loss": 0.85898274, "num_input_tokens_seen": 121932710, "step": 5668, "time_per_iteration": 2.4389798641204834 }, { "auxiliary_loss_clip": 0.01120624, "auxiliary_loss_mlp": 0.01024956, "balance_loss_clip": 1.04871285, "balance_loss_mlp": 1.01714134, "epoch": 0.6816569470330067, "flos": 22161696024960.0, "grad_norm": 4.137210942081937, "language_loss": 0.77749372, "learning_rate": 9.720241961080849e-07, "loss": 0.79894954, "num_input_tokens_seen": 121952025, "step": 5669, "time_per_iteration": 3.269195556640625 }, { "auxiliary_loss_clip": 0.01168316, "auxiliary_loss_mlp": 0.01026068, "balance_loss_clip": 1.04765201, "balance_loss_mlp": 1.01866829, "epoch": 0.6817771899236458, "flos": 41463501137280.0, "grad_norm": 2.2527367203781505, "language_loss": 0.73023182, "learning_rate": 9.713560742168259e-07, "loss": 0.75217569, "num_input_tokens_seen": 121974650, "step": 5670, "time_per_iteration": 2.5809009075164795 }, { "auxiliary_loss_clip": 0.01125893, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.04355431, "balance_loss_mlp": 1.02238309, "epoch": 0.6818974328142848, "flos": 21106138026240.0, "grad_norm": 1.8757381891085267, "language_loss": 0.71325529, "learning_rate": 9.706881083700333e-07, "loss": 0.73480815, "num_input_tokens_seen": 121994335, "step": 5671, "time_per_iteration": 3.3486883640289307 }, { "auxiliary_loss_clip": 0.01100251, "auxiliary_loss_mlp": 0.01028385, "balance_loss_clip": 1.04716277, "balance_loss_mlp": 1.02052331, "epoch": 0.682017675704924, "flos": 20441897769600.0, "grad_norm": 1.9867108097680781, "language_loss": 0.82230723, "learning_rate": 9.700202986690357e-07, "loss": 0.84359366, "num_input_tokens_seen": 122012635, "step": 5672, "time_per_iteration": 2.5564680099487305 }, { "auxiliary_loss_clip": 0.0115358, "auxiliary_loss_mlp": 0.00762533, "balance_loss_clip": 1.04692459, "balance_loss_mlp": 1.00061178, "epoch": 0.682137918595563, "flos": 20044438801920.0, "grad_norm": 1.8920798058221098, "language_loss": 0.66328001, "learning_rate": 9.693526452151413e-07, "loss": 0.68244117, "num_input_tokens_seen": 122031685, "step": 5673, "time_per_iteration": 3.2407755851745605 }, { "auxiliary_loss_clip": 0.01132798, "auxiliary_loss_mlp": 0.01023499, "balance_loss_clip": 1.04496288, "balance_loss_mlp": 1.01537526, "epoch": 0.6822581614862021, "flos": 31684559063040.0, "grad_norm": 1.697455899643641, "language_loss": 0.75330949, "learning_rate": 9.686851481096305e-07, "loss": 0.77487242, "num_input_tokens_seen": 122052995, "step": 5674, "time_per_iteration": 2.5931127071380615 }, { "auxiliary_loss_clip": 0.01099548, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 1.04274535, "balance_loss_mlp": 1.01905751, "epoch": 0.6823784043768413, "flos": 23477570864640.0, "grad_norm": 1.8882355076551773, "language_loss": 0.7173987, "learning_rate": 9.68017807453762e-07, "loss": 0.73866153, "num_input_tokens_seen": 122071740, "step": 5675, "time_per_iteration": 2.5883944034576416 }, { "auxiliary_loss_clip": 0.01143675, "auxiliary_loss_mlp": 0.00762078, "balance_loss_clip": 1.04831815, "balance_loss_mlp": 1.00057149, "epoch": 0.6824986472674803, "flos": 14137134024960.0, "grad_norm": 1.8058269271248202, "language_loss": 0.72952092, "learning_rate": 9.673506233487721e-07, "loss": 0.74857843, "num_input_tokens_seen": 122089705, "step": 5676, "time_per_iteration": 3.24983811378479 }, { "auxiliary_loss_clip": 0.01141474, "auxiliary_loss_mlp": 0.00761906, "balance_loss_clip": 1.04461622, "balance_loss_mlp": 1.00050855, "epoch": 0.6826188901581194, "flos": 21504997624320.0, "grad_norm": 1.680776358043912, "language_loss": 0.8608079, "learning_rate": 9.666835958958717e-07, "loss": 0.87984169, "num_input_tokens_seen": 122109025, "step": 5677, "time_per_iteration": 2.4849982261657715 }, { "auxiliary_loss_clip": 0.01167742, "auxiliary_loss_mlp": 0.01020407, "balance_loss_clip": 1.04891181, "balance_loss_mlp": 1.01343036, "epoch": 0.6827391330487584, "flos": 20810126044800.0, "grad_norm": 2.4835521219678673, "language_loss": 0.80907482, "learning_rate": 9.660167251962484e-07, "loss": 0.83095634, "num_input_tokens_seen": 122127385, "step": 5678, "time_per_iteration": 2.4099223613739014 }, { "auxiliary_loss_clip": 0.01129298, "auxiliary_loss_mlp": 0.01023543, "balance_loss_clip": 1.04319572, "balance_loss_mlp": 1.01692426, "epoch": 0.6828593759393976, "flos": 21688788539520.0, "grad_norm": 10.157356116017443, "language_loss": 0.77760178, "learning_rate": 9.653500113510654e-07, "loss": 0.7991302, "num_input_tokens_seen": 122146500, "step": 5679, "time_per_iteration": 2.554255485534668 }, { "auxiliary_loss_clip": 0.01134528, "auxiliary_loss_mlp": 0.01027876, "balance_loss_clip": 1.04295611, "balance_loss_mlp": 1.02032137, "epoch": 0.6829796188300367, "flos": 25337707557120.0, "grad_norm": 2.442300309763335, "language_loss": 0.67099756, "learning_rate": 9.646834544614627e-07, "loss": 0.69262159, "num_input_tokens_seen": 122167000, "step": 5680, "time_per_iteration": 2.537274122238159 }, { "auxiliary_loss_clip": 0.01132529, "auxiliary_loss_mlp": 0.01026343, "balance_loss_clip": 1.04588199, "balance_loss_mlp": 1.0193994, "epoch": 0.6830998617206757, "flos": 20704800389760.0, "grad_norm": 1.7629091437102198, "language_loss": 0.76264429, "learning_rate": 9.64017054628558e-07, "loss": 0.78423297, "num_input_tokens_seen": 122185825, "step": 5681, "time_per_iteration": 2.467233419418335 }, { "auxiliary_loss_clip": 0.01114737, "auxiliary_loss_mlp": 0.01024885, "balance_loss_clip": 1.04224157, "balance_loss_mlp": 1.01790273, "epoch": 0.6832201046113149, "flos": 21726638496000.0, "grad_norm": 1.7542470196537288, "language_loss": 0.78997815, "learning_rate": 9.63350811953441e-07, "loss": 0.81137443, "num_input_tokens_seen": 122206200, "step": 5682, "time_per_iteration": 2.5623183250427246 }, { "auxiliary_loss_clip": 0.01127531, "auxiliary_loss_mlp": 0.01022149, "balance_loss_clip": 1.04386592, "balance_loss_mlp": 1.01493406, "epoch": 0.6833403475019539, "flos": 19536554448000.0, "grad_norm": 2.8896546595659363, "language_loss": 0.7039206, "learning_rate": 9.626847265371826e-07, "loss": 0.72541738, "num_input_tokens_seen": 122225520, "step": 5683, "time_per_iteration": 2.512218475341797 }, { "auxiliary_loss_clip": 0.01131057, "auxiliary_loss_mlp": 0.01025975, "balance_loss_clip": 1.04320049, "balance_loss_mlp": 1.01885772, "epoch": 0.683460590392593, "flos": 19352153001600.0, "grad_norm": 2.0864397872367526, "language_loss": 0.78533518, "learning_rate": 9.620187984808262e-07, "loss": 0.80690545, "num_input_tokens_seen": 122244320, "step": 5684, "time_per_iteration": 2.486323833465576 }, { "auxiliary_loss_clip": 0.01139136, "auxiliary_loss_mlp": 0.00761744, "balance_loss_clip": 1.04664588, "balance_loss_mlp": 1.0005579, "epoch": 0.6835808332832322, "flos": 23288500650240.0, "grad_norm": 1.610999900544913, "language_loss": 0.85963929, "learning_rate": 9.613530278853919e-07, "loss": 0.8786481, "num_input_tokens_seen": 122264295, "step": 5685, "time_per_iteration": 2.494279384613037 }, { "auxiliary_loss_clip": 0.01155453, "auxiliary_loss_mlp": 0.01023088, "balance_loss_clip": 1.04990995, "balance_loss_mlp": 1.01598287, "epoch": 0.6837010761738712, "flos": 21653416621440.0, "grad_norm": 2.021045836357281, "language_loss": 0.74426723, "learning_rate": 9.60687414851879e-07, "loss": 0.76605266, "num_input_tokens_seen": 122285300, "step": 5686, "time_per_iteration": 2.4731569290161133 }, { "auxiliary_loss_clip": 0.01143967, "auxiliary_loss_mlp": 0.01025345, "balance_loss_clip": 1.0489378, "balance_loss_mlp": 1.01801658, "epoch": 0.6838213190645103, "flos": 17566387418880.0, "grad_norm": 2.3009906422321733, "language_loss": 0.77402008, "learning_rate": 9.600219594812575e-07, "loss": 0.79571319, "num_input_tokens_seen": 122303240, "step": 5687, "time_per_iteration": 2.448185920715332 }, { "auxiliary_loss_clip": 0.01167513, "auxiliary_loss_mlp": 0.01024154, "balance_loss_clip": 1.04848981, "balance_loss_mlp": 1.01726413, "epoch": 0.6839415619551494, "flos": 23112538899840.0, "grad_norm": 1.692974361196155, "language_loss": 0.72805774, "learning_rate": 9.593566618744786e-07, "loss": 0.74997437, "num_input_tokens_seen": 122323390, "step": 5688, "time_per_iteration": 2.4155213832855225 }, { "auxiliary_loss_clip": 0.01167908, "auxiliary_loss_mlp": 0.01026168, "balance_loss_clip": 1.04748166, "balance_loss_mlp": 1.01888442, "epoch": 0.6840618048457885, "flos": 22127868391680.0, "grad_norm": 1.810355264466806, "language_loss": 0.73995751, "learning_rate": 9.58691522132466e-07, "loss": 0.76189828, "num_input_tokens_seen": 122342200, "step": 5689, "time_per_iteration": 2.4185643196105957 }, { "auxiliary_loss_clip": 0.01146562, "auxiliary_loss_mlp": 0.01025276, "balance_loss_clip": 1.04885101, "balance_loss_mlp": 1.01764023, "epoch": 0.6841820477364275, "flos": 22015898720640.0, "grad_norm": 2.397166713152011, "language_loss": 0.84819698, "learning_rate": 9.58026540356123e-07, "loss": 0.86991537, "num_input_tokens_seen": 122360465, "step": 5690, "time_per_iteration": 2.4903764724731445 }, { "auxiliary_loss_clip": 0.01155678, "auxiliary_loss_mlp": 0.01025346, "balance_loss_clip": 1.04545617, "balance_loss_mlp": 1.01803267, "epoch": 0.6843022906270667, "flos": 24900531125760.0, "grad_norm": 1.6046056119883991, "language_loss": 0.86712223, "learning_rate": 9.573617166463246e-07, "loss": 0.88893247, "num_input_tokens_seen": 122381680, "step": 5691, "time_per_iteration": 2.4692256450653076 }, { "auxiliary_loss_clip": 0.01142201, "auxiliary_loss_mlp": 0.01025309, "balance_loss_clip": 1.04382288, "balance_loss_mlp": 1.0188067, "epoch": 0.6844225335177058, "flos": 19969924037760.0, "grad_norm": 3.558523135121771, "language_loss": 0.60136461, "learning_rate": 9.56697051103924e-07, "loss": 0.62303966, "num_input_tokens_seen": 122399120, "step": 5692, "time_per_iteration": 2.4762392044067383 }, { "auxiliary_loss_clip": 0.01137827, "auxiliary_loss_mlp": 0.01026207, "balance_loss_clip": 1.04455853, "balance_loss_mlp": 1.01884627, "epoch": 0.6845427764083448, "flos": 25883334126720.0, "grad_norm": 2.2050805970703644, "language_loss": 0.81205273, "learning_rate": 9.560325438297522e-07, "loss": 0.83369303, "num_input_tokens_seen": 122417430, "step": 5693, "time_per_iteration": 2.4971835613250732 }, { "auxiliary_loss_clip": 0.01144783, "auxiliary_loss_mlp": 0.01023142, "balance_loss_clip": 1.05141211, "balance_loss_mlp": 1.01629364, "epoch": 0.684663019298984, "flos": 18880143356160.0, "grad_norm": 1.7168429276221087, "language_loss": 0.86630297, "learning_rate": 9.553681949246127e-07, "loss": 0.88798225, "num_input_tokens_seen": 122435055, "step": 5694, "time_per_iteration": 2.5113637447357178 }, { "auxiliary_loss_clip": 0.01130482, "auxiliary_loss_mlp": 0.01026273, "balance_loss_clip": 1.0452404, "balance_loss_mlp": 1.01836371, "epoch": 0.684783262189623, "flos": 54193725302400.0, "grad_norm": 1.962164242458883, "language_loss": 0.74914157, "learning_rate": 9.547040044892886e-07, "loss": 0.77070916, "num_input_tokens_seen": 122462570, "step": 5695, "time_per_iteration": 3.528773069381714 }, { "auxiliary_loss_clip": 0.01062137, "auxiliary_loss_mlp": 0.01000976, "balance_loss_clip": 1.01544106, "balance_loss_mlp": 0.9999153, "epoch": 0.6849035050802621, "flos": 63970264143360.0, "grad_norm": 0.8630307867586, "language_loss": 0.60133076, "learning_rate": 9.540399726245354e-07, "loss": 0.62196183, "num_input_tokens_seen": 122519275, "step": 5696, "time_per_iteration": 2.92171573638916 }, { "auxiliary_loss_clip": 0.01136223, "auxiliary_loss_mlp": 0.01026326, "balance_loss_clip": 1.04433775, "balance_loss_mlp": 1.01846409, "epoch": 0.6850237479709013, "flos": 25224121774080.0, "grad_norm": 1.6878599159076844, "language_loss": 0.68831736, "learning_rate": 9.533760994310859e-07, "loss": 0.70994282, "num_input_tokens_seen": 122539675, "step": 5697, "time_per_iteration": 2.5102057456970215 }, { "auxiliary_loss_clip": 0.0116935, "auxiliary_loss_mlp": 0.01024984, "balance_loss_clip": 1.04903245, "balance_loss_mlp": 1.01768196, "epoch": 0.6851439908615403, "flos": 19354128249600.0, "grad_norm": 2.040418355002968, "language_loss": 0.75328302, "learning_rate": 9.527123850096508e-07, "loss": 0.77522635, "num_input_tokens_seen": 122558035, "step": 5698, "time_per_iteration": 3.246387004852295 }, { "auxiliary_loss_clip": 0.01159818, "auxiliary_loss_mlp": 0.01022887, "balance_loss_clip": 1.04740846, "balance_loss_mlp": 1.01582348, "epoch": 0.6852642337521794, "flos": 23182133500800.0, "grad_norm": 1.8724907840886837, "language_loss": 0.71947205, "learning_rate": 9.520488294609142e-07, "loss": 0.74129915, "num_input_tokens_seen": 122576815, "step": 5699, "time_per_iteration": 2.447057008743286 }, { "auxiliary_loss_clip": 0.01029632, "auxiliary_loss_mlp": 0.01002322, "balance_loss_clip": 1.01675105, "balance_loss_mlp": 1.00125527, "epoch": 0.6853844766428185, "flos": 62647206583680.0, "grad_norm": 0.7445548108741515, "language_loss": 0.53871405, "learning_rate": 9.513854328855368e-07, "loss": 0.55903363, "num_input_tokens_seen": 122634690, "step": 5700, "time_per_iteration": 3.9129080772399902 }, { "auxiliary_loss_clip": 0.01167198, "auxiliary_loss_mlp": 0.01024318, "balance_loss_clip": 1.04985428, "balance_loss_mlp": 1.01730883, "epoch": 0.6855047195334576, "flos": 23437242869760.0, "grad_norm": 2.0281115486823693, "language_loss": 0.81323457, "learning_rate": 9.507221953841558e-07, "loss": 0.83514977, "num_input_tokens_seen": 122652320, "step": 5701, "time_per_iteration": 2.4152815341949463 }, { "auxiliary_loss_clip": 0.01158487, "auxiliary_loss_mlp": 0.01024381, "balance_loss_clip": 1.05084181, "balance_loss_mlp": 1.01668656, "epoch": 0.6856249624240967, "flos": 20664831530880.0, "grad_norm": 1.5102995450698231, "language_loss": 0.78113216, "learning_rate": 9.500591170573824e-07, "loss": 0.80296087, "num_input_tokens_seen": 122672340, "step": 5702, "time_per_iteration": 2.4542086124420166 }, { "auxiliary_loss_clip": 0.0110879, "auxiliary_loss_mlp": 0.01023902, "balance_loss_clip": 1.04199123, "balance_loss_mlp": 1.01666617, "epoch": 0.6857452053147358, "flos": 17087302794240.0, "grad_norm": 1.9622124326126513, "language_loss": 0.74071407, "learning_rate": 9.493961980058078e-07, "loss": 0.76204097, "num_input_tokens_seen": 122689935, "step": 5703, "time_per_iteration": 3.2859106063842773 }, { "auxiliary_loss_clip": 0.01083469, "auxiliary_loss_mlp": 0.0102494, "balance_loss_clip": 1.03926325, "balance_loss_mlp": 1.01776648, "epoch": 0.6858654482053749, "flos": 30847266057600.0, "grad_norm": 1.9613972706816656, "language_loss": 0.67700058, "learning_rate": 9.48733438329993e-07, "loss": 0.69808471, "num_input_tokens_seen": 122710200, "step": 5704, "time_per_iteration": 2.638317823410034 }, { "auxiliary_loss_clip": 0.01168297, "auxiliary_loss_mlp": 0.00762092, "balance_loss_clip": 1.05002117, "balance_loss_mlp": 1.0005399, "epoch": 0.6859856910960139, "flos": 28877314510080.0, "grad_norm": 1.7804988203379646, "language_loss": 0.74381697, "learning_rate": 9.480708381304807e-07, "loss": 0.76312089, "num_input_tokens_seen": 122731495, "step": 5705, "time_per_iteration": 2.4973254203796387 }, { "auxiliary_loss_clip": 0.01109901, "auxiliary_loss_mlp": 0.01028459, "balance_loss_clip": 1.04580879, "balance_loss_mlp": 1.02088904, "epoch": 0.6861059339866531, "flos": 19354523299200.0, "grad_norm": 2.3999947672265005, "language_loss": 0.83796859, "learning_rate": 9.474083975077858e-07, "loss": 0.85935223, "num_input_tokens_seen": 122748620, "step": 5706, "time_per_iteration": 2.5197863578796387 }, { "auxiliary_loss_clip": 0.01148622, "auxiliary_loss_mlp": 0.01022976, "balance_loss_clip": 1.04636383, "balance_loss_mlp": 1.01590347, "epoch": 0.6862261768772921, "flos": 22199976944640.0, "grad_norm": 9.835527263764096, "language_loss": 0.80111837, "learning_rate": 9.467461165623994e-07, "loss": 0.82283431, "num_input_tokens_seen": 122767670, "step": 5707, "time_per_iteration": 2.442981004714966 }, { "auxiliary_loss_clip": 0.01158062, "auxiliary_loss_mlp": 0.01022686, "balance_loss_clip": 1.04757833, "balance_loss_mlp": 1.01580453, "epoch": 0.6863464197679312, "flos": 26285677344000.0, "grad_norm": 2.144117148262155, "language_loss": 0.79662561, "learning_rate": 9.46083995394791e-07, "loss": 0.81843305, "num_input_tokens_seen": 122785480, "step": 5708, "time_per_iteration": 2.5060737133026123 }, { "auxiliary_loss_clip": 0.01155173, "auxiliary_loss_mlp": 0.00761378, "balance_loss_clip": 1.04716969, "balance_loss_mlp": 1.00051546, "epoch": 0.6864666626585703, "flos": 37815228564480.0, "grad_norm": 2.481138689271586, "language_loss": 0.63409549, "learning_rate": 9.454220341054012e-07, "loss": 0.65326095, "num_input_tokens_seen": 122810265, "step": 5709, "time_per_iteration": 2.598345994949341 }, { "auxiliary_loss_clip": 0.01123225, "auxiliary_loss_mlp": 0.01029761, "balance_loss_clip": 1.04328811, "balance_loss_mlp": 1.02247143, "epoch": 0.6865869055492094, "flos": 19391152193280.0, "grad_norm": 1.9070596138514655, "language_loss": 0.80716562, "learning_rate": 9.447602327946512e-07, "loss": 0.82869542, "num_input_tokens_seen": 122828905, "step": 5710, "time_per_iteration": 2.4986867904663086 }, { "auxiliary_loss_clip": 0.01139256, "auxiliary_loss_mlp": 0.01026968, "balance_loss_clip": 1.0451231, "balance_loss_mlp": 1.01943982, "epoch": 0.6867071484398485, "flos": 20375966355840.0, "grad_norm": 1.8066430502567807, "language_loss": 0.76586872, "learning_rate": 9.440985915629338e-07, "loss": 0.7875309, "num_input_tokens_seen": 122846235, "step": 5711, "time_per_iteration": 2.4718642234802246 }, { "auxiliary_loss_clip": 0.01168395, "auxiliary_loss_mlp": 0.01024389, "balance_loss_clip": 1.05066097, "balance_loss_mlp": 1.01735544, "epoch": 0.6868273913304875, "flos": 15889143801600.0, "grad_norm": 2.196203337506372, "language_loss": 0.72919637, "learning_rate": 9.434371105106223e-07, "loss": 0.7511242, "num_input_tokens_seen": 122863835, "step": 5712, "time_per_iteration": 2.3857431411743164 }, { "auxiliary_loss_clip": 0.0112347, "auxiliary_loss_mlp": 0.01026019, "balance_loss_clip": 1.04348886, "balance_loss_mlp": 1.01850295, "epoch": 0.6869476342211267, "flos": 24462492768000.0, "grad_norm": 1.7184899821878972, "language_loss": 0.70556158, "learning_rate": 9.427757897380602e-07, "loss": 0.72705644, "num_input_tokens_seen": 122883235, "step": 5713, "time_per_iteration": 2.5473453998565674 }, { "auxiliary_loss_clip": 0.0112479, "auxiliary_loss_mlp": 0.01022913, "balance_loss_clip": 1.04569185, "balance_loss_mlp": 1.01515865, "epoch": 0.6870678771117658, "flos": 18442571875200.0, "grad_norm": 2.286378730540375, "language_loss": 0.84600478, "learning_rate": 9.421146293455695e-07, "loss": 0.86748171, "num_input_tokens_seen": 122898975, "step": 5714, "time_per_iteration": 2.5033187866210938 }, { "auxiliary_loss_clip": 0.0113827, "auxiliary_loss_mlp": 0.0102412, "balance_loss_clip": 1.04431129, "balance_loss_mlp": 1.01693487, "epoch": 0.6871881200024048, "flos": 22200371994240.0, "grad_norm": 1.788128408753099, "language_loss": 0.68465519, "learning_rate": 9.414536294334489e-07, "loss": 0.70627904, "num_input_tokens_seen": 122918995, "step": 5715, "time_per_iteration": 2.5033304691314697 }, { "auxiliary_loss_clip": 0.01141822, "auxiliary_loss_mlp": 0.01025809, "balance_loss_clip": 1.04271805, "balance_loss_mlp": 1.01810789, "epoch": 0.687308362893044, "flos": 22127724737280.0, "grad_norm": 2.006388834957188, "language_loss": 0.6968714, "learning_rate": 9.407927901019708e-07, "loss": 0.7185477, "num_input_tokens_seen": 122938125, "step": 5716, "time_per_iteration": 2.4821105003356934 }, { "auxiliary_loss_clip": 0.01154673, "auxiliary_loss_mlp": 0.01023301, "balance_loss_clip": 1.047176, "balance_loss_mlp": 1.01627409, "epoch": 0.687428605783683, "flos": 25040546340480.0, "grad_norm": 2.7207932885444213, "language_loss": 0.76633137, "learning_rate": 9.401321114513854e-07, "loss": 0.78811109, "num_input_tokens_seen": 122957020, "step": 5717, "time_per_iteration": 2.4805705547332764 }, { "auxiliary_loss_clip": 0.01170348, "auxiliary_loss_mlp": 0.010273, "balance_loss_clip": 1.04962957, "balance_loss_mlp": 1.0198853, "epoch": 0.6875488486743221, "flos": 23770063313280.0, "grad_norm": 1.8699197166872108, "language_loss": 0.74892223, "learning_rate": 9.394715935819155e-07, "loss": 0.7708987, "num_input_tokens_seen": 122977410, "step": 5718, "time_per_iteration": 2.457313060760498 }, { "auxiliary_loss_clip": 0.01158266, "auxiliary_loss_mlp": 0.01027903, "balance_loss_clip": 1.04766273, "balance_loss_mlp": 1.0207293, "epoch": 0.6876690915649613, "flos": 25516937445120.0, "grad_norm": 3.5285869913030243, "language_loss": 0.61999249, "learning_rate": 9.388112365937608e-07, "loss": 0.64185417, "num_input_tokens_seen": 122996875, "step": 5719, "time_per_iteration": 2.4832680225372314 }, { "auxiliary_loss_clip": 0.01127298, "auxiliary_loss_mlp": 0.01022468, "balance_loss_clip": 1.04493904, "balance_loss_mlp": 1.014925, "epoch": 0.6877893344556003, "flos": 19427996568960.0, "grad_norm": 2.416596294166508, "language_loss": 0.82506192, "learning_rate": 9.381510405870985e-07, "loss": 0.84655952, "num_input_tokens_seen": 123015890, "step": 5720, "time_per_iteration": 2.497011661529541 }, { "auxiliary_loss_clip": 0.01154961, "auxiliary_loss_mlp": 0.01024888, "balance_loss_clip": 1.0477922, "balance_loss_mlp": 1.01751471, "epoch": 0.6879095773462394, "flos": 18661303745280.0, "grad_norm": 2.97680934093026, "language_loss": 0.77138931, "learning_rate": 9.374910056620791e-07, "loss": 0.79318774, "num_input_tokens_seen": 123034955, "step": 5721, "time_per_iteration": 2.4259798526763916 }, { "auxiliary_loss_clip": 0.01156637, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.04807711, "balance_loss_mlp": 1.02099884, "epoch": 0.6880298202368785, "flos": 20883132437760.0, "grad_norm": 2.298995768917676, "language_loss": 0.80937731, "learning_rate": 9.368311319188293e-07, "loss": 0.83122969, "num_input_tokens_seen": 123052770, "step": 5722, "time_per_iteration": 3.2340428829193115 }, { "auxiliary_loss_clip": 0.01126149, "auxiliary_loss_mlp": 0.01023562, "balance_loss_clip": 1.0438261, "balance_loss_mlp": 1.01648116, "epoch": 0.6881500631275176, "flos": 30153292318080.0, "grad_norm": 1.6718056376974701, "language_loss": 0.7943576, "learning_rate": 9.361714194574515e-07, "loss": 0.81585473, "num_input_tokens_seen": 123075105, "step": 5723, "time_per_iteration": 2.5868897438049316 }, { "auxiliary_loss_clip": 0.01071363, "auxiliary_loss_mlp": 0.01001711, "balance_loss_clip": 1.01588881, "balance_loss_mlp": 1.00064361, "epoch": 0.6882703060181566, "flos": 66181537215360.0, "grad_norm": 0.730133418288036, "language_loss": 0.58289182, "learning_rate": 9.355118683780228e-07, "loss": 0.60362256, "num_input_tokens_seen": 123145175, "step": 5724, "time_per_iteration": 3.102224349975586 }, { "auxiliary_loss_clip": 0.01167932, "auxiliary_loss_mlp": 0.01025853, "balance_loss_clip": 1.04799485, "balance_loss_mlp": 1.01851535, "epoch": 0.6883905489087958, "flos": 18214646123520.0, "grad_norm": 2.096248042270591, "language_loss": 0.79752886, "learning_rate": 9.348524787805987e-07, "loss": 0.81946671, "num_input_tokens_seen": 123160365, "step": 5725, "time_per_iteration": 3.5191938877105713 }, { "auxiliary_loss_clip": 0.01129102, "auxiliary_loss_mlp": 0.010215, "balance_loss_clip": 1.04172623, "balance_loss_mlp": 1.01447535, "epoch": 0.6885107917994349, "flos": 14056262553600.0, "grad_norm": 4.922032093977957, "language_loss": 0.8538748, "learning_rate": 9.341932507652053e-07, "loss": 0.87538081, "num_input_tokens_seen": 123174855, "step": 5726, "time_per_iteration": 2.519352436065674 }, { "auxiliary_loss_clip": 0.01138951, "auxiliary_loss_mlp": 0.01027875, "balance_loss_clip": 1.04224718, "balance_loss_mlp": 1.01955998, "epoch": 0.6886310346900739, "flos": 28690722334080.0, "grad_norm": 1.905341977876921, "language_loss": 0.78584218, "learning_rate": 9.335341844318489e-07, "loss": 0.80751044, "num_input_tokens_seen": 123194995, "step": 5727, "time_per_iteration": 3.290689468383789 }, { "auxiliary_loss_clip": 0.01140837, "auxiliary_loss_mlp": 0.01027219, "balance_loss_clip": 1.04652655, "balance_loss_mlp": 1.01994681, "epoch": 0.6887512775807131, "flos": 24535319592960.0, "grad_norm": 1.8871737465022373, "language_loss": 0.73136538, "learning_rate": 9.328752798805091e-07, "loss": 0.75304592, "num_input_tokens_seen": 123213465, "step": 5728, "time_per_iteration": 2.5016701221466064 }, { "auxiliary_loss_clip": 0.01154397, "auxiliary_loss_mlp": 0.01027188, "balance_loss_clip": 1.04627395, "balance_loss_mlp": 1.0199703, "epoch": 0.6888715204713521, "flos": 22414363269120.0, "grad_norm": 2.2402181236994965, "language_loss": 0.76077914, "learning_rate": 9.322165372111399e-07, "loss": 0.78259498, "num_input_tokens_seen": 123231610, "step": 5729, "time_per_iteration": 3.2580041885375977 }, { "auxiliary_loss_clip": 0.01123636, "auxiliary_loss_mlp": 0.0102226, "balance_loss_clip": 1.04590964, "balance_loss_mlp": 1.01513469, "epoch": 0.6889917633619912, "flos": 22054323294720.0, "grad_norm": 1.9574166982418773, "language_loss": 0.75711697, "learning_rate": 9.315579565236747e-07, "loss": 0.7785759, "num_input_tokens_seen": 123250715, "step": 5730, "time_per_iteration": 2.5145270824432373 }, { "auxiliary_loss_clip": 0.01140609, "auxiliary_loss_mlp": 0.01028075, "balance_loss_clip": 1.0505271, "balance_loss_mlp": 1.02004063, "epoch": 0.6891120062526304, "flos": 23949724164480.0, "grad_norm": 1.6966003496311692, "language_loss": 0.74528551, "learning_rate": 9.308995379180162e-07, "loss": 0.7669723, "num_input_tokens_seen": 123270270, "step": 5731, "time_per_iteration": 2.5157313346862793 }, { "auxiliary_loss_clip": 0.01061384, "auxiliary_loss_mlp": 0.0100106, "balance_loss_clip": 1.01502061, "balance_loss_mlp": 1.00006509, "epoch": 0.6892322491432694, "flos": 64117354337280.0, "grad_norm": 0.7454223445601498, "language_loss": 0.59533644, "learning_rate": 9.302412814940488e-07, "loss": 0.61596096, "num_input_tokens_seen": 123333045, "step": 5732, "time_per_iteration": 3.0954742431640625 }, { "auxiliary_loss_clip": 0.01138511, "auxiliary_loss_mlp": 0.01024738, "balance_loss_clip": 1.04325151, "balance_loss_mlp": 1.01687288, "epoch": 0.6893524920339085, "flos": 23002436736000.0, "grad_norm": 2.053368614831796, "language_loss": 0.71156657, "learning_rate": 9.295831873516276e-07, "loss": 0.73319906, "num_input_tokens_seen": 123352320, "step": 5733, "time_per_iteration": 2.5057895183563232 }, { "auxiliary_loss_clip": 0.01167235, "auxiliary_loss_mlp": 0.01025925, "balance_loss_clip": 1.0490464, "balance_loss_mlp": 1.01843834, "epoch": 0.6894727349245476, "flos": 21396260177280.0, "grad_norm": 1.7618763236820842, "language_loss": 0.7615453, "learning_rate": 9.289252555905873e-07, "loss": 0.78347695, "num_input_tokens_seen": 123372400, "step": 5734, "time_per_iteration": 2.433708667755127 }, { "auxiliary_loss_clip": 0.01155747, "auxiliary_loss_mlp": 0.0102275, "balance_loss_clip": 1.04883313, "balance_loss_mlp": 1.01529956, "epoch": 0.6895929778151867, "flos": 19865316654720.0, "grad_norm": 2.044291818992456, "language_loss": 0.76015317, "learning_rate": 9.282674863107334e-07, "loss": 0.7819382, "num_input_tokens_seen": 123390215, "step": 5735, "time_per_iteration": 2.4485487937927246 }, { "auxiliary_loss_clip": 0.01151006, "auxiliary_loss_mlp": 0.01027335, "balance_loss_clip": 1.04805565, "balance_loss_mlp": 1.02006578, "epoch": 0.6897132207058257, "flos": 18179166464640.0, "grad_norm": 2.3627322131421318, "language_loss": 0.7575624, "learning_rate": 9.276098796118488e-07, "loss": 0.77934575, "num_input_tokens_seen": 123406700, "step": 5736, "time_per_iteration": 2.4083616733551025 }, { "auxiliary_loss_clip": 0.01142096, "auxiliary_loss_mlp": 0.01024075, "balance_loss_clip": 1.04860759, "balance_loss_mlp": 1.01702714, "epoch": 0.6898334635964649, "flos": 32561641359360.0, "grad_norm": 1.7887429098307388, "language_loss": 0.66648114, "learning_rate": 9.269524355936938e-07, "loss": 0.6881429, "num_input_tokens_seen": 123429880, "step": 5737, "time_per_iteration": 2.567242383956909 }, { "auxiliary_loss_clip": 0.01135014, "auxiliary_loss_mlp": 0.01022752, "balance_loss_clip": 1.04405141, "balance_loss_mlp": 1.01578689, "epoch": 0.689953706487104, "flos": 22819004956800.0, "grad_norm": 1.7042803989756066, "language_loss": 0.84831685, "learning_rate": 9.262951543560002e-07, "loss": 0.8698945, "num_input_tokens_seen": 123449105, "step": 5738, "time_per_iteration": 2.5031094551086426 }, { "auxiliary_loss_clip": 0.01142445, "auxiliary_loss_mlp": 0.01031822, "balance_loss_clip": 1.05028462, "balance_loss_mlp": 1.02417505, "epoch": 0.690073949377743, "flos": 18515362786560.0, "grad_norm": 2.1839234862831414, "language_loss": 0.86339325, "learning_rate": 9.256380359984795e-07, "loss": 0.88513589, "num_input_tokens_seen": 123466215, "step": 5739, "time_per_iteration": 2.4452052116394043 }, { "auxiliary_loss_clip": 0.01116626, "auxiliary_loss_mlp": 0.01026186, "balance_loss_clip": 1.03886485, "balance_loss_mlp": 1.01899123, "epoch": 0.6901941922683821, "flos": 34857194716800.0, "grad_norm": 2.154249940638445, "language_loss": 0.75074643, "learning_rate": 9.249810806208139e-07, "loss": 0.77217454, "num_input_tokens_seen": 123485480, "step": 5740, "time_per_iteration": 2.664944648742676 }, { "auxiliary_loss_clip": 0.01109315, "auxiliary_loss_mlp": 0.00761566, "balance_loss_clip": 1.03845358, "balance_loss_mlp": 1.00059927, "epoch": 0.6903144351590212, "flos": 16253672976000.0, "grad_norm": 1.8945977996530499, "language_loss": 0.80364668, "learning_rate": 9.243242883226627e-07, "loss": 0.82235551, "num_input_tokens_seen": 123504575, "step": 5741, "time_per_iteration": 2.528092384338379 }, { "auxiliary_loss_clip": 0.01156939, "auxiliary_loss_mlp": 0.01028227, "balance_loss_clip": 1.04451704, "balance_loss_mlp": 1.02010906, "epoch": 0.6904346780496603, "flos": 28035137255040.0, "grad_norm": 2.8932824773119874, "language_loss": 0.69639575, "learning_rate": 9.236676592036628e-07, "loss": 0.71824741, "num_input_tokens_seen": 123524250, "step": 5742, "time_per_iteration": 2.5079591274261475 }, { "auxiliary_loss_clip": 0.01138907, "auxiliary_loss_mlp": 0.01023325, "balance_loss_clip": 1.04899776, "balance_loss_mlp": 1.01615787, "epoch": 0.6905549209402994, "flos": 23624266008960.0, "grad_norm": 1.7007250525889752, "language_loss": 0.73777354, "learning_rate": 9.230111933634228e-07, "loss": 0.75939584, "num_input_tokens_seen": 123545845, "step": 5743, "time_per_iteration": 2.5137429237365723 }, { "auxiliary_loss_clip": 0.01158403, "auxiliary_loss_mlp": 0.01022376, "balance_loss_clip": 1.05024183, "balance_loss_mlp": 1.0153966, "epoch": 0.6906751638309385, "flos": 23114945111040.0, "grad_norm": 1.6831624748905776, "language_loss": 0.80760562, "learning_rate": 9.223548909015288e-07, "loss": 0.82941341, "num_input_tokens_seen": 123567535, "step": 5744, "time_per_iteration": 2.487330436706543 }, { "auxiliary_loss_clip": 0.01103924, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.03928256, "balance_loss_mlp": 1.01975083, "epoch": 0.6907954067215776, "flos": 27305468375040.0, "grad_norm": 1.818234544003298, "language_loss": 0.72195995, "learning_rate": 9.216987519175407e-07, "loss": 0.74326849, "num_input_tokens_seen": 123587710, "step": 5745, "time_per_iteration": 2.6132993698120117 }, { "auxiliary_loss_clip": 0.01147898, "auxiliary_loss_mlp": 0.0102332, "balance_loss_clip": 1.04651523, "balance_loss_mlp": 1.01620674, "epoch": 0.6909156496122166, "flos": 21689399070720.0, "grad_norm": 1.7788387209152363, "language_loss": 0.6858052, "learning_rate": 9.210427765109942e-07, "loss": 0.70751739, "num_input_tokens_seen": 123607385, "step": 5746, "time_per_iteration": 2.438671112060547 }, { "auxiliary_loss_clip": 0.0113983, "auxiliary_loss_mlp": 0.01025206, "balance_loss_clip": 1.04378462, "balance_loss_mlp": 1.0173533, "epoch": 0.6910358925028558, "flos": 22561453463040.0, "grad_norm": 2.953691675612746, "language_loss": 0.81158715, "learning_rate": 9.20386964781402e-07, "loss": 0.83323753, "num_input_tokens_seen": 123625405, "step": 5747, "time_per_iteration": 2.4744808673858643 }, { "auxiliary_loss_clip": 0.01136451, "auxiliary_loss_mlp": 0.01023128, "balance_loss_clip": 1.04490495, "balance_loss_mlp": 1.01598191, "epoch": 0.6911561353934949, "flos": 22054107813120.0, "grad_norm": 1.849350440588227, "language_loss": 0.84274703, "learning_rate": 9.197313168282472e-07, "loss": 0.86434281, "num_input_tokens_seen": 123642850, "step": 5748, "time_per_iteration": 2.4859039783477783 }, { "auxiliary_loss_clip": 0.01148937, "auxiliary_loss_mlp": 0.01024622, "balance_loss_clip": 1.04379201, "balance_loss_mlp": 1.01743412, "epoch": 0.6912763782841339, "flos": 24206557386240.0, "grad_norm": 2.2064091290287178, "language_loss": 0.7202186, "learning_rate": 9.190758327509935e-07, "loss": 0.74195421, "num_input_tokens_seen": 123661595, "step": 5749, "time_per_iteration": 3.27254581451416 }, { "auxiliary_loss_clip": 0.01032514, "auxiliary_loss_mlp": 0.00752863, "balance_loss_clip": 1.01442003, "balance_loss_mlp": 0.99984324, "epoch": 0.6913966211747731, "flos": 52329641091840.0, "grad_norm": 0.932012425441419, "language_loss": 0.64492482, "learning_rate": 9.184205126490767e-07, "loss": 0.66277862, "num_input_tokens_seen": 123710490, "step": 5750, "time_per_iteration": 2.953484296798706 }, { "auxiliary_loss_clip": 0.01038958, "auxiliary_loss_mlp": 0.00752811, "balance_loss_clip": 1.0143528, "balance_loss_mlp": 0.99991363, "epoch": 0.6915168640654121, "flos": 66741274851840.0, "grad_norm": 1.0796555288360798, "language_loss": 0.59702909, "learning_rate": 9.177653566219075e-07, "loss": 0.61494684, "num_input_tokens_seen": 123765215, "step": 5751, "time_per_iteration": 3.8127617835998535 }, { "auxiliary_loss_clip": 0.01129051, "auxiliary_loss_mlp": 0.01027543, "balance_loss_clip": 1.04291868, "balance_loss_mlp": 1.02030134, "epoch": 0.6916371069560512, "flos": 18296523175680.0, "grad_norm": 2.7052514069274753, "language_loss": 0.762353, "learning_rate": 9.171103647688744e-07, "loss": 0.78391892, "num_input_tokens_seen": 123783955, "step": 5752, "time_per_iteration": 2.5005910396575928 }, { "auxiliary_loss_clip": 0.01075087, "auxiliary_loss_mlp": 0.01024856, "balance_loss_clip": 1.03978825, "balance_loss_mlp": 1.01806748, "epoch": 0.6917573498466904, "flos": 19645794685440.0, "grad_norm": 1.930178206733818, "language_loss": 0.68999827, "learning_rate": 9.164555371893367e-07, "loss": 0.7109977, "num_input_tokens_seen": 123803885, "step": 5753, "time_per_iteration": 2.6045751571655273 }, { "auxiliary_loss_clip": 0.01155808, "auxiliary_loss_mlp": 0.00761603, "balance_loss_clip": 1.04900646, "balance_loss_mlp": 1.00051844, "epoch": 0.6918775927373294, "flos": 14210319985920.0, "grad_norm": 2.094529852681535, "language_loss": 0.7520963, "learning_rate": 9.158008739826333e-07, "loss": 0.77127039, "num_input_tokens_seen": 123821485, "step": 5754, "time_per_iteration": 3.257155418395996 }, { "auxiliary_loss_clip": 0.01138581, "auxiliary_loss_mlp": 0.01027271, "balance_loss_clip": 1.04726052, "balance_loss_mlp": 1.01980901, "epoch": 0.6919978356279685, "flos": 23985455218560.0, "grad_norm": 1.5857175985167324, "language_loss": 0.86595976, "learning_rate": 9.151463752480744e-07, "loss": 0.8876183, "num_input_tokens_seen": 123840215, "step": 5755, "time_per_iteration": 2.501033306121826 }, { "auxiliary_loss_clip": 0.01115205, "auxiliary_loss_mlp": 0.01027124, "balance_loss_clip": 1.04217076, "balance_loss_mlp": 1.01991141, "epoch": 0.6921180785186076, "flos": 23622937205760.0, "grad_norm": 1.6344181757249778, "language_loss": 0.80368906, "learning_rate": 9.144920410849493e-07, "loss": 0.82511234, "num_input_tokens_seen": 123861450, "step": 5756, "time_per_iteration": 3.2965164184570312 }, { "auxiliary_loss_clip": 0.01144526, "auxiliary_loss_mlp": 0.01026503, "balance_loss_clip": 1.04586649, "balance_loss_mlp": 1.01920414, "epoch": 0.6922383214092467, "flos": 21142623265920.0, "grad_norm": 1.6706082176997474, "language_loss": 0.80647415, "learning_rate": 9.138378715925176e-07, "loss": 0.82818443, "num_input_tokens_seen": 123880545, "step": 5757, "time_per_iteration": 2.5535547733306885 }, { "auxiliary_loss_clip": 0.01134742, "auxiliary_loss_mlp": 0.010227, "balance_loss_clip": 1.04456425, "balance_loss_mlp": 1.01561022, "epoch": 0.6923585642998857, "flos": 21470667200640.0, "grad_norm": 1.5756804331649827, "language_loss": 0.80963147, "learning_rate": 9.131838668700167e-07, "loss": 0.8312059, "num_input_tokens_seen": 123900615, "step": 5758, "time_per_iteration": 2.4891936779022217 }, { "auxiliary_loss_clip": 0.01126833, "auxiliary_loss_mlp": 0.01024972, "balance_loss_clip": 1.04350471, "balance_loss_mlp": 1.01792026, "epoch": 0.6924788071905249, "flos": 21105204272640.0, "grad_norm": 1.9790880702226041, "language_loss": 0.86584604, "learning_rate": 9.125300270166598e-07, "loss": 0.88736403, "num_input_tokens_seen": 123921220, "step": 5759, "time_per_iteration": 2.544259548187256 }, { "auxiliary_loss_clip": 0.01133819, "auxiliary_loss_mlp": 0.01020283, "balance_loss_clip": 1.04431319, "balance_loss_mlp": 1.01279998, "epoch": 0.692599050081164, "flos": 26250018117120.0, "grad_norm": 1.7142491767918437, "language_loss": 0.85845792, "learning_rate": 9.118763521316324e-07, "loss": 0.87999892, "num_input_tokens_seen": 123941795, "step": 5760, "time_per_iteration": 2.5486650466918945 }, { "auxiliary_loss_clip": 0.01168502, "auxiliary_loss_mlp": 0.00762082, "balance_loss_clip": 1.04765296, "balance_loss_mlp": 1.00051713, "epoch": 0.692719292971803, "flos": 20885215426560.0, "grad_norm": 2.4385243358059463, "language_loss": 0.76098591, "learning_rate": 9.112228423140987e-07, "loss": 0.7802918, "num_input_tokens_seen": 123960715, "step": 5761, "time_per_iteration": 2.4318244457244873 }, { "auxiliary_loss_clip": 0.011448, "auxiliary_loss_mlp": 0.01030711, "balance_loss_clip": 1.04634619, "balance_loss_mlp": 1.02286148, "epoch": 0.6928395358624422, "flos": 25921938268800.0, "grad_norm": 6.885428835020421, "language_loss": 0.86322641, "learning_rate": 9.105694976631932e-07, "loss": 0.88498151, "num_input_tokens_seen": 123978625, "step": 5762, "time_per_iteration": 2.5106899738311768 }, { "auxiliary_loss_clip": 0.01153497, "auxiliary_loss_mlp": 0.01028305, "balance_loss_clip": 1.04892206, "balance_loss_mlp": 1.02087808, "epoch": 0.6929597787530812, "flos": 23586559706880.0, "grad_norm": 2.375343916997738, "language_loss": 0.72661793, "learning_rate": 9.099163182780283e-07, "loss": 0.74843597, "num_input_tokens_seen": 123996780, "step": 5763, "time_per_iteration": 2.468513011932373 }, { "auxiliary_loss_clip": 0.01136672, "auxiliary_loss_mlp": 0.01027017, "balance_loss_clip": 1.04610741, "balance_loss_mlp": 1.01919961, "epoch": 0.6930800216437203, "flos": 18255656476800.0, "grad_norm": 3.429194481011056, "language_loss": 0.49409071, "learning_rate": 9.092633042576916e-07, "loss": 0.51572758, "num_input_tokens_seen": 124014045, "step": 5764, "time_per_iteration": 2.4508748054504395 }, { "auxiliary_loss_clip": 0.0113584, "auxiliary_loss_mlp": 0.01027504, "balance_loss_clip": 1.04598045, "balance_loss_mlp": 1.02026212, "epoch": 0.6932002645343595, "flos": 29168621809920.0, "grad_norm": 1.7174633994847568, "language_loss": 0.56394851, "learning_rate": 9.086104557012446e-07, "loss": 0.58558202, "num_input_tokens_seen": 124034615, "step": 5765, "time_per_iteration": 2.5372023582458496 }, { "auxiliary_loss_clip": 0.01145246, "auxiliary_loss_mlp": 0.01021772, "balance_loss_clip": 1.04603648, "balance_loss_mlp": 1.01473331, "epoch": 0.6933205074249985, "flos": 23842746483840.0, "grad_norm": 1.8192545164636547, "language_loss": 0.65292811, "learning_rate": 9.079577727077239e-07, "loss": 0.67459834, "num_input_tokens_seen": 124053445, "step": 5766, "time_per_iteration": 2.469566822052002 }, { "auxiliary_loss_clip": 0.01156079, "auxiliary_loss_mlp": 0.01028968, "balance_loss_clip": 1.04863262, "balance_loss_mlp": 1.02127945, "epoch": 0.6934407503156376, "flos": 24166696268160.0, "grad_norm": 3.126753868699411, "language_loss": 0.71937448, "learning_rate": 9.073052553761404e-07, "loss": 0.741225, "num_input_tokens_seen": 124072810, "step": 5767, "time_per_iteration": 2.4635894298553467 }, { "auxiliary_loss_clip": 0.01116461, "auxiliary_loss_mlp": 0.01024929, "balance_loss_clip": 1.04527593, "balance_loss_mlp": 1.01679897, "epoch": 0.6935609932062767, "flos": 20631327120000.0, "grad_norm": 1.6614036675546122, "language_loss": 0.78268957, "learning_rate": 9.066529038054805e-07, "loss": 0.80410349, "num_input_tokens_seen": 124092875, "step": 5768, "time_per_iteration": 2.5495917797088623 }, { "auxiliary_loss_clip": 0.01138751, "auxiliary_loss_mlp": 0.01022071, "balance_loss_clip": 1.04660332, "balance_loss_mlp": 1.01490331, "epoch": 0.6936812360969158, "flos": 18254184019200.0, "grad_norm": 1.815154152159682, "language_loss": 0.74178845, "learning_rate": 9.060007180947071e-07, "loss": 0.76339662, "num_input_tokens_seen": 124110930, "step": 5769, "time_per_iteration": 2.4599761962890625 }, { "auxiliary_loss_clip": 0.01111126, "auxiliary_loss_mlp": 0.01027169, "balance_loss_clip": 1.03906393, "balance_loss_mlp": 1.01951933, "epoch": 0.6938014789875548, "flos": 31317336368640.0, "grad_norm": 1.851476358020392, "language_loss": 0.73212183, "learning_rate": 9.053486983427534e-07, "loss": 0.75350475, "num_input_tokens_seen": 124132180, "step": 5770, "time_per_iteration": 2.6224005222320557 }, { "auxiliary_loss_clip": 0.01142913, "auxiliary_loss_mlp": 0.01026416, "balance_loss_clip": 1.04412901, "balance_loss_mlp": 1.01899552, "epoch": 0.6939217218781939, "flos": 17528429721600.0, "grad_norm": 2.009702779252152, "language_loss": 0.7055375, "learning_rate": 9.046968446485326e-07, "loss": 0.72723079, "num_input_tokens_seen": 124150585, "step": 5771, "time_per_iteration": 2.468140125274658 }, { "auxiliary_loss_clip": 0.01157962, "auxiliary_loss_mlp": 0.01028684, "balance_loss_clip": 1.04979205, "balance_loss_mlp": 1.02054238, "epoch": 0.6940419647688331, "flos": 18551776199040.0, "grad_norm": 8.092004430525884, "language_loss": 0.70643914, "learning_rate": 9.040451571109295e-07, "loss": 0.72830558, "num_input_tokens_seen": 124166205, "step": 5772, "time_per_iteration": 2.4313132762908936 }, { "auxiliary_loss_clip": 0.01040063, "auxiliary_loss_mlp": 0.01002626, "balance_loss_clip": 1.02045226, "balance_loss_mlp": 1.00154102, "epoch": 0.6941622076594721, "flos": 66926286829440.0, "grad_norm": 0.832527170160163, "language_loss": 0.6037572, "learning_rate": 9.033936358288042e-07, "loss": 0.62418413, "num_input_tokens_seen": 124219940, "step": 5773, "time_per_iteration": 2.988834857940674 }, { "auxiliary_loss_clip": 0.01170381, "auxiliary_loss_mlp": 0.01016035, "balance_loss_clip": 1.04958415, "balance_loss_mlp": 1.00869155, "epoch": 0.6942824505501112, "flos": 26578062051840.0, "grad_norm": 1.6914707995046343, "language_loss": 0.82178885, "learning_rate": 9.027422809009937e-07, "loss": 0.84365302, "num_input_tokens_seen": 124239885, "step": 5774, "time_per_iteration": 2.4825809001922607 }, { "auxiliary_loss_clip": 0.01155152, "auxiliary_loss_mlp": 0.0102025, "balance_loss_clip": 1.04539442, "balance_loss_mlp": 1.01265001, "epoch": 0.6944026934407503, "flos": 21248308056960.0, "grad_norm": 3.8448102472157353, "language_loss": 0.83407629, "learning_rate": 9.020910924263054e-07, "loss": 0.85583031, "num_input_tokens_seen": 124258410, "step": 5775, "time_per_iteration": 2.4506568908691406 }, { "auxiliary_loss_clip": 0.0103796, "auxiliary_loss_mlp": 0.01003887, "balance_loss_clip": 1.01909065, "balance_loss_mlp": 1.0028199, "epoch": 0.6945229363313894, "flos": 70677191537280.0, "grad_norm": 0.8132748977895188, "language_loss": 0.58186227, "learning_rate": 9.014400705035261e-07, "loss": 0.60228074, "num_input_tokens_seen": 124315315, "step": 5776, "time_per_iteration": 3.81909441947937 }, { "auxiliary_loss_clip": 0.01166827, "auxiliary_loss_mlp": 0.01022395, "balance_loss_clip": 1.0503571, "balance_loss_mlp": 1.01520073, "epoch": 0.6946431792220285, "flos": 18952934267520.0, "grad_norm": 1.9368996722258527, "language_loss": 0.76930261, "learning_rate": 9.00789215231414e-07, "loss": 0.79119486, "num_input_tokens_seen": 124333710, "step": 5777, "time_per_iteration": 3.235917568206787 }, { "auxiliary_loss_clip": 0.0112462, "auxiliary_loss_mlp": 0.00762161, "balance_loss_clip": 1.0406965, "balance_loss_mlp": 1.00055563, "epoch": 0.6947634221126676, "flos": 20338834671360.0, "grad_norm": 1.7351386482825102, "language_loss": 0.81811237, "learning_rate": 9.001385267087056e-07, "loss": 0.83698022, "num_input_tokens_seen": 124352855, "step": 5778, "time_per_iteration": 2.556151866912842 }, { "auxiliary_loss_clip": 0.01157447, "auxiliary_loss_mlp": 0.01022054, "balance_loss_clip": 1.04842675, "balance_loss_mlp": 1.01494598, "epoch": 0.6948836650033067, "flos": 21833723917440.0, "grad_norm": 1.4755900027901134, "language_loss": 0.70272839, "learning_rate": 8.994880050341072e-07, "loss": 0.72452343, "num_input_tokens_seen": 124372960, "step": 5779, "time_per_iteration": 2.4567015171051025 }, { "auxiliary_loss_clip": 0.01135167, "auxiliary_loss_mlp": 0.01034608, "balance_loss_clip": 1.04640913, "balance_loss_mlp": 1.0272944, "epoch": 0.6950039078939457, "flos": 23657519024640.0, "grad_norm": 2.1235507986752165, "language_loss": 0.77724135, "learning_rate": 8.988376503063026e-07, "loss": 0.79893911, "num_input_tokens_seen": 124394220, "step": 5780, "time_per_iteration": 3.2974607944488525 }, { "auxiliary_loss_clip": 0.01122355, "auxiliary_loss_mlp": 0.01026767, "balance_loss_clip": 1.04586256, "balance_loss_mlp": 1.01874423, "epoch": 0.6951241507845849, "flos": 21792462168960.0, "grad_norm": 1.755732389878013, "language_loss": 0.8114289, "learning_rate": 8.981874626239521e-07, "loss": 0.83292007, "num_input_tokens_seen": 124412795, "step": 5781, "time_per_iteration": 2.547027349472046 }, { "auxiliary_loss_clip": 0.01155894, "auxiliary_loss_mlp": 0.0103125, "balance_loss_clip": 1.04939628, "balance_loss_mlp": 1.02349496, "epoch": 0.695244393675224, "flos": 14647568244480.0, "grad_norm": 2.431478494265423, "language_loss": 0.8810094, "learning_rate": 8.975374420856872e-07, "loss": 0.90288079, "num_input_tokens_seen": 124429690, "step": 5782, "time_per_iteration": 2.413693428039551 }, { "auxiliary_loss_clip": 0.01116878, "auxiliary_loss_mlp": 0.01021501, "balance_loss_clip": 1.04174519, "balance_loss_mlp": 1.0145278, "epoch": 0.695364636565863, "flos": 16873203778560.0, "grad_norm": 4.236207942755898, "language_loss": 0.72964561, "learning_rate": 8.968875887901157e-07, "loss": 0.75102937, "num_input_tokens_seen": 124447070, "step": 5783, "time_per_iteration": 3.23355770111084 }, { "auxiliary_loss_clip": 0.011393, "auxiliary_loss_mlp": 0.0102304, "balance_loss_clip": 1.04344916, "balance_loss_mlp": 1.01535654, "epoch": 0.6954848794565022, "flos": 19354523299200.0, "grad_norm": 1.9328052486000094, "language_loss": 0.63046283, "learning_rate": 8.9623790283582e-07, "loss": 0.6520862, "num_input_tokens_seen": 124464950, "step": 5784, "time_per_iteration": 2.4698426723480225 }, { "auxiliary_loss_clip": 0.0112813, "auxiliary_loss_mlp": 0.01029524, "balance_loss_clip": 1.04537439, "balance_loss_mlp": 1.02193046, "epoch": 0.6956051223471412, "flos": 18990209606400.0, "grad_norm": 2.340605641380878, "language_loss": 0.76530075, "learning_rate": 8.955883843213561e-07, "loss": 0.78687739, "num_input_tokens_seen": 124483965, "step": 5785, "time_per_iteration": 2.511528253555298 }, { "auxiliary_loss_clip": 0.01160859, "auxiliary_loss_mlp": 0.01027418, "balance_loss_clip": 1.04810762, "balance_loss_mlp": 1.01949668, "epoch": 0.6957253652377803, "flos": 16107229226880.0, "grad_norm": 1.8928675731567448, "language_loss": 0.86788189, "learning_rate": 8.949390333452569e-07, "loss": 0.88976467, "num_input_tokens_seen": 124501910, "step": 5786, "time_per_iteration": 2.4222774505615234 }, { "auxiliary_loss_clip": 0.01167613, "auxiliary_loss_mlp": 0.01025162, "balance_loss_clip": 1.04974055, "balance_loss_mlp": 1.01804829, "epoch": 0.6958456081284194, "flos": 29388646569600.0, "grad_norm": 1.6813816244161495, "language_loss": 0.67475069, "learning_rate": 8.942898500060279e-07, "loss": 0.69667846, "num_input_tokens_seen": 124521625, "step": 5787, "time_per_iteration": 2.4769599437713623 }, { "auxiliary_loss_clip": 0.01118087, "auxiliary_loss_mlp": 0.01023143, "balance_loss_clip": 1.04379165, "balance_loss_mlp": 1.01562631, "epoch": 0.6959658510190585, "flos": 25154850395520.0, "grad_norm": 2.777945724328948, "language_loss": 0.71817529, "learning_rate": 8.936408344021493e-07, "loss": 0.73958755, "num_input_tokens_seen": 124538540, "step": 5788, "time_per_iteration": 2.5563461780548096 }, { "auxiliary_loss_clip": 0.01150771, "auxiliary_loss_mlp": 0.01029629, "balance_loss_clip": 1.04954803, "balance_loss_mlp": 1.02118313, "epoch": 0.6960860939096976, "flos": 42814388759040.0, "grad_norm": 2.1689027919672967, "language_loss": 0.71130282, "learning_rate": 8.929919866320765e-07, "loss": 0.73310685, "num_input_tokens_seen": 124559355, "step": 5789, "time_per_iteration": 2.675229549407959 }, { "auxiliary_loss_clip": 0.0113079, "auxiliary_loss_mlp": 0.00762327, "balance_loss_clip": 1.04247475, "balance_loss_mlp": 1.00050497, "epoch": 0.6962063368003367, "flos": 17566566986880.0, "grad_norm": 1.8505713536597757, "language_loss": 0.81092548, "learning_rate": 8.923433067942385e-07, "loss": 0.82985663, "num_input_tokens_seen": 124577920, "step": 5790, "time_per_iteration": 2.5029871463775635 }, { "auxiliary_loss_clip": 0.01134611, "auxiliary_loss_mlp": 0.01027165, "balance_loss_clip": 1.04621625, "balance_loss_mlp": 1.02006578, "epoch": 0.6963265796909758, "flos": 21251648021760.0, "grad_norm": 1.854768305622176, "language_loss": 0.68871766, "learning_rate": 8.916947949870417e-07, "loss": 0.71033543, "num_input_tokens_seen": 124597585, "step": 5791, "time_per_iteration": 2.5244009494781494 }, { "auxiliary_loss_clip": 0.0106135, "auxiliary_loss_mlp": 0.01002651, "balance_loss_clip": 1.01484227, "balance_loss_mlp": 1.00163805, "epoch": 0.6964468225816148, "flos": 68828295801600.0, "grad_norm": 0.750887477114594, "language_loss": 0.58145362, "learning_rate": 8.910464513088615e-07, "loss": 0.60209364, "num_input_tokens_seen": 124661625, "step": 5792, "time_per_iteration": 3.1116039752960205 }, { "auxiliary_loss_clip": 0.01133065, "auxiliary_loss_mlp": 0.01022869, "balance_loss_clip": 1.0440166, "balance_loss_mlp": 1.0150156, "epoch": 0.696567065472254, "flos": 18950887192320.0, "grad_norm": 5.252957896290847, "language_loss": 0.7840808, "learning_rate": 8.903982758580542e-07, "loss": 0.8056401, "num_input_tokens_seen": 124680565, "step": 5793, "time_per_iteration": 2.513719320297241 }, { "auxiliary_loss_clip": 0.01138294, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.046345, "balance_loss_mlp": 1.02561283, "epoch": 0.696687308362893, "flos": 22856675345280.0, "grad_norm": 1.9795699831082307, "language_loss": 0.80265558, "learning_rate": 8.897502687329457e-07, "loss": 0.82436848, "num_input_tokens_seen": 124700365, "step": 5794, "time_per_iteration": 2.4889466762542725 }, { "auxiliary_loss_clip": 0.01122406, "auxiliary_loss_mlp": 0.0102249, "balance_loss_clip": 1.04350555, "balance_loss_mlp": 1.0154984, "epoch": 0.6968075512535321, "flos": 24972926987520.0, "grad_norm": 2.0478345205776205, "language_loss": 0.79856312, "learning_rate": 8.891024300318382e-07, "loss": 0.82001209, "num_input_tokens_seen": 124718935, "step": 5795, "time_per_iteration": 2.589400291442871 }, { "auxiliary_loss_clip": 0.01117054, "auxiliary_loss_mlp": 0.01023169, "balance_loss_clip": 1.04173446, "balance_loss_mlp": 1.01642752, "epoch": 0.6969277941441713, "flos": 21030438113280.0, "grad_norm": 2.7166445141360644, "language_loss": 0.75870109, "learning_rate": 8.884547598530103e-07, "loss": 0.78010333, "num_input_tokens_seen": 124739505, "step": 5796, "time_per_iteration": 2.5714428424835205 }, { "auxiliary_loss_clip": 0.01071477, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.03705394, "balance_loss_mlp": 1.02289915, "epoch": 0.6970480370348103, "flos": 21579404647680.0, "grad_norm": 1.8499792818007779, "language_loss": 0.75178993, "learning_rate": 8.8780725829471e-07, "loss": 0.77280957, "num_input_tokens_seen": 124757410, "step": 5797, "time_per_iteration": 2.6393935680389404 }, { "auxiliary_loss_clip": 0.01168999, "auxiliary_loss_mlp": 0.01027174, "balance_loss_clip": 1.0486027, "balance_loss_mlp": 1.01946163, "epoch": 0.6971682799254494, "flos": 22419175691520.0, "grad_norm": 1.9340346040666845, "language_loss": 0.78222102, "learning_rate": 8.87159925455165e-07, "loss": 0.80418277, "num_input_tokens_seen": 124777240, "step": 5798, "time_per_iteration": 2.430356979370117 }, { "auxiliary_loss_clip": 0.01122592, "auxiliary_loss_mlp": 0.01027879, "balance_loss_clip": 1.04484296, "balance_loss_mlp": 1.02086091, "epoch": 0.6972885228160886, "flos": 20005834659840.0, "grad_norm": 2.0074781583069417, "language_loss": 0.73301578, "learning_rate": 8.865127614325738e-07, "loss": 0.75452042, "num_input_tokens_seen": 124795670, "step": 5799, "time_per_iteration": 2.520542621612549 }, { "auxiliary_loss_clip": 0.01132629, "auxiliary_loss_mlp": 0.0102719, "balance_loss_clip": 1.04342294, "balance_loss_mlp": 1.01954317, "epoch": 0.6974087657067276, "flos": 37853437656960.0, "grad_norm": 1.983450800859278, "language_loss": 0.66537815, "learning_rate": 8.85865766325113e-07, "loss": 0.68697637, "num_input_tokens_seen": 124819600, "step": 5800, "time_per_iteration": 2.626162528991699 }, { "auxiliary_loss_clip": 0.01136049, "auxiliary_loss_mlp": 0.01025025, "balance_loss_clip": 1.04439139, "balance_loss_mlp": 1.01776469, "epoch": 0.6975290085973667, "flos": 29489267543040.0, "grad_norm": 2.192836506003847, "language_loss": 0.72344482, "learning_rate": 8.852189402309287e-07, "loss": 0.74505556, "num_input_tokens_seen": 124838785, "step": 5801, "time_per_iteration": 2.5396711826324463 }, { "auxiliary_loss_clip": 0.0115447, "auxiliary_loss_mlp": 0.01025812, "balance_loss_clip": 1.04866207, "balance_loss_mlp": 1.01889789, "epoch": 0.6976492514880057, "flos": 12895630295040.0, "grad_norm": 2.2069679831906193, "language_loss": 0.74260998, "learning_rate": 8.845722832481441e-07, "loss": 0.76441282, "num_input_tokens_seen": 124854215, "step": 5802, "time_per_iteration": 3.125596761703491 }, { "auxiliary_loss_clip": 0.01153587, "auxiliary_loss_mlp": 0.01024297, "balance_loss_clip": 1.04753709, "balance_loss_mlp": 1.01719534, "epoch": 0.6977694943786449, "flos": 24352929308160.0, "grad_norm": 1.85412657963025, "language_loss": 0.77529085, "learning_rate": 8.83925795474858e-07, "loss": 0.79706967, "num_input_tokens_seen": 124874340, "step": 5803, "time_per_iteration": 2.491938352584839 }, { "auxiliary_loss_clip": 0.01122467, "auxiliary_loss_mlp": 0.0102388, "balance_loss_clip": 1.04618394, "balance_loss_mlp": 1.01614308, "epoch": 0.6978897372692839, "flos": 29898470257920.0, "grad_norm": 3.696652994148292, "language_loss": 0.58865392, "learning_rate": 8.832794770091414e-07, "loss": 0.61011744, "num_input_tokens_seen": 124895175, "step": 5804, "time_per_iteration": 3.4000065326690674 }, { "auxiliary_loss_clip": 0.01144866, "auxiliary_loss_mlp": 0.01025692, "balance_loss_clip": 1.04633486, "balance_loss_mlp": 1.01811647, "epoch": 0.698009980159923, "flos": 21761579450880.0, "grad_norm": 2.0113103291204126, "language_loss": 0.82701099, "learning_rate": 8.826333279490401e-07, "loss": 0.84871662, "num_input_tokens_seen": 124915810, "step": 5805, "time_per_iteration": 2.49599289894104 }, { "auxiliary_loss_clip": 0.01143454, "auxiliary_loss_mlp": 0.01026419, "balance_loss_clip": 1.04748416, "balance_loss_mlp": 1.0194962, "epoch": 0.6981302230505622, "flos": 19857164267520.0, "grad_norm": 2.0915443591533593, "language_loss": 0.67778647, "learning_rate": 8.819873483925748e-07, "loss": 0.69948518, "num_input_tokens_seen": 124932930, "step": 5806, "time_per_iteration": 2.4617981910705566 }, { "auxiliary_loss_clip": 0.01129893, "auxiliary_loss_mlp": 0.00761986, "balance_loss_clip": 1.04670787, "balance_loss_mlp": 1.00050414, "epoch": 0.6982504659412012, "flos": 22198648141440.0, "grad_norm": 1.9298570994482234, "language_loss": 0.74184799, "learning_rate": 8.81341538437739e-07, "loss": 0.7607668, "num_input_tokens_seen": 124951220, "step": 5807, "time_per_iteration": 3.4212121963500977 }, { "auxiliary_loss_clip": 0.01142352, "auxiliary_loss_mlp": 0.01022129, "balance_loss_clip": 1.04330921, "balance_loss_mlp": 1.0147146, "epoch": 0.6983707088318403, "flos": 35588479708800.0, "grad_norm": 1.9772594302035267, "language_loss": 0.68066496, "learning_rate": 8.80695898182503e-07, "loss": 0.70230979, "num_input_tokens_seen": 124972200, "step": 5808, "time_per_iteration": 2.6189494132995605 }, { "auxiliary_loss_clip": 0.01060003, "auxiliary_loss_mlp": 0.01001457, "balance_loss_clip": 1.02090359, "balance_loss_mlp": 1.00030029, "epoch": 0.6984909517224794, "flos": 65440052760960.0, "grad_norm": 0.8101711278610646, "language_loss": 0.65125275, "learning_rate": 8.800504277248093e-07, "loss": 0.67186737, "num_input_tokens_seen": 125036950, "step": 5809, "time_per_iteration": 3.8643782138824463 }, { "auxiliary_loss_clip": 0.01127645, "auxiliary_loss_mlp": 0.00762036, "balance_loss_clip": 1.05123889, "balance_loss_mlp": 1.00052738, "epoch": 0.6986111946131185, "flos": 18546927863040.0, "grad_norm": 2.212635760890346, "language_loss": 0.75038385, "learning_rate": 8.794051271625753e-07, "loss": 0.76928067, "num_input_tokens_seen": 125054585, "step": 5810, "time_per_iteration": 2.5123722553253174 }, { "auxiliary_loss_clip": 0.01138871, "auxiliary_loss_mlp": 0.01024201, "balance_loss_clip": 1.0467298, "balance_loss_mlp": 1.01727223, "epoch": 0.6987314375037575, "flos": 23039173370880.0, "grad_norm": 2.6801948281864836, "language_loss": 0.83186746, "learning_rate": 8.787599965936925e-07, "loss": 0.85349822, "num_input_tokens_seen": 125075515, "step": 5811, "time_per_iteration": 2.506580114364624 }, { "auxiliary_loss_clip": 0.01120556, "auxiliary_loss_mlp": 0.01023639, "balance_loss_clip": 1.04547179, "balance_loss_mlp": 1.01669192, "epoch": 0.6988516803943967, "flos": 38400393029760.0, "grad_norm": 1.715966941119164, "language_loss": 0.72301733, "learning_rate": 8.781150361160261e-07, "loss": 0.74445927, "num_input_tokens_seen": 125097425, "step": 5812, "time_per_iteration": 2.6715238094329834 }, { "auxiliary_loss_clip": 0.01130334, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 1.04548073, "balance_loss_mlp": 1.01907253, "epoch": 0.6989719232850358, "flos": 24096993926400.0, "grad_norm": 1.5827104973818182, "language_loss": 0.73502684, "learning_rate": 8.774702458274181e-07, "loss": 0.75659364, "num_input_tokens_seen": 125117830, "step": 5813, "time_per_iteration": 2.5260837078094482 }, { "auxiliary_loss_clip": 0.01156398, "auxiliary_loss_mlp": 0.01025836, "balance_loss_clip": 1.04887247, "balance_loss_mlp": 1.01809919, "epoch": 0.6990921661756748, "flos": 14866838818560.0, "grad_norm": 2.3991094318439297, "language_loss": 0.70330489, "learning_rate": 8.768256258256799e-07, "loss": 0.72512722, "num_input_tokens_seen": 125134455, "step": 5814, "time_per_iteration": 2.4202916622161865 }, { "auxiliary_loss_clip": 0.01157571, "auxiliary_loss_mlp": 0.01028734, "balance_loss_clip": 1.0483619, "balance_loss_mlp": 1.02130151, "epoch": 0.699212409066314, "flos": 20193719725440.0, "grad_norm": 1.6305378040662866, "language_loss": 0.74216193, "learning_rate": 8.76181176208602e-07, "loss": 0.76402497, "num_input_tokens_seen": 125152555, "step": 5815, "time_per_iteration": 2.4460389614105225 }, { "auxiliary_loss_clip": 0.01102664, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.04071593, "balance_loss_mlp": 1.02374852, "epoch": 0.699332651956953, "flos": 19427888828160.0, "grad_norm": 1.9788421832959064, "language_loss": 0.73437583, "learning_rate": 8.755368970739461e-07, "loss": 0.75572062, "num_input_tokens_seen": 125171915, "step": 5816, "time_per_iteration": 2.550799608230591 }, { "auxiliary_loss_clip": 0.01130549, "auxiliary_loss_mlp": 0.01026508, "balance_loss_clip": 1.04349005, "balance_loss_mlp": 1.01863408, "epoch": 0.6994528948475921, "flos": 16143714466560.0, "grad_norm": 3.259344368100055, "language_loss": 0.61532134, "learning_rate": 8.748927885194479e-07, "loss": 0.6368919, "num_input_tokens_seen": 125190220, "step": 5817, "time_per_iteration": 2.5015554428100586 }, { "auxiliary_loss_clip": 0.01027549, "auxiliary_loss_mlp": 0.01004755, "balance_loss_clip": 1.01224947, "balance_loss_mlp": 1.00376534, "epoch": 0.6995731377382313, "flos": 64952420699520.0, "grad_norm": 0.7905983814474801, "language_loss": 0.57417536, "learning_rate": 8.742488506428209e-07, "loss": 0.5944984, "num_input_tokens_seen": 125249310, "step": 5818, "time_per_iteration": 3.0465736389160156 }, { "auxiliary_loss_clip": 0.01143125, "auxiliary_loss_mlp": 0.00761813, "balance_loss_clip": 1.04676628, "balance_loss_mlp": 1.00048351, "epoch": 0.6996933806288703, "flos": 24900136076160.0, "grad_norm": 1.8095859629611606, "language_loss": 0.78231585, "learning_rate": 8.736050835417466e-07, "loss": 0.8013652, "num_input_tokens_seen": 125269350, "step": 5819, "time_per_iteration": 2.534228563308716 }, { "auxiliary_loss_clip": 0.01159359, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 1.04909015, "balance_loss_mlp": 1.01872015, "epoch": 0.6998136235195094, "flos": 20777806782720.0, "grad_norm": 1.8192816756510883, "language_loss": 0.61549938, "learning_rate": 8.729614873138862e-07, "loss": 0.63735652, "num_input_tokens_seen": 125286985, "step": 5820, "time_per_iteration": 2.450200319290161 }, { "auxiliary_loss_clip": 0.01121254, "auxiliary_loss_mlp": 0.01026307, "balance_loss_clip": 1.0469892, "balance_loss_mlp": 1.01854026, "epoch": 0.6999338664101485, "flos": 23733470332800.0, "grad_norm": 1.903322738519752, "language_loss": 0.77887797, "learning_rate": 8.723180620568716e-07, "loss": 0.80035359, "num_input_tokens_seen": 125306240, "step": 5821, "time_per_iteration": 2.588735818862915 }, { "auxiliary_loss_clip": 0.01143045, "auxiliary_loss_mlp": 0.01022418, "balance_loss_clip": 1.0449996, "balance_loss_mlp": 1.01518822, "epoch": 0.7000541093007876, "flos": 19864598382720.0, "grad_norm": 1.7493931980047281, "language_loss": 0.85130024, "learning_rate": 8.716748078683116e-07, "loss": 0.87295485, "num_input_tokens_seen": 125323015, "step": 5822, "time_per_iteration": 2.47420597076416 }, { "auxiliary_loss_clip": 0.01073595, "auxiliary_loss_mlp": 0.01027216, "balance_loss_clip": 1.03788662, "balance_loss_mlp": 1.01860905, "epoch": 0.7001743521914267, "flos": 29679056029440.0, "grad_norm": 2.055232501455391, "language_loss": 0.68827254, "learning_rate": 8.710317248457855e-07, "loss": 0.70928067, "num_input_tokens_seen": 125342630, "step": 5823, "time_per_iteration": 2.6623828411102295 }, { "auxiliary_loss_clip": 0.01136376, "auxiliary_loss_mlp": 0.01026819, "balance_loss_clip": 1.04667652, "balance_loss_mlp": 1.01930285, "epoch": 0.7002945950820658, "flos": 27489762080640.0, "grad_norm": 1.7973682424863193, "language_loss": 0.72462505, "learning_rate": 8.703888130868482e-07, "loss": 0.74625695, "num_input_tokens_seen": 125364480, "step": 5824, "time_per_iteration": 2.5532102584838867 }, { "auxiliary_loss_clip": 0.01126737, "auxiliary_loss_mlp": 0.01023523, "balance_loss_clip": 1.0446353, "balance_loss_mlp": 1.01682925, "epoch": 0.7004148379727049, "flos": 22158463800960.0, "grad_norm": 2.822393128196926, "language_loss": 0.82287407, "learning_rate": 8.697460726890307e-07, "loss": 0.84437668, "num_input_tokens_seen": 125381625, "step": 5825, "time_per_iteration": 2.5144972801208496 }, { "auxiliary_loss_clip": 0.011252, "auxiliary_loss_mlp": 0.00762111, "balance_loss_clip": 1.04229474, "balance_loss_mlp": 1.00047016, "epoch": 0.7005350808633439, "flos": 19423758764160.0, "grad_norm": 1.9369837674046624, "language_loss": 0.90281129, "learning_rate": 8.691035037498354e-07, "loss": 0.92168444, "num_input_tokens_seen": 125397615, "step": 5826, "time_per_iteration": 2.5030462741851807 }, { "auxiliary_loss_clip": 0.01136311, "auxiliary_loss_mlp": 0.01024248, "balance_loss_clip": 1.04306197, "balance_loss_mlp": 1.0170002, "epoch": 0.7006553237539831, "flos": 23476708938240.0, "grad_norm": 1.8795623109171808, "language_loss": 0.72476053, "learning_rate": 8.684611063667391e-07, "loss": 0.74636608, "num_input_tokens_seen": 125418080, "step": 5827, "time_per_iteration": 2.5319175720214844 }, { "auxiliary_loss_clip": 0.01153478, "auxiliary_loss_mlp": 0.01023916, "balance_loss_clip": 1.04597783, "balance_loss_mlp": 1.0171659, "epoch": 0.7007755666446221, "flos": 31212872640000.0, "grad_norm": 1.7575375963367066, "language_loss": 0.76704061, "learning_rate": 8.678188806371935e-07, "loss": 0.78881454, "num_input_tokens_seen": 125440115, "step": 5828, "time_per_iteration": 2.565089464187622 }, { "auxiliary_loss_clip": 0.01153655, "auxiliary_loss_mlp": 0.01023954, "balance_loss_clip": 1.04608214, "balance_loss_mlp": 1.01746929, "epoch": 0.7008958095352612, "flos": 18149899858560.0, "grad_norm": 1.627767872174666, "language_loss": 0.85306025, "learning_rate": 8.671768266586228e-07, "loss": 0.87483633, "num_input_tokens_seen": 125458240, "step": 5829, "time_per_iteration": 3.2037551403045654 }, { "auxiliary_loss_clip": 0.01123475, "auxiliary_loss_mlp": 0.01025652, "balance_loss_clip": 1.04342556, "balance_loss_mlp": 1.01865458, "epoch": 0.7010160524259004, "flos": 27452307173760.0, "grad_norm": 1.675482902238848, "language_loss": 0.78192276, "learning_rate": 8.665349445284275e-07, "loss": 0.80341399, "num_input_tokens_seen": 125477980, "step": 5830, "time_per_iteration": 2.5903918743133545 }, { "auxiliary_loss_clip": 0.01126801, "auxiliary_loss_mlp": 0.01021239, "balance_loss_clip": 1.04730821, "balance_loss_mlp": 1.01403618, "epoch": 0.7011362953165394, "flos": 23842064125440.0, "grad_norm": 1.459501148550249, "language_loss": 0.81086689, "learning_rate": 8.658932343439799e-07, "loss": 0.83234727, "num_input_tokens_seen": 125497765, "step": 5831, "time_per_iteration": 3.386246919631958 }, { "auxiliary_loss_clip": 0.01168793, "auxiliary_loss_mlp": 0.01028209, "balance_loss_clip": 1.04845846, "balance_loss_mlp": 1.02040112, "epoch": 0.7012565382071785, "flos": 24823430582400.0, "grad_norm": 1.9015881522778693, "language_loss": 0.77469099, "learning_rate": 8.65251696202627e-07, "loss": 0.79666102, "num_input_tokens_seen": 125514145, "step": 5832, "time_per_iteration": 2.444854259490967 }, { "auxiliary_loss_clip": 0.01130083, "auxiliary_loss_mlp": 0.01024944, "balance_loss_clip": 1.04674983, "balance_loss_mlp": 1.01758909, "epoch": 0.7013767810978175, "flos": 21397445326080.0, "grad_norm": 2.2945403525867394, "language_loss": 0.87699163, "learning_rate": 8.646103302016896e-07, "loss": 0.89854187, "num_input_tokens_seen": 125533115, "step": 5833, "time_per_iteration": 3.3560352325439453 }, { "auxiliary_loss_clip": 0.01122077, "auxiliary_loss_mlp": 0.01024399, "balance_loss_clip": 1.04351306, "balance_loss_mlp": 1.01680541, "epoch": 0.7014970239884567, "flos": 16687150306560.0, "grad_norm": 1.8912571964271003, "language_loss": 0.88527739, "learning_rate": 8.639691364384614e-07, "loss": 0.9067421, "num_input_tokens_seen": 125550740, "step": 5834, "time_per_iteration": 2.555995464324951 }, { "auxiliary_loss_clip": 0.01143218, "auxiliary_loss_mlp": 0.01028679, "balance_loss_clip": 1.04683483, "balance_loss_mlp": 1.02082062, "epoch": 0.7016172668790958, "flos": 12568268718720.0, "grad_norm": 1.9878251352231335, "language_loss": 0.7277925, "learning_rate": 8.633281150102136e-07, "loss": 0.74951148, "num_input_tokens_seen": 125567590, "step": 5835, "time_per_iteration": 2.4555845260620117 }, { "auxiliary_loss_clip": 0.01140318, "auxiliary_loss_mlp": 0.01021705, "balance_loss_clip": 1.04754233, "balance_loss_mlp": 1.0147016, "epoch": 0.7017375097697348, "flos": 17452729808640.0, "grad_norm": 5.251000715413512, "language_loss": 0.67891234, "learning_rate": 8.626872660141855e-07, "loss": 0.70053256, "num_input_tokens_seen": 125585500, "step": 5836, "time_per_iteration": 3.2454843521118164 }, { "auxiliary_loss_clip": 0.01112075, "auxiliary_loss_mlp": 0.01028037, "balance_loss_clip": 1.04408634, "balance_loss_mlp": 1.02060997, "epoch": 0.701857752660374, "flos": 18513028402560.0, "grad_norm": 1.971044312174865, "language_loss": 0.7454384, "learning_rate": 8.620465895475957e-07, "loss": 0.7668395, "num_input_tokens_seen": 125603720, "step": 5837, "time_per_iteration": 2.5258865356445312 }, { "auxiliary_loss_clip": 0.01107985, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 1.04362595, "balance_loss_mlp": 1.01782727, "epoch": 0.701977995551013, "flos": 24425971614720.0, "grad_norm": 1.584783907692154, "language_loss": 0.75290573, "learning_rate": 8.614060857076333e-07, "loss": 0.77423656, "num_input_tokens_seen": 125624390, "step": 5838, "time_per_iteration": 2.5844318866729736 }, { "auxiliary_loss_clip": 0.01134269, "auxiliary_loss_mlp": 0.0102868, "balance_loss_clip": 1.04340684, "balance_loss_mlp": 1.02098477, "epoch": 0.7020982384416521, "flos": 23002759958400.0, "grad_norm": 1.9210371169033498, "language_loss": 0.74943495, "learning_rate": 8.60765754591462e-07, "loss": 0.7710644, "num_input_tokens_seen": 125644085, "step": 5839, "time_per_iteration": 2.496074914932251 }, { "auxiliary_loss_clip": 0.0116628, "auxiliary_loss_mlp": 0.01024093, "balance_loss_clip": 1.04787827, "balance_loss_mlp": 1.01695824, "epoch": 0.7022184813322913, "flos": 20449080489600.0, "grad_norm": 2.405025307998839, "language_loss": 0.72841823, "learning_rate": 8.601255962962211e-07, "loss": 0.75032198, "num_input_tokens_seen": 125663095, "step": 5840, "time_per_iteration": 2.4173026084899902 }, { "auxiliary_loss_clip": 0.0116535, "auxiliary_loss_mlp": 0.0102879, "balance_loss_clip": 1.05135846, "balance_loss_mlp": 1.0206008, "epoch": 0.7023387242229303, "flos": 19790514581760.0, "grad_norm": 2.4169471783817555, "language_loss": 0.72169554, "learning_rate": 8.594856109190194e-07, "loss": 0.74363697, "num_input_tokens_seen": 125680125, "step": 5841, "time_per_iteration": 2.4390385150909424 }, { "auxiliary_loss_clip": 0.01168743, "auxiliary_loss_mlp": 0.01023853, "balance_loss_clip": 1.04885232, "balance_loss_mlp": 1.01621222, "epoch": 0.7024589671135694, "flos": 33259278286080.0, "grad_norm": 1.6536870667754038, "language_loss": 0.68811655, "learning_rate": 8.588457985569446e-07, "loss": 0.71004248, "num_input_tokens_seen": 125703035, "step": 5842, "time_per_iteration": 2.551112413406372 }, { "auxiliary_loss_clip": 0.01170963, "auxiliary_loss_mlp": 0.01027417, "balance_loss_clip": 1.04891908, "balance_loss_mlp": 1.01988006, "epoch": 0.7025792100042085, "flos": 19098982967040.0, "grad_norm": 2.325769154993244, "language_loss": 0.72026587, "learning_rate": 8.582061593070542e-07, "loss": 0.74224973, "num_input_tokens_seen": 125723765, "step": 5843, "time_per_iteration": 2.433300733566284 }, { "auxiliary_loss_clip": 0.01169784, "auxiliary_loss_mlp": 0.00761986, "balance_loss_clip": 1.04952431, "balance_loss_mlp": 1.00046539, "epoch": 0.7026994528948476, "flos": 18952611045120.0, "grad_norm": 2.09562273630407, "language_loss": 0.77006441, "learning_rate": 8.57566693266383e-07, "loss": 0.78938204, "num_input_tokens_seen": 125741455, "step": 5844, "time_per_iteration": 2.410703659057617 }, { "auxiliary_loss_clip": 0.01145536, "auxiliary_loss_mlp": 0.00762762, "balance_loss_clip": 1.04570544, "balance_loss_mlp": 1.00048518, "epoch": 0.7028196957854866, "flos": 19536662188800.0, "grad_norm": 6.044492178560601, "language_loss": 0.69362265, "learning_rate": 8.569274005319354e-07, "loss": 0.71270561, "num_input_tokens_seen": 125759855, "step": 5845, "time_per_iteration": 2.4974935054779053 }, { "auxiliary_loss_clip": 0.01150789, "auxiliary_loss_mlp": 0.01026063, "balance_loss_clip": 1.04639792, "balance_loss_mlp": 1.01847196, "epoch": 0.7029399386761258, "flos": 20845318394880.0, "grad_norm": 1.8238291304949386, "language_loss": 0.79649365, "learning_rate": 8.562882812006913e-07, "loss": 0.81826216, "num_input_tokens_seen": 125777345, "step": 5846, "time_per_iteration": 2.4432694911956787 }, { "auxiliary_loss_clip": 0.01165123, "auxiliary_loss_mlp": 0.01027856, "balance_loss_clip": 1.04709482, "balance_loss_mlp": 1.02019072, "epoch": 0.7030601815667649, "flos": 22055005653120.0, "grad_norm": 2.237983222467098, "language_loss": 0.77741587, "learning_rate": 8.556493353696066e-07, "loss": 0.79934567, "num_input_tokens_seen": 125796345, "step": 5847, "time_per_iteration": 2.4328713417053223 }, { "auxiliary_loss_clip": 0.01158836, "auxiliary_loss_mlp": 0.00762288, "balance_loss_clip": 1.05026376, "balance_loss_mlp": 1.0005362, "epoch": 0.7031804244574039, "flos": 27198742089600.0, "grad_norm": 2.322622129372933, "language_loss": 0.68586165, "learning_rate": 8.550105631356077e-07, "loss": 0.70507288, "num_input_tokens_seen": 125816070, "step": 5848, "time_per_iteration": 2.4995882511138916 }, { "auxiliary_loss_clip": 0.01121023, "auxiliary_loss_mlp": 0.01026495, "balance_loss_clip": 1.04183519, "balance_loss_mlp": 1.01860905, "epoch": 0.7033006673480431, "flos": 22379853277440.0, "grad_norm": 1.845409076447715, "language_loss": 0.77179873, "learning_rate": 8.543719645955961e-07, "loss": 0.79327393, "num_input_tokens_seen": 125834400, "step": 5849, "time_per_iteration": 2.524508476257324 }, { "auxiliary_loss_clip": 0.01141703, "auxiliary_loss_mlp": 0.01022721, "balance_loss_clip": 1.04646909, "balance_loss_mlp": 1.01544976, "epoch": 0.7034209102386821, "flos": 24715986024960.0, "grad_norm": 1.5584195849754012, "language_loss": 0.74670684, "learning_rate": 8.537335398464467e-07, "loss": 0.76835114, "num_input_tokens_seen": 125854720, "step": 5850, "time_per_iteration": 2.5125372409820557 }, { "auxiliary_loss_clip": 0.01139523, "auxiliary_loss_mlp": 0.01028532, "balance_loss_clip": 1.04275489, "balance_loss_mlp": 1.02102232, "epoch": 0.7035411531293212, "flos": 22556174163840.0, "grad_norm": 2.5716388290327483, "language_loss": 0.85361779, "learning_rate": 8.53095288985007e-07, "loss": 0.87529838, "num_input_tokens_seen": 125868455, "step": 5851, "time_per_iteration": 2.4696247577667236 }, { "auxiliary_loss_clip": 0.01166293, "auxiliary_loss_mlp": 0.01022844, "balance_loss_clip": 1.04903579, "balance_loss_mlp": 1.01557183, "epoch": 0.7036613960199604, "flos": 22674967418880.0, "grad_norm": 1.6514324453961566, "language_loss": 0.82382655, "learning_rate": 8.524572121081009e-07, "loss": 0.84571797, "num_input_tokens_seen": 125888555, "step": 5852, "time_per_iteration": 2.426699161529541 }, { "auxiliary_loss_clip": 0.01159088, "auxiliary_loss_mlp": 0.01028098, "balance_loss_clip": 1.04752481, "balance_loss_mlp": 1.02080822, "epoch": 0.7037816389105994, "flos": 22492146170880.0, "grad_norm": 2.485132491332381, "language_loss": 0.62506866, "learning_rate": 8.518193093125232e-07, "loss": 0.64694047, "num_input_tokens_seen": 125907610, "step": 5853, "time_per_iteration": 2.4579017162323 }, { "auxiliary_loss_clip": 0.01145507, "auxiliary_loss_mlp": 0.01024203, "balance_loss_clip": 1.04743564, "balance_loss_mlp": 1.01741982, "epoch": 0.7039018818012385, "flos": 27087490690560.0, "grad_norm": 1.6433352001801482, "language_loss": 0.80866694, "learning_rate": 8.511815806950436e-07, "loss": 0.83036405, "num_input_tokens_seen": 125928640, "step": 5854, "time_per_iteration": 2.591630697250366 }, { "auxiliary_loss_clip": 0.01153622, "auxiliary_loss_mlp": 0.01023292, "balance_loss_clip": 1.04581761, "balance_loss_mlp": 1.0159843, "epoch": 0.7040221246918776, "flos": 17749819198080.0, "grad_norm": 1.6436640800644307, "language_loss": 0.78003961, "learning_rate": 8.505440263524044e-07, "loss": 0.80180871, "num_input_tokens_seen": 125947485, "step": 5855, "time_per_iteration": 3.2241318225860596 }, { "auxiliary_loss_clip": 0.01155432, "auxiliary_loss_mlp": 0.01022679, "balance_loss_clip": 1.04542816, "balance_loss_mlp": 1.01461482, "epoch": 0.7041423675825167, "flos": 16279851012480.0, "grad_norm": 3.934870077160221, "language_loss": 0.88130814, "learning_rate": 8.49906646381322e-07, "loss": 0.90308917, "num_input_tokens_seen": 125960320, "step": 5856, "time_per_iteration": 2.444519281387329 }, { "auxiliary_loss_clip": 0.01130476, "auxiliary_loss_mlp": 0.01022495, "balance_loss_clip": 1.04649091, "balance_loss_mlp": 1.01571178, "epoch": 0.7042626104731557, "flos": 25483181639040.0, "grad_norm": 2.630008703780646, "language_loss": 0.72072744, "learning_rate": 8.492694408784884e-07, "loss": 0.74225724, "num_input_tokens_seen": 125980575, "step": 5857, "time_per_iteration": 2.5613455772399902 }, { "auxiliary_loss_clip": 0.01158781, "auxiliary_loss_mlp": 0.01027697, "balance_loss_clip": 1.04797339, "balance_loss_mlp": 1.02060962, "epoch": 0.7043828533637949, "flos": 17857622891520.0, "grad_norm": 3.749287466560704, "language_loss": 0.62531662, "learning_rate": 8.486324099405642e-07, "loss": 0.64718139, "num_input_tokens_seen": 125997420, "step": 5858, "time_per_iteration": 3.2527029514312744 }, { "auxiliary_loss_clip": 0.0115246, "auxiliary_loss_mlp": 0.01026192, "balance_loss_clip": 1.0462575, "balance_loss_mlp": 1.01937926, "epoch": 0.704503096254434, "flos": 29494259533440.0, "grad_norm": 1.6526191757965571, "language_loss": 0.74981928, "learning_rate": 8.479955536641887e-07, "loss": 0.77160579, "num_input_tokens_seen": 126018915, "step": 5859, "time_per_iteration": 3.3527369499206543 }, { "auxiliary_loss_clip": 0.0113179, "auxiliary_loss_mlp": 0.01026169, "balance_loss_clip": 1.04039085, "balance_loss_mlp": 1.01920414, "epoch": 0.704623339145073, "flos": 30920739327360.0, "grad_norm": 1.9982653628858016, "language_loss": 0.66674972, "learning_rate": 8.473588721459716e-07, "loss": 0.68832934, "num_input_tokens_seen": 126038825, "step": 5860, "time_per_iteration": 2.5654003620147705 }, { "auxiliary_loss_clip": 0.01157067, "auxiliary_loss_mlp": 0.01038139, "balance_loss_clip": 1.05032432, "balance_loss_mlp": 1.02970552, "epoch": 0.7047435820357122, "flos": 23914747296000.0, "grad_norm": 2.081898189509378, "language_loss": 0.70755208, "learning_rate": 8.467223654824967e-07, "loss": 0.72950417, "num_input_tokens_seen": 126058280, "step": 5861, "time_per_iteration": 2.479520320892334 }, { "auxiliary_loss_clip": 0.011484, "auxiliary_loss_mlp": 0.01025183, "balance_loss_clip": 1.04636121, "balance_loss_mlp": 1.01795602, "epoch": 0.7048638249263512, "flos": 46494010926720.0, "grad_norm": 1.795873877181477, "language_loss": 0.62471318, "learning_rate": 8.460860337703233e-07, "loss": 0.64644897, "num_input_tokens_seen": 126078885, "step": 5862, "time_per_iteration": 3.4283645153045654 }, { "auxiliary_loss_clip": 0.0111548, "auxiliary_loss_mlp": 0.01030576, "balance_loss_clip": 1.04255819, "balance_loss_mlp": 1.02251458, "epoch": 0.7049840678169903, "flos": 21689219502720.0, "grad_norm": 1.743779418961563, "language_loss": 0.70548517, "learning_rate": 8.454498771059797e-07, "loss": 0.7269457, "num_input_tokens_seen": 126098260, "step": 5863, "time_per_iteration": 2.507906198501587 }, { "auxiliary_loss_clip": 0.01106172, "auxiliary_loss_mlp": 0.01024287, "balance_loss_clip": 1.04242611, "balance_loss_mlp": 1.01622832, "epoch": 0.7051043107076294, "flos": 18405081054720.0, "grad_norm": 2.095799603692248, "language_loss": 0.83618104, "learning_rate": 8.448138955859725e-07, "loss": 0.85748565, "num_input_tokens_seen": 126114845, "step": 5864, "time_per_iteration": 2.513786792755127 }, { "auxiliary_loss_clip": 0.01141321, "auxiliary_loss_mlp": 0.01025458, "balance_loss_clip": 1.04597449, "balance_loss_mlp": 1.01797152, "epoch": 0.7052245535982685, "flos": 19319043640320.0, "grad_norm": 1.8849638653558245, "language_loss": 0.90191555, "learning_rate": 8.44178089306778e-07, "loss": 0.92358327, "num_input_tokens_seen": 126132780, "step": 5865, "time_per_iteration": 2.4798450469970703 }, { "auxiliary_loss_clip": 0.01167238, "auxiliary_loss_mlp": 0.01023918, "balance_loss_clip": 1.0483036, "balance_loss_mlp": 1.01717961, "epoch": 0.7053447964889076, "flos": 19062138591360.0, "grad_norm": 1.7480958491746597, "language_loss": 0.7696026, "learning_rate": 8.4354245836485e-07, "loss": 0.79151416, "num_input_tokens_seen": 126151225, "step": 5866, "time_per_iteration": 2.4040088653564453 }, { "auxiliary_loss_clip": 0.01129014, "auxiliary_loss_mlp": 0.0102406, "balance_loss_clip": 1.04613924, "balance_loss_mlp": 1.01572466, "epoch": 0.7054650393795466, "flos": 27379228953600.0, "grad_norm": 1.4810631398635954, "language_loss": 0.72855306, "learning_rate": 8.429070028566108e-07, "loss": 0.7500838, "num_input_tokens_seen": 126172535, "step": 5867, "time_per_iteration": 2.573024272918701 }, { "auxiliary_loss_clip": 0.01153429, "auxiliary_loss_mlp": 0.01028346, "balance_loss_clip": 1.04831207, "balance_loss_mlp": 1.0206387, "epoch": 0.7055852822701858, "flos": 16102201322880.0, "grad_norm": 1.826659008747168, "language_loss": 0.75051713, "learning_rate": 8.422717228784586e-07, "loss": 0.77233487, "num_input_tokens_seen": 126189410, "step": 5868, "time_per_iteration": 2.441877603530884 }, { "auxiliary_loss_clip": 0.01113021, "auxiliary_loss_mlp": 0.01026678, "balance_loss_clip": 1.04725575, "balance_loss_mlp": 1.01892662, "epoch": 0.7057055251608249, "flos": 11692299744000.0, "grad_norm": 1.8004013758980293, "language_loss": 0.69582367, "learning_rate": 8.416366185267663e-07, "loss": 0.7172206, "num_input_tokens_seen": 126206910, "step": 5869, "time_per_iteration": 2.510385036468506 }, { "auxiliary_loss_clip": 0.01154063, "auxiliary_loss_mlp": 0.0102178, "balance_loss_clip": 1.04589319, "balance_loss_mlp": 1.01464534, "epoch": 0.7058257680514639, "flos": 22711560399360.0, "grad_norm": 1.6831362460816388, "language_loss": 0.78001696, "learning_rate": 8.410016898978778e-07, "loss": 0.80177534, "num_input_tokens_seen": 126224385, "step": 5870, "time_per_iteration": 2.459655523300171 }, { "auxiliary_loss_clip": 0.01112938, "auxiliary_loss_mlp": 0.01025937, "balance_loss_clip": 1.04558218, "balance_loss_mlp": 1.0186981, "epoch": 0.7059460109421031, "flos": 17529543043200.0, "grad_norm": 1.6492320170949961, "language_loss": 0.78770697, "learning_rate": 8.403669370881115e-07, "loss": 0.80909574, "num_input_tokens_seen": 126243120, "step": 5871, "time_per_iteration": 2.5525572299957275 }, { "auxiliary_loss_clip": 0.01168931, "auxiliary_loss_mlp": 0.01026675, "balance_loss_clip": 1.04996181, "balance_loss_mlp": 1.01989484, "epoch": 0.7060662538327421, "flos": 23544687427200.0, "grad_norm": 1.8383117626121872, "language_loss": 0.78571761, "learning_rate": 8.397323601937587e-07, "loss": 0.80767369, "num_input_tokens_seen": 126263020, "step": 5872, "time_per_iteration": 2.429551362991333 }, { "auxiliary_loss_clip": 0.01120571, "auxiliary_loss_mlp": 0.01026093, "balance_loss_clip": 1.04356551, "balance_loss_mlp": 1.01910996, "epoch": 0.7061864967233812, "flos": 30260736875520.0, "grad_norm": 1.745887519231487, "language_loss": 0.76808316, "learning_rate": 8.390979593110838e-07, "loss": 0.78954977, "num_input_tokens_seen": 126285150, "step": 5873, "time_per_iteration": 2.6032698154449463 }, { "auxiliary_loss_clip": 0.01145353, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.04846871, "balance_loss_mlp": 1.01808167, "epoch": 0.7063067396140204, "flos": 20701460424960.0, "grad_norm": 1.4806571208470214, "language_loss": 0.81541359, "learning_rate": 8.384637345363262e-07, "loss": 0.83712661, "num_input_tokens_seen": 126304340, "step": 5874, "time_per_iteration": 2.5697927474975586 }, { "auxiliary_loss_clip": 0.01132994, "auxiliary_loss_mlp": 0.01023677, "balance_loss_clip": 1.04261243, "balance_loss_mlp": 1.01652193, "epoch": 0.7064269825046594, "flos": 32266168081920.0, "grad_norm": 1.7318408462134016, "language_loss": 0.766119, "learning_rate": 8.378296859656964e-07, "loss": 0.78768569, "num_input_tokens_seen": 126325495, "step": 5875, "time_per_iteration": 2.5778212547302246 }, { "auxiliary_loss_clip": 0.01141256, "auxiliary_loss_mlp": 0.01028521, "balance_loss_clip": 1.04652238, "balance_loss_mlp": 1.02108204, "epoch": 0.7065472253952985, "flos": 30227124723840.0, "grad_norm": 2.0260462488986963, "language_loss": 0.68625867, "learning_rate": 8.371958136953792e-07, "loss": 0.70795649, "num_input_tokens_seen": 126345525, "step": 5876, "time_per_iteration": 2.557030439376831 }, { "auxiliary_loss_clip": 0.0112916, "auxiliary_loss_mlp": 0.01028571, "balance_loss_clip": 1.04245698, "balance_loss_mlp": 1.02067327, "epoch": 0.7066674682859376, "flos": 16216720859520.0, "grad_norm": 3.4497262046621087, "language_loss": 0.66013145, "learning_rate": 8.365621178215326e-07, "loss": 0.68170869, "num_input_tokens_seen": 126361995, "step": 5877, "time_per_iteration": 2.5502867698669434 }, { "auxiliary_loss_clip": 0.01148615, "auxiliary_loss_mlp": 0.01025465, "balance_loss_clip": 1.04501164, "balance_loss_mlp": 1.01883674, "epoch": 0.7067877111765767, "flos": 14830461319680.0, "grad_norm": 2.467832362969757, "language_loss": 0.75492555, "learning_rate": 8.359285984402871e-07, "loss": 0.77666628, "num_input_tokens_seen": 126379260, "step": 5878, "time_per_iteration": 2.432802438735962 }, { "auxiliary_loss_clip": 0.01134576, "auxiliary_loss_mlp": 0.01022677, "balance_loss_clip": 1.04621029, "balance_loss_mlp": 1.01581955, "epoch": 0.7069079540672157, "flos": 25440196037760.0, "grad_norm": 1.8604467206663504, "language_loss": 0.73830485, "learning_rate": 8.352952556477489e-07, "loss": 0.75987744, "num_input_tokens_seen": 126397170, "step": 5879, "time_per_iteration": 2.5171470642089844 }, { "auxiliary_loss_clip": 0.01153866, "auxiliary_loss_mlp": 0.01026467, "balance_loss_clip": 1.04867709, "balance_loss_mlp": 1.01946938, "epoch": 0.7070281969578549, "flos": 24607751368320.0, "grad_norm": 1.9111363703637, "language_loss": 0.76861405, "learning_rate": 8.34662089539993e-07, "loss": 0.79041731, "num_input_tokens_seen": 126416680, "step": 5880, "time_per_iteration": 2.476982593536377 }, { "auxiliary_loss_clip": 0.01166145, "auxiliary_loss_mlp": 0.01024269, "balance_loss_clip": 1.04948759, "balance_loss_mlp": 1.01721179, "epoch": 0.707148439848494, "flos": 26724469887360.0, "grad_norm": 9.695301230249642, "language_loss": 0.79246104, "learning_rate": 8.340291002130722e-07, "loss": 0.81436527, "num_input_tokens_seen": 126435870, "step": 5881, "time_per_iteration": 2.459730625152588 }, { "auxiliary_loss_clip": 0.01170055, "auxiliary_loss_mlp": 0.0102628, "balance_loss_clip": 1.04911423, "balance_loss_mlp": 1.01866269, "epoch": 0.707268682739133, "flos": 15085750256640.0, "grad_norm": 4.180355208202724, "language_loss": 0.79438704, "learning_rate": 8.3339628776301e-07, "loss": 0.8163504, "num_input_tokens_seen": 126454010, "step": 5882, "time_per_iteration": 3.126051425933838 }, { "auxiliary_loss_clip": 0.01167083, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 1.0481497, "balance_loss_mlp": 1.0180068, "epoch": 0.7073889256297722, "flos": 34313148345600.0, "grad_norm": 1.829479874034662, "language_loss": 0.57265085, "learning_rate": 8.327636522858033e-07, "loss": 0.59457147, "num_input_tokens_seen": 126473615, "step": 5883, "time_per_iteration": 2.517216920852661 }, { "auxiliary_loss_clip": 0.01113286, "auxiliary_loss_mlp": 0.0102884, "balance_loss_clip": 1.04569483, "balance_loss_mlp": 1.02137113, "epoch": 0.7075091685204112, "flos": 20083940784000.0, "grad_norm": 1.8732293239577875, "language_loss": 0.77574593, "learning_rate": 8.321311938774225e-07, "loss": 0.79716718, "num_input_tokens_seen": 126492705, "step": 5884, "time_per_iteration": 3.3915157318115234 }, { "auxiliary_loss_clip": 0.01171782, "auxiliary_loss_mlp": 0.01026484, "balance_loss_clip": 1.0495218, "balance_loss_mlp": 1.01899755, "epoch": 0.7076294114110503, "flos": 20777124424320.0, "grad_norm": 1.9646135379962832, "language_loss": 0.79203027, "learning_rate": 8.314989126338104e-07, "loss": 0.81401294, "num_input_tokens_seen": 126512715, "step": 5885, "time_per_iteration": 3.2675116062164307 }, { "auxiliary_loss_clip": 0.01157339, "auxiliary_loss_mlp": 0.01025987, "balance_loss_clip": 1.04732287, "balance_loss_mlp": 1.01873934, "epoch": 0.7077496543016895, "flos": 17967689141760.0, "grad_norm": 1.684375479227643, "language_loss": 0.84470969, "learning_rate": 8.308668086508847e-07, "loss": 0.86654294, "num_input_tokens_seen": 126530795, "step": 5886, "time_per_iteration": 2.439687967300415 }, { "auxiliary_loss_clip": 0.01128684, "auxiliary_loss_mlp": 0.01020038, "balance_loss_clip": 1.04210651, "balance_loss_mlp": 1.01254249, "epoch": 0.7078698971923285, "flos": 45478098564480.0, "grad_norm": 1.881127755599909, "language_loss": 0.73955005, "learning_rate": 8.302348820245342e-07, "loss": 0.76103729, "num_input_tokens_seen": 126553360, "step": 5887, "time_per_iteration": 2.731684446334839 }, { "auxiliary_loss_clip": 0.01126637, "auxiliary_loss_mlp": 0.01025945, "balance_loss_clip": 1.04322624, "balance_loss_mlp": 1.01799381, "epoch": 0.7079901400829676, "flos": 26943704547840.0, "grad_norm": 2.409356345252419, "language_loss": 0.70149148, "learning_rate": 8.296031328506232e-07, "loss": 0.72301733, "num_input_tokens_seen": 126573110, "step": 5888, "time_per_iteration": 2.565338134765625 }, { "auxiliary_loss_clip": 0.01141519, "auxiliary_loss_mlp": 0.01024775, "balance_loss_clip": 1.0468049, "balance_loss_mlp": 1.01764059, "epoch": 0.7081103829736067, "flos": 24423206267520.0, "grad_norm": 1.9519650635385772, "language_loss": 0.75844556, "learning_rate": 8.289715612249857e-07, "loss": 0.78010851, "num_input_tokens_seen": 126593725, "step": 5889, "time_per_iteration": 3.256669044494629 }, { "auxiliary_loss_clip": 0.0113752, "auxiliary_loss_mlp": 0.01026351, "balance_loss_clip": 1.04622734, "balance_loss_mlp": 1.01879311, "epoch": 0.7082306258642458, "flos": 18543300589440.0, "grad_norm": 2.4845261798980314, "language_loss": 0.77542293, "learning_rate": 8.283401672434305e-07, "loss": 0.79706168, "num_input_tokens_seen": 126608950, "step": 5890, "time_per_iteration": 2.437650203704834 }, { "auxiliary_loss_clip": 0.01138333, "auxiliary_loss_mlp": 0.01026034, "balance_loss_clip": 1.04798603, "balance_loss_mlp": 1.01906633, "epoch": 0.7083508687548848, "flos": 23477534951040.0, "grad_norm": 2.413514482741669, "language_loss": 0.70276546, "learning_rate": 8.277089510017412e-07, "loss": 0.72440922, "num_input_tokens_seen": 126629755, "step": 5891, "time_per_iteration": 2.5006752014160156 }, { "auxiliary_loss_clip": 0.011395, "auxiliary_loss_mlp": 0.01025214, "balance_loss_clip": 1.0491358, "balance_loss_mlp": 1.01809764, "epoch": 0.708471111645524, "flos": 22419463000320.0, "grad_norm": 5.2911139257654405, "language_loss": 0.8237235, "learning_rate": 8.270779125956719e-07, "loss": 0.84537065, "num_input_tokens_seen": 126650135, "step": 5892, "time_per_iteration": 2.4883430004119873 }, { "auxiliary_loss_clip": 0.01107796, "auxiliary_loss_mlp": 0.01024325, "balance_loss_clip": 1.04331398, "balance_loss_mlp": 1.01713359, "epoch": 0.7085913545361631, "flos": 20922885815040.0, "grad_norm": 2.6514971254165056, "language_loss": 0.80068707, "learning_rate": 8.264470521209505e-07, "loss": 0.82200825, "num_input_tokens_seen": 126668500, "step": 5893, "time_per_iteration": 2.5412471294403076 }, { "auxiliary_loss_clip": 0.01145371, "auxiliary_loss_mlp": 0.01023477, "balance_loss_clip": 1.04493821, "balance_loss_mlp": 1.01636338, "epoch": 0.7087115974268021, "flos": 15012384727680.0, "grad_norm": 2.3041720460929405, "language_loss": 0.76861179, "learning_rate": 8.258163696732785e-07, "loss": 0.79030031, "num_input_tokens_seen": 126686090, "step": 5894, "time_per_iteration": 2.422011375427246 }, { "auxiliary_loss_clip": 0.01148852, "auxiliary_loss_mlp": 0.01024945, "balance_loss_clip": 1.04607105, "balance_loss_mlp": 1.01791513, "epoch": 0.7088318403174413, "flos": 21539040739200.0, "grad_norm": 1.857016607301194, "language_loss": 0.7716186, "learning_rate": 8.251858653483288e-07, "loss": 0.7933566, "num_input_tokens_seen": 126704255, "step": 5895, "time_per_iteration": 2.457542657852173 }, { "auxiliary_loss_clip": 0.01154831, "auxiliary_loss_mlp": 0.01023376, "balance_loss_clip": 1.04921913, "balance_loss_mlp": 1.01609182, "epoch": 0.7089520832080803, "flos": 15516785462400.0, "grad_norm": 2.015062678844766, "language_loss": 0.85957199, "learning_rate": 8.245555392417501e-07, "loss": 0.88135409, "num_input_tokens_seen": 126718910, "step": 5896, "time_per_iteration": 2.3973546028137207 }, { "auxiliary_loss_clip": 0.01098049, "auxiliary_loss_mlp": 0.01021564, "balance_loss_clip": 1.03941357, "balance_loss_mlp": 1.01419091, "epoch": 0.7090723260987194, "flos": 20412667077120.0, "grad_norm": 2.6988756889881804, "language_loss": 0.78762686, "learning_rate": 8.239253914491613e-07, "loss": 0.80882299, "num_input_tokens_seen": 126737235, "step": 5897, "time_per_iteration": 2.5295279026031494 }, { "auxiliary_loss_clip": 0.01122608, "auxiliary_loss_mlp": 0.01020553, "balance_loss_clip": 1.04569817, "balance_loss_mlp": 1.01364517, "epoch": 0.7091925689893585, "flos": 25668337271040.0, "grad_norm": 1.7954880170059735, "language_loss": 0.75259316, "learning_rate": 8.232954220661556e-07, "loss": 0.77402484, "num_input_tokens_seen": 126759970, "step": 5898, "time_per_iteration": 2.5826380252838135 }, { "auxiliary_loss_clip": 0.01169178, "auxiliary_loss_mlp": 0.01028165, "balance_loss_clip": 1.05151212, "balance_loss_mlp": 1.02137613, "epoch": 0.7093128118799976, "flos": 24206629213440.0, "grad_norm": 2.4662082860745635, "language_loss": 0.70080054, "learning_rate": 8.226656311882989e-07, "loss": 0.72277403, "num_input_tokens_seen": 126779280, "step": 5899, "time_per_iteration": 2.4339444637298584 }, { "auxiliary_loss_clip": 0.01151128, "auxiliary_loss_mlp": 0.01026805, "balance_loss_clip": 1.04739881, "balance_loss_mlp": 1.01989055, "epoch": 0.7094330547706367, "flos": 16646786398080.0, "grad_norm": 2.0857110750536445, "language_loss": 0.77122843, "learning_rate": 8.22036018911129e-07, "loss": 0.79300779, "num_input_tokens_seen": 126797310, "step": 5900, "time_per_iteration": 2.435462713241577 }, { "auxiliary_loss_clip": 0.01172454, "auxiliary_loss_mlp": 0.01028306, "balance_loss_clip": 1.04938328, "balance_loss_mlp": 1.0207299, "epoch": 0.7095532976612757, "flos": 16283370545280.0, "grad_norm": 2.164938336487819, "language_loss": 0.80656147, "learning_rate": 8.214065853301599e-07, "loss": 0.82856911, "num_input_tokens_seen": 126812840, "step": 5901, "time_per_iteration": 2.388033390045166 }, { "auxiliary_loss_clip": 0.01063006, "auxiliary_loss_mlp": 0.01003132, "balance_loss_clip": 1.01616704, "balance_loss_mlp": 1.00214851, "epoch": 0.7096735405519149, "flos": 70722080559360.0, "grad_norm": 0.8472975926833304, "language_loss": 0.58255839, "learning_rate": 8.207773305408734e-07, "loss": 0.60321987, "num_input_tokens_seen": 126880060, "step": 5902, "time_per_iteration": 3.1554813385009766 }, { "auxiliary_loss_clip": 0.01119159, "auxiliary_loss_mlp": 0.0102758, "balance_loss_clip": 1.04238808, "balance_loss_mlp": 1.0198493, "epoch": 0.709793783442554, "flos": 23621500661760.0, "grad_norm": 1.965219374983623, "language_loss": 0.79957139, "learning_rate": 8.201482546387288e-07, "loss": 0.82103878, "num_input_tokens_seen": 126899535, "step": 5903, "time_per_iteration": 2.6002721786499023 }, { "auxiliary_loss_clip": 0.01153218, "auxiliary_loss_mlp": 0.0102567, "balance_loss_clip": 1.04794693, "balance_loss_mlp": 1.01869631, "epoch": 0.709914026333193, "flos": 25993472204160.0, "grad_norm": 1.6392482300275395, "language_loss": 0.91766375, "learning_rate": 8.195193577191553e-07, "loss": 0.93945265, "num_input_tokens_seen": 126921365, "step": 5904, "time_per_iteration": 2.4943082332611084 }, { "auxiliary_loss_clip": 0.01147404, "auxiliary_loss_mlp": 0.00761936, "balance_loss_clip": 1.0465194, "balance_loss_mlp": 1.0004282, "epoch": 0.7100342692238322, "flos": 24861531934080.0, "grad_norm": 2.201979540955928, "language_loss": 0.84544945, "learning_rate": 8.188906398775579e-07, "loss": 0.86454284, "num_input_tokens_seen": 126941910, "step": 5905, "time_per_iteration": 2.5196001529693604 }, { "auxiliary_loss_clip": 0.01168288, "auxiliary_loss_mlp": 0.00762281, "balance_loss_clip": 1.04793715, "balance_loss_mlp": 1.00042605, "epoch": 0.7101545121144712, "flos": 24932203943040.0, "grad_norm": 1.965040495960102, "language_loss": 0.69046426, "learning_rate": 8.18262101209311e-07, "loss": 0.70976996, "num_input_tokens_seen": 126961120, "step": 5906, "time_per_iteration": 2.4749648571014404 }, { "auxiliary_loss_clip": 0.011577, "auxiliary_loss_mlp": 0.01022723, "balance_loss_clip": 1.04731846, "balance_loss_mlp": 1.0156002, "epoch": 0.7102747550051103, "flos": 23768842250880.0, "grad_norm": 1.7456079028346174, "language_loss": 0.70122093, "learning_rate": 8.176337418097626e-07, "loss": 0.72302514, "num_input_tokens_seen": 126981590, "step": 5907, "time_per_iteration": 2.4929699897766113 }, { "auxiliary_loss_clip": 0.01153564, "auxiliary_loss_mlp": 0.00761701, "balance_loss_clip": 1.04880261, "balance_loss_mlp": 1.00035095, "epoch": 0.7103949978957494, "flos": 15303907509120.0, "grad_norm": 2.6451944249928823, "language_loss": 0.79995382, "learning_rate": 8.170055617742364e-07, "loss": 0.81910646, "num_input_tokens_seen": 126998870, "step": 5908, "time_per_iteration": 2.4380545616149902 }, { "auxiliary_loss_clip": 0.01133699, "auxiliary_loss_mlp": 0.01030153, "balance_loss_clip": 1.04359865, "balance_loss_mlp": 1.02233922, "epoch": 0.7105152407863885, "flos": 22638805401600.0, "grad_norm": 1.769385070674959, "language_loss": 0.71064436, "learning_rate": 8.163775611980252e-07, "loss": 0.73228288, "num_input_tokens_seen": 127017980, "step": 5909, "time_per_iteration": 3.2582204341888428 }, { "auxiliary_loss_clip": 0.01140579, "auxiliary_loss_mlp": 0.01026256, "balance_loss_clip": 1.04775047, "balance_loss_mlp": 1.01936603, "epoch": 0.7106354836770276, "flos": 17238594879360.0, "grad_norm": 1.5974972136353895, "language_loss": 0.78581631, "learning_rate": 8.157497401763982e-07, "loss": 0.80748469, "num_input_tokens_seen": 127035645, "step": 5910, "time_per_iteration": 2.4502665996551514 }, { "auxiliary_loss_clip": 0.01151934, "auxiliary_loss_mlp": 0.01023923, "balance_loss_clip": 1.04712069, "balance_loss_mlp": 1.01661563, "epoch": 0.7107557265676667, "flos": 20193647898240.0, "grad_norm": 1.8255890527228482, "language_loss": 0.78039765, "learning_rate": 8.151220988045935e-07, "loss": 0.80215621, "num_input_tokens_seen": 127054900, "step": 5911, "time_per_iteration": 3.2621326446533203 }, { "auxiliary_loss_clip": 0.01153461, "auxiliary_loss_mlp": 0.01023164, "balance_loss_clip": 1.04785657, "balance_loss_mlp": 1.01639557, "epoch": 0.7108759694583058, "flos": 21507080613120.0, "grad_norm": 2.0565607215947623, "language_loss": 0.82941008, "learning_rate": 8.144946371778234e-07, "loss": 0.85117626, "num_input_tokens_seen": 127075010, "step": 5912, "time_per_iteration": 3.3095526695251465 }, { "auxiliary_loss_clip": 0.01140143, "auxiliary_loss_mlp": 0.00762795, "balance_loss_clip": 1.04746783, "balance_loss_mlp": 1.0004282, "epoch": 0.7109962123489448, "flos": 24061909317120.0, "grad_norm": 1.8420633798382486, "language_loss": 0.78162861, "learning_rate": 8.138673553912751e-07, "loss": 0.80065787, "num_input_tokens_seen": 127095570, "step": 5913, "time_per_iteration": 2.4990622997283936 }, { "auxiliary_loss_clip": 0.01111225, "auxiliary_loss_mlp": 0.01023491, "balance_loss_clip": 1.04306483, "balance_loss_mlp": 1.01618683, "epoch": 0.711116455239584, "flos": 30480474326400.0, "grad_norm": 10.918322215441364, "language_loss": 0.57274139, "learning_rate": 8.132402535401059e-07, "loss": 0.59408861, "num_input_tokens_seen": 127116825, "step": 5914, "time_per_iteration": 2.619413375854492 }, { "auxiliary_loss_clip": 0.01153458, "auxiliary_loss_mlp": 0.0102762, "balance_loss_clip": 1.04951155, "balance_loss_mlp": 1.0201335, "epoch": 0.711236698130223, "flos": 25045610158080.0, "grad_norm": 2.109172351053258, "language_loss": 0.74345809, "learning_rate": 8.126133317194465e-07, "loss": 0.76526886, "num_input_tokens_seen": 127137015, "step": 5915, "time_per_iteration": 3.184783458709717 }, { "auxiliary_loss_clip": 0.01108946, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 1.04166222, "balance_loss_mlp": 1.01949489, "epoch": 0.7113569410208621, "flos": 24206701040640.0, "grad_norm": 1.9780733247385713, "language_loss": 0.74532568, "learning_rate": 8.11986590024401e-07, "loss": 0.76668894, "num_input_tokens_seen": 127156755, "step": 5916, "time_per_iteration": 2.5804224014282227 }, { "auxiliary_loss_clip": 0.01147855, "auxiliary_loss_mlp": 0.01027545, "balance_loss_clip": 1.05148816, "balance_loss_mlp": 1.01954293, "epoch": 0.7114771839115013, "flos": 35439306526080.0, "grad_norm": 1.876163098924071, "language_loss": 0.69152963, "learning_rate": 8.113600285500442e-07, "loss": 0.71328366, "num_input_tokens_seen": 127176965, "step": 5917, "time_per_iteration": 2.600675106048584 }, { "auxiliary_loss_clip": 0.01168909, "auxiliary_loss_mlp": 0.01019595, "balance_loss_clip": 1.04831445, "balance_loss_mlp": 1.01253796, "epoch": 0.7115974268021403, "flos": 21099458096640.0, "grad_norm": 1.8739102300953667, "language_loss": 0.74540234, "learning_rate": 8.107336473914268e-07, "loss": 0.76728743, "num_input_tokens_seen": 127195595, "step": 5918, "time_per_iteration": 2.414710283279419 }, { "auxiliary_loss_clip": 0.010498, "auxiliary_loss_mlp": 0.01001163, "balance_loss_clip": 1.01632953, "balance_loss_mlp": 1.00010765, "epoch": 0.7117176696927794, "flos": 56752866616320.0, "grad_norm": 0.7701010514575031, "language_loss": 0.55774122, "learning_rate": 8.101074466435694e-07, "loss": 0.57825083, "num_input_tokens_seen": 127255070, "step": 5919, "time_per_iteration": 3.0018229484558105 }, { "auxiliary_loss_clip": 0.01147664, "auxiliary_loss_mlp": 0.01025741, "balance_loss_clip": 1.04545951, "balance_loss_mlp": 1.01873732, "epoch": 0.7118379125834186, "flos": 15925269905280.0, "grad_norm": 1.7143730186604327, "language_loss": 0.6781137, "learning_rate": 8.094814264014662e-07, "loss": 0.69984782, "num_input_tokens_seen": 127273825, "step": 5920, "time_per_iteration": 2.423405885696411 }, { "auxiliary_loss_clip": 0.01170842, "auxiliary_loss_mlp": 0.01028259, "balance_loss_clip": 1.04891109, "balance_loss_mlp": 1.02046835, "epoch": 0.7119581554740576, "flos": 20193360589440.0, "grad_norm": 2.06826062705643, "language_loss": 0.81433731, "learning_rate": 8.088555867600844e-07, "loss": 0.83632827, "num_input_tokens_seen": 127289990, "step": 5921, "time_per_iteration": 2.425159454345703 }, { "auxiliary_loss_clip": 0.01124207, "auxiliary_loss_mlp": 0.01022925, "balance_loss_clip": 1.0438298, "balance_loss_mlp": 1.01627922, "epoch": 0.7120783983646967, "flos": 34715383822080.0, "grad_norm": 1.645086391183544, "language_loss": 0.60422426, "learning_rate": 8.08229927814362e-07, "loss": 0.62569559, "num_input_tokens_seen": 127312880, "step": 5922, "time_per_iteration": 2.6355979442596436 }, { "auxiliary_loss_clip": 0.01121827, "auxiliary_loss_mlp": 0.01022627, "balance_loss_clip": 1.04188216, "balance_loss_mlp": 1.01566815, "epoch": 0.7121986412553358, "flos": 26359114700160.0, "grad_norm": 1.6856458353800652, "language_loss": 0.65223628, "learning_rate": 8.076044496592134e-07, "loss": 0.67368084, "num_input_tokens_seen": 127334730, "step": 5923, "time_per_iteration": 2.558905601501465 }, { "auxiliary_loss_clip": 0.01140975, "auxiliary_loss_mlp": 0.01026422, "balance_loss_clip": 1.04757428, "balance_loss_mlp": 1.01953149, "epoch": 0.7123188841459749, "flos": 11145344371200.0, "grad_norm": 2.9472909354827648, "language_loss": 0.779719, "learning_rate": 8.069791523895204e-07, "loss": 0.80139291, "num_input_tokens_seen": 127351180, "step": 5924, "time_per_iteration": 2.444303274154663 }, { "auxiliary_loss_clip": 0.01113495, "auxiliary_loss_mlp": 0.01027021, "balance_loss_clip": 1.03968561, "balance_loss_mlp": 1.02003574, "epoch": 0.7124391270366139, "flos": 20811670329600.0, "grad_norm": 2.7339564776865966, "language_loss": 0.77795053, "learning_rate": 8.063540361001422e-07, "loss": 0.79935575, "num_input_tokens_seen": 127369750, "step": 5925, "time_per_iteration": 2.509763240814209 }, { "auxiliary_loss_clip": 0.01120847, "auxiliary_loss_mlp": 0.01026442, "balance_loss_clip": 1.04367721, "balance_loss_mlp": 1.0185082, "epoch": 0.7125593699272531, "flos": 17603734584960.0, "grad_norm": 1.8695298956361248, "language_loss": 0.79380929, "learning_rate": 8.057291008859069e-07, "loss": 0.81528217, "num_input_tokens_seen": 127387910, "step": 5926, "time_per_iteration": 2.5028536319732666 }, { "auxiliary_loss_clip": 0.01151277, "auxiliary_loss_mlp": 0.01027847, "balance_loss_clip": 1.04630232, "balance_loss_mlp": 1.02089465, "epoch": 0.7126796128178922, "flos": 28654057526400.0, "grad_norm": 1.892980246859141, "language_loss": 0.68136835, "learning_rate": 8.051043468416187e-07, "loss": 0.70315963, "num_input_tokens_seen": 127409160, "step": 5927, "time_per_iteration": 2.5853114128112793 }, { "auxiliary_loss_clip": 0.01167046, "auxiliary_loss_mlp": 0.0102243, "balance_loss_clip": 1.04982388, "balance_loss_mlp": 1.01541996, "epoch": 0.7127998557085312, "flos": 16034438315520.0, "grad_norm": 1.739864015659535, "language_loss": 0.82251871, "learning_rate": 8.044797740620506e-07, "loss": 0.84441346, "num_input_tokens_seen": 127427765, "step": 5928, "time_per_iteration": 2.4110355377197266 }, { "auxiliary_loss_clip": 0.01107474, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 1.04434967, "balance_loss_mlp": 1.02057743, "epoch": 0.7129200985991703, "flos": 23403271582080.0, "grad_norm": 2.3382013158819914, "language_loss": 0.78935671, "learning_rate": 8.038553826419494e-07, "loss": 0.81070387, "num_input_tokens_seen": 127446475, "step": 5929, "time_per_iteration": 2.5456135272979736 }, { "auxiliary_loss_clip": 0.01166213, "auxiliary_loss_mlp": 0.01021946, "balance_loss_clip": 1.04738116, "balance_loss_mlp": 1.01466799, "epoch": 0.7130403414898094, "flos": 21397445326080.0, "grad_norm": 1.5925983082648503, "language_loss": 0.80878288, "learning_rate": 8.032311726760364e-07, "loss": 0.83066452, "num_input_tokens_seen": 127467695, "step": 5930, "time_per_iteration": 2.4311695098876953 }, { "auxiliary_loss_clip": 0.01117748, "auxiliary_loss_mlp": 0.01022347, "balance_loss_clip": 1.04487777, "balance_loss_mlp": 1.01435423, "epoch": 0.7131605843804485, "flos": 74739045306240.0, "grad_norm": 1.6652256721273955, "language_loss": 0.6909281, "learning_rate": 8.026071442590022e-07, "loss": 0.71232903, "num_input_tokens_seen": 127494590, "step": 5931, "time_per_iteration": 2.9107630252838135 }, { "auxiliary_loss_clip": 0.01155695, "auxiliary_loss_mlp": 0.01023063, "balance_loss_clip": 1.05101502, "balance_loss_mlp": 1.0163188, "epoch": 0.7132808272710875, "flos": 18368739469440.0, "grad_norm": 1.8645366465907327, "language_loss": 0.80614018, "learning_rate": 8.019832974855134e-07, "loss": 0.82792771, "num_input_tokens_seen": 127512550, "step": 5932, "time_per_iteration": 2.414710760116577 }, { "auxiliary_loss_clip": 0.01124711, "auxiliary_loss_mlp": 0.01021925, "balance_loss_clip": 1.04586852, "balance_loss_mlp": 1.01435828, "epoch": 0.7134010701617267, "flos": 23253380127360.0, "grad_norm": 2.397313765032879, "language_loss": 0.82587063, "learning_rate": 8.013596324502052e-07, "loss": 0.84733701, "num_input_tokens_seen": 127531015, "step": 5933, "time_per_iteration": 2.5310628414154053 }, { "auxiliary_loss_clip": 0.01146503, "auxiliary_loss_mlp": 0.01022212, "balance_loss_clip": 1.04686999, "balance_loss_mlp": 1.01549208, "epoch": 0.7135213130523658, "flos": 23653137565440.0, "grad_norm": 1.6789758185620631, "language_loss": 0.7860254, "learning_rate": 8.007361492476872e-07, "loss": 0.80771255, "num_input_tokens_seen": 127550340, "step": 5934, "time_per_iteration": 2.4680466651916504 }, { "auxiliary_loss_clip": 0.01135552, "auxiliary_loss_mlp": 0.01025064, "balance_loss_clip": 1.04570079, "balance_loss_mlp": 1.0172503, "epoch": 0.7136415559430048, "flos": 24790644443520.0, "grad_norm": 1.626311560765611, "language_loss": 0.7903989, "learning_rate": 8.001128479725426e-07, "loss": 0.81200504, "num_input_tokens_seen": 127572245, "step": 5935, "time_per_iteration": 2.542800188064575 }, { "auxiliary_loss_clip": 0.01102892, "auxiliary_loss_mlp": 0.01022208, "balance_loss_clip": 1.03877807, "balance_loss_mlp": 1.01485837, "epoch": 0.713761798833644, "flos": 18296954138880.0, "grad_norm": 1.6236523823109696, "language_loss": 0.81287438, "learning_rate": 7.994897287193248e-07, "loss": 0.8341254, "num_input_tokens_seen": 127591625, "step": 5936, "time_per_iteration": 3.272777557373047 }, { "auxiliary_loss_clip": 0.01156622, "auxiliary_loss_mlp": 0.01028473, "balance_loss_clip": 1.04727864, "balance_loss_mlp": 1.02078366, "epoch": 0.713882041724283, "flos": 15558262692480.0, "grad_norm": 12.020479750260124, "language_loss": 0.83558595, "learning_rate": 7.988667915825605e-07, "loss": 0.8574369, "num_input_tokens_seen": 127608690, "step": 5937, "time_per_iteration": 3.234461545944214 }, { "auxiliary_loss_clip": 0.01139117, "auxiliary_loss_mlp": 0.01025049, "balance_loss_clip": 1.04517698, "balance_loss_mlp": 1.01760483, "epoch": 0.7140022846149221, "flos": 24061011477120.0, "grad_norm": 2.190641409999294, "language_loss": 0.75256544, "learning_rate": 7.982440366567491e-07, "loss": 0.77420712, "num_input_tokens_seen": 127627180, "step": 5938, "time_per_iteration": 2.487715482711792 }, { "auxiliary_loss_clip": 0.01148037, "auxiliary_loss_mlp": 0.01023534, "balance_loss_clip": 1.04544067, "balance_loss_mlp": 1.01628566, "epoch": 0.7141225275055613, "flos": 27891710248320.0, "grad_norm": 1.798389852428495, "language_loss": 0.75367653, "learning_rate": 7.97621464036361e-07, "loss": 0.77539229, "num_input_tokens_seen": 127648940, "step": 5939, "time_per_iteration": 3.3337647914886475 }, { "auxiliary_loss_clip": 0.01156462, "auxiliary_loss_mlp": 0.01023416, "balance_loss_clip": 1.04747534, "balance_loss_mlp": 1.01582873, "epoch": 0.7142427703962003, "flos": 19682603147520.0, "grad_norm": 1.612917090763218, "language_loss": 0.68116152, "learning_rate": 7.969990738158417e-07, "loss": 0.70296025, "num_input_tokens_seen": 127667350, "step": 5940, "time_per_iteration": 2.427645206451416 }, { "auxiliary_loss_clip": 0.0115693, "auxiliary_loss_mlp": 0.01025578, "balance_loss_clip": 1.04932261, "balance_loss_mlp": 1.01804423, "epoch": 0.7143630132868394, "flos": 21032377447680.0, "grad_norm": 2.197820580097943, "language_loss": 0.85094392, "learning_rate": 7.963768660896062e-07, "loss": 0.87276906, "num_input_tokens_seen": 127685760, "step": 5941, "time_per_iteration": 3.181696653366089 }, { "auxiliary_loss_clip": 0.01156318, "auxiliary_loss_mlp": 0.01025819, "balance_loss_clip": 1.04737258, "balance_loss_mlp": 1.01796341, "epoch": 0.7144832561774785, "flos": 24129923719680.0, "grad_norm": 2.1006198240667793, "language_loss": 0.82558203, "learning_rate": 7.957548409520432e-07, "loss": 0.84740341, "num_input_tokens_seen": 127704985, "step": 5942, "time_per_iteration": 2.446225643157959 }, { "auxiliary_loss_clip": 0.01125794, "auxiliary_loss_mlp": 0.01020061, "balance_loss_clip": 1.04293668, "balance_loss_mlp": 1.01313806, "epoch": 0.7146034990681176, "flos": 16325817442560.0, "grad_norm": 3.5090619472832847, "language_loss": 0.84121609, "learning_rate": 7.951329984975135e-07, "loss": 0.86267465, "num_input_tokens_seen": 127721925, "step": 5943, "time_per_iteration": 2.471788167953491 }, { "auxiliary_loss_clip": 0.01041298, "auxiliary_loss_mlp": 0.01001847, "balance_loss_clip": 1.01241088, "balance_loss_mlp": 1.00081563, "epoch": 0.7147237419587567, "flos": 69627164232960.0, "grad_norm": 0.7138321493804686, "language_loss": 0.54321826, "learning_rate": 7.94511338820349e-07, "loss": 0.56364971, "num_input_tokens_seen": 127784230, "step": 5944, "time_per_iteration": 3.0834860801696777 }, { "auxiliary_loss_clip": 0.01140484, "auxiliary_loss_mlp": 0.00762358, "balance_loss_clip": 1.04580259, "balance_loss_mlp": 1.0003413, "epoch": 0.7148439848493958, "flos": 22266806198400.0, "grad_norm": 2.297531839271508, "language_loss": 0.78409398, "learning_rate": 7.938898620148575e-07, "loss": 0.80312246, "num_input_tokens_seen": 127801990, "step": 5945, "time_per_iteration": 2.484203815460205 }, { "auxiliary_loss_clip": 0.01139539, "auxiliary_loss_mlp": 0.0102522, "balance_loss_clip": 1.0464921, "balance_loss_mlp": 1.01804686, "epoch": 0.7149642277400349, "flos": 17931383470080.0, "grad_norm": 1.9867961615757628, "language_loss": 0.70704776, "learning_rate": 7.932685681753135e-07, "loss": 0.72869533, "num_input_tokens_seen": 127819270, "step": 5946, "time_per_iteration": 2.4448025226593018 }, { "auxiliary_loss_clip": 0.01164645, "auxiliary_loss_mlp": 0.01021362, "balance_loss_clip": 1.04844403, "balance_loss_mlp": 1.01468062, "epoch": 0.7150844706306739, "flos": 31681937370240.0, "grad_norm": 2.025471147651801, "language_loss": 0.62650669, "learning_rate": 7.92647457395969e-07, "loss": 0.64836669, "num_input_tokens_seen": 127841095, "step": 5947, "time_per_iteration": 2.5029704570770264 }, { "auxiliary_loss_clip": 0.01104954, "auxiliary_loss_mlp": 0.01026908, "balance_loss_clip": 1.03982544, "balance_loss_mlp": 1.01941872, "epoch": 0.7152047135213131, "flos": 10926217451520.0, "grad_norm": 2.033030334656612, "language_loss": 0.74778056, "learning_rate": 7.920265297710444e-07, "loss": 0.76909918, "num_input_tokens_seen": 127858485, "step": 5948, "time_per_iteration": 2.55680775642395 }, { "auxiliary_loss_clip": 0.01155035, "auxiliary_loss_mlp": 0.01028357, "balance_loss_clip": 1.04873919, "balance_loss_mlp": 1.02104664, "epoch": 0.7153249564119522, "flos": 20995640812800.0, "grad_norm": 1.8020935211084854, "language_loss": 0.73254424, "learning_rate": 7.914057853947363e-07, "loss": 0.75437814, "num_input_tokens_seen": 127877665, "step": 5949, "time_per_iteration": 2.4449634552001953 }, { "auxiliary_loss_clip": 0.01126886, "auxiliary_loss_mlp": 0.01030701, "balance_loss_clip": 1.04588938, "balance_loss_mlp": 1.02309573, "epoch": 0.7154451993025912, "flos": 24243114453120.0, "grad_norm": 2.3097265687939466, "language_loss": 0.62464869, "learning_rate": 7.907852243612089e-07, "loss": 0.6462245, "num_input_tokens_seen": 127898070, "step": 5950, "time_per_iteration": 2.5632383823394775 }, { "auxiliary_loss_clip": 0.01137629, "auxiliary_loss_mlp": 0.01024004, "balance_loss_clip": 1.04442382, "balance_loss_mlp": 1.01710463, "epoch": 0.7155654421932304, "flos": 23330947547520.0, "grad_norm": 2.4445920557105323, "language_loss": 0.72456896, "learning_rate": 7.901648467646009e-07, "loss": 0.7461853, "num_input_tokens_seen": 127917010, "step": 5951, "time_per_iteration": 2.4919145107269287 }, { "auxiliary_loss_clip": 0.01170737, "auxiliary_loss_mlp": 0.01028093, "balance_loss_clip": 1.05073297, "balance_loss_mlp": 1.02089024, "epoch": 0.7156856850838694, "flos": 22711883621760.0, "grad_norm": 1.513611795855191, "language_loss": 0.72323102, "learning_rate": 7.895446526990244e-07, "loss": 0.74521929, "num_input_tokens_seen": 127937025, "step": 5952, "time_per_iteration": 2.431321859359741 }, { "auxiliary_loss_clip": 0.0112192, "auxiliary_loss_mlp": 0.01026386, "balance_loss_clip": 1.04544544, "balance_loss_mlp": 1.01892948, "epoch": 0.7158059279745085, "flos": 19865424395520.0, "grad_norm": 1.4918191436637456, "language_loss": 0.75417376, "learning_rate": 7.889246422585609e-07, "loss": 0.77565682, "num_input_tokens_seen": 127956410, "step": 5953, "time_per_iteration": 2.5446574687957764 }, { "auxiliary_loss_clip": 0.01169615, "auxiliary_loss_mlp": 0.01022095, "balance_loss_clip": 1.05043077, "balance_loss_mlp": 1.01537204, "epoch": 0.7159261708651476, "flos": 24134772055680.0, "grad_norm": 2.0535115772754033, "language_loss": 0.73640311, "learning_rate": 7.883048155372675e-07, "loss": 0.75832021, "num_input_tokens_seen": 127974925, "step": 5954, "time_per_iteration": 2.438392400741577 }, { "auxiliary_loss_clip": 0.0114473, "auxiliary_loss_mlp": 0.01023021, "balance_loss_clip": 1.04652762, "balance_loss_mlp": 1.01592767, "epoch": 0.7160464137557867, "flos": 16983198201600.0, "grad_norm": 2.2442108113923274, "language_loss": 0.71274406, "learning_rate": 7.876851726291698e-07, "loss": 0.73442155, "num_input_tokens_seen": 127993225, "step": 5955, "time_per_iteration": 2.462009906768799 }, { "auxiliary_loss_clip": 0.01129355, "auxiliary_loss_mlp": 0.01023029, "balance_loss_clip": 1.04402018, "balance_loss_mlp": 1.01588583, "epoch": 0.7161666566464258, "flos": 25228251838080.0, "grad_norm": 1.9405708162473427, "language_loss": 0.78466296, "learning_rate": 7.870657136282666e-07, "loss": 0.8061868, "num_input_tokens_seen": 128012085, "step": 5956, "time_per_iteration": 2.586334705352783 }, { "auxiliary_loss_clip": 0.0114964, "auxiliary_loss_mlp": 0.01026719, "balance_loss_clip": 1.04631352, "balance_loss_mlp": 1.01942682, "epoch": 0.7162868995370649, "flos": 26468390851200.0, "grad_norm": 1.5535642698682754, "language_loss": 0.81967443, "learning_rate": 7.86446438628531e-07, "loss": 0.84143806, "num_input_tokens_seen": 128033155, "step": 5957, "time_per_iteration": 2.480459213256836 }, { "auxiliary_loss_clip": 0.010693, "auxiliary_loss_mlp": 0.01001776, "balance_loss_clip": 1.01337409, "balance_loss_mlp": 1.00078666, "epoch": 0.716407142427704, "flos": 69998912040960.0, "grad_norm": 0.7706519337822517, "language_loss": 0.56893235, "learning_rate": 7.858273477239059e-07, "loss": 0.58964312, "num_input_tokens_seen": 128101575, "step": 5958, "time_per_iteration": 3.028585433959961 }, { "auxiliary_loss_clip": 0.01097445, "auxiliary_loss_mlp": 0.01025699, "balance_loss_clip": 1.04128027, "balance_loss_mlp": 1.01806951, "epoch": 0.716527385318343, "flos": 20740459616640.0, "grad_norm": 3.981616231933726, "language_loss": 0.71342969, "learning_rate": 7.852084410083067e-07, "loss": 0.73466116, "num_input_tokens_seen": 128120395, "step": 5959, "time_per_iteration": 2.529221773147583 }, { "auxiliary_loss_clip": 0.01135568, "auxiliary_loss_mlp": 0.01023634, "balance_loss_clip": 1.04644775, "balance_loss_mlp": 1.01680613, "epoch": 0.7166476282089821, "flos": 25371966153600.0, "grad_norm": 1.5823201993328826, "language_loss": 0.63586152, "learning_rate": 7.84589718575621e-07, "loss": 0.6574536, "num_input_tokens_seen": 128140840, "step": 5960, "time_per_iteration": 2.499711513519287 }, { "auxiliary_loss_clip": 0.01139584, "auxiliary_loss_mlp": 0.01024692, "balance_loss_clip": 1.04140449, "balance_loss_mlp": 1.01726508, "epoch": 0.7167678710996213, "flos": 24133730561280.0, "grad_norm": 2.051914502092473, "language_loss": 0.69125974, "learning_rate": 7.83971180519708e-07, "loss": 0.71290255, "num_input_tokens_seen": 128159695, "step": 5961, "time_per_iteration": 2.485414743423462 }, { "auxiliary_loss_clip": 0.01171054, "auxiliary_loss_mlp": 0.01020219, "balance_loss_clip": 1.05058181, "balance_loss_mlp": 1.01269412, "epoch": 0.7168881139902603, "flos": 30226586019840.0, "grad_norm": 1.9783952506312736, "language_loss": 0.75783879, "learning_rate": 7.833528269344008e-07, "loss": 0.77975154, "num_input_tokens_seen": 128179600, "step": 5962, "time_per_iteration": 2.4719698429107666 }, { "auxiliary_loss_clip": 0.01127714, "auxiliary_loss_mlp": 0.01027687, "balance_loss_clip": 1.0478543, "balance_loss_mlp": 1.02006364, "epoch": 0.7170083568808994, "flos": 14606414236800.0, "grad_norm": 2.19147052371818, "language_loss": 0.77354544, "learning_rate": 7.827346579135023e-07, "loss": 0.7950995, "num_input_tokens_seen": 128196940, "step": 5963, "time_per_iteration": 3.235485553741455 }, { "auxiliary_loss_clip": 0.01132614, "auxiliary_loss_mlp": 0.01023625, "balance_loss_clip": 1.04109097, "balance_loss_mlp": 1.01607275, "epoch": 0.7171285997715385, "flos": 23331091201920.0, "grad_norm": 1.918318510181046, "language_loss": 0.83056885, "learning_rate": 7.821166735507885e-07, "loss": 0.85213125, "num_input_tokens_seen": 128215970, "step": 5964, "time_per_iteration": 3.3208627700805664 }, { "auxiliary_loss_clip": 0.01166054, "auxiliary_loss_mlp": 0.0102293, "balance_loss_clip": 1.04807389, "balance_loss_mlp": 1.01586342, "epoch": 0.7172488426621776, "flos": 16543543731840.0, "grad_norm": 1.7061114409008664, "language_loss": 0.68515599, "learning_rate": 7.81498873940007e-07, "loss": 0.70704591, "num_input_tokens_seen": 128233185, "step": 5965, "time_per_iteration": 3.24603533744812 }, { "auxiliary_loss_clip": 0.01157576, "auxiliary_loss_mlp": 0.01020295, "balance_loss_clip": 1.04477262, "balance_loss_mlp": 1.01268053, "epoch": 0.7173690855528166, "flos": 26541612725760.0, "grad_norm": 2.198894462808227, "language_loss": 0.77404654, "learning_rate": 7.808812591748768e-07, "loss": 0.79582524, "num_input_tokens_seen": 128253565, "step": 5966, "time_per_iteration": 2.4849202632904053 }, { "auxiliary_loss_clip": 0.01120585, "auxiliary_loss_mlp": 0.01022783, "balance_loss_clip": 1.04278624, "balance_loss_mlp": 1.01538587, "epoch": 0.7174893284434558, "flos": 22784099915520.0, "grad_norm": 1.9995247814891484, "language_loss": 0.65141243, "learning_rate": 7.802638293490915e-07, "loss": 0.67284608, "num_input_tokens_seen": 128273210, "step": 5967, "time_per_iteration": 2.511413812637329 }, { "auxiliary_loss_clip": 0.01141701, "auxiliary_loss_mlp": 0.01024165, "balance_loss_clip": 1.04485655, "balance_loss_mlp": 1.01720881, "epoch": 0.7176095713340949, "flos": 23293564467840.0, "grad_norm": 1.607539206124552, "language_loss": 0.76828629, "learning_rate": 7.796465845563123e-07, "loss": 0.78994495, "num_input_tokens_seen": 128292085, "step": 5968, "time_per_iteration": 3.2865138053894043 }, { "auxiliary_loss_clip": 0.01133691, "auxiliary_loss_mlp": 0.00762008, "balance_loss_clip": 1.04482985, "balance_loss_mlp": 1.00039935, "epoch": 0.7177298142247339, "flos": 25591631777280.0, "grad_norm": 1.9185757109855452, "language_loss": 0.79618633, "learning_rate": 7.790295248901766e-07, "loss": 0.81514329, "num_input_tokens_seen": 128313215, "step": 5969, "time_per_iteration": 2.513559103012085 }, { "auxiliary_loss_clip": 0.01154186, "auxiliary_loss_mlp": 0.01023751, "balance_loss_clip": 1.04812098, "balance_loss_mlp": 1.01649463, "epoch": 0.7178500571153731, "flos": 31652778504960.0, "grad_norm": 3.232229592781444, "language_loss": 0.62354678, "learning_rate": 7.784126504442902e-07, "loss": 0.64532614, "num_input_tokens_seen": 128336445, "step": 5970, "time_per_iteration": 2.550555944442749 }, { "auxiliary_loss_clip": 0.01116124, "auxiliary_loss_mlp": 0.01019245, "balance_loss_clip": 1.04328299, "balance_loss_mlp": 1.01193142, "epoch": 0.7179703000060121, "flos": 19427242383360.0, "grad_norm": 1.431572761934135, "language_loss": 0.67669272, "learning_rate": 7.777959613122351e-07, "loss": 0.69804645, "num_input_tokens_seen": 128356270, "step": 5971, "time_per_iteration": 2.496440887451172 }, { "auxiliary_loss_clip": 0.01131966, "auxiliary_loss_mlp": 0.01025844, "balance_loss_clip": 1.04554319, "balance_loss_mlp": 1.01872134, "epoch": 0.7180905428966512, "flos": 28839249072000.0, "grad_norm": 1.8246313015915578, "language_loss": 0.77922994, "learning_rate": 7.771794575875604e-07, "loss": 0.80080807, "num_input_tokens_seen": 128378140, "step": 5972, "time_per_iteration": 2.5360546112060547 }, { "auxiliary_loss_clip": 0.01156815, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.05040479, "balance_loss_mlp": 1.02143455, "epoch": 0.7182107857872904, "flos": 20047563285120.0, "grad_norm": 2.3693136504902492, "language_loss": 0.77600729, "learning_rate": 7.765631393637888e-07, "loss": 0.79786831, "num_input_tokens_seen": 128396335, "step": 5973, "time_per_iteration": 2.480060577392578 }, { "auxiliary_loss_clip": 0.01148786, "auxiliary_loss_mlp": 0.01022247, "balance_loss_clip": 1.04402542, "balance_loss_mlp": 1.01473045, "epoch": 0.7183310286779294, "flos": 22747686503040.0, "grad_norm": 2.7891524672140107, "language_loss": 0.48781639, "learning_rate": 7.75947006734417e-07, "loss": 0.50952673, "num_input_tokens_seen": 128414115, "step": 5974, "time_per_iteration": 2.4523487091064453 }, { "auxiliary_loss_clip": 0.01166098, "auxiliary_loss_mlp": 0.01025511, "balance_loss_clip": 1.04692411, "balance_loss_mlp": 1.01825094, "epoch": 0.7184512715685685, "flos": 17158262112000.0, "grad_norm": 1.9762087871114307, "language_loss": 0.82911795, "learning_rate": 7.753310597929101e-07, "loss": 0.85103405, "num_input_tokens_seen": 128430755, "step": 5975, "time_per_iteration": 2.363180160522461 }, { "auxiliary_loss_clip": 0.01069045, "auxiliary_loss_mlp": 0.01001057, "balance_loss_clip": 1.01330996, "balance_loss_mlp": 1.00007999, "epoch": 0.7185715144592076, "flos": 65509611448320.0, "grad_norm": 0.7588439268693543, "language_loss": 0.55146426, "learning_rate": 7.747152986327095e-07, "loss": 0.57216531, "num_input_tokens_seen": 128491300, "step": 5976, "time_per_iteration": 2.930744171142578 }, { "auxiliary_loss_clip": 0.01115246, "auxiliary_loss_mlp": 0.01025427, "balance_loss_clip": 1.04405522, "balance_loss_mlp": 1.01831925, "epoch": 0.7186917573498467, "flos": 16180522928640.0, "grad_norm": 1.7968503572306591, "language_loss": 0.68002009, "learning_rate": 7.740997233472228e-07, "loss": 0.70142674, "num_input_tokens_seen": 128508920, "step": 5977, "time_per_iteration": 2.4975147247314453 }, { "auxiliary_loss_clip": 0.0113971, "auxiliary_loss_mlp": 0.01019987, "balance_loss_clip": 1.04523945, "balance_loss_mlp": 1.01359463, "epoch": 0.7188120002404857, "flos": 29242274647680.0, "grad_norm": 2.028477166157526, "language_loss": 0.70545155, "learning_rate": 7.734843340298329e-07, "loss": 0.72704852, "num_input_tokens_seen": 128528745, "step": 5978, "time_per_iteration": 2.556546211242676 }, { "auxiliary_loss_clip": 0.01143963, "auxiliary_loss_mlp": 0.01029059, "balance_loss_clip": 1.04546499, "balance_loss_mlp": 1.02142322, "epoch": 0.7189322431311249, "flos": 33401161008000.0, "grad_norm": 1.8227730176582633, "language_loss": 0.7514649, "learning_rate": 7.72869130773895e-07, "loss": 0.77319515, "num_input_tokens_seen": 128549345, "step": 5979, "time_per_iteration": 2.580125093460083 }, { "auxiliary_loss_clip": 0.0106055, "auxiliary_loss_mlp": 0.01001273, "balance_loss_clip": 1.01362038, "balance_loss_mlp": 1.00025368, "epoch": 0.719052486021764, "flos": 61351263792000.0, "grad_norm": 0.7841740392847185, "language_loss": 0.59397566, "learning_rate": 7.722541136727343e-07, "loss": 0.61459386, "num_input_tokens_seen": 128605360, "step": 5980, "time_per_iteration": 2.910393238067627 }, { "auxiliary_loss_clip": 0.01153463, "auxiliary_loss_mlp": 0.0102267, "balance_loss_clip": 1.0480113, "balance_loss_mlp": 1.015306, "epoch": 0.719172728912403, "flos": 15596795007360.0, "grad_norm": 4.4258872686997925, "language_loss": 0.80874044, "learning_rate": 7.716392828196483e-07, "loss": 0.83050179, "num_input_tokens_seen": 128623160, "step": 5981, "time_per_iteration": 2.4103119373321533 }, { "auxiliary_loss_clip": 0.01154508, "auxiliary_loss_mlp": 0.01026728, "balance_loss_clip": 1.04875374, "balance_loss_mlp": 1.01968312, "epoch": 0.7192929718030422, "flos": 15553162961280.0, "grad_norm": 2.3907294685851266, "language_loss": 0.76855034, "learning_rate": 7.710246383079064e-07, "loss": 0.79036266, "num_input_tokens_seen": 128638545, "step": 5982, "time_per_iteration": 2.41166090965271 }, { "auxiliary_loss_clip": 0.01141437, "auxiliary_loss_mlp": 0.01024494, "balance_loss_clip": 1.04382706, "balance_loss_mlp": 1.01710343, "epoch": 0.7194132146936812, "flos": 21862487733120.0, "grad_norm": 2.7910990426610023, "language_loss": 0.92155468, "learning_rate": 7.704101802307492e-07, "loss": 0.94321406, "num_input_tokens_seen": 128650845, "step": 5983, "time_per_iteration": 2.452523708343506 }, { "auxiliary_loss_clip": 0.01117713, "auxiliary_loss_mlp": 0.01029865, "balance_loss_clip": 1.04321396, "balance_loss_mlp": 1.02182698, "epoch": 0.7195334575843203, "flos": 27338900958720.0, "grad_norm": 2.079848905110411, "language_loss": 0.86884528, "learning_rate": 7.697959086813912e-07, "loss": 0.89032108, "num_input_tokens_seen": 128667010, "step": 5984, "time_per_iteration": 2.530970335006714 }, { "auxiliary_loss_clip": 0.01117591, "auxiliary_loss_mlp": 0.01025737, "balance_loss_clip": 1.04199338, "balance_loss_mlp": 1.01849818, "epoch": 0.7196537004749595, "flos": 18770615809920.0, "grad_norm": 1.7202020209522562, "language_loss": 0.79658598, "learning_rate": 7.691818237530145e-07, "loss": 0.81801927, "num_input_tokens_seen": 128685870, "step": 5985, "time_per_iteration": 2.4783823490142822 }, { "auxiliary_loss_clip": 0.01124029, "auxiliary_loss_mlp": 0.01024653, "balance_loss_clip": 1.0443089, "balance_loss_mlp": 1.0174644, "epoch": 0.7197739433655985, "flos": 24531009960960.0, "grad_norm": 3.347365544878862, "language_loss": 0.77560627, "learning_rate": 7.685679255387774e-07, "loss": 0.79709309, "num_input_tokens_seen": 128704185, "step": 5986, "time_per_iteration": 2.5717742443084717 }, { "auxiliary_loss_clip": 0.01137741, "auxiliary_loss_mlp": 0.01025392, "balance_loss_clip": 1.04616594, "balance_loss_mlp": 1.01807249, "epoch": 0.7198941862562376, "flos": 18040587793920.0, "grad_norm": 1.9515849246077535, "language_loss": 0.7704761, "learning_rate": 7.679542141318065e-07, "loss": 0.79210746, "num_input_tokens_seen": 128721290, "step": 5987, "time_per_iteration": 2.460951566696167 }, { "auxiliary_loss_clip": 0.0112759, "auxiliary_loss_mlp": 0.0102375, "balance_loss_clip": 1.04280448, "balance_loss_mlp": 1.01686835, "epoch": 0.7200144291468767, "flos": 29022393542400.0, "grad_norm": 2.985615211994362, "language_loss": 0.75820959, "learning_rate": 7.673406896252013e-07, "loss": 0.77972293, "num_input_tokens_seen": 128742665, "step": 5988, "time_per_iteration": 2.5501222610473633 }, { "auxiliary_loss_clip": 0.01123739, "auxiliary_loss_mlp": 0.01028472, "balance_loss_clip": 1.04211879, "balance_loss_mlp": 1.02016938, "epoch": 0.7201346720375158, "flos": 25374264624000.0, "grad_norm": 1.7704955440651207, "language_loss": 0.7838248, "learning_rate": 7.667273521120347e-07, "loss": 0.80534697, "num_input_tokens_seen": 128762225, "step": 5989, "time_per_iteration": 3.335773229598999 }, { "auxiliary_loss_clip": 0.01130774, "auxiliary_loss_mlp": 0.01028129, "balance_loss_clip": 1.04635978, "balance_loss_mlp": 1.02109003, "epoch": 0.7202549149281549, "flos": 14355614499840.0, "grad_norm": 1.9877454207093774, "language_loss": 0.79988301, "learning_rate": 7.661142016853468e-07, "loss": 0.82147205, "num_input_tokens_seen": 128779585, "step": 5990, "time_per_iteration": 3.3239893913269043 }, { "auxiliary_loss_clip": 0.01110229, "auxiliary_loss_mlp": 0.01025868, "balance_loss_clip": 1.042992, "balance_loss_mlp": 1.01868272, "epoch": 0.7203751578187939, "flos": 23001682550400.0, "grad_norm": 1.7743199523224717, "language_loss": 0.74498689, "learning_rate": 7.655012384381543e-07, "loss": 0.76634789, "num_input_tokens_seen": 128799070, "step": 5991, "time_per_iteration": 2.561940908432007 }, { "auxiliary_loss_clip": 0.01139675, "auxiliary_loss_mlp": 0.01028656, "balance_loss_clip": 1.05005169, "balance_loss_mlp": 1.02133083, "epoch": 0.7204954007094331, "flos": 23692424065920.0, "grad_norm": 1.6525945971799834, "language_loss": 0.81969774, "learning_rate": 7.648884624634415e-07, "loss": 0.84138101, "num_input_tokens_seen": 128817620, "step": 5992, "time_per_iteration": 3.3387362957000732 }, { "auxiliary_loss_clip": 0.01148888, "auxiliary_loss_mlp": 0.01026357, "balance_loss_clip": 1.04598236, "balance_loss_mlp": 1.01917481, "epoch": 0.7206156436000721, "flos": 16253026531200.0, "grad_norm": 1.7731871169080444, "language_loss": 0.88784838, "learning_rate": 7.642758738541683e-07, "loss": 0.90960085, "num_input_tokens_seen": 128834200, "step": 5993, "time_per_iteration": 2.4055371284484863 }, { "auxiliary_loss_clip": 0.01058743, "auxiliary_loss_mlp": 0.01001506, "balance_loss_clip": 1.01376402, "balance_loss_mlp": 1.00054014, "epoch": 0.7207358864907112, "flos": 54377806504320.0, "grad_norm": 0.7572707582816807, "language_loss": 0.60749763, "learning_rate": 7.636634727032621e-07, "loss": 0.62810016, "num_input_tokens_seen": 128891305, "step": 5994, "time_per_iteration": 2.8905177116394043 }, { "auxiliary_loss_clip": 0.01127889, "auxiliary_loss_mlp": 0.0102503, "balance_loss_clip": 1.04140258, "balance_loss_mlp": 1.01720393, "epoch": 0.7208561293813504, "flos": 19135540033920.0, "grad_norm": 1.8721676484559238, "language_loss": 0.78841472, "learning_rate": 7.630512591036231e-07, "loss": 0.80994391, "num_input_tokens_seen": 128910615, "step": 5995, "time_per_iteration": 3.284900188446045 }, { "auxiliary_loss_clip": 0.01157233, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.04973745, "balance_loss_mlp": 1.0220201, "epoch": 0.7209763722719894, "flos": 17748526308480.0, "grad_norm": 2.3773216578585736, "language_loss": 0.64676392, "learning_rate": 7.624392331481255e-07, "loss": 0.66862893, "num_input_tokens_seen": 128928270, "step": 5996, "time_per_iteration": 2.4248831272125244 }, { "auxiliary_loss_clip": 0.0105985, "auxiliary_loss_mlp": 0.01001382, "balance_loss_clip": 1.01456189, "balance_loss_mlp": 1.00031495, "epoch": 0.7210966151626285, "flos": 66819488716800.0, "grad_norm": 0.7493092913616837, "language_loss": 0.51836169, "learning_rate": 7.618273949296115e-07, "loss": 0.53897393, "num_input_tokens_seen": 128987780, "step": 5997, "time_per_iteration": 2.9322750568389893 }, { "auxiliary_loss_clip": 0.01133361, "auxiliary_loss_mlp": 0.0102533, "balance_loss_clip": 1.04344153, "balance_loss_mlp": 1.01722407, "epoch": 0.7212168580532676, "flos": 21141869080320.0, "grad_norm": 1.8981321493140701, "language_loss": 0.68664795, "learning_rate": 7.612157445408987e-07, "loss": 0.70823485, "num_input_tokens_seen": 129005590, "step": 5998, "time_per_iteration": 2.4826321601867676 }, { "auxiliary_loss_clip": 0.01148237, "auxiliary_loss_mlp": 0.01028395, "balance_loss_clip": 1.05186796, "balance_loss_mlp": 1.02081037, "epoch": 0.7213371009439067, "flos": 22345738335360.0, "grad_norm": 2.226406481358763, "language_loss": 0.74367869, "learning_rate": 7.606042820747716e-07, "loss": 0.76544499, "num_input_tokens_seen": 129021995, "step": 5999, "time_per_iteration": 2.4945120811462402 }, { "auxiliary_loss_clip": 0.01146962, "auxiliary_loss_mlp": 0.01024081, "balance_loss_clip": 1.05046725, "balance_loss_mlp": 1.01663053, "epoch": 0.7214573438345457, "flos": 18515901490560.0, "grad_norm": 2.1359100509397706, "language_loss": 0.85404289, "learning_rate": 7.599930076239889e-07, "loss": 0.87575334, "num_input_tokens_seen": 129039280, "step": 6000, "time_per_iteration": 2.472728967666626 }, { "auxiliary_loss_clip": 0.01115423, "auxiliary_loss_mlp": 0.00761979, "balance_loss_clip": 1.04511642, "balance_loss_mlp": 1.00037766, "epoch": 0.7215775867251849, "flos": 35736108606720.0, "grad_norm": 2.006742038219633, "language_loss": 0.70400584, "learning_rate": 7.593819212812818e-07, "loss": 0.72277993, "num_input_tokens_seen": 129060860, "step": 6001, "time_per_iteration": 2.66943621635437 }, { "auxiliary_loss_clip": 0.01153777, "auxiliary_loss_mlp": 0.01022394, "balance_loss_clip": 1.04854417, "balance_loss_mlp": 1.01536119, "epoch": 0.721697829615824, "flos": 20372410909440.0, "grad_norm": 5.027456021302868, "language_loss": 0.71434486, "learning_rate": 7.587710231393508e-07, "loss": 0.73610651, "num_input_tokens_seen": 129079215, "step": 6002, "time_per_iteration": 2.4332945346832275 }, { "auxiliary_loss_clip": 0.01071424, "auxiliary_loss_mlp": 0.01021402, "balance_loss_clip": 1.03706491, "balance_loss_mlp": 1.01458025, "epoch": 0.721818072506463, "flos": 20229809915520.0, "grad_norm": 1.909431944797739, "language_loss": 0.8379997, "learning_rate": 7.581603132908685e-07, "loss": 0.85892797, "num_input_tokens_seen": 129097185, "step": 6003, "time_per_iteration": 2.5940792560577393 }, { "auxiliary_loss_clip": 0.01121517, "auxiliary_loss_mlp": 0.01022629, "balance_loss_clip": 1.04460168, "balance_loss_mlp": 1.01512146, "epoch": 0.7219383153971022, "flos": 18186887888640.0, "grad_norm": 1.9426869239432827, "language_loss": 0.78648496, "learning_rate": 7.575497918284795e-07, "loss": 0.80792642, "num_input_tokens_seen": 129114730, "step": 6004, "time_per_iteration": 2.501044511795044 }, { "auxiliary_loss_clip": 0.01171574, "auxiliary_loss_mlp": 0.01029238, "balance_loss_clip": 1.04900825, "balance_loss_mlp": 1.02178466, "epoch": 0.7220585582877412, "flos": 17342124854400.0, "grad_norm": 1.9389326334801453, "language_loss": 0.74713039, "learning_rate": 7.569394588447984e-07, "loss": 0.76913851, "num_input_tokens_seen": 129131745, "step": 6005, "time_per_iteration": 2.451655387878418 }, { "auxiliary_loss_clip": 0.01145827, "auxiliary_loss_mlp": 0.01025989, "balance_loss_clip": 1.04499614, "balance_loss_mlp": 1.01891947, "epoch": 0.7221788011783803, "flos": 16976338704000.0, "grad_norm": 2.1875925920998402, "language_loss": 0.77720118, "learning_rate": 7.563293144324146e-07, "loss": 0.79891932, "num_input_tokens_seen": 129147295, "step": 6006, "time_per_iteration": 2.4157588481903076 }, { "auxiliary_loss_clip": 0.01168018, "auxiliary_loss_mlp": 0.0102617, "balance_loss_clip": 1.05020463, "balance_loss_mlp": 1.01928282, "epoch": 0.7222990440690195, "flos": 26286359702400.0, "grad_norm": 1.7745956050819711, "language_loss": 0.80347037, "learning_rate": 7.557193586838834e-07, "loss": 0.82541227, "num_input_tokens_seen": 129162660, "step": 6007, "time_per_iteration": 2.452289581298828 }, { "auxiliary_loss_clip": 0.01144258, "auxiliary_loss_mlp": 0.01024422, "balance_loss_clip": 1.04560399, "balance_loss_mlp": 1.0174005, "epoch": 0.7224192869596585, "flos": 17601687509760.0, "grad_norm": 2.063032941343346, "language_loss": 0.70465046, "learning_rate": 7.551095916917371e-07, "loss": 0.72633725, "num_input_tokens_seen": 129179990, "step": 6008, "time_per_iteration": 2.4540679454803467 }, { "auxiliary_loss_clip": 0.01137213, "auxiliary_loss_mlp": 0.01027759, "balance_loss_clip": 1.04519868, "balance_loss_mlp": 1.01980138, "epoch": 0.7225395298502976, "flos": 12932331016320.0, "grad_norm": 3.9488347740190664, "language_loss": 0.66441917, "learning_rate": 7.545000135484758e-07, "loss": 0.68606889, "num_input_tokens_seen": 129197425, "step": 6009, "time_per_iteration": 2.4877817630767822 }, { "auxiliary_loss_clip": 0.01169243, "auxiliary_loss_mlp": 0.00761983, "balance_loss_clip": 1.05022001, "balance_loss_mlp": 1.0003767, "epoch": 0.7226597727409367, "flos": 29643899592960.0, "grad_norm": 4.110110059471532, "language_loss": 0.62876147, "learning_rate": 7.538906243465714e-07, "loss": 0.64807373, "num_input_tokens_seen": 129217560, "step": 6010, "time_per_iteration": 2.475687265396118 }, { "auxiliary_loss_clip": 0.01170204, "auxiliary_loss_mlp": 0.01024402, "balance_loss_clip": 1.05039358, "balance_loss_mlp": 1.01715159, "epoch": 0.7227800156315758, "flos": 13771635183360.0, "grad_norm": 2.1839692433528075, "language_loss": 0.78865159, "learning_rate": 7.5328142417847e-07, "loss": 0.81059766, "num_input_tokens_seen": 129234325, "step": 6011, "time_per_iteration": 2.3999295234680176 }, { "auxiliary_loss_clip": 0.0114998, "auxiliary_loss_mlp": 0.01030784, "balance_loss_clip": 1.04466867, "balance_loss_mlp": 1.02412021, "epoch": 0.7229002585222148, "flos": 20301882554880.0, "grad_norm": 1.7848167117486438, "language_loss": 0.69578463, "learning_rate": 7.526724131365838e-07, "loss": 0.71759224, "num_input_tokens_seen": 129255280, "step": 6012, "time_per_iteration": 2.4556541442871094 }, { "auxiliary_loss_clip": 0.01138734, "auxiliary_loss_mlp": 0.01028338, "balance_loss_clip": 1.04858291, "balance_loss_mlp": 1.02079248, "epoch": 0.723020501412854, "flos": 16581250033920.0, "grad_norm": 1.9976839115489198, "language_loss": 0.70107841, "learning_rate": 7.520635913133017e-07, "loss": 0.72274917, "num_input_tokens_seen": 129273910, "step": 6013, "time_per_iteration": 2.4425346851348877 }, { "auxiliary_loss_clip": 0.01159931, "auxiliary_loss_mlp": 0.01028621, "balance_loss_clip": 1.04900575, "balance_loss_mlp": 1.0204854, "epoch": 0.7231407443034931, "flos": 28548300908160.0, "grad_norm": 1.8870891315437903, "language_loss": 0.82634008, "learning_rate": 7.514549588009798e-07, "loss": 0.84822559, "num_input_tokens_seen": 129294785, "step": 6014, "time_per_iteration": 2.4922797679901123 }, { "auxiliary_loss_clip": 0.01141797, "auxiliary_loss_mlp": 0.01025867, "balance_loss_clip": 1.04628873, "balance_loss_mlp": 1.01902401, "epoch": 0.7232609871941321, "flos": 30008536508160.0, "grad_norm": 3.5782944014282987, "language_loss": 0.70600319, "learning_rate": 7.508465156919492e-07, "loss": 0.72767979, "num_input_tokens_seen": 129318295, "step": 6015, "time_per_iteration": 2.5513219833374023 }, { "auxiliary_loss_clip": 0.01141146, "auxiliary_loss_mlp": 0.01027078, "balance_loss_clip": 1.04583597, "balance_loss_mlp": 1.01952028, "epoch": 0.7233812300847713, "flos": 16654005031680.0, "grad_norm": 2.3502057885378798, "language_loss": 0.61568433, "learning_rate": 7.502382620785083e-07, "loss": 0.63736653, "num_input_tokens_seen": 129334845, "step": 6016, "time_per_iteration": 3.2132537364959717 }, { "auxiliary_loss_clip": 0.01028106, "auxiliary_loss_mlp": 0.01004215, "balance_loss_clip": 1.01288819, "balance_loss_mlp": 1.00329721, "epoch": 0.7235014729754103, "flos": 67258784050560.0, "grad_norm": 0.8044529059662425, "language_loss": 0.62498724, "learning_rate": 7.496301980529289e-07, "loss": 0.64531052, "num_input_tokens_seen": 129398055, "step": 6017, "time_per_iteration": 3.9700419902801514 }, { "auxiliary_loss_clip": 0.01170349, "auxiliary_loss_mlp": 0.01025866, "balance_loss_clip": 1.0501442, "balance_loss_mlp": 1.01863289, "epoch": 0.7236217158660494, "flos": 26943237671040.0, "grad_norm": 5.784379325592039, "language_loss": 0.74407017, "learning_rate": 7.490223237074547e-07, "loss": 0.76603234, "num_input_tokens_seen": 129417765, "step": 6018, "time_per_iteration": 2.4488601684570312 }, { "auxiliary_loss_clip": 0.01124949, "auxiliary_loss_mlp": 0.01026676, "balance_loss_clip": 1.04261518, "balance_loss_mlp": 1.01899862, "epoch": 0.7237419587566886, "flos": 29423372042880.0, "grad_norm": 2.0386632418468675, "language_loss": 0.65892655, "learning_rate": 7.484146391342989e-07, "loss": 0.68044281, "num_input_tokens_seen": 129437560, "step": 6019, "time_per_iteration": 3.402531862258911 }, { "auxiliary_loss_clip": 0.01133786, "auxiliary_loss_mlp": 0.01026306, "balance_loss_clip": 1.04389572, "balance_loss_mlp": 1.01911151, "epoch": 0.7238622016473276, "flos": 17821496787840.0, "grad_norm": 3.1553578170541843, "language_loss": 0.56824768, "learning_rate": 7.478071444256484e-07, "loss": 0.58984864, "num_input_tokens_seen": 129455320, "step": 6020, "time_per_iteration": 2.4340176582336426 }, { "auxiliary_loss_clip": 0.01136763, "auxiliary_loss_mlp": 0.01025542, "balance_loss_clip": 1.04581952, "balance_loss_mlp": 1.01824927, "epoch": 0.7239824445379667, "flos": 25739117020800.0, "grad_norm": 1.669516718253167, "language_loss": 0.79221809, "learning_rate": 7.471998396736579e-07, "loss": 0.8138411, "num_input_tokens_seen": 129475700, "step": 6021, "time_per_iteration": 3.297128677368164 }, { "auxiliary_loss_clip": 0.01130012, "auxiliary_loss_mlp": 0.01022486, "balance_loss_clip": 1.04740191, "balance_loss_mlp": 1.01543784, "epoch": 0.7241026874286057, "flos": 23148916398720.0, "grad_norm": 1.6938870151479555, "language_loss": 0.75689989, "learning_rate": 7.465927249704549e-07, "loss": 0.77842486, "num_input_tokens_seen": 129493585, "step": 6022, "time_per_iteration": 2.5260355472564697 }, { "auxiliary_loss_clip": 0.01153484, "auxiliary_loss_mlp": 0.01024684, "balance_loss_clip": 1.04785001, "balance_loss_mlp": 1.01752806, "epoch": 0.7242229303192449, "flos": 20266905686400.0, "grad_norm": 2.010529393297914, "language_loss": 0.77115488, "learning_rate": 7.459858004081398e-07, "loss": 0.79293656, "num_input_tokens_seen": 129511555, "step": 6023, "time_per_iteration": 2.4605953693389893 }, { "auxiliary_loss_clip": 0.01029351, "auxiliary_loss_mlp": 0.01003216, "balance_loss_clip": 1.01182222, "balance_loss_mlp": 1.00210714, "epoch": 0.724343173209884, "flos": 62311659684480.0, "grad_norm": 0.6565992248594862, "language_loss": 0.58060646, "learning_rate": 7.453790660787815e-07, "loss": 0.60093212, "num_input_tokens_seen": 129579650, "step": 6024, "time_per_iteration": 3.1811561584472656 }, { "auxiliary_loss_clip": 0.01142533, "auxiliary_loss_mlp": 0.0102236, "balance_loss_clip": 1.04668736, "balance_loss_mlp": 1.01466513, "epoch": 0.724463416100523, "flos": 35006403813120.0, "grad_norm": 2.487729595746856, "language_loss": 0.63646984, "learning_rate": 7.447725220744214e-07, "loss": 0.65811872, "num_input_tokens_seen": 129601895, "step": 6025, "time_per_iteration": 2.607048988342285 }, { "auxiliary_loss_clip": 0.01168389, "auxiliary_loss_mlp": 0.01028281, "balance_loss_clip": 1.04786289, "balance_loss_mlp": 1.02077329, "epoch": 0.7245836589911622, "flos": 21871968923520.0, "grad_norm": 2.191457515657719, "language_loss": 0.77638066, "learning_rate": 7.441661684870717e-07, "loss": 0.79834729, "num_input_tokens_seen": 129622150, "step": 6026, "time_per_iteration": 2.4498631954193115 }, { "auxiliary_loss_clip": 0.01169349, "auxiliary_loss_mlp": 0.01020306, "balance_loss_clip": 1.0504781, "balance_loss_mlp": 1.01339173, "epoch": 0.7247039018818012, "flos": 23006494972800.0, "grad_norm": 1.6908496084677738, "language_loss": 0.81659722, "learning_rate": 7.435600054087152e-07, "loss": 0.8384937, "num_input_tokens_seen": 129644315, "step": 6027, "time_per_iteration": 2.456315279006958 }, { "auxiliary_loss_clip": 0.01172203, "auxiliary_loss_mlp": 0.01029281, "balance_loss_clip": 1.05223274, "balance_loss_mlp": 1.02198195, "epoch": 0.7248241447724403, "flos": 31722588587520.0, "grad_norm": 1.7903916572246275, "language_loss": 0.74406439, "learning_rate": 7.42954032931308e-07, "loss": 0.76607919, "num_input_tokens_seen": 129665355, "step": 6028, "time_per_iteration": 2.5084571838378906 }, { "auxiliary_loss_clip": 0.01140779, "auxiliary_loss_mlp": 0.0102533, "balance_loss_clip": 1.04539967, "balance_loss_mlp": 1.01845133, "epoch": 0.7249443876630794, "flos": 34896984007680.0, "grad_norm": 1.8133674041723753, "language_loss": 0.74583489, "learning_rate": 7.423482511467733e-07, "loss": 0.76749599, "num_input_tokens_seen": 129686125, "step": 6029, "time_per_iteration": 2.6122748851776123 }, { "auxiliary_loss_clip": 0.01087051, "auxiliary_loss_mlp": 0.0102633, "balance_loss_clip": 1.04073763, "balance_loss_mlp": 1.01939464, "epoch": 0.7250646305537185, "flos": 26359294268160.0, "grad_norm": 2.028596163551907, "language_loss": 0.64529043, "learning_rate": 7.417426601470099e-07, "loss": 0.66642416, "num_input_tokens_seen": 129706485, "step": 6030, "time_per_iteration": 2.619476795196533 }, { "auxiliary_loss_clip": 0.01156154, "auxiliary_loss_mlp": 0.01025234, "balance_loss_clip": 1.04832625, "balance_loss_mlp": 1.01741385, "epoch": 0.7251848734443576, "flos": 30081614728320.0, "grad_norm": 2.068492348662004, "language_loss": 0.78497219, "learning_rate": 7.411372600238841e-07, "loss": 0.80678606, "num_input_tokens_seen": 129727100, "step": 6031, "time_per_iteration": 2.520246982574463 }, { "auxiliary_loss_clip": 0.01168934, "auxiliary_loss_mlp": 0.01027507, "balance_loss_clip": 1.04935288, "balance_loss_mlp": 1.02019346, "epoch": 0.7253051163349967, "flos": 17785262943360.0, "grad_norm": 1.9766609161773634, "language_loss": 0.73686266, "learning_rate": 7.405320508692346e-07, "loss": 0.75882709, "num_input_tokens_seen": 129745840, "step": 6032, "time_per_iteration": 2.3928418159484863 }, { "auxiliary_loss_clip": 0.01165948, "auxiliary_loss_mlp": 0.01021786, "balance_loss_clip": 1.04986668, "balance_loss_mlp": 1.01487768, "epoch": 0.7254253592256358, "flos": 12641346938880.0, "grad_norm": 1.7691461649455498, "language_loss": 0.75384581, "learning_rate": 7.399270327748727e-07, "loss": 0.77572316, "num_input_tokens_seen": 129763500, "step": 6033, "time_per_iteration": 2.407437801361084 }, { "auxiliary_loss_clip": 0.01126669, "auxiliary_loss_mlp": 0.00760791, "balance_loss_clip": 1.04334259, "balance_loss_mlp": 1.00032187, "epoch": 0.7255456021162748, "flos": 27199208966400.0, "grad_norm": 1.8752201488629237, "language_loss": 0.74161017, "learning_rate": 7.39322205832577e-07, "loss": 0.76048476, "num_input_tokens_seen": 129784390, "step": 6034, "time_per_iteration": 2.5662126541137695 }, { "auxiliary_loss_clip": 0.01136063, "auxiliary_loss_mlp": 0.01021565, "balance_loss_clip": 1.04540586, "balance_loss_mlp": 1.01435924, "epoch": 0.725665845006914, "flos": 21288205088640.0, "grad_norm": 1.9693143105615372, "language_loss": 0.80809402, "learning_rate": 7.387175701341009e-07, "loss": 0.82967037, "num_input_tokens_seen": 129803060, "step": 6035, "time_per_iteration": 2.4947454929351807 }, { "auxiliary_loss_clip": 0.01153514, "auxiliary_loss_mlp": 0.01022176, "balance_loss_clip": 1.04659915, "balance_loss_mlp": 1.01502991, "epoch": 0.7257860878975531, "flos": 16033684129920.0, "grad_norm": 10.596058797726988, "language_loss": 0.72169155, "learning_rate": 7.381131257711659e-07, "loss": 0.7434485, "num_input_tokens_seen": 129820165, "step": 6036, "time_per_iteration": 2.428905725479126 }, { "auxiliary_loss_clip": 0.01140422, "auxiliary_loss_mlp": 0.01025967, "balance_loss_clip": 1.05238068, "balance_loss_mlp": 1.0189997, "epoch": 0.7259063307881921, "flos": 12129943052160.0, "grad_norm": 1.7709084545011553, "language_loss": 0.83624882, "learning_rate": 7.375088728354677e-07, "loss": 0.85791272, "num_input_tokens_seen": 129835195, "step": 6037, "time_per_iteration": 2.4450795650482178 }, { "auxiliary_loss_clip": 0.01129009, "auxiliary_loss_mlp": 0.01022151, "balance_loss_clip": 1.04504681, "balance_loss_mlp": 1.01491451, "epoch": 0.7260265736788313, "flos": 30443845432320.0, "grad_norm": 1.6581015970387019, "language_loss": 0.67050523, "learning_rate": 7.369048114186691e-07, "loss": 0.69201684, "num_input_tokens_seen": 129856240, "step": 6038, "time_per_iteration": 2.5816047191619873 }, { "auxiliary_loss_clip": 0.01135205, "auxiliary_loss_mlp": 0.00761333, "balance_loss_clip": 1.04711318, "balance_loss_mlp": 1.00029314, "epoch": 0.7261468165694703, "flos": 21142264129920.0, "grad_norm": 1.7516089499832659, "language_loss": 0.83453536, "learning_rate": 7.363009416124055e-07, "loss": 0.85350072, "num_input_tokens_seen": 129875565, "step": 6039, "time_per_iteration": 2.508725166320801 }, { "auxiliary_loss_clip": 0.01130544, "auxiliary_loss_mlp": 0.01023985, "balance_loss_clip": 1.04731607, "balance_loss_mlp": 1.01664138, "epoch": 0.7262670594601094, "flos": 22306308180480.0, "grad_norm": 2.158664072081652, "language_loss": 0.62693131, "learning_rate": 7.356972635082852e-07, "loss": 0.6484766, "num_input_tokens_seen": 129894420, "step": 6040, "time_per_iteration": 2.508397340774536 }, { "auxiliary_loss_clip": 0.01112205, "auxiliary_loss_mlp": 0.01026033, "balance_loss_clip": 1.04809213, "balance_loss_mlp": 1.01895809, "epoch": 0.7263873023507486, "flos": 25335049950720.0, "grad_norm": 1.8823020402572586, "language_loss": 0.75217694, "learning_rate": 7.35093777197884e-07, "loss": 0.77355933, "num_input_tokens_seen": 129914490, "step": 6041, "time_per_iteration": 2.575162887573242 }, { "auxiliary_loss_clip": 0.01139885, "auxiliary_loss_mlp": 0.01020626, "balance_loss_clip": 1.04828906, "balance_loss_mlp": 1.01376522, "epoch": 0.7265075452413876, "flos": 23878621192320.0, "grad_norm": 2.175094438950915, "language_loss": 0.86124527, "learning_rate": 7.344904827727525e-07, "loss": 0.88285047, "num_input_tokens_seen": 129931670, "step": 6042, "time_per_iteration": 2.4986205101013184 }, { "auxiliary_loss_clip": 0.0112713, "auxiliary_loss_mlp": 0.01023941, "balance_loss_clip": 1.04285884, "balance_loss_mlp": 1.01655054, "epoch": 0.7266277881320267, "flos": 28724549967360.0, "grad_norm": 2.993181674063456, "language_loss": 0.73743117, "learning_rate": 7.338873803244076e-07, "loss": 0.75894189, "num_input_tokens_seen": 129946905, "step": 6043, "time_per_iteration": 3.282459020614624 }, { "auxiliary_loss_clip": 0.01135136, "auxiliary_loss_mlp": 0.0102525, "balance_loss_clip": 1.04605055, "balance_loss_mlp": 1.01845217, "epoch": 0.7267480310226658, "flos": 24863507182080.0, "grad_norm": 1.7229417007540275, "language_loss": 0.80872029, "learning_rate": 7.332844699443401e-07, "loss": 0.83032417, "num_input_tokens_seen": 129965505, "step": 6044, "time_per_iteration": 3.36826491355896 }, { "auxiliary_loss_clip": 0.01102204, "auxiliary_loss_mlp": 0.01024867, "balance_loss_clip": 1.04048014, "balance_loss_mlp": 1.01815832, "epoch": 0.7268682739133049, "flos": 27198490694400.0, "grad_norm": 1.9023147716971842, "language_loss": 0.75172096, "learning_rate": 7.326817517240121e-07, "loss": 0.77299166, "num_input_tokens_seen": 129987210, "step": 6045, "time_per_iteration": 2.589280366897583 }, { "auxiliary_loss_clip": 0.01154823, "auxiliary_loss_mlp": 0.00760961, "balance_loss_clip": 1.0481807, "balance_loss_mlp": 1.00028038, "epoch": 0.7269885168039439, "flos": 33508138688640.0, "grad_norm": 2.189282586823286, "language_loss": 0.83345866, "learning_rate": 7.320792257548545e-07, "loss": 0.85261655, "num_input_tokens_seen": 130008385, "step": 6046, "time_per_iteration": 3.3918864727020264 }, { "auxiliary_loss_clip": 0.01144733, "auxiliary_loss_mlp": 0.01022623, "balance_loss_clip": 1.04714429, "balance_loss_mlp": 1.0151602, "epoch": 0.7271087596945831, "flos": 24313750548480.0, "grad_norm": 1.8360867878056761, "language_loss": 0.76349485, "learning_rate": 7.314768921282704e-07, "loss": 0.78516841, "num_input_tokens_seen": 130029040, "step": 6047, "time_per_iteration": 2.504153251647949 }, { "auxiliary_loss_clip": 0.01156181, "auxiliary_loss_mlp": 0.01023949, "balance_loss_clip": 1.04748607, "balance_loss_mlp": 1.01679075, "epoch": 0.7272290025852222, "flos": 23805147922560.0, "grad_norm": 2.825462046756991, "language_loss": 0.72337544, "learning_rate": 7.30874750935633e-07, "loss": 0.74517673, "num_input_tokens_seen": 130048725, "step": 6048, "time_per_iteration": 3.2359619140625 }, { "auxiliary_loss_clip": 0.01126845, "auxiliary_loss_mlp": 0.01023657, "balance_loss_clip": 1.04684186, "balance_loss_mlp": 1.01652217, "epoch": 0.7273492454758612, "flos": 16720367408640.0, "grad_norm": 4.23380308243205, "language_loss": 0.7874791, "learning_rate": 7.30272802268286e-07, "loss": 0.8089841, "num_input_tokens_seen": 130065720, "step": 6049, "time_per_iteration": 2.487602949142456 }, { "auxiliary_loss_clip": 0.01071103, "auxiliary_loss_mlp": 0.01021433, "balance_loss_clip": 1.03615773, "balance_loss_mlp": 1.01452446, "epoch": 0.7274694883665004, "flos": 28031330413440.0, "grad_norm": 2.808644567431553, "language_loss": 0.7596457, "learning_rate": 7.29671046217547e-07, "loss": 0.78057104, "num_input_tokens_seen": 130084830, "step": 6050, "time_per_iteration": 2.5993053913116455 }, { "auxiliary_loss_clip": 0.01128471, "auxiliary_loss_mlp": 0.01028324, "balance_loss_clip": 1.0463599, "balance_loss_mlp": 1.02125168, "epoch": 0.7275897312571394, "flos": 30372706546560.0, "grad_norm": 1.825345955677869, "language_loss": 0.81345701, "learning_rate": 7.290694828746988e-07, "loss": 0.83502495, "num_input_tokens_seen": 130104495, "step": 6051, "time_per_iteration": 2.577407121658325 }, { "auxiliary_loss_clip": 0.01129597, "auxiliary_loss_mlp": 0.01024737, "balance_loss_clip": 1.04344559, "balance_loss_mlp": 1.01746202, "epoch": 0.7277099741477785, "flos": 19204775498880.0, "grad_norm": 1.964649200838904, "language_loss": 0.85851324, "learning_rate": 7.284681123310004e-07, "loss": 0.88005662, "num_input_tokens_seen": 130123210, "step": 6052, "time_per_iteration": 2.51304030418396 }, { "auxiliary_loss_clip": 0.01152796, "auxiliary_loss_mlp": 0.01028245, "balance_loss_clip": 1.04791343, "balance_loss_mlp": 1.02045512, "epoch": 0.7278302170384175, "flos": 20667884186880.0, "grad_norm": 2.98817717559006, "language_loss": 0.79554474, "learning_rate": 7.27866934677678e-07, "loss": 0.81735516, "num_input_tokens_seen": 130142880, "step": 6053, "time_per_iteration": 2.4361069202423096 }, { "auxiliary_loss_clip": 0.01108135, "auxiliary_loss_mlp": 0.01022581, "balance_loss_clip": 1.04340911, "balance_loss_mlp": 1.01555705, "epoch": 0.7279504599290567, "flos": 19093200877440.0, "grad_norm": 1.6930859080041893, "language_loss": 0.78456926, "learning_rate": 7.272659500059297e-07, "loss": 0.80587649, "num_input_tokens_seen": 130160220, "step": 6054, "time_per_iteration": 2.5480964183807373 }, { "auxiliary_loss_clip": 0.0114776, "auxiliary_loss_mlp": 0.01034422, "balance_loss_clip": 1.04690599, "balance_loss_mlp": 1.02736211, "epoch": 0.7280707028196958, "flos": 19062174504960.0, "grad_norm": 2.661770738953473, "language_loss": 0.80178165, "learning_rate": 7.266651584069264e-07, "loss": 0.82360345, "num_input_tokens_seen": 130177885, "step": 6055, "time_per_iteration": 2.451216459274292 }, { "auxiliary_loss_clip": 0.01157912, "auxiliary_loss_mlp": 0.01022527, "balance_loss_clip": 1.0509063, "balance_loss_mlp": 1.01553559, "epoch": 0.7281909457103348, "flos": 37196308293120.0, "grad_norm": 1.636765370791178, "language_loss": 0.57367694, "learning_rate": 7.260645599718045e-07, "loss": 0.59548134, "num_input_tokens_seen": 130204240, "step": 6056, "time_per_iteration": 2.5949149131774902 }, { "auxiliary_loss_clip": 0.01139184, "auxiliary_loss_mlp": 0.01029046, "balance_loss_clip": 1.04522789, "balance_loss_mlp": 1.02120817, "epoch": 0.728311188600974, "flos": 20667094087680.0, "grad_norm": 7.417084390698381, "language_loss": 0.674986, "learning_rate": 7.254641547916767e-07, "loss": 0.69666833, "num_input_tokens_seen": 130221735, "step": 6057, "time_per_iteration": 2.465744733810425 }, { "auxiliary_loss_clip": 0.01168454, "auxiliary_loss_mlp": 0.01030964, "balance_loss_clip": 1.05055022, "balance_loss_mlp": 1.02382374, "epoch": 0.728431431491613, "flos": 28840685616000.0, "grad_norm": 2.034867113792075, "language_loss": 0.69522333, "learning_rate": 7.248639429576226e-07, "loss": 0.7172175, "num_input_tokens_seen": 130241190, "step": 6058, "time_per_iteration": 2.4541234970092773 }, { "auxiliary_loss_clip": 0.01153669, "auxiliary_loss_mlp": 0.0102727, "balance_loss_clip": 1.04661965, "balance_loss_mlp": 1.02001333, "epoch": 0.7285516743822521, "flos": 25991856092160.0, "grad_norm": 1.601315687084765, "language_loss": 0.71790141, "learning_rate": 7.242639245606959e-07, "loss": 0.73971081, "num_input_tokens_seen": 130260980, "step": 6059, "time_per_iteration": 2.473053455352783 }, { "auxiliary_loss_clip": 0.01145635, "auxiliary_loss_mlp": 0.01026477, "balance_loss_clip": 1.04517055, "balance_loss_mlp": 1.01922345, "epoch": 0.7286719172728913, "flos": 16399721675520.0, "grad_norm": 1.8144714658929, "language_loss": 0.82548076, "learning_rate": 7.236640996919168e-07, "loss": 0.84720182, "num_input_tokens_seen": 130280025, "step": 6060, "time_per_iteration": 2.4468722343444824 }, { "auxiliary_loss_clip": 0.01155341, "auxiliary_loss_mlp": 0.01025189, "balance_loss_clip": 1.04712296, "balance_loss_mlp": 1.01819754, "epoch": 0.7287921601635303, "flos": 22018161277440.0, "grad_norm": 1.5124226124954496, "language_loss": 0.7071898, "learning_rate": 7.230644684422782e-07, "loss": 0.72899508, "num_input_tokens_seen": 130300255, "step": 6061, "time_per_iteration": 2.454136610031128 }, { "auxiliary_loss_clip": 0.0112672, "auxiliary_loss_mlp": 0.01027259, "balance_loss_clip": 1.04550767, "balance_loss_mlp": 1.02050257, "epoch": 0.7289124030541694, "flos": 24600927784320.0, "grad_norm": 1.712803627920868, "language_loss": 0.8133564, "learning_rate": 7.224650309027451e-07, "loss": 0.83489615, "num_input_tokens_seen": 130320005, "step": 6062, "time_per_iteration": 2.5313501358032227 }, { "auxiliary_loss_clip": 0.01158991, "auxiliary_loss_mlp": 0.01022877, "balance_loss_clip": 1.05110061, "balance_loss_mlp": 1.01581049, "epoch": 0.7290326459448085, "flos": 21393638484480.0, "grad_norm": 2.899120479963915, "language_loss": 0.68734419, "learning_rate": 7.218657871642506e-07, "loss": 0.70916289, "num_input_tokens_seen": 130338810, "step": 6063, "time_per_iteration": 2.4475221633911133 }, { "auxiliary_loss_clip": 0.01170728, "auxiliary_loss_mlp": 0.01026677, "balance_loss_clip": 1.04975176, "balance_loss_mlp": 1.01953936, "epoch": 0.7291528888354476, "flos": 18587686821120.0, "grad_norm": 2.0182695687246297, "language_loss": 0.62324286, "learning_rate": 7.212667373177012e-07, "loss": 0.64521694, "num_input_tokens_seen": 130353805, "step": 6064, "time_per_iteration": 2.3907220363616943 }, { "auxiliary_loss_clip": 0.01125621, "auxiliary_loss_mlp": 0.01023874, "balance_loss_clip": 1.04461861, "balance_loss_mlp": 1.01669705, "epoch": 0.7292731317260867, "flos": 18951066760320.0, "grad_norm": 5.4491074619613, "language_loss": 0.75430483, "learning_rate": 7.206678814539704e-07, "loss": 0.77579975, "num_input_tokens_seen": 130372105, "step": 6065, "time_per_iteration": 2.501145124435425 }, { "auxiliary_loss_clip": 0.01120068, "auxiliary_loss_mlp": 0.0102462, "balance_loss_clip": 1.04428017, "balance_loss_mlp": 1.01791716, "epoch": 0.7293933746167258, "flos": 21067569797760.0, "grad_norm": 2.071844414826551, "language_loss": 0.72687268, "learning_rate": 7.20069219663904e-07, "loss": 0.74831951, "num_input_tokens_seen": 130391990, "step": 6066, "time_per_iteration": 2.5456364154815674 }, { "auxiliary_loss_clip": 0.01155773, "auxiliary_loss_mlp": 0.01023117, "balance_loss_clip": 1.04570138, "balance_loss_mlp": 1.01618218, "epoch": 0.7295136175073649, "flos": 22453326547200.0, "grad_norm": 1.775308638986487, "language_loss": 0.79362333, "learning_rate": 7.1947075203832e-07, "loss": 0.81541228, "num_input_tokens_seen": 130411970, "step": 6067, "time_per_iteration": 2.4571807384490967 }, { "auxiliary_loss_clip": 0.01069948, "auxiliary_loss_mlp": 0.01002267, "balance_loss_clip": 1.0148766, "balance_loss_mlp": 1.00118196, "epoch": 0.7296338603980039, "flos": 56125506648960.0, "grad_norm": 0.8625292430452522, "language_loss": 0.60186422, "learning_rate": 7.188724786680049e-07, "loss": 0.62258625, "num_input_tokens_seen": 130472440, "step": 6068, "time_per_iteration": 3.006516695022583 }, { "auxiliary_loss_clip": 0.0113927, "auxiliary_loss_mlp": 0.01024096, "balance_loss_clip": 1.04480898, "balance_loss_mlp": 1.0167284, "epoch": 0.7297541032886431, "flos": 25228287751680.0, "grad_norm": 1.7191336596005689, "language_loss": 0.75628817, "learning_rate": 7.182743996437162e-07, "loss": 0.7779218, "num_input_tokens_seen": 130491975, "step": 6069, "time_per_iteration": 2.518704652786255 }, { "auxiliary_loss_clip": 0.01132922, "auxiliary_loss_mlp": 0.01022314, "balance_loss_clip": 1.04339182, "balance_loss_mlp": 1.01458955, "epoch": 0.7298743461792822, "flos": 26467600752000.0, "grad_norm": 1.8681838726139628, "language_loss": 0.6862877, "learning_rate": 7.176765150561819e-07, "loss": 0.70784003, "num_input_tokens_seen": 130510580, "step": 6070, "time_per_iteration": 4.110658884048462 }, { "auxiliary_loss_clip": 0.0116921, "auxiliary_loss_mlp": 0.01023712, "balance_loss_clip": 1.0474968, "balance_loss_mlp": 1.0165863, "epoch": 0.7299945890699212, "flos": 19569053278080.0, "grad_norm": 2.196190424696349, "language_loss": 0.79589492, "learning_rate": 7.170788249961002e-07, "loss": 0.81782413, "num_input_tokens_seen": 130529090, "step": 6071, "time_per_iteration": 2.4754576683044434 }, { "auxiliary_loss_clip": 0.01164138, "auxiliary_loss_mlp": 0.01023538, "balance_loss_clip": 1.04709721, "balance_loss_mlp": 1.01658547, "epoch": 0.7301148319605604, "flos": 22928963466240.0, "grad_norm": 1.8685895341075822, "language_loss": 0.88565135, "learning_rate": 7.164813295541418e-07, "loss": 0.90752816, "num_input_tokens_seen": 130548655, "step": 6072, "time_per_iteration": 3.26410174369812 }, { "auxiliary_loss_clip": 0.0114143, "auxiliary_loss_mlp": 0.01025777, "balance_loss_clip": 1.04490077, "balance_loss_mlp": 1.01874959, "epoch": 0.7302350748511994, "flos": 25369703596800.0, "grad_norm": 1.607299385512349, "language_loss": 0.70380247, "learning_rate": 7.15884028820944e-07, "loss": 0.72547454, "num_input_tokens_seen": 130567710, "step": 6073, "time_per_iteration": 2.502211570739746 }, { "auxiliary_loss_clip": 0.01118953, "auxiliary_loss_mlp": 0.01021036, "balance_loss_clip": 1.0405674, "balance_loss_mlp": 1.01415753, "epoch": 0.7303553177418385, "flos": 27819170732160.0, "grad_norm": 2.0068940394597554, "language_loss": 0.60835981, "learning_rate": 7.152869228871185e-07, "loss": 0.62975967, "num_input_tokens_seen": 130590195, "step": 6074, "time_per_iteration": 2.5531303882598877 }, { "auxiliary_loss_clip": 0.01135023, "auxiliary_loss_mlp": 0.01028159, "balance_loss_clip": 1.04469085, "balance_loss_mlp": 1.02057123, "epoch": 0.7304755606324776, "flos": 24426510318720.0, "grad_norm": 2.0650139150635085, "language_loss": 0.72399163, "learning_rate": 7.146900118432457e-07, "loss": 0.74562347, "num_input_tokens_seen": 130609940, "step": 6075, "time_per_iteration": 3.325765371322632 }, { "auxiliary_loss_clip": 0.01081949, "auxiliary_loss_mlp": 0.01029659, "balance_loss_clip": 1.03608203, "balance_loss_mlp": 1.02258372, "epoch": 0.7305958035231167, "flos": 23840483927040.0, "grad_norm": 1.6288849391698481, "language_loss": 0.85704744, "learning_rate": 7.140932957798753e-07, "loss": 0.87816358, "num_input_tokens_seen": 130628380, "step": 6076, "time_per_iteration": 2.7142691612243652 }, { "auxiliary_loss_clip": 0.01142766, "auxiliary_loss_mlp": 0.01022122, "balance_loss_clip": 1.04408932, "balance_loss_mlp": 1.01482594, "epoch": 0.7307160464137558, "flos": 16726939597440.0, "grad_norm": 2.9542674738387116, "language_loss": 0.71352798, "learning_rate": 7.134967747875309e-07, "loss": 0.73517686, "num_input_tokens_seen": 130646590, "step": 6077, "time_per_iteration": 2.49676775932312 }, { "auxiliary_loss_clip": 0.01149278, "auxiliary_loss_mlp": 0.01025124, "balance_loss_clip": 1.04496181, "balance_loss_mlp": 1.01800728, "epoch": 0.7308362893043949, "flos": 21798280172160.0, "grad_norm": 2.110313043272194, "language_loss": 0.8193149, "learning_rate": 7.129004489567014e-07, "loss": 0.84105891, "num_input_tokens_seen": 130664070, "step": 6078, "time_per_iteration": 2.446838617324829 }, { "auxiliary_loss_clip": 0.01130748, "auxiliary_loss_mlp": 0.01022242, "balance_loss_clip": 1.04479504, "balance_loss_mlp": 1.01500595, "epoch": 0.730956532195034, "flos": 10707377840640.0, "grad_norm": 2.174305448171294, "language_loss": 0.78347278, "learning_rate": 7.123043183778512e-07, "loss": 0.80500269, "num_input_tokens_seen": 130681400, "step": 6079, "time_per_iteration": 2.5183959007263184 }, { "auxiliary_loss_clip": 0.01133809, "auxiliary_loss_mlp": 0.0103189, "balance_loss_clip": 1.04717827, "balance_loss_mlp": 1.02471328, "epoch": 0.731076775085673, "flos": 19791987039360.0, "grad_norm": 2.3414133347998134, "language_loss": 0.65093482, "learning_rate": 7.117083831414114e-07, "loss": 0.67259181, "num_input_tokens_seen": 130700675, "step": 6080, "time_per_iteration": 2.5625460147857666 }, { "auxiliary_loss_clip": 0.0116518, "auxiliary_loss_mlp": 0.01022285, "balance_loss_clip": 1.04853845, "balance_loss_mlp": 1.01529944, "epoch": 0.7311970179763122, "flos": 20447033414400.0, "grad_norm": 1.9020191974456258, "language_loss": 0.69872558, "learning_rate": 7.11112643337787e-07, "loss": 0.72060025, "num_input_tokens_seen": 130719720, "step": 6081, "time_per_iteration": 2.460615873336792 }, { "auxiliary_loss_clip": 0.01143326, "auxiliary_loss_mlp": 0.01030189, "balance_loss_clip": 1.0495975, "balance_loss_mlp": 1.02253628, "epoch": 0.7313172608669513, "flos": 18513818501760.0, "grad_norm": 2.0916858946149977, "language_loss": 0.76452732, "learning_rate": 7.10517099057349e-07, "loss": 0.78626251, "num_input_tokens_seen": 130736670, "step": 6082, "time_per_iteration": 2.538703203201294 }, { "auxiliary_loss_clip": 0.01138001, "auxiliary_loss_mlp": 0.01021984, "balance_loss_clip": 1.04422379, "balance_loss_mlp": 1.01442039, "epoch": 0.7314375037575903, "flos": 16180738410240.0, "grad_norm": 3.406099754138424, "language_loss": 0.61160922, "learning_rate": 7.099217503904411e-07, "loss": 0.63320905, "num_input_tokens_seen": 130754525, "step": 6083, "time_per_iteration": 2.470655918121338 }, { "auxiliary_loss_clip": 0.01142672, "auxiliary_loss_mlp": 0.01023461, "balance_loss_clip": 1.04683614, "balance_loss_mlp": 1.01662695, "epoch": 0.7315577466482295, "flos": 17967940536960.0, "grad_norm": 2.168818214662408, "language_loss": 0.90080929, "learning_rate": 7.093265974273788e-07, "loss": 0.92247063, "num_input_tokens_seen": 130772420, "step": 6084, "time_per_iteration": 2.451982259750366 }, { "auxiliary_loss_clip": 0.0115271, "auxiliary_loss_mlp": 0.01021605, "balance_loss_clip": 1.04469919, "balance_loss_mlp": 1.01465845, "epoch": 0.7316779895388685, "flos": 18405440190720.0, "grad_norm": 1.8290021345354643, "language_loss": 0.72191536, "learning_rate": 7.087316402584447e-07, "loss": 0.74365848, "num_input_tokens_seen": 130791245, "step": 6085, "time_per_iteration": 2.4172520637512207 }, { "auxiliary_loss_clip": 0.0116766, "auxiliary_loss_mlp": 0.01020727, "balance_loss_clip": 1.04932082, "balance_loss_mlp": 1.01332378, "epoch": 0.7317982324295076, "flos": 17928294900480.0, "grad_norm": 1.975844527751707, "language_loss": 0.863065, "learning_rate": 7.081368789738953e-07, "loss": 0.88494885, "num_input_tokens_seen": 130808445, "step": 6086, "time_per_iteration": 2.376862049102783 }, { "auxiliary_loss_clip": 0.01133227, "auxiliary_loss_mlp": 0.01023305, "balance_loss_clip": 1.0412426, "balance_loss_mlp": 1.01580071, "epoch": 0.7319184753201466, "flos": 27229840289280.0, "grad_norm": 2.4064825638920206, "language_loss": 0.77936935, "learning_rate": 7.075423136639537e-07, "loss": 0.80093479, "num_input_tokens_seen": 130827700, "step": 6087, "time_per_iteration": 2.518655300140381 }, { "auxiliary_loss_clip": 0.01117999, "auxiliary_loss_mlp": 0.01023328, "balance_loss_clip": 1.04141688, "balance_loss_mlp": 1.0159725, "epoch": 0.7320387182107858, "flos": 37448544574080.0, "grad_norm": 2.1011986921515855, "language_loss": 0.74276489, "learning_rate": 7.069479444188149e-07, "loss": 0.7641781, "num_input_tokens_seen": 130848290, "step": 6088, "time_per_iteration": 2.634871482849121 }, { "auxiliary_loss_clip": 0.01132309, "auxiliary_loss_mlp": 0.01023598, "balance_loss_clip": 1.04561925, "balance_loss_mlp": 1.01604056, "epoch": 0.7321589611014249, "flos": 17859023521920.0, "grad_norm": 1.822385432380353, "language_loss": 0.82249892, "learning_rate": 7.063537713286453e-07, "loss": 0.84405804, "num_input_tokens_seen": 130865970, "step": 6089, "time_per_iteration": 2.4458703994750977 }, { "auxiliary_loss_clip": 0.01145208, "auxiliary_loss_mlp": 0.01022444, "balance_loss_clip": 1.04608583, "balance_loss_mlp": 1.01515746, "epoch": 0.7322792039920639, "flos": 26100593539200.0, "grad_norm": 1.7472471266449063, "language_loss": 0.80879009, "learning_rate": 7.057597944835803e-07, "loss": 0.83046657, "num_input_tokens_seen": 130885245, "step": 6090, "time_per_iteration": 2.5119845867156982 }, { "auxiliary_loss_clip": 0.01131904, "auxiliary_loss_mlp": 0.01021081, "balance_loss_clip": 1.04381573, "balance_loss_mlp": 1.01420283, "epoch": 0.7323994468827031, "flos": 25369093065600.0, "grad_norm": 1.550271164377914, "language_loss": 0.745031, "learning_rate": 7.051660139737253e-07, "loss": 0.76656085, "num_input_tokens_seen": 130903465, "step": 6091, "time_per_iteration": 2.5374433994293213 }, { "auxiliary_loss_clip": 0.01151214, "auxiliary_loss_mlp": 0.00761542, "balance_loss_clip": 1.04873168, "balance_loss_mlp": 1.00032687, "epoch": 0.7325196897733421, "flos": 26907075653760.0, "grad_norm": 2.404974823497621, "language_loss": 0.7659322, "learning_rate": 7.045724298891565e-07, "loss": 0.78505969, "num_input_tokens_seen": 130922935, "step": 6092, "time_per_iteration": 2.483245849609375 }, { "auxiliary_loss_clip": 0.01152253, "auxiliary_loss_mlp": 0.01023174, "balance_loss_clip": 1.04722667, "balance_loss_mlp": 1.01623547, "epoch": 0.7326399326639812, "flos": 25775781828480.0, "grad_norm": 1.9320790573908304, "language_loss": 0.69105494, "learning_rate": 7.039790423199192e-07, "loss": 0.71280921, "num_input_tokens_seen": 130942575, "step": 6093, "time_per_iteration": 2.464826822280884 }, { "auxiliary_loss_clip": 0.01142848, "auxiliary_loss_mlp": 0.01024626, "balance_loss_clip": 1.04637527, "balance_loss_mlp": 1.01764011, "epoch": 0.7327601755546204, "flos": 21032269706880.0, "grad_norm": 1.9397062132468301, "language_loss": 0.77659988, "learning_rate": 7.033858513560322e-07, "loss": 0.79827464, "num_input_tokens_seen": 130958870, "step": 6094, "time_per_iteration": 2.446718454360962 }, { "auxiliary_loss_clip": 0.01153437, "auxiliary_loss_mlp": 0.01022726, "balance_loss_clip": 1.0477072, "balance_loss_mlp": 1.01576138, "epoch": 0.7328804184452594, "flos": 16289224462080.0, "grad_norm": 2.550591191607863, "language_loss": 0.76353878, "learning_rate": 7.027928570874794e-07, "loss": 0.78530037, "num_input_tokens_seen": 130977060, "step": 6095, "time_per_iteration": 2.411625623703003 }, { "auxiliary_loss_clip": 0.01165811, "auxiliary_loss_mlp": 0.01025871, "balance_loss_clip": 1.04815173, "balance_loss_mlp": 1.0187484, "epoch": 0.7330006613358985, "flos": 17858233422720.0, "grad_norm": 1.8203880795264558, "language_loss": 0.85420823, "learning_rate": 7.022000596042194e-07, "loss": 0.8761251, "num_input_tokens_seen": 130994160, "step": 6096, "time_per_iteration": 2.4382669925689697 }, { "auxiliary_loss_clip": 0.01126081, "auxiliary_loss_mlp": 0.0102531, "balance_loss_clip": 1.04124975, "balance_loss_mlp": 1.0181396, "epoch": 0.7331209042265376, "flos": 22492074343680.0, "grad_norm": 2.0782956590857826, "language_loss": 0.82074994, "learning_rate": 7.016074589961784e-07, "loss": 0.84226382, "num_input_tokens_seen": 131012725, "step": 6097, "time_per_iteration": 4.066557884216309 }, { "auxiliary_loss_clip": 0.01137424, "auxiliary_loss_mlp": 0.01024848, "balance_loss_clip": 1.0472331, "balance_loss_mlp": 1.0173645, "epoch": 0.7332411471171767, "flos": 33072757937280.0, "grad_norm": 3.373020572821285, "language_loss": 0.66768903, "learning_rate": 7.01015055353253e-07, "loss": 0.68931174, "num_input_tokens_seen": 131035150, "step": 6098, "time_per_iteration": 3.427626132965088 }, { "auxiliary_loss_clip": 0.01104349, "auxiliary_loss_mlp": 0.01025622, "balance_loss_clip": 1.04264235, "balance_loss_mlp": 1.01781642, "epoch": 0.7333613900078157, "flos": 22743017735040.0, "grad_norm": 1.6594426146958103, "language_loss": 0.77940929, "learning_rate": 7.004228487653123e-07, "loss": 0.80070901, "num_input_tokens_seen": 131055955, "step": 6099, "time_per_iteration": 2.600335121154785 }, { "auxiliary_loss_clip": 0.01119688, "auxiliary_loss_mlp": 0.01022542, "balance_loss_clip": 1.03903115, "balance_loss_mlp": 1.0155654, "epoch": 0.7334816328984549, "flos": 22346133384960.0, "grad_norm": 2.3250237476893187, "language_loss": 0.78377461, "learning_rate": 6.998308393221906e-07, "loss": 0.80519694, "num_input_tokens_seen": 131074360, "step": 6100, "time_per_iteration": 2.4972546100616455 }, { "auxiliary_loss_clip": 0.01126922, "auxiliary_loss_mlp": 0.01025261, "balance_loss_clip": 1.04453993, "balance_loss_mlp": 1.01818848, "epoch": 0.733601875789094, "flos": 20736149984640.0, "grad_norm": 2.0025848768325925, "language_loss": 0.71113706, "learning_rate": 6.992390271136977e-07, "loss": 0.73265886, "num_input_tokens_seen": 131090070, "step": 6101, "time_per_iteration": 2.498704433441162 }, { "auxiliary_loss_clip": 0.01146787, "auxiliary_loss_mlp": 0.01020907, "balance_loss_clip": 1.04429829, "balance_loss_mlp": 1.01414514, "epoch": 0.733722118679733, "flos": 22564362464640.0, "grad_norm": 1.8165193175406134, "language_loss": 0.85598075, "learning_rate": 6.986474122296094e-07, "loss": 0.87765771, "num_input_tokens_seen": 131109185, "step": 6102, "time_per_iteration": 3.1823179721832275 }, { "auxiliary_loss_clip": 0.01172218, "auxiliary_loss_mlp": 0.01020854, "balance_loss_clip": 1.05054927, "balance_loss_mlp": 1.01384151, "epoch": 0.7338423615703722, "flos": 20084192179200.0, "grad_norm": 3.075498197473535, "language_loss": 0.72113538, "learning_rate": 6.980559947596751e-07, "loss": 0.74306607, "num_input_tokens_seen": 131127725, "step": 6103, "time_per_iteration": 2.397486686706543 }, { "auxiliary_loss_clip": 0.01108667, "auxiliary_loss_mlp": 0.01022405, "balance_loss_clip": 1.04046476, "balance_loss_mlp": 1.01496959, "epoch": 0.7339626044610112, "flos": 21687675217920.0, "grad_norm": 2.1976276906123364, "language_loss": 0.75900066, "learning_rate": 6.974647747936109e-07, "loss": 0.78031135, "num_input_tokens_seen": 131146110, "step": 6104, "time_per_iteration": 2.5461323261260986 }, { "auxiliary_loss_clip": 0.01169647, "auxiliary_loss_mlp": 0.00761873, "balance_loss_clip": 1.05007541, "balance_loss_mlp": 1.00040174, "epoch": 0.7340828473516503, "flos": 15268248282240.0, "grad_norm": 2.5093309413384755, "language_loss": 0.82421112, "learning_rate": 6.968737524211039e-07, "loss": 0.84352636, "num_input_tokens_seen": 131162920, "step": 6105, "time_per_iteration": 2.3735501766204834 }, { "auxiliary_loss_clip": 0.01153453, "auxiliary_loss_mlp": 0.01024431, "balance_loss_clip": 1.04783189, "balance_loss_mlp": 1.01715636, "epoch": 0.7342030902422895, "flos": 22930112701440.0, "grad_norm": 1.9187462857665418, "language_loss": 0.80317259, "learning_rate": 6.962829277318132e-07, "loss": 0.82495141, "num_input_tokens_seen": 131182515, "step": 6106, "time_per_iteration": 2.444582223892212 }, { "auxiliary_loss_clip": 0.01156594, "auxiliary_loss_mlp": 0.01025203, "balance_loss_clip": 1.05060506, "balance_loss_mlp": 1.01857531, "epoch": 0.7343233331329285, "flos": 25847890381440.0, "grad_norm": 1.7944830951726432, "language_loss": 0.83404624, "learning_rate": 6.956923008153652e-07, "loss": 0.85586423, "num_input_tokens_seen": 131202280, "step": 6107, "time_per_iteration": 2.4748213291168213 }, { "auxiliary_loss_clip": 0.01153147, "auxiliary_loss_mlp": 0.0102744, "balance_loss_clip": 1.04541135, "balance_loss_mlp": 1.02101445, "epoch": 0.7344435760235676, "flos": 18478985287680.0, "grad_norm": 2.1985630007422965, "language_loss": 0.84163302, "learning_rate": 6.951018717613593e-07, "loss": 0.86343884, "num_input_tokens_seen": 131221295, "step": 6108, "time_per_iteration": 2.452054738998413 }, { "auxiliary_loss_clip": 0.01152709, "auxiliary_loss_mlp": 0.01024608, "balance_loss_clip": 1.0479449, "balance_loss_mlp": 1.01766443, "epoch": 0.7345638189142067, "flos": 17640040256640.0, "grad_norm": 2.111463295746286, "language_loss": 0.7824921, "learning_rate": 6.945116406593614e-07, "loss": 0.80426526, "num_input_tokens_seen": 131240150, "step": 6109, "time_per_iteration": 2.4198927879333496 }, { "auxiliary_loss_clip": 0.01114618, "auxiliary_loss_mlp": 0.01025403, "balance_loss_clip": 1.04510975, "balance_loss_mlp": 1.01817298, "epoch": 0.7346840618048458, "flos": 20260225756800.0, "grad_norm": 2.0335258083724725, "language_loss": 0.74789572, "learning_rate": 6.939216075989089e-07, "loss": 0.76929587, "num_input_tokens_seen": 131258080, "step": 6110, "time_per_iteration": 2.5281388759613037 }, { "auxiliary_loss_clip": 0.01136416, "auxiliary_loss_mlp": 0.0102096, "balance_loss_clip": 1.04391694, "balance_loss_mlp": 1.01373553, "epoch": 0.7348043046954849, "flos": 29023183641600.0, "grad_norm": 2.0789368476138144, "language_loss": 0.66281021, "learning_rate": 6.933317726695109e-07, "loss": 0.68438393, "num_input_tokens_seen": 131279310, "step": 6111, "time_per_iteration": 2.523442506790161 }, { "auxiliary_loss_clip": 0.0112566, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.04791582, "balance_loss_mlp": 1.01862824, "epoch": 0.734924547586124, "flos": 17931203902080.0, "grad_norm": 2.316041614826212, "language_loss": 0.79884422, "learning_rate": 6.92742135960644e-07, "loss": 0.82036066, "num_input_tokens_seen": 131297010, "step": 6112, "time_per_iteration": 2.471081495285034 }, { "auxiliary_loss_clip": 0.01061137, "auxiliary_loss_mlp": 0.01001304, "balance_loss_clip": 1.01565647, "balance_loss_mlp": 1.00023687, "epoch": 0.7350447904767631, "flos": 63588319850880.0, "grad_norm": 0.8204316855328927, "language_loss": 0.55722958, "learning_rate": 6.921526975617556e-07, "loss": 0.57785398, "num_input_tokens_seen": 131356470, "step": 6113, "time_per_iteration": 3.0800039768218994 }, { "auxiliary_loss_clip": 0.01141019, "auxiliary_loss_mlp": 0.01024701, "balance_loss_clip": 1.04361629, "balance_loss_mlp": 1.01739979, "epoch": 0.7351650333674021, "flos": 21580015178880.0, "grad_norm": 2.0189428341341573, "language_loss": 0.75163162, "learning_rate": 6.915634575622631e-07, "loss": 0.77328885, "num_input_tokens_seen": 131374985, "step": 6114, "time_per_iteration": 2.4784367084503174 }, { "auxiliary_loss_clip": 0.01166651, "auxiliary_loss_mlp": 0.01022242, "balance_loss_clip": 1.04894483, "balance_loss_mlp": 1.01523268, "epoch": 0.7352852762580413, "flos": 18186349184640.0, "grad_norm": 1.8347008491799346, "language_loss": 0.7092185, "learning_rate": 6.909744160515532e-07, "loss": 0.73110735, "num_input_tokens_seen": 131393125, "step": 6115, "time_per_iteration": 2.388014316558838 }, { "auxiliary_loss_clip": 0.01137416, "auxiliary_loss_mlp": 0.01024828, "balance_loss_clip": 1.0452559, "balance_loss_mlp": 1.01779723, "epoch": 0.7354055191486804, "flos": 38910073063680.0, "grad_norm": 1.995171613961482, "language_loss": 0.69654024, "learning_rate": 6.903855731189849e-07, "loss": 0.71816266, "num_input_tokens_seen": 131415760, "step": 6116, "time_per_iteration": 2.6465911865234375 }, { "auxiliary_loss_clip": 0.01147112, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 1.04764831, "balance_loss_mlp": 1.02231574, "epoch": 0.7355257620393194, "flos": 16289978647680.0, "grad_norm": 2.388241065246289, "language_loss": 0.82299089, "learning_rate": 6.897969288538825e-07, "loss": 0.84476262, "num_input_tokens_seen": 131433705, "step": 6117, "time_per_iteration": 2.4507827758789062 }, { "auxiliary_loss_clip": 0.01138816, "auxiliary_loss_mlp": 0.01024592, "balance_loss_clip": 1.0463208, "balance_loss_mlp": 1.01778245, "epoch": 0.7356460049299585, "flos": 18114240631680.0, "grad_norm": 2.3434490001092554, "language_loss": 0.81514186, "learning_rate": 6.892084833455452e-07, "loss": 0.8367759, "num_input_tokens_seen": 131453275, "step": 6118, "time_per_iteration": 2.453321695327759 }, { "auxiliary_loss_clip": 0.0115268, "auxiliary_loss_mlp": 0.01022325, "balance_loss_clip": 1.04944384, "balance_loss_mlp": 1.01560163, "epoch": 0.7357662478205976, "flos": 21325193118720.0, "grad_norm": 1.4434957043541996, "language_loss": 0.83912963, "learning_rate": 6.886202366832384e-07, "loss": 0.86087966, "num_input_tokens_seen": 131474960, "step": 6119, "time_per_iteration": 2.463392972946167 }, { "auxiliary_loss_clip": 0.01113138, "auxiliary_loss_mlp": 0.01023785, "balance_loss_clip": 1.04653382, "balance_loss_mlp": 1.01610804, "epoch": 0.7358864907112367, "flos": 14246841139200.0, "grad_norm": 9.36868885106388, "language_loss": 0.73476374, "learning_rate": 6.880321889561987e-07, "loss": 0.75613296, "num_input_tokens_seen": 131492935, "step": 6120, "time_per_iteration": 2.504666805267334 }, { "auxiliary_loss_clip": 0.01119984, "auxiliary_loss_mlp": 0.01027779, "balance_loss_clip": 1.04332554, "balance_loss_mlp": 1.01975584, "epoch": 0.7360067336018757, "flos": 22309684058880.0, "grad_norm": 2.763689757625933, "language_loss": 0.65157712, "learning_rate": 6.874443402536338e-07, "loss": 0.67305475, "num_input_tokens_seen": 131512025, "step": 6121, "time_per_iteration": 2.507908821105957 }, { "auxiliary_loss_clip": 0.01142145, "auxiliary_loss_mlp": 0.01020396, "balance_loss_clip": 1.04724443, "balance_loss_mlp": 1.01326132, "epoch": 0.7361269764925149, "flos": 25554607833600.0, "grad_norm": 2.8295515221300467, "language_loss": 0.80553359, "learning_rate": 6.868566906647177e-07, "loss": 0.82715905, "num_input_tokens_seen": 131532975, "step": 6122, "time_per_iteration": 2.5304811000823975 }, { "auxiliary_loss_clip": 0.01153464, "auxiliary_loss_mlp": 0.01028199, "balance_loss_clip": 1.04568267, "balance_loss_mlp": 1.02060509, "epoch": 0.736247219383154, "flos": 20376505059840.0, "grad_norm": 2.8301423462248514, "language_loss": 0.83408904, "learning_rate": 6.862692402785984e-07, "loss": 0.85590565, "num_input_tokens_seen": 131553225, "step": 6123, "time_per_iteration": 3.161473274230957 }, { "auxiliary_loss_clip": 0.01033688, "auxiliary_loss_mlp": 0.01004321, "balance_loss_clip": 1.01557016, "balance_loss_mlp": 1.00307488, "epoch": 0.736367462273793, "flos": 70339525735680.0, "grad_norm": 0.7948722943119283, "language_loss": 0.4958145, "learning_rate": 6.856819891843899e-07, "loss": 0.51619458, "num_input_tokens_seen": 131617930, "step": 6124, "time_per_iteration": 4.001164436340332 }, { "auxiliary_loss_clip": 0.01096068, "auxiliary_loss_mlp": 0.01027882, "balance_loss_clip": 1.04080462, "balance_loss_mlp": 1.02078855, "epoch": 0.7364877051644322, "flos": 22412711243520.0, "grad_norm": 1.977683330044527, "language_loss": 0.72185355, "learning_rate": 6.8509493747118e-07, "loss": 0.74309307, "num_input_tokens_seen": 131636740, "step": 6125, "time_per_iteration": 3.3232955932617188 }, { "auxiliary_loss_clip": 0.01167659, "auxiliary_loss_mlp": 0.01024772, "balance_loss_clip": 1.04850829, "balance_loss_mlp": 1.01767015, "epoch": 0.7366079480550712, "flos": 12130266274560.0, "grad_norm": 4.056533982848506, "language_loss": 0.87854242, "learning_rate": 6.845080852280221e-07, "loss": 0.90046674, "num_input_tokens_seen": 131653810, "step": 6126, "time_per_iteration": 2.367185115814209 }, { "auxiliary_loss_clip": 0.0112573, "auxiliary_loss_mlp": 0.01021132, "balance_loss_clip": 1.0436331, "balance_loss_mlp": 1.01472116, "epoch": 0.7367281909457103, "flos": 15049336844160.0, "grad_norm": 1.608921876373448, "language_loss": 0.742347, "learning_rate": 6.839214325439409e-07, "loss": 0.76381558, "num_input_tokens_seen": 131671505, "step": 6127, "time_per_iteration": 2.48690128326416 }, { "auxiliary_loss_clip": 0.01133006, "auxiliary_loss_mlp": 0.01024216, "balance_loss_clip": 1.04617584, "balance_loss_mlp": 1.0170753, "epoch": 0.7368484338363495, "flos": 23510752053120.0, "grad_norm": 2.39098191918476, "language_loss": 0.7174834, "learning_rate": 6.833349795079327e-07, "loss": 0.73905563, "num_input_tokens_seen": 131690615, "step": 6128, "time_per_iteration": 3.2437474727630615 }, { "auxiliary_loss_clip": 0.01125568, "auxiliary_loss_mlp": 0.01025287, "balance_loss_clip": 1.04596829, "balance_loss_mlp": 1.01815772, "epoch": 0.7369686767269885, "flos": 27417833095680.0, "grad_norm": 1.66657925085952, "language_loss": 0.68344021, "learning_rate": 6.827487262089613e-07, "loss": 0.70494872, "num_input_tokens_seen": 131711120, "step": 6129, "time_per_iteration": 2.5470545291900635 }, { "auxiliary_loss_clip": 0.01040807, "auxiliary_loss_mlp": 0.01001375, "balance_loss_clip": 1.01108766, "balance_loss_mlp": 1.00024879, "epoch": 0.7370889196176276, "flos": 70293343824000.0, "grad_norm": 0.9011213052345667, "language_loss": 0.56821793, "learning_rate": 6.821626727359606e-07, "loss": 0.58863974, "num_input_tokens_seen": 131776680, "step": 6130, "time_per_iteration": 3.124809741973877 }, { "auxiliary_loss_clip": 0.01143709, "auxiliary_loss_mlp": 0.01023798, "balance_loss_clip": 1.05106902, "balance_loss_mlp": 1.01607335, "epoch": 0.7372091625082667, "flos": 18040839189120.0, "grad_norm": 2.9872590313058676, "language_loss": 0.76932818, "learning_rate": 6.815768191778348e-07, "loss": 0.79100323, "num_input_tokens_seen": 131794760, "step": 6131, "time_per_iteration": 2.4574620723724365 }, { "auxiliary_loss_clip": 0.01150286, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 1.04574406, "balance_loss_mlp": 1.01939034, "epoch": 0.7373294053989058, "flos": 33726331854720.0, "grad_norm": 2.072196462936505, "language_loss": 0.7303524, "learning_rate": 6.809911656234569e-07, "loss": 0.7521227, "num_input_tokens_seen": 131816735, "step": 6132, "time_per_iteration": 2.5581698417663574 }, { "auxiliary_loss_clip": 0.01126694, "auxiliary_loss_mlp": 0.01021401, "balance_loss_clip": 1.04247594, "balance_loss_mlp": 1.01460648, "epoch": 0.7374496482895448, "flos": 21506326427520.0, "grad_norm": 2.0023466777862127, "language_loss": 0.78540516, "learning_rate": 6.804057121616707e-07, "loss": 0.80688608, "num_input_tokens_seen": 131834940, "step": 6133, "time_per_iteration": 2.5201940536499023 }, { "auxiliary_loss_clip": 0.01154842, "auxiliary_loss_mlp": 0.01024463, "balance_loss_clip": 1.04774356, "balance_loss_mlp": 1.0167737, "epoch": 0.737569891180184, "flos": 24936908624640.0, "grad_norm": 2.035292912528272, "language_loss": 0.72040772, "learning_rate": 6.798204588812888e-07, "loss": 0.74220073, "num_input_tokens_seen": 131854355, "step": 6134, "time_per_iteration": 2.508913040161133 }, { "auxiliary_loss_clip": 0.01084932, "auxiliary_loss_mlp": 0.00761731, "balance_loss_clip": 1.03742397, "balance_loss_mlp": 1.00034809, "epoch": 0.7376901340708231, "flos": 20664544222080.0, "grad_norm": 1.793526734011821, "language_loss": 0.75624692, "learning_rate": 6.792354058710937e-07, "loss": 0.77471364, "num_input_tokens_seen": 131871825, "step": 6135, "time_per_iteration": 2.5681512355804443 }, { "auxiliary_loss_clip": 0.01161144, "auxiliary_loss_mlp": 0.0102108, "balance_loss_clip": 1.04705095, "balance_loss_mlp": 1.01413596, "epoch": 0.7378103769614621, "flos": 23805794367360.0, "grad_norm": 1.7824262238039135, "language_loss": 0.65195465, "learning_rate": 6.786505532198374e-07, "loss": 0.67377687, "num_input_tokens_seen": 131890770, "step": 6136, "time_per_iteration": 2.4442238807678223 }, { "auxiliary_loss_clip": 0.01167465, "auxiliary_loss_mlp": 0.01026631, "balance_loss_clip": 1.04805684, "balance_loss_mlp": 1.01902533, "epoch": 0.7379306198521013, "flos": 22237216369920.0, "grad_norm": 1.6295217909728643, "language_loss": 0.8544395, "learning_rate": 6.780659010162411e-07, "loss": 0.87638044, "num_input_tokens_seen": 131909720, "step": 6137, "time_per_iteration": 2.4078242778778076 }, { "auxiliary_loss_clip": 0.01130993, "auxiliary_loss_mlp": 0.01023, "balance_loss_clip": 1.04657292, "balance_loss_mlp": 1.01618147, "epoch": 0.7380508627427403, "flos": 14903108576640.0, "grad_norm": 1.7315278849538593, "language_loss": 0.83306575, "learning_rate": 6.774814493489975e-07, "loss": 0.85460567, "num_input_tokens_seen": 131927395, "step": 6138, "time_per_iteration": 2.4646408557891846 }, { "auxiliary_loss_clip": 0.01150791, "auxiliary_loss_mlp": 0.01023401, "balance_loss_clip": 1.04736924, "balance_loss_mlp": 1.01627243, "epoch": 0.7381711056333794, "flos": 21685843624320.0, "grad_norm": 1.8481432317643938, "language_loss": 0.66169411, "learning_rate": 6.768971983067655e-07, "loss": 0.68343604, "num_input_tokens_seen": 131947725, "step": 6139, "time_per_iteration": 2.444254159927368 }, { "auxiliary_loss_clip": 0.01068921, "auxiliary_loss_mlp": 0.01000978, "balance_loss_clip": 1.01360869, "balance_loss_mlp": 0.99999475, "epoch": 0.7382913485240186, "flos": 52404263596800.0, "grad_norm": 1.008046804286874, "language_loss": 0.67838311, "learning_rate": 6.763131479781772e-07, "loss": 0.69908208, "num_input_tokens_seen": 131997485, "step": 6140, "time_per_iteration": 2.824784517288208 }, { "auxiliary_loss_clip": 0.01132157, "auxiliary_loss_mlp": 0.01023985, "balance_loss_clip": 1.04671896, "balance_loss_mlp": 1.01700807, "epoch": 0.7384115914146576, "flos": 21798818876160.0, "grad_norm": 2.146481648944633, "language_loss": 0.76783538, "learning_rate": 6.757292984518316e-07, "loss": 0.78939682, "num_input_tokens_seen": 132016885, "step": 6141, "time_per_iteration": 2.456026792526245 }, { "auxiliary_loss_clip": 0.01059514, "auxiliary_loss_mlp": 0.01001796, "balance_loss_clip": 1.01408243, "balance_loss_mlp": 1.00078857, "epoch": 0.7385318343052967, "flos": 61494331662720.0, "grad_norm": 0.7408605739698927, "language_loss": 0.56436586, "learning_rate": 6.751456498162981e-07, "loss": 0.58497894, "num_input_tokens_seen": 132075920, "step": 6142, "time_per_iteration": 2.9201762676239014 }, { "auxiliary_loss_clip": 0.01150295, "auxiliary_loss_mlp": 0.01021738, "balance_loss_clip": 1.04317379, "balance_loss_mlp": 1.01553011, "epoch": 0.7386520771959358, "flos": 17013757697280.0, "grad_norm": 2.040981969223243, "language_loss": 0.85481173, "learning_rate": 6.745622021601174e-07, "loss": 0.87653208, "num_input_tokens_seen": 132092945, "step": 6143, "time_per_iteration": 2.398787498474121 }, { "auxiliary_loss_clip": 0.01127585, "auxiliary_loss_mlp": 0.01022816, "balance_loss_clip": 1.0437746, "balance_loss_mlp": 1.01579201, "epoch": 0.7387723200865749, "flos": 18770759464320.0, "grad_norm": 1.9343672579084095, "language_loss": 0.69383079, "learning_rate": 6.739789555717954e-07, "loss": 0.71533477, "num_input_tokens_seen": 132109920, "step": 6144, "time_per_iteration": 2.471367597579956 }, { "auxiliary_loss_clip": 0.01166018, "auxiliary_loss_mlp": 0.01024849, "balance_loss_clip": 1.04785812, "balance_loss_mlp": 1.01795232, "epoch": 0.738892562977214, "flos": 22525542840960.0, "grad_norm": 2.299903502533772, "language_loss": 0.77365208, "learning_rate": 6.733959101398124e-07, "loss": 0.79556072, "num_input_tokens_seen": 132128050, "step": 6145, "time_per_iteration": 2.4074547290802 }, { "auxiliary_loss_clip": 0.01137132, "auxiliary_loss_mlp": 0.01025278, "balance_loss_clip": 1.04421115, "balance_loss_mlp": 1.01754189, "epoch": 0.7390128058678531, "flos": 21501478091520.0, "grad_norm": 1.6021813279585075, "language_loss": 0.81531954, "learning_rate": 6.728130659526143e-07, "loss": 0.83694357, "num_input_tokens_seen": 132145860, "step": 6146, "time_per_iteration": 2.473816156387329 }, { "auxiliary_loss_clip": 0.01142773, "auxiliary_loss_mlp": 0.01029601, "balance_loss_clip": 1.04756582, "balance_loss_mlp": 1.02216864, "epoch": 0.7391330487584922, "flos": 25776176878080.0, "grad_norm": 2.4484079217432675, "language_loss": 0.71100426, "learning_rate": 6.7223042309862e-07, "loss": 0.73272794, "num_input_tokens_seen": 132166060, "step": 6147, "time_per_iteration": 2.5129122734069824 }, { "auxiliary_loss_clip": 0.01149319, "auxiliary_loss_mlp": 0.0102609, "balance_loss_clip": 1.04482603, "balance_loss_mlp": 1.01949739, "epoch": 0.7392532916491312, "flos": 28366736636160.0, "grad_norm": 4.769843803939903, "language_loss": 0.73553824, "learning_rate": 6.716479816662144e-07, "loss": 0.75729239, "num_input_tokens_seen": 132187790, "step": 6148, "time_per_iteration": 2.5065290927886963 }, { "auxiliary_loss_clip": 0.01142973, "auxiliary_loss_mlp": 0.01022549, "balance_loss_clip": 1.04529178, "balance_loss_mlp": 1.01551604, "epoch": 0.7393735345397703, "flos": 23585877348480.0, "grad_norm": 1.9293338262648343, "language_loss": 0.73069763, "learning_rate": 6.710657417437531e-07, "loss": 0.75235289, "num_input_tokens_seen": 132207495, "step": 6149, "time_per_iteration": 2.544508934020996 }, { "auxiliary_loss_clip": 0.01137384, "auxiliary_loss_mlp": 0.01024611, "balance_loss_clip": 1.04498255, "balance_loss_mlp": 1.0176549, "epoch": 0.7394937774304094, "flos": 19974772373760.0, "grad_norm": 2.4202571793140564, "language_loss": 0.79693204, "learning_rate": 6.704837034195628e-07, "loss": 0.81855202, "num_input_tokens_seen": 132225960, "step": 6150, "time_per_iteration": 4.103356122970581 }, { "auxiliary_loss_clip": 0.01144622, "auxiliary_loss_mlp": 0.0103129, "balance_loss_clip": 1.04429317, "balance_loss_mlp": 1.02390468, "epoch": 0.7396140203210485, "flos": 23478037741440.0, "grad_norm": 1.6533003076769768, "language_loss": 0.84895766, "learning_rate": 6.699018667819376e-07, "loss": 0.87071669, "num_input_tokens_seen": 132245360, "step": 6151, "time_per_iteration": 2.4704983234405518 }, { "auxiliary_loss_clip": 0.01150598, "auxiliary_loss_mlp": 0.0102859, "balance_loss_clip": 1.04499865, "balance_loss_mlp": 1.02094853, "epoch": 0.7397342632116876, "flos": 25555433846400.0, "grad_norm": 1.573566404652876, "language_loss": 0.72694886, "learning_rate": 6.693202319191415e-07, "loss": 0.74874073, "num_input_tokens_seen": 132267095, "step": 6152, "time_per_iteration": 3.3766157627105713 }, { "auxiliary_loss_clip": 0.0116696, "auxiliary_loss_mlp": 0.01026223, "balance_loss_clip": 1.05151534, "balance_loss_mlp": 1.01887691, "epoch": 0.7398545061023267, "flos": 24755021130240.0, "grad_norm": 1.8394961097270544, "language_loss": 0.7519151, "learning_rate": 6.687387989194084e-07, "loss": 0.77384698, "num_input_tokens_seen": 132286610, "step": 6153, "time_per_iteration": 2.5108325481414795 }, { "auxiliary_loss_clip": 0.01136614, "auxiliary_loss_mlp": 0.01025994, "balance_loss_clip": 1.04856825, "balance_loss_mlp": 1.01886511, "epoch": 0.7399747489929658, "flos": 16508602776960.0, "grad_norm": 1.7675194688513232, "language_loss": 0.79115498, "learning_rate": 6.681575678709404e-07, "loss": 0.8127811, "num_input_tokens_seen": 132305300, "step": 6154, "time_per_iteration": 2.4753262996673584 }, { "auxiliary_loss_clip": 0.01152331, "auxiliary_loss_mlp": 0.01022402, "balance_loss_clip": 1.04705024, "balance_loss_mlp": 1.01550889, "epoch": 0.7400949918836048, "flos": 24097065753600.0, "grad_norm": 1.9513790591154117, "language_loss": 0.70933914, "learning_rate": 6.67576538861911e-07, "loss": 0.73108649, "num_input_tokens_seen": 132323875, "step": 6155, "time_per_iteration": 3.265145778656006 }, { "auxiliary_loss_clip": 0.01134586, "auxiliary_loss_mlp": 0.01022995, "balance_loss_clip": 1.0445075, "balance_loss_mlp": 1.01632559, "epoch": 0.740215234774244, "flos": 21802517976960.0, "grad_norm": 1.914649125924512, "language_loss": 0.82184827, "learning_rate": 6.669957119804612e-07, "loss": 0.84342408, "num_input_tokens_seen": 132345510, "step": 6156, "time_per_iteration": 2.5210795402526855 }, { "auxiliary_loss_clip": 0.01147576, "auxiliary_loss_mlp": 0.01025451, "balance_loss_clip": 1.04784179, "balance_loss_mlp": 1.01861393, "epoch": 0.7403354776648831, "flos": 18733196816640.0, "grad_norm": 3.274396289426646, "language_loss": 0.71954364, "learning_rate": 6.66415087314702e-07, "loss": 0.74127388, "num_input_tokens_seen": 132360465, "step": 6157, "time_per_iteration": 2.4516172409057617 }, { "auxiliary_loss_clip": 0.01139076, "auxiliary_loss_mlp": 0.01019088, "balance_loss_clip": 1.04441166, "balance_loss_mlp": 1.01216197, "epoch": 0.7404557205555221, "flos": 16909581277440.0, "grad_norm": 1.9876777460500796, "language_loss": 0.73317605, "learning_rate": 6.65834664952714e-07, "loss": 0.75475764, "num_input_tokens_seen": 132377915, "step": 6158, "time_per_iteration": 2.4498064517974854 }, { "auxiliary_loss_clip": 0.01124776, "auxiliary_loss_mlp": 0.01022515, "balance_loss_clip": 1.0433557, "balance_loss_mlp": 1.01587176, "epoch": 0.7405759634461613, "flos": 21214408596480.0, "grad_norm": 5.215755168968883, "language_loss": 0.76198125, "learning_rate": 6.652544449825457e-07, "loss": 0.78345418, "num_input_tokens_seen": 132398170, "step": 6159, "time_per_iteration": 2.5090584754943848 }, { "auxiliary_loss_clip": 0.01146281, "auxiliary_loss_mlp": 0.01029306, "balance_loss_clip": 1.04439187, "balance_loss_mlp": 1.02203774, "epoch": 0.7406962063368003, "flos": 20480106862080.0, "grad_norm": 2.281255998077098, "language_loss": 0.76381552, "learning_rate": 6.646744274922182e-07, "loss": 0.7855714, "num_input_tokens_seen": 132416615, "step": 6160, "time_per_iteration": 2.4607415199279785 }, { "auxiliary_loss_clip": 0.01141031, "auxiliary_loss_mlp": 0.01019453, "balance_loss_clip": 1.0450536, "balance_loss_mlp": 1.0123508, "epoch": 0.7408164492274394, "flos": 19791915212160.0, "grad_norm": 2.6430930941410273, "language_loss": 0.75554162, "learning_rate": 6.640946125697171e-07, "loss": 0.77714646, "num_input_tokens_seen": 132434145, "step": 6161, "time_per_iteration": 2.4523236751556396 }, { "auxiliary_loss_clip": 0.01153634, "auxiliary_loss_mlp": 0.01021035, "balance_loss_clip": 1.04576206, "balance_loss_mlp": 1.01379323, "epoch": 0.7409366921180786, "flos": 29204855654400.0, "grad_norm": 1.7400778372577494, "language_loss": 0.75125176, "learning_rate": 6.635150003030017e-07, "loss": 0.77299845, "num_input_tokens_seen": 132452670, "step": 6162, "time_per_iteration": 2.5072247982025146 }, { "auxiliary_loss_clip": 0.01109275, "auxiliary_loss_mlp": 0.01022135, "balance_loss_clip": 1.03884137, "balance_loss_mlp": 1.0152154, "epoch": 0.7410569350087176, "flos": 22930004960640.0, "grad_norm": 2.5277331856093337, "language_loss": 0.85946649, "learning_rate": 6.629355907799981e-07, "loss": 0.88078064, "num_input_tokens_seen": 132472475, "step": 6163, "time_per_iteration": 2.524101495742798 }, { "auxiliary_loss_clip": 0.01152612, "auxiliary_loss_mlp": 0.01022343, "balance_loss_clip": 1.04496896, "balance_loss_mlp": 1.01539588, "epoch": 0.7411771778993567, "flos": 30440397726720.0, "grad_norm": 1.745145059194402, "language_loss": 0.69244128, "learning_rate": 6.623563840886015e-07, "loss": 0.71419084, "num_input_tokens_seen": 132493400, "step": 6164, "time_per_iteration": 2.5163278579711914 }, { "auxiliary_loss_clip": 0.01148683, "auxiliary_loss_mlp": 0.0102045, "balance_loss_clip": 1.04535162, "balance_loss_mlp": 1.01350021, "epoch": 0.7412974207899958, "flos": 20522050968960.0, "grad_norm": 1.5921267757564497, "language_loss": 0.69588256, "learning_rate": 6.617773803166795e-07, "loss": 0.71757388, "num_input_tokens_seen": 132511725, "step": 6165, "time_per_iteration": 2.429363489151001 }, { "auxiliary_loss_clip": 0.01144432, "auxiliary_loss_mlp": 0.00762124, "balance_loss_clip": 1.04588127, "balance_loss_mlp": 1.00039768, "epoch": 0.7414176636806349, "flos": 22090700793600.0, "grad_norm": 2.160269003135224, "language_loss": 0.82313693, "learning_rate": 6.611985795520634e-07, "loss": 0.84220243, "num_input_tokens_seen": 132530270, "step": 6166, "time_per_iteration": 2.475053071975708 }, { "auxiliary_loss_clip": 0.01134589, "auxiliary_loss_mlp": 0.01025548, "balance_loss_clip": 1.04705679, "balance_loss_mlp": 1.01823461, "epoch": 0.7415379065712739, "flos": 25155245445120.0, "grad_norm": 1.9970311768991247, "language_loss": 0.77234519, "learning_rate": 6.606199818825588e-07, "loss": 0.79394656, "num_input_tokens_seen": 132550725, "step": 6167, "time_per_iteration": 2.542524814605713 }, { "auxiliary_loss_clip": 0.01143904, "auxiliary_loss_mlp": 0.01023678, "balance_loss_clip": 1.04378426, "balance_loss_mlp": 1.01652575, "epoch": 0.7416581494619131, "flos": 16871731320960.0, "grad_norm": 3.0072285731994546, "language_loss": 0.81650358, "learning_rate": 6.600415873959377e-07, "loss": 0.83817947, "num_input_tokens_seen": 132568600, "step": 6168, "time_per_iteration": 2.438075065612793 }, { "auxiliary_loss_clip": 0.01095927, "auxiliary_loss_mlp": 0.00761202, "balance_loss_clip": 1.04002655, "balance_loss_mlp": 1.00032854, "epoch": 0.7417783923525522, "flos": 28438881102720.0, "grad_norm": 1.9182251782080653, "language_loss": 0.64667565, "learning_rate": 6.594633961799437e-07, "loss": 0.6652469, "num_input_tokens_seen": 132587640, "step": 6169, "time_per_iteration": 2.639529228210449 }, { "auxiliary_loss_clip": 0.01133024, "auxiliary_loss_mlp": 0.01022993, "balance_loss_clip": 1.04518294, "balance_loss_mlp": 1.01583755, "epoch": 0.7418986352431912, "flos": 20084299920000.0, "grad_norm": 2.1666729190003124, "language_loss": 0.81358981, "learning_rate": 6.588854083222857e-07, "loss": 0.83514994, "num_input_tokens_seen": 132607075, "step": 6170, "time_per_iteration": 2.5021615028381348 }, { "auxiliary_loss_clip": 0.01144455, "auxiliary_loss_mlp": 0.01026511, "balance_loss_clip": 1.0483315, "balance_loss_mlp": 1.01875067, "epoch": 0.7420188781338304, "flos": 18259571059200.0, "grad_norm": 5.59677325555764, "language_loss": 0.81034243, "learning_rate": 6.583076239106444e-07, "loss": 0.83205211, "num_input_tokens_seen": 132625580, "step": 6171, "time_per_iteration": 2.461170196533203 }, { "auxiliary_loss_clip": 0.01144108, "auxiliary_loss_mlp": 0.0102294, "balance_loss_clip": 1.04674983, "balance_loss_mlp": 1.01540005, "epoch": 0.7421391210244694, "flos": 13771994319360.0, "grad_norm": 2.001980496848609, "language_loss": 0.75233805, "learning_rate": 6.577300430326707e-07, "loss": 0.77400851, "num_input_tokens_seen": 132640525, "step": 6172, "time_per_iteration": 2.4620823860168457 }, { "auxiliary_loss_clip": 0.01124835, "auxiliary_loss_mlp": 0.01022405, "balance_loss_clip": 1.04649031, "balance_loss_mlp": 1.01547635, "epoch": 0.7422593639151085, "flos": 15961683317760.0, "grad_norm": 2.3653042260689228, "language_loss": 0.72111821, "learning_rate": 6.571526657759821e-07, "loss": 0.74259055, "num_input_tokens_seen": 132656265, "step": 6173, "time_per_iteration": 2.4795925617218018 }, { "auxiliary_loss_clip": 0.01146936, "auxiliary_loss_mlp": 0.01020344, "balance_loss_clip": 1.04446375, "balance_loss_mlp": 1.01336098, "epoch": 0.7423796068057477, "flos": 30114400867200.0, "grad_norm": 1.734758258463048, "language_loss": 0.70693886, "learning_rate": 6.565754922281663e-07, "loss": 0.72861171, "num_input_tokens_seen": 132678510, "step": 6174, "time_per_iteration": 2.5282599925994873 }, { "auxiliary_loss_clip": 0.01137581, "auxiliary_loss_mlp": 0.01027362, "balance_loss_clip": 1.04447603, "balance_loss_mlp": 1.02048039, "epoch": 0.7424998496963867, "flos": 20521907314560.0, "grad_norm": 1.8061434100116054, "language_loss": 0.78390324, "learning_rate": 6.559985224767801e-07, "loss": 0.8055526, "num_input_tokens_seen": 132696385, "step": 6175, "time_per_iteration": 2.5167171955108643 }, { "auxiliary_loss_clip": 0.01130282, "auxiliary_loss_mlp": 0.01026147, "balance_loss_clip": 1.04436326, "balance_loss_mlp": 1.01907206, "epoch": 0.7426200925870258, "flos": 21871573873920.0, "grad_norm": 3.420312848934449, "language_loss": 0.75403482, "learning_rate": 6.55421756609349e-07, "loss": 0.77559912, "num_input_tokens_seen": 132714640, "step": 6176, "time_per_iteration": 2.4969983100891113 }, { "auxiliary_loss_clip": 0.01152111, "auxiliary_loss_mlp": 0.01029325, "balance_loss_clip": 1.04969072, "balance_loss_mlp": 1.02165723, "epoch": 0.7427403354776649, "flos": 26432049265920.0, "grad_norm": 1.7760128107619337, "language_loss": 0.79036349, "learning_rate": 6.54845194713369e-07, "loss": 0.8121779, "num_input_tokens_seen": 132735590, "step": 6177, "time_per_iteration": 4.072268486022949 }, { "auxiliary_loss_clip": 0.01147979, "auxiliary_loss_mlp": 0.0102862, "balance_loss_clip": 1.04616809, "balance_loss_mlp": 1.02205801, "epoch": 0.742860578368304, "flos": 19898390102400.0, "grad_norm": 1.8916684867949831, "language_loss": 0.79498696, "learning_rate": 6.542688368763034e-07, "loss": 0.81675303, "num_input_tokens_seen": 132753995, "step": 6178, "time_per_iteration": 3.271073579788208 }, { "auxiliary_loss_clip": 0.01152102, "auxiliary_loss_mlp": 0.01027465, "balance_loss_clip": 1.0482794, "balance_loss_mlp": 1.02083421, "epoch": 0.742980821258943, "flos": 24827201510400.0, "grad_norm": 2.4925975872946586, "language_loss": 0.76932037, "learning_rate": 6.536926831855854e-07, "loss": 0.791116, "num_input_tokens_seen": 132773160, "step": 6179, "time_per_iteration": 2.45601749420166 }, { "auxiliary_loss_clip": 0.01135752, "auxiliary_loss_mlp": 0.01022823, "balance_loss_clip": 1.0452354, "balance_loss_mlp": 1.01607895, "epoch": 0.7431010641495821, "flos": 25228646887680.0, "grad_norm": 2.9300099208392836, "language_loss": 0.72909594, "learning_rate": 6.531167337286165e-07, "loss": 0.7506817, "num_input_tokens_seen": 132793180, "step": 6180, "time_per_iteration": 2.4815125465393066 }, { "auxiliary_loss_clip": 0.01138423, "auxiliary_loss_mlp": 0.0102296, "balance_loss_clip": 1.0474757, "balance_loss_mlp": 1.01621222, "epoch": 0.7432213070402213, "flos": 21762369550080.0, "grad_norm": 1.403660437381547, "language_loss": 0.7972523, "learning_rate": 6.52540988592768e-07, "loss": 0.81886613, "num_input_tokens_seen": 132814200, "step": 6181, "time_per_iteration": 2.4895172119140625 }, { "auxiliary_loss_clip": 0.01140588, "auxiliary_loss_mlp": 0.01020485, "balance_loss_clip": 1.04636407, "balance_loss_mlp": 1.01364517, "epoch": 0.7433415499308603, "flos": 14793832425600.0, "grad_norm": 1.9555727524976907, "language_loss": 0.83165836, "learning_rate": 6.519654478653814e-07, "loss": 0.85326904, "num_input_tokens_seen": 132832565, "step": 6182, "time_per_iteration": 3.2044496536254883 }, { "auxiliary_loss_clip": 0.01052846, "auxiliary_loss_mlp": 0.01002711, "balance_loss_clip": 1.01496482, "balance_loss_mlp": 1.00160837, "epoch": 0.7434617928214994, "flos": 67155577297920.0, "grad_norm": 0.7421561489254233, "language_loss": 0.5614903, "learning_rate": 6.51390111633763e-07, "loss": 0.58204579, "num_input_tokens_seen": 132897840, "step": 6183, "time_per_iteration": 3.105877637863159 }, { "auxiliary_loss_clip": 0.01094522, "auxiliary_loss_mlp": 0.01020144, "balance_loss_clip": 1.03875446, "balance_loss_mlp": 1.01344442, "epoch": 0.7435820357121385, "flos": 27377576928000.0, "grad_norm": 1.6357073275961822, "language_loss": 0.76207614, "learning_rate": 6.508149799851932e-07, "loss": 0.78322279, "num_input_tokens_seen": 132919505, "step": 6184, "time_per_iteration": 2.60357928276062 }, { "auxiliary_loss_clip": 0.01136877, "auxiliary_loss_mlp": 0.01021035, "balance_loss_clip": 1.04546118, "balance_loss_mlp": 1.01407325, "epoch": 0.7437022786027776, "flos": 23987645948160.0, "grad_norm": 2.062518575948394, "language_loss": 0.61426228, "learning_rate": 6.502400530069183e-07, "loss": 0.63584143, "num_input_tokens_seen": 132939390, "step": 6185, "time_per_iteration": 2.455746650695801 }, { "auxiliary_loss_clip": 0.01126645, "auxiliary_loss_mlp": 0.01030541, "balance_loss_clip": 1.04526186, "balance_loss_mlp": 1.0229888, "epoch": 0.7438225214934167, "flos": 21866761451520.0, "grad_norm": 1.6500706286797915, "language_loss": 0.68501008, "learning_rate": 6.496653307861535e-07, "loss": 0.70658195, "num_input_tokens_seen": 132960060, "step": 6186, "time_per_iteration": 2.522322416305542 }, { "auxiliary_loss_clip": 0.01157895, "auxiliary_loss_mlp": 0.01024997, "balance_loss_clip": 1.04782844, "balance_loss_mlp": 1.01790082, "epoch": 0.7439427643840558, "flos": 20230097224320.0, "grad_norm": 3.6776352129083842, "language_loss": 0.65812725, "learning_rate": 6.490908134100857e-07, "loss": 0.6799562, "num_input_tokens_seen": 132978525, "step": 6187, "time_per_iteration": 2.436244487762451 }, { "auxiliary_loss_clip": 0.01158552, "auxiliary_loss_mlp": 0.01021989, "balance_loss_clip": 1.04821706, "balance_loss_mlp": 1.01454389, "epoch": 0.7440630072746949, "flos": 20849915335680.0, "grad_norm": 2.2181268072344094, "language_loss": 0.69327259, "learning_rate": 6.48516500965866e-07, "loss": 0.71507794, "num_input_tokens_seen": 132998460, "step": 6188, "time_per_iteration": 2.444633722305298 }, { "auxiliary_loss_clip": 0.01154872, "auxiliary_loss_mlp": 0.01021909, "balance_loss_clip": 1.04431081, "balance_loss_mlp": 1.01470256, "epoch": 0.7441832501653339, "flos": 26503762769280.0, "grad_norm": 1.8008470146056523, "language_loss": 0.81646919, "learning_rate": 6.479423935406192e-07, "loss": 0.83823705, "num_input_tokens_seen": 133018445, "step": 6189, "time_per_iteration": 2.476483106613159 }, { "auxiliary_loss_clip": 0.01044096, "auxiliary_loss_mlp": 0.01001698, "balance_loss_clip": 1.01650774, "balance_loss_mlp": 1.00073802, "epoch": 0.7443034930559731, "flos": 68602848088320.0, "grad_norm": 0.8120585173931032, "language_loss": 0.61998612, "learning_rate": 6.473684912214357e-07, "loss": 0.64044404, "num_input_tokens_seen": 133082005, "step": 6190, "time_per_iteration": 3.1991283893585205 }, { "auxiliary_loss_clip": 0.01153062, "auxiliary_loss_mlp": 0.01021295, "balance_loss_clip": 1.04863572, "balance_loss_mlp": 1.01427984, "epoch": 0.7444237359466122, "flos": 18654982951680.0, "grad_norm": 5.5890299380191895, "language_loss": 0.69773829, "learning_rate": 6.467947940953778e-07, "loss": 0.71948195, "num_input_tokens_seen": 133100530, "step": 6191, "time_per_iteration": 2.4406604766845703 }, { "auxiliary_loss_clip": 0.01138434, "auxiliary_loss_mlp": 0.0102792, "balance_loss_clip": 1.04459417, "balance_loss_mlp": 1.02111959, "epoch": 0.7445439788372512, "flos": 22817604326400.0, "grad_norm": 1.7888926965732181, "language_loss": 0.72490942, "learning_rate": 6.462213022494732e-07, "loss": 0.74657297, "num_input_tokens_seen": 133119775, "step": 6192, "time_per_iteration": 2.484675645828247 }, { "auxiliary_loss_clip": 0.0106198, "auxiliary_loss_mlp": 0.01001569, "balance_loss_clip": 1.0152657, "balance_loss_mlp": 1.00056195, "epoch": 0.7446642217278904, "flos": 67045690615680.0, "grad_norm": 0.7677672766641063, "language_loss": 0.61051834, "learning_rate": 6.456480157707201e-07, "loss": 0.63115382, "num_input_tokens_seen": 133184550, "step": 6193, "time_per_iteration": 2.9859704971313477 }, { "auxiliary_loss_clip": 0.01118272, "auxiliary_loss_mlp": 0.01027632, "balance_loss_clip": 1.04254794, "balance_loss_mlp": 1.0199039, "epoch": 0.7447844646185294, "flos": 17417465631360.0, "grad_norm": 1.819256562295751, "language_loss": 0.84713626, "learning_rate": 6.450749347460866e-07, "loss": 0.86859524, "num_input_tokens_seen": 133201525, "step": 6194, "time_per_iteration": 2.4781603813171387 }, { "auxiliary_loss_clip": 0.01167648, "auxiliary_loss_mlp": 0.0102575, "balance_loss_clip": 1.04753089, "balance_loss_mlp": 1.01884818, "epoch": 0.7449047075091685, "flos": 26615876094720.0, "grad_norm": 1.8097698630998473, "language_loss": 0.78789788, "learning_rate": 6.445020592625083e-07, "loss": 0.80983186, "num_input_tokens_seen": 133222175, "step": 6195, "time_per_iteration": 2.4590728282928467 }, { "auxiliary_loss_clip": 0.01166421, "auxiliary_loss_mlp": 0.01026383, "balance_loss_clip": 1.04703259, "balance_loss_mlp": 1.01885176, "epoch": 0.7450249503998077, "flos": 14170458867840.0, "grad_norm": 2.310880236555306, "language_loss": 0.80482769, "learning_rate": 6.4392938940689e-07, "loss": 0.8267557, "num_input_tokens_seen": 133237590, "step": 6196, "time_per_iteration": 2.3711137771606445 }, { "auxiliary_loss_clip": 0.01109084, "auxiliary_loss_mlp": 0.00761268, "balance_loss_clip": 1.04393423, "balance_loss_mlp": 1.0003289, "epoch": 0.7451451932904467, "flos": 19606687752960.0, "grad_norm": 2.3412638046916, "language_loss": 0.72118419, "learning_rate": 6.433569252661049e-07, "loss": 0.73988771, "num_input_tokens_seen": 133255590, "step": 6197, "time_per_iteration": 2.5289671421051025 }, { "auxiliary_loss_clip": 0.01118567, "auxiliary_loss_mlp": 0.01023067, "balance_loss_clip": 1.04270315, "balance_loss_mlp": 1.01637971, "epoch": 0.7452654361810858, "flos": 12495405980160.0, "grad_norm": 1.7419545742608133, "language_loss": 0.71128464, "learning_rate": 6.427846669269952e-07, "loss": 0.73270094, "num_input_tokens_seen": 133273210, "step": 6198, "time_per_iteration": 2.473322629928589 }, { "auxiliary_loss_clip": 0.01170042, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.05222011, "balance_loss_mlp": 1.02088404, "epoch": 0.7453856790717249, "flos": 22127329687680.0, "grad_norm": 2.2693060423334908, "language_loss": 0.82625848, "learning_rate": 6.422126144763729e-07, "loss": 0.84823436, "num_input_tokens_seen": 133292600, "step": 6199, "time_per_iteration": 2.4111764430999756 }, { "auxiliary_loss_clip": 0.01123233, "auxiliary_loss_mlp": 0.00761984, "balance_loss_clip": 1.0410471, "balance_loss_mlp": 1.00040412, "epoch": 0.745505921962364, "flos": 20010682995840.0, "grad_norm": 3.7885517949085235, "language_loss": 0.76948673, "learning_rate": 6.416407680010174e-07, "loss": 0.7883389, "num_input_tokens_seen": 133306960, "step": 6200, "time_per_iteration": 2.4838271141052246 }, { "auxiliary_loss_clip": 0.01126198, "auxiliary_loss_mlp": 0.01027572, "balance_loss_clip": 1.04662418, "balance_loss_mlp": 1.01968622, "epoch": 0.745626164853003, "flos": 24677884673280.0, "grad_norm": 1.9412734105472431, "language_loss": 0.81055725, "learning_rate": 6.410691275876774e-07, "loss": 0.83209497, "num_input_tokens_seen": 133326380, "step": 6201, "time_per_iteration": 2.561248540878296 }, { "auxiliary_loss_clip": 0.01146038, "auxiliary_loss_mlp": 0.01023811, "balance_loss_clip": 1.04729605, "balance_loss_mlp": 1.0167954, "epoch": 0.7457464077436422, "flos": 14538830797440.0, "grad_norm": 3.983802267878781, "language_loss": 0.76758128, "learning_rate": 6.404976933230704e-07, "loss": 0.78927982, "num_input_tokens_seen": 133342900, "step": 6202, "time_per_iteration": 2.43813157081604 }, { "auxiliary_loss_clip": 0.01144489, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.04676485, "balance_loss_mlp": 1.01875329, "epoch": 0.7458666506342813, "flos": 34021194600960.0, "grad_norm": 2.05260282849613, "language_loss": 0.72575796, "learning_rate": 6.399264652938813e-07, "loss": 0.7474615, "num_input_tokens_seen": 133363805, "step": 6203, "time_per_iteration": 2.574030876159668 }, { "auxiliary_loss_clip": 0.01136499, "auxiliary_loss_mlp": 0.01022459, "balance_loss_clip": 1.04431498, "balance_loss_mlp": 1.01539612, "epoch": 0.7459868935249203, "flos": 24279025075200.0, "grad_norm": 1.9374598061480668, "language_loss": 0.74500263, "learning_rate": 6.393554435867679e-07, "loss": 0.76659214, "num_input_tokens_seen": 133384655, "step": 6204, "time_per_iteration": 4.084227561950684 }, { "auxiliary_loss_clip": 0.01121001, "auxiliary_loss_mlp": 0.01023674, "balance_loss_clip": 1.04250634, "balance_loss_mlp": 1.01597285, "epoch": 0.7461071364155595, "flos": 21908777385600.0, "grad_norm": 1.9800080127206716, "language_loss": 0.83316207, "learning_rate": 6.387846282883502e-07, "loss": 0.85460889, "num_input_tokens_seen": 133401185, "step": 6205, "time_per_iteration": 3.3325726985931396 }, { "auxiliary_loss_clip": 0.01164902, "auxiliary_loss_mlp": 0.01021893, "balance_loss_clip": 1.04754555, "balance_loss_mlp": 1.0149188, "epoch": 0.7462273793061985, "flos": 22889712879360.0, "grad_norm": 2.9273996422156014, "language_loss": 0.76747751, "learning_rate": 6.38214019485223e-07, "loss": 0.78934544, "num_input_tokens_seen": 133420010, "step": 6206, "time_per_iteration": 2.4077541828155518 }, { "auxiliary_loss_clip": 0.01093658, "auxiliary_loss_mlp": 0.01024832, "balance_loss_clip": 1.04005182, "balance_loss_mlp": 1.01764965, "epoch": 0.7463476221968376, "flos": 19968451580160.0, "grad_norm": 1.859492413114982, "language_loss": 0.71791303, "learning_rate": 6.376436172639461e-07, "loss": 0.73909795, "num_input_tokens_seen": 133437855, "step": 6207, "time_per_iteration": 2.5536484718322754 }, { "auxiliary_loss_clip": 0.01088513, "auxiliary_loss_mlp": 0.01030115, "balance_loss_clip": 1.04204917, "balance_loss_mlp": 1.0219785, "epoch": 0.7464678650874768, "flos": 16836610798080.0, "grad_norm": 2.3585745994790352, "language_loss": 0.64730155, "learning_rate": 6.370734217110487e-07, "loss": 0.66848779, "num_input_tokens_seen": 133456600, "step": 6208, "time_per_iteration": 3.3388936519622803 }, { "auxiliary_loss_clip": 0.01142146, "auxiliary_loss_mlp": 0.01027459, "balance_loss_clip": 1.04774106, "balance_loss_mlp": 1.01996636, "epoch": 0.7465881079781158, "flos": 48100869843840.0, "grad_norm": 1.4505603185848304, "language_loss": 0.64256763, "learning_rate": 6.36503432913031e-07, "loss": 0.66426361, "num_input_tokens_seen": 133479745, "step": 6209, "time_per_iteration": 2.729149103164673 }, { "auxiliary_loss_clip": 0.01150543, "auxiliary_loss_mlp": 0.01024486, "balance_loss_clip": 1.04698348, "balance_loss_mlp": 1.01703, "epoch": 0.7467083508687549, "flos": 19677359761920.0, "grad_norm": 2.0386818034073877, "language_loss": 0.68967742, "learning_rate": 6.359336509563569e-07, "loss": 0.71142769, "num_input_tokens_seen": 133495765, "step": 6210, "time_per_iteration": 2.407444953918457 }, { "auxiliary_loss_clip": 0.01111907, "auxiliary_loss_mlp": 0.01030829, "balance_loss_clip": 1.04248261, "balance_loss_mlp": 1.02367055, "epoch": 0.7468285937593939, "flos": 17895436934400.0, "grad_norm": 1.7850169020944548, "language_loss": 0.80585963, "learning_rate": 6.353640759274641e-07, "loss": 0.82728696, "num_input_tokens_seen": 133514655, "step": 6211, "time_per_iteration": 2.466000556945801 }, { "auxiliary_loss_clip": 0.01148716, "auxiliary_loss_mlp": 0.01021498, "balance_loss_clip": 1.0438211, "balance_loss_mlp": 1.01427948, "epoch": 0.7469488366500331, "flos": 23141446369920.0, "grad_norm": 3.815208700603355, "language_loss": 0.73478317, "learning_rate": 6.347947079127556e-07, "loss": 0.75648534, "num_input_tokens_seen": 133532555, "step": 6212, "time_per_iteration": 2.448587656021118 }, { "auxiliary_loss_clip": 0.01135783, "auxiliary_loss_mlp": 0.0102314, "balance_loss_clip": 1.04566979, "balance_loss_mlp": 1.01576042, "epoch": 0.7470690795406721, "flos": 16690849407360.0, "grad_norm": 2.2187525466721603, "language_loss": 0.77419782, "learning_rate": 6.342255469986053e-07, "loss": 0.79578704, "num_input_tokens_seen": 133551300, "step": 6213, "time_per_iteration": 2.444871187210083 }, { "auxiliary_loss_clip": 0.0116441, "auxiliary_loss_mlp": 0.01025067, "balance_loss_clip": 1.04717588, "balance_loss_mlp": 1.01775301, "epoch": 0.7471893224313112, "flos": 25192700352000.0, "grad_norm": 1.8812834755729595, "language_loss": 0.76202929, "learning_rate": 6.336565932713533e-07, "loss": 0.7839241, "num_input_tokens_seen": 133570725, "step": 6214, "time_per_iteration": 2.43023419380188 }, { "auxiliary_loss_clip": 0.0113915, "auxiliary_loss_mlp": 0.01025334, "balance_loss_clip": 1.0497694, "balance_loss_mlp": 1.01788592, "epoch": 0.7473095653219504, "flos": 22526225199360.0, "grad_norm": 1.9257933638632283, "language_loss": 0.77874303, "learning_rate": 6.330878468173088e-07, "loss": 0.80038786, "num_input_tokens_seen": 133590790, "step": 6215, "time_per_iteration": 2.4914753437042236 }, { "auxiliary_loss_clip": 0.01144302, "auxiliary_loss_mlp": 0.01021666, "balance_loss_clip": 1.044276, "balance_loss_mlp": 1.01465321, "epoch": 0.7474298082125894, "flos": 18113989236480.0, "grad_norm": 1.833161394688964, "language_loss": 0.73129159, "learning_rate": 6.32519307722752e-07, "loss": 0.75295126, "num_input_tokens_seen": 133608685, "step": 6216, "time_per_iteration": 2.413458824157715 }, { "auxiliary_loss_clip": 0.01040271, "auxiliary_loss_mlp": 0.01002351, "balance_loss_clip": 1.02107894, "balance_loss_mlp": 1.00124824, "epoch": 0.7475500511032285, "flos": 62086535193600.0, "grad_norm": 0.9147310792187742, "language_loss": 0.54990089, "learning_rate": 6.31950976073929e-07, "loss": 0.5703271, "num_input_tokens_seen": 133662775, "step": 6217, "time_per_iteration": 3.065585136413574 }, { "auxiliary_loss_clip": 0.0111098, "auxiliary_loss_mlp": 0.01023694, "balance_loss_clip": 1.04553318, "balance_loss_mlp": 1.01650882, "epoch": 0.7476702939938676, "flos": 17785586165760.0, "grad_norm": 2.8981021199372297, "language_loss": 0.8071233, "learning_rate": 6.31382851957055e-07, "loss": 0.82847011, "num_input_tokens_seen": 133679595, "step": 6218, "time_per_iteration": 2.5134427547454834 }, { "auxiliary_loss_clip": 0.01121556, "auxiliary_loss_mlp": 0.00761805, "balance_loss_clip": 1.04384255, "balance_loss_mlp": 1.00042236, "epoch": 0.7477905368845067, "flos": 27927944092800.0, "grad_norm": 1.9047534806383175, "language_loss": 0.7137059, "learning_rate": 6.308149354583143e-07, "loss": 0.73253953, "num_input_tokens_seen": 133699000, "step": 6219, "time_per_iteration": 2.542675256729126 }, { "auxiliary_loss_clip": 0.01158034, "auxiliary_loss_mlp": 0.01027928, "balance_loss_clip": 1.04858708, "balance_loss_mlp": 1.02016139, "epoch": 0.7479107797751458, "flos": 26870374932480.0, "grad_norm": 2.067737699056454, "language_loss": 0.82009405, "learning_rate": 6.302472266638586e-07, "loss": 0.84195369, "num_input_tokens_seen": 133719540, "step": 6220, "time_per_iteration": 2.4710311889648438 }, { "auxiliary_loss_clip": 0.01174272, "auxiliary_loss_mlp": 0.01027114, "balance_loss_clip": 1.05049396, "balance_loss_mlp": 1.01959229, "epoch": 0.7480310226657849, "flos": 33943375785600.0, "grad_norm": 2.3184549032649464, "language_loss": 0.70081878, "learning_rate": 6.296797256598101e-07, "loss": 0.72283262, "num_input_tokens_seen": 133741020, "step": 6221, "time_per_iteration": 2.5132362842559814 }, { "auxiliary_loss_clip": 0.01115356, "auxiliary_loss_mlp": 0.01020992, "balance_loss_clip": 1.04266918, "balance_loss_mlp": 1.01424766, "epoch": 0.748151265556424, "flos": 24826555065600.0, "grad_norm": 1.6507157849663419, "language_loss": 0.81287891, "learning_rate": 6.291124325322576e-07, "loss": 0.83424234, "num_input_tokens_seen": 133761145, "step": 6222, "time_per_iteration": 2.529958486557007 }, { "auxiliary_loss_clip": 0.01144576, "auxiliary_loss_mlp": 0.01021906, "balance_loss_clip": 1.04648924, "balance_loss_mlp": 1.01461053, "epoch": 0.748271508447063, "flos": 38399351535360.0, "grad_norm": 1.5315019259196678, "language_loss": 0.62317777, "learning_rate": 6.285453473672595e-07, "loss": 0.64484257, "num_input_tokens_seen": 133783715, "step": 6223, "time_per_iteration": 2.622898817062378 }, { "auxiliary_loss_clip": 0.01162977, "auxiliary_loss_mlp": 0.01022608, "balance_loss_clip": 1.04584289, "balance_loss_mlp": 1.01568222, "epoch": 0.7483917513377022, "flos": 21541842000000.0, "grad_norm": 1.8350760105936486, "language_loss": 0.75425625, "learning_rate": 6.279784702508415e-07, "loss": 0.77611214, "num_input_tokens_seen": 133804465, "step": 6224, "time_per_iteration": 2.430347442626953 }, { "auxiliary_loss_clip": 0.01042139, "auxiliary_loss_mlp": 0.01001089, "balance_loss_clip": 1.01416397, "balance_loss_mlp": 1.00000405, "epoch": 0.7485119942283412, "flos": 62314532772480.0, "grad_norm": 0.7883586487722969, "language_loss": 0.5861026, "learning_rate": 6.274118012689979e-07, "loss": 0.6065349, "num_input_tokens_seen": 133866365, "step": 6225, "time_per_iteration": 3.1924705505371094 }, { "auxiliary_loss_clip": 0.01132486, "auxiliary_loss_mlp": 0.01020595, "balance_loss_clip": 1.04461217, "balance_loss_mlp": 1.01350808, "epoch": 0.7486322371189803, "flos": 29937613104000.0, "grad_norm": 1.4412620309893303, "language_loss": 0.67983925, "learning_rate": 6.268453405076943e-07, "loss": 0.70137006, "num_input_tokens_seen": 133888760, "step": 6226, "time_per_iteration": 2.5535829067230225 }, { "auxiliary_loss_clip": 0.01137645, "auxiliary_loss_mlp": 0.01020275, "balance_loss_clip": 1.04396915, "balance_loss_mlp": 1.01374507, "epoch": 0.7487524800096195, "flos": 18949414734720.0, "grad_norm": 1.8144143569730453, "language_loss": 0.82340175, "learning_rate": 6.262790880528592e-07, "loss": 0.84498096, "num_input_tokens_seen": 133906380, "step": 6227, "time_per_iteration": 2.4477949142456055 }, { "auxiliary_loss_clip": 0.01134435, "auxiliary_loss_mlp": 0.01030438, "balance_loss_clip": 1.04232669, "balance_loss_mlp": 1.02282381, "epoch": 0.7488727229002585, "flos": 18697393935360.0, "grad_norm": 2.344175243345799, "language_loss": 0.79441434, "learning_rate": 6.257130439903951e-07, "loss": 0.81606305, "num_input_tokens_seen": 133922875, "step": 6228, "time_per_iteration": 2.4687416553497314 }, { "auxiliary_loss_clip": 0.01170136, "auxiliary_loss_mlp": 0.01024206, "balance_loss_clip": 1.05075121, "balance_loss_mlp": 1.0170629, "epoch": 0.7489929657908976, "flos": 23623368168960.0, "grad_norm": 1.8729090487936177, "language_loss": 0.81070983, "learning_rate": 6.251472084061695e-07, "loss": 0.83265328, "num_input_tokens_seen": 133941795, "step": 6229, "time_per_iteration": 2.4162254333496094 }, { "auxiliary_loss_clip": 0.01153283, "auxiliary_loss_mlp": 0.01023761, "balance_loss_clip": 1.04921317, "balance_loss_mlp": 1.01695156, "epoch": 0.7491132086815367, "flos": 20551533056640.0, "grad_norm": 1.876590929168162, "language_loss": 0.88928324, "learning_rate": 6.245815813860191e-07, "loss": 0.91105366, "num_input_tokens_seen": 133957305, "step": 6230, "time_per_iteration": 3.991645336151123 }, { "auxiliary_loss_clip": 0.01169023, "auxiliary_loss_mlp": 0.01023028, "balance_loss_clip": 1.04729509, "balance_loss_mlp": 1.01581001, "epoch": 0.7492334515721758, "flos": 23003011353600.0, "grad_norm": 1.9645323292033885, "language_loss": 0.70029616, "learning_rate": 6.240161630157495e-07, "loss": 0.72221667, "num_input_tokens_seen": 133976660, "step": 6231, "time_per_iteration": 2.4113821983337402 }, { "auxiliary_loss_clip": 0.01169446, "auxiliary_loss_mlp": 0.01023414, "balance_loss_clip": 1.04816628, "balance_loss_mlp": 1.01628804, "epoch": 0.7493536944628149, "flos": 16398823835520.0, "grad_norm": 1.84444668515726, "language_loss": 0.70071959, "learning_rate": 6.23450953381133e-07, "loss": 0.72264814, "num_input_tokens_seen": 133994750, "step": 6232, "time_per_iteration": 3.219994306564331 }, { "auxiliary_loss_clip": 0.0113269, "auxiliary_loss_mlp": 0.01023618, "balance_loss_clip": 1.04464459, "balance_loss_mlp": 1.01659024, "epoch": 0.749473937353454, "flos": 15338561155200.0, "grad_norm": 1.9465253736845634, "language_loss": 0.68101096, "learning_rate": 6.228859525679131e-07, "loss": 0.70257407, "num_input_tokens_seen": 134009165, "step": 6233, "time_per_iteration": 2.4291491508483887 }, { "auxiliary_loss_clip": 0.01151357, "auxiliary_loss_mlp": 0.01023636, "balance_loss_clip": 1.04587197, "balance_loss_mlp": 1.01654911, "epoch": 0.7495941802440931, "flos": 18951138587520.0, "grad_norm": 2.370861030985141, "language_loss": 0.79979414, "learning_rate": 6.223211606617986e-07, "loss": 0.82154411, "num_input_tokens_seen": 134027585, "step": 6234, "time_per_iteration": 2.4236137866973877 }, { "auxiliary_loss_clip": 0.01151537, "auxiliary_loss_mlp": 0.01023931, "balance_loss_clip": 1.05012703, "balance_loss_mlp": 1.01771128, "epoch": 0.7497144231347321, "flos": 22492469393280.0, "grad_norm": 1.8198836772299694, "language_loss": 0.83996075, "learning_rate": 6.217565777484701e-07, "loss": 0.86171544, "num_input_tokens_seen": 134046680, "step": 6235, "time_per_iteration": 3.1726624965667725 }, { "auxiliary_loss_clip": 0.01134361, "auxiliary_loss_mlp": 0.00761634, "balance_loss_clip": 1.04438329, "balance_loss_mlp": 1.00038457, "epoch": 0.7498346660253713, "flos": 24243509502720.0, "grad_norm": 1.7571962626905717, "language_loss": 0.80194199, "learning_rate": 6.211922039135722e-07, "loss": 0.82090193, "num_input_tokens_seen": 134066825, "step": 6236, "time_per_iteration": 2.5021450519561768 }, { "auxiliary_loss_clip": 0.01168505, "auxiliary_loss_mlp": 0.01025935, "balance_loss_clip": 1.049384, "balance_loss_mlp": 1.01885366, "epoch": 0.7499549089160104, "flos": 24387080163840.0, "grad_norm": 2.9861287230376203, "language_loss": 0.80835712, "learning_rate": 6.206280392427201e-07, "loss": 0.83030152, "num_input_tokens_seen": 134086410, "step": 6237, "time_per_iteration": 2.4432663917541504 }, { "auxiliary_loss_clip": 0.01146797, "auxiliary_loss_mlp": 0.01021113, "balance_loss_clip": 1.04471016, "balance_loss_mlp": 1.01388574, "epoch": 0.7500751518066494, "flos": 34057320704640.0, "grad_norm": 1.568529236453423, "language_loss": 0.73682833, "learning_rate": 6.200640838214983e-07, "loss": 0.75850737, "num_input_tokens_seen": 134109185, "step": 6238, "time_per_iteration": 2.5562078952789307 }, { "auxiliary_loss_clip": 0.01166918, "auxiliary_loss_mlp": 0.01025921, "balance_loss_clip": 1.04849482, "balance_loss_mlp": 1.01878011, "epoch": 0.7501953946972886, "flos": 18843586289280.0, "grad_norm": 1.8420751057366733, "language_loss": 0.66660023, "learning_rate": 6.195003377354578e-07, "loss": 0.68852854, "num_input_tokens_seen": 134128455, "step": 6239, "time_per_iteration": 2.402932643890381 }, { "auxiliary_loss_clip": 0.01150001, "auxiliary_loss_mlp": 0.01029442, "balance_loss_clip": 1.04455245, "balance_loss_mlp": 1.02169895, "epoch": 0.7503156375879276, "flos": 20257675891200.0, "grad_norm": 2.626204985629956, "language_loss": 0.72696888, "learning_rate": 6.189368010701183e-07, "loss": 0.74876332, "num_input_tokens_seen": 134145515, "step": 6240, "time_per_iteration": 2.4027717113494873 }, { "auxiliary_loss_clip": 0.01157392, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.04523158, "balance_loss_mlp": 1.01860261, "epoch": 0.7504358804785667, "flos": 13480040574720.0, "grad_norm": 1.9453612773620705, "language_loss": 0.76529676, "learning_rate": 6.183734739109683e-07, "loss": 0.78713024, "num_input_tokens_seen": 134163335, "step": 6241, "time_per_iteration": 2.4118711948394775 }, { "auxiliary_loss_clip": 0.01160642, "auxiliary_loss_mlp": 0.01025354, "balance_loss_clip": 1.04854369, "balance_loss_mlp": 1.01773643, "epoch": 0.7505561233692057, "flos": 29461042431360.0, "grad_norm": 2.2504640541617373, "language_loss": 0.68508184, "learning_rate": 6.178103563434629e-07, "loss": 0.70694178, "num_input_tokens_seen": 134182335, "step": 6242, "time_per_iteration": 2.481496572494507 }, { "auxiliary_loss_clip": 0.01166838, "auxiliary_loss_mlp": 0.01029457, "balance_loss_clip": 1.04749024, "balance_loss_mlp": 1.02251315, "epoch": 0.7506763662598449, "flos": 20302457172480.0, "grad_norm": 1.7734571656123108, "language_loss": 0.83663851, "learning_rate": 6.172474484530283e-07, "loss": 0.85860145, "num_input_tokens_seen": 134201070, "step": 6243, "time_per_iteration": 2.3832345008850098 }, { "auxiliary_loss_clip": 0.01129106, "auxiliary_loss_mlp": 0.01025663, "balance_loss_clip": 1.0410111, "balance_loss_mlp": 1.01842117, "epoch": 0.750796609150484, "flos": 37230961939200.0, "grad_norm": 2.171412603974266, "language_loss": 0.75914609, "learning_rate": 6.166847503250563e-07, "loss": 0.78069377, "num_input_tokens_seen": 134223310, "step": 6244, "time_per_iteration": 2.6111512184143066 }, { "auxiliary_loss_clip": 0.01141789, "auxiliary_loss_mlp": 0.01023911, "balance_loss_clip": 1.04663026, "balance_loss_mlp": 1.01670218, "epoch": 0.750916852041123, "flos": 19609417186560.0, "grad_norm": 2.3001814639975025, "language_loss": 0.79256475, "learning_rate": 6.161222620449078e-07, "loss": 0.81422174, "num_input_tokens_seen": 134242085, "step": 6245, "time_per_iteration": 2.5093770027160645 }, { "auxiliary_loss_clip": 0.01126965, "auxiliary_loss_mlp": 0.01028541, "balance_loss_clip": 1.04454315, "balance_loss_mlp": 1.0212009, "epoch": 0.7510370949317622, "flos": 25112690807040.0, "grad_norm": 6.03203468351411, "language_loss": 0.79935658, "learning_rate": 6.155599836979117e-07, "loss": 0.82091165, "num_input_tokens_seen": 134260770, "step": 6246, "time_per_iteration": 2.533597230911255 }, { "auxiliary_loss_clip": 0.01110671, "auxiliary_loss_mlp": 0.01028378, "balance_loss_clip": 1.04131794, "balance_loss_mlp": 1.02083826, "epoch": 0.7511573378224012, "flos": 19062282245760.0, "grad_norm": 2.476737835506978, "language_loss": 0.81687248, "learning_rate": 6.149979153693649e-07, "loss": 0.83826292, "num_input_tokens_seen": 134278025, "step": 6247, "time_per_iteration": 2.517467498779297 }, { "auxiliary_loss_clip": 0.01150882, "auxiliary_loss_mlp": 0.01021136, "balance_loss_clip": 1.04562283, "balance_loss_mlp": 1.01387036, "epoch": 0.7512775807130403, "flos": 19937676602880.0, "grad_norm": 1.9242567808821216, "language_loss": 0.76984453, "learning_rate": 6.144360571445343e-07, "loss": 0.7915647, "num_input_tokens_seen": 134297170, "step": 6248, "time_per_iteration": 2.4166879653930664 }, { "auxiliary_loss_clip": 0.01153255, "auxiliary_loss_mlp": 0.01024226, "balance_loss_clip": 1.04967964, "balance_loss_mlp": 1.01670694, "epoch": 0.7513978236036795, "flos": 20739920912640.0, "grad_norm": 1.677912911373662, "language_loss": 0.80133832, "learning_rate": 6.138744091086509e-07, "loss": 0.82311308, "num_input_tokens_seen": 134316755, "step": 6249, "time_per_iteration": 2.4671390056610107 }, { "auxiliary_loss_clip": 0.01129486, "auxiliary_loss_mlp": 0.01023699, "balance_loss_clip": 1.04612064, "balance_loss_mlp": 1.01657593, "epoch": 0.7515180664943185, "flos": 27563163523200.0, "grad_norm": 2.9477022416916894, "language_loss": 0.72612911, "learning_rate": 6.133129713469183e-07, "loss": 0.74766099, "num_input_tokens_seen": 134335960, "step": 6250, "time_per_iteration": 2.5370397567749023 }, { "auxiliary_loss_clip": 0.0113391, "auxiliary_loss_mlp": 0.01024471, "balance_loss_clip": 1.04283834, "balance_loss_mlp": 1.0167762, "epoch": 0.7516383093849576, "flos": 33803181002880.0, "grad_norm": 1.642104574358395, "language_loss": 0.63948023, "learning_rate": 6.127517439445053e-07, "loss": 0.66106409, "num_input_tokens_seen": 134356805, "step": 6251, "time_per_iteration": 2.640925645828247 }, { "auxiliary_loss_clip": 0.0110339, "auxiliary_loss_mlp": 0.01025942, "balance_loss_clip": 1.04175401, "balance_loss_mlp": 1.01900971, "epoch": 0.7517585522755967, "flos": 29746172592000.0, "grad_norm": 1.7587436804556467, "language_loss": 0.81388497, "learning_rate": 6.121907269865498e-07, "loss": 0.83517826, "num_input_tokens_seen": 134376295, "step": 6252, "time_per_iteration": 2.587433099746704 }, { "auxiliary_loss_clip": 0.01033395, "auxiliary_loss_mlp": 0.01001507, "balance_loss_clip": 1.01297998, "balance_loss_mlp": 1.00034511, "epoch": 0.7518787951662358, "flos": 69807974319360.0, "grad_norm": 0.9252908462086723, "language_loss": 0.67270464, "learning_rate": 6.116299205581577e-07, "loss": 0.69305366, "num_input_tokens_seen": 134431125, "step": 6253, "time_per_iteration": 3.0550522804260254 }, { "auxiliary_loss_clip": 0.01172567, "auxiliary_loss_mlp": 0.01029086, "balance_loss_clip": 1.05047369, "balance_loss_mlp": 1.02153063, "epoch": 0.7519990380568748, "flos": 34203225749760.0, "grad_norm": 1.9753732488135523, "language_loss": 0.684681, "learning_rate": 6.110693247444018e-07, "loss": 0.70669746, "num_input_tokens_seen": 134452960, "step": 6254, "time_per_iteration": 2.520676374435425 }, { "auxiliary_loss_clip": 0.0111267, "auxiliary_loss_mlp": 0.01021024, "balance_loss_clip": 1.04187322, "balance_loss_mlp": 1.01428902, "epoch": 0.752119280947514, "flos": 21725704742400.0, "grad_norm": 2.126531339211925, "language_loss": 0.82428157, "learning_rate": 6.105089396303258e-07, "loss": 0.84561855, "num_input_tokens_seen": 134471350, "step": 6255, "time_per_iteration": 2.487708568572998 }, { "auxiliary_loss_clip": 0.01138127, "auxiliary_loss_mlp": 0.01028442, "balance_loss_clip": 1.04393554, "balance_loss_mlp": 1.02078891, "epoch": 0.7522395238381531, "flos": 32742774668160.0, "grad_norm": 2.0933952787177934, "language_loss": 0.75474834, "learning_rate": 6.099487653009383e-07, "loss": 0.77641404, "num_input_tokens_seen": 134490695, "step": 6256, "time_per_iteration": 2.5547077655792236 }, { "auxiliary_loss_clip": 0.01150694, "auxiliary_loss_mlp": 0.01021254, "balance_loss_clip": 1.04513693, "balance_loss_mlp": 1.01500177, "epoch": 0.7523597667287921, "flos": 23476026579840.0, "grad_norm": 1.981453671770173, "language_loss": 0.82870805, "learning_rate": 6.093888018412192e-07, "loss": 0.85042763, "num_input_tokens_seen": 134506885, "step": 6257, "time_per_iteration": 4.064921140670776 }, { "auxiliary_loss_clip": 0.01060228, "auxiliary_loss_mlp": 0.01001605, "balance_loss_clip": 1.01348948, "balance_loss_mlp": 1.00044858, "epoch": 0.7524800096194313, "flos": 67346730501120.0, "grad_norm": 0.706416242686214, "language_loss": 0.54675937, "learning_rate": 6.088290493361125e-07, "loss": 0.56737769, "num_input_tokens_seen": 134571770, "step": 6258, "time_per_iteration": 3.1746506690979004 }, { "auxiliary_loss_clip": 0.01103374, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 1.04152441, "balance_loss_mlp": 1.01802027, "epoch": 0.7526002525100703, "flos": 13006055681280.0, "grad_norm": 2.1684632685671823, "language_loss": 0.71786094, "learning_rate": 6.082695078705322e-07, "loss": 0.73914611, "num_input_tokens_seen": 134589250, "step": 6259, "time_per_iteration": 3.35184907913208 }, { "auxiliary_loss_clip": 0.01146666, "auxiliary_loss_mlp": 0.01025885, "balance_loss_clip": 1.04649723, "balance_loss_mlp": 1.01827335, "epoch": 0.7527204954007094, "flos": 21397229844480.0, "grad_norm": 2.0456011277912367, "language_loss": 0.69125962, "learning_rate": 6.077101775293618e-07, "loss": 0.71298516, "num_input_tokens_seen": 134608075, "step": 6260, "time_per_iteration": 2.4235687255859375 }, { "auxiliary_loss_clip": 0.01155083, "auxiliary_loss_mlp": 0.0102883, "balance_loss_clip": 1.04685998, "balance_loss_mlp": 1.02077448, "epoch": 0.7528407382913486, "flos": 18947188091520.0, "grad_norm": 2.4849077527889643, "language_loss": 0.82339543, "learning_rate": 6.071510583974504e-07, "loss": 0.84523457, "num_input_tokens_seen": 134623260, "step": 6261, "time_per_iteration": 2.3993842601776123 }, { "auxiliary_loss_clip": 0.01170101, "auxiliary_loss_mlp": 0.01029477, "balance_loss_clip": 1.04961562, "balance_loss_mlp": 1.02209187, "epoch": 0.7529609811819876, "flos": 15231798956160.0, "grad_norm": 1.8486718868682808, "language_loss": 0.71922195, "learning_rate": 6.065921505596161e-07, "loss": 0.74121767, "num_input_tokens_seen": 134641540, "step": 6262, "time_per_iteration": 3.09307599067688 }, { "auxiliary_loss_clip": 0.01125493, "auxiliary_loss_mlp": 0.01023624, "balance_loss_clip": 1.04510975, "balance_loss_mlp": 1.01632512, "epoch": 0.7530812240726267, "flos": 19354487385600.0, "grad_norm": 1.6740837587178967, "language_loss": 0.77016187, "learning_rate": 6.060334541006445e-07, "loss": 0.79165304, "num_input_tokens_seen": 134660035, "step": 6263, "time_per_iteration": 2.4801292419433594 }, { "auxiliary_loss_clip": 0.01127439, "auxiliary_loss_mlp": 0.0102579, "balance_loss_clip": 1.04247904, "balance_loss_mlp": 1.01879549, "epoch": 0.7532014669632658, "flos": 27748247328000.0, "grad_norm": 1.4816375557735484, "language_loss": 0.69224095, "learning_rate": 6.05474969105289e-07, "loss": 0.71377319, "num_input_tokens_seen": 134683025, "step": 6264, "time_per_iteration": 2.579890727996826 }, { "auxiliary_loss_clip": 0.01154516, "auxiliary_loss_mlp": 0.01022254, "balance_loss_clip": 1.04752994, "balance_loss_mlp": 1.01479459, "epoch": 0.7533217098539049, "flos": 14137421333760.0, "grad_norm": 2.1586156920426607, "language_loss": 0.73210609, "learning_rate": 6.049166956582725e-07, "loss": 0.75387383, "num_input_tokens_seen": 134701290, "step": 6265, "time_per_iteration": 2.4073972702026367 }, { "auxiliary_loss_clip": 0.01148341, "auxiliary_loss_mlp": 0.01019485, "balance_loss_clip": 1.04510522, "balance_loss_mlp": 1.01263928, "epoch": 0.753441952744544, "flos": 26429068437120.0, "grad_norm": 2.938928415162882, "language_loss": 0.87530792, "learning_rate": 6.043586338442841e-07, "loss": 0.89698619, "num_input_tokens_seen": 134720345, "step": 6266, "time_per_iteration": 2.4748902320861816 }, { "auxiliary_loss_clip": 0.01164837, "auxiliary_loss_mlp": 0.01021315, "balance_loss_clip": 1.04902983, "balance_loss_mlp": 1.01508367, "epoch": 0.7535621956351831, "flos": 23878621192320.0, "grad_norm": 1.7953251826041832, "language_loss": 0.73136187, "learning_rate": 6.038007837479815e-07, "loss": 0.75322342, "num_input_tokens_seen": 134741450, "step": 6267, "time_per_iteration": 2.4239704608917236 }, { "auxiliary_loss_clip": 0.01150586, "auxiliary_loss_mlp": 0.01026156, "balance_loss_clip": 1.04704654, "balance_loss_mlp": 1.0188092, "epoch": 0.7536824385258222, "flos": 21795873960960.0, "grad_norm": 1.9848182025672334, "language_loss": 0.64212251, "learning_rate": 6.032431454539897e-07, "loss": 0.66389, "num_input_tokens_seen": 134760295, "step": 6268, "time_per_iteration": 2.4287822246551514 }, { "auxiliary_loss_clip": 0.01127872, "auxiliary_loss_mlp": 0.01028251, "balance_loss_clip": 1.04533052, "balance_loss_mlp": 1.02164364, "epoch": 0.7538026814164612, "flos": 28911644933760.0, "grad_norm": 1.8428271433944978, "language_loss": 0.81465751, "learning_rate": 6.026857190469014e-07, "loss": 0.83621871, "num_input_tokens_seen": 134782050, "step": 6269, "time_per_iteration": 2.5652706623077393 }, { "auxiliary_loss_clip": 0.01140396, "auxiliary_loss_mlp": 0.01021166, "balance_loss_clip": 1.04469585, "balance_loss_mlp": 1.01370335, "epoch": 0.7539229243071004, "flos": 21104701482240.0, "grad_norm": 6.019051247493286, "language_loss": 0.74187309, "learning_rate": 6.0212850461128e-07, "loss": 0.76348871, "num_input_tokens_seen": 134801170, "step": 6270, "time_per_iteration": 2.4639132022857666 }, { "auxiliary_loss_clip": 0.01142583, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 1.04521811, "balance_loss_mlp": 1.01805985, "epoch": 0.7540431671977395, "flos": 15158469340800.0, "grad_norm": 2.074958772768113, "language_loss": 0.74484587, "learning_rate": 6.015715022316516e-07, "loss": 0.76652914, "num_input_tokens_seen": 134819150, "step": 6271, "time_per_iteration": 2.4526193141937256 }, { "auxiliary_loss_clip": 0.01111515, "auxiliary_loss_mlp": 0.01021022, "balance_loss_clip": 1.04073691, "balance_loss_mlp": 1.01323223, "epoch": 0.7541634100883785, "flos": 18770579896320.0, "grad_norm": 2.8255669662731067, "language_loss": 0.78141332, "learning_rate": 6.010147119925154e-07, "loss": 0.80273867, "num_input_tokens_seen": 134836905, "step": 6272, "time_per_iteration": 2.5063862800598145 }, { "auxiliary_loss_clip": 0.01117236, "auxiliary_loss_mlp": 0.01023636, "balance_loss_clip": 1.04290366, "balance_loss_mlp": 1.01656091, "epoch": 0.7542836529790176, "flos": 20594770053120.0, "grad_norm": 1.975469233250893, "language_loss": 0.66183078, "learning_rate": 6.004581339783348e-07, "loss": 0.68323946, "num_input_tokens_seen": 134855225, "step": 6273, "time_per_iteration": 2.518101930618286 }, { "auxiliary_loss_clip": 0.01157985, "auxiliary_loss_mlp": 0.01032267, "balance_loss_clip": 1.04805088, "balance_loss_mlp": 1.02433991, "epoch": 0.7544038958696567, "flos": 19095104298240.0, "grad_norm": 2.6312171637024746, "language_loss": 0.68493694, "learning_rate": 5.999017682735425e-07, "loss": 0.70683944, "num_input_tokens_seen": 134871615, "step": 6274, "time_per_iteration": 2.5629022121429443 }, { "auxiliary_loss_clip": 0.01104552, "auxiliary_loss_mlp": 0.01027091, "balance_loss_clip": 1.04253376, "balance_loss_mlp": 1.01959562, "epoch": 0.7545241387602958, "flos": 31723306859520.0, "grad_norm": 2.002320912570464, "language_loss": 0.66431183, "learning_rate": 5.993456149625387e-07, "loss": 0.68562829, "num_input_tokens_seen": 134892765, "step": 6275, "time_per_iteration": 2.713529348373413 }, { "auxiliary_loss_clip": 0.0111629, "auxiliary_loss_mlp": 0.01024479, "balance_loss_clip": 1.04333472, "balance_loss_mlp": 1.01787746, "epoch": 0.7546443816509348, "flos": 20296495514880.0, "grad_norm": 1.7324732139310497, "language_loss": 0.82305419, "learning_rate": 5.987896741296909e-07, "loss": 0.8444618, "num_input_tokens_seen": 134910505, "step": 6276, "time_per_iteration": 2.501146078109741 }, { "auxiliary_loss_clip": 0.01141642, "auxiliary_loss_mlp": 0.01028032, "balance_loss_clip": 1.04825306, "balance_loss_mlp": 1.02077234, "epoch": 0.754764624541574, "flos": 23696159080320.0, "grad_norm": 2.4922362202068573, "language_loss": 0.783961, "learning_rate": 5.982339458593361e-07, "loss": 0.80565774, "num_input_tokens_seen": 134930445, "step": 6277, "time_per_iteration": 2.5138630867004395 }, { "auxiliary_loss_clip": 0.011516, "auxiliary_loss_mlp": 0.00761755, "balance_loss_clip": 1.04850817, "balance_loss_mlp": 1.00035167, "epoch": 0.7548848674322131, "flos": 25337204766720.0, "grad_norm": 1.5599870295067508, "language_loss": 0.83855003, "learning_rate": 5.976784302357767e-07, "loss": 0.8576836, "num_input_tokens_seen": 134951010, "step": 6278, "time_per_iteration": 2.4912402629852295 }, { "auxiliary_loss_clip": 0.01156583, "auxiliary_loss_mlp": 0.0102687, "balance_loss_clip": 1.04810667, "balance_loss_mlp": 1.01972926, "epoch": 0.7550051103228521, "flos": 19573147428480.0, "grad_norm": 1.8807749057620473, "language_loss": 0.73131055, "learning_rate": 5.971231273432855e-07, "loss": 0.7531451, "num_input_tokens_seen": 134970495, "step": 6279, "time_per_iteration": 2.426175355911255 }, { "auxiliary_loss_clip": 0.01060137, "auxiliary_loss_mlp": 0.0100264, "balance_loss_clip": 1.01441216, "balance_loss_mlp": 1.0015732, "epoch": 0.7551253532134913, "flos": 64150068648960.0, "grad_norm": 0.8104748717816136, "language_loss": 0.54582214, "learning_rate": 5.965680372661e-07, "loss": 0.56644988, "num_input_tokens_seen": 135028060, "step": 6280, "time_per_iteration": 2.947946310043335 }, { "auxiliary_loss_clip": 0.01140063, "auxiliary_loss_mlp": 0.01022911, "balance_loss_clip": 1.04693937, "balance_loss_mlp": 1.01661992, "epoch": 0.7552455961041303, "flos": 26067986968320.0, "grad_norm": 1.8685220353776606, "language_loss": 0.5656051, "learning_rate": 5.960131600884266e-07, "loss": 0.58723485, "num_input_tokens_seen": 135047330, "step": 6281, "time_per_iteration": 2.5105249881744385 }, { "auxiliary_loss_clip": 0.01129489, "auxiliary_loss_mlp": 0.01022468, "balance_loss_clip": 1.04580986, "balance_loss_mlp": 1.01599193, "epoch": 0.7553658389947694, "flos": 24498223822080.0, "grad_norm": 1.7390483990100314, "language_loss": 0.7589674, "learning_rate": 5.954584958944413e-07, "loss": 0.78048694, "num_input_tokens_seen": 135065995, "step": 6282, "time_per_iteration": 2.52677059173584 }, { "auxiliary_loss_clip": 0.01127613, "auxiliary_loss_mlp": 0.00761876, "balance_loss_clip": 1.04241645, "balance_loss_mlp": 1.00032043, "epoch": 0.7554860818854086, "flos": 21799465320960.0, "grad_norm": 1.8134519638142823, "language_loss": 0.81914151, "learning_rate": 5.949040447682854e-07, "loss": 0.83803642, "num_input_tokens_seen": 135085820, "step": 6283, "time_per_iteration": 3.2417562007904053 }, { "auxiliary_loss_clip": 0.0114508, "auxiliary_loss_mlp": 0.01023518, "balance_loss_clip": 1.0461179, "balance_loss_mlp": 1.01664615, "epoch": 0.7556063247760476, "flos": 16362123114240.0, "grad_norm": 3.0804674350465837, "language_loss": 0.68603671, "learning_rate": 5.943498067940686e-07, "loss": 0.70772272, "num_input_tokens_seen": 135102845, "step": 6284, "time_per_iteration": 3.283679723739624 }, { "auxiliary_loss_clip": 0.01140149, "auxiliary_loss_mlp": 0.01027324, "balance_loss_clip": 1.05275655, "balance_loss_mlp": 1.01999843, "epoch": 0.7557265676666867, "flos": 27235155502080.0, "grad_norm": 1.6548234805581539, "language_loss": 0.81631684, "learning_rate": 5.937957820558686e-07, "loss": 0.8379916, "num_input_tokens_seen": 135122190, "step": 6285, "time_per_iteration": 3.3709068298339844 }, { "auxiliary_loss_clip": 0.01049981, "auxiliary_loss_mlp": 0.01001505, "balance_loss_clip": 1.01219296, "balance_loss_mlp": 1.00046813, "epoch": 0.7558468105573258, "flos": 62189131415040.0, "grad_norm": 0.8473369133802555, "language_loss": 0.65465021, "learning_rate": 5.932419706377296e-07, "loss": 0.67516506, "num_input_tokens_seen": 135180495, "step": 6286, "time_per_iteration": 3.029397487640381 }, { "auxiliary_loss_clip": 0.01125351, "auxiliary_loss_mlp": 0.01022959, "balance_loss_clip": 1.04825294, "balance_loss_mlp": 1.01568711, "epoch": 0.7559670534479649, "flos": 33249078823680.0, "grad_norm": 1.8219643399996757, "language_loss": 0.74012923, "learning_rate": 5.92688372623666e-07, "loss": 0.76161242, "num_input_tokens_seen": 135199200, "step": 6287, "time_per_iteration": 2.5882673263549805 }, { "auxiliary_loss_clip": 0.01153921, "auxiliary_loss_mlp": 0.01022768, "balance_loss_clip": 1.04545832, "balance_loss_mlp": 1.01556754, "epoch": 0.7560872963386039, "flos": 14064379027200.0, "grad_norm": 2.0510412323897054, "language_loss": 0.73849952, "learning_rate": 5.921349880976574e-07, "loss": 0.76026642, "num_input_tokens_seen": 135217035, "step": 6288, "time_per_iteration": 3.205052137374878 }, { "auxiliary_loss_clip": 0.01142175, "auxiliary_loss_mlp": 0.00762325, "balance_loss_clip": 1.04425597, "balance_loss_mlp": 1.00038171, "epoch": 0.7562075392292431, "flos": 20412307941120.0, "grad_norm": 1.8696031497189385, "language_loss": 0.82365584, "learning_rate": 5.915818171436515e-07, "loss": 0.84270084, "num_input_tokens_seen": 135236370, "step": 6289, "time_per_iteration": 2.487180233001709 }, { "auxiliary_loss_clip": 0.01138726, "auxiliary_loss_mlp": 0.01025319, "balance_loss_clip": 1.04262042, "balance_loss_mlp": 1.01832187, "epoch": 0.7563277821198822, "flos": 20376792368640.0, "grad_norm": 2.262014802609539, "language_loss": 0.74437463, "learning_rate": 5.910288598455642e-07, "loss": 0.76601505, "num_input_tokens_seen": 135255720, "step": 6290, "time_per_iteration": 2.4932262897491455 }, { "auxiliary_loss_clip": 0.01159518, "auxiliary_loss_mlp": 0.01030382, "balance_loss_clip": 1.04856634, "balance_loss_mlp": 1.02306581, "epoch": 0.7564480250105212, "flos": 18588261438720.0, "grad_norm": 2.1522170690851654, "language_loss": 0.74586409, "learning_rate": 5.90476116287278e-07, "loss": 0.76776308, "num_input_tokens_seen": 135273320, "step": 6291, "time_per_iteration": 2.4486305713653564 }, { "auxiliary_loss_clip": 0.01140761, "auxiliary_loss_mlp": 0.0102789, "balance_loss_clip": 1.04886472, "balance_loss_mlp": 1.02071905, "epoch": 0.7565682679011604, "flos": 21215521918080.0, "grad_norm": 1.8188353913665978, "language_loss": 0.68033367, "learning_rate": 5.899235865526456e-07, "loss": 0.70202017, "num_input_tokens_seen": 135292615, "step": 6292, "time_per_iteration": 2.476522922515869 }, { "auxiliary_loss_clip": 0.01116689, "auxiliary_loss_mlp": 0.0102574, "balance_loss_clip": 1.04239488, "balance_loss_mlp": 1.01904321, "epoch": 0.7566885107917994, "flos": 20449008662400.0, "grad_norm": 1.7866296552006307, "language_loss": 0.82062238, "learning_rate": 5.893712707254825e-07, "loss": 0.84204674, "num_input_tokens_seen": 135310075, "step": 6293, "time_per_iteration": 2.4998016357421875 }, { "auxiliary_loss_clip": 0.01106037, "auxiliary_loss_mlp": 0.01023374, "balance_loss_clip": 1.04016626, "balance_loss_mlp": 1.01555347, "epoch": 0.7568087536824385, "flos": 19025832919680.0, "grad_norm": 2.5138350037075416, "language_loss": 0.65726644, "learning_rate": 5.888191688895769e-07, "loss": 0.67856061, "num_input_tokens_seen": 135327335, "step": 6294, "time_per_iteration": 2.5095326900482178 }, { "auxiliary_loss_clip": 0.01167241, "auxiliary_loss_mlp": 0.01027279, "balance_loss_clip": 1.04636717, "balance_loss_mlp": 1.0193038, "epoch": 0.7569289965730777, "flos": 15225442248960.0, "grad_norm": 2.2802058498177815, "language_loss": 0.61875337, "learning_rate": 5.882672811286813e-07, "loss": 0.64069855, "num_input_tokens_seen": 135343615, "step": 6295, "time_per_iteration": 2.3773233890533447 }, { "auxiliary_loss_clip": 0.01169594, "auxiliary_loss_mlp": 0.01026128, "balance_loss_clip": 1.04825592, "balance_loss_mlp": 1.01882625, "epoch": 0.7570492394637167, "flos": 20769367086720.0, "grad_norm": 2.0122279612698626, "language_loss": 0.69431567, "learning_rate": 5.877156075265166e-07, "loss": 0.71627289, "num_input_tokens_seen": 135359880, "step": 6296, "time_per_iteration": 2.3994789123535156 }, { "auxiliary_loss_clip": 0.01139063, "auxiliary_loss_mlp": 0.01023036, "balance_loss_clip": 1.04427767, "balance_loss_mlp": 1.01538897, "epoch": 0.7571694823543558, "flos": 15664091137920.0, "grad_norm": 2.568869135287148, "language_loss": 0.69711959, "learning_rate": 5.871641481667715e-07, "loss": 0.71874058, "num_input_tokens_seen": 135374325, "step": 6297, "time_per_iteration": 2.4964351654052734 }, { "auxiliary_loss_clip": 0.01114522, "auxiliary_loss_mlp": 0.01030237, "balance_loss_clip": 1.04375172, "balance_loss_mlp": 1.02252758, "epoch": 0.7572897252449949, "flos": 25409241492480.0, "grad_norm": 1.8676425103516465, "language_loss": 0.84341598, "learning_rate": 5.866129031331011e-07, "loss": 0.86486351, "num_input_tokens_seen": 135393980, "step": 6298, "time_per_iteration": 2.567317008972168 }, { "auxiliary_loss_clip": 0.01141324, "auxiliary_loss_mlp": 0.01025239, "balance_loss_clip": 1.04521346, "balance_loss_mlp": 1.01779151, "epoch": 0.757409968135634, "flos": 24279348297600.0, "grad_norm": 1.9529690391343197, "language_loss": 0.83697963, "learning_rate": 5.8606187250913e-07, "loss": 0.85864532, "num_input_tokens_seen": 135412030, "step": 6299, "time_per_iteration": 2.510033130645752 }, { "auxiliary_loss_clip": 0.01154667, "auxiliary_loss_mlp": 0.00761668, "balance_loss_clip": 1.04941511, "balance_loss_mlp": 1.0003686, "epoch": 0.757530211026273, "flos": 24133766474880.0, "grad_norm": 2.0252841485884705, "language_loss": 0.84210235, "learning_rate": 5.855110563784482e-07, "loss": 0.86126566, "num_input_tokens_seen": 135430565, "step": 6300, "time_per_iteration": 2.453111171722412 }, { "auxiliary_loss_clip": 0.01147931, "auxiliary_loss_mlp": 0.00761874, "balance_loss_clip": 1.04491329, "balance_loss_mlp": 1.00032973, "epoch": 0.7576504539169122, "flos": 23951807153280.0, "grad_norm": 2.119686534995756, "language_loss": 0.64331168, "learning_rate": 5.849604548246156e-07, "loss": 0.66240972, "num_input_tokens_seen": 135451675, "step": 6301, "time_per_iteration": 2.470508337020874 }, { "auxiliary_loss_clip": 0.01145463, "auxiliary_loss_mlp": 0.00761639, "balance_loss_clip": 1.04802024, "balance_loss_mlp": 1.00039577, "epoch": 0.7577706968075513, "flos": 21251360712960.0, "grad_norm": 2.0044358807027254, "language_loss": 0.80423015, "learning_rate": 5.844100679311565e-07, "loss": 0.8233012, "num_input_tokens_seen": 135470635, "step": 6302, "time_per_iteration": 2.495260238647461 }, { "auxiliary_loss_clip": 0.01143786, "auxiliary_loss_mlp": 0.01022826, "balance_loss_clip": 1.04951477, "balance_loss_mlp": 1.01506507, "epoch": 0.7578909396981903, "flos": 18296595002880.0, "grad_norm": 2.768617825031151, "language_loss": 0.76597059, "learning_rate": 5.838598957815637e-07, "loss": 0.78763676, "num_input_tokens_seen": 135487865, "step": 6303, "time_per_iteration": 2.4927802085876465 }, { "auxiliary_loss_clip": 0.01132554, "auxiliary_loss_mlp": 0.01023391, "balance_loss_clip": 1.04364729, "balance_loss_mlp": 1.01637888, "epoch": 0.7580111825888295, "flos": 25373869574400.0, "grad_norm": 1.5876508215880005, "language_loss": 0.85302413, "learning_rate": 5.833099384592996e-07, "loss": 0.8745836, "num_input_tokens_seen": 135508440, "step": 6304, "time_per_iteration": 2.5008881092071533 }, { "auxiliary_loss_clip": 0.01134277, "auxiliary_loss_mlp": 0.0102561, "balance_loss_clip": 1.04370856, "balance_loss_mlp": 1.01822841, "epoch": 0.7581314254794685, "flos": 23768662682880.0, "grad_norm": 2.2896850810795932, "language_loss": 0.7142691, "learning_rate": 5.827601960477913e-07, "loss": 0.73586798, "num_input_tokens_seen": 135526365, "step": 6305, "time_per_iteration": 2.475066900253296 }, { "auxiliary_loss_clip": 0.01148639, "auxiliary_loss_mlp": 0.01026794, "balance_loss_clip": 1.04398727, "balance_loss_mlp": 1.02036834, "epoch": 0.7582516683701076, "flos": 22054610603520.0, "grad_norm": 2.4584644419153996, "language_loss": 0.70667589, "learning_rate": 5.822106686304344e-07, "loss": 0.72843021, "num_input_tokens_seen": 135545655, "step": 6306, "time_per_iteration": 2.4346423149108887 }, { "auxiliary_loss_clip": 0.01133031, "auxiliary_loss_mlp": 0.01027149, "balance_loss_clip": 1.04505849, "balance_loss_mlp": 1.01981211, "epoch": 0.7583719112607467, "flos": 31649725848960.0, "grad_norm": 1.6930919223942935, "language_loss": 0.57783252, "learning_rate": 5.816613562905919e-07, "loss": 0.59943438, "num_input_tokens_seen": 135566840, "step": 6307, "time_per_iteration": 2.568549156188965 }, { "auxiliary_loss_clip": 0.01125657, "auxiliary_loss_mlp": 0.01025938, "balance_loss_clip": 1.04938924, "balance_loss_mlp": 1.0185889, "epoch": 0.7584921541513858, "flos": 33068376478080.0, "grad_norm": 1.5333518695777595, "language_loss": 0.70068789, "learning_rate": 5.811122591115933e-07, "loss": 0.72220385, "num_input_tokens_seen": 135587825, "step": 6308, "time_per_iteration": 2.584228038787842 }, { "auxiliary_loss_clip": 0.01128449, "auxiliary_loss_mlp": 0.010271, "balance_loss_clip": 1.05008614, "balance_loss_mlp": 1.01992035, "epoch": 0.7586123970420249, "flos": 23326350606720.0, "grad_norm": 2.238996947319549, "language_loss": 0.71755725, "learning_rate": 5.805633771767376e-07, "loss": 0.73911279, "num_input_tokens_seen": 135605220, "step": 6309, "time_per_iteration": 2.510951042175293 }, { "auxiliary_loss_clip": 0.0113747, "auxiliary_loss_mlp": 0.01023247, "balance_loss_clip": 1.04668093, "balance_loss_mlp": 1.01530159, "epoch": 0.7587326399326639, "flos": 18334229477760.0, "grad_norm": 1.6933524051913462, "language_loss": 0.77881581, "learning_rate": 5.800147105692888e-07, "loss": 0.80042303, "num_input_tokens_seen": 135624795, "step": 6310, "time_per_iteration": 3.3414499759674072 }, { "auxiliary_loss_clip": 0.01153978, "auxiliary_loss_mlp": 0.0102343, "balance_loss_clip": 1.04512882, "balance_loss_mlp": 1.01655173, "epoch": 0.7588528828233031, "flos": 17275080119040.0, "grad_norm": 1.9194437294654993, "language_loss": 0.79095036, "learning_rate": 5.794662593724795e-07, "loss": 0.81272447, "num_input_tokens_seen": 135643800, "step": 6311, "time_per_iteration": 3.282911777496338 }, { "auxiliary_loss_clip": 0.01171036, "auxiliary_loss_mlp": 0.01029126, "balance_loss_clip": 1.05131805, "balance_loss_mlp": 1.02155638, "epoch": 0.7589731257139422, "flos": 17713621267200.0, "grad_norm": 1.8620825579727243, "language_loss": 0.74696571, "learning_rate": 5.789180236695091e-07, "loss": 0.76896739, "num_input_tokens_seen": 135660655, "step": 6312, "time_per_iteration": 2.3927524089813232 }, { "auxiliary_loss_clip": 0.01148521, "auxiliary_loss_mlp": 0.01026573, "balance_loss_clip": 1.04728293, "balance_loss_mlp": 1.02004337, "epoch": 0.7590933686045812, "flos": 15961072786560.0, "grad_norm": 1.9442211615831841, "language_loss": 0.85097373, "learning_rate": 5.78370003543544e-07, "loss": 0.87272465, "num_input_tokens_seen": 135679410, "step": 6313, "time_per_iteration": 2.412609338760376 }, { "auxiliary_loss_clip": 0.01154447, "auxiliary_loss_mlp": 0.00761981, "balance_loss_clip": 1.04832172, "balance_loss_mlp": 1.00038409, "epoch": 0.7592136114952204, "flos": 21068072588160.0, "grad_norm": 2.083486064330563, "language_loss": 0.8371017, "learning_rate": 5.778221990777203e-07, "loss": 0.85626602, "num_input_tokens_seen": 135697150, "step": 6314, "time_per_iteration": 2.4422402381896973 }, { "auxiliary_loss_clip": 0.0114386, "auxiliary_loss_mlp": 0.01026918, "balance_loss_clip": 1.04937983, "balance_loss_mlp": 1.01949739, "epoch": 0.7593338543858594, "flos": 25297666871040.0, "grad_norm": 2.5559611022323057, "language_loss": 0.82623255, "learning_rate": 5.772746103551372e-07, "loss": 0.84794039, "num_input_tokens_seen": 135712545, "step": 6315, "time_per_iteration": 3.2547085285186768 }, { "auxiliary_loss_clip": 0.01138076, "auxiliary_loss_mlp": 0.01021115, "balance_loss_clip": 1.04687536, "balance_loss_mlp": 1.01356304, "epoch": 0.7594540972764985, "flos": 31832367528960.0, "grad_norm": 1.7597377282269855, "language_loss": 0.72190166, "learning_rate": 5.767272374588648e-07, "loss": 0.74349356, "num_input_tokens_seen": 135733950, "step": 6316, "time_per_iteration": 2.556236743927002 }, { "auxiliary_loss_clip": 0.0115302, "auxiliary_loss_mlp": 0.01024452, "balance_loss_clip": 1.04926753, "balance_loss_mlp": 1.01697111, "epoch": 0.7595743401671377, "flos": 37597250880000.0, "grad_norm": 1.6275790280984483, "language_loss": 0.77904081, "learning_rate": 5.76180080471939e-07, "loss": 0.80081546, "num_input_tokens_seen": 135757120, "step": 6317, "time_per_iteration": 2.581899404525757 }, { "auxiliary_loss_clip": 0.01172257, "auxiliary_loss_mlp": 0.0102548, "balance_loss_clip": 1.04931593, "balance_loss_mlp": 1.01769018, "epoch": 0.7596945830577767, "flos": 18287724343680.0, "grad_norm": 2.3262235525151467, "language_loss": 0.72180593, "learning_rate": 5.756331394773631e-07, "loss": 0.74378324, "num_input_tokens_seen": 135773335, "step": 6318, "time_per_iteration": 2.3827598094940186 }, { "auxiliary_loss_clip": 0.01099971, "auxiliary_loss_mlp": 0.00762549, "balance_loss_clip": 1.04296005, "balance_loss_mlp": 1.00030613, "epoch": 0.7598148259484158, "flos": 22233122219520.0, "grad_norm": 1.696122889528536, "language_loss": 0.76261896, "learning_rate": 5.750864145581071e-07, "loss": 0.78124416, "num_input_tokens_seen": 135792555, "step": 6319, "time_per_iteration": 2.5931317806243896 }, { "auxiliary_loss_clip": 0.01168761, "auxiliary_loss_mlp": 0.0102676, "balance_loss_clip": 1.0505681, "balance_loss_mlp": 1.01969683, "epoch": 0.7599350688390549, "flos": 27161718145920.0, "grad_norm": 1.9137831335682258, "language_loss": 0.86044788, "learning_rate": 5.745399057971085e-07, "loss": 0.88240302, "num_input_tokens_seen": 135813690, "step": 6320, "time_per_iteration": 2.442300796508789 }, { "auxiliary_loss_clip": 0.01158099, "auxiliary_loss_mlp": 0.01026452, "balance_loss_clip": 1.0477246, "balance_loss_mlp": 1.01948154, "epoch": 0.760055311729694, "flos": 15560704817280.0, "grad_norm": 2.6722157325804483, "language_loss": 0.75298023, "learning_rate": 5.739936132772738e-07, "loss": 0.77482575, "num_input_tokens_seen": 135832255, "step": 6321, "time_per_iteration": 2.4137682914733887 }, { "auxiliary_loss_clip": 0.01166094, "auxiliary_loss_mlp": 0.01025137, "balance_loss_clip": 1.04670417, "balance_loss_mlp": 1.01763618, "epoch": 0.760175554620333, "flos": 25155496840320.0, "grad_norm": 1.9331697096220553, "language_loss": 0.74442792, "learning_rate": 5.734475370814733e-07, "loss": 0.76634014, "num_input_tokens_seen": 135851935, "step": 6322, "time_per_iteration": 2.423246145248413 }, { "auxiliary_loss_clip": 0.01154586, "auxiliary_loss_mlp": 0.01022408, "balance_loss_clip": 1.04521108, "balance_loss_mlp": 1.01519561, "epoch": 0.7602957975109722, "flos": 24353791234560.0, "grad_norm": 1.6153738544491134, "language_loss": 0.78686702, "learning_rate": 5.729016772925483e-07, "loss": 0.80863696, "num_input_tokens_seen": 135873510, "step": 6323, "time_per_iteration": 2.457078456878662 }, { "auxiliary_loss_clip": 0.01110636, "auxiliary_loss_mlp": 0.01023412, "balance_loss_clip": 1.04593015, "balance_loss_mlp": 1.01565695, "epoch": 0.7604160404016113, "flos": 25192664438400.0, "grad_norm": 1.9692730370201807, "language_loss": 0.70632821, "learning_rate": 5.723560339933038e-07, "loss": 0.7276687, "num_input_tokens_seen": 135893845, "step": 6324, "time_per_iteration": 2.5802526473999023 }, { "auxiliary_loss_clip": 0.01151902, "auxiliary_loss_mlp": 0.00761791, "balance_loss_clip": 1.04662216, "balance_loss_mlp": 1.00032759, "epoch": 0.7605362832922503, "flos": 29861841363840.0, "grad_norm": 2.3544129384779113, "language_loss": 0.65566218, "learning_rate": 5.71810607266513e-07, "loss": 0.67479908, "num_input_tokens_seen": 135912430, "step": 6325, "time_per_iteration": 2.4846885204315186 }, { "auxiliary_loss_clip": 0.01153364, "auxiliary_loss_mlp": 0.01024451, "balance_loss_clip": 1.04577374, "balance_loss_mlp": 1.01741421, "epoch": 0.7606565261828895, "flos": 13917935278080.0, "grad_norm": 2.1290613247396397, "language_loss": 0.60401756, "learning_rate": 5.712653971949184e-07, "loss": 0.62579566, "num_input_tokens_seen": 135930550, "step": 6326, "time_per_iteration": 2.4105725288391113 }, { "auxiliary_loss_clip": 0.01148444, "auxiliary_loss_mlp": 0.01022567, "balance_loss_clip": 1.04608309, "balance_loss_mlp": 1.01508641, "epoch": 0.7607767690735285, "flos": 18551273408640.0, "grad_norm": 2.197977500654092, "language_loss": 0.7530911, "learning_rate": 5.707204038612268e-07, "loss": 0.77480125, "num_input_tokens_seen": 135947980, "step": 6327, "time_per_iteration": 2.3939802646636963 }, { "auxiliary_loss_clip": 0.01151838, "auxiliary_loss_mlp": 0.01027807, "balance_loss_clip": 1.05369282, "balance_loss_mlp": 1.02002311, "epoch": 0.7608970119641676, "flos": 20922993555840.0, "grad_norm": 4.1618849003005325, "language_loss": 0.74070084, "learning_rate": 5.701756273481138e-07, "loss": 0.76249731, "num_input_tokens_seen": 135965400, "step": 6328, "time_per_iteration": 2.4617631435394287 }, { "auxiliary_loss_clip": 0.01143758, "auxiliary_loss_mlp": 0.01023547, "balance_loss_clip": 1.04528272, "balance_loss_mlp": 1.01641846, "epoch": 0.7610172548548068, "flos": 23807302738560.0, "grad_norm": 1.4458777093984885, "language_loss": 0.73968965, "learning_rate": 5.696310677382212e-07, "loss": 0.76136267, "num_input_tokens_seen": 135986795, "step": 6329, "time_per_iteration": 2.492581367492676 }, { "auxiliary_loss_clip": 0.01030973, "auxiliary_loss_mlp": 0.01001464, "balance_loss_clip": 1.01165676, "balance_loss_mlp": 1.00058734, "epoch": 0.7611374977454458, "flos": 66496580426880.0, "grad_norm": 0.8720645697198282, "language_loss": 0.61830842, "learning_rate": 5.690867251141576e-07, "loss": 0.63863277, "num_input_tokens_seen": 136053450, "step": 6330, "time_per_iteration": 3.2151288986206055 }, { "auxiliary_loss_clip": 0.0116113, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.04748785, "balance_loss_mlp": 1.01826048, "epoch": 0.7612577406360849, "flos": 15633136592640.0, "grad_norm": 2.993612463154022, "language_loss": 0.9221698, "learning_rate": 5.685425995585013e-07, "loss": 0.94403684, "num_input_tokens_seen": 136071375, "step": 6331, "time_per_iteration": 2.42741322517395 }, { "auxiliary_loss_clip": 0.01048877, "auxiliary_loss_mlp": 0.01004664, "balance_loss_clip": 1.01247931, "balance_loss_mlp": 1.0036273, "epoch": 0.761377983526724, "flos": 60526253237760.0, "grad_norm": 3.8826306507364126, "language_loss": 0.59045696, "learning_rate": 5.679986911537935e-07, "loss": 0.61099243, "num_input_tokens_seen": 136138905, "step": 6332, "time_per_iteration": 3.1932616233825684 }, { "auxiliary_loss_clip": 0.0110315, "auxiliary_loss_mlp": 0.01021479, "balance_loss_clip": 1.04544961, "balance_loss_mlp": 1.01418352, "epoch": 0.7614982264173631, "flos": 35772522019200.0, "grad_norm": 2.1024744526939783, "language_loss": 0.67546231, "learning_rate": 5.674549999825462e-07, "loss": 0.69670856, "num_input_tokens_seen": 136161720, "step": 6333, "time_per_iteration": 2.655038833618164 }, { "auxiliary_loss_clip": 0.01058547, "auxiliary_loss_mlp": 0.010017, "balance_loss_clip": 1.01239836, "balance_loss_mlp": 1.00067508, "epoch": 0.7616184693080021, "flos": 67925502345600.0, "grad_norm": 0.9183383795375871, "language_loss": 0.71333563, "learning_rate": 5.669115261272363e-07, "loss": 0.7339381, "num_input_tokens_seen": 136222040, "step": 6334, "time_per_iteration": 3.023500442504883 }, { "auxiliary_loss_clip": 0.01155372, "auxiliary_loss_mlp": 0.010263, "balance_loss_clip": 1.04781616, "balance_loss_mlp": 1.01885581, "epoch": 0.7617387121986413, "flos": 20521979141760.0, "grad_norm": 2.3209834273778966, "language_loss": 0.72619081, "learning_rate": 5.663682696703081e-07, "loss": 0.74800754, "num_input_tokens_seen": 136240305, "step": 6335, "time_per_iteration": 2.424441337585449 }, { "auxiliary_loss_clip": 0.01166859, "auxiliary_loss_mlp": 0.01022455, "balance_loss_clip": 1.04935789, "balance_loss_mlp": 1.01562095, "epoch": 0.7618589550892804, "flos": 18624495283200.0, "grad_norm": 2.13907609455361, "language_loss": 0.82004869, "learning_rate": 5.658252306941746e-07, "loss": 0.84194183, "num_input_tokens_seen": 136259625, "step": 6336, "time_per_iteration": 2.392655372619629 }, { "auxiliary_loss_clip": 0.01113659, "auxiliary_loss_mlp": 0.0102801, "balance_loss_clip": 1.04392028, "balance_loss_mlp": 1.02003527, "epoch": 0.7619791979799194, "flos": 17453735389440.0, "grad_norm": 2.0472294080051188, "language_loss": 0.75158238, "learning_rate": 5.65282409281212e-07, "loss": 0.77299917, "num_input_tokens_seen": 136277090, "step": 6337, "time_per_iteration": 4.117333173751831 }, { "auxiliary_loss_clip": 0.01136397, "auxiliary_loss_mlp": 0.01024597, "balance_loss_clip": 1.0446856, "balance_loss_mlp": 1.01742387, "epoch": 0.7620994408705585, "flos": 14137421333760.0, "grad_norm": 2.817679875138117, "language_loss": 0.70092213, "learning_rate": 5.64739805513768e-07, "loss": 0.72253203, "num_input_tokens_seen": 136294635, "step": 6338, "time_per_iteration": 3.2620275020599365 }, { "auxiliary_loss_clip": 0.01053926, "auxiliary_loss_mlp": 0.00753067, "balance_loss_clip": 1.01274848, "balance_loss_mlp": 1.00004756, "epoch": 0.7622196837611976, "flos": 70708792527360.0, "grad_norm": 0.7868444945715333, "language_loss": 0.55714929, "learning_rate": 5.641974194741541e-07, "loss": 0.57521927, "num_input_tokens_seen": 136350320, "step": 6339, "time_per_iteration": 2.9368066787719727 }, { "auxiliary_loss_clip": 0.0104348, "auxiliary_loss_mlp": 0.01002475, "balance_loss_clip": 1.02246153, "balance_loss_mlp": 1.00114536, "epoch": 0.7623399266518367, "flos": 60684150447360.0, "grad_norm": 0.7965721282224352, "language_loss": 0.63755411, "learning_rate": 5.636552512446502e-07, "loss": 0.65801364, "num_input_tokens_seen": 136411375, "step": 6340, "time_per_iteration": 2.9715940952301025 }, { "auxiliary_loss_clip": 0.01147749, "auxiliary_loss_mlp": 0.0102425, "balance_loss_clip": 1.04569745, "balance_loss_mlp": 1.01706481, "epoch": 0.7624601695424758, "flos": 26468893641600.0, "grad_norm": 1.9332538891919064, "language_loss": 0.77745032, "learning_rate": 5.631133009075027e-07, "loss": 0.79917026, "num_input_tokens_seen": 136430560, "step": 6341, "time_per_iteration": 2.4662132263183594 }, { "auxiliary_loss_clip": 0.01154421, "auxiliary_loss_mlp": 0.00761352, "balance_loss_clip": 1.04762554, "balance_loss_mlp": 1.00031853, "epoch": 0.7625804124331149, "flos": 19135755515520.0, "grad_norm": 1.7510602788399283, "language_loss": 0.68527842, "learning_rate": 5.625715685449242e-07, "loss": 0.70443618, "num_input_tokens_seen": 136448665, "step": 6342, "time_per_iteration": 3.2139532566070557 }, { "auxiliary_loss_clip": 0.01128327, "auxiliary_loss_mlp": 0.01025456, "balance_loss_clip": 1.05097675, "balance_loss_mlp": 1.01882803, "epoch": 0.762700655323754, "flos": 26213101914240.0, "grad_norm": 1.629358366918023, "language_loss": 0.71679193, "learning_rate": 5.620300542390966e-07, "loss": 0.73832971, "num_input_tokens_seen": 136469710, "step": 6343, "time_per_iteration": 2.5413007736206055 }, { "auxiliary_loss_clip": 0.01135849, "auxiliary_loss_mlp": 0.01024264, "balance_loss_clip": 1.0427084, "balance_loss_mlp": 1.01771009, "epoch": 0.762820898214393, "flos": 22382582711040.0, "grad_norm": 1.9192033933468766, "language_loss": 0.849204, "learning_rate": 5.614887580721659e-07, "loss": 0.87080508, "num_input_tokens_seen": 136489855, "step": 6344, "time_per_iteration": 2.4717347621917725 }, { "auxiliary_loss_clip": 0.01117947, "auxiliary_loss_mlp": 0.0103225, "balance_loss_clip": 1.04618454, "balance_loss_mlp": 1.02439153, "epoch": 0.7629411411050322, "flos": 15700504550400.0, "grad_norm": 1.9564479352594817, "language_loss": 0.73763824, "learning_rate": 5.609476801262481e-07, "loss": 0.75914013, "num_input_tokens_seen": 136504715, "step": 6345, "time_per_iteration": 2.42869234085083 }, { "auxiliary_loss_clip": 0.01126714, "auxiliary_loss_mlp": 0.01027097, "balance_loss_clip": 1.04847682, "balance_loss_mlp": 1.01982534, "epoch": 0.7630613839956712, "flos": 13770342293760.0, "grad_norm": 2.292100234323005, "language_loss": 0.63951814, "learning_rate": 5.604068204834223e-07, "loss": 0.66105622, "num_input_tokens_seen": 136521610, "step": 6346, "time_per_iteration": 2.4694647789001465 }, { "auxiliary_loss_clip": 0.01111715, "auxiliary_loss_mlp": 0.00762354, "balance_loss_clip": 1.04542756, "balance_loss_mlp": 1.000283, "epoch": 0.7631816268863103, "flos": 14569569861120.0, "grad_norm": 2.215049450923147, "language_loss": 0.76720798, "learning_rate": 5.598661792257367e-07, "loss": 0.78594863, "num_input_tokens_seen": 136538655, "step": 6347, "time_per_iteration": 2.509542942047119 }, { "auxiliary_loss_clip": 0.01150963, "auxiliary_loss_mlp": 0.01024132, "balance_loss_clip": 1.04513907, "balance_loss_mlp": 1.01718175, "epoch": 0.7633018697769495, "flos": 19062210418560.0, "grad_norm": 1.9012677857481546, "language_loss": 0.75694811, "learning_rate": 5.593257564352071e-07, "loss": 0.77869904, "num_input_tokens_seen": 136557095, "step": 6348, "time_per_iteration": 2.4221768379211426 }, { "auxiliary_loss_clip": 0.01151656, "auxiliary_loss_mlp": 0.0102128, "balance_loss_clip": 1.047261, "balance_loss_mlp": 1.01425278, "epoch": 0.7634221126675885, "flos": 22052958577920.0, "grad_norm": 1.546631299463628, "language_loss": 0.75615633, "learning_rate": 5.58785552193815e-07, "loss": 0.77788568, "num_input_tokens_seen": 136577340, "step": 6349, "time_per_iteration": 2.4384586811065674 }, { "auxiliary_loss_clip": 0.01168528, "auxiliary_loss_mlp": 0.01022205, "balance_loss_clip": 1.04902315, "balance_loss_mlp": 1.01530015, "epoch": 0.7635423555582276, "flos": 29382720825600.0, "grad_norm": 2.1326665327485106, "language_loss": 0.75637043, "learning_rate": 5.582455665835086e-07, "loss": 0.77827775, "num_input_tokens_seen": 136597635, "step": 6350, "time_per_iteration": 2.4667088985443115 }, { "auxiliary_loss_clip": 0.01150137, "auxiliary_loss_mlp": 0.01030328, "balance_loss_clip": 1.04547262, "balance_loss_mlp": 1.02212048, "epoch": 0.7636625984488667, "flos": 17784903807360.0, "grad_norm": 2.596099468780283, "language_loss": 0.72555166, "learning_rate": 5.577057996862036e-07, "loss": 0.7473563, "num_input_tokens_seen": 136615260, "step": 6351, "time_per_iteration": 2.4519476890563965 }, { "auxiliary_loss_clip": 0.01163613, "auxiliary_loss_mlp": 0.01024034, "balance_loss_clip": 1.04783523, "balance_loss_mlp": 1.01721549, "epoch": 0.7637828413395058, "flos": 23734583654400.0, "grad_norm": 1.6057167603519746, "language_loss": 0.76335788, "learning_rate": 5.571662515837814e-07, "loss": 0.78523433, "num_input_tokens_seen": 136637220, "step": 6352, "time_per_iteration": 2.4916059970855713 }, { "auxiliary_loss_clip": 0.01138141, "auxiliary_loss_mlp": 0.01024031, "balance_loss_clip": 1.04600549, "balance_loss_mlp": 1.01711392, "epoch": 0.7639030842301449, "flos": 36283279461120.0, "grad_norm": 1.873407667167574, "language_loss": 0.83926988, "learning_rate": 5.566269223580926e-07, "loss": 0.86089158, "num_input_tokens_seen": 136658930, "step": 6353, "time_per_iteration": 2.601808786392212 }, { "auxiliary_loss_clip": 0.01156984, "auxiliary_loss_mlp": 0.01023895, "balance_loss_clip": 1.04849374, "balance_loss_mlp": 1.01662576, "epoch": 0.764023327120784, "flos": 28878104609280.0, "grad_norm": 1.8511146631553665, "language_loss": 0.75109136, "learning_rate": 5.560878120909511e-07, "loss": 0.77290016, "num_input_tokens_seen": 136681530, "step": 6354, "time_per_iteration": 2.5004525184631348 }, { "auxiliary_loss_clip": 0.01059698, "auxiliary_loss_mlp": 0.01002478, "balance_loss_clip": 1.01318908, "balance_loss_mlp": 1.0014708, "epoch": 0.7641435700114231, "flos": 64789711067520.0, "grad_norm": 0.8441767138699058, "language_loss": 0.58611226, "learning_rate": 5.55548920864141e-07, "loss": 0.60673404, "num_input_tokens_seen": 136742185, "step": 6355, "time_per_iteration": 3.04837703704834 }, { "auxiliary_loss_clip": 0.01155418, "auxiliary_loss_mlp": 0.01020966, "balance_loss_clip": 1.05129814, "balance_loss_mlp": 1.01443887, "epoch": 0.7642638129020621, "flos": 16835784785280.0, "grad_norm": 1.8017246285669568, "language_loss": 0.77789485, "learning_rate": 5.550102487594113e-07, "loss": 0.79965872, "num_input_tokens_seen": 136760855, "step": 6356, "time_per_iteration": 2.4332199096679688 }, { "auxiliary_loss_clip": 0.01114587, "auxiliary_loss_mlp": 0.00761259, "balance_loss_clip": 1.04167366, "balance_loss_mlp": 1.0003233, "epoch": 0.7643840557927013, "flos": 30408940391040.0, "grad_norm": 1.8794261144617133, "language_loss": 0.71704465, "learning_rate": 5.54471795858477e-07, "loss": 0.73580307, "num_input_tokens_seen": 136780925, "step": 6357, "time_per_iteration": 2.609037160873413 }, { "auxiliary_loss_clip": 0.01125005, "auxiliary_loss_mlp": 0.01028119, "balance_loss_clip": 1.04139495, "balance_loss_mlp": 1.02074564, "epoch": 0.7645042986833404, "flos": 16983234115200.0, "grad_norm": 1.9632775658355854, "language_loss": 0.82754779, "learning_rate": 5.539335622430235e-07, "loss": 0.84907901, "num_input_tokens_seen": 136799545, "step": 6358, "time_per_iteration": 2.483186960220337 }, { "auxiliary_loss_clip": 0.01146309, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 1.04414415, "balance_loss_mlp": 1.01919866, "epoch": 0.7646245415739794, "flos": 17311493531520.0, "grad_norm": 2.0609308329142393, "language_loss": 0.74821275, "learning_rate": 5.533955479946975e-07, "loss": 0.76994491, "num_input_tokens_seen": 136818325, "step": 6359, "time_per_iteration": 2.4492714405059814 }, { "auxiliary_loss_clip": 0.01034095, "auxiliary_loss_mlp": 0.00753077, "balance_loss_clip": 1.02201557, "balance_loss_mlp": 0.99990511, "epoch": 0.7647447844646186, "flos": 70402332666240.0, "grad_norm": 0.8732725267242816, "language_loss": 0.65782553, "learning_rate": 5.528577531951173e-07, "loss": 0.67569721, "num_input_tokens_seen": 136878730, "step": 6360, "time_per_iteration": 3.057543992996216 }, { "auxiliary_loss_clip": 0.01144509, "auxiliary_loss_mlp": 0.01022368, "balance_loss_clip": 1.04717374, "balance_loss_mlp": 1.01574552, "epoch": 0.7648650273552576, "flos": 17675914965120.0, "grad_norm": 2.466310627002327, "language_loss": 0.73924428, "learning_rate": 5.523201779258653e-07, "loss": 0.76091301, "num_input_tokens_seen": 136897705, "step": 6361, "time_per_iteration": 2.45358943939209 }, { "auxiliary_loss_clip": 0.01165745, "auxiliary_loss_mlp": 0.01023562, "balance_loss_clip": 1.04690981, "balance_loss_mlp": 1.0161891, "epoch": 0.7649852702458967, "flos": 22162019247360.0, "grad_norm": 1.8628793001611974, "language_loss": 0.84109479, "learning_rate": 5.517828222684912e-07, "loss": 0.86298788, "num_input_tokens_seen": 136918360, "step": 6362, "time_per_iteration": 2.4129037857055664 }, { "auxiliary_loss_clip": 0.0104627, "auxiliary_loss_mlp": 0.01001362, "balance_loss_clip": 1.01423407, "balance_loss_mlp": 1.0002774, "epoch": 0.7651055131365359, "flos": 69848338227840.0, "grad_norm": 0.765834522646588, "language_loss": 0.59067565, "learning_rate": 5.512456863045117e-07, "loss": 0.61115199, "num_input_tokens_seen": 136979050, "step": 6363, "time_per_iteration": 3.871473789215088 }, { "auxiliary_loss_clip": 0.01166817, "auxiliary_loss_mlp": 0.01026442, "balance_loss_clip": 1.04681683, "balance_loss_mlp": 1.01928055, "epoch": 0.7652257560271749, "flos": 19464014931840.0, "grad_norm": 2.3135936498083147, "language_loss": 0.74049056, "learning_rate": 5.507087701154089e-07, "loss": 0.76242316, "num_input_tokens_seen": 136998970, "step": 6364, "time_per_iteration": 3.1764132976531982 }, { "auxiliary_loss_clip": 0.01112698, "auxiliary_loss_mlp": 0.01026497, "balance_loss_clip": 1.04280519, "balance_loss_mlp": 1.01966918, "epoch": 0.765345998917814, "flos": 15961108700160.0, "grad_norm": 1.9229534114550089, "language_loss": 0.75533187, "learning_rate": 5.50172073782634e-07, "loss": 0.7767238, "num_input_tokens_seen": 137016950, "step": 6365, "time_per_iteration": 3.3357675075531006 }, { "auxiliary_loss_clip": 0.01123193, "auxiliary_loss_mlp": 0.01027664, "balance_loss_clip": 1.04585505, "balance_loss_mlp": 1.02054691, "epoch": 0.7654662418084531, "flos": 23659853408640.0, "grad_norm": 4.170047747101967, "language_loss": 0.87785631, "learning_rate": 5.496355973876023e-07, "loss": 0.89936483, "num_input_tokens_seen": 137036205, "step": 6366, "time_per_iteration": 2.5032167434692383 }, { "auxiliary_loss_clip": 0.01121971, "auxiliary_loss_mlp": 0.00762334, "balance_loss_clip": 1.043329, "balance_loss_mlp": 1.0003792, "epoch": 0.7655864846990922, "flos": 41463608878080.0, "grad_norm": 21.05141699631102, "language_loss": 0.70696187, "learning_rate": 5.490993410116984e-07, "loss": 0.72580487, "num_input_tokens_seen": 137059195, "step": 6367, "time_per_iteration": 2.689823627471924 }, { "auxiliary_loss_clip": 0.01123308, "auxiliary_loss_mlp": 0.01027936, "balance_loss_clip": 1.04692626, "balance_loss_mlp": 1.02109945, "epoch": 0.7657067275897312, "flos": 43142684088960.0, "grad_norm": 1.731143379743734, "language_loss": 0.69647586, "learning_rate": 5.485633047362704e-07, "loss": 0.71798825, "num_input_tokens_seen": 137081200, "step": 6368, "time_per_iteration": 2.697878837585449 }, { "auxiliary_loss_clip": 0.01174076, "auxiliary_loss_mlp": 0.01029737, "balance_loss_clip": 1.05254412, "balance_loss_mlp": 1.02226305, "epoch": 0.7658269704803703, "flos": 17311780840320.0, "grad_norm": 5.347963610685303, "language_loss": 0.78490973, "learning_rate": 5.480274886426341e-07, "loss": 0.80694783, "num_input_tokens_seen": 137097840, "step": 6369, "time_per_iteration": 3.1596245765686035 }, { "auxiliary_loss_clip": 0.01151102, "auxiliary_loss_mlp": 0.01022818, "balance_loss_clip": 1.04919469, "balance_loss_mlp": 1.01610088, "epoch": 0.7659472133710095, "flos": 12568160977920.0, "grad_norm": 1.9345798010231363, "language_loss": 0.77938432, "learning_rate": 5.474918928120744e-07, "loss": 0.8011235, "num_input_tokens_seen": 137114335, "step": 6370, "time_per_iteration": 2.4045112133026123 }, { "auxiliary_loss_clip": 0.0115023, "auxiliary_loss_mlp": 0.01021179, "balance_loss_clip": 1.0463264, "balance_loss_mlp": 1.01470876, "epoch": 0.7660674562616485, "flos": 22707430335360.0, "grad_norm": 1.9443593274740107, "language_loss": 0.87434733, "learning_rate": 5.469565173258392e-07, "loss": 0.89606148, "num_input_tokens_seen": 137132850, "step": 6371, "time_per_iteration": 2.441770076751709 }, { "auxiliary_loss_clip": 0.01170854, "auxiliary_loss_mlp": 0.01028128, "balance_loss_clip": 1.04856074, "balance_loss_mlp": 1.02049899, "epoch": 0.7661876991522876, "flos": 17056455989760.0, "grad_norm": 1.7071123412324896, "language_loss": 0.63544959, "learning_rate": 5.464213622651454e-07, "loss": 0.65743947, "num_input_tokens_seen": 137150665, "step": 6372, "time_per_iteration": 2.415595531463623 }, { "auxiliary_loss_clip": 0.01131167, "auxiliary_loss_mlp": 0.01026077, "balance_loss_clip": 1.04567599, "balance_loss_mlp": 1.01888835, "epoch": 0.7663079420429267, "flos": 20084228092800.0, "grad_norm": 1.6241856462781512, "language_loss": 0.84267974, "learning_rate": 5.458864277111753e-07, "loss": 0.86425221, "num_input_tokens_seen": 137168500, "step": 6373, "time_per_iteration": 2.4927902221679688 }, { "auxiliary_loss_clip": 0.01131159, "auxiliary_loss_mlp": 0.0076095, "balance_loss_clip": 1.04256582, "balance_loss_mlp": 1.00027359, "epoch": 0.7664281849335658, "flos": 12677473042560.0, "grad_norm": 2.3540503747733594, "language_loss": 0.69292229, "learning_rate": 5.453517137450769e-07, "loss": 0.71184337, "num_input_tokens_seen": 137185075, "step": 6374, "time_per_iteration": 2.4372057914733887 }, { "auxiliary_loss_clip": 0.01152503, "auxiliary_loss_mlp": 0.01025327, "balance_loss_clip": 1.04839587, "balance_loss_mlp": 1.01791263, "epoch": 0.7665484278242048, "flos": 22345271458560.0, "grad_norm": 2.0147409502210394, "language_loss": 0.75881517, "learning_rate": 5.448172204479684e-07, "loss": 0.78059345, "num_input_tokens_seen": 137204355, "step": 6375, "time_per_iteration": 2.4697751998901367 }, { "auxiliary_loss_clip": 0.01164884, "auxiliary_loss_mlp": 0.01025059, "balance_loss_clip": 1.04755449, "balance_loss_mlp": 1.01797748, "epoch": 0.766668670714844, "flos": 23617909301760.0, "grad_norm": 1.9338781180831894, "language_loss": 0.7459054, "learning_rate": 5.442829479009294e-07, "loss": 0.76780474, "num_input_tokens_seen": 137223135, "step": 6376, "time_per_iteration": 2.409158945083618 }, { "auxiliary_loss_clip": 0.01159459, "auxiliary_loss_mlp": 0.01027574, "balance_loss_clip": 1.04668212, "balance_loss_mlp": 1.01988459, "epoch": 0.7667889136054831, "flos": 19427134642560.0, "grad_norm": 2.392654489155968, "language_loss": 0.71623802, "learning_rate": 5.437488961850103e-07, "loss": 0.73810834, "num_input_tokens_seen": 137242935, "step": 6377, "time_per_iteration": 2.4200100898742676 }, { "auxiliary_loss_clip": 0.01106178, "auxiliary_loss_mlp": 0.01024907, "balance_loss_clip": 1.04220891, "balance_loss_mlp": 1.01852059, "epoch": 0.7669091564961221, "flos": 26866352609280.0, "grad_norm": 1.7484663900688924, "language_loss": 0.75362182, "learning_rate": 5.432150653812258e-07, "loss": 0.77493268, "num_input_tokens_seen": 137262970, "step": 6378, "time_per_iteration": 2.573922872543335 }, { "auxiliary_loss_clip": 0.01150978, "auxiliary_loss_mlp": 0.01025203, "balance_loss_clip": 1.04763174, "balance_loss_mlp": 1.01786304, "epoch": 0.7670293993867613, "flos": 12385303816320.0, "grad_norm": 2.564257105109169, "language_loss": 0.82675385, "learning_rate": 5.42681455570557e-07, "loss": 0.84851563, "num_input_tokens_seen": 137279500, "step": 6379, "time_per_iteration": 2.4332940578460693 }, { "auxiliary_loss_clip": 0.01163439, "auxiliary_loss_mlp": 0.01022548, "balance_loss_clip": 1.04626977, "balance_loss_mlp": 1.01560998, "epoch": 0.7671496422774003, "flos": 21762944167680.0, "grad_norm": 1.991398931282379, "language_loss": 0.64638889, "learning_rate": 5.42148066833954e-07, "loss": 0.66824877, "num_input_tokens_seen": 137298745, "step": 6380, "time_per_iteration": 2.394430637359619 }, { "auxiliary_loss_clip": 0.01165104, "auxiliary_loss_mlp": 0.01024203, "balance_loss_clip": 1.04786932, "balance_loss_mlp": 1.01716423, "epoch": 0.7672698851680394, "flos": 21069221823360.0, "grad_norm": 2.2928881409693727, "language_loss": 0.75220078, "learning_rate": 5.416148992523289e-07, "loss": 0.77409387, "num_input_tokens_seen": 137317320, "step": 6381, "time_per_iteration": 2.3966023921966553 }, { "auxiliary_loss_clip": 0.01081798, "auxiliary_loss_mlp": 0.01025551, "balance_loss_clip": 1.0407145, "balance_loss_mlp": 1.01870561, "epoch": 0.7673901280586786, "flos": 16976697840000.0, "grad_norm": 2.3913270229965367, "language_loss": 0.784832, "learning_rate": 5.410819529065644e-07, "loss": 0.80590546, "num_input_tokens_seen": 137335275, "step": 6382, "time_per_iteration": 2.564643144607544 }, { "auxiliary_loss_clip": 0.01110222, "auxiliary_loss_mlp": 0.01022767, "balance_loss_clip": 1.04181111, "balance_loss_mlp": 1.0159874, "epoch": 0.7675103709493176, "flos": 29242669697280.0, "grad_norm": 2.0581881229278545, "language_loss": 0.65522915, "learning_rate": 5.405492278775079e-07, "loss": 0.67655903, "num_input_tokens_seen": 137355055, "step": 6383, "time_per_iteration": 2.597848415374756 }, { "auxiliary_loss_clip": 0.01139524, "auxiliary_loss_mlp": 0.01026378, "balance_loss_clip": 1.04418361, "balance_loss_mlp": 1.01903796, "epoch": 0.7676306138399567, "flos": 29023004073600.0, "grad_norm": 2.4440017688204447, "language_loss": 0.79515409, "learning_rate": 5.400167242459732e-07, "loss": 0.81681311, "num_input_tokens_seen": 137374015, "step": 6384, "time_per_iteration": 2.5313050746917725 }, { "auxiliary_loss_clip": 0.01150861, "auxiliary_loss_mlp": 0.01027705, "balance_loss_clip": 1.04624724, "balance_loss_mlp": 1.02072835, "epoch": 0.7677508567305958, "flos": 22565116650240.0, "grad_norm": 1.7362755035977755, "language_loss": 0.80614436, "learning_rate": 5.394844420927405e-07, "loss": 0.82793003, "num_input_tokens_seen": 137393625, "step": 6385, "time_per_iteration": 2.4353082180023193 }, { "auxiliary_loss_clip": 0.01165397, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.04734337, "balance_loss_mlp": 1.02205706, "epoch": 0.7678710996212349, "flos": 25411432222080.0, "grad_norm": 2.024455177025597, "language_loss": 0.73396903, "learning_rate": 5.389523814985562e-07, "loss": 0.75591576, "num_input_tokens_seen": 137413045, "step": 6386, "time_per_iteration": 2.427029848098755 }, { "auxiliary_loss_clip": 0.01111857, "auxiliary_loss_mlp": 0.01023286, "balance_loss_clip": 1.043805, "balance_loss_mlp": 1.01593113, "epoch": 0.767991342511874, "flos": 26756825063040.0, "grad_norm": 2.589964151125589, "language_loss": 0.76044774, "learning_rate": 5.384205425441344e-07, "loss": 0.78179914, "num_input_tokens_seen": 137433955, "step": 6387, "time_per_iteration": 2.579573631286621 }, { "auxiliary_loss_clip": 0.01139821, "auxiliary_loss_mlp": 0.01022253, "balance_loss_clip": 1.04385543, "balance_loss_mlp": 1.01521337, "epoch": 0.7681115854025131, "flos": 26359509749760.0, "grad_norm": 1.8126868902802877, "language_loss": 0.8405416, "learning_rate": 5.378889253101537e-07, "loss": 0.86216235, "num_input_tokens_seen": 137454510, "step": 6388, "time_per_iteration": 2.5149872303009033 }, { "auxiliary_loss_clip": 0.01153351, "auxiliary_loss_mlp": 0.01022075, "balance_loss_clip": 1.04650307, "balance_loss_mlp": 1.01528931, "epoch": 0.7682318282931522, "flos": 23257043314560.0, "grad_norm": 1.6333737764074625, "language_loss": 0.81280822, "learning_rate": 5.373575298772617e-07, "loss": 0.83456242, "num_input_tokens_seen": 137473630, "step": 6389, "time_per_iteration": 2.4467570781707764 }, { "auxiliary_loss_clip": 0.01059106, "auxiliary_loss_mlp": 0.01001136, "balance_loss_clip": 1.01214981, "balance_loss_mlp": 1.00000322, "epoch": 0.7683520711837912, "flos": 70072457137920.0, "grad_norm": 0.7662733122752389, "language_loss": 0.61310136, "learning_rate": 5.368263563260689e-07, "loss": 0.63370383, "num_input_tokens_seen": 137538765, "step": 6390, "time_per_iteration": 3.9423651695251465 }, { "auxiliary_loss_clip": 0.01153938, "auxiliary_loss_mlp": 0.01022762, "balance_loss_clip": 1.04655004, "balance_loss_mlp": 1.01564562, "epoch": 0.7684723140744304, "flos": 18624890332800.0, "grad_norm": 1.4744298952076518, "language_loss": 0.64075053, "learning_rate": 5.362954047371537e-07, "loss": 0.66251755, "num_input_tokens_seen": 137557875, "step": 6391, "time_per_iteration": 4.062402009963989 }, { "auxiliary_loss_clip": 0.01128922, "auxiliary_loss_mlp": 0.01027103, "balance_loss_clip": 1.05053234, "balance_loss_mlp": 1.01964617, "epoch": 0.7685925569650695, "flos": 27452989532160.0, "grad_norm": 2.862261316174651, "language_loss": 0.72164237, "learning_rate": 5.357646751910627e-07, "loss": 0.74320257, "num_input_tokens_seen": 137579055, "step": 6392, "time_per_iteration": 2.5438971519470215 }, { "auxiliary_loss_clip": 0.0113704, "auxiliary_loss_mlp": 0.0103107, "balance_loss_clip": 1.04458559, "balance_loss_mlp": 1.02365828, "epoch": 0.7687127998557085, "flos": 24535714642560.0, "grad_norm": 2.5933805690484544, "language_loss": 0.79862297, "learning_rate": 5.352341677683061e-07, "loss": 0.8203041, "num_input_tokens_seen": 137600355, "step": 6393, "time_per_iteration": 2.5085439682006836 }, { "auxiliary_loss_clip": 0.01132052, "auxiliary_loss_mlp": 0.010278, "balance_loss_clip": 1.04546452, "balance_loss_mlp": 1.02042115, "epoch": 0.7688330427463477, "flos": 25155963717120.0, "grad_norm": 1.8067173779569732, "language_loss": 0.7894783, "learning_rate": 5.347038825493617e-07, "loss": 0.81107688, "num_input_tokens_seen": 137621885, "step": 6394, "time_per_iteration": 2.5423941612243652 }, { "auxiliary_loss_clip": 0.01136182, "auxiliary_loss_mlp": 0.01024991, "balance_loss_clip": 1.04805613, "balance_loss_mlp": 1.01808882, "epoch": 0.7689532856369867, "flos": 21211284113280.0, "grad_norm": 2.348601471180008, "language_loss": 0.68609875, "learning_rate": 5.341738196146732e-07, "loss": 0.7077105, "num_input_tokens_seen": 137640230, "step": 6395, "time_per_iteration": 2.462198257446289 }, { "auxiliary_loss_clip": 0.01148984, "auxiliary_loss_mlp": 0.01022085, "balance_loss_clip": 1.04504156, "balance_loss_mlp": 1.01493239, "epoch": 0.7690735285276258, "flos": 25119083427840.0, "grad_norm": 2.3378419669585626, "language_loss": 0.73600239, "learning_rate": 5.336439790446503e-07, "loss": 0.75771308, "num_input_tokens_seen": 137659330, "step": 6396, "time_per_iteration": 3.214081048965454 }, { "auxiliary_loss_clip": 0.01119193, "auxiliary_loss_mlp": 0.01026499, "balance_loss_clip": 1.04023659, "balance_loss_mlp": 1.01849461, "epoch": 0.769193771418265, "flos": 54744020640000.0, "grad_norm": 1.7038441912025504, "language_loss": 0.62881178, "learning_rate": 5.331143609196711e-07, "loss": 0.65026867, "num_input_tokens_seen": 137683145, "step": 6397, "time_per_iteration": 2.8088443279266357 }, { "auxiliary_loss_clip": 0.01153538, "auxiliary_loss_mlp": 0.01024576, "balance_loss_clip": 1.04888165, "balance_loss_mlp": 1.01738739, "epoch": 0.769314014308904, "flos": 37341890115840.0, "grad_norm": 1.6713104968828993, "language_loss": 0.76808703, "learning_rate": 5.325849653200758e-07, "loss": 0.78986812, "num_input_tokens_seen": 137707095, "step": 6398, "time_per_iteration": 2.5601727962493896 }, { "auxiliary_loss_clip": 0.0116715, "auxiliary_loss_mlp": 0.01023884, "balance_loss_clip": 1.0490346, "balance_loss_mlp": 1.01664782, "epoch": 0.7694342571995431, "flos": 20631686256000.0, "grad_norm": 1.6415384007702871, "language_loss": 0.76343262, "learning_rate": 5.32055792326175e-07, "loss": 0.78534299, "num_input_tokens_seen": 137725520, "step": 6399, "time_per_iteration": 2.4432456493377686 }, { "auxiliary_loss_clip": 0.01141757, "auxiliary_loss_mlp": 0.01021077, "balance_loss_clip": 1.04749393, "balance_loss_mlp": 1.01371002, "epoch": 0.7695545000901821, "flos": 24207706621440.0, "grad_norm": 2.3435896626514996, "language_loss": 0.73139322, "learning_rate": 5.315268420182437e-07, "loss": 0.75302148, "num_input_tokens_seen": 137744195, "step": 6400, "time_per_iteration": 2.4782979488372803 }, { "auxiliary_loss_clip": 0.01129143, "auxiliary_loss_mlp": 0.00761597, "balance_loss_clip": 1.04450679, "balance_loss_mlp": 1.00037694, "epoch": 0.7696747429808213, "flos": 28001273708160.0, "grad_norm": 1.7500450904575275, "language_loss": 0.76619947, "learning_rate": 5.309981144765221e-07, "loss": 0.7851069, "num_input_tokens_seen": 137764340, "step": 6401, "time_per_iteration": 2.6107680797576904 }, { "auxiliary_loss_clip": 0.01115922, "auxiliary_loss_mlp": 0.01020622, "balance_loss_clip": 1.04253924, "balance_loss_mlp": 1.01420283, "epoch": 0.7697949858714603, "flos": 11509550323200.0, "grad_norm": 3.108236240830104, "language_loss": 0.75308383, "learning_rate": 5.304696097812196e-07, "loss": 0.77444929, "num_input_tokens_seen": 137780940, "step": 6402, "time_per_iteration": 2.5105855464935303 }, { "auxiliary_loss_clip": 0.01135856, "auxiliary_loss_mlp": 0.01029401, "balance_loss_clip": 1.04330397, "balance_loss_mlp": 1.02156341, "epoch": 0.7699152287620994, "flos": 26688271956480.0, "grad_norm": 3.265144608985128, "language_loss": 0.60605782, "learning_rate": 5.299413280125078e-07, "loss": 0.62771046, "num_input_tokens_seen": 137799250, "step": 6403, "time_per_iteration": 2.5085442066192627 }, { "auxiliary_loss_clip": 0.01139339, "auxiliary_loss_mlp": 0.01029175, "balance_loss_clip": 1.04530358, "balance_loss_mlp": 1.02209449, "epoch": 0.7700354716527386, "flos": 16544944362240.0, "grad_norm": 2.181327075446862, "language_loss": 0.72531736, "learning_rate": 5.294132692505284e-07, "loss": 0.74700254, "num_input_tokens_seen": 137817660, "step": 6404, "time_per_iteration": 2.466015338897705 }, { "auxiliary_loss_clip": 0.01101995, "auxiliary_loss_mlp": 0.01025704, "balance_loss_clip": 1.04092407, "balance_loss_mlp": 1.01808906, "epoch": 0.7701557145433776, "flos": 19242733196160.0, "grad_norm": 1.8960630050847007, "language_loss": 0.79147637, "learning_rate": 5.288854335753861e-07, "loss": 0.81275332, "num_input_tokens_seen": 137835920, "step": 6405, "time_per_iteration": 2.5346932411193848 }, { "auxiliary_loss_clip": 0.01152231, "auxiliary_loss_mlp": 0.01020601, "balance_loss_clip": 1.044837, "balance_loss_mlp": 1.01351082, "epoch": 0.7702759574340167, "flos": 31685744211840.0, "grad_norm": 2.386096919592745, "language_loss": 0.75552845, "learning_rate": 5.283578210671551e-07, "loss": 0.77725673, "num_input_tokens_seen": 137858160, "step": 6406, "time_per_iteration": 2.52358078956604 }, { "auxiliary_loss_clip": 0.01141899, "auxiliary_loss_mlp": 0.01021041, "balance_loss_clip": 1.04589963, "balance_loss_mlp": 1.01400757, "epoch": 0.7703962003246558, "flos": 16800089644800.0, "grad_norm": 1.9971462237839832, "language_loss": 0.76459974, "learning_rate": 5.278304318058719e-07, "loss": 0.78622913, "num_input_tokens_seen": 137876015, "step": 6407, "time_per_iteration": 2.442455768585205 }, { "auxiliary_loss_clip": 0.01097723, "auxiliary_loss_mlp": 0.01027533, "balance_loss_clip": 1.04268956, "balance_loss_mlp": 1.01976061, "epoch": 0.7705164432152949, "flos": 35736072693120.0, "grad_norm": 1.9523814734497202, "language_loss": 0.79077709, "learning_rate": 5.273032658715411e-07, "loss": 0.81202972, "num_input_tokens_seen": 137898825, "step": 6408, "time_per_iteration": 2.6898953914642334 }, { "auxiliary_loss_clip": 0.01107563, "auxiliary_loss_mlp": 0.01023865, "balance_loss_clip": 1.04177475, "balance_loss_mlp": 1.01662326, "epoch": 0.7706366861059339, "flos": 23365960329600.0, "grad_norm": 2.030378492407595, "language_loss": 0.76631641, "learning_rate": 5.267763233441347e-07, "loss": 0.7876308, "num_input_tokens_seen": 137919455, "step": 6409, "time_per_iteration": 2.551865339279175 }, { "auxiliary_loss_clip": 0.0115581, "auxiliary_loss_mlp": 0.01022356, "balance_loss_clip": 1.04747939, "balance_loss_mlp": 1.01460743, "epoch": 0.7707569289965731, "flos": 22929897219840.0, "grad_norm": 3.1573725336824485, "language_loss": 0.6991812, "learning_rate": 5.26249604303588e-07, "loss": 0.72096288, "num_input_tokens_seen": 137937960, "step": 6410, "time_per_iteration": 2.443208694458008 }, { "auxiliary_loss_clip": 0.01166478, "auxiliary_loss_mlp": 0.01025082, "balance_loss_clip": 1.04868722, "balance_loss_mlp": 1.01802516, "epoch": 0.7708771718872122, "flos": 17420661941760.0, "grad_norm": 2.1389885977442047, "language_loss": 0.78599912, "learning_rate": 5.257231088298057e-07, "loss": 0.80791473, "num_input_tokens_seen": 137956370, "step": 6411, "time_per_iteration": 2.3761322498321533 }, { "auxiliary_loss_clip": 0.01032962, "auxiliary_loss_mlp": 0.01001088, "balance_loss_clip": 1.01138186, "balance_loss_mlp": 0.99999762, "epoch": 0.7709974147778512, "flos": 72241316248320.0, "grad_norm": 0.7985975284489502, "language_loss": 0.53935266, "learning_rate": 5.25196837002655e-07, "loss": 0.55969322, "num_input_tokens_seen": 138016080, "step": 6412, "time_per_iteration": 3.1264119148254395 }, { "auxiliary_loss_clip": 0.01136337, "auxiliary_loss_mlp": 0.01033831, "balance_loss_clip": 1.04465556, "balance_loss_mlp": 1.0263474, "epoch": 0.7711176576684904, "flos": 39859694876160.0, "grad_norm": 2.687812407898376, "language_loss": 0.68508303, "learning_rate": 5.24670788901971e-07, "loss": 0.70678473, "num_input_tokens_seen": 138039170, "step": 6413, "time_per_iteration": 2.6299617290496826 }, { "auxiliary_loss_clip": 0.01138367, "auxiliary_loss_mlp": 0.01026295, "balance_loss_clip": 1.04648781, "balance_loss_mlp": 1.01796854, "epoch": 0.7712379005591294, "flos": 36976391274240.0, "grad_norm": 2.625574801016011, "language_loss": 0.68696755, "learning_rate": 5.241449646075557e-07, "loss": 0.70861423, "num_input_tokens_seen": 138062395, "step": 6414, "time_per_iteration": 2.5986409187316895 }, { "auxiliary_loss_clip": 0.01161972, "auxiliary_loss_mlp": 0.01026753, "balance_loss_clip": 1.0483228, "balance_loss_mlp": 1.0195111, "epoch": 0.7713581434497685, "flos": 22776773541120.0, "grad_norm": 2.105796709133199, "language_loss": 0.72661221, "learning_rate": 5.236193641991762e-07, "loss": 0.74849951, "num_input_tokens_seen": 138080325, "step": 6415, "time_per_iteration": 2.4423716068267822 }, { "auxiliary_loss_clip": 0.01137557, "auxiliary_loss_mlp": 0.01024376, "balance_loss_clip": 1.04565716, "balance_loss_mlp": 1.01741719, "epoch": 0.7714783863404077, "flos": 24097460803200.0, "grad_norm": 2.363846756376933, "language_loss": 0.70127535, "learning_rate": 5.23093987756565e-07, "loss": 0.72289467, "num_input_tokens_seen": 138099020, "step": 6416, "time_per_iteration": 3.344449281692505 }, { "auxiliary_loss_clip": 0.01128316, "auxiliary_loss_mlp": 0.01026605, "balance_loss_clip": 1.04139125, "balance_loss_mlp": 1.019104, "epoch": 0.7715986292310467, "flos": 21063655215360.0, "grad_norm": 1.8919636641419346, "language_loss": 0.75455183, "learning_rate": 5.225688353594217e-07, "loss": 0.77610105, "num_input_tokens_seen": 138118650, "step": 6417, "time_per_iteration": 3.261704683303833 }, { "auxiliary_loss_clip": 0.01143051, "auxiliary_loss_mlp": 0.0076162, "balance_loss_clip": 1.04709172, "balance_loss_mlp": 1.0003562, "epoch": 0.7717188721216858, "flos": 20594877793920.0, "grad_norm": 2.5151830443973946, "language_loss": 0.77783799, "learning_rate": 5.220439070874108e-07, "loss": 0.79688466, "num_input_tokens_seen": 138137890, "step": 6418, "time_per_iteration": 3.306267738342285 }, { "auxiliary_loss_clip": 0.01153546, "auxiliary_loss_mlp": 0.01026658, "balance_loss_clip": 1.04918706, "balance_loss_mlp": 1.01971447, "epoch": 0.7718391150123249, "flos": 26250951870720.0, "grad_norm": 1.8273201541943596, "language_loss": 0.71487725, "learning_rate": 5.215192030201652e-07, "loss": 0.73667926, "num_input_tokens_seen": 138158880, "step": 6419, "time_per_iteration": 2.4934487342834473 }, { "auxiliary_loss_clip": 0.01110969, "auxiliary_loss_mlp": 0.01026385, "balance_loss_clip": 1.03986096, "balance_loss_mlp": 1.01911354, "epoch": 0.771959357902964, "flos": 22049762267520.0, "grad_norm": 2.124705495135433, "language_loss": 0.86177957, "learning_rate": 5.209947232372798e-07, "loss": 0.88315308, "num_input_tokens_seen": 138176370, "step": 6420, "time_per_iteration": 2.4981281757354736 }, { "auxiliary_loss_clip": 0.01154832, "auxiliary_loss_mlp": 0.00761693, "balance_loss_clip": 1.04481959, "balance_loss_mlp": 1.00033069, "epoch": 0.772079600793603, "flos": 30446000248320.0, "grad_norm": 1.711911543446483, "language_loss": 0.81308931, "learning_rate": 5.204704678183196e-07, "loss": 0.83225459, "num_input_tokens_seen": 138195105, "step": 6421, "time_per_iteration": 2.511103391647339 }, { "auxiliary_loss_clip": 0.01167727, "auxiliary_loss_mlp": 0.01023517, "balance_loss_clip": 1.04906154, "balance_loss_mlp": 1.01585793, "epoch": 0.7721998436842422, "flos": 12969857750400.0, "grad_norm": 1.8407963179988214, "language_loss": 0.85071194, "learning_rate": 5.19946436842813e-07, "loss": 0.87262434, "num_input_tokens_seen": 138212235, "step": 6422, "time_per_iteration": 3.145216703414917 }, { "auxiliary_loss_clip": 0.01129394, "auxiliary_loss_mlp": 0.01022093, "balance_loss_clip": 1.04907441, "balance_loss_mlp": 1.01520836, "epoch": 0.7723200865748813, "flos": 32635509678720.0, "grad_norm": 1.7149964127767598, "language_loss": 0.68343997, "learning_rate": 5.194226303902546e-07, "loss": 0.70495486, "num_input_tokens_seen": 138231970, "step": 6423, "time_per_iteration": 2.592524528503418 }, { "auxiliary_loss_clip": 0.01136572, "auxiliary_loss_mlp": 0.0102629, "balance_loss_clip": 1.04452682, "balance_loss_mlp": 1.01940012, "epoch": 0.7724403294655203, "flos": 21105707063040.0, "grad_norm": 1.8414754019365667, "language_loss": 0.71045929, "learning_rate": 5.188990485401072e-07, "loss": 0.73208791, "num_input_tokens_seen": 138251175, "step": 6424, "time_per_iteration": 2.4589998722076416 }, { "auxiliary_loss_clip": 0.01153004, "auxiliary_loss_mlp": 0.01020877, "balance_loss_clip": 1.04669023, "balance_loss_mlp": 1.01413012, "epoch": 0.7725605723561595, "flos": 22090736707200.0, "grad_norm": 1.873710553877904, "language_loss": 0.85851347, "learning_rate": 5.183756913717954e-07, "loss": 0.88025224, "num_input_tokens_seen": 138270950, "step": 6425, "time_per_iteration": 2.457789897918701 }, { "auxiliary_loss_clip": 0.01134962, "auxiliary_loss_mlp": 0.01029954, "balance_loss_clip": 1.04533386, "balance_loss_mlp": 1.02292693, "epoch": 0.7726808152467985, "flos": 34495610457600.0, "grad_norm": 2.513988629670602, "language_loss": 0.73195076, "learning_rate": 5.178525589647136e-07, "loss": 0.75359988, "num_input_tokens_seen": 138292590, "step": 6426, "time_per_iteration": 2.5548133850097656 }, { "auxiliary_loss_clip": 0.01142894, "auxiliary_loss_mlp": 0.01023533, "balance_loss_clip": 1.04459167, "balance_loss_mlp": 1.01694965, "epoch": 0.7728010581374376, "flos": 22306344094080.0, "grad_norm": 1.7305743299857412, "language_loss": 0.78915191, "learning_rate": 5.173296513982197e-07, "loss": 0.81081617, "num_input_tokens_seen": 138311115, "step": 6427, "time_per_iteration": 2.4777238368988037 }, { "auxiliary_loss_clip": 0.01134783, "auxiliary_loss_mlp": 0.01027905, "balance_loss_clip": 1.04711282, "balance_loss_mlp": 1.01950645, "epoch": 0.7729213010280768, "flos": 27126453968640.0, "grad_norm": 1.9938340598284112, "language_loss": 0.64725649, "learning_rate": 5.168069687516398e-07, "loss": 0.66888338, "num_input_tokens_seen": 138330885, "step": 6428, "time_per_iteration": 2.551393985748291 }, { "auxiliary_loss_clip": 0.01140267, "auxiliary_loss_mlp": 0.01021382, "balance_loss_clip": 1.04872108, "balance_loss_mlp": 1.01407135, "epoch": 0.7730415439187158, "flos": 18150223080960.0, "grad_norm": 1.8176777061761609, "language_loss": 0.71884924, "learning_rate": 5.16284511104263e-07, "loss": 0.7404657, "num_input_tokens_seen": 138350020, "step": 6429, "time_per_iteration": 2.4699227809906006 }, { "auxiliary_loss_clip": 0.01137933, "auxiliary_loss_mlp": 0.01025744, "balance_loss_clip": 1.04636443, "balance_loss_mlp": 1.01819777, "epoch": 0.7731617868093549, "flos": 11947480940160.0, "grad_norm": 2.564977626660824, "language_loss": 0.80906701, "learning_rate": 5.157622785353457e-07, "loss": 0.83070374, "num_input_tokens_seen": 138368135, "step": 6430, "time_per_iteration": 2.431574821472168 }, { "auxiliary_loss_clip": 0.01057661, "auxiliary_loss_mlp": 0.0100141, "balance_loss_clip": 1.01181853, "balance_loss_mlp": 1.00029504, "epoch": 0.7732820296999939, "flos": 64201027069440.0, "grad_norm": 0.6419605017811381, "language_loss": 0.60350251, "learning_rate": 5.152402711241113e-07, "loss": 0.62409329, "num_input_tokens_seen": 138436040, "step": 6431, "time_per_iteration": 3.090681791305542 }, { "auxiliary_loss_clip": 0.01118897, "auxiliary_loss_mlp": 0.01021826, "balance_loss_clip": 1.03975129, "balance_loss_mlp": 1.01476574, "epoch": 0.7734022725906331, "flos": 25302191984640.0, "grad_norm": 1.810689450895245, "language_loss": 0.8296504, "learning_rate": 5.147184889497465e-07, "loss": 0.85105759, "num_input_tokens_seen": 138455510, "step": 6432, "time_per_iteration": 2.5600802898406982 }, { "auxiliary_loss_clip": 0.01116893, "auxiliary_loss_mlp": 0.01024007, "balance_loss_clip": 1.04305601, "balance_loss_mlp": 1.01667523, "epoch": 0.7735225154812722, "flos": 17347440067200.0, "grad_norm": 2.6104377427408134, "language_loss": 0.8023839, "learning_rate": 5.141969320914072e-07, "loss": 0.82379293, "num_input_tokens_seen": 138473015, "step": 6433, "time_per_iteration": 2.4652419090270996 }, { "auxiliary_loss_clip": 0.01169906, "auxiliary_loss_mlp": 0.0102564, "balance_loss_clip": 1.04759467, "balance_loss_mlp": 1.01796913, "epoch": 0.7736427583719112, "flos": 32630086725120.0, "grad_norm": 2.1434589849239587, "language_loss": 0.62281764, "learning_rate": 5.136756006282113e-07, "loss": 0.64477313, "num_input_tokens_seen": 138491680, "step": 6434, "time_per_iteration": 2.4845428466796875 }, { "auxiliary_loss_clip": 0.01168518, "auxiliary_loss_mlp": 0.01025235, "balance_loss_clip": 1.04943657, "balance_loss_mlp": 1.0178616, "epoch": 0.7737630012625504, "flos": 19860073269120.0, "grad_norm": 2.137264941480943, "language_loss": 0.85020149, "learning_rate": 5.131544946392446e-07, "loss": 0.87213904, "num_input_tokens_seen": 138506960, "step": 6435, "time_per_iteration": 2.3737845420837402 }, { "auxiliary_loss_clip": 0.01143483, "auxiliary_loss_mlp": 0.01025696, "balance_loss_clip": 1.05161285, "balance_loss_mlp": 1.01839757, "epoch": 0.7738832441531894, "flos": 36022639397760.0, "grad_norm": 2.589393978676421, "language_loss": 0.6365037, "learning_rate": 5.126336142035592e-07, "loss": 0.6581955, "num_input_tokens_seen": 138526995, "step": 6436, "time_per_iteration": 2.6106059551239014 }, { "auxiliary_loss_clip": 0.01138641, "auxiliary_loss_mlp": 0.01025322, "balance_loss_clip": 1.04440117, "balance_loss_mlp": 1.01779437, "epoch": 0.7740034870438285, "flos": 13405274415360.0, "grad_norm": 2.814142197144034, "language_loss": 0.72009242, "learning_rate": 5.121129594001721e-07, "loss": 0.74173212, "num_input_tokens_seen": 138541260, "step": 6437, "time_per_iteration": 2.4573123455047607 }, { "auxiliary_loss_clip": 0.01153181, "auxiliary_loss_mlp": 0.01025154, "balance_loss_clip": 1.04819942, "balance_loss_mlp": 1.01763141, "epoch": 0.7741237299344677, "flos": 22086714384000.0, "grad_norm": 1.5735089788083296, "language_loss": 0.80978227, "learning_rate": 5.115925303080661e-07, "loss": 0.83156556, "num_input_tokens_seen": 138560970, "step": 6438, "time_per_iteration": 2.476163387298584 }, { "auxiliary_loss_clip": 0.01141201, "auxiliary_loss_mlp": 0.01025642, "balance_loss_clip": 1.0459826, "balance_loss_mlp": 1.01894593, "epoch": 0.7742439728251067, "flos": 19864777950720.0, "grad_norm": 2.1021872459072335, "language_loss": 0.79231328, "learning_rate": 5.110723270061899e-07, "loss": 0.81398171, "num_input_tokens_seen": 138577460, "step": 6439, "time_per_iteration": 2.441087245941162 }, { "auxiliary_loss_clip": 0.01163282, "auxiliary_loss_mlp": 0.01023818, "balance_loss_clip": 1.04651213, "balance_loss_mlp": 1.01740479, "epoch": 0.7743642157157458, "flos": 16690167048960.0, "grad_norm": 2.005066207805757, "language_loss": 0.79437143, "learning_rate": 5.105523495734572e-07, "loss": 0.81624246, "num_input_tokens_seen": 138594860, "step": 6440, "time_per_iteration": 2.3974616527557373 }, { "auxiliary_loss_clip": 0.0116725, "auxiliary_loss_mlp": 0.01028219, "balance_loss_clip": 1.04725432, "balance_loss_mlp": 1.02051222, "epoch": 0.7744844586063849, "flos": 20304360593280.0, "grad_norm": 1.521867159672832, "language_loss": 0.7495482, "learning_rate": 5.100325980887499e-07, "loss": 0.77150285, "num_input_tokens_seen": 138614785, "step": 6441, "time_per_iteration": 2.4130403995513916 }, { "auxiliary_loss_clip": 0.01148242, "auxiliary_loss_mlp": 0.01023467, "balance_loss_clip": 1.04713202, "balance_loss_mlp": 1.01623058, "epoch": 0.774604701497024, "flos": 22966705681920.0, "grad_norm": 1.7288709964931221, "language_loss": 0.83163971, "learning_rate": 5.095130726309116e-07, "loss": 0.85335678, "num_input_tokens_seen": 138634960, "step": 6442, "time_per_iteration": 2.4730496406555176 }, { "auxiliary_loss_clip": 0.0106572, "auxiliary_loss_mlp": 0.01000898, "balance_loss_clip": 1.01094031, "balance_loss_mlp": 0.99984854, "epoch": 0.774724944387663, "flos": 60288523073280.0, "grad_norm": 0.8013333376209394, "language_loss": 0.59004474, "learning_rate": 5.089937732787559e-07, "loss": 0.61071086, "num_input_tokens_seen": 138699520, "step": 6443, "time_per_iteration": 3.8609416484832764 }, { "auxiliary_loss_clip": 0.01125663, "auxiliary_loss_mlp": 0.0102824, "balance_loss_clip": 1.0437777, "balance_loss_mlp": 1.02057743, "epoch": 0.7748451872783022, "flos": 26761026954240.0, "grad_norm": 2.680038287115145, "language_loss": 0.66918653, "learning_rate": 5.084747001110592e-07, "loss": 0.69072556, "num_input_tokens_seen": 138719145, "step": 6444, "time_per_iteration": 3.225616455078125 }, { "auxiliary_loss_clip": 0.01152046, "auxiliary_loss_mlp": 0.00761735, "balance_loss_clip": 1.05140138, "balance_loss_mlp": 1.00038409, "epoch": 0.7749654301689413, "flos": 30338627518080.0, "grad_norm": 1.7290778018416668, "language_loss": 0.70300925, "learning_rate": 5.07955853206564e-07, "loss": 0.72214717, "num_input_tokens_seen": 138743850, "step": 6445, "time_per_iteration": 3.3695642948150635 }, { "auxiliary_loss_clip": 0.01156388, "auxiliary_loss_mlp": 0.01024178, "balance_loss_clip": 1.04747105, "balance_loss_mlp": 1.0171237, "epoch": 0.7750856730595803, "flos": 43179851687040.0, "grad_norm": 1.5053421106051914, "language_loss": 0.70791072, "learning_rate": 5.074372326439807e-07, "loss": 0.72971636, "num_input_tokens_seen": 138766860, "step": 6446, "time_per_iteration": 2.6301259994506836 }, { "auxiliary_loss_clip": 0.01126631, "auxiliary_loss_mlp": 0.01024101, "balance_loss_clip": 1.04497623, "balance_loss_mlp": 1.01697826, "epoch": 0.7752059159502195, "flos": 17640040256640.0, "grad_norm": 2.160646410047585, "language_loss": 0.73268616, "learning_rate": 5.069188385019814e-07, "loss": 0.75419348, "num_input_tokens_seen": 138784560, "step": 6447, "time_per_iteration": 2.468282461166382 }, { "auxiliary_loss_clip": 0.01116939, "auxiliary_loss_mlp": 0.01023589, "balance_loss_clip": 1.04172969, "balance_loss_mlp": 1.01630569, "epoch": 0.7753261588408585, "flos": 12677688524160.0, "grad_norm": 2.5037220948534036, "language_loss": 0.61468381, "learning_rate": 5.064006708592077e-07, "loss": 0.63608903, "num_input_tokens_seen": 138800805, "step": 6448, "time_per_iteration": 2.5272462368011475 }, { "auxiliary_loss_clip": 0.01132719, "auxiliary_loss_mlp": 0.01021799, "balance_loss_clip": 1.04613602, "balance_loss_mlp": 1.01481926, "epoch": 0.7754464017314976, "flos": 16690741666560.0, "grad_norm": 3.541752868752098, "language_loss": 0.75716442, "learning_rate": 5.058827297942641e-07, "loss": 0.77870965, "num_input_tokens_seen": 138815910, "step": 6449, "time_per_iteration": 3.141327142715454 }, { "auxiliary_loss_clip": 0.01145791, "auxiliary_loss_mlp": 0.0102569, "balance_loss_clip": 1.04628992, "balance_loss_mlp": 1.01848936, "epoch": 0.7755666446221368, "flos": 19718944732800.0, "grad_norm": 1.7544060391221827, "language_loss": 0.75124407, "learning_rate": 5.053650153857237e-07, "loss": 0.77295887, "num_input_tokens_seen": 138834920, "step": 6450, "time_per_iteration": 2.501918315887451 }, { "auxiliary_loss_clip": 0.01152202, "auxiliary_loss_mlp": 0.01027634, "balance_loss_clip": 1.04784274, "balance_loss_mlp": 1.02066088, "epoch": 0.7756868875127758, "flos": 18693623007360.0, "grad_norm": 1.6570058684961895, "language_loss": 0.70000458, "learning_rate": 5.048475277121214e-07, "loss": 0.72180295, "num_input_tokens_seen": 138852135, "step": 6451, "time_per_iteration": 2.40974497795105 }, { "auxiliary_loss_clip": 0.01151228, "auxiliary_loss_mlp": 0.01022735, "balance_loss_clip": 1.04468381, "balance_loss_mlp": 1.01558852, "epoch": 0.7758071304034149, "flos": 28404191543040.0, "grad_norm": 1.6769634699945188, "language_loss": 0.76976824, "learning_rate": 5.043302668519598e-07, "loss": 0.79150784, "num_input_tokens_seen": 138871470, "step": 6452, "time_per_iteration": 2.4969372749328613 }, { "auxiliary_loss_clip": 0.01155396, "auxiliary_loss_mlp": 0.01020389, "balance_loss_clip": 1.04569912, "balance_loss_mlp": 1.01350188, "epoch": 0.775927373294054, "flos": 20595344670720.0, "grad_norm": 2.080420852405165, "language_loss": 0.71731466, "learning_rate": 5.038132328837079e-07, "loss": 0.7390725, "num_input_tokens_seen": 138889860, "step": 6453, "time_per_iteration": 2.426710367202759 }, { "auxiliary_loss_clip": 0.01152916, "auxiliary_loss_mlp": 0.01020059, "balance_loss_clip": 1.04607224, "balance_loss_mlp": 1.01286435, "epoch": 0.7760476161846931, "flos": 22526368853760.0, "grad_norm": 2.033429583875141, "language_loss": 0.74034488, "learning_rate": 5.032964258857993e-07, "loss": 0.76207459, "num_input_tokens_seen": 138909955, "step": 6454, "time_per_iteration": 2.439711809158325 }, { "auxiliary_loss_clip": 0.01149091, "auxiliary_loss_mlp": 0.01024304, "balance_loss_clip": 1.04198992, "balance_loss_mlp": 1.01694012, "epoch": 0.7761678590753321, "flos": 48651488403840.0, "grad_norm": 1.5741380698309486, "language_loss": 0.68368697, "learning_rate": 5.027798459366329e-07, "loss": 0.70542085, "num_input_tokens_seen": 138935320, "step": 6455, "time_per_iteration": 2.666825294494629 }, { "auxiliary_loss_clip": 0.01157841, "auxiliary_loss_mlp": 0.01026364, "balance_loss_clip": 1.04690766, "balance_loss_mlp": 1.01894331, "epoch": 0.7762881019659713, "flos": 26177047637760.0, "grad_norm": 1.3340643596533168, "language_loss": 0.63644969, "learning_rate": 5.02263493114573e-07, "loss": 0.6582917, "num_input_tokens_seen": 138957115, "step": 6456, "time_per_iteration": 2.4717013835906982 }, { "auxiliary_loss_clip": 0.01163778, "auxiliary_loss_mlp": 0.01025042, "balance_loss_clip": 1.04592049, "balance_loss_mlp": 1.0176245, "epoch": 0.7764083448566104, "flos": 20588341518720.0, "grad_norm": 5.4999977880007185, "language_loss": 0.7733652, "learning_rate": 5.017473674979502e-07, "loss": 0.7952534, "num_input_tokens_seen": 138973140, "step": 6457, "time_per_iteration": 2.366192579269409 }, { "auxiliary_loss_clip": 0.010271, "auxiliary_loss_mlp": 0.01002277, "balance_loss_clip": 1.0127337, "balance_loss_mlp": 1.00142431, "epoch": 0.7765285877472494, "flos": 67293078560640.0, "grad_norm": 0.7388999738542961, "language_loss": 0.58362532, "learning_rate": 5.01231469165061e-07, "loss": 0.60391903, "num_input_tokens_seen": 139028965, "step": 6458, "time_per_iteration": 2.9756686687469482 }, { "auxiliary_loss_clip": 0.01056278, "auxiliary_loss_mlp": 0.01001753, "balance_loss_clip": 1.01111376, "balance_loss_mlp": 1.00070441, "epoch": 0.7766488306378886, "flos": 61344476121600.0, "grad_norm": 0.8341655305906173, "language_loss": 0.56878179, "learning_rate": 5.007157981941663e-07, "loss": 0.58936214, "num_input_tokens_seen": 139094325, "step": 6459, "time_per_iteration": 3.1317391395568848 }, { "auxiliary_loss_clip": 0.0104721, "auxiliary_loss_mlp": 0.01001525, "balance_loss_clip": 1.01143205, "balance_loss_mlp": 1.00048208, "epoch": 0.7767690735285276, "flos": 62946199393920.0, "grad_norm": 0.8776113354078642, "language_loss": 0.67385995, "learning_rate": 5.002003546634928e-07, "loss": 0.69434738, "num_input_tokens_seen": 139150425, "step": 6460, "time_per_iteration": 3.0022177696228027 }, { "auxiliary_loss_clip": 0.01111732, "auxiliary_loss_mlp": 0.01024074, "balance_loss_clip": 1.04784656, "balance_loss_mlp": 1.01729679, "epoch": 0.7768893164191667, "flos": 20886400575360.0, "grad_norm": 1.9647584450303748, "language_loss": 0.75964898, "learning_rate": 4.996851386512331e-07, "loss": 0.78100705, "num_input_tokens_seen": 139169130, "step": 6461, "time_per_iteration": 2.529478073120117 }, { "auxiliary_loss_clip": 0.01138187, "auxiliary_loss_mlp": 0.01026045, "balance_loss_clip": 1.04549611, "balance_loss_mlp": 1.0183115, "epoch": 0.7770095593098058, "flos": 20704584908160.0, "grad_norm": 1.7860559881382527, "language_loss": 0.83080018, "learning_rate": 4.991701502355444e-07, "loss": 0.85244256, "num_input_tokens_seen": 139189595, "step": 6462, "time_per_iteration": 2.465259552001953 }, { "auxiliary_loss_clip": 0.01155056, "auxiliary_loss_mlp": 0.01023779, "balance_loss_clip": 1.04640627, "balance_loss_mlp": 1.01743054, "epoch": 0.7771298022004449, "flos": 24717709877760.0, "grad_norm": 1.5090409968500693, "language_loss": 0.7594372, "learning_rate": 4.986553894945518e-07, "loss": 0.78122556, "num_input_tokens_seen": 139210805, "step": 6463, "time_per_iteration": 2.483222723007202 }, { "auxiliary_loss_clip": 0.01111352, "auxiliary_loss_mlp": 0.01023983, "balance_loss_clip": 1.0402081, "balance_loss_mlp": 1.01786137, "epoch": 0.777250045091084, "flos": 25009232659200.0, "grad_norm": 2.1081996816741966, "language_loss": 0.86050177, "learning_rate": 4.981408565063416e-07, "loss": 0.88185507, "num_input_tokens_seen": 139230750, "step": 6464, "time_per_iteration": 2.5557990074157715 }, { "auxiliary_loss_clip": 0.01167666, "auxiliary_loss_mlp": 0.01022888, "balance_loss_clip": 1.04819059, "balance_loss_mlp": 1.01584315, "epoch": 0.777370287981723, "flos": 20119887319680.0, "grad_norm": 2.0370118934167842, "language_loss": 0.7603848, "learning_rate": 4.976265513489701e-07, "loss": 0.78229034, "num_input_tokens_seen": 139250720, "step": 6465, "time_per_iteration": 2.4163033962249756 }, { "auxiliary_loss_clip": 0.01150861, "auxiliary_loss_mlp": 0.01024172, "balance_loss_clip": 1.04393113, "balance_loss_mlp": 1.01696539, "epoch": 0.7774905308723622, "flos": 21718809331200.0, "grad_norm": 1.8639734558211378, "language_loss": 0.80701333, "learning_rate": 4.971124741004562e-07, "loss": 0.82876366, "num_input_tokens_seen": 139269720, "step": 6466, "time_per_iteration": 2.438819169998169 }, { "auxiliary_loss_clip": 0.01150274, "auxiliary_loss_mlp": 0.01021797, "balance_loss_clip": 1.04517806, "balance_loss_mlp": 1.01493669, "epoch": 0.7776107737630013, "flos": 16034115093120.0, "grad_norm": 2.0543492683344833, "language_loss": 0.76488411, "learning_rate": 4.965986248387846e-07, "loss": 0.78660482, "num_input_tokens_seen": 139288035, "step": 6467, "time_per_iteration": 2.401381015777588 }, { "auxiliary_loss_clip": 0.01140179, "auxiliary_loss_mlp": 0.0102372, "balance_loss_clip": 1.04370236, "balance_loss_mlp": 1.01692843, "epoch": 0.7777310166536403, "flos": 24790895838720.0, "grad_norm": 1.6802066583015651, "language_loss": 0.76982903, "learning_rate": 4.960850036419073e-07, "loss": 0.79146802, "num_input_tokens_seen": 139307135, "step": 6468, "time_per_iteration": 2.494659423828125 }, { "auxiliary_loss_clip": 0.01134654, "auxiliary_loss_mlp": 0.01021582, "balance_loss_clip": 1.0441258, "balance_loss_mlp": 1.01435757, "epoch": 0.7778512595442795, "flos": 17272530253440.0, "grad_norm": 2.1862019421500607, "language_loss": 0.78757286, "learning_rate": 4.955716105877378e-07, "loss": 0.80913526, "num_input_tokens_seen": 139325905, "step": 6469, "time_per_iteration": 3.275754928588867 }, { "auxiliary_loss_clip": 0.01156343, "auxiliary_loss_mlp": 0.00761567, "balance_loss_clip": 1.04682326, "balance_loss_mlp": 1.00036108, "epoch": 0.7779715024349185, "flos": 17748418567680.0, "grad_norm": 1.7528109889418506, "language_loss": 0.82955408, "learning_rate": 4.950584457541598e-07, "loss": 0.84873319, "num_input_tokens_seen": 139344370, "step": 6470, "time_per_iteration": 3.13714861869812 }, { "auxiliary_loss_clip": 0.01154319, "auxiliary_loss_mlp": 0.01025287, "balance_loss_clip": 1.04663897, "balance_loss_mlp": 1.01850057, "epoch": 0.7780917453255576, "flos": 24316875031680.0, "grad_norm": 1.347970465380064, "language_loss": 0.81814688, "learning_rate": 4.945455092190183e-07, "loss": 0.83994293, "num_input_tokens_seen": 139365625, "step": 6471, "time_per_iteration": 3.3791353702545166 }, { "auxiliary_loss_clip": 0.01065539, "auxiliary_loss_mlp": 0.01000919, "balance_loss_clip": 1.01064253, "balance_loss_mlp": 0.99985808, "epoch": 0.7782119882161967, "flos": 56364601530240.0, "grad_norm": 0.6771088358195161, "language_loss": 0.55961573, "learning_rate": 4.940328010601271e-07, "loss": 0.5802803, "num_input_tokens_seen": 139430540, "step": 6472, "time_per_iteration": 3.017413377761841 }, { "auxiliary_loss_clip": 0.01151504, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.05152142, "balance_loss_mlp": 1.02137816, "epoch": 0.7783322311068358, "flos": 46789986994560.0, "grad_norm": 1.719036534090131, "language_loss": 0.76680022, "learning_rate": 4.935203213552621e-07, "loss": 0.78860486, "num_input_tokens_seen": 139454280, "step": 6473, "time_per_iteration": 2.691930055618286 }, { "auxiliary_loss_clip": 0.01141165, "auxiliary_loss_mlp": 0.01023115, "balance_loss_clip": 1.0461632, "balance_loss_mlp": 1.01555395, "epoch": 0.7784524739974749, "flos": 19057864872960.0, "grad_norm": 8.55752309875727, "language_loss": 0.66779333, "learning_rate": 4.930080701821662e-07, "loss": 0.68943614, "num_input_tokens_seen": 139471745, "step": 6474, "time_per_iteration": 2.4473488330841064 }, { "auxiliary_loss_clip": 0.01139418, "auxiliary_loss_mlp": 0.01025608, "balance_loss_clip": 1.04459834, "balance_loss_mlp": 1.01850629, "epoch": 0.778572716888114, "flos": 24791111320320.0, "grad_norm": 2.3501120765167505, "language_loss": 0.77194738, "learning_rate": 4.92496047618548e-07, "loss": 0.7935977, "num_input_tokens_seen": 139491505, "step": 6475, "time_per_iteration": 3.2657601833343506 }, { "auxiliary_loss_clip": 0.01157503, "auxiliary_loss_mlp": 0.01022966, "balance_loss_clip": 1.05059385, "balance_loss_mlp": 1.01574206, "epoch": 0.7786929597787531, "flos": 20078086867200.0, "grad_norm": 1.7811780938141082, "language_loss": 0.77793294, "learning_rate": 4.919842537420811e-07, "loss": 0.79973757, "num_input_tokens_seen": 139508620, "step": 6476, "time_per_iteration": 2.42966890335083 }, { "auxiliary_loss_clip": 0.01140987, "auxiliary_loss_mlp": 0.01026943, "balance_loss_clip": 1.04864883, "balance_loss_mlp": 1.02031803, "epoch": 0.7788132026693921, "flos": 21872220318720.0, "grad_norm": 1.5867789015892955, "language_loss": 0.79264104, "learning_rate": 4.91472688630404e-07, "loss": 0.81432033, "num_input_tokens_seen": 139529360, "step": 6477, "time_per_iteration": 2.474210500717163 }, { "auxiliary_loss_clip": 0.01163792, "auxiliary_loss_mlp": 0.01020733, "balance_loss_clip": 1.04691815, "balance_loss_mlp": 1.01408958, "epoch": 0.7789334455600313, "flos": 11181937351680.0, "grad_norm": 1.840047594253558, "language_loss": 0.74176162, "learning_rate": 4.909613523611202e-07, "loss": 0.76360685, "num_input_tokens_seen": 139546240, "step": 6478, "time_per_iteration": 2.438084363937378 }, { "auxiliary_loss_clip": 0.01106869, "auxiliary_loss_mlp": 0.00761989, "balance_loss_clip": 1.03926635, "balance_loss_mlp": 1.00046504, "epoch": 0.7790536884506704, "flos": 28695427015680.0, "grad_norm": 1.7131593213837262, "language_loss": 0.74512082, "learning_rate": 4.904502450117991e-07, "loss": 0.76380944, "num_input_tokens_seen": 139567200, "step": 6479, "time_per_iteration": 2.602010726928711 }, { "auxiliary_loss_clip": 0.01138148, "auxiliary_loss_mlp": 0.0102528, "balance_loss_clip": 1.0487994, "balance_loss_mlp": 1.01820242, "epoch": 0.7791739313413094, "flos": 11072302064640.0, "grad_norm": 2.2655887327138386, "language_loss": 0.7183193, "learning_rate": 4.899393666599762e-07, "loss": 0.73995364, "num_input_tokens_seen": 139583775, "step": 6480, "time_per_iteration": 2.4409658908843994 }, { "auxiliary_loss_clip": 0.01164531, "auxiliary_loss_mlp": 0.01019843, "balance_loss_clip": 1.04551697, "balance_loss_mlp": 1.01322973, "epoch": 0.7792941742319486, "flos": 14679276975360.0, "grad_norm": 2.451872446933454, "language_loss": 0.72448373, "learning_rate": 4.894287173831506e-07, "loss": 0.74632752, "num_input_tokens_seen": 139599735, "step": 6481, "time_per_iteration": 2.357515335083008 }, { "auxiliary_loss_clip": 0.01138921, "auxiliary_loss_mlp": 0.01025041, "balance_loss_clip": 1.04311824, "balance_loss_mlp": 1.01717901, "epoch": 0.7794144171225876, "flos": 23258874908160.0, "grad_norm": 2.7050119523459117, "language_loss": 0.84315956, "learning_rate": 4.889182972587877e-07, "loss": 0.8647992, "num_input_tokens_seen": 139619030, "step": 6482, "time_per_iteration": 2.469722270965576 }, { "auxiliary_loss_clip": 0.01132318, "auxiliary_loss_mlp": 0.01025619, "balance_loss_clip": 1.04506886, "balance_loss_mlp": 1.01877379, "epoch": 0.7795346600132267, "flos": 21507080613120.0, "grad_norm": 1.7858208545962788, "language_loss": 0.66032982, "learning_rate": 4.884081063643177e-07, "loss": 0.68190914, "num_input_tokens_seen": 139637690, "step": 6483, "time_per_iteration": 2.518369436264038 }, { "auxiliary_loss_clip": 0.01040475, "auxiliary_loss_mlp": 0.0100151, "balance_loss_clip": 1.01195192, "balance_loss_mlp": 1.00052702, "epoch": 0.7796549029038659, "flos": 70052273694720.0, "grad_norm": 0.8648663077288179, "language_loss": 0.52530181, "learning_rate": 4.878981447771353e-07, "loss": 0.54572165, "num_input_tokens_seen": 139692070, "step": 6484, "time_per_iteration": 3.0795040130615234 }, { "auxiliary_loss_clip": 0.01118967, "auxiliary_loss_mlp": 0.01025686, "balance_loss_clip": 1.04298294, "balance_loss_mlp": 1.01744246, "epoch": 0.7797751457945049, "flos": 23989405714560.0, "grad_norm": 1.651649588835177, "language_loss": 0.73228383, "learning_rate": 4.873884125746035e-07, "loss": 0.75373042, "num_input_tokens_seen": 139713745, "step": 6485, "time_per_iteration": 2.5281119346618652 }, { "auxiliary_loss_clip": 0.0113465, "auxiliary_loss_mlp": 0.01019535, "balance_loss_clip": 1.04448938, "balance_loss_mlp": 1.01248991, "epoch": 0.779895388685144, "flos": 22674751937280.0, "grad_norm": 2.448838597906675, "language_loss": 0.72434759, "learning_rate": 4.868789098340456e-07, "loss": 0.74588943, "num_input_tokens_seen": 139731650, "step": 6486, "time_per_iteration": 2.473839521408081 }, { "auxiliary_loss_clip": 0.01123394, "auxiliary_loss_mlp": 0.01023626, "balance_loss_clip": 1.04187012, "balance_loss_mlp": 1.01695895, "epoch": 0.7800156315757831, "flos": 23768698596480.0, "grad_norm": 4.081162562167619, "language_loss": 0.73117936, "learning_rate": 4.863696366327543e-07, "loss": 0.75264961, "num_input_tokens_seen": 139750820, "step": 6487, "time_per_iteration": 2.516204595565796 }, { "auxiliary_loss_clip": 0.01153253, "auxiliary_loss_mlp": 0.01026933, "balance_loss_clip": 1.04431617, "balance_loss_mlp": 1.01979876, "epoch": 0.7801358744664222, "flos": 26429714881920.0, "grad_norm": 1.8289264418598066, "language_loss": 0.7812897, "learning_rate": 4.85860593047986e-07, "loss": 0.80309165, "num_input_tokens_seen": 139770885, "step": 6488, "time_per_iteration": 2.492370367050171 }, { "auxiliary_loss_clip": 0.01115999, "auxiliary_loss_mlp": 0.01024151, "balance_loss_clip": 1.03857565, "balance_loss_mlp": 1.0173409, "epoch": 0.7802561173570612, "flos": 26322162583680.0, "grad_norm": 2.1004130141492454, "language_loss": 0.74782205, "learning_rate": 4.853517791569613e-07, "loss": 0.76922357, "num_input_tokens_seen": 139793065, "step": 6489, "time_per_iteration": 2.557070016860962 }, { "auxiliary_loss_clip": 0.01144002, "auxiliary_loss_mlp": 0.00762092, "balance_loss_clip": 1.04451227, "balance_loss_mlp": 1.00033188, "epoch": 0.7803763602477004, "flos": 40333751596800.0, "grad_norm": 1.714467869202251, "language_loss": 0.66034502, "learning_rate": 4.848431950368684e-07, "loss": 0.67940599, "num_input_tokens_seen": 139815625, "step": 6490, "time_per_iteration": 2.636080026626587 }, { "auxiliary_loss_clip": 0.01065659, "auxiliary_loss_mlp": 0.00752864, "balance_loss_clip": 1.01103115, "balance_loss_mlp": 0.99989241, "epoch": 0.7804966031383395, "flos": 67001448038400.0, "grad_norm": 0.712297458260766, "language_loss": 0.55715853, "learning_rate": 4.843348407648569e-07, "loss": 0.57534379, "num_input_tokens_seen": 139876905, "step": 6491, "time_per_iteration": 2.953810453414917 }, { "auxiliary_loss_clip": 0.01153366, "auxiliary_loss_mlp": 0.01024581, "balance_loss_clip": 1.04263616, "balance_loss_mlp": 1.01674891, "epoch": 0.7806168460289785, "flos": 17740733057280.0, "grad_norm": 2.377935191854429, "language_loss": 0.82714969, "learning_rate": 4.838267164180457e-07, "loss": 0.84892911, "num_input_tokens_seen": 139892575, "step": 6492, "time_per_iteration": 2.399416208267212 }, { "auxiliary_loss_clip": 0.01168954, "auxiliary_loss_mlp": 0.01024881, "balance_loss_clip": 1.04785323, "balance_loss_mlp": 1.01714444, "epoch": 0.7807370889196176, "flos": 23946240545280.0, "grad_norm": 2.0274687536790608, "language_loss": 0.83741719, "learning_rate": 4.833188220735156e-07, "loss": 0.85935557, "num_input_tokens_seen": 139912245, "step": 6493, "time_per_iteration": 2.419259548187256 }, { "auxiliary_loss_clip": 0.01151573, "auxiliary_loss_mlp": 0.0102186, "balance_loss_clip": 1.046157, "balance_loss_mlp": 1.01484728, "epoch": 0.7808573318102567, "flos": 18989024457600.0, "grad_norm": 2.023430002560144, "language_loss": 0.74718851, "learning_rate": 4.828111578083152e-07, "loss": 0.76892281, "num_input_tokens_seen": 139929150, "step": 6494, "time_per_iteration": 2.4137871265411377 }, { "auxiliary_loss_clip": 0.0113444, "auxiliary_loss_mlp": 0.01026902, "balance_loss_clip": 1.0453341, "balance_loss_mlp": 1.01955867, "epoch": 0.7809775747008958, "flos": 23980750536960.0, "grad_norm": 2.6199241854478497, "language_loss": 0.8170386, "learning_rate": 4.823037236994556e-07, "loss": 0.83865201, "num_input_tokens_seen": 139947315, "step": 6495, "time_per_iteration": 2.527867078781128 }, { "auxiliary_loss_clip": 0.01056259, "auxiliary_loss_mlp": 0.01000758, "balance_loss_clip": 1.01057625, "balance_loss_mlp": 0.99972039, "epoch": 0.7810978175915348, "flos": 68535875180160.0, "grad_norm": 0.7132369122011739, "language_loss": 0.56343669, "learning_rate": 4.817965198239136e-07, "loss": 0.58400691, "num_input_tokens_seen": 140013775, "step": 6496, "time_per_iteration": 3.8623228073120117 }, { "auxiliary_loss_clip": 0.0112264, "auxiliary_loss_mlp": 0.01025794, "balance_loss_clip": 1.04158223, "balance_loss_mlp": 1.01798308, "epoch": 0.781218060482174, "flos": 19642131498240.0, "grad_norm": 2.1266018211535296, "language_loss": 0.74443281, "learning_rate": 4.812895462586331e-07, "loss": 0.76591718, "num_input_tokens_seen": 140031600, "step": 6497, "time_per_iteration": 3.2123382091522217 }, { "auxiliary_loss_clip": 0.01127468, "auxiliary_loss_mlp": 0.01022041, "balance_loss_clip": 1.04589307, "balance_loss_mlp": 1.0156728, "epoch": 0.7813383033728131, "flos": 25627865621760.0, "grad_norm": 1.7443390149669613, "language_loss": 0.82332885, "learning_rate": 4.807828030805207e-07, "loss": 0.8448239, "num_input_tokens_seen": 140050590, "step": 6498, "time_per_iteration": 3.3967041969299316 }, { "auxiliary_loss_clip": 0.01151899, "auxiliary_loss_mlp": 0.01033252, "balance_loss_clip": 1.04835701, "balance_loss_mlp": 1.02563453, "epoch": 0.7814585462634521, "flos": 20485924865280.0, "grad_norm": 1.7733321957795256, "language_loss": 0.68363786, "learning_rate": 4.802762903664495e-07, "loss": 0.70548934, "num_input_tokens_seen": 140069770, "step": 6499, "time_per_iteration": 2.540027618408203 }, { "auxiliary_loss_clip": 0.01146462, "auxiliary_loss_mlp": 0.01028036, "balance_loss_clip": 1.04868495, "balance_loss_mlp": 1.02036452, "epoch": 0.7815787891540913, "flos": 22304297018880.0, "grad_norm": 2.278488497321996, "language_loss": 0.73927927, "learning_rate": 4.797700081932565e-07, "loss": 0.76102424, "num_input_tokens_seen": 140087635, "step": 6500, "time_per_iteration": 2.468290328979492 }, { "auxiliary_loss_clip": 0.01089879, "auxiliary_loss_mlp": 0.01025554, "balance_loss_clip": 1.03757548, "balance_loss_mlp": 1.01852059, "epoch": 0.7816990320447303, "flos": 22600668136320.0, "grad_norm": 2.353676989737286, "language_loss": 0.82056361, "learning_rate": 4.792639566377442e-07, "loss": 0.84171796, "num_input_tokens_seen": 140105045, "step": 6501, "time_per_iteration": 2.5547449588775635 }, { "auxiliary_loss_clip": 0.01146882, "auxiliary_loss_mlp": 0.01020525, "balance_loss_clip": 1.04369044, "balance_loss_mlp": 1.01303852, "epoch": 0.7818192749353694, "flos": 24935974871040.0, "grad_norm": 1.7143268110907315, "language_loss": 0.77669728, "learning_rate": 4.78758135776681e-07, "loss": 0.79837132, "num_input_tokens_seen": 140124900, "step": 6502, "time_per_iteration": 3.249781608581543 }, { "auxiliary_loss_clip": 0.01139391, "auxiliary_loss_mlp": 0.0102537, "balance_loss_clip": 1.04566169, "balance_loss_mlp": 1.0186348, "epoch": 0.7819395178260086, "flos": 23733039369600.0, "grad_norm": 2.052509956360546, "language_loss": 0.78826964, "learning_rate": 4.782525456867989e-07, "loss": 0.80991721, "num_input_tokens_seen": 140143755, "step": 6503, "time_per_iteration": 2.4841930866241455 }, { "auxiliary_loss_clip": 0.01125685, "auxiliary_loss_mlp": 0.01025145, "balance_loss_clip": 1.04330504, "balance_loss_mlp": 1.01759315, "epoch": 0.7820597607166476, "flos": 23221671396480.0, "grad_norm": 2.2475392216689696, "language_loss": 0.83056802, "learning_rate": 4.777471864447959e-07, "loss": 0.85207629, "num_input_tokens_seen": 140164495, "step": 6504, "time_per_iteration": 2.5042660236358643 }, { "auxiliary_loss_clip": 0.01139824, "auxiliary_loss_mlp": 0.01029602, "balance_loss_clip": 1.04402256, "balance_loss_mlp": 1.02259493, "epoch": 0.7821800036072867, "flos": 22309540404480.0, "grad_norm": 2.289048015419642, "language_loss": 0.80342984, "learning_rate": 4.772420581273344e-07, "loss": 0.82512403, "num_input_tokens_seen": 140181980, "step": 6505, "time_per_iteration": 2.4771053791046143 }, { "auxiliary_loss_clip": 0.01148002, "auxiliary_loss_mlp": 0.01022983, "balance_loss_clip": 1.0461762, "balance_loss_mlp": 1.01564002, "epoch": 0.7823002464979258, "flos": 21544176384000.0, "grad_norm": 1.8111270710972136, "language_loss": 0.75861347, "learning_rate": 4.7673716081104134e-07, "loss": 0.78032333, "num_input_tokens_seen": 140202155, "step": 6506, "time_per_iteration": 2.4428014755249023 }, { "auxiliary_loss_clip": 0.01153807, "auxiliary_loss_mlp": 0.01025059, "balance_loss_clip": 1.04919577, "balance_loss_mlp": 1.01808512, "epoch": 0.7824204893885649, "flos": 24535642815360.0, "grad_norm": 1.6172844939184783, "language_loss": 0.84568745, "learning_rate": 4.762324945725109e-07, "loss": 0.86747611, "num_input_tokens_seen": 140221600, "step": 6507, "time_per_iteration": 2.466966152191162 }, { "auxiliary_loss_clip": 0.01134125, "auxiliary_loss_mlp": 0.01026296, "balance_loss_clip": 1.0478878, "balance_loss_mlp": 1.0193938, "epoch": 0.782540732279204, "flos": 27415211402880.0, "grad_norm": 1.6534168662577466, "language_loss": 0.75591129, "learning_rate": 4.7572805948829844e-07, "loss": 0.77751541, "num_input_tokens_seen": 140241860, "step": 6508, "time_per_iteration": 2.501706123352051 }, { "auxiliary_loss_clip": 0.01115915, "auxiliary_loss_mlp": 0.01021402, "balance_loss_clip": 1.04416871, "balance_loss_mlp": 1.01469302, "epoch": 0.7826609751698431, "flos": 24353216616960.0, "grad_norm": 2.0298483841624435, "language_loss": 0.71122849, "learning_rate": 4.7522385563492795e-07, "loss": 0.7326017, "num_input_tokens_seen": 140262160, "step": 6509, "time_per_iteration": 2.5437140464782715 }, { "auxiliary_loss_clip": 0.01131003, "auxiliary_loss_mlp": 0.01025451, "balance_loss_clip": 1.0479269, "balance_loss_mlp": 1.01852536, "epoch": 0.7827812180604822, "flos": 23988543788160.0, "grad_norm": 1.8421312027599637, "language_loss": 0.70350665, "learning_rate": 4.747198830888863e-07, "loss": 0.72507119, "num_input_tokens_seen": 140282030, "step": 6510, "time_per_iteration": 2.5157268047332764 }, { "auxiliary_loss_clip": 0.01133541, "auxiliary_loss_mlp": 0.01026722, "balance_loss_clip": 1.04472017, "balance_loss_mlp": 1.01967096, "epoch": 0.7829014609511212, "flos": 27454318335360.0, "grad_norm": 2.8099577356954786, "language_loss": 0.68423414, "learning_rate": 4.742161419266251e-07, "loss": 0.70583677, "num_input_tokens_seen": 140301190, "step": 6511, "time_per_iteration": 2.5029919147491455 }, { "auxiliary_loss_clip": 0.0115644, "auxiliary_loss_mlp": 0.01027691, "balance_loss_clip": 1.04618263, "balance_loss_mlp": 1.01999009, "epoch": 0.7830217038417604, "flos": 29204532432000.0, "grad_norm": 3.2335599872123586, "language_loss": 0.64929855, "learning_rate": 4.7371263222456304e-07, "loss": 0.67113984, "num_input_tokens_seen": 140318510, "step": 6512, "time_per_iteration": 2.479785680770874 }, { "auxiliary_loss_clip": 0.01051348, "auxiliary_loss_mlp": 0.01001284, "balance_loss_clip": 1.01059556, "balance_loss_mlp": 1.00031281, "epoch": 0.7831419467323995, "flos": 60950895822720.0, "grad_norm": 0.8085924692440947, "language_loss": 0.61353695, "learning_rate": 4.7320935405908004e-07, "loss": 0.63406324, "num_input_tokens_seen": 140379380, "step": 6513, "time_per_iteration": 2.9893112182617188 }, { "auxiliary_loss_clip": 0.01168898, "auxiliary_loss_mlp": 0.01025646, "balance_loss_clip": 1.04777169, "balance_loss_mlp": 1.01790285, "epoch": 0.7832621896230385, "flos": 19682531320320.0, "grad_norm": 2.0746706653203, "language_loss": 0.84062672, "learning_rate": 4.7270630750652475e-07, "loss": 0.86257219, "num_input_tokens_seen": 140395335, "step": 6514, "time_per_iteration": 2.3778579235076904 }, { "auxiliary_loss_clip": 0.01149626, "auxiliary_loss_mlp": 0.01020532, "balance_loss_clip": 1.04434681, "balance_loss_mlp": 1.01382351, "epoch": 0.7833824325136777, "flos": 25009232659200.0, "grad_norm": 1.6879473377035084, "language_loss": 0.80741167, "learning_rate": 4.7220349264320746e-07, "loss": 0.82911325, "num_input_tokens_seen": 140414420, "step": 6515, "time_per_iteration": 2.4611709117889404 }, { "auxiliary_loss_clip": 0.01054559, "auxiliary_loss_mlp": 0.01000669, "balance_loss_clip": 1.01051331, "balance_loss_mlp": 0.99963778, "epoch": 0.7835026754043167, "flos": 68800142517120.0, "grad_norm": 0.7329849561181503, "language_loss": 0.54916102, "learning_rate": 4.71700909545407e-07, "loss": 0.56971329, "num_input_tokens_seen": 140477365, "step": 6516, "time_per_iteration": 3.0265395641326904 }, { "auxiliary_loss_clip": 0.01154139, "auxiliary_loss_mlp": 0.01021434, "balance_loss_clip": 1.04637933, "balance_loss_mlp": 1.01434982, "epoch": 0.7836229182949558, "flos": 19864598382720.0, "grad_norm": 1.8738135362949782, "language_loss": 0.77109027, "learning_rate": 4.711985582893627e-07, "loss": 0.79284596, "num_input_tokens_seen": 140495885, "step": 6517, "time_per_iteration": 2.413577079772949 }, { "auxiliary_loss_clip": 0.01112179, "auxiliary_loss_mlp": 0.01021841, "balance_loss_clip": 1.04055035, "balance_loss_mlp": 1.0145036, "epoch": 0.783743161185595, "flos": 22965843755520.0, "grad_norm": 1.6402273495031856, "language_loss": 0.71610463, "learning_rate": 4.706964389512811e-07, "loss": 0.73744488, "num_input_tokens_seen": 140515920, "step": 6518, "time_per_iteration": 2.5515036582946777 }, { "auxiliary_loss_clip": 0.01166309, "auxiliary_loss_mlp": 0.01020135, "balance_loss_clip": 1.04967618, "balance_loss_mlp": 1.01331663, "epoch": 0.783863404076234, "flos": 12458489777280.0, "grad_norm": 1.9073058921881056, "language_loss": 0.87306106, "learning_rate": 4.701945516073345e-07, "loss": 0.89492559, "num_input_tokens_seen": 140533395, "step": 6519, "time_per_iteration": 2.369558572769165 }, { "auxiliary_loss_clip": 0.01122572, "auxiliary_loss_mlp": 0.01020777, "balance_loss_clip": 1.04431868, "balance_loss_mlp": 1.01414037, "epoch": 0.7839836469668731, "flos": 24243940465920.0, "grad_norm": 1.7950173901195265, "language_loss": 0.75393915, "learning_rate": 4.696928963336577e-07, "loss": 0.77537262, "num_input_tokens_seen": 140552825, "step": 6520, "time_per_iteration": 2.5475728511810303 }, { "auxiliary_loss_clip": 0.01051419, "auxiliary_loss_mlp": 0.01001331, "balance_loss_clip": 1.01031482, "balance_loss_mlp": 1.00031209, "epoch": 0.7841038898575122, "flos": 62121978938880.0, "grad_norm": 0.8534967495046033, "language_loss": 0.61017454, "learning_rate": 4.6919147320635224e-07, "loss": 0.63070214, "num_input_tokens_seen": 140615535, "step": 6521, "time_per_iteration": 2.9999747276306152 }, { "auxiliary_loss_clip": 0.01154218, "auxiliary_loss_mlp": 0.01024606, "balance_loss_clip": 1.04555976, "balance_loss_mlp": 1.01783764, "epoch": 0.7842241327481513, "flos": 20193899293440.0, "grad_norm": 2.342963454017732, "language_loss": 0.72994047, "learning_rate": 4.6869028230148286e-07, "loss": 0.75172877, "num_input_tokens_seen": 140633330, "step": 6522, "time_per_iteration": 3.2662525177001953 }, { "auxiliary_loss_clip": 0.01118504, "auxiliary_loss_mlp": 0.01024257, "balance_loss_clip": 1.04025996, "balance_loss_mlp": 1.01652002, "epoch": 0.7843443756387903, "flos": 28074531496320.0, "grad_norm": 2.34381135662973, "language_loss": 0.59847081, "learning_rate": 4.6818932369507957e-07, "loss": 0.61989844, "num_input_tokens_seen": 140652830, "step": 6523, "time_per_iteration": 3.2541370391845703 }, { "auxiliary_loss_clip": 0.01153993, "auxiliary_loss_mlp": 0.01026472, "balance_loss_clip": 1.04981923, "balance_loss_mlp": 1.01931953, "epoch": 0.7844646185294295, "flos": 21323397438720.0, "grad_norm": 2.20927760909072, "language_loss": 0.88968349, "learning_rate": 4.676885974631386e-07, "loss": 0.91148812, "num_input_tokens_seen": 140671190, "step": 6524, "time_per_iteration": 2.43854022026062 }, { "auxiliary_loss_clip": 0.01153788, "auxiliary_loss_mlp": 0.01022298, "balance_loss_clip": 1.04735041, "balance_loss_mlp": 1.01559854, "epoch": 0.7845848614200686, "flos": 23656585271040.0, "grad_norm": 1.9212038267190354, "language_loss": 0.81113577, "learning_rate": 4.67188103681619e-07, "loss": 0.83289659, "num_input_tokens_seen": 140690975, "step": 6525, "time_per_iteration": 3.256356716156006 }, { "auxiliary_loss_clip": 0.01152803, "auxiliary_loss_mlp": 0.00761716, "balance_loss_clip": 1.05069482, "balance_loss_mlp": 1.00032806, "epoch": 0.7847051043107076, "flos": 23402194174080.0, "grad_norm": 4.33744629859235, "language_loss": 0.6875186, "learning_rate": 4.666878424264453e-07, "loss": 0.70666373, "num_input_tokens_seen": 140710930, "step": 6526, "time_per_iteration": 2.444742441177368 }, { "auxiliary_loss_clip": 0.01130504, "auxiliary_loss_mlp": 0.01019782, "balance_loss_clip": 1.0446167, "balance_loss_mlp": 1.01362491, "epoch": 0.7848253472013467, "flos": 19022277473280.0, "grad_norm": 1.8657727553705894, "language_loss": 0.73486632, "learning_rate": 4.661878137735069e-07, "loss": 0.75636923, "num_input_tokens_seen": 140729120, "step": 6527, "time_per_iteration": 2.444528102874756 }, { "auxiliary_loss_clip": 0.01137561, "auxiliary_loss_mlp": 0.01021878, "balance_loss_clip": 1.04501319, "balance_loss_mlp": 1.01512766, "epoch": 0.7849455900919858, "flos": 21179180332800.0, "grad_norm": 2.172175631590525, "language_loss": 0.74662203, "learning_rate": 4.656880177986571e-07, "loss": 0.76821643, "num_input_tokens_seen": 140747665, "step": 6528, "time_per_iteration": 2.4619085788726807 }, { "auxiliary_loss_clip": 0.01140805, "auxiliary_loss_mlp": 0.01024067, "balance_loss_clip": 1.04357326, "balance_loss_mlp": 1.0165273, "epoch": 0.7850658329826249, "flos": 19536482620800.0, "grad_norm": 1.8828479089549728, "language_loss": 0.81276512, "learning_rate": 4.6518845457771607e-07, "loss": 0.83441389, "num_input_tokens_seen": 140766525, "step": 6529, "time_per_iteration": 3.2345666885375977 }, { "auxiliary_loss_clip": 0.01144647, "auxiliary_loss_mlp": 0.00761501, "balance_loss_clip": 1.04440784, "balance_loss_mlp": 1.00037599, "epoch": 0.7851860758732639, "flos": 12495334152960.0, "grad_norm": 1.7715898218398218, "language_loss": 0.78886473, "learning_rate": 4.646891241864652e-07, "loss": 0.8079263, "num_input_tokens_seen": 140785090, "step": 6530, "time_per_iteration": 2.420994520187378 }, { "auxiliary_loss_clip": 0.01151563, "auxiliary_loss_mlp": 0.01027857, "balance_loss_clip": 1.04540133, "balance_loss_mlp": 1.02001297, "epoch": 0.7853063187639031, "flos": 22960959505920.0, "grad_norm": 2.0034811796725855, "language_loss": 0.73514444, "learning_rate": 4.6419002670065397e-07, "loss": 0.75693864, "num_input_tokens_seen": 140804670, "step": 6531, "time_per_iteration": 2.4273664951324463 }, { "auxiliary_loss_clip": 0.01129057, "auxiliary_loss_mlp": 0.01028411, "balance_loss_clip": 1.04596663, "balance_loss_mlp": 1.02072835, "epoch": 0.7854265616545422, "flos": 17347260499200.0, "grad_norm": 3.4046561018796853, "language_loss": 0.86753833, "learning_rate": 4.6369116219599445e-07, "loss": 0.88911295, "num_input_tokens_seen": 140820655, "step": 6532, "time_per_iteration": 2.481265068054199 }, { "auxiliary_loss_clip": 0.01123649, "auxiliary_loss_mlp": 0.0102251, "balance_loss_clip": 1.04358661, "balance_loss_mlp": 1.01591802, "epoch": 0.7855468045451812, "flos": 23838293197440.0, "grad_norm": 1.657003108576288, "language_loss": 0.79092884, "learning_rate": 4.631925307481637e-07, "loss": 0.81239045, "num_input_tokens_seen": 140840470, "step": 6533, "time_per_iteration": 2.51924467086792 }, { "auxiliary_loss_clip": 0.01137392, "auxiliary_loss_mlp": 0.01022508, "balance_loss_clip": 1.04688919, "balance_loss_mlp": 1.01583219, "epoch": 0.7856670474358204, "flos": 25666792986240.0, "grad_norm": 1.94763198897157, "language_loss": 0.75912499, "learning_rate": 4.6269413243280533e-07, "loss": 0.78072405, "num_input_tokens_seen": 140859890, "step": 6534, "time_per_iteration": 2.507854461669922 }, { "auxiliary_loss_clip": 0.01146527, "auxiliary_loss_mlp": 0.0102237, "balance_loss_clip": 1.05017853, "balance_loss_mlp": 1.01415014, "epoch": 0.7857872903264594, "flos": 18144656472960.0, "grad_norm": 2.7598073972263037, "language_loss": 0.74365258, "learning_rate": 4.621959673255236e-07, "loss": 0.76534152, "num_input_tokens_seen": 140876190, "step": 6535, "time_per_iteration": 2.435556411743164 }, { "auxiliary_loss_clip": 0.01107208, "auxiliary_loss_mlp": 0.01026101, "balance_loss_clip": 1.04113913, "balance_loss_mlp": 1.01904368, "epoch": 0.7859075332170985, "flos": 14386138081920.0, "grad_norm": 2.0730766876488294, "language_loss": 0.90412748, "learning_rate": 4.6169803550189135e-07, "loss": 0.92546058, "num_input_tokens_seen": 140891885, "step": 6536, "time_per_iteration": 2.4852495193481445 }, { "auxiliary_loss_clip": 0.01104958, "auxiliary_loss_mlp": 0.01028422, "balance_loss_clip": 1.04470956, "balance_loss_mlp": 1.02053666, "epoch": 0.7860277761077377, "flos": 19864059678720.0, "grad_norm": 2.048396562476462, "language_loss": 0.77154273, "learning_rate": 4.6120033703744355e-07, "loss": 0.7928766, "num_input_tokens_seen": 140910780, "step": 6537, "time_per_iteration": 2.513505697250366 }, { "auxiliary_loss_clip": 0.01126414, "auxiliary_loss_mlp": 0.0102377, "balance_loss_clip": 1.04220009, "balance_loss_mlp": 1.01680577, "epoch": 0.7861480189983767, "flos": 26396174557440.0, "grad_norm": 1.7797252122112697, "language_loss": 0.78226602, "learning_rate": 4.607028720076822e-07, "loss": 0.80376792, "num_input_tokens_seen": 140927460, "step": 6538, "time_per_iteration": 2.5055649280548096 }, { "auxiliary_loss_clip": 0.01152233, "auxiliary_loss_mlp": 0.01024347, "balance_loss_clip": 1.04643404, "balance_loss_mlp": 1.0172925, "epoch": 0.7862682618890158, "flos": 24236578177920.0, "grad_norm": 1.882501517667879, "language_loss": 0.73534894, "learning_rate": 4.6020564048807074e-07, "loss": 0.75711471, "num_input_tokens_seen": 140945135, "step": 6539, "time_per_iteration": 2.4453020095825195 }, { "auxiliary_loss_clip": 0.01156743, "auxiliary_loss_mlp": 0.01024812, "balance_loss_clip": 1.04839015, "balance_loss_mlp": 1.01742077, "epoch": 0.7863885047796549, "flos": 47551508259840.0, "grad_norm": 2.340499658063534, "language_loss": 0.71783292, "learning_rate": 4.5970864255403883e-07, "loss": 0.73964846, "num_input_tokens_seen": 140966660, "step": 6540, "time_per_iteration": 2.655489444732666 }, { "auxiliary_loss_clip": 0.01140207, "auxiliary_loss_mlp": 0.01022249, "balance_loss_clip": 1.04365444, "balance_loss_mlp": 1.01574016, "epoch": 0.786508747670294, "flos": 24389234979840.0, "grad_norm": 1.8667333097532215, "language_loss": 0.81985426, "learning_rate": 4.59211878280982e-07, "loss": 0.84147882, "num_input_tokens_seen": 140986175, "step": 6541, "time_per_iteration": 2.4602620601654053 }, { "auxiliary_loss_clip": 0.01138426, "auxiliary_loss_mlp": 0.01021688, "balance_loss_clip": 1.04481506, "balance_loss_mlp": 1.0146513, "epoch": 0.786628990560933, "flos": 18041234238720.0, "grad_norm": 2.638287957537487, "language_loss": 0.70001912, "learning_rate": 4.587153477442578e-07, "loss": 0.72162032, "num_input_tokens_seen": 141002490, "step": 6542, "time_per_iteration": 2.4550085067749023 }, { "auxiliary_loss_clip": 0.01169613, "auxiliary_loss_mlp": 0.01025984, "balance_loss_clip": 1.04906929, "balance_loss_mlp": 1.01834321, "epoch": 0.7867492334515722, "flos": 25848860048640.0, "grad_norm": 3.045472342088166, "language_loss": 0.81581938, "learning_rate": 4.582190510191899e-07, "loss": 0.83777535, "num_input_tokens_seen": 141021150, "step": 6543, "time_per_iteration": 2.4252097606658936 }, { "auxiliary_loss_clip": 0.01121494, "auxiliary_loss_mlp": 0.01024218, "balance_loss_clip": 1.04519296, "balance_loss_mlp": 1.01729512, "epoch": 0.7868694763422113, "flos": 16580819070720.0, "grad_norm": 2.0068830520879373, "language_loss": 0.87285471, "learning_rate": 4.5772298818106625e-07, "loss": 0.89431179, "num_input_tokens_seen": 141036940, "step": 6544, "time_per_iteration": 2.4546384811401367 }, { "auxiliary_loss_clip": 0.01133911, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.04806626, "balance_loss_mlp": 1.01776505, "epoch": 0.7869897192328503, "flos": 29386276272000.0, "grad_norm": 2.2657619764442587, "language_loss": 0.71594775, "learning_rate": 4.572271593051384e-07, "loss": 0.73753911, "num_input_tokens_seen": 141054295, "step": 6545, "time_per_iteration": 2.540421962738037 }, { "auxiliary_loss_clip": 0.01105133, "auxiliary_loss_mlp": 0.0102248, "balance_loss_clip": 1.04405975, "balance_loss_mlp": 1.01569057, "epoch": 0.7871099621234895, "flos": 17128923678720.0, "grad_norm": 1.7652684842074522, "language_loss": 0.78288555, "learning_rate": 4.567315644666245e-07, "loss": 0.80416167, "num_input_tokens_seen": 141073090, "step": 6546, "time_per_iteration": 2.508835554122925 }, { "auxiliary_loss_clip": 0.01120778, "auxiliary_loss_mlp": 0.01020691, "balance_loss_clip": 1.04563427, "balance_loss_mlp": 1.01372063, "epoch": 0.7872302050141285, "flos": 23440187784960.0, "grad_norm": 1.9833915426105033, "language_loss": 0.84761536, "learning_rate": 4.5623620374070507e-07, "loss": 0.86903006, "num_input_tokens_seen": 141092405, "step": 6547, "time_per_iteration": 2.4985992908477783 }, { "auxiliary_loss_clip": 0.01032081, "auxiliary_loss_mlp": 0.01001732, "balance_loss_clip": 1.00973082, "balance_loss_mlp": 1.00065935, "epoch": 0.7873504479047676, "flos": 65959752689280.0, "grad_norm": 0.7646426487743536, "language_loss": 0.58440018, "learning_rate": 4.557410772025263e-07, "loss": 0.60473835, "num_input_tokens_seen": 141154355, "step": 6548, "time_per_iteration": 3.1714861392974854 }, { "auxiliary_loss_clip": 0.0113385, "auxiliary_loss_mlp": 0.01026978, "balance_loss_clip": 1.04336679, "balance_loss_mlp": 1.01981091, "epoch": 0.7874706907954068, "flos": 23258336204160.0, "grad_norm": 2.0820165254770107, "language_loss": 0.66226792, "learning_rate": 4.5524618492719803e-07, "loss": 0.68387616, "num_input_tokens_seen": 141173575, "step": 6549, "time_per_iteration": 3.360734462738037 }, { "auxiliary_loss_clip": 0.01151419, "auxiliary_loss_mlp": 0.01022071, "balance_loss_clip": 1.04533255, "balance_loss_mlp": 1.01524878, "epoch": 0.7875909336860458, "flos": 28767786963840.0, "grad_norm": 1.6711485690248198, "language_loss": 0.79356635, "learning_rate": 4.54751526989795e-07, "loss": 0.8153013, "num_input_tokens_seen": 141195415, "step": 6550, "time_per_iteration": 3.2458090782165527 }, { "auxiliary_loss_clip": 0.01154321, "auxiliary_loss_mlp": 0.01024548, "balance_loss_clip": 1.04614449, "balance_loss_mlp": 1.01723158, "epoch": 0.7877111765766849, "flos": 18697286194560.0, "grad_norm": 2.0306043773456373, "language_loss": 0.79386109, "learning_rate": 4.5425710346535775e-07, "loss": 0.81564975, "num_input_tokens_seen": 141213360, "step": 6551, "time_per_iteration": 2.397141456604004 }, { "auxiliary_loss_clip": 0.01153103, "auxiliary_loss_mlp": 0.01024801, "balance_loss_clip": 1.04562199, "balance_loss_mlp": 1.01686788, "epoch": 0.787831419467324, "flos": 27592968833280.0, "grad_norm": 1.9140507309289247, "language_loss": 0.81705105, "learning_rate": 4.537629144288877e-07, "loss": 0.83883017, "num_input_tokens_seen": 141230815, "step": 6552, "time_per_iteration": 3.270242929458618 }, { "auxiliary_loss_clip": 0.01115908, "auxiliary_loss_mlp": 0.01025015, "balance_loss_clip": 1.0404191, "balance_loss_mlp": 1.01798737, "epoch": 0.7879516623579631, "flos": 18150187167360.0, "grad_norm": 2.170461918890001, "language_loss": 0.75071132, "learning_rate": 4.5326895995535477e-07, "loss": 0.77212059, "num_input_tokens_seen": 141249715, "step": 6553, "time_per_iteration": 2.5500857830047607 }, { "auxiliary_loss_clip": 0.01147228, "auxiliary_loss_mlp": 0.01023523, "balance_loss_clip": 1.0442307, "balance_loss_mlp": 1.01643324, "epoch": 0.7880719052486022, "flos": 20339193807360.0, "grad_norm": 2.3007570624229223, "language_loss": 0.84597278, "learning_rate": 4.527752401196907e-07, "loss": 0.86768031, "num_input_tokens_seen": 141267730, "step": 6554, "time_per_iteration": 2.426267385482788 }, { "auxiliary_loss_clip": 0.01133772, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.04474401, "balance_loss_mlp": 1.01949465, "epoch": 0.7881921481392413, "flos": 21653237053440.0, "grad_norm": 1.7210816373088333, "language_loss": 0.66440421, "learning_rate": 4.5228175499679254e-07, "loss": 0.6860112, "num_input_tokens_seen": 141287315, "step": 6555, "time_per_iteration": 3.2407355308532715 }, { "auxiliary_loss_clip": 0.01055194, "auxiliary_loss_mlp": 0.01001301, "balance_loss_clip": 1.01077652, "balance_loss_mlp": 1.00024033, "epoch": 0.7883123910298804, "flos": 68565860058240.0, "grad_norm": 0.8151351645725482, "language_loss": 0.54515839, "learning_rate": 4.5178850466152174e-07, "loss": 0.5657233, "num_input_tokens_seen": 141346145, "step": 6556, "time_per_iteration": 3.0649099349975586 }, { "auxiliary_loss_clip": 0.0113277, "auxiliary_loss_mlp": 0.01023415, "balance_loss_clip": 1.04187489, "balance_loss_mlp": 1.01644158, "epoch": 0.7884326339205194, "flos": 19318217627520.0, "grad_norm": 1.7963096579371622, "language_loss": 0.81914318, "learning_rate": 4.512954891887031e-07, "loss": 0.84070504, "num_input_tokens_seen": 141364445, "step": 6557, "time_per_iteration": 2.4567461013793945 }, { "auxiliary_loss_clip": 0.01136199, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.04742813, "balance_loss_mlp": 1.02181304, "epoch": 0.7885528768111585, "flos": 17784903807360.0, "grad_norm": 2.2733152396122227, "language_loss": 0.83459806, "learning_rate": 4.5080270865312806e-07, "loss": 0.85625732, "num_input_tokens_seen": 141381640, "step": 6558, "time_per_iteration": 2.43115496635437 }, { "auxiliary_loss_clip": 0.01150972, "auxiliary_loss_mlp": 0.01022017, "balance_loss_clip": 1.04605997, "balance_loss_mlp": 1.01529968, "epoch": 0.7886731197017977, "flos": 18807639753600.0, "grad_norm": 2.0857913693067376, "language_loss": 0.71319795, "learning_rate": 4.5031016312954985e-07, "loss": 0.73492777, "num_input_tokens_seen": 141399955, "step": 6559, "time_per_iteration": 2.4247934818267822 }, { "auxiliary_loss_clip": 0.01161901, "auxiliary_loss_mlp": 0.01023811, "balance_loss_clip": 1.04975438, "balance_loss_mlp": 1.0168258, "epoch": 0.7887933625924367, "flos": 33365358126720.0, "grad_norm": 4.1630067051630775, "language_loss": 0.74909467, "learning_rate": 4.498178526926886e-07, "loss": 0.77095181, "num_input_tokens_seen": 141420820, "step": 6560, "time_per_iteration": 2.553723096847534 }, { "auxiliary_loss_clip": 0.0116526, "auxiliary_loss_mlp": 0.01027075, "balance_loss_clip": 1.04810011, "balance_loss_mlp": 1.02041388, "epoch": 0.7889136054830758, "flos": 17019360218880.0, "grad_norm": 2.758014043534276, "language_loss": 0.72061914, "learning_rate": 4.4932577741722635e-07, "loss": 0.74254251, "num_input_tokens_seen": 141439350, "step": 6561, "time_per_iteration": 2.380889654159546 }, { "auxiliary_loss_clip": 0.01134743, "auxiliary_loss_mlp": 0.01028906, "balance_loss_clip": 1.04419732, "balance_loss_mlp": 1.02115107, "epoch": 0.7890338483737149, "flos": 29424629018880.0, "grad_norm": 1.9559362193656162, "language_loss": 0.74352396, "learning_rate": 4.4883393737780985e-07, "loss": 0.76516044, "num_input_tokens_seen": 141460300, "step": 6562, "time_per_iteration": 2.524661064147949 }, { "auxiliary_loss_clip": 0.01146446, "auxiliary_loss_mlp": 0.01024985, "balance_loss_clip": 1.04456294, "balance_loss_mlp": 1.01792789, "epoch": 0.789154091264354, "flos": 19971576063360.0, "grad_norm": 1.922886281588387, "language_loss": 0.7861793, "learning_rate": 4.4834233264905254e-07, "loss": 0.80789363, "num_input_tokens_seen": 141477315, "step": 6563, "time_per_iteration": 2.416424512863159 }, { "auxiliary_loss_clip": 0.01116282, "auxiliary_loss_mlp": 0.0102847, "balance_loss_clip": 1.04081166, "balance_loss_mlp": 1.0210191, "epoch": 0.789274334154993, "flos": 14537825216640.0, "grad_norm": 2.6226564838199424, "language_loss": 0.7143079, "learning_rate": 4.478509633055294e-07, "loss": 0.73575538, "num_input_tokens_seen": 141495025, "step": 6564, "time_per_iteration": 2.471700668334961 }, { "auxiliary_loss_clip": 0.01140841, "auxiliary_loss_mlp": 0.01028025, "balance_loss_clip": 1.04515147, "balance_loss_mlp": 1.0203836, "epoch": 0.7893945770456322, "flos": 21827403123840.0, "grad_norm": 3.3680339608858048, "language_loss": 0.79994029, "learning_rate": 4.473598294217813e-07, "loss": 0.82162899, "num_input_tokens_seen": 141510450, "step": 6565, "time_per_iteration": 2.4578559398651123 }, { "auxiliary_loss_clip": 0.0115129, "auxiliary_loss_mlp": 0.01022398, "balance_loss_clip": 1.04835749, "balance_loss_mlp": 1.01551068, "epoch": 0.7895148199362713, "flos": 20740639184640.0, "grad_norm": 2.0752877753709598, "language_loss": 0.71948761, "learning_rate": 4.468689310723124e-07, "loss": 0.74122441, "num_input_tokens_seen": 141528265, "step": 6566, "time_per_iteration": 2.4193062782287598 }, { "auxiliary_loss_clip": 0.01127121, "auxiliary_loss_mlp": 0.01024212, "balance_loss_clip": 1.04337358, "balance_loss_mlp": 1.01701736, "epoch": 0.7896350628269103, "flos": 16690669839360.0, "grad_norm": 1.68254047823207, "language_loss": 0.78595316, "learning_rate": 4.463782683315913e-07, "loss": 0.80746651, "num_input_tokens_seen": 141547270, "step": 6567, "time_per_iteration": 2.4683709144592285 }, { "auxiliary_loss_clip": 0.0116324, "auxiliary_loss_mlp": 0.0102462, "balance_loss_clip": 1.04682946, "balance_loss_mlp": 1.01782513, "epoch": 0.7897553057175495, "flos": 22638374438400.0, "grad_norm": 1.7791749198296285, "language_loss": 0.73169696, "learning_rate": 4.458878412740523e-07, "loss": 0.75357556, "num_input_tokens_seen": 141566050, "step": 6568, "time_per_iteration": 2.400585651397705 }, { "auxiliary_loss_clip": 0.01147715, "auxiliary_loss_mlp": 0.01023001, "balance_loss_clip": 1.04556549, "balance_loss_mlp": 1.0158428, "epoch": 0.7898755486081885, "flos": 14537573821440.0, "grad_norm": 2.809623088500836, "language_loss": 0.77538687, "learning_rate": 4.453976499740919e-07, "loss": 0.79709399, "num_input_tokens_seen": 141583695, "step": 6569, "time_per_iteration": 2.3946003913879395 }, { "auxiliary_loss_clip": 0.01148466, "auxiliary_loss_mlp": 0.01021776, "balance_loss_clip": 1.04703081, "balance_loss_mlp": 1.01494503, "epoch": 0.7899957914988276, "flos": 17238487138560.0, "grad_norm": 1.6439343101453636, "language_loss": 0.77846432, "learning_rate": 4.4490769450607215e-07, "loss": 0.80016673, "num_input_tokens_seen": 141601320, "step": 6570, "time_per_iteration": 2.4071273803710938 }, { "auxiliary_loss_clip": 0.01120218, "auxiliary_loss_mlp": 0.0102223, "balance_loss_clip": 1.03963912, "balance_loss_mlp": 1.01531291, "epoch": 0.7901160343894668, "flos": 41279351086080.0, "grad_norm": 1.8300896281009391, "language_loss": 0.72620595, "learning_rate": 4.4441797494431845e-07, "loss": 0.74763042, "num_input_tokens_seen": 141623125, "step": 6571, "time_per_iteration": 2.662055253982544 }, { "auxiliary_loss_clip": 0.01149012, "auxiliary_loss_mlp": 0.01024046, "balance_loss_clip": 1.04602754, "balance_loss_mlp": 1.01655662, "epoch": 0.7902362772801058, "flos": 16837005847680.0, "grad_norm": 1.9374060795423802, "language_loss": 0.7782433, "learning_rate": 4.439284913631207e-07, "loss": 0.79997391, "num_input_tokens_seen": 141640335, "step": 6572, "time_per_iteration": 2.397940158843994 }, { "auxiliary_loss_clip": 0.01127686, "auxiliary_loss_mlp": 0.01029301, "balance_loss_clip": 1.04582214, "balance_loss_mlp": 1.02207732, "epoch": 0.7903565201707449, "flos": 27125987091840.0, "grad_norm": 2.702596699673583, "language_loss": 0.83565795, "learning_rate": 4.434392438367347e-07, "loss": 0.8572278, "num_input_tokens_seen": 141659760, "step": 6573, "time_per_iteration": 2.531533718109131 }, { "auxiliary_loss_clip": 0.0115622, "auxiliary_loss_mlp": 0.0102143, "balance_loss_clip": 1.04528213, "balance_loss_mlp": 1.01421762, "epoch": 0.790476763061384, "flos": 31025167142400.0, "grad_norm": 2.966522421635397, "language_loss": 0.74172604, "learning_rate": 4.4295023243937677e-07, "loss": 0.76350248, "num_input_tokens_seen": 141679965, "step": 6574, "time_per_iteration": 2.5097243785858154 }, { "auxiliary_loss_clip": 0.01157439, "auxiliary_loss_mlp": 0.01027073, "balance_loss_clip": 1.05093598, "balance_loss_mlp": 1.01953292, "epoch": 0.7905970059520231, "flos": 22089084681600.0, "grad_norm": 1.7204156251130005, "language_loss": 0.80530035, "learning_rate": 4.4246145724523123e-07, "loss": 0.82714558, "num_input_tokens_seen": 141697710, "step": 6575, "time_per_iteration": 2.4417030811309814 }, { "auxiliary_loss_clip": 0.01122534, "auxiliary_loss_mlp": 0.01023488, "balance_loss_clip": 1.04502881, "balance_loss_mlp": 1.01640725, "epoch": 0.7907172488426621, "flos": 20558141159040.0, "grad_norm": 2.096361457670146, "language_loss": 0.77651459, "learning_rate": 4.41972918328444e-07, "loss": 0.79797482, "num_input_tokens_seen": 141715145, "step": 6576, "time_per_iteration": 3.3165934085845947 }, { "auxiliary_loss_clip": 0.01149275, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.0459249, "balance_loss_mlp": 1.02227449, "epoch": 0.7908374917333013, "flos": 30081542901120.0, "grad_norm": 3.214441896016704, "language_loss": 0.7731806, "learning_rate": 4.4148461576312646e-07, "loss": 0.79496992, "num_input_tokens_seen": 141734810, "step": 6577, "time_per_iteration": 3.2404873371124268 }, { "auxiliary_loss_clip": 0.01153834, "auxiliary_loss_mlp": 0.01022512, "balance_loss_clip": 1.0492245, "balance_loss_mlp": 1.01559448, "epoch": 0.7909577346239404, "flos": 20996359084800.0, "grad_norm": 1.3785762635557686, "language_loss": 0.74699754, "learning_rate": 4.4099654962335343e-07, "loss": 0.76876098, "num_input_tokens_seen": 141755260, "step": 6578, "time_per_iteration": 3.3011975288391113 }, { "auxiliary_loss_clip": 0.01145444, "auxiliary_loss_mlp": 0.0102379, "balance_loss_clip": 1.04795718, "balance_loss_mlp": 1.01679218, "epoch": 0.7910779775145794, "flos": 26247935128320.0, "grad_norm": 1.8128132770653769, "language_loss": 0.75138372, "learning_rate": 4.405087199831636e-07, "loss": 0.77307606, "num_input_tokens_seen": 141775500, "step": 6579, "time_per_iteration": 2.5055079460144043 }, { "auxiliary_loss_clip": 0.01140659, "auxiliary_loss_mlp": 0.00761408, "balance_loss_clip": 1.04465985, "balance_loss_mlp": 1.00035143, "epoch": 0.7911982204052186, "flos": 22564434291840.0, "grad_norm": 2.102353587808442, "language_loss": 0.67287648, "learning_rate": 4.400211269165619e-07, "loss": 0.69189715, "num_input_tokens_seen": 141791955, "step": 6580, "time_per_iteration": 2.4734010696411133 }, { "auxiliary_loss_clip": 0.01171218, "auxiliary_loss_mlp": 0.01023561, "balance_loss_clip": 1.05281544, "balance_loss_mlp": 1.01724887, "epoch": 0.7913184632958576, "flos": 23112538899840.0, "grad_norm": 1.4725045682308275, "language_loss": 0.7670927, "learning_rate": 4.3953377049751416e-07, "loss": 0.78904045, "num_input_tokens_seen": 141812380, "step": 6581, "time_per_iteration": 3.1848576068878174 }, { "auxiliary_loss_clip": 0.01143829, "auxiliary_loss_mlp": 0.01025948, "balance_loss_clip": 1.04724455, "balance_loss_mlp": 1.01895678, "epoch": 0.7914387061864967, "flos": 12311758719360.0, "grad_norm": 2.777868849081666, "language_loss": 0.77703995, "learning_rate": 4.390466507999537e-07, "loss": 0.79873765, "num_input_tokens_seen": 141828130, "step": 6582, "time_per_iteration": 2.41615629196167 }, { "auxiliary_loss_clip": 0.01120402, "auxiliary_loss_mlp": 0.01028229, "balance_loss_clip": 1.04160357, "balance_loss_mlp": 1.0213058, "epoch": 0.7915589490771359, "flos": 17603267708160.0, "grad_norm": 2.2040057104361357, "language_loss": 0.75313091, "learning_rate": 4.385597678977748e-07, "loss": 0.7746172, "num_input_tokens_seen": 141846965, "step": 6583, "time_per_iteration": 2.477984666824341 }, { "auxiliary_loss_clip": 0.01138378, "auxiliary_loss_mlp": 0.01021882, "balance_loss_clip": 1.04486108, "balance_loss_mlp": 1.01464927, "epoch": 0.7916791919677749, "flos": 25591272641280.0, "grad_norm": 1.8946348671589661, "language_loss": 0.75569862, "learning_rate": 4.3807312186483726e-07, "loss": 0.77730131, "num_input_tokens_seen": 141867685, "step": 6584, "time_per_iteration": 2.496858596801758 }, { "auxiliary_loss_clip": 0.0114917, "auxiliary_loss_mlp": 0.01023636, "balance_loss_clip": 1.04819489, "balance_loss_mlp": 1.01640296, "epoch": 0.791799434858414, "flos": 18844340474880.0, "grad_norm": 2.648147981521643, "language_loss": 0.78432012, "learning_rate": 4.375867127749655e-07, "loss": 0.80604827, "num_input_tokens_seen": 141885960, "step": 6585, "time_per_iteration": 2.4113566875457764 }, { "auxiliary_loss_clip": 0.01128374, "auxiliary_loss_mlp": 0.01025166, "balance_loss_clip": 1.04844689, "balance_loss_mlp": 1.01781094, "epoch": 0.7919196777490531, "flos": 25812015672960.0, "grad_norm": 2.411423776021973, "language_loss": 0.6719209, "learning_rate": 4.3710054070194744e-07, "loss": 0.69345629, "num_input_tokens_seen": 141905655, "step": 6586, "time_per_iteration": 2.531641721725464 }, { "auxiliary_loss_clip": 0.01168053, "auxiliary_loss_mlp": 0.00761989, "balance_loss_clip": 1.04852152, "balance_loss_mlp": 1.0004313, "epoch": 0.7920399206396922, "flos": 11947624594560.0, "grad_norm": 2.8417938874174347, "language_loss": 0.66800284, "learning_rate": 4.3661460571953455e-07, "loss": 0.68730319, "num_input_tokens_seen": 141922390, "step": 6587, "time_per_iteration": 2.3614377975463867 }, { "auxiliary_loss_clip": 0.011509, "auxiliary_loss_mlp": 0.01020981, "balance_loss_clip": 1.04291511, "balance_loss_mlp": 1.01420724, "epoch": 0.7921601635303313, "flos": 21579907438080.0, "grad_norm": 1.5125766647095769, "language_loss": 0.68470734, "learning_rate": 4.36128907901443e-07, "loss": 0.70642608, "num_input_tokens_seen": 141941985, "step": 6588, "time_per_iteration": 2.4317331314086914 }, { "auxiliary_loss_clip": 0.0112554, "auxiliary_loss_mlp": 0.0102264, "balance_loss_clip": 1.04207182, "balance_loss_mlp": 1.01519573, "epoch": 0.7922804064209703, "flos": 18113989236480.0, "grad_norm": 3.0670378583880904, "language_loss": 0.72801197, "learning_rate": 4.356434473213519e-07, "loss": 0.74949372, "num_input_tokens_seen": 141959435, "step": 6589, "time_per_iteration": 2.463629961013794 }, { "auxiliary_loss_clip": 0.01135478, "auxiliary_loss_mlp": 0.01023051, "balance_loss_clip": 1.04511428, "balance_loss_mlp": 1.01650667, "epoch": 0.7924006493116095, "flos": 21652806090240.0, "grad_norm": 1.6623448842462079, "language_loss": 0.79864168, "learning_rate": 4.351582240529068e-07, "loss": 0.82022691, "num_input_tokens_seen": 141980265, "step": 6590, "time_per_iteration": 2.4719135761260986 }, { "auxiliary_loss_clip": 0.01048226, "auxiliary_loss_mlp": 0.0100223, "balance_loss_clip": 1.01133204, "balance_loss_mlp": 1.00105023, "epoch": 0.7925208922022485, "flos": 64242755694720.0, "grad_norm": 0.8190078717215722, "language_loss": 0.58210039, "learning_rate": 4.346732381697149e-07, "loss": 0.60260493, "num_input_tokens_seen": 142044395, "step": 6591, "time_per_iteration": 3.0929298400878906 }, { "auxiliary_loss_clip": 0.01131311, "auxiliary_loss_mlp": 0.01025845, "balance_loss_clip": 1.0438385, "balance_loss_mlp": 1.01848435, "epoch": 0.7926411350928876, "flos": 16941541403520.0, "grad_norm": 2.0617446319843036, "language_loss": 0.81260121, "learning_rate": 4.3418848974534825e-07, "loss": 0.83417273, "num_input_tokens_seen": 142061335, "step": 6592, "time_per_iteration": 2.4242377281188965 }, { "auxiliary_loss_clip": 0.01129228, "auxiliary_loss_mlp": 0.01023377, "balance_loss_clip": 1.04533362, "balance_loss_mlp": 1.01690149, "epoch": 0.7927613779835267, "flos": 34459987144320.0, "grad_norm": 1.7948372383114815, "language_loss": 0.68892765, "learning_rate": 4.3370397885334276e-07, "loss": 0.71045375, "num_input_tokens_seen": 142081965, "step": 6593, "time_per_iteration": 2.5983049869537354 }, { "auxiliary_loss_clip": 0.01147252, "auxiliary_loss_mlp": 0.01028533, "balance_loss_clip": 1.04742765, "balance_loss_mlp": 1.02138102, "epoch": 0.7928816208741658, "flos": 18951174501120.0, "grad_norm": 1.8713264344573266, "language_loss": 0.75638378, "learning_rate": 4.3321970556719777e-07, "loss": 0.77814162, "num_input_tokens_seen": 142100260, "step": 6594, "time_per_iteration": 2.4202566146850586 }, { "auxiliary_loss_clip": 0.01166175, "auxiliary_loss_mlp": 0.01027012, "balance_loss_clip": 1.04855394, "balance_loss_mlp": 1.01957309, "epoch": 0.7930018637648049, "flos": 18623022825600.0, "grad_norm": 2.6766589335515127, "language_loss": 0.718279, "learning_rate": 4.3273566996037856e-07, "loss": 0.74021089, "num_input_tokens_seen": 142116955, "step": 6595, "time_per_iteration": 2.3635802268981934 }, { "auxiliary_loss_clip": 0.01137154, "auxiliary_loss_mlp": 0.01025349, "balance_loss_clip": 1.04545259, "balance_loss_mlp": 1.01849699, "epoch": 0.793122106655444, "flos": 24530650824960.0, "grad_norm": 1.9060626493223316, "language_loss": 0.80084467, "learning_rate": 4.322518721063113e-07, "loss": 0.82246965, "num_input_tokens_seen": 142135505, "step": 6596, "time_per_iteration": 2.4905996322631836 }, { "auxiliary_loss_clip": 0.01152518, "auxiliary_loss_mlp": 0.01026542, "balance_loss_clip": 1.04818416, "balance_loss_mlp": 1.01936543, "epoch": 0.7932423495460831, "flos": 34421203434240.0, "grad_norm": 2.032777346926402, "language_loss": 0.70500243, "learning_rate": 4.3176831207838906e-07, "loss": 0.72679299, "num_input_tokens_seen": 142158915, "step": 6597, "time_per_iteration": 2.5546207427978516 }, { "auxiliary_loss_clip": 0.01151181, "auxiliary_loss_mlp": 0.01022238, "balance_loss_clip": 1.04924536, "balance_loss_mlp": 1.01523197, "epoch": 0.7933625924367221, "flos": 26980333441920.0, "grad_norm": 2.064778182947546, "language_loss": 0.74736714, "learning_rate": 4.3128498994996685e-07, "loss": 0.76910138, "num_input_tokens_seen": 142178390, "step": 6598, "time_per_iteration": 2.48626971244812 }, { "auxiliary_loss_clip": 0.0115717, "auxiliary_loss_mlp": 0.01026676, "balance_loss_clip": 1.04825103, "balance_loss_mlp": 1.01898718, "epoch": 0.7934828353273613, "flos": 29568630643200.0, "grad_norm": 2.188865164993516, "language_loss": 0.71513712, "learning_rate": 4.308019057943646e-07, "loss": 0.73697561, "num_input_tokens_seen": 142200115, "step": 6599, "time_per_iteration": 2.5224437713623047 }, { "auxiliary_loss_clip": 0.0111875, "auxiliary_loss_mlp": 0.01027022, "balance_loss_clip": 1.0437125, "balance_loss_mlp": 1.02027774, "epoch": 0.7936030782180004, "flos": 28615381557120.0, "grad_norm": 1.566567151418977, "language_loss": 0.74606913, "learning_rate": 4.3031905968486535e-07, "loss": 0.76752687, "num_input_tokens_seen": 142220945, "step": 6600, "time_per_iteration": 2.617225408554077 }, { "auxiliary_loss_clip": 0.01106504, "auxiliary_loss_mlp": 0.01022823, "balance_loss_clip": 1.04422569, "balance_loss_mlp": 1.01592636, "epoch": 0.7937233211086394, "flos": 16392574869120.0, "grad_norm": 2.4639726675608062, "language_loss": 0.68404102, "learning_rate": 4.298364516947162e-07, "loss": 0.70533431, "num_input_tokens_seen": 142238175, "step": 6601, "time_per_iteration": 2.509373903274536 }, { "auxiliary_loss_clip": 0.01105193, "auxiliary_loss_mlp": 0.01021577, "balance_loss_clip": 1.0403955, "balance_loss_mlp": 1.01476693, "epoch": 0.7938435639992786, "flos": 22013420682240.0, "grad_norm": 1.8733447522045494, "language_loss": 0.65741765, "learning_rate": 4.293540818971295e-07, "loss": 0.67868537, "num_input_tokens_seen": 142255980, "step": 6602, "time_per_iteration": 3.3912975788116455 }, { "auxiliary_loss_clip": 0.01156994, "auxiliary_loss_mlp": 0.01018569, "balance_loss_clip": 1.04711223, "balance_loss_mlp": 1.01138937, "epoch": 0.7939638068899176, "flos": 22197032029440.0, "grad_norm": 2.044910526542651, "language_loss": 0.76782644, "learning_rate": 4.2887195036527934e-07, "loss": 0.78958201, "num_input_tokens_seen": 142274785, "step": 6603, "time_per_iteration": 3.1848556995391846 }, { "auxiliary_loss_clip": 0.01141112, "auxiliary_loss_mlp": 0.01022265, "balance_loss_clip": 1.04158592, "balance_loss_mlp": 1.01513028, "epoch": 0.7940840497805567, "flos": 17745186343680.0, "grad_norm": 3.092064818015703, "language_loss": 0.73251331, "learning_rate": 4.28390057172306e-07, "loss": 0.75414705, "num_input_tokens_seen": 142291290, "step": 6604, "time_per_iteration": 2.3862674236297607 }, { "auxiliary_loss_clip": 0.01118427, "auxiliary_loss_mlp": 0.01026807, "balance_loss_clip": 1.04036212, "balance_loss_mlp": 1.01908207, "epoch": 0.7942042926711959, "flos": 23805435231360.0, "grad_norm": 2.348865616828976, "language_loss": 0.72159493, "learning_rate": 4.279084023913111e-07, "loss": 0.74304724, "num_input_tokens_seen": 142309165, "step": 6605, "time_per_iteration": 3.351757049560547 }, { "auxiliary_loss_clip": 0.01151842, "auxiliary_loss_mlp": 0.01023228, "balance_loss_clip": 1.04752648, "balance_loss_mlp": 1.01621902, "epoch": 0.7943245355618349, "flos": 19244959839360.0, "grad_norm": 1.8509945064574276, "language_loss": 0.69608033, "learning_rate": 4.2742698609536096e-07, "loss": 0.71783102, "num_input_tokens_seen": 142327475, "step": 6606, "time_per_iteration": 2.411729574203491 }, { "auxiliary_loss_clip": 0.01141759, "auxiliary_loss_mlp": 0.01026158, "balance_loss_clip": 1.04604197, "balance_loss_mlp": 1.01916051, "epoch": 0.794444778452474, "flos": 25007616547200.0, "grad_norm": 1.8525007965634, "language_loss": 0.78784835, "learning_rate": 4.2694580835748706e-07, "loss": 0.80952752, "num_input_tokens_seen": 142347335, "step": 6607, "time_per_iteration": 2.507347583770752 }, { "auxiliary_loss_clip": 0.01137214, "auxiliary_loss_mlp": 0.01024264, "balance_loss_clip": 1.04394722, "balance_loss_mlp": 1.01677167, "epoch": 0.7945650213431131, "flos": 23221491828480.0, "grad_norm": 2.2480442197891364, "language_loss": 0.74014509, "learning_rate": 4.264648692506836e-07, "loss": 0.76175988, "num_input_tokens_seen": 142366125, "step": 6608, "time_per_iteration": 3.192314624786377 }, { "auxiliary_loss_clip": 0.01130992, "auxiliary_loss_mlp": 0.01025439, "balance_loss_clip": 1.04236984, "balance_loss_mlp": 1.01782703, "epoch": 0.7946852642337522, "flos": 26062887237120.0, "grad_norm": 1.8070844801424428, "language_loss": 0.72065431, "learning_rate": 4.2598416884790824e-07, "loss": 0.74221861, "num_input_tokens_seen": 142385175, "step": 6609, "time_per_iteration": 2.48479962348938 }, { "auxiliary_loss_clip": 0.0114658, "auxiliary_loss_mlp": 0.01022373, "balance_loss_clip": 1.04435027, "balance_loss_mlp": 1.0144453, "epoch": 0.7948055071243912, "flos": 23769704177280.0, "grad_norm": 1.8588221875126303, "language_loss": 0.81020772, "learning_rate": 4.255037072220828e-07, "loss": 0.83189726, "num_input_tokens_seen": 142406545, "step": 6610, "time_per_iteration": 2.4791934490203857 }, { "auxiliary_loss_clip": 0.01163863, "auxiliary_loss_mlp": 0.01021872, "balance_loss_clip": 1.047575, "balance_loss_mlp": 1.01540196, "epoch": 0.7949257500150304, "flos": 21980814111360.0, "grad_norm": 1.6322089106261355, "language_loss": 0.72071111, "learning_rate": 4.2502348444609293e-07, "loss": 0.74256849, "num_input_tokens_seen": 142426165, "step": 6611, "time_per_iteration": 2.393824338912964 }, { "auxiliary_loss_clip": 0.01107092, "auxiliary_loss_mlp": 0.01024829, "balance_loss_clip": 1.03946376, "balance_loss_mlp": 1.01811194, "epoch": 0.7950459929056695, "flos": 25774129802880.0, "grad_norm": 2.0735636942778717, "language_loss": 0.69263506, "learning_rate": 4.2454350059278844e-07, "loss": 0.71395433, "num_input_tokens_seen": 142447225, "step": 6612, "time_per_iteration": 2.576458692550659 }, { "auxiliary_loss_clip": 0.0113145, "auxiliary_loss_mlp": 0.01022896, "balance_loss_clip": 1.04071593, "balance_loss_mlp": 1.01608574, "epoch": 0.7951662357963085, "flos": 22158068751360.0, "grad_norm": 1.8668637044677727, "language_loss": 0.84584939, "learning_rate": 4.240637557349824e-07, "loss": 0.8673929, "num_input_tokens_seen": 142464440, "step": 6613, "time_per_iteration": 2.454233169555664 }, { "auxiliary_loss_clip": 0.01125005, "auxiliary_loss_mlp": 0.01024335, "balance_loss_clip": 1.04366422, "balance_loss_mlp": 1.01713729, "epoch": 0.7952864786869477, "flos": 24641938137600.0, "grad_norm": 1.9281627515450201, "language_loss": 0.66895097, "learning_rate": 4.235842499454516e-07, "loss": 0.69044435, "num_input_tokens_seen": 142484355, "step": 6614, "time_per_iteration": 2.4768826961517334 }, { "auxiliary_loss_clip": 0.01139714, "auxiliary_loss_mlp": 0.01027459, "balance_loss_clip": 1.04618716, "balance_loss_mlp": 1.02038097, "epoch": 0.7954067215775867, "flos": 21830922656640.0, "grad_norm": 2.052894538503174, "language_loss": 0.83102804, "learning_rate": 4.2310498329693687e-07, "loss": 0.85269976, "num_input_tokens_seen": 142505255, "step": 6615, "time_per_iteration": 2.5034849643707275 }, { "auxiliary_loss_clip": 0.01155481, "auxiliary_loss_mlp": 0.01024506, "balance_loss_clip": 1.04739833, "balance_loss_mlp": 1.01675487, "epoch": 0.7955269644682258, "flos": 24060652341120.0, "grad_norm": 1.8714757143023533, "language_loss": 0.80673945, "learning_rate": 4.2262595586214164e-07, "loss": 0.82853931, "num_input_tokens_seen": 142526350, "step": 6616, "time_per_iteration": 2.4590518474578857 }, { "auxiliary_loss_clip": 0.0115553, "auxiliary_loss_mlp": 0.01026049, "balance_loss_clip": 1.04738343, "balance_loss_mlp": 1.01852143, "epoch": 0.795647207358865, "flos": 25010741030400.0, "grad_norm": 1.8369499267244507, "language_loss": 0.76683843, "learning_rate": 4.221471677137358e-07, "loss": 0.78865427, "num_input_tokens_seen": 142547165, "step": 6617, "time_per_iteration": 2.4743330478668213 }, { "auxiliary_loss_clip": 0.01126113, "auxiliary_loss_mlp": 0.01022762, "balance_loss_clip": 1.04303741, "balance_loss_mlp": 1.01626253, "epoch": 0.795767450249504, "flos": 14648358343680.0, "grad_norm": 1.8995220252151008, "language_loss": 0.70113754, "learning_rate": 4.216686189243492e-07, "loss": 0.72262627, "num_input_tokens_seen": 142565955, "step": 6618, "time_per_iteration": 2.4523582458496094 }, { "auxiliary_loss_clip": 0.01122303, "auxiliary_loss_mlp": 0.01021656, "balance_loss_clip": 1.04486299, "balance_loss_mlp": 1.01412189, "epoch": 0.7958876931401431, "flos": 18547897530240.0, "grad_norm": 1.8052228019774563, "language_loss": 0.72656584, "learning_rate": 4.211903095665785e-07, "loss": 0.74800545, "num_input_tokens_seen": 142585340, "step": 6619, "time_per_iteration": 2.480369806289673 }, { "auxiliary_loss_clip": 0.01145166, "auxiliary_loss_mlp": 0.01028525, "balance_loss_clip": 1.04394758, "balance_loss_mlp": 1.02140558, "epoch": 0.7960079360307821, "flos": 21543960902400.0, "grad_norm": 1.917538964622712, "language_loss": 0.75246668, "learning_rate": 4.2071223971298277e-07, "loss": 0.77420354, "num_input_tokens_seen": 142602525, "step": 6620, "time_per_iteration": 2.462979555130005 }, { "auxiliary_loss_clip": 0.01152625, "auxiliary_loss_mlp": 0.01023628, "balance_loss_clip": 1.04575241, "balance_loss_mlp": 1.01584041, "epoch": 0.7961281789214213, "flos": 25481745095040.0, "grad_norm": 2.0093183304856908, "language_loss": 0.61201036, "learning_rate": 4.2023440943608433e-07, "loss": 0.63377297, "num_input_tokens_seen": 142622490, "step": 6621, "time_per_iteration": 2.4907608032226562 }, { "auxiliary_loss_clip": 0.011482, "auxiliary_loss_mlp": 0.0102103, "balance_loss_clip": 1.04299212, "balance_loss_mlp": 1.0142591, "epoch": 0.7962484218120603, "flos": 21944436612480.0, "grad_norm": 1.471355302036172, "language_loss": 0.77995121, "learning_rate": 4.1975681880837023e-07, "loss": 0.80164349, "num_input_tokens_seen": 142642495, "step": 6622, "time_per_iteration": 2.4658966064453125 }, { "auxiliary_loss_clip": 0.0111791, "auxiliary_loss_mlp": 0.01024361, "balance_loss_clip": 1.03971243, "balance_loss_mlp": 1.01717281, "epoch": 0.7963686647026994, "flos": 18876264687360.0, "grad_norm": 1.7483037580313359, "language_loss": 0.82218623, "learning_rate": 4.192794679022895e-07, "loss": 0.84360898, "num_input_tokens_seen": 142660820, "step": 6623, "time_per_iteration": 2.4928228855133057 }, { "auxiliary_loss_clip": 0.01152496, "auxiliary_loss_mlp": 0.01026655, "balance_loss_clip": 1.04489589, "balance_loss_mlp": 1.02009594, "epoch": 0.7964889075933386, "flos": 29716582763520.0, "grad_norm": 1.7625971478868678, "language_loss": 0.7220723, "learning_rate": 4.1880235679025743e-07, "loss": 0.74386382, "num_input_tokens_seen": 142680915, "step": 6624, "time_per_iteration": 2.5325610637664795 }, { "auxiliary_loss_clip": 0.01095573, "auxiliary_loss_mlp": 0.01031653, "balance_loss_clip": 1.03922272, "balance_loss_mlp": 1.02478313, "epoch": 0.7966091504839776, "flos": 29491458272640.0, "grad_norm": 1.9780163457741171, "language_loss": 0.63591146, "learning_rate": 4.1832548554464986e-07, "loss": 0.65718377, "num_input_tokens_seen": 142699210, "step": 6625, "time_per_iteration": 2.616722583770752 }, { "auxiliary_loss_clip": 0.01048986, "auxiliary_loss_mlp": 0.01002833, "balance_loss_clip": 1.0090121, "balance_loss_mlp": 1.00172412, "epoch": 0.7967293933746167, "flos": 67288697101440.0, "grad_norm": 0.7447212932907569, "language_loss": 0.58800209, "learning_rate": 4.178488542378098e-07, "loss": 0.60852027, "num_input_tokens_seen": 142756790, "step": 6626, "time_per_iteration": 2.9727652072906494 }, { "auxiliary_loss_clip": 0.01169683, "auxiliary_loss_mlp": 0.01025511, "balance_loss_clip": 1.04920316, "balance_loss_mlp": 1.01832235, "epoch": 0.7968496362652558, "flos": 25554679660800.0, "grad_norm": 1.6894785489034347, "language_loss": 0.89079356, "learning_rate": 4.173724629420401e-07, "loss": 0.91274548, "num_input_tokens_seen": 142778150, "step": 6627, "time_per_iteration": 2.4321272373199463 }, { "auxiliary_loss_clip": 0.01145601, "auxiliary_loss_mlp": 0.01026488, "balance_loss_clip": 1.04684985, "balance_loss_mlp": 1.01886463, "epoch": 0.7969698791558949, "flos": 14501088581760.0, "grad_norm": 2.6422234536385565, "language_loss": 0.68216497, "learning_rate": 4.168963117296087e-07, "loss": 0.70388585, "num_input_tokens_seen": 142795485, "step": 6628, "time_per_iteration": 2.4320638179779053 }, { "auxiliary_loss_clip": 0.01166011, "auxiliary_loss_mlp": 0.01021662, "balance_loss_clip": 1.04813099, "balance_loss_mlp": 1.01493895, "epoch": 0.797090122046534, "flos": 22127545169280.0, "grad_norm": 2.09721354716106, "language_loss": 0.76210493, "learning_rate": 4.1642040067274876e-07, "loss": 0.78398168, "num_input_tokens_seen": 142815155, "step": 6629, "time_per_iteration": 3.2515742778778076 }, { "auxiliary_loss_clip": 0.0114023, "auxiliary_loss_mlp": 0.01019083, "balance_loss_clip": 1.04439282, "balance_loss_mlp": 1.01215971, "epoch": 0.7972103649371731, "flos": 19897671830400.0, "grad_norm": 1.584353341051865, "language_loss": 0.72362602, "learning_rate": 4.1594472984365493e-07, "loss": 0.74521911, "num_input_tokens_seen": 142833840, "step": 6630, "time_per_iteration": 3.233954906463623 }, { "auxiliary_loss_clip": 0.01148549, "auxiliary_loss_mlp": 0.01027875, "balance_loss_clip": 1.04648972, "balance_loss_mlp": 1.02128029, "epoch": 0.7973306078278122, "flos": 36058621847040.0, "grad_norm": 1.9469795785085844, "language_loss": 0.77993846, "learning_rate": 4.154692993144862e-07, "loss": 0.80170268, "num_input_tokens_seen": 142853610, "step": 6631, "time_per_iteration": 3.4039735794067383 }, { "auxiliary_loss_clip": 0.01164139, "auxiliary_loss_mlp": 0.00761675, "balance_loss_clip": 1.04683375, "balance_loss_mlp": 1.00043726, "epoch": 0.7974508507184512, "flos": 21360600950400.0, "grad_norm": 2.6279990431078666, "language_loss": 0.71698678, "learning_rate": 4.1499410915736476e-07, "loss": 0.73624492, "num_input_tokens_seen": 142872540, "step": 6632, "time_per_iteration": 2.421588897705078 }, { "auxiliary_loss_clip": 0.01058281, "auxiliary_loss_mlp": 0.01000898, "balance_loss_clip": 1.01310456, "balance_loss_mlp": 0.99992037, "epoch": 0.7975710936090904, "flos": 68253115317120.0, "grad_norm": 0.7752791990943658, "language_loss": 0.6431545, "learning_rate": 4.145191594443762e-07, "loss": 0.6637463, "num_input_tokens_seen": 142936895, "step": 6633, "time_per_iteration": 3.1609485149383545 }, { "auxiliary_loss_clip": 0.01117424, "auxiliary_loss_mlp": 0.01026095, "balance_loss_clip": 1.04175329, "balance_loss_mlp": 1.01909709, "epoch": 0.7976913364997295, "flos": 22492433479680.0, "grad_norm": 1.6771579006444906, "language_loss": 0.70526063, "learning_rate": 4.140444502475713e-07, "loss": 0.72669578, "num_input_tokens_seen": 142956445, "step": 6634, "time_per_iteration": 2.549887180328369 }, { "auxiliary_loss_clip": 0.01146464, "auxiliary_loss_mlp": 0.01028909, "balance_loss_clip": 1.04296589, "balance_loss_mlp": 1.02203321, "epoch": 0.7978115793903685, "flos": 15263220378240.0, "grad_norm": 1.7871226904047188, "language_loss": 0.70029974, "learning_rate": 4.1356998163896216e-07, "loss": 0.72205341, "num_input_tokens_seen": 142973495, "step": 6635, "time_per_iteration": 3.19233775138855 }, { "auxiliary_loss_clip": 0.01127764, "auxiliary_loss_mlp": 0.01023754, "balance_loss_clip": 1.04546535, "balance_loss_mlp": 1.01692009, "epoch": 0.7979318222810077, "flos": 19719232041600.0, "grad_norm": 1.865828625478986, "language_loss": 0.74809849, "learning_rate": 4.130957536905255e-07, "loss": 0.76961368, "num_input_tokens_seen": 142991510, "step": 6636, "time_per_iteration": 2.5263454914093018 }, { "auxiliary_loss_clip": 0.01146819, "auxiliary_loss_mlp": 0.01028785, "balance_loss_clip": 1.04677951, "balance_loss_mlp": 1.0209589, "epoch": 0.7980520651716467, "flos": 15560273854080.0, "grad_norm": 2.3695581695747987, "language_loss": 0.71548969, "learning_rate": 4.1262176647420134e-07, "loss": 0.73724574, "num_input_tokens_seen": 143009675, "step": 6637, "time_per_iteration": 2.4899373054504395 }, { "auxiliary_loss_clip": 0.01145188, "auxiliary_loss_mlp": 0.01025695, "balance_loss_clip": 1.04762423, "balance_loss_mlp": 1.0186677, "epoch": 0.7981723080622858, "flos": 22309432663680.0, "grad_norm": 1.747200761284665, "language_loss": 0.79858661, "learning_rate": 4.121480200618923e-07, "loss": 0.82029545, "num_input_tokens_seen": 143029330, "step": 6638, "time_per_iteration": 2.630326986312866 }, { "auxiliary_loss_clip": 0.01133709, "auxiliary_loss_mlp": 0.01022972, "balance_loss_clip": 1.044855, "balance_loss_mlp": 1.01623678, "epoch": 0.798292550952925, "flos": 22929573997440.0, "grad_norm": 1.673504942036836, "language_loss": 0.80067152, "learning_rate": 4.116745145254674e-07, "loss": 0.82223833, "num_input_tokens_seen": 143048865, "step": 6639, "time_per_iteration": 2.527531623840332 }, { "auxiliary_loss_clip": 0.01039239, "auxiliary_loss_mlp": 0.0100207, "balance_loss_clip": 1.00960457, "balance_loss_mlp": 1.00106227, "epoch": 0.798412793843564, "flos": 64497936890880.0, "grad_norm": 0.7656802468105876, "language_loss": 0.58011341, "learning_rate": 4.1120124993675476e-07, "loss": 0.60052645, "num_input_tokens_seen": 143113295, "step": 6640, "time_per_iteration": 3.1508359909057617 }, { "auxiliary_loss_clip": 0.01145018, "auxiliary_loss_mlp": 0.01022817, "balance_loss_clip": 1.04478443, "balance_loss_mlp": 1.0151875, "epoch": 0.7985330367342031, "flos": 13586910514560.0, "grad_norm": 2.1090898927963853, "language_loss": 0.61771172, "learning_rate": 4.107282263675498e-07, "loss": 0.63939011, "num_input_tokens_seen": 143130965, "step": 6641, "time_per_iteration": 2.501239538192749 }, { "auxiliary_loss_clip": 0.01039931, "auxiliary_loss_mlp": 0.00752985, "balance_loss_clip": 1.01194334, "balance_loss_mlp": 0.99999315, "epoch": 0.7986532796248422, "flos": 67698797656320.0, "grad_norm": 0.7910024269912419, "language_loss": 0.52526909, "learning_rate": 4.1025544388960907e-07, "loss": 0.54319823, "num_input_tokens_seen": 143192005, "step": 6642, "time_per_iteration": 3.082073450088501 }, { "auxiliary_loss_clip": 0.01151596, "auxiliary_loss_mlp": 0.01026218, "balance_loss_clip": 1.04854918, "balance_loss_mlp": 1.01911342, "epoch": 0.7987735225154813, "flos": 22455373622400.0, "grad_norm": 1.7941724437790638, "language_loss": 0.7171185, "learning_rate": 4.097829025746538e-07, "loss": 0.73889667, "num_input_tokens_seen": 143213550, "step": 6643, "time_per_iteration": 2.5058670043945312 }, { "auxiliary_loss_clip": 0.01054208, "auxiliary_loss_mlp": 0.01003208, "balance_loss_clip": 1.01105118, "balance_loss_mlp": 1.00221813, "epoch": 0.7988937654061203, "flos": 68864098682880.0, "grad_norm": 0.657115815200203, "language_loss": 0.60978001, "learning_rate": 4.0931060249436757e-07, "loss": 0.63035417, "num_input_tokens_seen": 143277390, "step": 6644, "time_per_iteration": 3.07392954826355 }, { "auxiliary_loss_clip": 0.0114965, "auxiliary_loss_mlp": 0.01030269, "balance_loss_clip": 1.04651618, "balance_loss_mlp": 1.0229193, "epoch": 0.7990140082967595, "flos": 20806893820800.0, "grad_norm": 1.9034701140918886, "language_loss": 0.69732344, "learning_rate": 4.088385437203978e-07, "loss": 0.71912265, "num_input_tokens_seen": 143294400, "step": 6645, "time_per_iteration": 2.4290459156036377 }, { "auxiliary_loss_clip": 0.01165878, "auxiliary_loss_mlp": 0.01023936, "balance_loss_clip": 1.04676008, "balance_loss_mlp": 1.01672649, "epoch": 0.7991342511873986, "flos": 18985289443200.0, "grad_norm": 2.137935974741238, "language_loss": 0.77576292, "learning_rate": 4.083667263243564e-07, "loss": 0.79766107, "num_input_tokens_seen": 143312745, "step": 6646, "time_per_iteration": 2.4386520385742188 }, { "auxiliary_loss_clip": 0.01150429, "auxiliary_loss_mlp": 0.01025181, "balance_loss_clip": 1.04839861, "balance_loss_mlp": 1.018502, "epoch": 0.7992544940780376, "flos": 20816805974400.0, "grad_norm": 1.9925427171261334, "language_loss": 0.71968889, "learning_rate": 4.0789515037781653e-07, "loss": 0.741445, "num_input_tokens_seen": 143333470, "step": 6647, "time_per_iteration": 2.4792494773864746 }, { "auxiliary_loss_clip": 0.01155546, "auxiliary_loss_mlp": 0.01022919, "balance_loss_clip": 1.04755127, "balance_loss_mlp": 1.01606393, "epoch": 0.7993747369686768, "flos": 12640772321280.0, "grad_norm": 1.7584999545295306, "language_loss": 0.82385117, "learning_rate": 4.0742381595231755e-07, "loss": 0.84563583, "num_input_tokens_seen": 143350195, "step": 6648, "time_per_iteration": 2.477421522140503 }, { "auxiliary_loss_clip": 0.01127987, "auxiliary_loss_mlp": 0.01025056, "balance_loss_clip": 1.04492748, "balance_loss_mlp": 1.01849341, "epoch": 0.7994949798593158, "flos": 20078769225600.0, "grad_norm": 2.2203364577199, "language_loss": 0.78458756, "learning_rate": 4.06952723119359e-07, "loss": 0.80611795, "num_input_tokens_seen": 143370070, "step": 6649, "time_per_iteration": 2.5374505519866943 }, { "auxiliary_loss_clip": 0.01127601, "auxiliary_loss_mlp": 0.01026033, "balance_loss_clip": 1.04227853, "balance_loss_mlp": 1.0186007, "epoch": 0.7996152227499549, "flos": 38654209509120.0, "grad_norm": 1.8569947275147876, "language_loss": 0.67374372, "learning_rate": 4.0648187195040504e-07, "loss": 0.69528008, "num_input_tokens_seen": 143392275, "step": 6650, "time_per_iteration": 2.6422083377838135 }, { "auxiliary_loss_clip": 0.01049539, "auxiliary_loss_mlp": 0.01002018, "balance_loss_clip": 1.00941777, "balance_loss_mlp": 1.00083816, "epoch": 0.799735465640594, "flos": 70243821947520.0, "grad_norm": 0.8104505219086647, "language_loss": 0.6760093, "learning_rate": 4.060112625168848e-07, "loss": 0.69652486, "num_input_tokens_seen": 143457385, "step": 6651, "time_per_iteration": 3.1255669593811035 }, { "auxiliary_loss_clip": 0.01166879, "auxiliary_loss_mlp": 0.01023976, "balance_loss_clip": 1.04918313, "balance_loss_mlp": 1.01689172, "epoch": 0.7998557085312331, "flos": 24240995550720.0, "grad_norm": 1.8984052468269064, "language_loss": 0.73776877, "learning_rate": 4.055408948901886e-07, "loss": 0.75967729, "num_input_tokens_seen": 143478785, "step": 6652, "time_per_iteration": 2.4691245555877686 }, { "auxiliary_loss_clip": 0.01155409, "auxiliary_loss_mlp": 0.01023496, "balance_loss_clip": 1.04716694, "balance_loss_mlp": 1.01551473, "epoch": 0.7999759514218722, "flos": 27564025449600.0, "grad_norm": 2.258278549137495, "language_loss": 0.71474397, "learning_rate": 4.050707691416708e-07, "loss": 0.73653305, "num_input_tokens_seen": 143500095, "step": 6653, "time_per_iteration": 2.503244161605835 }, { "auxiliary_loss_clip": 0.01049821, "auxiliary_loss_mlp": 0.01001798, "balance_loss_clip": 1.0097363, "balance_loss_mlp": 1.0006417, "epoch": 0.8000961943125112, "flos": 67337428878720.0, "grad_norm": 0.6762803425296061, "language_loss": 0.59772789, "learning_rate": 4.046008853426495e-07, "loss": 0.61824417, "num_input_tokens_seen": 143563410, "step": 6654, "time_per_iteration": 3.1881251335144043 }, { "auxiliary_loss_clip": 0.01118773, "auxiliary_loss_mlp": 0.01025273, "balance_loss_clip": 1.04262495, "balance_loss_mlp": 1.017977, "epoch": 0.8002164372031504, "flos": 28733815676160.0, "grad_norm": 4.545113583885504, "language_loss": 0.62991476, "learning_rate": 4.0413124356440464e-07, "loss": 0.65135515, "num_input_tokens_seen": 143587455, "step": 6655, "time_per_iteration": 3.439544200897217 }, { "auxiliary_loss_clip": 0.01113118, "auxiliary_loss_mlp": 0.01025344, "balance_loss_clip": 1.04144979, "balance_loss_mlp": 1.01833725, "epoch": 0.8003366800937894, "flos": 17639429725440.0, "grad_norm": 1.8082924126450304, "language_loss": 0.82063144, "learning_rate": 4.0366184387818223e-07, "loss": 0.8420161, "num_input_tokens_seen": 143605915, "step": 6656, "time_per_iteration": 2.5539333820343018 }, { "auxiliary_loss_clip": 0.01171186, "auxiliary_loss_mlp": 0.01022304, "balance_loss_clip": 1.04884708, "balance_loss_mlp": 1.01445675, "epoch": 0.8004569229844285, "flos": 25995303797760.0, "grad_norm": 1.7958345363539914, "language_loss": 0.85095513, "learning_rate": 4.0319268635518797e-07, "loss": 0.87289011, "num_input_tokens_seen": 143626490, "step": 6657, "time_per_iteration": 2.4990313053131104 }, { "auxiliary_loss_clip": 0.01151284, "auxiliary_loss_mlp": 0.01018873, "balance_loss_clip": 1.0458653, "balance_loss_mlp": 1.01235521, "epoch": 0.8005771658750677, "flos": 20812352688000.0, "grad_norm": 1.628504546613032, "language_loss": 0.75346142, "learning_rate": 4.027237710665943e-07, "loss": 0.77516305, "num_input_tokens_seen": 143644955, "step": 6658, "time_per_iteration": 3.2879130840301514 }, { "auxiliary_loss_clip": 0.01125352, "auxiliary_loss_mlp": 0.01025662, "balance_loss_clip": 1.04112196, "balance_loss_mlp": 1.01829529, "epoch": 0.8006974087657067, "flos": 25812626204160.0, "grad_norm": 2.697019971360361, "language_loss": 0.69361913, "learning_rate": 4.022550980835344e-07, "loss": 0.71512932, "num_input_tokens_seen": 143667200, "step": 6659, "time_per_iteration": 2.60933256149292 }, { "auxiliary_loss_clip": 0.01121099, "auxiliary_loss_mlp": 0.01022905, "balance_loss_clip": 1.04010904, "balance_loss_mlp": 1.0158838, "epoch": 0.8008176516563458, "flos": 17164690646400.0, "grad_norm": 2.2842929689764855, "language_loss": 0.79404354, "learning_rate": 4.017866674771051e-07, "loss": 0.81548357, "num_input_tokens_seen": 143684685, "step": 6660, "time_per_iteration": 2.5000317096710205 }, { "auxiliary_loss_clip": 0.01101012, "auxiliary_loss_mlp": 0.01027672, "balance_loss_clip": 1.03868544, "balance_loss_mlp": 1.02040291, "epoch": 0.8009378945469849, "flos": 24207311571840.0, "grad_norm": 1.8961428877580324, "language_loss": 0.74707818, "learning_rate": 4.013184793183688e-07, "loss": 0.76836503, "num_input_tokens_seen": 143706780, "step": 6661, "time_per_iteration": 3.3352625370025635 }, { "auxiliary_loss_clip": 0.01149144, "auxiliary_loss_mlp": 0.01025466, "balance_loss_clip": 1.04364705, "balance_loss_mlp": 1.01876688, "epoch": 0.801058137437624, "flos": 19787318271360.0, "grad_norm": 1.8881118416660736, "language_loss": 0.72744876, "learning_rate": 4.008505336783472e-07, "loss": 0.74919486, "num_input_tokens_seen": 143724505, "step": 6662, "time_per_iteration": 2.44775128364563 }, { "auxiliary_loss_clip": 0.01141981, "auxiliary_loss_mlp": 0.01024167, "balance_loss_clip": 1.04392195, "balance_loss_mlp": 1.01752436, "epoch": 0.801178380328263, "flos": 18659400324480.0, "grad_norm": 1.9128885411117909, "language_loss": 0.80724239, "learning_rate": 4.003828306280284e-07, "loss": 0.82890391, "num_input_tokens_seen": 143742180, "step": 6663, "time_per_iteration": 2.4275035858154297 }, { "auxiliary_loss_clip": 0.01152745, "auxiliary_loss_mlp": 0.01022696, "balance_loss_clip": 1.0462929, "balance_loss_mlp": 1.01620793, "epoch": 0.8012986232189022, "flos": 15706573948800.0, "grad_norm": 1.8353789082887482, "language_loss": 0.7816295, "learning_rate": 3.999153702383626e-07, "loss": 0.80338395, "num_input_tokens_seen": 143760070, "step": 6664, "time_per_iteration": 2.4279041290283203 }, { "auxiliary_loss_clip": 0.01154723, "auxiliary_loss_mlp": 0.01023236, "balance_loss_clip": 1.04591537, "balance_loss_mlp": 1.0157851, "epoch": 0.8014188661095413, "flos": 28584139703040.0, "grad_norm": 1.786727106013486, "language_loss": 0.73600835, "learning_rate": 3.9944815258026263e-07, "loss": 0.757788, "num_input_tokens_seen": 143781890, "step": 6665, "time_per_iteration": 2.5283732414245605 }, { "auxiliary_loss_clip": 0.01155398, "auxiliary_loss_mlp": 0.01025454, "balance_loss_clip": 1.04737949, "balance_loss_mlp": 1.01794958, "epoch": 0.8015391090001803, "flos": 29310360877440.0, "grad_norm": 1.7974828799829023, "language_loss": 0.83458436, "learning_rate": 3.989811777246057e-07, "loss": 0.85639292, "num_input_tokens_seen": 143802060, "step": 6666, "time_per_iteration": 2.5059494972229004 }, { "auxiliary_loss_clip": 0.01064879, "auxiliary_loss_mlp": 0.01000984, "balance_loss_clip": 1.0104183, "balance_loss_mlp": 0.99997026, "epoch": 0.8016593518908195, "flos": 70397340675840.0, "grad_norm": 0.8520184815143531, "language_loss": 0.66284394, "learning_rate": 3.985144457422305e-07, "loss": 0.68350255, "num_input_tokens_seen": 143856345, "step": 6667, "time_per_iteration": 2.964301586151123 }, { "auxiliary_loss_clip": 0.01166185, "auxiliary_loss_mlp": 0.01022195, "balance_loss_clip": 1.04805779, "balance_loss_mlp": 1.01520085, "epoch": 0.8017795947814585, "flos": 26026114688640.0, "grad_norm": 2.0642245157263837, "language_loss": 0.76509959, "learning_rate": 3.9804795670394096e-07, "loss": 0.78698337, "num_input_tokens_seen": 143876470, "step": 6668, "time_per_iteration": 2.4591193199157715 }, { "auxiliary_loss_clip": 0.0112999, "auxiliary_loss_mlp": 0.01023939, "balance_loss_clip": 1.04341531, "balance_loss_mlp": 1.01714134, "epoch": 0.8018998376720976, "flos": 22087181260800.0, "grad_norm": 1.5969724554204254, "language_loss": 0.70731962, "learning_rate": 3.975817106805022e-07, "loss": 0.72885883, "num_input_tokens_seen": 143895170, "step": 6669, "time_per_iteration": 2.4780917167663574 }, { "auxiliary_loss_clip": 0.01124782, "auxiliary_loss_mlp": 0.01027451, "balance_loss_clip": 1.04396296, "balance_loss_mlp": 1.0199343, "epoch": 0.8020200805627368, "flos": 34568545023360.0, "grad_norm": 2.1789523549842444, "language_loss": 0.64707994, "learning_rate": 3.97115707742645e-07, "loss": 0.66860229, "num_input_tokens_seen": 143915845, "step": 6670, "time_per_iteration": 2.634737491607666 }, { "auxiliary_loss_clip": 0.01140208, "auxiliary_loss_mlp": 0.01021172, "balance_loss_clip": 1.04683733, "balance_loss_mlp": 1.01445496, "epoch": 0.8021403234533758, "flos": 20120354196480.0, "grad_norm": 2.0240657156320894, "language_loss": 0.65097427, "learning_rate": 3.966499479610599e-07, "loss": 0.67258811, "num_input_tokens_seen": 143933940, "step": 6671, "time_per_iteration": 2.4889557361602783 }, { "auxiliary_loss_clip": 0.01120821, "auxiliary_loss_mlp": 0.01022906, "balance_loss_clip": 1.04516745, "balance_loss_mlp": 1.01626611, "epoch": 0.8022605663440149, "flos": 27746200252800.0, "grad_norm": 1.772648516596436, "language_loss": 0.65133452, "learning_rate": 3.9618443140640225e-07, "loss": 0.67277181, "num_input_tokens_seen": 143952850, "step": 6672, "time_per_iteration": 2.549924850463867 }, { "auxiliary_loss_clip": 0.0102025, "auxiliary_loss_mlp": 0.0100202, "balance_loss_clip": 1.00861204, "balance_loss_mlp": 1.0009768, "epoch": 0.802380809234654, "flos": 60244998768000.0, "grad_norm": 0.6882311653034767, "language_loss": 0.51398444, "learning_rate": 3.957191581492918e-07, "loss": 0.53420717, "num_input_tokens_seen": 144013610, "step": 6673, "time_per_iteration": 3.1349406242370605 }, { "auxiliary_loss_clip": 0.0113414, "auxiliary_loss_mlp": 0.01022172, "balance_loss_clip": 1.04385209, "balance_loss_mlp": 1.01503181, "epoch": 0.8025010521252931, "flos": 15080722352640.0, "grad_norm": 2.6127279831027876, "language_loss": 0.70470041, "learning_rate": 3.952541282603097e-07, "loss": 0.72626352, "num_input_tokens_seen": 144028715, "step": 6674, "time_per_iteration": 2.43558669090271 }, { "auxiliary_loss_clip": 0.01150407, "auxiliary_loss_mlp": 0.01021929, "balance_loss_clip": 1.04614091, "balance_loss_mlp": 1.01501822, "epoch": 0.8026212950159322, "flos": 22163527618560.0, "grad_norm": 2.0698465936192596, "language_loss": 0.83650875, "learning_rate": 3.9478934181000013e-07, "loss": 0.85823214, "num_input_tokens_seen": 144048740, "step": 6675, "time_per_iteration": 2.449049234390259 }, { "auxiliary_loss_clip": 0.01169052, "auxiliary_loss_mlp": 0.01022378, "balance_loss_clip": 1.0476408, "balance_loss_mlp": 1.01522565, "epoch": 0.8027415379065713, "flos": 17675986792320.0, "grad_norm": 2.487618029224469, "language_loss": 0.84529996, "learning_rate": 3.943247988688714e-07, "loss": 0.8672142, "num_input_tokens_seen": 144067435, "step": 6676, "time_per_iteration": 2.3897159099578857 }, { "auxiliary_loss_clip": 0.01152194, "auxiliary_loss_mlp": 0.01022045, "balance_loss_clip": 1.0463289, "balance_loss_mlp": 1.01578331, "epoch": 0.8028617807972104, "flos": 21979593048960.0, "grad_norm": 1.6999376240464537, "language_loss": 0.72038507, "learning_rate": 3.938604995073933e-07, "loss": 0.74212742, "num_input_tokens_seen": 144085905, "step": 6677, "time_per_iteration": 2.531378984451294 }, { "auxiliary_loss_clip": 0.01140265, "auxiliary_loss_mlp": 0.01024485, "balance_loss_clip": 1.04425871, "balance_loss_mlp": 1.0176903, "epoch": 0.8029820236878494, "flos": 26428457905920.0, "grad_norm": 1.8971618848262934, "language_loss": 0.65273595, "learning_rate": 3.9339644379600157e-07, "loss": 0.67438352, "num_input_tokens_seen": 144105735, "step": 6678, "time_per_iteration": 2.527109384536743 }, { "auxiliary_loss_clip": 0.01156656, "auxiliary_loss_mlp": 0.01025693, "balance_loss_clip": 1.04819679, "balance_loss_mlp": 1.01890421, "epoch": 0.8031022665784886, "flos": 17676489582720.0, "grad_norm": 1.7219975960449956, "language_loss": 0.71130437, "learning_rate": 3.929326318050907e-07, "loss": 0.73312783, "num_input_tokens_seen": 144123405, "step": 6679, "time_per_iteration": 2.4120123386383057 }, { "auxiliary_loss_clip": 0.0116192, "auxiliary_loss_mlp": 0.01024445, "balance_loss_clip": 1.04530418, "balance_loss_mlp": 1.01745081, "epoch": 0.8032225094691277, "flos": 15450279431040.0, "grad_norm": 2.2371876749609902, "language_loss": 0.78963286, "learning_rate": 3.924690636050225e-07, "loss": 0.8114965, "num_input_tokens_seen": 144140815, "step": 6680, "time_per_iteration": 2.3645241260528564 }, { "auxiliary_loss_clip": 0.01154145, "auxiliary_loss_mlp": 0.01026626, "balance_loss_clip": 1.04784143, "balance_loss_mlp": 1.01846623, "epoch": 0.8033427523597667, "flos": 26179202453760.0, "grad_norm": 2.23077868817721, "language_loss": 0.73061395, "learning_rate": 3.9200573926611915e-07, "loss": 0.75242168, "num_input_tokens_seen": 144162230, "step": 6681, "time_per_iteration": 2.476757526397705 }, { "auxiliary_loss_clip": 0.01152857, "auxiliary_loss_mlp": 0.01025499, "balance_loss_clip": 1.04971278, "balance_loss_mlp": 1.01810765, "epoch": 0.8034629952504058, "flos": 21324905809920.0, "grad_norm": 1.9271129731830863, "language_loss": 0.72951698, "learning_rate": 3.9154265885866613e-07, "loss": 0.75130057, "num_input_tokens_seen": 144181540, "step": 6682, "time_per_iteration": 3.3160345554351807 }, { "auxiliary_loss_clip": 0.01149921, "auxiliary_loss_mlp": 0.01025662, "balance_loss_clip": 1.04675198, "balance_loss_mlp": 1.0178771, "epoch": 0.8035832381410449, "flos": 21651585027840.0, "grad_norm": 2.7474751679527434, "language_loss": 0.75135398, "learning_rate": 3.9107982245291394e-07, "loss": 0.77310973, "num_input_tokens_seen": 144199665, "step": 6683, "time_per_iteration": 3.273348331451416 }, { "auxiliary_loss_clip": 0.01124769, "auxiliary_loss_mlp": 0.01025883, "balance_loss_clip": 1.04568338, "balance_loss_mlp": 1.01844466, "epoch": 0.803703481031684, "flos": 20518818744960.0, "grad_norm": 2.376624634240618, "language_loss": 0.77103812, "learning_rate": 3.9061723011907245e-07, "loss": 0.79254466, "num_input_tokens_seen": 144219020, "step": 6684, "time_per_iteration": 2.5193569660186768 }, { "auxiliary_loss_clip": 0.011371, "auxiliary_loss_mlp": 0.01023423, "balance_loss_clip": 1.04436541, "balance_loss_mlp": 1.01645815, "epoch": 0.803823723922323, "flos": 22854807838080.0, "grad_norm": 1.78926650257332, "language_loss": 0.79217517, "learning_rate": 3.901548819273179e-07, "loss": 0.81378043, "num_input_tokens_seen": 144239035, "step": 6685, "time_per_iteration": 3.3258824348449707 }, { "auxiliary_loss_clip": 0.01153621, "auxiliary_loss_mlp": 0.01025708, "balance_loss_clip": 1.04795897, "balance_loss_mlp": 1.01842475, "epoch": 0.8039439668129622, "flos": 21362145235200.0, "grad_norm": 2.420503086853807, "language_loss": 0.69310957, "learning_rate": 3.896927779477881e-07, "loss": 0.71490288, "num_input_tokens_seen": 144258295, "step": 6686, "time_per_iteration": 2.447969436645508 }, { "auxiliary_loss_clip": 0.01123779, "auxiliary_loss_mlp": 0.01022723, "balance_loss_clip": 1.04276121, "balance_loss_mlp": 1.01528442, "epoch": 0.8040642097036013, "flos": 23802382575360.0, "grad_norm": 2.0869171929170998, "language_loss": 0.66904062, "learning_rate": 3.892309182505833e-07, "loss": 0.69050562, "num_input_tokens_seen": 144276110, "step": 6687, "time_per_iteration": 2.5395350456237793 }, { "auxiliary_loss_clip": 0.01163261, "auxiliary_loss_mlp": 0.01024587, "balance_loss_clip": 1.04517627, "balance_loss_mlp": 1.01760769, "epoch": 0.8041844525942403, "flos": 25922046009600.0, "grad_norm": 2.2044442396972013, "language_loss": 0.86270356, "learning_rate": 3.887693029057675e-07, "loss": 0.8845821, "num_input_tokens_seen": 144295620, "step": 6688, "time_per_iteration": 3.204310178756714 }, { "auxiliary_loss_clip": 0.01136067, "auxiliary_loss_mlp": 0.01023722, "balance_loss_clip": 1.04409742, "balance_loss_mlp": 1.01681995, "epoch": 0.8043046954848795, "flos": 25191120153600.0, "grad_norm": 1.621480719087694, "language_loss": 0.81293631, "learning_rate": 3.8830793198336684e-07, "loss": 0.83453417, "num_input_tokens_seen": 144315210, "step": 6689, "time_per_iteration": 2.5239224433898926 }, { "auxiliary_loss_clip": 0.0115652, "auxiliary_loss_mlp": 0.0102652, "balance_loss_clip": 1.04580259, "balance_loss_mlp": 1.01953149, "epoch": 0.8044249383755185, "flos": 41719185123840.0, "grad_norm": 1.6535277724128856, "language_loss": 0.70343477, "learning_rate": 3.878468055533721e-07, "loss": 0.7252652, "num_input_tokens_seen": 144337750, "step": 6690, "time_per_iteration": 2.612668514251709 }, { "auxiliary_loss_clip": 0.01129922, "auxiliary_loss_mlp": 0.01026217, "balance_loss_clip": 1.04634726, "balance_loss_mlp": 1.01890326, "epoch": 0.8045451812661576, "flos": 20631434860800.0, "grad_norm": 2.4727561753325453, "language_loss": 0.84178805, "learning_rate": 3.8738592368573464e-07, "loss": 0.86334938, "num_input_tokens_seen": 144355305, "step": 6691, "time_per_iteration": 2.504849672317505 }, { "auxiliary_loss_clip": 0.01114367, "auxiliary_loss_mlp": 0.01023368, "balance_loss_clip": 1.04309714, "balance_loss_mlp": 1.01620388, "epoch": 0.8046654241567968, "flos": 29711806254720.0, "grad_norm": 2.2698168009613346, "language_loss": 0.87997055, "learning_rate": 3.8692528645037137e-07, "loss": 0.90134788, "num_input_tokens_seen": 144374485, "step": 6692, "time_per_iteration": 2.547819137573242 }, { "auxiliary_loss_clip": 0.01165869, "auxiliary_loss_mlp": 0.01024266, "balance_loss_clip": 1.04839516, "balance_loss_mlp": 1.01750088, "epoch": 0.8047856670474358, "flos": 17671389851520.0, "grad_norm": 2.1837270677836833, "language_loss": 0.77791119, "learning_rate": 3.8646489391715907e-07, "loss": 0.79981256, "num_input_tokens_seen": 144388780, "step": 6693, "time_per_iteration": 2.38718581199646 }, { "auxiliary_loss_clip": 0.01136008, "auxiliary_loss_mlp": 0.0102805, "balance_loss_clip": 1.04445839, "balance_loss_mlp": 1.02090669, "epoch": 0.8049059099380749, "flos": 17120699464320.0, "grad_norm": 2.3759723085219804, "language_loss": 0.8833952, "learning_rate": 3.8600474615593903e-07, "loss": 0.90503579, "num_input_tokens_seen": 144403395, "step": 6694, "time_per_iteration": 2.420403242111206 }, { "auxiliary_loss_clip": 0.01034693, "auxiliary_loss_mlp": 0.01001139, "balance_loss_clip": 1.01007438, "balance_loss_mlp": 1.00019681, "epoch": 0.805026152828714, "flos": 62212903240320.0, "grad_norm": 0.7862004234973567, "language_loss": 0.59673691, "learning_rate": 3.8554484323651605e-07, "loss": 0.61709523, "num_input_tokens_seen": 144465265, "step": 6695, "time_per_iteration": 3.1427159309387207 }, { "auxiliary_loss_clip": 0.01151022, "auxiliary_loss_mlp": 0.00761606, "balance_loss_clip": 1.04756498, "balance_loss_mlp": 1.00050426, "epoch": 0.8051463957193531, "flos": 21688608971520.0, "grad_norm": 1.6207591652070437, "language_loss": 0.79389042, "learning_rate": 3.85085185228657e-07, "loss": 0.81301665, "num_input_tokens_seen": 144484235, "step": 6696, "time_per_iteration": 2.4662370681762695 }, { "auxiliary_loss_clip": 0.01133228, "auxiliary_loss_mlp": 0.01026799, "balance_loss_clip": 1.04390085, "balance_loss_mlp": 1.0196048, "epoch": 0.8052666386099921, "flos": 32051458535040.0, "grad_norm": 2.027045296516793, "language_loss": 0.73432875, "learning_rate": 3.8462577220209114e-07, "loss": 0.75592899, "num_input_tokens_seen": 144504610, "step": 6697, "time_per_iteration": 2.560473918914795 }, { "auxiliary_loss_clip": 0.01064645, "auxiliary_loss_mlp": 0.01001232, "balance_loss_clip": 1.01025391, "balance_loss_mlp": 1.00022495, "epoch": 0.8053868815006313, "flos": 67157875768320.0, "grad_norm": 0.7226530099967193, "language_loss": 0.59050536, "learning_rate": 3.8416660422651127e-07, "loss": 0.61116409, "num_input_tokens_seen": 144574260, "step": 6698, "time_per_iteration": 3.09350848197937 }, { "auxiliary_loss_clip": 0.01126881, "auxiliary_loss_mlp": 0.01027739, "balance_loss_clip": 1.04254925, "balance_loss_mlp": 1.02010632, "epoch": 0.8055071243912704, "flos": 23837000307840.0, "grad_norm": 2.1423675499563575, "language_loss": 0.67922997, "learning_rate": 3.837076813715723e-07, "loss": 0.70077622, "num_input_tokens_seen": 144594145, "step": 6699, "time_per_iteration": 2.529806613922119 }, { "auxiliary_loss_clip": 0.01121408, "auxiliary_loss_mlp": 0.01022613, "balance_loss_clip": 1.04094076, "balance_loss_mlp": 1.01503682, "epoch": 0.8056273672819094, "flos": 21324510760320.0, "grad_norm": 1.6593974194596193, "language_loss": 0.75310874, "learning_rate": 3.832490037068941e-07, "loss": 0.77454895, "num_input_tokens_seen": 144612935, "step": 6700, "time_per_iteration": 2.5115110874176025 }, { "auxiliary_loss_clip": 0.01094882, "auxiliary_loss_mlp": 0.01020787, "balance_loss_clip": 1.0410533, "balance_loss_mlp": 1.0135628, "epoch": 0.8057476101725486, "flos": 25768383626880.0, "grad_norm": 1.8502713842368181, "language_loss": 0.75766021, "learning_rate": 3.827905713020554e-07, "loss": 0.77881694, "num_input_tokens_seen": 144630580, "step": 6701, "time_per_iteration": 2.5936341285705566 }, { "auxiliary_loss_clip": 0.01126223, "auxiliary_loss_mlp": 0.01027587, "balance_loss_clip": 1.04121804, "balance_loss_mlp": 1.01959968, "epoch": 0.8058678530631876, "flos": 24535283679360.0, "grad_norm": 2.9949510074060215, "language_loss": 0.68960822, "learning_rate": 3.823323842266017e-07, "loss": 0.71114624, "num_input_tokens_seen": 144649975, "step": 6702, "time_per_iteration": 2.5584065914154053 }, { "auxiliary_loss_clip": 0.01151998, "auxiliary_loss_mlp": 0.01026027, "balance_loss_clip": 1.04330182, "balance_loss_mlp": 1.01885414, "epoch": 0.8059880959538267, "flos": 24753728240640.0, "grad_norm": 2.5664519967402004, "language_loss": 0.7320441, "learning_rate": 3.818744425500393e-07, "loss": 0.75382435, "num_input_tokens_seen": 144667990, "step": 6703, "time_per_iteration": 2.468947172164917 }, { "auxiliary_loss_clip": 0.01117729, "auxiliary_loss_mlp": 0.0102817, "balance_loss_clip": 1.04089403, "balance_loss_mlp": 1.02005482, "epoch": 0.8061083388444659, "flos": 22196349671040.0, "grad_norm": 1.8267384998821306, "language_loss": 0.80989528, "learning_rate": 3.8141674634183675e-07, "loss": 0.83135426, "num_input_tokens_seen": 144687020, "step": 6704, "time_per_iteration": 2.511730670928955 }, { "auxiliary_loss_clip": 0.01107883, "auxiliary_loss_mlp": 0.01024559, "balance_loss_clip": 1.04325318, "balance_loss_mlp": 1.01813102, "epoch": 0.8062285817351049, "flos": 30044195735040.0, "grad_norm": 1.7698414397158995, "language_loss": 0.66323364, "learning_rate": 3.809592956714278e-07, "loss": 0.68455803, "num_input_tokens_seen": 144710255, "step": 6705, "time_per_iteration": 2.619598627090454 }, { "auxiliary_loss_clip": 0.01156786, "auxiliary_loss_mlp": 0.01027432, "balance_loss_clip": 1.04809058, "balance_loss_mlp": 1.02072668, "epoch": 0.806348824625744, "flos": 22782591544320.0, "grad_norm": 1.907890647993773, "language_loss": 0.74567449, "learning_rate": 3.805020906082057e-07, "loss": 0.76751667, "num_input_tokens_seen": 144728830, "step": 6706, "time_per_iteration": 2.48353910446167 }, { "auxiliary_loss_clip": 0.01141351, "auxiliary_loss_mlp": 0.01023647, "balance_loss_clip": 1.04517627, "balance_loss_mlp": 1.01607728, "epoch": 0.8064690675163831, "flos": 23404600385280.0, "grad_norm": 2.3937022476838288, "language_loss": 0.80873859, "learning_rate": 3.8004513122152917e-07, "loss": 0.83038855, "num_input_tokens_seen": 144747140, "step": 6707, "time_per_iteration": 2.474058151245117 }, { "auxiliary_loss_clip": 0.0113162, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.04780769, "balance_loss_mlp": 1.02523577, "epoch": 0.8065893104070222, "flos": 24060903736320.0, "grad_norm": 1.685016397270131, "language_loss": 0.66997236, "learning_rate": 3.79588417580718e-07, "loss": 0.69160903, "num_input_tokens_seen": 144765250, "step": 6708, "time_per_iteration": 3.3262388706207275 }, { "auxiliary_loss_clip": 0.01153963, "auxiliary_loss_mlp": 0.01023628, "balance_loss_clip": 1.04774809, "balance_loss_mlp": 1.01699686, "epoch": 0.8067095532976613, "flos": 22305410340480.0, "grad_norm": 2.1075360374718954, "language_loss": 0.76660264, "learning_rate": 3.791319497550558e-07, "loss": 0.78837848, "num_input_tokens_seen": 144783080, "step": 6709, "time_per_iteration": 2.4473893642425537 }, { "auxiliary_loss_clip": 0.01130252, "auxiliary_loss_mlp": 0.00761406, "balance_loss_clip": 1.04489422, "balance_loss_mlp": 1.00036776, "epoch": 0.8068297961883004, "flos": 17129498296320.0, "grad_norm": 1.9592145636576255, "language_loss": 0.70970368, "learning_rate": 3.78675727813788e-07, "loss": 0.72862029, "num_input_tokens_seen": 144800645, "step": 6710, "time_per_iteration": 3.3032350540161133 }, { "auxiliary_loss_clip": 0.0113773, "auxiliary_loss_mlp": 0.01020867, "balance_loss_clip": 1.04538417, "balance_loss_mlp": 1.01394057, "epoch": 0.8069500390789395, "flos": 22018843635840.0, "grad_norm": 1.8471727289918745, "language_loss": 0.73801577, "learning_rate": 3.782197518261225e-07, "loss": 0.75960177, "num_input_tokens_seen": 144820085, "step": 6711, "time_per_iteration": 3.321425199508667 }, { "auxiliary_loss_clip": 0.01144001, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.04665875, "balance_loss_mlp": 1.0232482, "epoch": 0.8070702819695785, "flos": 19244241567360.0, "grad_norm": 1.9601434379668574, "language_loss": 0.95249069, "learning_rate": 3.777640218612319e-07, "loss": 0.97423285, "num_input_tokens_seen": 144838070, "step": 6712, "time_per_iteration": 2.44708251953125 }, { "auxiliary_loss_clip": 0.01144317, "auxiliary_loss_mlp": 0.01025124, "balance_loss_clip": 1.0443778, "balance_loss_mlp": 1.01818633, "epoch": 0.8071905248602176, "flos": 21544320038400.0, "grad_norm": 2.1807629174518945, "language_loss": 0.71770018, "learning_rate": 3.773085379882488e-07, "loss": 0.73939461, "num_input_tokens_seen": 144857125, "step": 6713, "time_per_iteration": 2.441237211227417 }, { "auxiliary_loss_clip": 0.01150555, "auxiliary_loss_mlp": 0.00761898, "balance_loss_clip": 1.04338217, "balance_loss_mlp": 1.00043595, "epoch": 0.8073107677508568, "flos": 37268309105280.0, "grad_norm": 5.107299336549649, "language_loss": 0.76135564, "learning_rate": 3.768533002762715e-07, "loss": 0.78048015, "num_input_tokens_seen": 144880660, "step": 6714, "time_per_iteration": 2.5883443355560303 }, { "auxiliary_loss_clip": 0.01136512, "auxiliary_loss_mlp": 0.01020915, "balance_loss_clip": 1.04164505, "balance_loss_mlp": 1.01426852, "epoch": 0.8074310106414958, "flos": 28366269759360.0, "grad_norm": 1.8073421214175733, "language_loss": 0.76833713, "learning_rate": 3.763983087943572e-07, "loss": 0.78991139, "num_input_tokens_seen": 144900050, "step": 6715, "time_per_iteration": 3.2253429889678955 }, { "auxiliary_loss_clip": 0.01142234, "auxiliary_loss_mlp": 0.00761875, "balance_loss_clip": 1.04312479, "balance_loss_mlp": 1.00046432, "epoch": 0.8075512535321349, "flos": 24281646768000.0, "grad_norm": 5.668014207877506, "language_loss": 0.80547422, "learning_rate": 3.759435636115282e-07, "loss": 0.82451534, "num_input_tokens_seen": 144920835, "step": 6716, "time_per_iteration": 2.5130603313446045 }, { "auxiliary_loss_clip": 0.01095494, "auxiliary_loss_mlp": 0.00761484, "balance_loss_clip": 1.04471576, "balance_loss_mlp": 1.0004586, "epoch": 0.807671496422774, "flos": 26030855283840.0, "grad_norm": 1.690034293635486, "language_loss": 0.72882104, "learning_rate": 3.7548906479676967e-07, "loss": 0.74739081, "num_input_tokens_seen": 144940430, "step": 6717, "time_per_iteration": 2.62199068069458 }, { "auxiliary_loss_clip": 0.01155652, "auxiliary_loss_mlp": 0.01022447, "balance_loss_clip": 1.04553843, "balance_loss_mlp": 1.01581025, "epoch": 0.8077917393134131, "flos": 23730740899200.0, "grad_norm": 1.7878890026842318, "language_loss": 0.71957392, "learning_rate": 3.7503481241902855e-07, "loss": 0.74135494, "num_input_tokens_seen": 144960405, "step": 6718, "time_per_iteration": 2.470048189163208 }, { "auxiliary_loss_clip": 0.01139283, "auxiliary_loss_mlp": 0.00761381, "balance_loss_clip": 1.04501224, "balance_loss_mlp": 1.0004518, "epoch": 0.8079119822040521, "flos": 18402028398720.0, "grad_norm": 1.7789515554819748, "language_loss": 0.80462229, "learning_rate": 3.745808065472145e-07, "loss": 0.82362896, "num_input_tokens_seen": 144977700, "step": 6719, "time_per_iteration": 2.4562885761260986 }, { "auxiliary_loss_clip": 0.01151322, "auxiliary_loss_mlp": 0.01028168, "balance_loss_clip": 1.05156875, "balance_loss_mlp": 1.02140331, "epoch": 0.8080322250946913, "flos": 23621787970560.0, "grad_norm": 1.5855762524968087, "language_loss": 0.76182711, "learning_rate": 3.741270472501994e-07, "loss": 0.78362209, "num_input_tokens_seen": 144998340, "step": 6720, "time_per_iteration": 2.4687557220458984 }, { "auxiliary_loss_clip": 0.01135863, "auxiliary_loss_mlp": 0.01021836, "balance_loss_clip": 1.04576635, "balance_loss_mlp": 1.01557755, "epoch": 0.8081524679853304, "flos": 22820692896000.0, "grad_norm": 1.6951240409153494, "language_loss": 0.72786272, "learning_rate": 3.736735345968183e-07, "loss": 0.74943966, "num_input_tokens_seen": 145017950, "step": 6721, "time_per_iteration": 2.491544723510742 }, { "auxiliary_loss_clip": 0.0115311, "auxiliary_loss_mlp": 0.0102307, "balance_loss_clip": 1.04740298, "balance_loss_mlp": 1.01637042, "epoch": 0.8082727108759694, "flos": 17640004343040.0, "grad_norm": 2.2561155494325416, "language_loss": 0.78834271, "learning_rate": 3.7322026865586986e-07, "loss": 0.81010449, "num_input_tokens_seen": 145036985, "step": 6722, "time_per_iteration": 2.431783437728882 }, { "auxiliary_loss_clip": 0.01158348, "auxiliary_loss_mlp": 0.01023599, "balance_loss_clip": 1.0487026, "balance_loss_mlp": 1.01633954, "epoch": 0.8083929537666086, "flos": 25958172113280.0, "grad_norm": 2.373226396603151, "language_loss": 0.73515564, "learning_rate": 3.7276724949611206e-07, "loss": 0.75697517, "num_input_tokens_seen": 145057095, "step": 6723, "time_per_iteration": 2.480518102645874 }, { "auxiliary_loss_clip": 0.01141912, "auxiliary_loss_mlp": 0.0102297, "balance_loss_clip": 1.04560447, "balance_loss_mlp": 1.0155189, "epoch": 0.8085131966572476, "flos": 27089178629760.0, "grad_norm": 1.8693078251011823, "language_loss": 0.75131035, "learning_rate": 3.723144771862694e-07, "loss": 0.77295911, "num_input_tokens_seen": 145077735, "step": 6724, "time_per_iteration": 2.5143120288848877 }, { "auxiliary_loss_clip": 0.01126972, "auxiliary_loss_mlp": 0.0102167, "balance_loss_clip": 1.04233742, "balance_loss_mlp": 1.01441061, "epoch": 0.8086334395478867, "flos": 23988543788160.0, "grad_norm": 1.5281396431524967, "language_loss": 0.76768053, "learning_rate": 3.718619517950263e-07, "loss": 0.78916693, "num_input_tokens_seen": 145098330, "step": 6725, "time_per_iteration": 2.533939838409424 }, { "auxiliary_loss_clip": 0.01166834, "auxiliary_loss_mlp": 0.0102911, "balance_loss_clip": 1.04898512, "balance_loss_mlp": 1.02228868, "epoch": 0.8087536824385259, "flos": 20405879406720.0, "grad_norm": 1.9280259871147754, "language_loss": 0.76751697, "learning_rate": 3.714096733910301e-07, "loss": 0.78947645, "num_input_tokens_seen": 145115855, "step": 6726, "time_per_iteration": 2.4907848834991455 }, { "auxiliary_loss_clip": 0.01160154, "auxiliary_loss_mlp": 0.01024471, "balance_loss_clip": 1.047575, "balance_loss_mlp": 1.01678181, "epoch": 0.8088739253291649, "flos": 25919639798400.0, "grad_norm": 1.8955932804268432, "language_loss": 0.705755, "learning_rate": 3.709576420428926e-07, "loss": 0.72760129, "num_input_tokens_seen": 145136655, "step": 6727, "time_per_iteration": 2.5362765789031982 }, { "auxiliary_loss_clip": 0.01137981, "auxiliary_loss_mlp": 0.01022948, "balance_loss_clip": 1.04221606, "balance_loss_mlp": 1.01615, "epoch": 0.808994168219804, "flos": 28402072640640.0, "grad_norm": 2.3979954908928223, "language_loss": 0.73727179, "learning_rate": 3.7050585781918463e-07, "loss": 0.75888109, "num_input_tokens_seen": 145156955, "step": 6728, "time_per_iteration": 2.522301435470581 }, { "auxiliary_loss_clip": 0.01155645, "auxiliary_loss_mlp": 0.01025312, "balance_loss_clip": 1.04588485, "balance_loss_mlp": 1.01780176, "epoch": 0.8091144111104431, "flos": 17421056991360.0, "grad_norm": 1.959254908405037, "language_loss": 0.68922156, "learning_rate": 3.700543207884428e-07, "loss": 0.71103108, "num_input_tokens_seen": 145173865, "step": 6729, "time_per_iteration": 2.4134814739227295 }, { "auxiliary_loss_clip": 0.0115155, "auxiliary_loss_mlp": 0.01023111, "balance_loss_clip": 1.04756951, "balance_loss_mlp": 1.01617646, "epoch": 0.8092346540010822, "flos": 32153803361280.0, "grad_norm": 2.802261506359597, "language_loss": 0.71408498, "learning_rate": 3.6960303101916466e-07, "loss": 0.73583168, "num_input_tokens_seen": 145193780, "step": 6730, "time_per_iteration": 2.5015552043914795 }, { "auxiliary_loss_clip": 0.01064216, "auxiliary_loss_mlp": 0.00753147, "balance_loss_clip": 1.00998366, "balance_loss_mlp": 1.00010478, "epoch": 0.8093548968917212, "flos": 58035093390720.0, "grad_norm": 0.7580382329229355, "language_loss": 0.5553233, "learning_rate": 3.6915198857981047e-07, "loss": 0.57349694, "num_input_tokens_seen": 145258980, "step": 6731, "time_per_iteration": 3.040640115737915 }, { "auxiliary_loss_clip": 0.01123919, "auxiliary_loss_mlp": 0.01024858, "balance_loss_clip": 1.04523206, "balance_loss_mlp": 1.0169307, "epoch": 0.8094751397823604, "flos": 27381599251200.0, "grad_norm": 1.7873658474159342, "language_loss": 0.68177879, "learning_rate": 3.687011935388027e-07, "loss": 0.70326662, "num_input_tokens_seen": 145281875, "step": 6732, "time_per_iteration": 2.5652506351470947 }, { "auxiliary_loss_clip": 0.01151315, "auxiliary_loss_mlp": 0.01019669, "balance_loss_clip": 1.0462985, "balance_loss_mlp": 1.01307106, "epoch": 0.8095953826729995, "flos": 24061083304320.0, "grad_norm": 1.9111601097421276, "language_loss": 0.72748965, "learning_rate": 3.6825064596452646e-07, "loss": 0.74919951, "num_input_tokens_seen": 145302220, "step": 6733, "time_per_iteration": 2.4660708904266357 }, { "auxiliary_loss_clip": 0.01151579, "auxiliary_loss_mlp": 0.01022666, "balance_loss_clip": 1.04503703, "balance_loss_mlp": 1.0158236, "epoch": 0.8097156255636385, "flos": 23951412103680.0, "grad_norm": 1.6899498514628872, "language_loss": 0.7094565, "learning_rate": 3.678003459253305e-07, "loss": 0.73119903, "num_input_tokens_seen": 145323070, "step": 6734, "time_per_iteration": 3.3349273204803467 }, { "auxiliary_loss_clip": 0.01124053, "auxiliary_loss_mlp": 0.01021403, "balance_loss_clip": 1.04286206, "balance_loss_mlp": 1.01383352, "epoch": 0.8098358684542777, "flos": 21799142098560.0, "grad_norm": 2.3888308610252293, "language_loss": 0.7422775, "learning_rate": 3.673502934895236e-07, "loss": 0.76373208, "num_input_tokens_seen": 145342575, "step": 6735, "time_per_iteration": 2.5073490142822266 }, { "auxiliary_loss_clip": 0.01063695, "auxiliary_loss_mlp": 0.0100167, "balance_loss_clip": 1.00955558, "balance_loss_mlp": 1.00069845, "epoch": 0.8099561113449167, "flos": 68809515966720.0, "grad_norm": 0.6919123196324478, "language_loss": 0.57949078, "learning_rate": 3.669004887253802e-07, "loss": 0.60014439, "num_input_tokens_seen": 145408865, "step": 6736, "time_per_iteration": 3.1324520111083984 }, { "auxiliary_loss_clip": 0.01140924, "auxiliary_loss_mlp": 0.0102314, "balance_loss_clip": 1.04639959, "balance_loss_mlp": 1.01644981, "epoch": 0.8100763542355558, "flos": 23586056916480.0, "grad_norm": 1.5942867345826883, "language_loss": 0.78958714, "learning_rate": 3.664509317011335e-07, "loss": 0.8112278, "num_input_tokens_seen": 145429200, "step": 6737, "time_per_iteration": 3.324171781539917 }, { "auxiliary_loss_clip": 0.01153624, "auxiliary_loss_mlp": 0.01026404, "balance_loss_clip": 1.04933083, "balance_loss_mlp": 1.01877201, "epoch": 0.810196597126195, "flos": 31650408207360.0, "grad_norm": 2.221953221788739, "language_loss": 0.73493624, "learning_rate": 3.6600162248498134e-07, "loss": 0.75673652, "num_input_tokens_seen": 145452830, "step": 6738, "time_per_iteration": 3.3799679279327393 }, { "auxiliary_loss_clip": 0.01081264, "auxiliary_loss_mlp": 0.01022846, "balance_loss_clip": 1.03793836, "balance_loss_mlp": 1.01630449, "epoch": 0.810316840016834, "flos": 24900459298560.0, "grad_norm": 1.7505623178314402, "language_loss": 0.75748158, "learning_rate": 3.6555256114508426e-07, "loss": 0.77852267, "num_input_tokens_seen": 145472625, "step": 6739, "time_per_iteration": 2.589928388595581 }, { "auxiliary_loss_clip": 0.01136107, "auxiliary_loss_mlp": 0.01024665, "balance_loss_clip": 1.04095316, "balance_loss_mlp": 1.0174644, "epoch": 0.8104370829074731, "flos": 27965003950080.0, "grad_norm": 2.152393013104217, "language_loss": 0.73196304, "learning_rate": 3.651037477495642e-07, "loss": 0.75357068, "num_input_tokens_seen": 145494075, "step": 6740, "time_per_iteration": 2.5469439029693604 }, { "auxiliary_loss_clip": 0.01165486, "auxiliary_loss_mlp": 0.01023075, "balance_loss_clip": 1.04593062, "balance_loss_mlp": 1.01602674, "epoch": 0.8105573257981122, "flos": 24640752988800.0, "grad_norm": 2.214866630627599, "language_loss": 0.68145937, "learning_rate": 3.6465518236650584e-07, "loss": 0.703345, "num_input_tokens_seen": 145514220, "step": 6741, "time_per_iteration": 3.183682441711426 }, { "auxiliary_loss_clip": 0.01123966, "auxiliary_loss_mlp": 0.01027906, "balance_loss_clip": 1.04270828, "balance_loss_mlp": 1.0214386, "epoch": 0.8106775686887513, "flos": 26358935132160.0, "grad_norm": 1.6831597036051975, "language_loss": 0.78348041, "learning_rate": 3.642068650639558e-07, "loss": 0.80499911, "num_input_tokens_seen": 145533965, "step": 6742, "time_per_iteration": 2.5536162853240967 }, { "auxiliary_loss_clip": 0.01129906, "auxiliary_loss_mlp": 0.01027939, "balance_loss_clip": 1.03913927, "balance_loss_mlp": 1.02102757, "epoch": 0.8107978115793903, "flos": 27271892136960.0, "grad_norm": 5.969952903569133, "language_loss": 0.64504516, "learning_rate": 3.6375879590992334e-07, "loss": 0.66662353, "num_input_tokens_seen": 145554310, "step": 6743, "time_per_iteration": 2.519423246383667 }, { "auxiliary_loss_clip": 0.01133314, "auxiliary_loss_mlp": 0.0102539, "balance_loss_clip": 1.04300809, "balance_loss_mlp": 1.01820803, "epoch": 0.8109180544700295, "flos": 24934322845440.0, "grad_norm": 1.76173379754774, "language_loss": 0.81191272, "learning_rate": 3.6331097497238173e-07, "loss": 0.83349979, "num_input_tokens_seen": 145573755, "step": 6744, "time_per_iteration": 2.5046942234039307 }, { "auxiliary_loss_clip": 0.0112124, "auxiliary_loss_mlp": 0.01019148, "balance_loss_clip": 1.04242122, "balance_loss_mlp": 1.0123229, "epoch": 0.8110382973606686, "flos": 21105383840640.0, "grad_norm": 2.112121223684056, "language_loss": 0.8019529, "learning_rate": 3.628634023192627e-07, "loss": 0.82335675, "num_input_tokens_seen": 145594000, "step": 6745, "time_per_iteration": 2.5282111167907715 }, { "auxiliary_loss_clip": 0.011539, "auxiliary_loss_mlp": 0.01026568, "balance_loss_clip": 1.04620337, "balance_loss_mlp": 1.01866996, "epoch": 0.8111585402513076, "flos": 15414081500160.0, "grad_norm": 2.0551271466630947, "language_loss": 0.75035942, "learning_rate": 3.624160780184644e-07, "loss": 0.77216411, "num_input_tokens_seen": 145611215, "step": 6746, "time_per_iteration": 2.4097530841827393 }, { "auxiliary_loss_clip": 0.0113142, "auxiliary_loss_mlp": 0.01025055, "balance_loss_clip": 1.04324627, "balance_loss_mlp": 1.01795936, "epoch": 0.8112787831419467, "flos": 24095736950400.0, "grad_norm": 2.179920322205396, "language_loss": 0.74357545, "learning_rate": 3.6196900213784496e-07, "loss": 0.76514018, "num_input_tokens_seen": 145630530, "step": 6747, "time_per_iteration": 2.516167640686035 }, { "auxiliary_loss_clip": 0.01150903, "auxiliary_loss_mlp": 0.01025265, "balance_loss_clip": 1.04547477, "balance_loss_mlp": 1.01844358, "epoch": 0.8113990260325858, "flos": 20483374999680.0, "grad_norm": 1.7869277150268115, "language_loss": 0.8668676, "learning_rate": 3.6152217474522527e-07, "loss": 0.88862932, "num_input_tokens_seen": 145647345, "step": 6748, "time_per_iteration": 2.4633281230926514 }, { "auxiliary_loss_clip": 0.01150552, "auxiliary_loss_mlp": 0.01027627, "balance_loss_clip": 1.04738283, "balance_loss_mlp": 1.02104044, "epoch": 0.8115192689232249, "flos": 24901141656960.0, "grad_norm": 1.782953547231123, "language_loss": 0.72669113, "learning_rate": 3.6107559590838975e-07, "loss": 0.74847293, "num_input_tokens_seen": 145666330, "step": 6749, "time_per_iteration": 2.5125668048858643 }, { "auxiliary_loss_clip": 0.01091151, "auxiliary_loss_mlp": 0.01024067, "balance_loss_clip": 1.04053617, "balance_loss_mlp": 1.01686072, "epoch": 0.811639511813864, "flos": 24057204635520.0, "grad_norm": 2.306753225631151, "language_loss": 0.66318452, "learning_rate": 3.606292656950822e-07, "loss": 0.68433666, "num_input_tokens_seen": 145684740, "step": 6750, "time_per_iteration": 2.596280574798584 }, { "auxiliary_loss_clip": 0.01133294, "auxiliary_loss_mlp": 0.01021933, "balance_loss_clip": 1.04214907, "balance_loss_mlp": 1.01464307, "epoch": 0.8117597547045031, "flos": 23185150243200.0, "grad_norm": 2.020430092515925, "language_loss": 0.86557746, "learning_rate": 3.601831841730121e-07, "loss": 0.88712972, "num_input_tokens_seen": 145702660, "step": 6751, "time_per_iteration": 2.458836317062378 }, { "auxiliary_loss_clip": 0.01151578, "auxiliary_loss_mlp": 0.01022982, "balance_loss_clip": 1.04663992, "balance_loss_mlp": 1.01578462, "epoch": 0.8118799975951422, "flos": 23040250778880.0, "grad_norm": 1.9722918012160227, "language_loss": 0.72822905, "learning_rate": 3.5973735140984916e-07, "loss": 0.74997461, "num_input_tokens_seen": 145722830, "step": 6752, "time_per_iteration": 2.4769294261932373 }, { "auxiliary_loss_clip": 0.01103634, "auxiliary_loss_mlp": 0.00760804, "balance_loss_clip": 1.03789997, "balance_loss_mlp": 1.0003264, "epoch": 0.8120002404857812, "flos": 24639962889600.0, "grad_norm": 1.9830176021582018, "language_loss": 0.79507232, "learning_rate": 3.5929176747322607e-07, "loss": 0.81371665, "num_input_tokens_seen": 145741935, "step": 6753, "time_per_iteration": 2.562243700027466 }, { "auxiliary_loss_clip": 0.01046647, "auxiliary_loss_mlp": 0.01000779, "balance_loss_clip": 1.01001108, "balance_loss_mlp": 0.9998495, "epoch": 0.8121204833764204, "flos": 57415742156160.0, "grad_norm": 0.8115165272066597, "language_loss": 0.56236714, "learning_rate": 3.588464324307372e-07, "loss": 0.5828414, "num_input_tokens_seen": 145805560, "step": 6754, "time_per_iteration": 3.1068928241729736 }, { "auxiliary_loss_clip": 0.01151819, "auxiliary_loss_mlp": 0.01023164, "balance_loss_clip": 1.04366028, "balance_loss_mlp": 1.01618767, "epoch": 0.8122407262670595, "flos": 19464589549440.0, "grad_norm": 2.0230850209949782, "language_loss": 0.7553066, "learning_rate": 3.584013463499391e-07, "loss": 0.77705646, "num_input_tokens_seen": 145824180, "step": 6755, "time_per_iteration": 2.4217872619628906 }, { "auxiliary_loss_clip": 0.01045, "auxiliary_loss_mlp": 0.01002071, "balance_loss_clip": 1.01142633, "balance_loss_mlp": 1.00114107, "epoch": 0.8123609691576985, "flos": 56425325472000.0, "grad_norm": 0.7364622901970289, "language_loss": 0.64498335, "learning_rate": 3.579565092983521e-07, "loss": 0.66545409, "num_input_tokens_seen": 145885300, "step": 6756, "time_per_iteration": 2.9235947132110596 }, { "auxiliary_loss_clip": 0.01167596, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.04898143, "balance_loss_mlp": 1.02456522, "epoch": 0.8124812120483377, "flos": 20631973564800.0, "grad_norm": 2.057129140048583, "language_loss": 0.83786952, "learning_rate": 3.575119213434565e-07, "loss": 0.85986215, "num_input_tokens_seen": 145903815, "step": 6757, "time_per_iteration": 2.4005305767059326 }, { "auxiliary_loss_clip": 0.01148256, "auxiliary_loss_mlp": 0.01020378, "balance_loss_clip": 1.04555666, "balance_loss_mlp": 1.01345491, "epoch": 0.8126014549389767, "flos": 22492397566080.0, "grad_norm": 1.7362842786458825, "language_loss": 0.81990516, "learning_rate": 3.5706758255269765e-07, "loss": 0.84159148, "num_input_tokens_seen": 145922270, "step": 6758, "time_per_iteration": 2.451983690261841 }, { "auxiliary_loss_clip": 0.01140172, "auxiliary_loss_mlp": 0.01025422, "balance_loss_clip": 1.04530013, "balance_loss_mlp": 1.01828694, "epoch": 0.8127216978296158, "flos": 23287961946240.0, "grad_norm": 1.484807185471433, "language_loss": 0.69738537, "learning_rate": 3.566234929934795e-07, "loss": 0.71904129, "num_input_tokens_seen": 145941470, "step": 6759, "time_per_iteration": 2.4744327068328857 }, { "auxiliary_loss_clip": 0.01151942, "auxiliary_loss_mlp": 0.01028017, "balance_loss_clip": 1.04967868, "balance_loss_mlp": 1.02092731, "epoch": 0.812841940720255, "flos": 25154994049920.0, "grad_norm": 1.467336948036511, "language_loss": 0.717875, "learning_rate": 3.561796527331706e-07, "loss": 0.73967457, "num_input_tokens_seen": 145963145, "step": 6760, "time_per_iteration": 2.4990997314453125 }, { "auxiliary_loss_clip": 0.0112695, "auxiliary_loss_mlp": 0.01021798, "balance_loss_clip": 1.04342651, "balance_loss_mlp": 1.01450837, "epoch": 0.812962183610894, "flos": 26648446752000.0, "grad_norm": 3.316365297541003, "language_loss": 0.77720737, "learning_rate": 3.5573606183910163e-07, "loss": 0.79869485, "num_input_tokens_seen": 145983150, "step": 6761, "time_per_iteration": 3.4013988971710205 }, { "auxiliary_loss_clip": 0.01156931, "auxiliary_loss_mlp": 0.01023663, "balance_loss_clip": 1.04515123, "balance_loss_mlp": 1.0164094, "epoch": 0.8130824265015331, "flos": 24966965329920.0, "grad_norm": 3.1749222682202984, "language_loss": 0.7878952, "learning_rate": 3.5529272037856493e-07, "loss": 0.80970114, "num_input_tokens_seen": 146001365, "step": 6762, "time_per_iteration": 2.455958843231201 }, { "auxiliary_loss_clip": 0.01019128, "auxiliary_loss_mlp": 0.01001795, "balance_loss_clip": 1.01098633, "balance_loss_mlp": 1.00054884, "epoch": 0.8132026693921722, "flos": 67622918175360.0, "grad_norm": 0.7079936112692861, "language_loss": 0.5393635, "learning_rate": 3.548496284188149e-07, "loss": 0.5595727, "num_input_tokens_seen": 146061570, "step": 6763, "time_per_iteration": 4.004713773727417 }, { "auxiliary_loss_clip": 0.01106041, "auxiliary_loss_mlp": 0.01022403, "balance_loss_clip": 1.04592276, "balance_loss_mlp": 1.01542282, "epoch": 0.8133229122828113, "flos": 19495149045120.0, "grad_norm": 1.788848618826129, "language_loss": 0.7925967, "learning_rate": 3.544067860270681e-07, "loss": 0.81388116, "num_input_tokens_seen": 146079145, "step": 6764, "time_per_iteration": 2.506537437438965 }, { "auxiliary_loss_clip": 0.011261, "auxiliary_loss_mlp": 0.01022283, "balance_loss_clip": 1.04371929, "balance_loss_mlp": 1.01499963, "epoch": 0.8134431551734503, "flos": 20668135582080.0, "grad_norm": 1.6238590636023333, "language_loss": 0.71366942, "learning_rate": 3.539641932705029e-07, "loss": 0.7351532, "num_input_tokens_seen": 146097625, "step": 6765, "time_per_iteration": 3.4392738342285156 }, { "auxiliary_loss_clip": 0.01168683, "auxiliary_loss_mlp": 0.01022638, "balance_loss_clip": 1.0468924, "balance_loss_mlp": 1.01481187, "epoch": 0.8135633980640895, "flos": 21507332008320.0, "grad_norm": 2.0457383455938745, "language_loss": 0.76971227, "learning_rate": 3.53521850216262e-07, "loss": 0.7916255, "num_input_tokens_seen": 146117195, "step": 6766, "time_per_iteration": 2.4210317134857178 }, { "auxiliary_loss_clip": 0.0116716, "auxiliary_loss_mlp": 0.01025439, "balance_loss_clip": 1.04795361, "balance_loss_mlp": 1.01800609, "epoch": 0.8136836409547286, "flos": 20554442058240.0, "grad_norm": 1.8248419244985346, "language_loss": 0.76607561, "learning_rate": 3.530797569314461e-07, "loss": 0.78800166, "num_input_tokens_seen": 146136220, "step": 6767, "time_per_iteration": 2.449932813644409 }, { "auxiliary_loss_clip": 0.0116627, "auxiliary_loss_mlp": 0.01021435, "balance_loss_clip": 1.04790521, "balance_loss_mlp": 1.01436305, "epoch": 0.8138038838453676, "flos": 20299045380480.0, "grad_norm": 2.0417898437070305, "language_loss": 0.77742028, "learning_rate": 3.5263791348312235e-07, "loss": 0.79929733, "num_input_tokens_seen": 146155415, "step": 6768, "time_per_iteration": 3.1844124794006348 }, { "auxiliary_loss_clip": 0.01135365, "auxiliary_loss_mlp": 0.01020437, "balance_loss_clip": 1.04314303, "balance_loss_mlp": 1.01336169, "epoch": 0.8139241267360068, "flos": 29789840551680.0, "grad_norm": 1.835123071558431, "language_loss": 0.70644391, "learning_rate": 3.521963199383171e-07, "loss": 0.72800195, "num_input_tokens_seen": 146178370, "step": 6769, "time_per_iteration": 2.521782398223877 }, { "auxiliary_loss_clip": 0.01111653, "auxiliary_loss_mlp": 0.01024076, "balance_loss_clip": 1.04306459, "balance_loss_mlp": 1.01651525, "epoch": 0.8140443696266458, "flos": 19713270384000.0, "grad_norm": 1.8564443512224955, "language_loss": 0.77004403, "learning_rate": 3.517549763640197e-07, "loss": 0.79140127, "num_input_tokens_seen": 146196010, "step": 6770, "time_per_iteration": 2.511802911758423 }, { "auxiliary_loss_clip": 0.01150091, "auxiliary_loss_mlp": 0.00761422, "balance_loss_clip": 1.04841042, "balance_loss_mlp": 1.0003798, "epoch": 0.8141646125172849, "flos": 27160568910720.0, "grad_norm": 1.8615189306943924, "language_loss": 0.71163589, "learning_rate": 3.513138828271829e-07, "loss": 0.73075098, "num_input_tokens_seen": 146215880, "step": 6771, "time_per_iteration": 2.478868246078491 }, { "auxiliary_loss_clip": 0.01121097, "auxiliary_loss_mlp": 0.0102805, "balance_loss_clip": 1.04437292, "balance_loss_mlp": 1.02095151, "epoch": 0.8142848554079241, "flos": 39673102700160.0, "grad_norm": 2.1408818788341955, "language_loss": 0.70136356, "learning_rate": 3.508730393947179e-07, "loss": 0.72285497, "num_input_tokens_seen": 146239135, "step": 6772, "time_per_iteration": 2.6523797512054443 }, { "auxiliary_loss_clip": 0.01124148, "auxiliary_loss_mlp": 0.01025436, "balance_loss_clip": 1.04441214, "balance_loss_mlp": 1.01801252, "epoch": 0.8144050982985631, "flos": 22237288197120.0, "grad_norm": 1.5977252217901663, "language_loss": 0.72052252, "learning_rate": 3.504324461335024e-07, "loss": 0.74201846, "num_input_tokens_seen": 146259245, "step": 6773, "time_per_iteration": 2.5373635292053223 }, { "auxiliary_loss_clip": 0.01101859, "auxiliary_loss_mlp": 0.01025827, "balance_loss_clip": 1.04027641, "balance_loss_mlp": 1.01764369, "epoch": 0.8145253411892022, "flos": 23038239617280.0, "grad_norm": 1.9788502437136732, "language_loss": 0.88268816, "learning_rate": 3.499921031103732e-07, "loss": 0.90396506, "num_input_tokens_seen": 146280015, "step": 6774, "time_per_iteration": 2.567244052886963 }, { "auxiliary_loss_clip": 0.01133096, "auxiliary_loss_mlp": 0.01021849, "balance_loss_clip": 1.04251802, "balance_loss_mlp": 1.01452994, "epoch": 0.8146455840798413, "flos": 24827668387200.0, "grad_norm": 1.7630653603384026, "language_loss": 0.78407025, "learning_rate": 3.4955201039212987e-07, "loss": 0.80561972, "num_input_tokens_seen": 146300935, "step": 6775, "time_per_iteration": 2.546619415283203 }, { "auxiliary_loss_clip": 0.01158709, "auxiliary_loss_mlp": 0.0102378, "balance_loss_clip": 1.04821849, "balance_loss_mlp": 1.01678216, "epoch": 0.8147658269704804, "flos": 19974520978560.0, "grad_norm": 1.8649612895370975, "language_loss": 0.65722191, "learning_rate": 3.4911216804553465e-07, "loss": 0.67904675, "num_input_tokens_seen": 146319835, "step": 6776, "time_per_iteration": 2.4294660091400146 }, { "auxiliary_loss_clip": 0.01137693, "auxiliary_loss_mlp": 0.01028213, "balance_loss_clip": 1.04436111, "balance_loss_mlp": 1.02024341, "epoch": 0.8148860698611194, "flos": 21178031097600.0, "grad_norm": 2.32663602890063, "language_loss": 0.70550597, "learning_rate": 3.4867257613731017e-07, "loss": 0.72716498, "num_input_tokens_seen": 146339030, "step": 6777, "time_per_iteration": 2.457915782928467 }, { "auxiliary_loss_clip": 0.0113917, "auxiliary_loss_mlp": 0.01028638, "balance_loss_clip": 1.0445354, "balance_loss_mlp": 1.02190268, "epoch": 0.8150063127517585, "flos": 19606903234560.0, "grad_norm": 1.7156260156982601, "language_loss": 0.85830152, "learning_rate": 3.4823323473414343e-07, "loss": 0.87997961, "num_input_tokens_seen": 146358550, "step": 6778, "time_per_iteration": 2.468317985534668 }, { "auxiliary_loss_clip": 0.01129731, "auxiliary_loss_mlp": 0.01028186, "balance_loss_clip": 1.04342556, "balance_loss_mlp": 1.02039576, "epoch": 0.8151265556423977, "flos": 22638374438400.0, "grad_norm": 1.983731193480427, "language_loss": 0.76251066, "learning_rate": 3.477941439026812e-07, "loss": 0.7840898, "num_input_tokens_seen": 146376770, "step": 6779, "time_per_iteration": 2.494454860687256 }, { "auxiliary_loss_clip": 0.01138949, "auxiliary_loss_mlp": 0.01020383, "balance_loss_clip": 1.0453769, "balance_loss_mlp": 1.01381469, "epoch": 0.8152467985330367, "flos": 17968048277760.0, "grad_norm": 1.7503836485970063, "language_loss": 0.7342881, "learning_rate": 3.473553037095349e-07, "loss": 0.75588149, "num_input_tokens_seen": 146395795, "step": 6780, "time_per_iteration": 2.452880382537842 }, { "auxiliary_loss_clip": 0.01131214, "auxiliary_loss_mlp": 0.01023514, "balance_loss_clip": 1.04388928, "balance_loss_mlp": 1.01690102, "epoch": 0.8153670414236758, "flos": 24969012405120.0, "grad_norm": 1.9062589312524958, "language_loss": 0.83227015, "learning_rate": 3.469167142212743e-07, "loss": 0.85381746, "num_input_tokens_seen": 146417640, "step": 6781, "time_per_iteration": 2.5235676765441895 }, { "auxiliary_loss_clip": 0.01152717, "auxiliary_loss_mlp": 0.01021006, "balance_loss_clip": 1.04702604, "balance_loss_mlp": 1.0134542, "epoch": 0.8154872843143149, "flos": 31066069754880.0, "grad_norm": 2.3758032009090173, "language_loss": 0.63515741, "learning_rate": 3.4647837550443337e-07, "loss": 0.65689468, "num_input_tokens_seen": 146436205, "step": 6782, "time_per_iteration": 2.5068750381469727 }, { "auxiliary_loss_clip": 0.01125705, "auxiliary_loss_mlp": 0.01024466, "balance_loss_clip": 1.04395533, "balance_loss_mlp": 1.01763213, "epoch": 0.815607527204954, "flos": 19391654983680.0, "grad_norm": 1.7211574302120567, "language_loss": 0.74506408, "learning_rate": 3.460402876255086e-07, "loss": 0.7665658, "num_input_tokens_seen": 146453595, "step": 6783, "time_per_iteration": 2.486147403717041 }, { "auxiliary_loss_clip": 0.01153723, "auxiliary_loss_mlp": 0.01021259, "balance_loss_clip": 1.04567194, "balance_loss_mlp": 1.01410079, "epoch": 0.815727770095593, "flos": 26140418743680.0, "grad_norm": 2.3238753950209503, "language_loss": 0.72175866, "learning_rate": 3.456024506509574e-07, "loss": 0.74350846, "num_input_tokens_seen": 146474515, "step": 6784, "time_per_iteration": 2.4733428955078125 }, { "auxiliary_loss_clip": 0.01152921, "auxiliary_loss_mlp": 0.00762019, "balance_loss_clip": 1.04905045, "balance_loss_mlp": 1.00027728, "epoch": 0.8158480129862322, "flos": 25337527989120.0, "grad_norm": 1.5051325928858246, "language_loss": 0.74103093, "learning_rate": 3.4516486464719873e-07, "loss": 0.76018029, "num_input_tokens_seen": 146493905, "step": 6785, "time_per_iteration": 2.4823248386383057 }, { "auxiliary_loss_clip": 0.01106708, "auxiliary_loss_mlp": 0.01025541, "balance_loss_clip": 1.04218411, "balance_loss_mlp": 1.01806688, "epoch": 0.8159682558768713, "flos": 34423645559040.0, "grad_norm": 2.8210384125043633, "language_loss": 0.61852992, "learning_rate": 3.4472752968061445e-07, "loss": 0.6398524, "num_input_tokens_seen": 146518335, "step": 6786, "time_per_iteration": 2.6731882095336914 }, { "auxiliary_loss_clip": 0.01150896, "auxiliary_loss_mlp": 0.01022862, "balance_loss_clip": 1.04534078, "balance_loss_mlp": 1.01609671, "epoch": 0.8160884987675103, "flos": 18653223185280.0, "grad_norm": 1.8253541527372343, "language_loss": 0.73594785, "learning_rate": 3.442904458175475e-07, "loss": 0.75768542, "num_input_tokens_seen": 146535655, "step": 6787, "time_per_iteration": 2.4440877437591553 }, { "auxiliary_loss_clip": 0.01149909, "auxiliary_loss_mlp": 0.01022029, "balance_loss_clip": 1.04453826, "balance_loss_mlp": 1.01494253, "epoch": 0.8162087416581495, "flos": 31430527102080.0, "grad_norm": 1.5866742510091092, "language_loss": 0.75886804, "learning_rate": 3.438536131243044e-07, "loss": 0.78058738, "num_input_tokens_seen": 146556815, "step": 6788, "time_per_iteration": 3.3892970085144043 }, { "auxiliary_loss_clip": 0.01141134, "auxiliary_loss_mlp": 0.01021383, "balance_loss_clip": 1.04529071, "balance_loss_mlp": 1.01405191, "epoch": 0.8163289845487885, "flos": 37593910915200.0, "grad_norm": 2.0582690814130373, "language_loss": 0.62014854, "learning_rate": 3.434170316671503e-07, "loss": 0.6417737, "num_input_tokens_seen": 146581845, "step": 6789, "time_per_iteration": 2.616269826889038 }, { "auxiliary_loss_clip": 0.01119483, "auxiliary_loss_mlp": 0.01021749, "balance_loss_clip": 1.04565001, "balance_loss_mlp": 1.01456952, "epoch": 0.8164492274394276, "flos": 13953989554560.0, "grad_norm": 2.5681373642359984, "language_loss": 0.89551032, "learning_rate": 3.4298070151231583e-07, "loss": 0.91692269, "num_input_tokens_seen": 146597245, "step": 6790, "time_per_iteration": 4.1213014125823975 }, { "auxiliary_loss_clip": 0.01141093, "auxiliary_loss_mlp": 0.01023911, "balance_loss_clip": 1.04423189, "balance_loss_mlp": 1.01686323, "epoch": 0.8165694703300668, "flos": 28986554747520.0, "grad_norm": 1.807408001445995, "language_loss": 0.59984934, "learning_rate": 3.425446227259916e-07, "loss": 0.62149942, "num_input_tokens_seen": 146618210, "step": 6791, "time_per_iteration": 2.5338826179504395 }, { "auxiliary_loss_clip": 0.01138134, "auxiliary_loss_mlp": 0.01022011, "balance_loss_clip": 1.04407978, "balance_loss_mlp": 1.0158453, "epoch": 0.8166897132207058, "flos": 25118365155840.0, "grad_norm": 1.8671361287771309, "language_loss": 0.82407534, "learning_rate": 3.421087953743296e-07, "loss": 0.84567678, "num_input_tokens_seen": 146637975, "step": 6792, "time_per_iteration": 2.5040810108184814 }, { "auxiliary_loss_clip": 0.01151004, "auxiliary_loss_mlp": 0.0102189, "balance_loss_clip": 1.04346251, "balance_loss_mlp": 1.01463604, "epoch": 0.8168099561113449, "flos": 23148593176320.0, "grad_norm": 2.553035375556881, "language_loss": 0.80015707, "learning_rate": 3.416732195234464e-07, "loss": 0.82188606, "num_input_tokens_seen": 146658030, "step": 6793, "time_per_iteration": 2.4484047889709473 }, { "auxiliary_loss_clip": 0.01154799, "auxiliary_loss_mlp": 0.01019627, "balance_loss_clip": 1.04622722, "balance_loss_mlp": 1.01309741, "epoch": 0.816930199001984, "flos": 18407666833920.0, "grad_norm": 1.5536434911464005, "language_loss": 0.79532695, "learning_rate": 3.4123789523941613e-07, "loss": 0.81707126, "num_input_tokens_seen": 146677855, "step": 6794, "time_per_iteration": 3.206885814666748 }, { "auxiliary_loss_clip": 0.01146089, "auxiliary_loss_mlp": 0.0102228, "balance_loss_clip": 1.04367006, "balance_loss_mlp": 1.01458549, "epoch": 0.8170504418926231, "flos": 21251324799360.0, "grad_norm": 1.4107944683858575, "language_loss": 0.63365006, "learning_rate": 3.4080282258827884e-07, "loss": 0.65533376, "num_input_tokens_seen": 146696230, "step": 6795, "time_per_iteration": 2.4373180866241455 }, { "auxiliary_loss_clip": 0.01152875, "auxiliary_loss_mlp": 0.01024394, "balance_loss_clip": 1.04489505, "balance_loss_mlp": 1.01735151, "epoch": 0.8171706847832622, "flos": 19099234362240.0, "grad_norm": 2.140463471777983, "language_loss": 0.72583759, "learning_rate": 3.403680016360342e-07, "loss": 0.74761027, "num_input_tokens_seen": 146714835, "step": 6796, "time_per_iteration": 2.417269229888916 }, { "auxiliary_loss_clip": 0.011466, "auxiliary_loss_mlp": 0.01028384, "balance_loss_clip": 1.04709888, "balance_loss_mlp": 1.02022433, "epoch": 0.8172909276739013, "flos": 21470128496640.0, "grad_norm": 1.710010476710125, "language_loss": 0.67913902, "learning_rate": 3.3993343244864403e-07, "loss": 0.70088887, "num_input_tokens_seen": 146734425, "step": 6797, "time_per_iteration": 2.448167562484741 }, { "auxiliary_loss_clip": 0.01150581, "auxiliary_loss_mlp": 0.01023064, "balance_loss_clip": 1.04670596, "balance_loss_mlp": 1.01618886, "epoch": 0.8174111705645404, "flos": 27599792417280.0, "grad_norm": 1.5355567119877302, "language_loss": 0.72985637, "learning_rate": 3.394991150920323e-07, "loss": 0.75159276, "num_input_tokens_seen": 146757545, "step": 6798, "time_per_iteration": 2.4922478199005127 }, { "auxiliary_loss_clip": 0.01112965, "auxiliary_loss_mlp": 0.00762497, "balance_loss_clip": 1.04283333, "balance_loss_mlp": 1.00042295, "epoch": 0.8175314134551794, "flos": 14064594508800.0, "grad_norm": 2.1401578866203996, "language_loss": 0.74430454, "learning_rate": 3.3906504963208396e-07, "loss": 0.76305914, "num_input_tokens_seen": 146774240, "step": 6799, "time_per_iteration": 2.5008223056793213 }, { "auxiliary_loss_clip": 0.01105796, "auxiliary_loss_mlp": 0.01020806, "balance_loss_clip": 1.04337239, "balance_loss_mlp": 1.01371288, "epoch": 0.8176516563458186, "flos": 22708076780160.0, "grad_norm": 1.8264213608802478, "language_loss": 0.66216028, "learning_rate": 3.3863123613464774e-07, "loss": 0.68342632, "num_input_tokens_seen": 146793140, "step": 6800, "time_per_iteration": 2.5437674522399902 }, { "auxiliary_loss_clip": 0.01137908, "auxiliary_loss_mlp": 0.01024504, "balance_loss_clip": 1.04049945, "balance_loss_mlp": 1.0177331, "epoch": 0.8177718992364577, "flos": 21945406279680.0, "grad_norm": 1.7232300717056792, "language_loss": 0.75027788, "learning_rate": 3.381976746655317e-07, "loss": 0.77190197, "num_input_tokens_seen": 146812895, "step": 6801, "time_per_iteration": 2.479572296142578 }, { "auxiliary_loss_clip": 0.01102541, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 1.04279757, "balance_loss_mlp": 1.01776099, "epoch": 0.8178921421270967, "flos": 22017443005440.0, "grad_norm": 2.240674159579414, "language_loss": 0.67516673, "learning_rate": 3.3776436529050756e-07, "loss": 0.69643903, "num_input_tokens_seen": 146832445, "step": 6802, "time_per_iteration": 2.5210154056549072 }, { "auxiliary_loss_clip": 0.01161293, "auxiliary_loss_mlp": 0.01024136, "balance_loss_clip": 1.04499745, "balance_loss_mlp": 1.01685262, "epoch": 0.8180123850177359, "flos": 33183111496320.0, "grad_norm": 1.5574127030815785, "language_loss": 0.72516805, "learning_rate": 3.373313080753073e-07, "loss": 0.74702239, "num_input_tokens_seen": 146856505, "step": 6803, "time_per_iteration": 2.507780075073242 }, { "auxiliary_loss_clip": 0.01146004, "auxiliary_loss_mlp": 0.01026121, "balance_loss_clip": 1.0437026, "balance_loss_mlp": 1.01911998, "epoch": 0.8181326279083749, "flos": 22091167670400.0, "grad_norm": 1.5935360346107537, "language_loss": 0.77332461, "learning_rate": 3.3689850308562527e-07, "loss": 0.79504585, "num_input_tokens_seen": 146876950, "step": 6804, "time_per_iteration": 2.45294451713562 }, { "auxiliary_loss_clip": 0.01102351, "auxiliary_loss_mlp": 0.01025249, "balance_loss_clip": 1.0438962, "balance_loss_mlp": 1.01847172, "epoch": 0.818252870799014, "flos": 15705747936000.0, "grad_norm": 1.988547393998068, "language_loss": 0.7751469, "learning_rate": 3.364659503871183e-07, "loss": 0.79642284, "num_input_tokens_seen": 146894885, "step": 6805, "time_per_iteration": 2.517223834991455 }, { "auxiliary_loss_clip": 0.01121308, "auxiliary_loss_mlp": 0.01022799, "balance_loss_clip": 1.04175854, "balance_loss_mlp": 1.01660275, "epoch": 0.8183731136896532, "flos": 18770687637120.0, "grad_norm": 1.8962312875875167, "language_loss": 0.83957648, "learning_rate": 3.3603365004540417e-07, "loss": 0.86101753, "num_input_tokens_seen": 146913180, "step": 6806, "time_per_iteration": 2.4995391368865967 }, { "auxiliary_loss_clip": 0.01164914, "auxiliary_loss_mlp": 0.01025446, "balance_loss_clip": 1.04882169, "balance_loss_mlp": 1.01820111, "epoch": 0.8184933565802922, "flos": 26541792293760.0, "grad_norm": 4.5038079164222955, "language_loss": 0.77001929, "learning_rate": 3.356016021260624e-07, "loss": 0.79192287, "num_input_tokens_seen": 146933510, "step": 6807, "time_per_iteration": 2.434100389480591 }, { "auxiliary_loss_clip": 0.01152504, "auxiliary_loss_mlp": 0.01023411, "balance_loss_clip": 1.04692888, "balance_loss_mlp": 1.01617527, "epoch": 0.8186135994709313, "flos": 17530117660800.0, "grad_norm": 4.464870280662668, "language_loss": 0.65455735, "learning_rate": 3.35169806694634e-07, "loss": 0.6763165, "num_input_tokens_seen": 146951760, "step": 6808, "time_per_iteration": 2.418520212173462 }, { "auxiliary_loss_clip": 0.01031117, "auxiliary_loss_mlp": 0.01001583, "balance_loss_clip": 1.01141548, "balance_loss_mlp": 1.00076663, "epoch": 0.8187338423615703, "flos": 63480300675840.0, "grad_norm": 0.7969511332793651, "language_loss": 0.60679591, "learning_rate": 3.3473826381662186e-07, "loss": 0.627123, "num_input_tokens_seen": 147022900, "step": 6809, "time_per_iteration": 3.213564872741699 }, { "auxiliary_loss_clip": 0.0114486, "auxiliary_loss_mlp": 0.01022537, "balance_loss_clip": 1.04562509, "balance_loss_mlp": 1.01572168, "epoch": 0.8188540852522095, "flos": 17529974006400.0, "grad_norm": 1.808331669833302, "language_loss": 0.8150667, "learning_rate": 3.3430697355749216e-07, "loss": 0.83674073, "num_input_tokens_seen": 147040590, "step": 6810, "time_per_iteration": 2.410327434539795 }, { "auxiliary_loss_clip": 0.01105508, "auxiliary_loss_mlp": 0.01023655, "balance_loss_clip": 1.04036248, "balance_loss_mlp": 1.01603508, "epoch": 0.8189743281428485, "flos": 14392530702720.0, "grad_norm": 1.8819289404063788, "language_loss": 0.75460494, "learning_rate": 3.3387593598266907e-07, "loss": 0.77589655, "num_input_tokens_seen": 147057200, "step": 6811, "time_per_iteration": 2.4785306453704834 }, { "auxiliary_loss_clip": 0.01114251, "auxiliary_loss_mlp": 0.01021904, "balance_loss_clip": 1.04026806, "balance_loss_mlp": 1.01505256, "epoch": 0.8190945710334876, "flos": 25080479285760.0, "grad_norm": 2.59803691005206, "language_loss": 0.78205597, "learning_rate": 3.3344515115754225e-07, "loss": 0.80341756, "num_input_tokens_seen": 147076180, "step": 6812, "time_per_iteration": 2.528937816619873 }, { "auxiliary_loss_clip": 0.01127628, "auxiliary_loss_mlp": 0.0102061, "balance_loss_clip": 1.04191816, "balance_loss_mlp": 1.01353741, "epoch": 0.8192148139241268, "flos": 21507152440320.0, "grad_norm": 3.966273827817656, "language_loss": 0.80260217, "learning_rate": 3.33014619147461e-07, "loss": 0.82408452, "num_input_tokens_seen": 147094205, "step": 6813, "time_per_iteration": 2.5006282329559326 }, { "auxiliary_loss_clip": 0.01138862, "auxiliary_loss_mlp": 0.01026467, "balance_loss_clip": 1.04736912, "balance_loss_mlp": 1.01959419, "epoch": 0.8193350568147658, "flos": 23952166289280.0, "grad_norm": 2.9015100002381757, "language_loss": 0.71450073, "learning_rate": 3.325843400177362e-07, "loss": 0.73615396, "num_input_tokens_seen": 147115545, "step": 6814, "time_per_iteration": 3.3452374935150146 }, { "auxiliary_loss_clip": 0.01155299, "auxiliary_loss_mlp": 0.00761855, "balance_loss_clip": 1.04669714, "balance_loss_mlp": 1.00044894, "epoch": 0.8194552997054049, "flos": 20559469962240.0, "grad_norm": 1.8868272918197915, "language_loss": 0.73585165, "learning_rate": 3.32154313833642e-07, "loss": 0.75502312, "num_input_tokens_seen": 147135700, "step": 6815, "time_per_iteration": 2.52557635307312 }, { "auxiliary_loss_clip": 0.01167123, "auxiliary_loss_mlp": 0.01025334, "balance_loss_clip": 1.04735839, "balance_loss_mlp": 1.01802087, "epoch": 0.819575542596044, "flos": 26031753123840.0, "grad_norm": 2.701960319744234, "language_loss": 0.59457517, "learning_rate": 3.3172454066041164e-07, "loss": 0.61649978, "num_input_tokens_seen": 147155205, "step": 6816, "time_per_iteration": 3.199972629547119 }, { "auxiliary_loss_clip": 0.01096637, "auxiliary_loss_mlp": 0.00761462, "balance_loss_clip": 1.04412985, "balance_loss_mlp": 1.00040317, "epoch": 0.8196957854866831, "flos": 29096944220160.0, "grad_norm": 1.8209534521393016, "language_loss": 0.75782466, "learning_rate": 3.3129502056324234e-07, "loss": 0.77640569, "num_input_tokens_seen": 147176570, "step": 6817, "time_per_iteration": 3.434685707092285 }, { "auxiliary_loss_clip": 0.01005078, "auxiliary_loss_mlp": 0.01001706, "balance_loss_clip": 1.01192069, "balance_loss_mlp": 1.00066268, "epoch": 0.8198160283773221, "flos": 69033631898880.0, "grad_norm": 0.8014186045695255, "language_loss": 0.59753567, "learning_rate": 3.3086575360729165e-07, "loss": 0.61760348, "num_input_tokens_seen": 147234105, "step": 6818, "time_per_iteration": 3.1724495887756348 }, { "auxiliary_loss_clip": 0.011364, "auxiliary_loss_mlp": 0.01025633, "balance_loss_clip": 1.04478908, "balance_loss_mlp": 1.0182184, "epoch": 0.8199362712679613, "flos": 16618058496000.0, "grad_norm": 1.6375991749784842, "language_loss": 0.71092522, "learning_rate": 3.3043673985767906e-07, "loss": 0.73254561, "num_input_tokens_seen": 147253170, "step": 6819, "time_per_iteration": 2.771613121032715 }, { "auxiliary_loss_clip": 0.0111398, "auxiliary_loss_mlp": 0.01026777, "balance_loss_clip": 1.03930044, "balance_loss_mlp": 1.01935029, "epoch": 0.8200565141586004, "flos": 21757664868480.0, "grad_norm": 1.836627459465846, "language_loss": 0.77541542, "learning_rate": 3.3000797937948564e-07, "loss": 0.79682302, "num_input_tokens_seen": 147271465, "step": 6820, "time_per_iteration": 2.5191915035247803 }, { "auxiliary_loss_clip": 0.01031906, "auxiliary_loss_mlp": 0.01001433, "balance_loss_clip": 1.00739598, "balance_loss_mlp": 1.00044334, "epoch": 0.8201767570492394, "flos": 69807112392960.0, "grad_norm": 0.9465417163068718, "language_loss": 0.65043789, "learning_rate": 3.295794722377534e-07, "loss": 0.6707713, "num_input_tokens_seen": 147335070, "step": 6821, "time_per_iteration": 4.202096939086914 }, { "auxiliary_loss_clip": 0.01161817, "auxiliary_loss_mlp": 0.01024813, "balance_loss_clip": 1.0443958, "balance_loss_mlp": 1.01835465, "epoch": 0.8202969999398786, "flos": 23111892455040.0, "grad_norm": 2.0523362890499146, "language_loss": 0.7999754, "learning_rate": 3.291512184974876e-07, "loss": 0.82184172, "num_input_tokens_seen": 147355460, "step": 6822, "time_per_iteration": 2.4443702697753906 }, { "auxiliary_loss_clip": 0.01134688, "auxiliary_loss_mlp": 0.010225, "balance_loss_clip": 1.04119492, "balance_loss_mlp": 1.01518691, "epoch": 0.8204172428305176, "flos": 28220616109440.0, "grad_norm": 1.6446217423910388, "language_loss": 0.66581541, "learning_rate": 3.2872321822365346e-07, "loss": 0.68738729, "num_input_tokens_seen": 147375675, "step": 6823, "time_per_iteration": 2.5599071979522705 }, { "auxiliary_loss_clip": 0.01150777, "auxiliary_loss_mlp": 0.01020447, "balance_loss_clip": 1.04727721, "balance_loss_mlp": 1.01334214, "epoch": 0.8205374857211567, "flos": 20887011106560.0, "grad_norm": 1.8260951084729837, "language_loss": 0.73330677, "learning_rate": 3.282954714811783e-07, "loss": 0.75501901, "num_input_tokens_seen": 147394580, "step": 6824, "time_per_iteration": 2.4431612491607666 }, { "auxiliary_loss_clip": 0.01124304, "auxiliary_loss_mlp": 0.01023651, "balance_loss_clip": 1.04019284, "balance_loss_mlp": 1.01628447, "epoch": 0.8206577286117959, "flos": 13152140294400.0, "grad_norm": 2.3441282185655514, "language_loss": 0.70823205, "learning_rate": 3.2786797833495093e-07, "loss": 0.72971153, "num_input_tokens_seen": 147409935, "step": 6825, "time_per_iteration": 2.463106155395508 }, { "auxiliary_loss_clip": 0.01163014, "auxiliary_loss_mlp": 0.01024655, "balance_loss_clip": 1.04668975, "balance_loss_mlp": 1.01823604, "epoch": 0.8207779715024349, "flos": 25265634917760.0, "grad_norm": 1.7494974601576794, "language_loss": 0.7275176, "learning_rate": 3.274407388498213e-07, "loss": 0.7493943, "num_input_tokens_seen": 147428065, "step": 6826, "time_per_iteration": 2.440802812576294 }, { "auxiliary_loss_clip": 0.01118471, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 1.04158759, "balance_loss_mlp": 1.02046967, "epoch": 0.820898214393074, "flos": 19610243199360.0, "grad_norm": 5.975114684799946, "language_loss": 0.73896289, "learning_rate": 3.270137530906021e-07, "loss": 0.7604214, "num_input_tokens_seen": 147447300, "step": 6827, "time_per_iteration": 2.5104928016662598 }, { "auxiliary_loss_clip": 0.01102647, "auxiliary_loss_mlp": 0.01023184, "balance_loss_clip": 1.04478908, "balance_loss_mlp": 1.01661253, "epoch": 0.8210184572837131, "flos": 15596615439360.0, "grad_norm": 1.7635218594643283, "language_loss": 0.83393168, "learning_rate": 3.265870211220665e-07, "loss": 0.85519004, "num_input_tokens_seen": 147465135, "step": 6828, "time_per_iteration": 2.555386781692505 }, { "auxiliary_loss_clip": 0.01121022, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.04428482, "balance_loss_mlp": 1.02113509, "epoch": 0.8211387001743522, "flos": 20813932886400.0, "grad_norm": 1.9315952694056193, "language_loss": 0.81658369, "learning_rate": 3.2616054300894934e-07, "loss": 0.83808136, "num_input_tokens_seen": 147484585, "step": 6829, "time_per_iteration": 2.6306464672088623 }, { "auxiliary_loss_clip": 0.01128758, "auxiliary_loss_mlp": 0.01021882, "balance_loss_clip": 1.04499853, "balance_loss_mlp": 1.01465201, "epoch": 0.8212589430649913, "flos": 27704579368320.0, "grad_norm": 1.952161659050395, "language_loss": 0.84430063, "learning_rate": 3.2573431881594693e-07, "loss": 0.865807, "num_input_tokens_seen": 147504130, "step": 6830, "time_per_iteration": 2.5522513389587402 }, { "auxiliary_loss_clip": 0.01094959, "auxiliary_loss_mlp": 0.01022258, "balance_loss_clip": 1.03914845, "balance_loss_mlp": 1.01516461, "epoch": 0.8213791859556304, "flos": 22455625017600.0, "grad_norm": 2.0103338968504167, "language_loss": 0.65954649, "learning_rate": 3.2530834860771663e-07, "loss": 0.68071866, "num_input_tokens_seen": 147523510, "step": 6831, "time_per_iteration": 2.6042492389678955 }, { "auxiliary_loss_clip": 0.01151151, "auxiliary_loss_mlp": 0.01023988, "balance_loss_clip": 1.04412436, "balance_loss_mlp": 1.01647198, "epoch": 0.8214994288462695, "flos": 16654471908480.0, "grad_norm": 1.9991066861645834, "language_loss": 0.74314129, "learning_rate": 3.248826324488794e-07, "loss": 0.7648927, "num_input_tokens_seen": 147540805, "step": 6832, "time_per_iteration": 2.421898365020752 }, { "auxiliary_loss_clip": 0.01166024, "auxiliary_loss_mlp": 0.01026505, "balance_loss_clip": 1.05060315, "balance_loss_mlp": 1.0197432, "epoch": 0.8216196717369085, "flos": 25221787390080.0, "grad_norm": 1.6731329061506115, "language_loss": 0.8793807, "learning_rate": 3.244571704040138e-07, "loss": 0.90130591, "num_input_tokens_seen": 147560965, "step": 6833, "time_per_iteration": 2.4621951580047607 }, { "auxiliary_loss_clip": 0.01148958, "auxiliary_loss_mlp": 0.01026551, "balance_loss_clip": 1.04374933, "balance_loss_mlp": 1.01836705, "epoch": 0.8217399146275477, "flos": 25371930240000.0, "grad_norm": 2.232277605495166, "language_loss": 0.73934895, "learning_rate": 3.2403196253766374e-07, "loss": 0.76110405, "num_input_tokens_seen": 147580045, "step": 6834, "time_per_iteration": 2.4883499145507812 }, { "auxiliary_loss_clip": 0.01148492, "auxiliary_loss_mlp": 0.01025578, "balance_loss_clip": 1.04524648, "balance_loss_mlp": 1.01779974, "epoch": 0.8218601575181868, "flos": 25629625388160.0, "grad_norm": 2.4671913219894672, "language_loss": 0.78968859, "learning_rate": 3.2360700891433254e-07, "loss": 0.81142926, "num_input_tokens_seen": 147599070, "step": 6835, "time_per_iteration": 2.4703404903411865 }, { "auxiliary_loss_clip": 0.01023107, "auxiliary_loss_mlp": 0.01002827, "balance_loss_clip": 1.01077461, "balance_loss_mlp": 1.0017724, "epoch": 0.8219804004088258, "flos": 67660229427840.0, "grad_norm": 0.799394554818687, "language_loss": 0.57317376, "learning_rate": 3.231823095984847e-07, "loss": 0.59343314, "num_input_tokens_seen": 147653710, "step": 6836, "time_per_iteration": 3.033067226409912 }, { "auxiliary_loss_clip": 0.01137238, "auxiliary_loss_mlp": 0.01022, "balance_loss_clip": 1.04527628, "balance_loss_mlp": 1.01504469, "epoch": 0.822100643299465, "flos": 19464266327040.0, "grad_norm": 2.012688134441855, "language_loss": 0.76158744, "learning_rate": 3.2275786465454814e-07, "loss": 0.78317982, "num_input_tokens_seen": 147670360, "step": 6837, "time_per_iteration": 2.4612369537353516 }, { "auxiliary_loss_clip": 0.01120838, "auxiliary_loss_mlp": 0.01023007, "balance_loss_clip": 1.04266298, "balance_loss_mlp": 1.01639128, "epoch": 0.822220886190104, "flos": 24681368292480.0, "grad_norm": 2.137336169612773, "language_loss": 0.75605828, "learning_rate": 3.2233367414690917e-07, "loss": 0.7774967, "num_input_tokens_seen": 147692550, "step": 6838, "time_per_iteration": 2.548297643661499 }, { "auxiliary_loss_clip": 0.01118648, "auxiliary_loss_mlp": 0.01022626, "balance_loss_clip": 1.04009938, "balance_loss_mlp": 1.01574206, "epoch": 0.8223411290807431, "flos": 27819062991360.0, "grad_norm": 2.6293256821619257, "language_loss": 0.8494792, "learning_rate": 3.219097381399183e-07, "loss": 0.87089193, "num_input_tokens_seen": 147709725, "step": 6839, "time_per_iteration": 2.5535924434661865 }, { "auxiliary_loss_clip": 0.01143055, "auxiliary_loss_mlp": 0.01024934, "balance_loss_clip": 1.04465055, "balance_loss_mlp": 1.01835024, "epoch": 0.8224613719713821, "flos": 23218546913280.0, "grad_norm": 2.384893230331127, "language_loss": 0.80890906, "learning_rate": 3.2148605669788584e-07, "loss": 0.83058888, "num_input_tokens_seen": 147729615, "step": 6840, "time_per_iteration": 3.3277125358581543 }, { "auxiliary_loss_clip": 0.01138229, "auxiliary_loss_mlp": 0.0102417, "balance_loss_clip": 1.04555702, "balance_loss_mlp": 1.01680923, "epoch": 0.8225816148620213, "flos": 15706250726400.0, "grad_norm": 2.6911656078117816, "language_loss": 0.77285266, "learning_rate": 3.2106262988508405e-07, "loss": 0.79447669, "num_input_tokens_seen": 147747665, "step": 6841, "time_per_iteration": 2.462916851043701 }, { "auxiliary_loss_clip": 0.01139259, "auxiliary_loss_mlp": 0.0102207, "balance_loss_clip": 1.04456878, "balance_loss_mlp": 1.01505494, "epoch": 0.8227018577526604, "flos": 18515111391360.0, "grad_norm": 1.7893112431219729, "language_loss": 0.74198681, "learning_rate": 3.206394577657465e-07, "loss": 0.76360011, "num_input_tokens_seen": 147765445, "step": 6842, "time_per_iteration": 2.455817222595215 }, { "auxiliary_loss_clip": 0.01156358, "auxiliary_loss_mlp": 0.01028684, "balance_loss_clip": 1.04799604, "balance_loss_mlp": 1.02120948, "epoch": 0.8228221006432994, "flos": 22236785406720.0, "grad_norm": 2.408334106447516, "language_loss": 0.72702813, "learning_rate": 3.202165404040675e-07, "loss": 0.74887854, "num_input_tokens_seen": 147783365, "step": 6843, "time_per_iteration": 3.2742409706115723 }, { "auxiliary_loss_clip": 0.01097748, "auxiliary_loss_mlp": 0.0102869, "balance_loss_clip": 1.0427283, "balance_loss_mlp": 1.02111745, "epoch": 0.8229423435339386, "flos": 24097532630400.0, "grad_norm": 2.8885698183824884, "language_loss": 0.74479699, "learning_rate": 3.1979387786420396e-07, "loss": 0.76606143, "num_input_tokens_seen": 147803605, "step": 6844, "time_per_iteration": 2.601386308670044 }, { "auxiliary_loss_clip": 0.01138487, "auxiliary_loss_mlp": 0.01018616, "balance_loss_clip": 1.04223073, "balance_loss_mlp": 1.01184177, "epoch": 0.8230625864245776, "flos": 23878549365120.0, "grad_norm": 1.7502135431720856, "language_loss": 0.82327712, "learning_rate": 3.1937147021027346e-07, "loss": 0.84484816, "num_input_tokens_seen": 147822060, "step": 6845, "time_per_iteration": 2.4975786209106445 }, { "auxiliary_loss_clip": 0.01148717, "auxiliary_loss_mlp": 0.01021213, "balance_loss_clip": 1.04495025, "balance_loss_mlp": 1.01484394, "epoch": 0.8231828293152167, "flos": 16581106379520.0, "grad_norm": 2.4845316230889787, "language_loss": 0.7667774, "learning_rate": 3.189493175063547e-07, "loss": 0.78847677, "num_input_tokens_seen": 147839295, "step": 6846, "time_per_iteration": 2.4254555702209473 }, { "auxiliary_loss_clip": 0.0113708, "auxiliary_loss_mlp": 0.01024559, "balance_loss_clip": 1.04483533, "balance_loss_mlp": 1.01731062, "epoch": 0.8233030722058559, "flos": 18880071528960.0, "grad_norm": 1.8202686101939696, "language_loss": 0.67626613, "learning_rate": 3.1852741981648776e-07, "loss": 0.69788247, "num_input_tokens_seen": 147857945, "step": 6847, "time_per_iteration": 2.458911418914795 }, { "auxiliary_loss_clip": 0.01112208, "auxiliary_loss_mlp": 0.01029079, "balance_loss_clip": 1.04267943, "balance_loss_mlp": 1.0218966, "epoch": 0.8234233150964949, "flos": 28439024757120.0, "grad_norm": 2.729353710444371, "language_loss": 0.69912225, "learning_rate": 3.1810577720467404e-07, "loss": 0.7205351, "num_input_tokens_seen": 147879675, "step": 6848, "time_per_iteration": 3.3158977031707764 }, { "auxiliary_loss_clip": 0.01138887, "auxiliary_loss_mlp": 0.01021392, "balance_loss_clip": 1.04517186, "balance_loss_mlp": 1.01412606, "epoch": 0.823543557987134, "flos": 33765941577600.0, "grad_norm": 10.5492874034828, "language_loss": 0.56379616, "learning_rate": 3.176843897348769e-07, "loss": 0.58539897, "num_input_tokens_seen": 147902870, "step": 6849, "time_per_iteration": 2.606759548187256 }, { "auxiliary_loss_clip": 0.011339, "auxiliary_loss_mlp": 0.01023834, "balance_loss_clip": 1.04406989, "balance_loss_mlp": 1.01666057, "epoch": 0.8236638008777731, "flos": 17092366611840.0, "grad_norm": 2.457616355858711, "language_loss": 0.75882953, "learning_rate": 3.1726325747102034e-07, "loss": 0.78040683, "num_input_tokens_seen": 147921245, "step": 6850, "time_per_iteration": 2.4447011947631836 }, { "auxiliary_loss_clip": 0.01100901, "auxiliary_loss_mlp": 0.01024609, "balance_loss_clip": 1.03642058, "balance_loss_mlp": 1.01736712, "epoch": 0.8237840437684122, "flos": 61639982334720.0, "grad_norm": 1.4723497310501072, "language_loss": 0.64271373, "learning_rate": 3.1684238047698974e-07, "loss": 0.66396886, "num_input_tokens_seen": 147949515, "step": 6851, "time_per_iteration": 2.912355661392212 }, { "auxiliary_loss_clip": 0.01140571, "auxiliary_loss_mlp": 0.01026097, "balance_loss_clip": 1.04617202, "balance_loss_mlp": 1.01889682, "epoch": 0.8239042866590512, "flos": 27309023821440.0, "grad_norm": 2.1365551628339623, "language_loss": 0.53172874, "learning_rate": 3.1642175881663155e-07, "loss": 0.55339539, "num_input_tokens_seen": 147969245, "step": 6852, "time_per_iteration": 2.53395938873291 }, { "auxiliary_loss_clip": 0.01162797, "auxiliary_loss_mlp": 0.01021581, "balance_loss_clip": 1.04615116, "balance_loss_mlp": 1.01508379, "epoch": 0.8240245295496904, "flos": 21726351187200.0, "grad_norm": 2.023049428612565, "language_loss": 0.83929038, "learning_rate": 3.160013925537537e-07, "loss": 0.86113411, "num_input_tokens_seen": 147990080, "step": 6853, "time_per_iteration": 2.416192054748535 }, { "auxiliary_loss_clip": 0.01123604, "auxiliary_loss_mlp": 0.01024113, "balance_loss_clip": 1.04085815, "balance_loss_mlp": 1.01683879, "epoch": 0.8241447724403295, "flos": 20009318279040.0, "grad_norm": 7.111965539277004, "language_loss": 0.75757122, "learning_rate": 3.155812817521266e-07, "loss": 0.77904838, "num_input_tokens_seen": 148010455, "step": 6854, "time_per_iteration": 2.501901388168335 }, { "auxiliary_loss_clip": 0.01138439, "auxiliary_loss_mlp": 0.0102572, "balance_loss_clip": 1.04483306, "balance_loss_mlp": 1.01877356, "epoch": 0.8242650153309685, "flos": 22272983337600.0, "grad_norm": 2.017624505195568, "language_loss": 0.78218204, "learning_rate": 3.151614264754787e-07, "loss": 0.80382371, "num_input_tokens_seen": 148028400, "step": 6855, "time_per_iteration": 2.4778809547424316 }, { "auxiliary_loss_clip": 0.01164009, "auxiliary_loss_mlp": 0.01024597, "balance_loss_clip": 1.04513669, "balance_loss_mlp": 1.01758194, "epoch": 0.8243852582216077, "flos": 22309971367680.0, "grad_norm": 2.4675703312436514, "language_loss": 0.79649591, "learning_rate": 3.147418267875035e-07, "loss": 0.81838197, "num_input_tokens_seen": 148046530, "step": 6856, "time_per_iteration": 2.4160687923431396 }, { "auxiliary_loss_clip": 0.01091824, "auxiliary_loss_mlp": 0.00761465, "balance_loss_clip": 1.03800154, "balance_loss_mlp": 1.00042939, "epoch": 0.8245055011122467, "flos": 24645421756800.0, "grad_norm": 2.167277393445466, "language_loss": 0.65251005, "learning_rate": 3.1432248275185315e-07, "loss": 0.67104286, "num_input_tokens_seen": 148067040, "step": 6857, "time_per_iteration": 2.6238596439361572 }, { "auxiliary_loss_clip": 0.01150927, "auxiliary_loss_mlp": 0.01023067, "balance_loss_clip": 1.04731131, "balance_loss_mlp": 1.01618898, "epoch": 0.8246257440028858, "flos": 17487275713920.0, "grad_norm": 2.049562297499048, "language_loss": 0.76828903, "learning_rate": 3.139033944321412e-07, "loss": 0.79002899, "num_input_tokens_seen": 148084400, "step": 6858, "time_per_iteration": 2.4194962978363037 }, { "auxiliary_loss_clip": 0.01152845, "auxiliary_loss_mlp": 0.01021198, "balance_loss_clip": 1.04540741, "balance_loss_mlp": 1.01443911, "epoch": 0.824745986893525, "flos": 25010130499200.0, "grad_norm": 1.639518671723465, "language_loss": 0.79117405, "learning_rate": 3.1348456189194507e-07, "loss": 0.81291449, "num_input_tokens_seen": 148104860, "step": 6859, "time_per_iteration": 2.4707069396972656 }, { "auxiliary_loss_clip": 0.01112755, "auxiliary_loss_mlp": 0.01021608, "balance_loss_clip": 1.03932667, "balance_loss_mlp": 1.01434231, "epoch": 0.824866229784164, "flos": 18772698798720.0, "grad_norm": 1.7934963059501927, "language_loss": 0.83034438, "learning_rate": 3.1306598519479876e-07, "loss": 0.85168803, "num_input_tokens_seen": 148124680, "step": 6860, "time_per_iteration": 2.5230376720428467 }, { "auxiliary_loss_clip": 0.01131201, "auxiliary_loss_mlp": 0.0102251, "balance_loss_clip": 1.04248166, "balance_loss_mlp": 1.01595318, "epoch": 0.8249864726748031, "flos": 23842171866240.0, "grad_norm": 3.4669129118230835, "language_loss": 0.78268087, "learning_rate": 3.1264766440420177e-07, "loss": 0.80421793, "num_input_tokens_seen": 148147150, "step": 6861, "time_per_iteration": 2.5132832527160645 }, { "auxiliary_loss_clip": 0.01149082, "auxiliary_loss_mlp": 0.01022166, "balance_loss_clip": 1.04645252, "balance_loss_mlp": 1.01529336, "epoch": 0.8251067155654422, "flos": 20303103617280.0, "grad_norm": 2.0856019670899046, "language_loss": 0.69264448, "learning_rate": 3.122295995836124e-07, "loss": 0.71435696, "num_input_tokens_seen": 148167020, "step": 6862, "time_per_iteration": 2.4347610473632812 }, { "auxiliary_loss_clip": 0.01154201, "auxiliary_loss_mlp": 0.01021424, "balance_loss_clip": 1.04321921, "balance_loss_mlp": 1.01381302, "epoch": 0.8252269584560813, "flos": 25009699536000.0, "grad_norm": 1.7768888128862097, "language_loss": 0.77480811, "learning_rate": 3.118117907964508e-07, "loss": 0.7965644, "num_input_tokens_seen": 148188965, "step": 6863, "time_per_iteration": 2.5595972537994385 }, { "auxiliary_loss_clip": 0.01129612, "auxiliary_loss_mlp": 0.01024955, "balance_loss_clip": 1.04345632, "balance_loss_mlp": 1.01807952, "epoch": 0.8253472013467203, "flos": 17128564542720.0, "grad_norm": 1.8906899471746743, "language_loss": 0.80131149, "learning_rate": 3.1139423810609856e-07, "loss": 0.82285714, "num_input_tokens_seen": 148205660, "step": 6864, "time_per_iteration": 2.5707077980041504 }, { "auxiliary_loss_clip": 0.01162816, "auxiliary_loss_mlp": 0.01025638, "balance_loss_clip": 1.04380798, "balance_loss_mlp": 1.01819897, "epoch": 0.8254674442373595, "flos": 22414794232320.0, "grad_norm": 1.8437331759961366, "language_loss": 0.75427389, "learning_rate": 3.1097694157589714e-07, "loss": 0.77615833, "num_input_tokens_seen": 148225545, "step": 6865, "time_per_iteration": 2.445305585861206 }, { "auxiliary_loss_clip": 0.01148257, "auxiliary_loss_mlp": 0.01027076, "balance_loss_clip": 1.04637527, "balance_loss_mlp": 1.01984286, "epoch": 0.8255876871279986, "flos": 24786765774720.0, "grad_norm": 2.964809227991256, "language_loss": 0.76846826, "learning_rate": 3.105599012691511e-07, "loss": 0.79022163, "num_input_tokens_seen": 148243975, "step": 6866, "time_per_iteration": 2.467745780944824 }, { "auxiliary_loss_clip": 0.01147489, "auxiliary_loss_mlp": 0.01023943, "balance_loss_clip": 1.04500008, "balance_loss_mlp": 1.01710033, "epoch": 0.8257079300186376, "flos": 27455431656960.0, "grad_norm": 1.4302936193903149, "language_loss": 0.82487613, "learning_rate": 3.101431172491249e-07, "loss": 0.84659046, "num_input_tokens_seen": 148265520, "step": 6867, "time_per_iteration": 3.3579022884368896 }, { "auxiliary_loss_clip": 0.011261, "auxiliary_loss_mlp": 0.00761875, "balance_loss_clip": 1.04161596, "balance_loss_mlp": 1.00042987, "epoch": 0.8258281729092768, "flos": 16471866142080.0, "grad_norm": 2.126919550353325, "language_loss": 0.71970677, "learning_rate": 3.097265895790444e-07, "loss": 0.73858654, "num_input_tokens_seen": 148283730, "step": 6868, "time_per_iteration": 2.537630558013916 }, { "auxiliary_loss_clip": 0.01123445, "auxiliary_loss_mlp": 0.01021809, "balance_loss_clip": 1.0421474, "balance_loss_mlp": 1.01530576, "epoch": 0.8259484157999158, "flos": 21433822824960.0, "grad_norm": 2.618214932771576, "language_loss": 0.83413196, "learning_rate": 3.093103183220962e-07, "loss": 0.85558456, "num_input_tokens_seen": 148303775, "step": 6869, "time_per_iteration": 3.3767173290252686 }, { "auxiliary_loss_clip": 0.01053424, "auxiliary_loss_mlp": 0.01000874, "balance_loss_clip": 1.00961125, "balance_loss_mlp": 0.99995059, "epoch": 0.8260686586905549, "flos": 58322342453760.0, "grad_norm": 0.8200754910891466, "language_loss": 0.5940001, "learning_rate": 3.0889430354142796e-07, "loss": 0.61454308, "num_input_tokens_seen": 148365285, "step": 6870, "time_per_iteration": 3.014164924621582 }, { "auxiliary_loss_clip": 0.01126464, "auxiliary_loss_mlp": 0.01024728, "balance_loss_clip": 1.04191077, "balance_loss_mlp": 1.01742935, "epoch": 0.826188901581194, "flos": 27527288814720.0, "grad_norm": 1.9459485876219063, "language_loss": 0.69888842, "learning_rate": 3.084785453001497e-07, "loss": 0.72040039, "num_input_tokens_seen": 148386200, "step": 6871, "time_per_iteration": 2.5698933601379395 }, { "auxiliary_loss_clip": 0.01137556, "auxiliary_loss_mlp": 0.00761863, "balance_loss_clip": 1.04683053, "balance_loss_mlp": 1.00045538, "epoch": 0.8263091444718331, "flos": 23696051339520.0, "grad_norm": 2.13352818553947, "language_loss": 0.82282352, "learning_rate": 3.080630436613314e-07, "loss": 0.84181774, "num_input_tokens_seen": 148403970, "step": 6872, "time_per_iteration": 2.4855942726135254 }, { "auxiliary_loss_clip": 0.0114296, "auxiliary_loss_mlp": 0.01028623, "balance_loss_clip": 1.04358542, "balance_loss_mlp": 1.02146769, "epoch": 0.8264293873624722, "flos": 17165157523200.0, "grad_norm": 2.547302559084706, "language_loss": 0.86318392, "learning_rate": 3.076477986880039e-07, "loss": 0.88489974, "num_input_tokens_seen": 148421765, "step": 6873, "time_per_iteration": 2.4098455905914307 }, { "auxiliary_loss_clip": 0.01135802, "auxiliary_loss_mlp": 0.01021283, "balance_loss_clip": 1.04510796, "balance_loss_mlp": 1.01429129, "epoch": 0.8265496302531112, "flos": 24098645952000.0, "grad_norm": 2.3262202204578513, "language_loss": 0.69370931, "learning_rate": 3.0723281044315986e-07, "loss": 0.71528018, "num_input_tokens_seen": 148443720, "step": 6874, "time_per_iteration": 3.2196807861328125 }, { "auxiliary_loss_clip": 0.01158421, "auxiliary_loss_mlp": 0.01023775, "balance_loss_clip": 1.04345608, "balance_loss_mlp": 1.01721573, "epoch": 0.8266698731437504, "flos": 14099894599680.0, "grad_norm": 2.0246735928259816, "language_loss": 0.76180756, "learning_rate": 3.068180789897521e-07, "loss": 0.78362954, "num_input_tokens_seen": 148462130, "step": 6875, "time_per_iteration": 2.3761699199676514 }, { "auxiliary_loss_clip": 0.01154858, "auxiliary_loss_mlp": 0.01022623, "balance_loss_clip": 1.04526854, "balance_loss_mlp": 1.01517797, "epoch": 0.8267901160343895, "flos": 30777563715840.0, "grad_norm": 1.4361712172831118, "language_loss": 0.81562686, "learning_rate": 3.064036043906966e-07, "loss": 0.83740163, "num_input_tokens_seen": 148485570, "step": 6876, "time_per_iteration": 2.5402393341064453 }, { "auxiliary_loss_clip": 0.01129314, "auxiliary_loss_mlp": 0.01025423, "balance_loss_clip": 1.04240394, "balance_loss_mlp": 1.01782906, "epoch": 0.8269103589250285, "flos": 40624915242240.0, "grad_norm": 2.376346023107187, "language_loss": 0.68122613, "learning_rate": 3.059893867088668e-07, "loss": 0.70277345, "num_input_tokens_seen": 148509715, "step": 6877, "time_per_iteration": 2.6738810539245605 }, { "auxiliary_loss_clip": 0.01147215, "auxiliary_loss_mlp": 0.01024467, "balance_loss_clip": 1.04483867, "balance_loss_mlp": 1.01801217, "epoch": 0.8270306018156677, "flos": 30263645877120.0, "grad_norm": 2.17552021079681, "language_loss": 0.66741371, "learning_rate": 3.055754260071004e-07, "loss": 0.68913054, "num_input_tokens_seen": 148532010, "step": 6878, "time_per_iteration": 2.506004571914673 }, { "auxiliary_loss_clip": 0.01150462, "auxiliary_loss_mlp": 0.0102653, "balance_loss_clip": 1.04599977, "balance_loss_mlp": 1.01986313, "epoch": 0.8271508447063067, "flos": 25226599812480.0, "grad_norm": 4.579153493549264, "language_loss": 0.7360543, "learning_rate": 3.051617223481948e-07, "loss": 0.75782418, "num_input_tokens_seen": 148553330, "step": 6879, "time_per_iteration": 2.4655911922454834 }, { "auxiliary_loss_clip": 0.01134737, "auxiliary_loss_mlp": 0.01029523, "balance_loss_clip": 1.04430628, "balance_loss_mlp": 1.02197146, "epoch": 0.8272710875969458, "flos": 17566602900480.0, "grad_norm": 2.055779408794126, "language_loss": 0.75496978, "learning_rate": 3.047482757949078e-07, "loss": 0.7766124, "num_input_tokens_seen": 148570960, "step": 6880, "time_per_iteration": 2.485534429550171 }, { "auxiliary_loss_clip": 0.01116602, "auxiliary_loss_mlp": 0.00761293, "balance_loss_clip": 1.03960729, "balance_loss_mlp": 1.00044405, "epoch": 0.827391330487585, "flos": 19755465886080.0, "grad_norm": 2.143221753535972, "language_loss": 0.85831439, "learning_rate": 3.043350864099605e-07, "loss": 0.87709337, "num_input_tokens_seen": 148589520, "step": 6881, "time_per_iteration": 2.491403102874756 }, { "auxiliary_loss_clip": 0.01152137, "auxiliary_loss_mlp": 0.01022256, "balance_loss_clip": 1.04414189, "balance_loss_mlp": 1.01538384, "epoch": 0.827511573378224, "flos": 16835174254080.0, "grad_norm": 2.165174932197664, "language_loss": 0.8078407, "learning_rate": 3.039221542560315e-07, "loss": 0.82958466, "num_input_tokens_seen": 148606085, "step": 6882, "time_per_iteration": 2.398483991622925 }, { "auxiliary_loss_clip": 0.0115022, "auxiliary_loss_mlp": 0.01021935, "balance_loss_clip": 1.04638052, "balance_loss_mlp": 1.0149138, "epoch": 0.8276318162688631, "flos": 18369242259840.0, "grad_norm": 2.210022440403164, "language_loss": 0.73260939, "learning_rate": 3.0350947939576356e-07, "loss": 0.75433099, "num_input_tokens_seen": 148625240, "step": 6883, "time_per_iteration": 2.4215855598449707 }, { "auxiliary_loss_clip": 0.01156976, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.04678643, "balance_loss_mlp": 1.01920927, "epoch": 0.8277520591595022, "flos": 19352691705600.0, "grad_norm": 1.5453395721510128, "language_loss": 0.72249901, "learning_rate": 3.0309706189175876e-07, "loss": 0.74433929, "num_input_tokens_seen": 148645075, "step": 6884, "time_per_iteration": 2.438347339630127 }, { "auxiliary_loss_clip": 0.01044694, "auxiliary_loss_mlp": 0.01001846, "balance_loss_clip": 1.0100615, "balance_loss_mlp": 1.00084484, "epoch": 0.8278723020501413, "flos": 67918858329600.0, "grad_norm": 0.7623672521857662, "language_loss": 0.57401317, "learning_rate": 3.0268490180658045e-07, "loss": 0.59447861, "num_input_tokens_seen": 148707855, "step": 6885, "time_per_iteration": 3.0473792552948 }, { "auxiliary_loss_clip": 0.01169552, "auxiliary_loss_mlp": 0.01022055, "balance_loss_clip": 1.04985881, "balance_loss_mlp": 1.01508081, "epoch": 0.8279925449407803, "flos": 18185738653440.0, "grad_norm": 2.4562757559457586, "language_loss": 0.7903136, "learning_rate": 3.0227299920275305e-07, "loss": 0.81222963, "num_input_tokens_seen": 148724170, "step": 6886, "time_per_iteration": 2.370415210723877 }, { "auxiliary_loss_clip": 0.01128666, "auxiliary_loss_mlp": 0.01028614, "balance_loss_clip": 1.04585683, "balance_loss_mlp": 1.02101135, "epoch": 0.8281127878314195, "flos": 20631434860800.0, "grad_norm": 1.8884400687405891, "language_loss": 0.85513437, "learning_rate": 3.018613541427613e-07, "loss": 0.87670708, "num_input_tokens_seen": 148743690, "step": 6887, "time_per_iteration": 2.5087547302246094 }, { "auxiliary_loss_clip": 0.0116309, "auxiliary_loss_mlp": 0.01024937, "balance_loss_clip": 1.04531491, "balance_loss_mlp": 1.01805305, "epoch": 0.8282330307220586, "flos": 18004282122240.0, "grad_norm": 1.6714680723685145, "language_loss": 0.73599643, "learning_rate": 3.0144996668905243e-07, "loss": 0.75787669, "num_input_tokens_seen": 148761070, "step": 6888, "time_per_iteration": 2.4022560119628906 }, { "auxiliary_loss_clip": 0.01097036, "auxiliary_loss_mlp": 0.0076154, "balance_loss_clip": 1.03731155, "balance_loss_mlp": 1.00039434, "epoch": 0.8283532736126976, "flos": 20084120352000.0, "grad_norm": 3.183612837149206, "language_loss": 0.82138085, "learning_rate": 3.010388369040331e-07, "loss": 0.83996665, "num_input_tokens_seen": 148779730, "step": 6889, "time_per_iteration": 2.60994291305542 }, { "auxiliary_loss_clip": 0.01152733, "auxiliary_loss_mlp": 0.01024846, "balance_loss_clip": 1.04653656, "balance_loss_mlp": 1.01799452, "epoch": 0.8284735165033368, "flos": 31868421805440.0, "grad_norm": 1.6621977960126766, "language_loss": 0.82844639, "learning_rate": 3.0062796485007156e-07, "loss": 0.85022223, "num_input_tokens_seen": 148800670, "step": 6890, "time_per_iteration": 2.541858196258545 }, { "auxiliary_loss_clip": 0.01164281, "auxiliary_loss_mlp": 0.00761771, "balance_loss_clip": 1.04595208, "balance_loss_mlp": 1.00039065, "epoch": 0.8285937593939758, "flos": 26651319840000.0, "grad_norm": 3.8447576595839563, "language_loss": 0.65936232, "learning_rate": 3.002173505894965e-07, "loss": 0.67862284, "num_input_tokens_seen": 148819820, "step": 6891, "time_per_iteration": 2.4771533012390137 }, { "auxiliary_loss_clip": 0.01154038, "auxiliary_loss_mlp": 0.01023016, "balance_loss_clip": 1.04346776, "balance_loss_mlp": 1.01531482, "epoch": 0.8287140022846149, "flos": 20193683811840.0, "grad_norm": 9.186926026801611, "language_loss": 0.62150168, "learning_rate": 2.998069941845973e-07, "loss": 0.64327222, "num_input_tokens_seen": 148838890, "step": 6892, "time_per_iteration": 2.45940899848938 }, { "auxiliary_loss_clip": 0.01062982, "auxiliary_loss_mlp": 0.01001451, "balance_loss_clip": 1.00901604, "balance_loss_mlp": 1.00048542, "epoch": 0.8288342451752541, "flos": 70755980019840.0, "grad_norm": 0.7093223707026493, "language_loss": 0.57449889, "learning_rate": 2.993968956976258e-07, "loss": 0.5951432, "num_input_tokens_seen": 148906635, "step": 6893, "time_per_iteration": 3.120269775390625 }, { "auxiliary_loss_clip": 0.01170626, "auxiliary_loss_mlp": 0.01023406, "balance_loss_clip": 1.04820871, "balance_loss_mlp": 1.01537752, "epoch": 0.8289544880658931, "flos": 24572235795840.0, "grad_norm": 1.7797692418218232, "language_loss": 0.70151687, "learning_rate": 2.9898705519079313e-07, "loss": 0.72345722, "num_input_tokens_seen": 148925740, "step": 6894, "time_per_iteration": 3.3278324604034424 }, { "auxiliary_loss_clip": 0.01130049, "auxiliary_loss_mlp": 0.01021247, "balance_loss_clip": 1.04206598, "balance_loss_mlp": 1.01422596, "epoch": 0.8290747309565322, "flos": 22273378387200.0, "grad_norm": 1.8173374357746075, "language_loss": 0.74415332, "learning_rate": 2.985774727262715e-07, "loss": 0.76566625, "num_input_tokens_seen": 148944585, "step": 6895, "time_per_iteration": 2.473827838897705 }, { "auxiliary_loss_clip": 0.01163261, "auxiliary_loss_mlp": 0.01023308, "balance_loss_clip": 1.04610658, "balance_loss_mlp": 1.01673973, "epoch": 0.8291949738471713, "flos": 23255570856960.0, "grad_norm": 1.8623617111640176, "language_loss": 0.81484956, "learning_rate": 2.981681483661949e-07, "loss": 0.83671522, "num_input_tokens_seen": 148964170, "step": 6896, "time_per_iteration": 3.122509717941284 }, { "auxiliary_loss_clip": 0.01152999, "auxiliary_loss_mlp": 0.01031486, "balance_loss_clip": 1.04818177, "balance_loss_mlp": 1.02474427, "epoch": 0.8293152167378104, "flos": 52555768185600.0, "grad_norm": 1.7380999878035768, "language_loss": 0.71114653, "learning_rate": 2.9775908217265633e-07, "loss": 0.7329914, "num_input_tokens_seen": 148989405, "step": 6897, "time_per_iteration": 3.413006544113159 }, { "auxiliary_loss_clip": 0.01008657, "auxiliary_loss_mlp": 0.01002649, "balance_loss_clip": 1.00801134, "balance_loss_mlp": 1.00151622, "epoch": 0.8294354596284494, "flos": 63356156294400.0, "grad_norm": 0.8240186140067517, "language_loss": 0.50400698, "learning_rate": 2.9735027420771253e-07, "loss": 0.52412009, "num_input_tokens_seen": 149049740, "step": 6898, "time_per_iteration": 3.0846452713012695 }, { "auxiliary_loss_clip": 0.01133749, "auxiliary_loss_mlp": 0.01028298, "balance_loss_clip": 1.04777813, "balance_loss_mlp": 1.02184868, "epoch": 0.8295557025190886, "flos": 24827021942400.0, "grad_norm": 1.7845273763816505, "language_loss": 0.71218121, "learning_rate": 2.969417245333774e-07, "loss": 0.73380172, "num_input_tokens_seen": 149069120, "step": 6899, "time_per_iteration": 2.501249074935913 }, { "auxiliary_loss_clip": 0.01121774, "auxiliary_loss_mlp": 0.0102371, "balance_loss_clip": 1.04458678, "balance_loss_mlp": 1.01688242, "epoch": 0.8296759454097277, "flos": 25118580637440.0, "grad_norm": 2.0046030871539307, "language_loss": 0.77855927, "learning_rate": 2.9653343321162915e-07, "loss": 0.80001408, "num_input_tokens_seen": 149088630, "step": 6900, "time_per_iteration": 2.515753984451294 }, { "auxiliary_loss_clip": 0.01125178, "auxiliary_loss_mlp": 0.01021405, "balance_loss_clip": 1.04566622, "balance_loss_mlp": 1.01363289, "epoch": 0.8297961883003667, "flos": 24132581326080.0, "grad_norm": 4.005801740255351, "language_loss": 0.65104586, "learning_rate": 2.9612540030440446e-07, "loss": 0.67251164, "num_input_tokens_seen": 149109175, "step": 6901, "time_per_iteration": 3.291018486022949 }, { "auxiliary_loss_clip": 0.01042717, "auxiliary_loss_mlp": 0.01001102, "balance_loss_clip": 1.00893545, "balance_loss_mlp": 1.00011218, "epoch": 0.8299164311910058, "flos": 67446561375360.0, "grad_norm": 0.8973955168334014, "language_loss": 0.64099193, "learning_rate": 2.9571762587360206e-07, "loss": 0.66143012, "num_input_tokens_seen": 149165560, "step": 6902, "time_per_iteration": 3.005819320678711 }, { "auxiliary_loss_clip": 0.01103939, "auxiliary_loss_mlp": 0.01024169, "balance_loss_clip": 1.03537178, "balance_loss_mlp": 1.01749635, "epoch": 0.8300366740816449, "flos": 25228682801280.0, "grad_norm": 1.796975921297616, "language_loss": 0.74119967, "learning_rate": 2.953101099810806e-07, "loss": 0.76248074, "num_input_tokens_seen": 149185165, "step": 6903, "time_per_iteration": 2.5984020233154297 }, { "auxiliary_loss_clip": 0.01145991, "auxiliary_loss_mlp": 0.01025332, "balance_loss_clip": 1.046229, "balance_loss_mlp": 1.01837027, "epoch": 0.830156916972284, "flos": 18041018757120.0, "grad_norm": 2.4255223080633463, "language_loss": 0.82811439, "learning_rate": 2.9490285268865965e-07, "loss": 0.84982765, "num_input_tokens_seen": 149202655, "step": 6904, "time_per_iteration": 2.437838077545166 }, { "auxiliary_loss_clip": 0.01156831, "auxiliary_loss_mlp": 0.01019836, "balance_loss_clip": 1.04906797, "balance_loss_mlp": 1.01275253, "epoch": 0.830277159862923, "flos": 26322485806080.0, "grad_norm": 2.039131686130138, "language_loss": 0.79322606, "learning_rate": 2.9449585405812085e-07, "loss": 0.81499279, "num_input_tokens_seen": 149220035, "step": 6905, "time_per_iteration": 2.4953598976135254 }, { "auxiliary_loss_clip": 0.01129367, "auxiliary_loss_mlp": 0.01019925, "balance_loss_clip": 1.04520488, "balance_loss_mlp": 1.01289141, "epoch": 0.8303974027535622, "flos": 19938861751680.0, "grad_norm": 1.9171193094958274, "language_loss": 0.73835111, "learning_rate": 2.940891141512043e-07, "loss": 0.75984395, "num_input_tokens_seen": 149238055, "step": 6906, "time_per_iteration": 2.5078468322753906 }, { "auxiliary_loss_clip": 0.01136621, "auxiliary_loss_mlp": 0.01026169, "balance_loss_clip": 1.04362547, "balance_loss_mlp": 1.01843262, "epoch": 0.8305176456442013, "flos": 17165552572800.0, "grad_norm": 3.06073748522985, "language_loss": 0.71972716, "learning_rate": 2.9368263302961385e-07, "loss": 0.74135506, "num_input_tokens_seen": 149256755, "step": 6907, "time_per_iteration": 2.4663076400756836 }, { "auxiliary_loss_clip": 0.01096808, "auxiliary_loss_mlp": 0.01019424, "balance_loss_clip": 1.03937244, "balance_loss_mlp": 1.01244736, "epoch": 0.8306378885348403, "flos": 25627614226560.0, "grad_norm": 1.9844074649284424, "language_loss": 0.80027926, "learning_rate": 2.9327641075501075e-07, "loss": 0.82144165, "num_input_tokens_seen": 149275745, "step": 6908, "time_per_iteration": 2.6545732021331787 }, { "auxiliary_loss_clip": 0.01129501, "auxiliary_loss_mlp": 0.01030973, "balance_loss_clip": 1.04019618, "balance_loss_mlp": 1.0232538, "epoch": 0.8307581314254795, "flos": 33947864985600.0, "grad_norm": 2.362089000669139, "language_loss": 0.66148579, "learning_rate": 2.9287044738901866e-07, "loss": 0.68309051, "num_input_tokens_seen": 149293730, "step": 6909, "time_per_iteration": 2.5797693729400635 }, { "auxiliary_loss_clip": 0.01152017, "auxiliary_loss_mlp": 0.00761515, "balance_loss_clip": 1.04486823, "balance_loss_mlp": 1.00038576, "epoch": 0.8308783743161186, "flos": 17562724231680.0, "grad_norm": 1.860933637122064, "language_loss": 0.90652752, "learning_rate": 2.9246474299322274e-07, "loss": 0.92566288, "num_input_tokens_seen": 149309290, "step": 6910, "time_per_iteration": 2.4368836879730225 }, { "auxiliary_loss_clip": 0.0102841, "auxiliary_loss_mlp": 0.01002087, "balance_loss_clip": 1.0083518, "balance_loss_mlp": 1.00103843, "epoch": 0.8309986172067576, "flos": 69412885649280.0, "grad_norm": 0.8961285096441381, "language_loss": 0.63182837, "learning_rate": 2.920592976291678e-07, "loss": 0.65213335, "num_input_tokens_seen": 149366620, "step": 6911, "time_per_iteration": 3.0481698513031006 }, { "auxiliary_loss_clip": 0.01150029, "auxiliary_loss_mlp": 0.01027453, "balance_loss_clip": 1.04512262, "balance_loss_mlp": 1.020208, "epoch": 0.8311188600973968, "flos": 22309755886080.0, "grad_norm": 1.8766682038346312, "language_loss": 0.80626452, "learning_rate": 2.916541113583595e-07, "loss": 0.82803929, "num_input_tokens_seen": 149385120, "step": 6912, "time_per_iteration": 2.4464707374572754 }, { "auxiliary_loss_clip": 0.01129246, "auxiliary_loss_mlp": 0.01023349, "balance_loss_clip": 1.04649973, "balance_loss_mlp": 1.01607394, "epoch": 0.8312391029880358, "flos": 18770077105920.0, "grad_norm": 2.5345308066752987, "language_loss": 0.66431165, "learning_rate": 2.912491842422642e-07, "loss": 0.68583763, "num_input_tokens_seen": 149402825, "step": 6913, "time_per_iteration": 2.5723748207092285 }, { "auxiliary_loss_clip": 0.01152995, "auxiliary_loss_mlp": 0.01023288, "balance_loss_clip": 1.04673028, "balance_loss_mlp": 1.01606369, "epoch": 0.8313593458786749, "flos": 20376648714240.0, "grad_norm": 1.7016825346540787, "language_loss": 0.70998251, "learning_rate": 2.9084451634230857e-07, "loss": 0.73174536, "num_input_tokens_seen": 149422125, "step": 6914, "time_per_iteration": 2.456085205078125 }, { "auxiliary_loss_clip": 0.01121059, "auxiliary_loss_mlp": 0.01027056, "balance_loss_clip": 1.04170632, "balance_loss_mlp": 1.0198977, "epoch": 0.831479588769314, "flos": 32124069878400.0, "grad_norm": 2.970555828884649, "language_loss": 0.71587288, "learning_rate": 2.9044010771988125e-07, "loss": 0.73735398, "num_input_tokens_seen": 149441940, "step": 6915, "time_per_iteration": 2.627363443374634 }, { "auxiliary_loss_clip": 0.01130238, "auxiliary_loss_mlp": 0.01025398, "balance_loss_clip": 1.04355609, "balance_loss_mlp": 1.01846337, "epoch": 0.8315998316599531, "flos": 45185929338240.0, "grad_norm": 1.709565035641112, "language_loss": 0.72039658, "learning_rate": 2.900359584363303e-07, "loss": 0.7419529, "num_input_tokens_seen": 149465045, "step": 6916, "time_per_iteration": 2.6957600116729736 }, { "auxiliary_loss_clip": 0.01106916, "auxiliary_loss_mlp": 0.01031515, "balance_loss_clip": 1.04366982, "balance_loss_mlp": 1.02343297, "epoch": 0.8317200745505922, "flos": 18363747479040.0, "grad_norm": 2.0739871062153847, "language_loss": 0.84478104, "learning_rate": 2.8963206855296494e-07, "loss": 0.86616528, "num_input_tokens_seen": 149481285, "step": 6917, "time_per_iteration": 2.498197078704834 }, { "auxiliary_loss_clip": 0.01151923, "auxiliary_loss_mlp": 0.01028665, "balance_loss_clip": 1.04496694, "balance_loss_mlp": 1.02136922, "epoch": 0.8318403174412313, "flos": 24206557386240.0, "grad_norm": 2.164780494814873, "language_loss": 0.7699967, "learning_rate": 2.892284381310548e-07, "loss": 0.79180259, "num_input_tokens_seen": 149502700, "step": 6918, "time_per_iteration": 2.459580898284912 }, { "auxiliary_loss_clip": 0.01131532, "auxiliary_loss_mlp": 0.01025444, "balance_loss_clip": 1.04203439, "balance_loss_mlp": 1.01781154, "epoch": 0.8319605603318704, "flos": 22418780641920.0, "grad_norm": 2.5551873929896742, "language_loss": 0.72564185, "learning_rate": 2.888250672318302e-07, "loss": 0.74721158, "num_input_tokens_seen": 149520100, "step": 6919, "time_per_iteration": 2.4520742893218994 }, { "auxiliary_loss_clip": 0.01169981, "auxiliary_loss_mlp": 0.0102842, "balance_loss_clip": 1.05039489, "balance_loss_mlp": 1.02141023, "epoch": 0.8320808032225094, "flos": 37414501459200.0, "grad_norm": 1.7787985402648339, "language_loss": 0.6867522, "learning_rate": 2.884219559164831e-07, "loss": 0.70873618, "num_input_tokens_seen": 149543245, "step": 6920, "time_per_iteration": 2.5364813804626465 }, { "auxiliary_loss_clip": 0.01150751, "auxiliary_loss_mlp": 0.01022845, "balance_loss_clip": 1.04656553, "balance_loss_mlp": 1.01604974, "epoch": 0.8322010461131486, "flos": 12787395638400.0, "grad_norm": 1.9305490846526931, "language_loss": 0.81247079, "learning_rate": 2.880191042461635e-07, "loss": 0.83420676, "num_input_tokens_seen": 149559185, "step": 6921, "time_per_iteration": 3.244818925857544 }, { "auxiliary_loss_clip": 0.01113086, "auxiliary_loss_mlp": 0.01022549, "balance_loss_clip": 1.04098594, "balance_loss_mlp": 1.01606703, "epoch": 0.8323212890037877, "flos": 15815455050240.0, "grad_norm": 1.64007199583088, "language_loss": 0.79980624, "learning_rate": 2.876165122819849e-07, "loss": 0.82116258, "num_input_tokens_seen": 149577165, "step": 6922, "time_per_iteration": 2.516418933868408 }, { "auxiliary_loss_clip": 0.01163232, "auxiliary_loss_mlp": 0.0101988, "balance_loss_clip": 1.04628515, "balance_loss_mlp": 1.0132308, "epoch": 0.8324415318944267, "flos": 21719276208000.0, "grad_norm": 1.7273990896534759, "language_loss": 0.79559994, "learning_rate": 2.872141800850201e-07, "loss": 0.81743103, "num_input_tokens_seen": 149594340, "step": 6923, "time_per_iteration": 3.223858118057251 }, { "auxiliary_loss_clip": 0.01164221, "auxiliary_loss_mlp": 0.01023564, "balance_loss_clip": 1.04692876, "balance_loss_mlp": 1.01726723, "epoch": 0.8325617747850659, "flos": 34198700636160.0, "grad_norm": 2.0780732159364854, "language_loss": 0.73479724, "learning_rate": 2.868121077163024e-07, "loss": 0.75667512, "num_input_tokens_seen": 149613895, "step": 6924, "time_per_iteration": 3.296491861343384 }, { "auxiliary_loss_clip": 0.01154852, "auxiliary_loss_mlp": 0.01028769, "balance_loss_clip": 1.0455606, "balance_loss_mlp": 1.02177751, "epoch": 0.8326820176757049, "flos": 18369457741440.0, "grad_norm": 1.7263277654853793, "language_loss": 0.72476411, "learning_rate": 2.864102952368257e-07, "loss": 0.74660033, "num_input_tokens_seen": 149631820, "step": 6925, "time_per_iteration": 2.4176251888275146 }, { "auxiliary_loss_clip": 0.0109671, "auxiliary_loss_mlp": 0.01025103, "balance_loss_clip": 1.0358516, "balance_loss_mlp": 1.01763415, "epoch": 0.832802260566344, "flos": 35991325716480.0, "grad_norm": 1.32661437481317, "language_loss": 0.59605002, "learning_rate": 2.860087427075444e-07, "loss": 0.61726815, "num_input_tokens_seen": 149656070, "step": 6926, "time_per_iteration": 2.6803042888641357 }, { "auxiliary_loss_clip": 0.01131479, "auxiliary_loss_mlp": 0.01027585, "balance_loss_clip": 1.0436573, "balance_loss_mlp": 1.02074826, "epoch": 0.8329225034569832, "flos": 14244434928000.0, "grad_norm": 2.473242672375076, "language_loss": 0.86173701, "learning_rate": 2.856074501893744e-07, "loss": 0.8833276, "num_input_tokens_seen": 149671270, "step": 6927, "time_per_iteration": 2.436058521270752 }, { "auxiliary_loss_clip": 0.01155315, "auxiliary_loss_mlp": 0.01023799, "balance_loss_clip": 1.04841924, "balance_loss_mlp": 1.01699221, "epoch": 0.8330427463476222, "flos": 18077468083200.0, "grad_norm": 1.6385088483415946, "language_loss": 0.81599665, "learning_rate": 2.8520641774319054e-07, "loss": 0.83778775, "num_input_tokens_seen": 149689360, "step": 6928, "time_per_iteration": 3.199218511581421 }, { "auxiliary_loss_clip": 0.01137855, "auxiliary_loss_mlp": 0.01024733, "balance_loss_clip": 1.04070127, "balance_loss_mlp": 1.01763415, "epoch": 0.8331629892382613, "flos": 18040839189120.0, "grad_norm": 2.275202050732897, "language_loss": 0.75433743, "learning_rate": 2.848056454298309e-07, "loss": 0.77596325, "num_input_tokens_seen": 149706685, "step": 6929, "time_per_iteration": 2.4604151248931885 }, { "auxiliary_loss_clip": 0.0113524, "auxiliary_loss_mlp": 0.0102128, "balance_loss_clip": 1.04547763, "balance_loss_mlp": 1.0141964, "epoch": 0.8332832321289004, "flos": 17457398576640.0, "grad_norm": 1.8827614930271386, "language_loss": 0.65574348, "learning_rate": 2.844051333100905e-07, "loss": 0.67730868, "num_input_tokens_seen": 149724230, "step": 6930, "time_per_iteration": 2.449049711227417 }, { "auxiliary_loss_clip": 0.01138384, "auxiliary_loss_mlp": 0.01021326, "balance_loss_clip": 1.04648757, "balance_loss_mlp": 1.01484096, "epoch": 0.8334034750195395, "flos": 15084852416640.0, "grad_norm": 1.9847170586925396, "language_loss": 0.8394497, "learning_rate": 2.840048814447269e-07, "loss": 0.86104679, "num_input_tokens_seen": 149742395, "step": 6931, "time_per_iteration": 2.453190565109253 }, { "auxiliary_loss_clip": 0.01131392, "auxiliary_loss_mlp": 0.01021846, "balance_loss_clip": 1.04281545, "balance_loss_mlp": 1.0149827, "epoch": 0.8335237179101785, "flos": 19427170556160.0, "grad_norm": 2.315069979658486, "language_loss": 0.739308, "learning_rate": 2.836048898944587e-07, "loss": 0.76084042, "num_input_tokens_seen": 149760820, "step": 6932, "time_per_iteration": 2.454880952835083 }, { "auxiliary_loss_clip": 0.0113615, "auxiliary_loss_mlp": 0.01024557, "balance_loss_clip": 1.04366779, "balance_loss_mlp": 1.01809025, "epoch": 0.8336439608008177, "flos": 21762046327680.0, "grad_norm": 2.6239200687743787, "language_loss": 0.72031128, "learning_rate": 2.832051587199642e-07, "loss": 0.74191839, "num_input_tokens_seen": 149778075, "step": 6933, "time_per_iteration": 2.458258867263794 }, { "auxiliary_loss_clip": 0.01053453, "auxiliary_loss_mlp": 0.01002175, "balance_loss_clip": 1.00882685, "balance_loss_mlp": 1.00120354, "epoch": 0.8337642036914568, "flos": 59702783990400.0, "grad_norm": 0.8098193649508391, "language_loss": 0.57789481, "learning_rate": 2.828056879818821e-07, "loss": 0.59845108, "num_input_tokens_seen": 149837150, "step": 6934, "time_per_iteration": 2.9578049182891846 }, { "auxiliary_loss_clip": 0.01121406, "auxiliary_loss_mlp": 0.01024675, "balance_loss_clip": 1.03947902, "balance_loss_mlp": 1.01855052, "epoch": 0.8338844465820958, "flos": 27162185022720.0, "grad_norm": 1.729355543817932, "language_loss": 0.83444381, "learning_rate": 2.824064777408117e-07, "loss": 0.85590464, "num_input_tokens_seen": 149856940, "step": 6935, "time_per_iteration": 2.5528767108917236 }, { "auxiliary_loss_clip": 0.01150406, "auxiliary_loss_mlp": 0.01024755, "balance_loss_clip": 1.04656959, "balance_loss_mlp": 1.01760292, "epoch": 0.8340046894727349, "flos": 30481264425600.0, "grad_norm": 1.8801422958469023, "language_loss": 0.75670588, "learning_rate": 2.8200752805731263e-07, "loss": 0.77845752, "num_input_tokens_seen": 149879930, "step": 6936, "time_per_iteration": 2.507885456085205 }, { "auxiliary_loss_clip": 0.01149698, "auxiliary_loss_mlp": 0.01023174, "balance_loss_clip": 1.04594362, "balance_loss_mlp": 1.01621246, "epoch": 0.834124932363374, "flos": 27126166659840.0, "grad_norm": 1.6709066521178355, "language_loss": 0.81128395, "learning_rate": 2.8160883899190625e-07, "loss": 0.8330127, "num_input_tokens_seen": 149903200, "step": 6937, "time_per_iteration": 2.4954211711883545 }, { "auxiliary_loss_clip": 0.01116148, "auxiliary_loss_mlp": 0.01024616, "balance_loss_clip": 1.04419136, "balance_loss_mlp": 1.0178299, "epoch": 0.8342451752540131, "flos": 24569865498240.0, "grad_norm": 2.1986581566204837, "language_loss": 0.72984147, "learning_rate": 2.8121041060507234e-07, "loss": 0.75124907, "num_input_tokens_seen": 149922230, "step": 6938, "time_per_iteration": 2.5200958251953125 }, { "auxiliary_loss_clip": 0.01155553, "auxiliary_loss_mlp": 0.01021479, "balance_loss_clip": 1.0454793, "balance_loss_mlp": 1.01440072, "epoch": 0.8343654181446521, "flos": 26615085995520.0, "grad_norm": 1.5648783740553736, "language_loss": 0.71559078, "learning_rate": 2.808122429572528e-07, "loss": 0.73736107, "num_input_tokens_seen": 149942435, "step": 6939, "time_per_iteration": 2.477778911590576 }, { "auxiliary_loss_clip": 0.01130513, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.04283655, "balance_loss_mlp": 1.0185318, "epoch": 0.8344856610352913, "flos": 20777268078720.0, "grad_norm": 2.8486585117066476, "language_loss": 0.76067793, "learning_rate": 2.804143361088489e-07, "loss": 0.78224194, "num_input_tokens_seen": 149961615, "step": 6940, "time_per_iteration": 2.499452829360962 }, { "auxiliary_loss_clip": 0.0113024, "auxiliary_loss_mlp": 0.01029388, "balance_loss_clip": 1.04214132, "balance_loss_mlp": 1.0218333, "epoch": 0.8346059039259304, "flos": 26095960684800.0, "grad_norm": 2.387089435526413, "language_loss": 0.78118169, "learning_rate": 2.8001669012022277e-07, "loss": 0.80277801, "num_input_tokens_seen": 149979585, "step": 6941, "time_per_iteration": 2.493180274963379 }, { "auxiliary_loss_clip": 0.01151673, "auxiliary_loss_mlp": 0.01026672, "balance_loss_clip": 1.04898906, "balance_loss_mlp": 1.01939154, "epoch": 0.8347261468165694, "flos": 29027708755200.0, "grad_norm": 1.7584457250076992, "language_loss": 0.69360065, "learning_rate": 2.7961930505169795e-07, "loss": 0.71538413, "num_input_tokens_seen": 150003830, "step": 6942, "time_per_iteration": 2.5236563682556152 }, { "auxiliary_loss_clip": 0.01153725, "auxiliary_loss_mlp": 0.00761738, "balance_loss_clip": 1.04787278, "balance_loss_mlp": 1.00048041, "epoch": 0.8348463897072086, "flos": 26396461866240.0, "grad_norm": 2.015232040819183, "language_loss": 0.76177752, "learning_rate": 2.792221809635558e-07, "loss": 0.78093219, "num_input_tokens_seen": 150024460, "step": 6943, "time_per_iteration": 2.494431257247925 }, { "auxiliary_loss_clip": 0.01086584, "auxiliary_loss_mlp": 0.01023602, "balance_loss_clip": 1.04176188, "balance_loss_mlp": 1.01638675, "epoch": 0.8349666325978476, "flos": 23367720096000.0, "grad_norm": 2.1210041043345345, "language_loss": 0.7506578, "learning_rate": 2.788253179160411e-07, "loss": 0.77175963, "num_input_tokens_seen": 150045620, "step": 6944, "time_per_iteration": 2.7207767963409424 }, { "auxiliary_loss_clip": 0.01136251, "auxiliary_loss_mlp": 0.0102763, "balance_loss_clip": 1.04426312, "balance_loss_mlp": 1.02097559, "epoch": 0.8350868754884867, "flos": 12896528135040.0, "grad_norm": 1.8673273006479607, "language_loss": 0.64921761, "learning_rate": 2.7842871596935725e-07, "loss": 0.67085636, "num_input_tokens_seen": 150064135, "step": 6945, "time_per_iteration": 2.9518702030181885 }, { "auxiliary_loss_clip": 0.01154998, "auxiliary_loss_mlp": 0.0101947, "balance_loss_clip": 1.04560351, "balance_loss_mlp": 1.01249576, "epoch": 0.8352071183791259, "flos": 26505522535680.0, "grad_norm": 1.6611386414730471, "language_loss": 0.68976617, "learning_rate": 2.780323751836682e-07, "loss": 0.71151078, "num_input_tokens_seen": 150085350, "step": 6946, "time_per_iteration": 2.495692729949951 }, { "auxiliary_loss_clip": 0.01134376, "auxiliary_loss_mlp": 0.00761209, "balance_loss_clip": 1.04079604, "balance_loss_mlp": 1.00044143, "epoch": 0.8353273612697649, "flos": 20668063754880.0, "grad_norm": 1.4186230174993504, "language_loss": 0.78450453, "learning_rate": 2.7763629561909876e-07, "loss": 0.80346036, "num_input_tokens_seen": 150106180, "step": 6947, "time_per_iteration": 2.4927756786346436 }, { "auxiliary_loss_clip": 0.01161122, "auxiliary_loss_mlp": 0.01020948, "balance_loss_clip": 1.04444301, "balance_loss_mlp": 1.01373017, "epoch": 0.835447604160404, "flos": 19754137082880.0, "grad_norm": 1.901840883099076, "language_loss": 0.76970482, "learning_rate": 2.772404773357335e-07, "loss": 0.79152548, "num_input_tokens_seen": 150125585, "step": 6948, "time_per_iteration": 3.2674615383148193 }, { "auxiliary_loss_clip": 0.01117828, "auxiliary_loss_mlp": 0.01024899, "balance_loss_clip": 1.04270077, "balance_loss_mlp": 1.01764548, "epoch": 0.8355678470510431, "flos": 23435842239360.0, "grad_norm": 1.755933792338582, "language_loss": 0.78216481, "learning_rate": 2.7684492039361853e-07, "loss": 0.80359209, "num_input_tokens_seen": 150144810, "step": 6949, "time_per_iteration": 3.301015853881836 }, { "auxiliary_loss_clip": 0.01166449, "auxiliary_loss_mlp": 0.01024814, "balance_loss_clip": 1.04882836, "balance_loss_mlp": 1.01816261, "epoch": 0.8356880899416822, "flos": 21214588164480.0, "grad_norm": 2.087516574864285, "language_loss": 0.83766705, "learning_rate": 2.764496248527586e-07, "loss": 0.85957968, "num_input_tokens_seen": 150163785, "step": 6950, "time_per_iteration": 3.195256471633911 }, { "auxiliary_loss_clip": 0.0113307, "auxiliary_loss_mlp": 0.01024228, "balance_loss_clip": 1.0437808, "balance_loss_mlp": 1.0173614, "epoch": 0.8358083328323213, "flos": 28037543466240.0, "grad_norm": 2.3510273278985987, "language_loss": 0.78784251, "learning_rate": 2.760545907731211e-07, "loss": 0.80941546, "num_input_tokens_seen": 150184360, "step": 6951, "time_per_iteration": 2.5752291679382324 }, { "auxiliary_loss_clip": 0.01151516, "auxiliary_loss_mlp": 0.0102019, "balance_loss_clip": 1.04373634, "balance_loss_mlp": 1.01315629, "epoch": 0.8359285757229604, "flos": 27783655159680.0, "grad_norm": 1.70605508551502, "language_loss": 0.6787008, "learning_rate": 2.75659818214631e-07, "loss": 0.70041788, "num_input_tokens_seen": 150205465, "step": 6952, "time_per_iteration": 2.4936938285827637 }, { "auxiliary_loss_clip": 0.01138847, "auxiliary_loss_mlp": 0.01021823, "balance_loss_clip": 1.04327345, "balance_loss_mlp": 1.0147959, "epoch": 0.8360488186135995, "flos": 21435115714560.0, "grad_norm": 2.805824680037939, "language_loss": 0.78170407, "learning_rate": 2.752653072371749e-07, "loss": 0.80331075, "num_input_tokens_seen": 150224900, "step": 6953, "time_per_iteration": 2.4774389266967773 }, { "auxiliary_loss_clip": 0.0111829, "auxiliary_loss_mlp": 0.01025847, "balance_loss_clip": 1.04366326, "balance_loss_mlp": 1.01947212, "epoch": 0.8361690615042385, "flos": 27632327160960.0, "grad_norm": 1.7937690686683287, "language_loss": 0.7484467, "learning_rate": 2.7487105790060105e-07, "loss": 0.7698881, "num_input_tokens_seen": 150244310, "step": 6954, "time_per_iteration": 2.5428404808044434 }, { "auxiliary_loss_clip": 0.01152706, "auxiliary_loss_mlp": 0.01024512, "balance_loss_clip": 1.04540277, "balance_loss_mlp": 1.01797664, "epoch": 0.8362893043948777, "flos": 39202529598720.0, "grad_norm": 1.9532298233738328, "language_loss": 0.69287205, "learning_rate": 2.7447707026471587e-07, "loss": 0.71464431, "num_input_tokens_seen": 150267285, "step": 6955, "time_per_iteration": 3.3278300762176514 }, { "auxiliary_loss_clip": 0.01122584, "auxiliary_loss_mlp": 0.0102251, "balance_loss_clip": 1.04156768, "balance_loss_mlp": 1.01582241, "epoch": 0.8364095472855168, "flos": 24785329230720.0, "grad_norm": 1.9375372219850042, "language_loss": 0.79851127, "learning_rate": 2.740833443892874e-07, "loss": 0.81996214, "num_input_tokens_seen": 150285455, "step": 6956, "time_per_iteration": 2.5371198654174805 }, { "auxiliary_loss_clip": 0.01135907, "auxiliary_loss_mlp": 0.01021642, "balance_loss_clip": 1.04385018, "balance_loss_mlp": 1.01459134, "epoch": 0.8365297901761558, "flos": 22743412784640.0, "grad_norm": 1.9392602941511379, "language_loss": 0.79682028, "learning_rate": 2.7368988033404327e-07, "loss": 0.81839573, "num_input_tokens_seen": 150302970, "step": 6957, "time_per_iteration": 2.468261957168579 }, { "auxiliary_loss_clip": 0.01125615, "auxiliary_loss_mlp": 0.01024318, "balance_loss_clip": 1.04395747, "balance_loss_mlp": 1.01799953, "epoch": 0.836650033066795, "flos": 28396003242240.0, "grad_norm": 1.5531683819212478, "language_loss": 0.84720874, "learning_rate": 2.732966781586712e-07, "loss": 0.86870801, "num_input_tokens_seen": 150322715, "step": 6958, "time_per_iteration": 2.5641021728515625 }, { "auxiliary_loss_clip": 0.01144791, "auxiliary_loss_mlp": 0.0102156, "balance_loss_clip": 1.04266, "balance_loss_mlp": 1.01514602, "epoch": 0.836770275957434, "flos": 22236857233920.0, "grad_norm": 1.7216225154039906, "language_loss": 0.66908062, "learning_rate": 2.729037379228205e-07, "loss": 0.69074416, "num_input_tokens_seen": 150342900, "step": 6959, "time_per_iteration": 2.4576852321624756 }, { "auxiliary_loss_clip": 0.01137675, "auxiliary_loss_mlp": 0.01026606, "balance_loss_clip": 1.04780185, "balance_loss_mlp": 1.01967692, "epoch": 0.8368905188480731, "flos": 22491930689280.0, "grad_norm": 1.4476338864757565, "language_loss": 0.8069762, "learning_rate": 2.725110596860998e-07, "loss": 0.828619, "num_input_tokens_seen": 150363580, "step": 6960, "time_per_iteration": 2.4842004776000977 }, { "auxiliary_loss_clip": 0.01104386, "auxiliary_loss_mlp": 0.01022301, "balance_loss_clip": 1.04132843, "balance_loss_mlp": 1.01566148, "epoch": 0.8370107617387123, "flos": 13370405287680.0, "grad_norm": 2.1321467620516, "language_loss": 0.70102775, "learning_rate": 2.7211864350807776e-07, "loss": 0.72229463, "num_input_tokens_seen": 150381780, "step": 6961, "time_per_iteration": 2.521544933319092 }, { "auxiliary_loss_clip": 0.01163386, "auxiliary_loss_mlp": 0.01026392, "balance_loss_clip": 1.04560983, "balance_loss_mlp": 1.01908493, "epoch": 0.8371310046293513, "flos": 25261289372160.0, "grad_norm": 1.703790546895152, "language_loss": 0.74247926, "learning_rate": 2.717264894482836e-07, "loss": 0.76437712, "num_input_tokens_seen": 150402120, "step": 6962, "time_per_iteration": 2.464632749557495 }, { "auxiliary_loss_clip": 0.01154373, "auxiliary_loss_mlp": 0.01023766, "balance_loss_clip": 1.04802001, "balance_loss_mlp": 1.01622856, "epoch": 0.8372512475199904, "flos": 19792705311360.0, "grad_norm": 2.007787186534202, "language_loss": 0.81070566, "learning_rate": 2.7133459756620646e-07, "loss": 0.83248711, "num_input_tokens_seen": 150419315, "step": 6963, "time_per_iteration": 2.468907594680786 }, { "auxiliary_loss_clip": 0.01145278, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.0443536, "balance_loss_mlp": 1.02338147, "epoch": 0.8373714904106295, "flos": 19391224020480.0, "grad_norm": 1.7195082167261235, "language_loss": 0.73618448, "learning_rate": 2.7094296792129733e-07, "loss": 0.75793993, "num_input_tokens_seen": 150438915, "step": 6964, "time_per_iteration": 2.430236577987671 }, { "auxiliary_loss_clip": 0.01150345, "auxiliary_loss_mlp": 0.01022109, "balance_loss_clip": 1.04481459, "balance_loss_mlp": 1.01544523, "epoch": 0.8374917333012686, "flos": 14975935401600.0, "grad_norm": 1.7689790264549623, "language_loss": 0.75638026, "learning_rate": 2.7055160057296424e-07, "loss": 0.77810478, "num_input_tokens_seen": 150456155, "step": 6965, "time_per_iteration": 2.5920419692993164 }, { "auxiliary_loss_clip": 0.01121272, "auxiliary_loss_mlp": 0.01023584, "balance_loss_clip": 1.04243863, "balance_loss_mlp": 1.01647019, "epoch": 0.8376119761919076, "flos": 30331839847680.0, "grad_norm": 1.5715509710924365, "language_loss": 0.72631294, "learning_rate": 2.7016049558057896e-07, "loss": 0.74776149, "num_input_tokens_seen": 150478115, "step": 6966, "time_per_iteration": 2.5752642154693604 }, { "auxiliary_loss_clip": 0.01149372, "auxiliary_loss_mlp": 0.0102895, "balance_loss_clip": 1.04579222, "balance_loss_mlp": 1.02147007, "epoch": 0.8377322190825467, "flos": 29423336129280.0, "grad_norm": 1.6801702603297883, "language_loss": 0.70689237, "learning_rate": 2.6976965300347074e-07, "loss": 0.72867548, "num_input_tokens_seen": 150500725, "step": 6967, "time_per_iteration": 2.5245361328125 }, { "auxiliary_loss_clip": 0.01132521, "auxiliary_loss_mlp": 0.01019616, "balance_loss_clip": 1.04240608, "balance_loss_mlp": 1.01227856, "epoch": 0.8378524619731859, "flos": 26687086807680.0, "grad_norm": 2.9911795981002314, "language_loss": 0.69328952, "learning_rate": 2.693790729009309e-07, "loss": 0.71481085, "num_input_tokens_seen": 150522335, "step": 6968, "time_per_iteration": 2.5130228996276855 }, { "auxiliary_loss_clip": 0.01135986, "auxiliary_loss_mlp": 0.01022417, "balance_loss_clip": 1.04358733, "balance_loss_mlp": 1.01564646, "epoch": 0.8379727048638249, "flos": 20703866636160.0, "grad_norm": 1.808333112762289, "language_loss": 0.8842935, "learning_rate": 2.6898875533220946e-07, "loss": 0.90587747, "num_input_tokens_seen": 150541640, "step": 6969, "time_per_iteration": 2.480952739715576 }, { "auxiliary_loss_clip": 0.01160835, "auxiliary_loss_mlp": 0.01021923, "balance_loss_clip": 1.04758787, "balance_loss_mlp": 1.01572394, "epoch": 0.838092947754464, "flos": 20084084438400.0, "grad_norm": 1.9256385222617913, "language_loss": 0.81550467, "learning_rate": 2.685987003565171e-07, "loss": 0.83733219, "num_input_tokens_seen": 150559680, "step": 6970, "time_per_iteration": 2.430691957473755 }, { "auxiliary_loss_clip": 0.0112071, "auxiliary_loss_mlp": 0.01023381, "balance_loss_clip": 1.04733825, "balance_loss_mlp": 1.01575792, "epoch": 0.8382131906451031, "flos": 18113270964480.0, "grad_norm": 3.0809953122499074, "language_loss": 0.74431747, "learning_rate": 2.6820890803302566e-07, "loss": 0.7657584, "num_input_tokens_seen": 150575205, "step": 6971, "time_per_iteration": 2.490973711013794 }, { "auxiliary_loss_clip": 0.01137266, "auxiliary_loss_mlp": 0.01021454, "balance_loss_clip": 1.04685569, "balance_loss_mlp": 1.01451862, "epoch": 0.8383334335357422, "flos": 17092653920640.0, "grad_norm": 2.0855543562827252, "language_loss": 0.81963855, "learning_rate": 2.6781937842086557e-07, "loss": 0.84122574, "num_input_tokens_seen": 150593995, "step": 6972, "time_per_iteration": 2.4419753551483154 }, { "auxiliary_loss_clip": 0.01152122, "auxiliary_loss_mlp": 0.0102735, "balance_loss_clip": 1.0447402, "balance_loss_mlp": 1.02017021, "epoch": 0.8384536764263812, "flos": 20704728562560.0, "grad_norm": 2.1998285428165913, "language_loss": 0.67823023, "learning_rate": 2.6743011157912933e-07, "loss": 0.70002496, "num_input_tokens_seen": 150613715, "step": 6973, "time_per_iteration": 2.4497125148773193 }, { "auxiliary_loss_clip": 0.01106161, "auxiliary_loss_mlp": 0.01023144, "balance_loss_clip": 1.03709149, "balance_loss_mlp": 1.01575351, "epoch": 0.8385739193170204, "flos": 28986842056320.0, "grad_norm": 1.7955662608864091, "language_loss": 0.65369266, "learning_rate": 2.6704110756686725e-07, "loss": 0.67498571, "num_input_tokens_seen": 150634540, "step": 6974, "time_per_iteration": 2.6011180877685547 }, { "auxiliary_loss_clip": 0.01133066, "auxiliary_loss_mlp": 0.00761537, "balance_loss_clip": 1.04210615, "balance_loss_mlp": 1.00039816, "epoch": 0.8386941622076595, "flos": 23438068882560.0, "grad_norm": 1.6424947887458081, "language_loss": 0.83731735, "learning_rate": 2.6665236644309085e-07, "loss": 0.8562634, "num_input_tokens_seen": 150654850, "step": 6975, "time_per_iteration": 3.338249444961548 }, { "auxiliary_loss_clip": 0.01149939, "auxiliary_loss_mlp": 0.01022066, "balance_loss_clip": 1.04336858, "balance_loss_mlp": 1.01553047, "epoch": 0.8388144050982985, "flos": 23002724044800.0, "grad_norm": 1.8162602908230185, "language_loss": 0.79731917, "learning_rate": 2.662638882667727e-07, "loss": 0.81903923, "num_input_tokens_seen": 150673790, "step": 6976, "time_per_iteration": 4.033307790756226 }, { "auxiliary_loss_clip": 0.01167135, "auxiliary_loss_mlp": 0.01025097, "balance_loss_clip": 1.0469172, "balance_loss_mlp": 1.0177809, "epoch": 0.8389346479889377, "flos": 24280353878400.0, "grad_norm": 2.0341288116991194, "language_loss": 0.72796947, "learning_rate": 2.658756730968443e-07, "loss": 0.74989176, "num_input_tokens_seen": 150692255, "step": 6977, "time_per_iteration": 2.4284560680389404 }, { "auxiliary_loss_clip": 0.01142641, "auxiliary_loss_mlp": 0.01021912, "balance_loss_clip": 1.048944, "balance_loss_mlp": 1.01503658, "epoch": 0.8390548908795767, "flos": 21215019127680.0, "grad_norm": 2.2504662227208594, "language_loss": 0.88496709, "learning_rate": 2.654877209921975e-07, "loss": 0.90661263, "num_input_tokens_seen": 150709790, "step": 6978, "time_per_iteration": 2.4695749282836914 }, { "auxiliary_loss_clip": 0.01114414, "auxiliary_loss_mlp": 0.01027919, "balance_loss_clip": 1.04012942, "balance_loss_mlp": 1.01961565, "epoch": 0.8391751337702158, "flos": 35627299332480.0, "grad_norm": 4.455165390242117, "language_loss": 0.63079774, "learning_rate": 2.651000320116843e-07, "loss": 0.65222108, "num_input_tokens_seen": 150730675, "step": 6979, "time_per_iteration": 2.661766290664673 }, { "auxiliary_loss_clip": 0.01118862, "auxiliary_loss_mlp": 0.00762102, "balance_loss_clip": 1.04148579, "balance_loss_mlp": 1.00054991, "epoch": 0.839295376660855, "flos": 21325229032320.0, "grad_norm": 2.0538298703010174, "language_loss": 0.7569896, "learning_rate": 2.647126062141163e-07, "loss": 0.77579921, "num_input_tokens_seen": 150749750, "step": 6980, "time_per_iteration": 2.5189621448516846 }, { "auxiliary_loss_clip": 0.01139241, "auxiliary_loss_mlp": 0.0102409, "balance_loss_clip": 1.04163551, "balance_loss_mlp": 1.01731288, "epoch": 0.839415619551494, "flos": 18442535961600.0, "grad_norm": 1.668233801528303, "language_loss": 0.83698547, "learning_rate": 2.643254436582669e-07, "loss": 0.85861874, "num_input_tokens_seen": 150769240, "step": 6981, "time_per_iteration": 2.4516139030456543 }, { "auxiliary_loss_clip": 0.01109935, "auxiliary_loss_mlp": 0.01022436, "balance_loss_clip": 1.04181814, "balance_loss_mlp": 1.01553988, "epoch": 0.8395358624421331, "flos": 23221958705280.0, "grad_norm": 1.783339782362761, "language_loss": 0.82419789, "learning_rate": 2.6393854440286743e-07, "loss": 0.84552157, "num_input_tokens_seen": 150788410, "step": 6982, "time_per_iteration": 3.3319625854492188 }, { "auxiliary_loss_clip": 0.01165053, "auxiliary_loss_mlp": 0.01023904, "balance_loss_clip": 1.04849601, "balance_loss_mlp": 1.0171212, "epoch": 0.8396561053327722, "flos": 24381657210240.0, "grad_norm": 2.569599960357759, "language_loss": 0.70636833, "learning_rate": 2.6355190850661045e-07, "loss": 0.72825789, "num_input_tokens_seen": 150805245, "step": 6983, "time_per_iteration": 2.415895700454712 }, { "auxiliary_loss_clip": 0.01138006, "auxiliary_loss_mlp": 0.0102481, "balance_loss_clip": 1.04539049, "balance_loss_mlp": 1.01755309, "epoch": 0.8397763482234113, "flos": 22237755073920.0, "grad_norm": 1.5791957367098992, "language_loss": 0.86302984, "learning_rate": 2.631655360281486e-07, "loss": 0.88465798, "num_input_tokens_seen": 150824920, "step": 6984, "time_per_iteration": 2.4833171367645264 }, { "auxiliary_loss_clip": 0.01154089, "auxiliary_loss_mlp": 0.00761754, "balance_loss_clip": 1.04390144, "balance_loss_mlp": 1.0005095, "epoch": 0.8398965911140504, "flos": 22163743100160.0, "grad_norm": 1.7952765267680695, "language_loss": 0.65966392, "learning_rate": 2.6277942702609323e-07, "loss": 0.67882234, "num_input_tokens_seen": 150844400, "step": 6985, "time_per_iteration": 2.4489188194274902 }, { "auxiliary_loss_clip": 0.01124673, "auxiliary_loss_mlp": 0.01026844, "balance_loss_clip": 1.04439592, "balance_loss_mlp": 1.01978111, "epoch": 0.8400168340046895, "flos": 21542775753600.0, "grad_norm": 1.962985614768301, "language_loss": 0.8769843, "learning_rate": 2.623935815590186e-07, "loss": 0.89849943, "num_input_tokens_seen": 150862780, "step": 6986, "time_per_iteration": 2.5191588401794434 }, { "auxiliary_loss_clip": 0.01139752, "auxiliary_loss_mlp": 0.01022126, "balance_loss_clip": 1.04759479, "balance_loss_mlp": 1.01508653, "epoch": 0.8401370768953286, "flos": 22491966602880.0, "grad_norm": 1.793309077995339, "language_loss": 0.80801427, "learning_rate": 2.6200799968545516e-07, "loss": 0.829633, "num_input_tokens_seen": 150883075, "step": 6987, "time_per_iteration": 2.4853620529174805 }, { "auxiliary_loss_clip": 0.01038587, "auxiliary_loss_mlp": 0.0100071, "balance_loss_clip": 1.01141524, "balance_loss_mlp": 0.99992907, "epoch": 0.8402573197859676, "flos": 59238890818560.0, "grad_norm": 0.7862854362295917, "language_loss": 0.56427372, "learning_rate": 2.616226814638969e-07, "loss": 0.58466667, "num_input_tokens_seen": 150948180, "step": 6988, "time_per_iteration": 3.092822551727295 }, { "auxiliary_loss_clip": 0.01138336, "auxiliary_loss_mlp": 0.0102241, "balance_loss_clip": 1.04511523, "balance_loss_mlp": 1.0157733, "epoch": 0.8403775626766068, "flos": 22674608282880.0, "grad_norm": 1.905849159551543, "language_loss": 0.77137804, "learning_rate": 2.612376269527954e-07, "loss": 0.79298556, "num_input_tokens_seen": 150967885, "step": 6989, "time_per_iteration": 2.5085322856903076 }, { "auxiliary_loss_clip": 0.01133763, "auxiliary_loss_mlp": 0.01025864, "balance_loss_clip": 1.04457855, "balance_loss_mlp": 1.01916468, "epoch": 0.8404978055672458, "flos": 19609704495360.0, "grad_norm": 1.669140990877128, "language_loss": 0.67639565, "learning_rate": 2.608528362105635e-07, "loss": 0.69799197, "num_input_tokens_seen": 150987255, "step": 6990, "time_per_iteration": 2.461534023284912 }, { "auxiliary_loss_clip": 0.01122708, "auxiliary_loss_mlp": 0.01022297, "balance_loss_clip": 1.03927422, "balance_loss_mlp": 1.01555538, "epoch": 0.8406180484578849, "flos": 27526929678720.0, "grad_norm": 1.9221059056010676, "language_loss": 0.7348206, "learning_rate": 2.6046830929557374e-07, "loss": 0.75627065, "num_input_tokens_seen": 151006905, "step": 6991, "time_per_iteration": 2.5702147483825684 }, { "auxiliary_loss_clip": 0.01119833, "auxiliary_loss_mlp": 0.01023608, "balance_loss_clip": 1.04361022, "balance_loss_mlp": 1.01688504, "epoch": 0.8407382913485241, "flos": 22127473342080.0, "grad_norm": 1.9428695315506979, "language_loss": 0.84759438, "learning_rate": 2.6008404626615776e-07, "loss": 0.86902881, "num_input_tokens_seen": 151025405, "step": 6992, "time_per_iteration": 2.5197558403015137 }, { "auxiliary_loss_clip": 0.01157517, "auxiliary_loss_mlp": 0.01023965, "balance_loss_clip": 1.04906511, "balance_loss_mlp": 1.01738441, "epoch": 0.8408585342391631, "flos": 13918473982080.0, "grad_norm": 3.084258260778641, "language_loss": 0.74324363, "learning_rate": 2.597000471806092e-07, "loss": 0.7650584, "num_input_tokens_seen": 151041970, "step": 6993, "time_per_iteration": 2.442943572998047 }, { "auxiliary_loss_clip": 0.01135269, "auxiliary_loss_mlp": 0.01023159, "balance_loss_clip": 1.04818904, "balance_loss_mlp": 1.01504421, "epoch": 0.8409787771298022, "flos": 20187865808640.0, "grad_norm": 1.9098399148648195, "language_loss": 0.73390204, "learning_rate": 2.593163120971793e-07, "loss": 0.75548637, "num_input_tokens_seen": 151060835, "step": 6994, "time_per_iteration": 2.439973831176758 }, { "auxiliary_loss_clip": 0.01099618, "auxiliary_loss_mlp": 0.010228, "balance_loss_clip": 1.03676105, "balance_loss_mlp": 1.01613069, "epoch": 0.8410990200204413, "flos": 23142523777920.0, "grad_norm": 1.7711748046407754, "language_loss": 0.68708283, "learning_rate": 2.5893284107408165e-07, "loss": 0.70830703, "num_input_tokens_seen": 151078205, "step": 6995, "time_per_iteration": 2.5352261066436768 }, { "auxiliary_loss_clip": 0.01111214, "auxiliary_loss_mlp": 0.01027742, "balance_loss_clip": 1.0441314, "balance_loss_mlp": 1.02020264, "epoch": 0.8412192629110804, "flos": 24027219757440.0, "grad_norm": 1.7909777825080915, "language_loss": 0.77824235, "learning_rate": 2.5854963416948726e-07, "loss": 0.79963195, "num_input_tokens_seen": 151100470, "step": 6996, "time_per_iteration": 2.6048130989074707 }, { "auxiliary_loss_clip": 0.01107499, "auxiliary_loss_mlp": 0.01024388, "balance_loss_clip": 1.03611398, "balance_loss_mlp": 1.01739049, "epoch": 0.8413395058017195, "flos": 25591703604480.0, "grad_norm": 2.029241840102886, "language_loss": 0.69332278, "learning_rate": 2.5816669144152816e-07, "loss": 0.71464163, "num_input_tokens_seen": 151121650, "step": 6997, "time_per_iteration": 2.5674712657928467 }, { "auxiliary_loss_clip": 0.01061934, "auxiliary_loss_mlp": 0.01000901, "balance_loss_clip": 1.00829625, "balance_loss_mlp": 1.00005424, "epoch": 0.8414597486923585, "flos": 63635396624640.0, "grad_norm": 0.8665706395173894, "language_loss": 0.66371548, "learning_rate": 2.5778401294829777e-07, "loss": 0.68434381, "num_input_tokens_seen": 151180390, "step": 6998, "time_per_iteration": 3.0523874759674072 }, { "auxiliary_loss_clip": 0.0114817, "auxiliary_loss_mlp": 0.00761816, "balance_loss_clip": 1.04551339, "balance_loss_mlp": 1.00046432, "epoch": 0.8415799915829977, "flos": 19098731571840.0, "grad_norm": 1.6629609593982126, "language_loss": 0.65053612, "learning_rate": 2.574015987478473e-07, "loss": 0.66963595, "num_input_tokens_seen": 151198520, "step": 6999, "time_per_iteration": 2.4193525314331055 }, { "auxiliary_loss_clip": 0.01142237, "auxiliary_loss_mlp": 0.01024528, "balance_loss_clip": 1.04505825, "balance_loss_mlp": 1.01735151, "epoch": 0.8417002344736367, "flos": 19821612781440.0, "grad_norm": 2.4083744176707147, "language_loss": 0.86991733, "learning_rate": 2.570194488981887e-07, "loss": 0.89158493, "num_input_tokens_seen": 151215065, "step": 7000, "time_per_iteration": 2.569504499435425 }, { "auxiliary_loss_clip": 0.01062398, "auxiliary_loss_mlp": 0.0100188, "balance_loss_clip": 1.00882101, "balance_loss_mlp": 1.00096226, "epoch": 0.8418204773642758, "flos": 62161516834560.0, "grad_norm": 0.8395303638495538, "language_loss": 0.60327309, "learning_rate": 2.566375634572939e-07, "loss": 0.62391591, "num_input_tokens_seen": 151275705, "step": 7001, "time_per_iteration": 3.7731032371520996 }, { "auxiliary_loss_clip": 0.01127754, "auxiliary_loss_mlp": 0.01026222, "balance_loss_clip": 1.04184294, "balance_loss_mlp": 1.0197494, "epoch": 0.841940720254915, "flos": 17092905315840.0, "grad_norm": 2.1404324055956803, "language_loss": 0.76612818, "learning_rate": 2.562559424830943e-07, "loss": 0.78766793, "num_input_tokens_seen": 151293665, "step": 7002, "time_per_iteration": 3.2008044719696045 }, { "auxiliary_loss_clip": 0.01135183, "auxiliary_loss_mlp": 0.01021252, "balance_loss_clip": 1.04324269, "balance_loss_mlp": 1.01389098, "epoch": 0.842060963145554, "flos": 16283586026880.0, "grad_norm": 2.29952970873097, "language_loss": 0.70264083, "learning_rate": 2.5587458603348256e-07, "loss": 0.7242052, "num_input_tokens_seen": 151310955, "step": 7003, "time_per_iteration": 3.311286211013794 }, { "auxiliary_loss_clip": 0.01117077, "auxiliary_loss_mlp": 0.0102657, "balance_loss_clip": 1.04225576, "balance_loss_mlp": 1.0195303, "epoch": 0.8421812060361931, "flos": 21908238681600.0, "grad_norm": 1.9825496680862764, "language_loss": 0.84104604, "learning_rate": 2.554934941663085e-07, "loss": 0.86248249, "num_input_tokens_seen": 151328490, "step": 7004, "time_per_iteration": 2.5046706199645996 }, { "auxiliary_loss_clip": 0.01121421, "auxiliary_loss_mlp": 0.01025504, "balance_loss_clip": 1.04192698, "balance_loss_mlp": 1.01776981, "epoch": 0.8423014489268322, "flos": 27777693502080.0, "grad_norm": 2.7630898728600113, "language_loss": 0.73586118, "learning_rate": 2.5511266693938484e-07, "loss": 0.75733042, "num_input_tokens_seen": 151346950, "step": 7005, "time_per_iteration": 2.54984974861145 }, { "auxiliary_loss_clip": 0.01135532, "auxiliary_loss_mlp": 0.01022897, "balance_loss_clip": 1.04614401, "balance_loss_mlp": 1.01546133, "epoch": 0.8424216918174713, "flos": 25117610970240.0, "grad_norm": 1.6079876458492246, "language_loss": 0.77764171, "learning_rate": 2.547321044104822e-07, "loss": 0.79922599, "num_input_tokens_seen": 151368445, "step": 7006, "time_per_iteration": 2.513831615447998 }, { "auxiliary_loss_clip": 0.01167217, "auxiliary_loss_mlp": 0.01023001, "balance_loss_clip": 1.04758072, "balance_loss_mlp": 1.01549375, "epoch": 0.8425419347081103, "flos": 24748448941440.0, "grad_norm": 1.634285320447925, "language_loss": 0.76497817, "learning_rate": 2.5435180663733113e-07, "loss": 0.78688037, "num_input_tokens_seen": 151388745, "step": 7007, "time_per_iteration": 2.4596002101898193 }, { "auxiliary_loss_clip": 0.01117529, "auxiliary_loss_mlp": 0.01026392, "balance_loss_clip": 1.04068685, "balance_loss_mlp": 1.01949883, "epoch": 0.8426621775987495, "flos": 24820916630400.0, "grad_norm": 2.883054714080116, "language_loss": 0.71693021, "learning_rate": 2.539717736776241e-07, "loss": 0.73836941, "num_input_tokens_seen": 151404970, "step": 7008, "time_per_iteration": 2.551286220550537 }, { "auxiliary_loss_clip": 0.01148071, "auxiliary_loss_mlp": 0.01020657, "balance_loss_clip": 1.04632854, "balance_loss_mlp": 1.01380873, "epoch": 0.8427824204893886, "flos": 23550074467200.0, "grad_norm": 1.3407034826020992, "language_loss": 0.76278132, "learning_rate": 2.535920055890097e-07, "loss": 0.78446853, "num_input_tokens_seen": 151426265, "step": 7009, "time_per_iteration": 3.2268190383911133 }, { "auxiliary_loss_clip": 0.0110609, "auxiliary_loss_mlp": 0.01024158, "balance_loss_clip": 1.04093981, "balance_loss_mlp": 1.0167129, "epoch": 0.8429026633800276, "flos": 16143858120960.0, "grad_norm": 2.1513441011988768, "language_loss": 0.64647067, "learning_rate": 2.5321250242910006e-07, "loss": 0.66777313, "num_input_tokens_seen": 151444180, "step": 7010, "time_per_iteration": 2.519245147705078 }, { "auxiliary_loss_clip": 0.01164749, "auxiliary_loss_mlp": 0.01021121, "balance_loss_clip": 1.04798377, "balance_loss_mlp": 1.01430249, "epoch": 0.8430229062706668, "flos": 22198540400640.0, "grad_norm": 1.7210399338278748, "language_loss": 0.8663035, "learning_rate": 2.5283326425546493e-07, "loss": 0.8881622, "num_input_tokens_seen": 151463290, "step": 7011, "time_per_iteration": 2.4217967987060547 }, { "auxiliary_loss_clip": 0.01118338, "auxiliary_loss_mlp": 0.01021769, "balance_loss_clip": 1.04444456, "balance_loss_mlp": 1.01512289, "epoch": 0.8431431491613058, "flos": 35330317683840.0, "grad_norm": 2.4405993873849874, "language_loss": 0.6960088, "learning_rate": 2.5245429112563443e-07, "loss": 0.71740985, "num_input_tokens_seen": 151483965, "step": 7012, "time_per_iteration": 2.660486936569214 }, { "auxiliary_loss_clip": 0.01151365, "auxiliary_loss_mlp": 0.01025005, "balance_loss_clip": 1.04739499, "balance_loss_mlp": 1.01815689, "epoch": 0.8432633920519449, "flos": 25812374808960.0, "grad_norm": 2.11706414987177, "language_loss": 0.81914449, "learning_rate": 2.5207558309709865e-07, "loss": 0.84090823, "num_input_tokens_seen": 151503700, "step": 7013, "time_per_iteration": 2.4803364276885986 }, { "auxiliary_loss_clip": 0.01036122, "auxiliary_loss_mlp": 0.00752945, "balance_loss_clip": 1.00835383, "balance_loss_mlp": 0.99995768, "epoch": 0.8433836349425841, "flos": 64959531592320.0, "grad_norm": 0.6855319897730692, "language_loss": 0.56307161, "learning_rate": 2.516971402273065e-07, "loss": 0.58096224, "num_input_tokens_seen": 151569765, "step": 7014, "time_per_iteration": 3.092280864715576 }, { "auxiliary_loss_clip": 0.01138566, "auxiliary_loss_mlp": 0.0102131, "balance_loss_clip": 1.04304528, "balance_loss_mlp": 1.01442587, "epoch": 0.8435038778332231, "flos": 20229989483520.0, "grad_norm": 2.0057004313342417, "language_loss": 0.67897224, "learning_rate": 2.513189625736687e-07, "loss": 0.70057106, "num_input_tokens_seen": 151586660, "step": 7015, "time_per_iteration": 2.4535529613494873 }, { "auxiliary_loss_clip": 0.01127464, "auxiliary_loss_mlp": 0.01029711, "balance_loss_clip": 1.04397988, "balance_loss_mlp": 1.02226651, "epoch": 0.8436241207238622, "flos": 20992229020800.0, "grad_norm": 2.3479606485779536, "language_loss": 0.71756548, "learning_rate": 2.509410501935534e-07, "loss": 0.73913723, "num_input_tokens_seen": 151602295, "step": 7016, "time_per_iteration": 2.494269847869873 }, { "auxiliary_loss_clip": 0.01137849, "auxiliary_loss_mlp": 0.01025129, "balance_loss_clip": 1.04366112, "balance_loss_mlp": 1.01775014, "epoch": 0.8437443636145013, "flos": 14682257804160.0, "grad_norm": 2.8334619111776687, "language_loss": 0.75318938, "learning_rate": 2.5056340314429116e-07, "loss": 0.77481914, "num_input_tokens_seen": 151619760, "step": 7017, "time_per_iteration": 2.4375545978546143 }, { "auxiliary_loss_clip": 0.01112279, "auxiliary_loss_mlp": 0.01026009, "balance_loss_clip": 1.04012799, "balance_loss_mlp": 1.01819444, "epoch": 0.8438646065051404, "flos": 21608814908160.0, "grad_norm": 2.265978582369029, "language_loss": 0.80613446, "learning_rate": 2.5018602148316904e-07, "loss": 0.82751739, "num_input_tokens_seen": 151635795, "step": 7018, "time_per_iteration": 2.542299509048462 }, { "auxiliary_loss_clip": 0.01164011, "auxiliary_loss_mlp": 0.01024838, "balance_loss_clip": 1.04841876, "balance_loss_mlp": 1.01825452, "epoch": 0.8439848493957794, "flos": 23289937194240.0, "grad_norm": 2.1239860431179127, "language_loss": 0.79964215, "learning_rate": 2.498089052674359e-07, "loss": 0.82153064, "num_input_tokens_seen": 151653770, "step": 7019, "time_per_iteration": 2.4089088439941406 }, { "auxiliary_loss_clip": 0.01151191, "auxiliary_loss_mlp": 0.01031505, "balance_loss_clip": 1.04663754, "balance_loss_mlp": 1.02433455, "epoch": 0.8441050922864186, "flos": 19719339782400.0, "grad_norm": 4.8655346410299085, "language_loss": 0.75239706, "learning_rate": 2.494320545543007e-07, "loss": 0.77422404, "num_input_tokens_seen": 151673340, "step": 7020, "time_per_iteration": 2.429816246032715 }, { "auxiliary_loss_clip": 0.01168795, "auxiliary_loss_mlp": 0.01025901, "balance_loss_clip": 1.04787779, "balance_loss_mlp": 1.01792574, "epoch": 0.8442253351770577, "flos": 21835268202240.0, "grad_norm": 2.9438224088060516, "language_loss": 0.66626304, "learning_rate": 2.490554694009308e-07, "loss": 0.68821001, "num_input_tokens_seen": 151694205, "step": 7021, "time_per_iteration": 2.442040205001831 }, { "auxiliary_loss_clip": 0.01154134, "auxiliary_loss_mlp": 0.01028043, "balance_loss_clip": 1.04513526, "balance_loss_mlp": 1.02133143, "epoch": 0.8443455780676967, "flos": 34346365447680.0, "grad_norm": 1.690566373744392, "language_loss": 0.78432953, "learning_rate": 2.4867914986445426e-07, "loss": 0.80615127, "num_input_tokens_seen": 151716595, "step": 7022, "time_per_iteration": 2.5730972290039062 }, { "auxiliary_loss_clip": 0.01139241, "auxiliary_loss_mlp": 0.01024833, "balance_loss_clip": 1.04228497, "balance_loss_mlp": 1.01833272, "epoch": 0.8444658209583359, "flos": 48214599281280.0, "grad_norm": 1.8828106901096475, "language_loss": 0.71129698, "learning_rate": 2.483030960019581e-07, "loss": 0.73293775, "num_input_tokens_seen": 151740525, "step": 7023, "time_per_iteration": 2.7226295471191406 }, { "auxiliary_loss_clip": 0.01018381, "auxiliary_loss_mlp": 0.01003795, "balance_loss_clip": 1.0081439, "balance_loss_mlp": 1.00277555, "epoch": 0.8445860638489749, "flos": 68484773105280.0, "grad_norm": 0.7366740590603003, "language_loss": 0.55469286, "learning_rate": 2.479273078704891e-07, "loss": 0.57491457, "num_input_tokens_seen": 151793890, "step": 7024, "time_per_iteration": 2.962144136428833 }, { "auxiliary_loss_clip": 0.01014278, "auxiliary_loss_mlp": 0.01003095, "balance_loss_clip": 1.01141953, "balance_loss_mlp": 1.00209367, "epoch": 0.844706306739614, "flos": 62833331882880.0, "grad_norm": 0.7809757381921016, "language_loss": 0.6473707, "learning_rate": 2.475517855270552e-07, "loss": 0.66754442, "num_input_tokens_seen": 151853970, "step": 7025, "time_per_iteration": 3.0823192596435547 }, { "auxiliary_loss_clip": 0.0116425, "auxiliary_loss_mlp": 0.01024306, "balance_loss_clip": 1.04844451, "balance_loss_mlp": 1.01764858, "epoch": 0.8448265496302532, "flos": 14976114969600.0, "grad_norm": 1.8202925918147292, "language_loss": 0.72594321, "learning_rate": 2.4717652902862143e-07, "loss": 0.74782872, "num_input_tokens_seen": 151872945, "step": 7026, "time_per_iteration": 2.411222457885742 }, { "auxiliary_loss_clip": 0.01140705, "auxiliary_loss_mlp": 0.01023645, "balance_loss_clip": 1.04426026, "balance_loss_mlp": 1.01702857, "epoch": 0.8449467925208922, "flos": 23441265192960.0, "grad_norm": 1.617793512075884, "language_loss": 0.81420934, "learning_rate": 2.4680153843211495e-07, "loss": 0.8358528, "num_input_tokens_seen": 151892875, "step": 7027, "time_per_iteration": 2.5046610832214355 }, { "auxiliary_loss_clip": 0.01136664, "auxiliary_loss_mlp": 0.01022622, "balance_loss_clip": 1.04533029, "balance_loss_mlp": 1.01553178, "epoch": 0.8450670354115313, "flos": 22748045639040.0, "grad_norm": 1.659828788536089, "language_loss": 0.72587234, "learning_rate": 2.464268137944212e-07, "loss": 0.74746519, "num_input_tokens_seen": 151914170, "step": 7028, "time_per_iteration": 3.3836312294006348 }, { "auxiliary_loss_clip": 0.01099818, "auxiliary_loss_mlp": 0.01027749, "balance_loss_clip": 1.0400908, "balance_loss_mlp": 1.01974404, "epoch": 0.8451872783021703, "flos": 29825571605760.0, "grad_norm": 2.9001062545065763, "language_loss": 0.78255951, "learning_rate": 2.46052355172385e-07, "loss": 0.80383521, "num_input_tokens_seen": 151932210, "step": 7029, "time_per_iteration": 3.48899507522583 }, { "auxiliary_loss_clip": 0.01164604, "auxiliary_loss_mlp": 0.01028765, "balance_loss_clip": 1.04637384, "balance_loss_mlp": 1.02118921, "epoch": 0.8453075211928095, "flos": 21870029589120.0, "grad_norm": 1.7236844803737197, "language_loss": 0.74734879, "learning_rate": 2.456781626228128e-07, "loss": 0.76928246, "num_input_tokens_seen": 151951715, "step": 7030, "time_per_iteration": 3.3004095554351807 }, { "auxiliary_loss_clip": 0.01022387, "auxiliary_loss_mlp": 0.00753187, "balance_loss_clip": 1.00737798, "balance_loss_mlp": 0.9999218, "epoch": 0.8454277640834486, "flos": 58751869288320.0, "grad_norm": 0.9131877959224518, "language_loss": 0.66333985, "learning_rate": 2.453042362024675e-07, "loss": 0.6810956, "num_input_tokens_seen": 152004960, "step": 7031, "time_per_iteration": 3.135413885116577 }, { "auxiliary_loss_clip": 0.01163056, "auxiliary_loss_mlp": 0.0102616, "balance_loss_clip": 1.04623342, "balance_loss_mlp": 1.01947212, "epoch": 0.8455480069740876, "flos": 27090076469760.0, "grad_norm": 1.6454883883613196, "language_loss": 0.73299348, "learning_rate": 2.449305759680751e-07, "loss": 0.75488567, "num_input_tokens_seen": 152026285, "step": 7032, "time_per_iteration": 2.4569928646087646 }, { "auxiliary_loss_clip": 0.01121787, "auxiliary_loss_mlp": 0.01026833, "balance_loss_clip": 1.04415393, "balance_loss_mlp": 1.01988935, "epoch": 0.8456682498647268, "flos": 27198670262400.0, "grad_norm": 1.5675712988851356, "language_loss": 0.75331032, "learning_rate": 2.445571819763188e-07, "loss": 0.77479649, "num_input_tokens_seen": 152048585, "step": 7033, "time_per_iteration": 2.6643624305725098 }, { "auxiliary_loss_clip": 0.01164267, "auxiliary_loss_mlp": 0.01030296, "balance_loss_clip": 1.0476284, "balance_loss_mlp": 1.02287579, "epoch": 0.8457884927553658, "flos": 20631901737600.0, "grad_norm": 1.77588912014989, "language_loss": 0.58323652, "learning_rate": 2.4418405428384227e-07, "loss": 0.60518217, "num_input_tokens_seen": 152068795, "step": 7034, "time_per_iteration": 2.436022996902466 }, { "auxiliary_loss_clip": 0.0116601, "auxiliary_loss_mlp": 0.00761763, "balance_loss_clip": 1.04790545, "balance_loss_mlp": 1.00046527, "epoch": 0.8459087356460049, "flos": 15299023259520.0, "grad_norm": 1.7307936949013905, "language_loss": 0.71941221, "learning_rate": 2.4381119294724864e-07, "loss": 0.7386899, "num_input_tokens_seen": 152086240, "step": 7035, "time_per_iteration": 2.3920671939849854 }, { "auxiliary_loss_clip": 0.01165406, "auxiliary_loss_mlp": 0.01021897, "balance_loss_clip": 1.04743385, "balance_loss_mlp": 1.01508093, "epoch": 0.846028978536644, "flos": 18843155326080.0, "grad_norm": 3.6283295659671184, "language_loss": 0.53970593, "learning_rate": 2.434385980231004e-07, "loss": 0.56157893, "num_input_tokens_seen": 152105080, "step": 7036, "time_per_iteration": 3.1459147930145264 }, { "auxiliary_loss_clip": 0.01150569, "auxiliary_loss_mlp": 0.01027904, "balance_loss_clip": 1.04591298, "balance_loss_mlp": 1.02069736, "epoch": 0.8461492214272831, "flos": 52661740285440.0, "grad_norm": 1.492225457134181, "language_loss": 0.6556018, "learning_rate": 2.4306626956792043e-07, "loss": 0.67738652, "num_input_tokens_seen": 152130025, "step": 7037, "time_per_iteration": 2.719534158706665 }, { "auxiliary_loss_clip": 0.01148756, "auxiliary_loss_mlp": 0.01024081, "balance_loss_clip": 1.04312038, "balance_loss_mlp": 1.0173254, "epoch": 0.8462694643179222, "flos": 18588405093120.0, "grad_norm": 1.677755895706697, "language_loss": 0.75646615, "learning_rate": 2.4269420763819017e-07, "loss": 0.77819455, "num_input_tokens_seen": 152148070, "step": 7038, "time_per_iteration": 2.429368257522583 }, { "auxiliary_loss_clip": 0.01147268, "auxiliary_loss_mlp": 0.01022207, "balance_loss_clip": 1.04502559, "balance_loss_mlp": 1.01565695, "epoch": 0.8463897072085613, "flos": 24387080163840.0, "grad_norm": 2.3201240319917944, "language_loss": 0.83149451, "learning_rate": 2.4232241229035223e-07, "loss": 0.85318923, "num_input_tokens_seen": 152165825, "step": 7039, "time_per_iteration": 2.4675028324127197 }, { "auxiliary_loss_clip": 0.01052888, "auxiliary_loss_mlp": 0.01000745, "balance_loss_clip": 1.0079565, "balance_loss_mlp": 0.99983925, "epoch": 0.8465099500992004, "flos": 68702140258560.0, "grad_norm": 0.8032244963941125, "language_loss": 0.5678364, "learning_rate": 2.419508835808064e-07, "loss": 0.58837271, "num_input_tokens_seen": 152222380, "step": 7040, "time_per_iteration": 2.9750545024871826 }, { "auxiliary_loss_clip": 0.0113739, "auxiliary_loss_mlp": 0.01021965, "balance_loss_clip": 1.04579735, "balance_loss_mlp": 1.01506317, "epoch": 0.8466301929898394, "flos": 13735724561280.0, "grad_norm": 1.9433629375392678, "language_loss": 0.6314559, "learning_rate": 2.415796215659134e-07, "loss": 0.65304947, "num_input_tokens_seen": 152239085, "step": 7041, "time_per_iteration": 2.4568138122558594 }, { "auxiliary_loss_clip": 0.011259, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.04076469, "balance_loss_mlp": 1.02414083, "epoch": 0.8467504358804786, "flos": 19241260738560.0, "grad_norm": 2.011923393267298, "language_loss": 0.77123839, "learning_rate": 2.412086263019939e-07, "loss": 0.79280818, "num_input_tokens_seen": 152257110, "step": 7042, "time_per_iteration": 2.4937891960144043 }, { "auxiliary_loss_clip": 0.01160337, "auxiliary_loss_mlp": 0.01022558, "balance_loss_clip": 1.04707003, "balance_loss_mlp": 1.01592696, "epoch": 0.8468706787711177, "flos": 21324115710720.0, "grad_norm": 2.320967958914889, "language_loss": 0.79747272, "learning_rate": 2.408378978453276e-07, "loss": 0.81930166, "num_input_tokens_seen": 152277230, "step": 7043, "time_per_iteration": 2.4199469089508057 }, { "auxiliary_loss_clip": 0.01053078, "auxiliary_loss_mlp": 0.01002209, "balance_loss_clip": 1.00811934, "balance_loss_mlp": 1.00129664, "epoch": 0.8469909216617567, "flos": 64877439058560.0, "grad_norm": 0.8116887091471326, "language_loss": 0.63976562, "learning_rate": 2.404674362521533e-07, "loss": 0.66031849, "num_input_tokens_seen": 152335725, "step": 7044, "time_per_iteration": 2.9290642738342285 }, { "auxiliary_loss_clip": 0.01151659, "auxiliary_loss_mlp": 0.0102219, "balance_loss_clip": 1.04819751, "balance_loss_mlp": 1.01595497, "epoch": 0.8471111645523959, "flos": 19280583152640.0, "grad_norm": 2.2022597730265208, "language_loss": 0.74563384, "learning_rate": 2.4009724157866997e-07, "loss": 0.76737237, "num_input_tokens_seen": 152352785, "step": 7045, "time_per_iteration": 2.4344186782836914 }, { "auxiliary_loss_clip": 0.0116297, "auxiliary_loss_mlp": 0.01022478, "balance_loss_clip": 1.04652834, "balance_loss_mlp": 1.01587665, "epoch": 0.8472314074430349, "flos": 22015826893440.0, "grad_norm": 2.0010858820738973, "language_loss": 0.76488608, "learning_rate": 2.3972731388103564e-07, "loss": 0.78674054, "num_input_tokens_seen": 152371265, "step": 7046, "time_per_iteration": 2.411935806274414 }, { "auxiliary_loss_clip": 0.01005485, "auxiliary_loss_mlp": 0.01002247, "balance_loss_clip": 1.00797129, "balance_loss_mlp": 1.00129282, "epoch": 0.847351650333674, "flos": 57882580243200.0, "grad_norm": 0.8081796483934046, "language_loss": 0.62394696, "learning_rate": 2.393576532153687e-07, "loss": 0.64402425, "num_input_tokens_seen": 152435050, "step": 7047, "time_per_iteration": 3.3289384841918945 }, { "auxiliary_loss_clip": 0.01049463, "auxiliary_loss_mlp": 0.01000561, "balance_loss_clip": 1.00717735, "balance_loss_mlp": 0.99964905, "epoch": 0.8474718932243132, "flos": 41284238313600.0, "grad_norm": 1.0148745035124487, "language_loss": 0.57805216, "learning_rate": 2.389882596377453e-07, "loss": 0.59855247, "num_input_tokens_seen": 152489315, "step": 7048, "time_per_iteration": 3.1335747241973877 }, { "auxiliary_loss_clip": 0.01162949, "auxiliary_loss_mlp": 0.01023258, "balance_loss_clip": 1.04563689, "balance_loss_mlp": 1.01646328, "epoch": 0.8475921361149522, "flos": 38180906974080.0, "grad_norm": 1.783882695297717, "language_loss": 0.76419449, "learning_rate": 2.386191332042031e-07, "loss": 0.78605652, "num_input_tokens_seen": 152511210, "step": 7049, "time_per_iteration": 2.559741973876953 }, { "auxiliary_loss_clip": 0.01170272, "auxiliary_loss_mlp": 0.01029119, "balance_loss_clip": 1.04890227, "balance_loss_mlp": 1.02202296, "epoch": 0.8477123790055913, "flos": 25375054723200.0, "grad_norm": 1.6930635494095527, "language_loss": 0.72977793, "learning_rate": 2.3825027397073794e-07, "loss": 0.75177181, "num_input_tokens_seen": 152531685, "step": 7050, "time_per_iteration": 2.455000638961792 }, { "auxiliary_loss_clip": 0.01149127, "auxiliary_loss_mlp": 0.01024604, "balance_loss_clip": 1.04783297, "balance_loss_mlp": 1.01772869, "epoch": 0.8478326218962304, "flos": 30225185389440.0, "grad_norm": 1.9775603583509382, "language_loss": 0.66726625, "learning_rate": 2.3788168199330515e-07, "loss": 0.68900353, "num_input_tokens_seen": 152553245, "step": 7051, "time_per_iteration": 2.502920389175415 }, { "auxiliary_loss_clip": 0.01121482, "auxiliary_loss_mlp": 0.01023664, "balance_loss_clip": 1.03889132, "balance_loss_mlp": 1.01621366, "epoch": 0.8479528647868695, "flos": 38213800853760.0, "grad_norm": 1.5450701602903603, "language_loss": 0.72626805, "learning_rate": 2.3751335732782074e-07, "loss": 0.74771953, "num_input_tokens_seen": 152574505, "step": 7052, "time_per_iteration": 2.6080124378204346 }, { "auxiliary_loss_clip": 0.01151635, "auxiliary_loss_mlp": 0.01023658, "balance_loss_clip": 1.04819822, "balance_loss_mlp": 1.01690161, "epoch": 0.8480731076775085, "flos": 20957790856320.0, "grad_norm": 1.9118088469892158, "language_loss": 0.79593438, "learning_rate": 2.371453000301582e-07, "loss": 0.81768733, "num_input_tokens_seen": 152593190, "step": 7053, "time_per_iteration": 2.4335508346557617 }, { "auxiliary_loss_clip": 0.01119364, "auxiliary_loss_mlp": 0.01020221, "balance_loss_clip": 1.04296207, "balance_loss_mlp": 1.01344085, "epoch": 0.8481933505681477, "flos": 32596510487040.0, "grad_norm": 1.6431137248093355, "language_loss": 0.7410087, "learning_rate": 2.3677751015615222e-07, "loss": 0.76240456, "num_input_tokens_seen": 152615265, "step": 7054, "time_per_iteration": 2.5895867347717285 }, { "auxiliary_loss_clip": 0.01125532, "auxiliary_loss_mlp": 0.01029679, "balance_loss_clip": 1.04154372, "balance_loss_mlp": 1.02259183, "epoch": 0.8483135934587868, "flos": 20741177888640.0, "grad_norm": 4.149614036353006, "language_loss": 0.85451144, "learning_rate": 2.3640998776159593e-07, "loss": 0.87606359, "num_input_tokens_seen": 152632770, "step": 7055, "time_per_iteration": 4.348072290420532 }, { "auxiliary_loss_clip": 0.01137351, "auxiliary_loss_mlp": 0.01022682, "balance_loss_clip": 1.04526162, "balance_loss_mlp": 1.01656961, "epoch": 0.8484338363494258, "flos": 21653057485440.0, "grad_norm": 1.6847481393658448, "language_loss": 0.81471545, "learning_rate": 2.3604273290224253e-07, "loss": 0.83631575, "num_input_tokens_seen": 152653485, "step": 7056, "time_per_iteration": 2.500746965408325 }, { "auxiliary_loss_clip": 0.01140739, "auxiliary_loss_mlp": 0.01029285, "balance_loss_clip": 1.04679561, "balance_loss_mlp": 1.02154207, "epoch": 0.848554079240065, "flos": 15013964926080.0, "grad_norm": 3.531464086747567, "language_loss": 0.74845701, "learning_rate": 2.356757456338039e-07, "loss": 0.77015722, "num_input_tokens_seen": 152670970, "step": 7057, "time_per_iteration": 3.2922630310058594 }, { "auxiliary_loss_clip": 0.01039997, "auxiliary_loss_mlp": 0.01001309, "balance_loss_clip": 1.01098704, "balance_loss_mlp": 1.00036764, "epoch": 0.848674322130704, "flos": 68060453742720.0, "grad_norm": 0.7689343665705757, "language_loss": 0.59039354, "learning_rate": 2.3530902601195147e-07, "loss": 0.61080658, "num_input_tokens_seen": 152739460, "step": 7058, "time_per_iteration": 3.1545004844665527 }, { "auxiliary_loss_clip": 0.01147982, "auxiliary_loss_mlp": 0.01027185, "balance_loss_clip": 1.04442942, "balance_loss_mlp": 1.0201664, "epoch": 0.8487945650213431, "flos": 18475788977280.0, "grad_norm": 6.813239192788781, "language_loss": 0.79047465, "learning_rate": 2.34942574092317e-07, "loss": 0.8122263, "num_input_tokens_seen": 152754710, "step": 7059, "time_per_iteration": 2.401810646057129 }, { "auxiliary_loss_clip": 0.01154059, "auxiliary_loss_mlp": 0.01025475, "balance_loss_clip": 1.04601407, "balance_loss_mlp": 1.01834035, "epoch": 0.8489148079119821, "flos": 23473189405440.0, "grad_norm": 2.474307463828105, "language_loss": 0.76760042, "learning_rate": 2.3457638993049045e-07, "loss": 0.78939581, "num_input_tokens_seen": 152772700, "step": 7060, "time_per_iteration": 2.512019157409668 }, { "auxiliary_loss_clip": 0.01099105, "auxiliary_loss_mlp": 0.01023676, "balance_loss_clip": 1.04408264, "balance_loss_mlp": 1.01621664, "epoch": 0.8490350508026213, "flos": 19937604775680.0, "grad_norm": 1.9053257710084055, "language_loss": 0.64234185, "learning_rate": 2.3421047358202252e-07, "loss": 0.66356963, "num_input_tokens_seen": 152791550, "step": 7061, "time_per_iteration": 2.5750207901000977 }, { "auxiliary_loss_clip": 0.01152972, "auxiliary_loss_mlp": 0.01026238, "balance_loss_clip": 1.04677105, "balance_loss_mlp": 1.01937747, "epoch": 0.8491552936932604, "flos": 24279958828800.0, "grad_norm": 2.2386390924518813, "language_loss": 0.83409441, "learning_rate": 2.3384482510242144e-07, "loss": 0.85588646, "num_input_tokens_seen": 152809410, "step": 7062, "time_per_iteration": 2.45922589302063 }, { "auxiliary_loss_clip": 0.01166247, "auxiliary_loss_mlp": 0.01025929, "balance_loss_clip": 1.04649138, "balance_loss_mlp": 1.01872015, "epoch": 0.8492755365838994, "flos": 22522526098560.0, "grad_norm": 2.5269522640472566, "language_loss": 0.77260256, "learning_rate": 2.3347944454715575e-07, "loss": 0.79452431, "num_input_tokens_seen": 152825800, "step": 7063, "time_per_iteration": 3.300910472869873 }, { "auxiliary_loss_clip": 0.01168376, "auxiliary_loss_mlp": 0.01023725, "balance_loss_clip": 1.04738522, "balance_loss_mlp": 1.01630998, "epoch": 0.8493957794745386, "flos": 26980441182720.0, "grad_norm": 1.7028184501106434, "language_loss": 0.67474258, "learning_rate": 2.331143319716542e-07, "loss": 0.69666356, "num_input_tokens_seen": 152845330, "step": 7064, "time_per_iteration": 2.4657676219940186 }, { "auxiliary_loss_clip": 0.01126241, "auxiliary_loss_mlp": 0.01024094, "balance_loss_clip": 1.04288673, "balance_loss_mlp": 1.0171051, "epoch": 0.8495160223651776, "flos": 29861985018240.0, "grad_norm": 2.0395667671375084, "language_loss": 0.66012293, "learning_rate": 2.3274948743130363e-07, "loss": 0.68162626, "num_input_tokens_seen": 152865165, "step": 7065, "time_per_iteration": 2.5739710330963135 }, { "auxiliary_loss_clip": 0.01164269, "auxiliary_loss_mlp": 0.01022455, "balance_loss_clip": 1.04554188, "balance_loss_mlp": 1.01505852, "epoch": 0.8496362652558167, "flos": 23075443128960.0, "grad_norm": 1.573993371533445, "language_loss": 0.79392433, "learning_rate": 2.3238491098145085e-07, "loss": 0.81579161, "num_input_tokens_seen": 152884695, "step": 7066, "time_per_iteration": 2.4210197925567627 }, { "auxiliary_loss_clip": 0.011484, "auxiliary_loss_mlp": 0.01021955, "balance_loss_clip": 1.04456878, "balance_loss_mlp": 1.01509786, "epoch": 0.8497565081464559, "flos": 14609107756800.0, "grad_norm": 2.121251356048633, "language_loss": 0.73573291, "learning_rate": 2.3202060267740141e-07, "loss": 0.75743645, "num_input_tokens_seen": 152902220, "step": 7067, "time_per_iteration": 2.4155685901641846 }, { "auxiliary_loss_clip": 0.0110278, "auxiliary_loss_mlp": 0.01019291, "balance_loss_clip": 1.03824425, "balance_loss_mlp": 1.01261258, "epoch": 0.8498767510370949, "flos": 21136446126720.0, "grad_norm": 2.1323481787273066, "language_loss": 0.77249926, "learning_rate": 2.3165656257442044e-07, "loss": 0.79372001, "num_input_tokens_seen": 152920740, "step": 7068, "time_per_iteration": 2.5306179523468018 }, { "auxiliary_loss_clip": 0.01147328, "auxiliary_loss_mlp": 0.01020414, "balance_loss_clip": 1.04548645, "balance_loss_mlp": 1.01403379, "epoch": 0.849996993927734, "flos": 23654538195840.0, "grad_norm": 2.1418123922771297, "language_loss": 0.89970219, "learning_rate": 2.31292790727734e-07, "loss": 0.92137963, "num_input_tokens_seen": 152938305, "step": 7069, "time_per_iteration": 2.462441921234131 }, { "auxiliary_loss_clip": 0.01161451, "auxiliary_loss_mlp": 0.01022922, "balance_loss_clip": 1.04514194, "balance_loss_mlp": 1.01657152, "epoch": 0.8501172368183731, "flos": 20558069331840.0, "grad_norm": 2.718824295277621, "language_loss": 0.80245382, "learning_rate": 2.3092928719252392e-07, "loss": 0.82429755, "num_input_tokens_seen": 152956705, "step": 7070, "time_per_iteration": 2.419555187225342 }, { "auxiliary_loss_clip": 0.01148455, "auxiliary_loss_mlp": 0.01024365, "balance_loss_clip": 1.04458272, "balance_loss_mlp": 1.01728415, "epoch": 0.8502374797090122, "flos": 22272624201600.0, "grad_norm": 1.8605462975381586, "language_loss": 0.78503716, "learning_rate": 2.3056605202393475e-07, "loss": 0.80676532, "num_input_tokens_seen": 152974265, "step": 7071, "time_per_iteration": 2.4259607791900635 }, { "auxiliary_loss_clip": 0.01146862, "auxiliary_loss_mlp": 0.0076227, "balance_loss_clip": 1.04307997, "balance_loss_mlp": 1.00040865, "epoch": 0.8503577225996513, "flos": 23659817495040.0, "grad_norm": 1.741231877719285, "language_loss": 0.66835445, "learning_rate": 2.3020308527706888e-07, "loss": 0.68744576, "num_input_tokens_seen": 152993680, "step": 7072, "time_per_iteration": 2.4568064212799072 }, { "auxiliary_loss_clip": 0.01141689, "auxiliary_loss_mlp": 0.01024764, "balance_loss_clip": 1.04329515, "balance_loss_mlp": 1.01753986, "epoch": 0.8504779654902904, "flos": 26758513002240.0, "grad_norm": 1.5741104597957458, "language_loss": 0.88587922, "learning_rate": 2.2984038700698715e-07, "loss": 0.90754372, "num_input_tokens_seen": 153012990, "step": 7073, "time_per_iteration": 2.512040615081787 }, { "auxiliary_loss_clip": 0.0114737, "auxiliary_loss_mlp": 0.01028099, "balance_loss_clip": 1.04633343, "balance_loss_mlp": 1.02081561, "epoch": 0.8505982083809295, "flos": 26468247196800.0, "grad_norm": 2.330735734679678, "language_loss": 0.78830302, "learning_rate": 2.2947795726871222e-07, "loss": 0.8100577, "num_input_tokens_seen": 153034015, "step": 7074, "time_per_iteration": 2.471318483352661 }, { "auxiliary_loss_clip": 0.01151886, "auxiliary_loss_mlp": 0.00761997, "balance_loss_clip": 1.05040956, "balance_loss_mlp": 1.00046706, "epoch": 0.8507184512715685, "flos": 20303390926080.0, "grad_norm": 1.6814630545809488, "language_loss": 0.85835367, "learning_rate": 2.2911579611722253e-07, "loss": 0.87749255, "num_input_tokens_seen": 153053160, "step": 7075, "time_per_iteration": 2.4278669357299805 }, { "auxiliary_loss_clip": 0.01134444, "auxiliary_loss_mlp": 0.01027121, "balance_loss_clip": 1.04330981, "balance_loss_mlp": 1.01997757, "epoch": 0.8508386941622077, "flos": 19025186474880.0, "grad_norm": 10.859556549340144, "language_loss": 0.87158132, "learning_rate": 2.2875390360745905e-07, "loss": 0.89319694, "num_input_tokens_seen": 153072565, "step": 7076, "time_per_iteration": 2.447910785675049 }, { "auxiliary_loss_clip": 0.01127973, "auxiliary_loss_mlp": 0.01026943, "balance_loss_clip": 1.04248953, "balance_loss_mlp": 1.01969528, "epoch": 0.8509589370528468, "flos": 16433405654400.0, "grad_norm": 1.5811002385968285, "language_loss": 0.77661842, "learning_rate": 2.2839227979432008e-07, "loss": 0.79816759, "num_input_tokens_seen": 153090215, "step": 7077, "time_per_iteration": 2.4639763832092285 }, { "auxiliary_loss_clip": 0.01137051, "auxiliary_loss_mlp": 0.0102733, "balance_loss_clip": 1.04286647, "balance_loss_mlp": 1.02009082, "epoch": 0.8510791799434858, "flos": 18259714713600.0, "grad_norm": 1.8369294047695677, "language_loss": 0.8523283, "learning_rate": 2.2803092473266373e-07, "loss": 0.87397212, "num_input_tokens_seen": 153107740, "step": 7078, "time_per_iteration": 2.4514496326446533 }, { "auxiliary_loss_clip": 0.01167681, "auxiliary_loss_mlp": 0.01027965, "balance_loss_clip": 1.04919112, "balance_loss_mlp": 1.0211879, "epoch": 0.851199422834125, "flos": 23441372933760.0, "grad_norm": 2.2591622548663004, "language_loss": 0.87185091, "learning_rate": 2.2766983847730724e-07, "loss": 0.89380741, "num_input_tokens_seen": 153127410, "step": 7079, "time_per_iteration": 2.4115891456604004 }, { "auxiliary_loss_clip": 0.01130762, "auxiliary_loss_mlp": 0.01027365, "balance_loss_clip": 1.04164219, "balance_loss_mlp": 1.02004004, "epoch": 0.851319665724764, "flos": 16289404030080.0, "grad_norm": 2.1513353759889227, "language_loss": 0.66395998, "learning_rate": 2.2730902108302663e-07, "loss": 0.68554127, "num_input_tokens_seen": 153144325, "step": 7080, "time_per_iteration": 2.4805057048797607 }, { "auxiliary_loss_clip": 0.011282, "auxiliary_loss_mlp": 0.01024554, "balance_loss_clip": 1.04047132, "balance_loss_mlp": 1.01704419, "epoch": 0.8514399086154031, "flos": 18989347680000.0, "grad_norm": 1.6018961236200482, "language_loss": 0.68767834, "learning_rate": 2.269484726045583e-07, "loss": 0.70920593, "num_input_tokens_seen": 153163240, "step": 7081, "time_per_iteration": 3.2784430980682373 }, { "auxiliary_loss_clip": 0.01126317, "auxiliary_loss_mlp": 0.01029128, "balance_loss_clip": 1.04383409, "balance_loss_mlp": 1.02173698, "epoch": 0.8515601515060423, "flos": 24571194301440.0, "grad_norm": 1.706802448039936, "language_loss": 0.79028738, "learning_rate": 2.2658819309659672e-07, "loss": 0.81184185, "num_input_tokens_seen": 153183440, "step": 7082, "time_per_iteration": 3.2958412170410156 }, { "auxiliary_loss_clip": 0.01135462, "auxiliary_loss_mlp": 0.01020593, "balance_loss_clip": 1.04651475, "balance_loss_mlp": 1.01406932, "epoch": 0.8516803943966813, "flos": 19529443555200.0, "grad_norm": 2.0187326952820426, "language_loss": 0.84839016, "learning_rate": 2.2622818261379706e-07, "loss": 0.86995071, "num_input_tokens_seen": 153200460, "step": 7083, "time_per_iteration": 2.4643161296844482 }, { "auxiliary_loss_clip": 0.01134213, "auxiliary_loss_mlp": 0.01025148, "balance_loss_clip": 1.04279351, "balance_loss_mlp": 1.01789105, "epoch": 0.8518006372873204, "flos": 20265792364800.0, "grad_norm": 1.9362268411796046, "language_loss": 0.7506994, "learning_rate": 2.2586844121077142e-07, "loss": 0.77229297, "num_input_tokens_seen": 153218970, "step": 7084, "time_per_iteration": 3.321122407913208 }, { "auxiliary_loss_clip": 0.01109514, "auxiliary_loss_mlp": 0.01030037, "balance_loss_clip": 1.03928852, "balance_loss_mlp": 1.02259803, "epoch": 0.8519208801779595, "flos": 24133227770880.0, "grad_norm": 1.823985184558145, "language_loss": 0.72233599, "learning_rate": 2.2550896894209215e-07, "loss": 0.7437315, "num_input_tokens_seen": 153238485, "step": 7085, "time_per_iteration": 2.5603764057159424 }, { "auxiliary_loss_clip": 0.01012651, "auxiliary_loss_mlp": 0.01000803, "balance_loss_clip": 1.01078892, "balance_loss_mlp": 0.99989742, "epoch": 0.8520411230685986, "flos": 63035223252480.0, "grad_norm": 0.6806603840275841, "language_loss": 0.56611848, "learning_rate": 2.2514976586229184e-07, "loss": 0.58625305, "num_input_tokens_seen": 153306430, "step": 7086, "time_per_iteration": 3.2596099376678467 }, { "auxiliary_loss_clip": 0.01052337, "auxiliary_loss_mlp": 0.01001103, "balance_loss_clip": 1.00837684, "balance_loss_mlp": 1.00020313, "epoch": 0.8521613659592376, "flos": 65836865283840.0, "grad_norm": 0.7580509956980961, "language_loss": 0.5481354, "learning_rate": 2.247908320258609e-07, "loss": 0.5686698, "num_input_tokens_seen": 153366520, "step": 7087, "time_per_iteration": 3.0005438327789307 }, { "auxiliary_loss_clip": 0.01101327, "auxiliary_loss_mlp": 0.01024358, "balance_loss_clip": 1.04148126, "balance_loss_mlp": 1.01699662, "epoch": 0.8522816088498768, "flos": 23112323418240.0, "grad_norm": 2.148926769257755, "language_loss": 0.7967459, "learning_rate": 2.2443216748724914e-07, "loss": 0.81800264, "num_input_tokens_seen": 153387230, "step": 7088, "time_per_iteration": 2.5657765865325928 }, { "auxiliary_loss_clip": 0.01154512, "auxiliary_loss_mlp": 0.00761629, "balance_loss_clip": 1.04673874, "balance_loss_mlp": 1.00036407, "epoch": 0.8524018517405159, "flos": 31758140073600.0, "grad_norm": 2.414209224319873, "language_loss": 0.74304509, "learning_rate": 2.2407377230086588e-07, "loss": 0.76220644, "num_input_tokens_seen": 153409585, "step": 7089, "time_per_iteration": 3.2796361446380615 }, { "auxiliary_loss_clip": 0.01121988, "auxiliary_loss_mlp": 0.01020596, "balance_loss_clip": 1.04518378, "balance_loss_mlp": 1.01373291, "epoch": 0.8525220946311549, "flos": 18690318956160.0, "grad_norm": 1.8693936582278239, "language_loss": 0.83429128, "learning_rate": 2.23715646521079e-07, "loss": 0.85571712, "num_input_tokens_seen": 153427105, "step": 7090, "time_per_iteration": 2.5232646465301514 }, { "auxiliary_loss_clip": 0.01153943, "auxiliary_loss_mlp": 0.00761992, "balance_loss_clip": 1.04529393, "balance_loss_mlp": 1.00046611, "epoch": 0.852642337521794, "flos": 21793216354560.0, "grad_norm": 1.9545998235180504, "language_loss": 0.83970618, "learning_rate": 2.2335779020221724e-07, "loss": 0.85886556, "num_input_tokens_seen": 153443725, "step": 7091, "time_per_iteration": 2.4448933601379395 }, { "auxiliary_loss_clip": 0.0105643, "auxiliary_loss_mlp": 0.01002323, "balance_loss_clip": 1.01856983, "balance_loss_mlp": 1.00123203, "epoch": 0.8527625804124331, "flos": 69040132260480.0, "grad_norm": 0.7988800743528195, "language_loss": 0.56406486, "learning_rate": 2.2300020339856497e-07, "loss": 0.5846523, "num_input_tokens_seen": 153506410, "step": 7092, "time_per_iteration": 3.0925779342651367 }, { "auxiliary_loss_clip": 0.01132005, "auxiliary_loss_mlp": 0.01022019, "balance_loss_clip": 1.04318929, "balance_loss_mlp": 1.01496196, "epoch": 0.8528828233030722, "flos": 26979399688320.0, "grad_norm": 2.170279768260591, "language_loss": 0.78223598, "learning_rate": 2.2264288616436966e-07, "loss": 0.80377626, "num_input_tokens_seen": 153526665, "step": 7093, "time_per_iteration": 2.554919481277466 }, { "auxiliary_loss_clip": 0.01129065, "auxiliary_loss_mlp": 0.01025349, "balance_loss_clip": 1.04226148, "balance_loss_mlp": 1.01838422, "epoch": 0.8530030661937112, "flos": 17487598936320.0, "grad_norm": 2.3092569011024198, "language_loss": 0.72660351, "learning_rate": 2.222858385538351e-07, "loss": 0.74814761, "num_input_tokens_seen": 153543465, "step": 7094, "time_per_iteration": 2.4423117637634277 }, { "auxiliary_loss_clip": 0.01146373, "auxiliary_loss_mlp": 0.01028126, "balance_loss_clip": 1.04410696, "balance_loss_mlp": 1.02111912, "epoch": 0.8531233090843504, "flos": 22160798184960.0, "grad_norm": 3.44874137753195, "language_loss": 0.68056417, "learning_rate": 2.2192906062112527e-07, "loss": 0.70230913, "num_input_tokens_seen": 153563340, "step": 7095, "time_per_iteration": 2.4753835201263428 }, { "auxiliary_loss_clip": 0.01163591, "auxiliary_loss_mlp": 0.01021055, "balance_loss_clip": 1.04470897, "balance_loss_mlp": 1.01439142, "epoch": 0.8532435519749895, "flos": 37635388145280.0, "grad_norm": 1.584784203027402, "language_loss": 0.70633185, "learning_rate": 2.2157255242036377e-07, "loss": 0.72817832, "num_input_tokens_seen": 153587005, "step": 7096, "time_per_iteration": 2.5972514152526855 }, { "auxiliary_loss_clip": 0.01118378, "auxiliary_loss_mlp": 0.01025146, "balance_loss_clip": 1.04140472, "balance_loss_mlp": 1.01800263, "epoch": 0.8533637948656285, "flos": 21398163598080.0, "grad_norm": 2.844418129219226, "language_loss": 0.74186468, "learning_rate": 2.2121631400563135e-07, "loss": 0.76329982, "num_input_tokens_seen": 153606835, "step": 7097, "time_per_iteration": 2.513479709625244 }, { "auxiliary_loss_clip": 0.01049414, "auxiliary_loss_mlp": 0.01001437, "balance_loss_clip": 1.01043344, "balance_loss_mlp": 1.00050151, "epoch": 0.8534840377562677, "flos": 53345122490880.0, "grad_norm": 0.7640189036336291, "language_loss": 0.53004527, "learning_rate": 2.208603454309701e-07, "loss": 0.55055374, "num_input_tokens_seen": 153664925, "step": 7098, "time_per_iteration": 3.006751775741577 }, { "auxiliary_loss_clip": 0.01109014, "auxiliary_loss_mlp": 0.01025397, "balance_loss_clip": 1.04181433, "balance_loss_mlp": 1.01755607, "epoch": 0.8536042806469067, "flos": 20814148368000.0, "grad_norm": 2.2556983047475723, "language_loss": 0.70660621, "learning_rate": 2.2050464675037994e-07, "loss": 0.72795033, "num_input_tokens_seen": 153683550, "step": 7099, "time_per_iteration": 2.573735237121582 }, { "auxiliary_loss_clip": 0.01136092, "auxiliary_loss_mlp": 0.01026461, "balance_loss_clip": 1.0445708, "balance_loss_mlp": 1.01921868, "epoch": 0.8537245235375458, "flos": 24681368292480.0, "grad_norm": 2.2744954366880434, "language_loss": 0.72964811, "learning_rate": 2.2014921801782016e-07, "loss": 0.75127357, "num_input_tokens_seen": 153703040, "step": 7100, "time_per_iteration": 2.505391836166382 }, { "auxiliary_loss_clip": 0.01136182, "auxiliary_loss_mlp": 0.0102226, "balance_loss_clip": 1.03957009, "balance_loss_mlp": 1.01538169, "epoch": 0.853844766428185, "flos": 24384817607040.0, "grad_norm": 1.8797281719053744, "language_loss": 0.74105167, "learning_rate": 2.1979405928720872e-07, "loss": 0.76263613, "num_input_tokens_seen": 153722695, "step": 7101, "time_per_iteration": 2.5264322757720947 }, { "auxiliary_loss_clip": 0.01139696, "auxiliary_loss_mlp": 0.01021695, "balance_loss_clip": 1.04485679, "balance_loss_mlp": 1.01508152, "epoch": 0.853965009318824, "flos": 20955707867520.0, "grad_norm": 1.5139654686054798, "language_loss": 0.79297006, "learning_rate": 2.1943917061242257e-07, "loss": 0.81458396, "num_input_tokens_seen": 153742550, "step": 7102, "time_per_iteration": 2.4963033199310303 }, { "auxiliary_loss_clip": 0.01158273, "auxiliary_loss_mlp": 0.00762013, "balance_loss_clip": 1.04658568, "balance_loss_mlp": 1.00049496, "epoch": 0.8540852522094631, "flos": 24201816791040.0, "grad_norm": 1.7341689569974803, "language_loss": 0.66479659, "learning_rate": 2.1908455204729903e-07, "loss": 0.68399942, "num_input_tokens_seen": 153761700, "step": 7103, "time_per_iteration": 2.471831798553467 }, { "auxiliary_loss_clip": 0.01136763, "auxiliary_loss_mlp": 0.0102547, "balance_loss_clip": 1.04289103, "balance_loss_mlp": 1.01815653, "epoch": 0.8542054951001022, "flos": 25082921410560.0, "grad_norm": 2.693554586123264, "language_loss": 0.78179169, "learning_rate": 2.1873020364563265e-07, "loss": 0.80341399, "num_input_tokens_seen": 153780765, "step": 7104, "time_per_iteration": 2.512988567352295 }, { "auxiliary_loss_clip": 0.01146736, "auxiliary_loss_mlp": 0.0102334, "balance_loss_clip": 1.04565895, "balance_loss_mlp": 1.01646447, "epoch": 0.8543257379907413, "flos": 24316551809280.0, "grad_norm": 24.71493465391521, "language_loss": 0.75604618, "learning_rate": 2.183761254611789e-07, "loss": 0.77774698, "num_input_tokens_seen": 153801090, "step": 7105, "time_per_iteration": 2.463827610015869 }, { "auxiliary_loss_clip": 0.01149472, "auxiliary_loss_mlp": 0.01027142, "balance_loss_clip": 1.04569709, "balance_loss_mlp": 1.02007318, "epoch": 0.8544459808813804, "flos": 55286630467200.0, "grad_norm": 2.015847224516383, "language_loss": 0.70016515, "learning_rate": 2.1802231754764987e-07, "loss": 0.72193128, "num_input_tokens_seen": 153826530, "step": 7106, "time_per_iteration": 2.7713136672973633 }, { "auxiliary_loss_clip": 0.01136835, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 1.04165757, "balance_loss_mlp": 1.01954257, "epoch": 0.8545662237720195, "flos": 25776248705280.0, "grad_norm": 1.8350454229343163, "language_loss": 0.76465547, "learning_rate": 2.17668779958718e-07, "loss": 0.78629619, "num_input_tokens_seen": 153849110, "step": 7107, "time_per_iteration": 2.527698516845703 }, { "auxiliary_loss_clip": 0.01164663, "auxiliary_loss_mlp": 0.01023817, "balance_loss_clip": 1.04628968, "balance_loss_mlp": 1.01670957, "epoch": 0.8546864666626586, "flos": 11108320427520.0, "grad_norm": 2.8153978998934415, "language_loss": 0.80400735, "learning_rate": 2.1731551274801553e-07, "loss": 0.82589221, "num_input_tokens_seen": 153865550, "step": 7108, "time_per_iteration": 3.35726261138916 }, { "auxiliary_loss_clip": 0.01140238, "auxiliary_loss_mlp": 0.01024911, "balance_loss_clip": 1.04658413, "balance_loss_mlp": 1.01740098, "epoch": 0.8548067095532976, "flos": 25520169669120.0, "grad_norm": 2.9064952533683606, "language_loss": 0.61815995, "learning_rate": 2.169625159691324e-07, "loss": 0.63981146, "num_input_tokens_seen": 153885425, "step": 7109, "time_per_iteration": 3.27755069732666 }, { "auxiliary_loss_clip": 0.01118058, "auxiliary_loss_mlp": 0.01024274, "balance_loss_clip": 1.04166985, "balance_loss_mlp": 1.01730919, "epoch": 0.8549269524439368, "flos": 24717853532160.0, "grad_norm": 2.3224754343060345, "language_loss": 0.7424742, "learning_rate": 2.1660978967561784e-07, "loss": 0.76389754, "num_input_tokens_seen": 153904760, "step": 7110, "time_per_iteration": 3.447495698928833 }, { "auxiliary_loss_clip": 0.01163556, "auxiliary_loss_mlp": 0.0102038, "balance_loss_clip": 1.04498148, "balance_loss_mlp": 1.01380217, "epoch": 0.8550471953345758, "flos": 19825599191040.0, "grad_norm": 2.7017082195453903, "language_loss": 0.78816509, "learning_rate": 2.1625733392098035e-07, "loss": 0.81000435, "num_input_tokens_seen": 153920370, "step": 7111, "time_per_iteration": 2.401700735092163 }, { "auxiliary_loss_clip": 0.01164089, "auxiliary_loss_mlp": 0.01023094, "balance_loss_clip": 1.04617691, "balance_loss_mlp": 1.01623917, "epoch": 0.8551674382252149, "flos": 22820441500800.0, "grad_norm": 1.583238807419085, "language_loss": 0.79560065, "learning_rate": 2.159051487586867e-07, "loss": 0.81747246, "num_input_tokens_seen": 153940500, "step": 7112, "time_per_iteration": 2.4194259643554688 }, { "auxiliary_loss_clip": 0.01141066, "auxiliary_loss_mlp": 0.01029279, "balance_loss_clip": 1.04665303, "balance_loss_mlp": 1.02133965, "epoch": 0.8552876811158541, "flos": 20631255292800.0, "grad_norm": 2.010515579972332, "language_loss": 0.72758937, "learning_rate": 2.155532342421642e-07, "loss": 0.74929285, "num_input_tokens_seen": 153958500, "step": 7113, "time_per_iteration": 2.4786014556884766 }, { "auxiliary_loss_clip": 0.01154349, "auxiliary_loss_mlp": 0.01028762, "balance_loss_clip": 1.04639828, "balance_loss_mlp": 1.02130246, "epoch": 0.8554079240064931, "flos": 23112359331840.0, "grad_norm": 1.7578985421022326, "language_loss": 0.78201163, "learning_rate": 2.1520159042479636e-07, "loss": 0.80384278, "num_input_tokens_seen": 153976790, "step": 7114, "time_per_iteration": 2.466628074645996 }, { "auxiliary_loss_clip": 0.01150476, "auxiliary_loss_mlp": 0.01024776, "balance_loss_clip": 1.04641366, "balance_loss_mlp": 1.01762962, "epoch": 0.8555281668971322, "flos": 22128047959680.0, "grad_norm": 2.1415884181085025, "language_loss": 0.71078467, "learning_rate": 2.148502173599287e-07, "loss": 0.73253727, "num_input_tokens_seen": 153994930, "step": 7115, "time_per_iteration": 2.4356510639190674 }, { "auxiliary_loss_clip": 0.01130847, "auxiliary_loss_mlp": 0.01019869, "balance_loss_clip": 1.04307628, "balance_loss_mlp": 1.01238227, "epoch": 0.8556484097877713, "flos": 31139040234240.0, "grad_norm": 1.7688118373270425, "language_loss": 0.66214544, "learning_rate": 2.1449911510086372e-07, "loss": 0.68365264, "num_input_tokens_seen": 154014400, "step": 7116, "time_per_iteration": 3.298696517944336 }, { "auxiliary_loss_clip": 0.01147216, "auxiliary_loss_mlp": 0.0102453, "balance_loss_clip": 1.04423118, "balance_loss_mlp": 1.01752079, "epoch": 0.8557686526784104, "flos": 24316551809280.0, "grad_norm": 1.752646639779243, "language_loss": 0.76703835, "learning_rate": 2.141482837008628e-07, "loss": 0.78875583, "num_input_tokens_seen": 154034940, "step": 7117, "time_per_iteration": 2.4745707511901855 }, { "auxiliary_loss_clip": 0.01141461, "auxiliary_loss_mlp": 0.01028354, "balance_loss_clip": 1.04265523, "balance_loss_mlp": 1.02091861, "epoch": 0.8558888955690495, "flos": 17712723427200.0, "grad_norm": 1.915362085385774, "language_loss": 0.71950197, "learning_rate": 2.1379772321314826e-07, "loss": 0.74120009, "num_input_tokens_seen": 154052985, "step": 7118, "time_per_iteration": 2.4002320766448975 }, { "auxiliary_loss_clip": 0.01088184, "auxiliary_loss_mlp": 0.01026403, "balance_loss_clip": 1.04164338, "balance_loss_mlp": 1.01889861, "epoch": 0.8560091384596886, "flos": 19171702051200.0, "grad_norm": 1.9415124180322658, "language_loss": 0.81642765, "learning_rate": 2.1344743369089802e-07, "loss": 0.83757353, "num_input_tokens_seen": 154068765, "step": 7119, "time_per_iteration": 2.5673701763153076 }, { "auxiliary_loss_clip": 0.01138132, "auxiliary_loss_mlp": 0.01024128, "balance_loss_clip": 1.0464741, "balance_loss_mlp": 1.01710999, "epoch": 0.8561293813503277, "flos": 23914855036800.0, "grad_norm": 1.6660649511231194, "language_loss": 0.81984925, "learning_rate": 2.130974151872522e-07, "loss": 0.84147185, "num_input_tokens_seen": 154089100, "step": 7120, "time_per_iteration": 2.5017502307891846 }, { "auxiliary_loss_clip": 0.01127324, "auxiliary_loss_mlp": 0.01023549, "balance_loss_clip": 1.04591632, "balance_loss_mlp": 1.01660848, "epoch": 0.8562496242409667, "flos": 22529206028160.0, "grad_norm": 1.6972027605980051, "language_loss": 0.78405559, "learning_rate": 2.1274766775530773e-07, "loss": 0.80556428, "num_input_tokens_seen": 154108965, "step": 7121, "time_per_iteration": 2.5228111743927 }, { "auxiliary_loss_clip": 0.01166229, "auxiliary_loss_mlp": 0.01021753, "balance_loss_clip": 1.04575682, "balance_loss_mlp": 1.01455534, "epoch": 0.8563698671316058, "flos": 14712745472640.0, "grad_norm": 2.262499353113867, "language_loss": 0.79953611, "learning_rate": 2.1239819144812077e-07, "loss": 0.8214159, "num_input_tokens_seen": 154123425, "step": 7122, "time_per_iteration": 2.382805109024048 }, { "auxiliary_loss_clip": 0.01115698, "auxiliary_loss_mlp": 0.01024639, "balance_loss_clip": 1.03922987, "balance_loss_mlp": 1.01757264, "epoch": 0.856490110022245, "flos": 39167768211840.0, "grad_norm": 1.7684929227127204, "language_loss": 0.70028448, "learning_rate": 2.1204898631870716e-07, "loss": 0.72168791, "num_input_tokens_seen": 154148315, "step": 7123, "time_per_iteration": 2.654613733291626 }, { "auxiliary_loss_clip": 0.01137594, "auxiliary_loss_mlp": 0.01022538, "balance_loss_clip": 1.04618323, "balance_loss_mlp": 1.01592505, "epoch": 0.856610352912884, "flos": 29059345658880.0, "grad_norm": 1.713443754030315, "language_loss": 0.75897026, "learning_rate": 2.1170005242004006e-07, "loss": 0.78057152, "num_input_tokens_seen": 154169665, "step": 7124, "time_per_iteration": 2.5396406650543213 }, { "auxiliary_loss_clip": 0.0114077, "auxiliary_loss_mlp": 0.01020307, "balance_loss_clip": 1.04329169, "balance_loss_mlp": 1.01335979, "epoch": 0.8567305958035231, "flos": 23878333883520.0, "grad_norm": 1.6401032456238034, "language_loss": 0.77739376, "learning_rate": 2.1135138980505384e-07, "loss": 0.7990045, "num_input_tokens_seen": 154190335, "step": 7125, "time_per_iteration": 2.5006911754608154 }, { "auxiliary_loss_clip": 0.01133988, "auxiliary_loss_mlp": 0.01021823, "balance_loss_clip": 1.04508853, "balance_loss_mlp": 1.01464081, "epoch": 0.8568508386941622, "flos": 22200120599040.0, "grad_norm": 1.6923256701763048, "language_loss": 0.7226361, "learning_rate": 2.110029985266395e-07, "loss": 0.74419427, "num_input_tokens_seen": 154210040, "step": 7126, "time_per_iteration": 2.481630563735962 }, { "auxiliary_loss_clip": 0.01139962, "auxiliary_loss_mlp": 0.01023347, "balance_loss_clip": 1.04299104, "balance_loss_mlp": 1.01628399, "epoch": 0.8569710815848013, "flos": 17307507121920.0, "grad_norm": 1.717440676243043, "language_loss": 0.73825502, "learning_rate": 2.1065487863764787e-07, "loss": 0.75988805, "num_input_tokens_seen": 154228385, "step": 7127, "time_per_iteration": 2.4848124980926514 }, { "auxiliary_loss_clip": 0.01099867, "auxiliary_loss_mlp": 0.01022333, "balance_loss_clip": 1.03663266, "balance_loss_mlp": 1.01476645, "epoch": 0.8570913244754403, "flos": 23732285184000.0, "grad_norm": 1.5067486900678617, "language_loss": 0.85707498, "learning_rate": 2.1030703019088846e-07, "loss": 0.87829697, "num_input_tokens_seen": 154249015, "step": 7128, "time_per_iteration": 2.5588924884796143 }, { "auxiliary_loss_clip": 0.01144073, "auxiliary_loss_mlp": 0.01022251, "balance_loss_clip": 1.04383111, "balance_loss_mlp": 1.01531649, "epoch": 0.8572115673660795, "flos": 20048748433920.0, "grad_norm": 1.8404305668588965, "language_loss": 0.71009517, "learning_rate": 2.099594532391291e-07, "loss": 0.73175842, "num_input_tokens_seen": 154267700, "step": 7129, "time_per_iteration": 2.447481870651245 }, { "auxiliary_loss_clip": 0.01141461, "auxiliary_loss_mlp": 0.01022812, "balance_loss_clip": 1.04364192, "balance_loss_mlp": 1.01587701, "epoch": 0.8573318102567186, "flos": 27160389342720.0, "grad_norm": 2.135642708660653, "language_loss": 0.79491234, "learning_rate": 2.0961214783509806e-07, "loss": 0.81655508, "num_input_tokens_seen": 154290580, "step": 7130, "time_per_iteration": 2.4881370067596436 }, { "auxiliary_loss_clip": 0.01141851, "auxiliary_loss_mlp": 0.01022728, "balance_loss_clip": 1.04389179, "balance_loss_mlp": 1.01544392, "epoch": 0.8574520531473576, "flos": 24936585402240.0, "grad_norm": 1.672699755340515, "language_loss": 0.74431634, "learning_rate": 2.0926511403148051e-07, "loss": 0.76596206, "num_input_tokens_seen": 154309545, "step": 7131, "time_per_iteration": 2.5279552936553955 }, { "auxiliary_loss_clip": 0.01131397, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.04535735, "balance_loss_mlp": 1.02132261, "epoch": 0.8575722960379968, "flos": 18771154513920.0, "grad_norm": 2.010697116662194, "language_loss": 0.75646871, "learning_rate": 2.0891835188092143e-07, "loss": 0.77806562, "num_input_tokens_seen": 154326545, "step": 7132, "time_per_iteration": 2.485581398010254 }, { "auxiliary_loss_clip": 0.01130087, "auxiliary_loss_mlp": 0.0102527, "balance_loss_clip": 1.04269528, "balance_loss_mlp": 1.01760507, "epoch": 0.8576925389286358, "flos": 22200300167040.0, "grad_norm": 1.785602718310032, "language_loss": 0.81554157, "learning_rate": 2.0857186143602434e-07, "loss": 0.83709514, "num_input_tokens_seen": 154345190, "step": 7133, "time_per_iteration": 2.5050699710845947 }, { "auxiliary_loss_clip": 0.01112251, "auxiliary_loss_mlp": 0.01029203, "balance_loss_clip": 1.03923154, "balance_loss_mlp": 1.02162695, "epoch": 0.8578127818192749, "flos": 22894345733760.0, "grad_norm": 2.153421500470876, "language_loss": 0.67821896, "learning_rate": 2.0822564274935094e-07, "loss": 0.69963348, "num_input_tokens_seen": 154364615, "step": 7134, "time_per_iteration": 3.3863205909729004 }, { "auxiliary_loss_clip": 0.01138747, "auxiliary_loss_mlp": 0.01024078, "balance_loss_clip": 1.04717696, "balance_loss_mlp": 1.01645803, "epoch": 0.8579330247099141, "flos": 34824839541120.0, "grad_norm": 1.6992535868343805, "language_loss": 0.67037719, "learning_rate": 2.078796958734239e-07, "loss": 0.69200546, "num_input_tokens_seen": 154387335, "step": 7135, "time_per_iteration": 3.3488194942474365 }, { "auxiliary_loss_clip": 0.01150362, "auxiliary_loss_mlp": 0.01025291, "balance_loss_clip": 1.04596567, "balance_loss_mlp": 1.01869583, "epoch": 0.8580532676005531, "flos": 19755681367680.0, "grad_norm": 1.810673972587797, "language_loss": 0.75111914, "learning_rate": 2.0753402086072124e-07, "loss": 0.77287567, "num_input_tokens_seen": 154405965, "step": 7136, "time_per_iteration": 2.4711809158325195 }, { "auxiliary_loss_clip": 0.01092402, "auxiliary_loss_mlp": 0.01028217, "balance_loss_clip": 1.04108095, "balance_loss_mlp": 1.02101052, "epoch": 0.8581735104911922, "flos": 22739318634240.0, "grad_norm": 3.4370197576638586, "language_loss": 0.75543499, "learning_rate": 2.071886177636828e-07, "loss": 0.77664119, "num_input_tokens_seen": 154422750, "step": 7137, "time_per_iteration": 3.4663712978363037 }, { "auxiliary_loss_clip": 0.01148035, "auxiliary_loss_mlp": 0.01022027, "balance_loss_clip": 1.04576111, "balance_loss_mlp": 1.01519358, "epoch": 0.8582937533818313, "flos": 23149131880320.0, "grad_norm": 1.8716892172047015, "language_loss": 0.83404833, "learning_rate": 2.0684348663470575e-07, "loss": 0.85574889, "num_input_tokens_seen": 154442930, "step": 7138, "time_per_iteration": 2.4559977054595947 }, { "auxiliary_loss_clip": 0.01133601, "auxiliary_loss_mlp": 0.01025404, "balance_loss_clip": 1.03944218, "balance_loss_mlp": 1.01821876, "epoch": 0.8584139962724704, "flos": 19498668577920.0, "grad_norm": 1.9761921902968498, "language_loss": 0.61640847, "learning_rate": 2.0649862752614555e-07, "loss": 0.63799858, "num_input_tokens_seen": 154461640, "step": 7139, "time_per_iteration": 2.475545644760132 }, { "auxiliary_loss_clip": 0.01042598, "auxiliary_loss_mlp": 0.01000632, "balance_loss_clip": 1.00727153, "balance_loss_mlp": 0.9997142, "epoch": 0.8585342391631094, "flos": 71276577788160.0, "grad_norm": 0.7948786302469087, "language_loss": 0.57112753, "learning_rate": 2.0615404049031838e-07, "loss": 0.59155977, "num_input_tokens_seen": 154518610, "step": 7140, "time_per_iteration": 3.0986568927764893 }, { "auxiliary_loss_clip": 0.01152329, "auxiliary_loss_mlp": 0.01026267, "balance_loss_clip": 1.04647374, "balance_loss_mlp": 1.01862538, "epoch": 0.8586544820537486, "flos": 10815432929280.0, "grad_norm": 2.54544364075567, "language_loss": 0.77928972, "learning_rate": 2.0580972557949616e-07, "loss": 0.8010757, "num_input_tokens_seen": 154533700, "step": 7141, "time_per_iteration": 2.4081368446350098 }, { "auxiliary_loss_clip": 0.01053219, "auxiliary_loss_mlp": 0.01000548, "balance_loss_clip": 1.0076679, "balance_loss_mlp": 0.99956435, "epoch": 0.8587747249443877, "flos": 64811184422400.0, "grad_norm": 0.7943313617794318, "language_loss": 0.54253036, "learning_rate": 2.054656828459125e-07, "loss": 0.56306803, "num_input_tokens_seen": 154597810, "step": 7142, "time_per_iteration": 3.787107229232788 }, { "auxiliary_loss_clip": 0.01105922, "auxiliary_loss_mlp": 0.01025544, "balance_loss_clip": 1.04046488, "balance_loss_mlp": 1.01786041, "epoch": 0.8588949678350267, "flos": 26834607964800.0, "grad_norm": 1.792004687903429, "language_loss": 0.77316689, "learning_rate": 2.051219123417578e-07, "loss": 0.79448158, "num_input_tokens_seen": 154617870, "step": 7143, "time_per_iteration": 2.601811170578003 }, { "auxiliary_loss_clip": 0.01166115, "auxiliary_loss_mlp": 0.01024218, "balance_loss_clip": 1.04571009, "balance_loss_mlp": 1.01630831, "epoch": 0.8590152107256659, "flos": 26104256726400.0, "grad_norm": 2.120210973560474, "language_loss": 0.59869969, "learning_rate": 2.0477841411918196e-07, "loss": 0.62060302, "num_input_tokens_seen": 154637395, "step": 7144, "time_per_iteration": 2.4668514728546143 }, { "auxiliary_loss_clip": 0.01144708, "auxiliary_loss_mlp": 0.01022133, "balance_loss_clip": 1.04402995, "balance_loss_mlp": 1.01509368, "epoch": 0.859135453616305, "flos": 26140885620480.0, "grad_norm": 1.8277466197563197, "language_loss": 0.75026977, "learning_rate": 2.0443518823029326e-07, "loss": 0.7719382, "num_input_tokens_seen": 154657935, "step": 7145, "time_per_iteration": 2.522007465362549 }, { "auxiliary_loss_clip": 0.01117454, "auxiliary_loss_mlp": 0.01027138, "balance_loss_clip": 1.04153407, "balance_loss_mlp": 1.0197289, "epoch": 0.859255696506944, "flos": 12969319046400.0, "grad_norm": 2.0723992810253353, "language_loss": 0.76774371, "learning_rate": 2.0409223472715854e-07, "loss": 0.78918958, "num_input_tokens_seen": 154675080, "step": 7146, "time_per_iteration": 2.488037586212158 }, { "auxiliary_loss_clip": 0.01124392, "auxiliary_loss_mlp": 0.00761389, "balance_loss_clip": 1.04345751, "balance_loss_mlp": 1.00045025, "epoch": 0.8593759393975832, "flos": 18475753063680.0, "grad_norm": 1.8579247381037083, "language_loss": 0.75002623, "learning_rate": 2.0374955366180434e-07, "loss": 0.76888394, "num_input_tokens_seen": 154692720, "step": 7147, "time_per_iteration": 2.5174143314361572 }, { "auxiliary_loss_clip": 0.01126578, "auxiliary_loss_mlp": 0.01021481, "balance_loss_clip": 1.04173207, "balance_loss_mlp": 1.01446247, "epoch": 0.8594961822882222, "flos": 22200156512640.0, "grad_norm": 1.6797549329276067, "language_loss": 0.72642541, "learning_rate": 2.034071450862147e-07, "loss": 0.74790597, "num_input_tokens_seen": 154710190, "step": 7148, "time_per_iteration": 2.5408291816711426 }, { "auxiliary_loss_clip": 0.01139129, "auxiliary_loss_mlp": 0.01023014, "balance_loss_clip": 1.04301882, "balance_loss_mlp": 1.015432, "epoch": 0.8596164251788613, "flos": 23294749616640.0, "grad_norm": 1.8180564129992038, "language_loss": 0.76850498, "learning_rate": 2.030650090523327e-07, "loss": 0.79012644, "num_input_tokens_seen": 154729380, "step": 7149, "time_per_iteration": 2.5200419425964355 }, { "auxiliary_loss_clip": 0.01120077, "auxiliary_loss_mlp": 0.01023761, "balance_loss_clip": 1.04114318, "balance_loss_mlp": 1.01627517, "epoch": 0.8597366680695004, "flos": 31649905416960.0, "grad_norm": 1.7057502844911314, "language_loss": 0.59345913, "learning_rate": 2.0272314561205995e-07, "loss": 0.61489749, "num_input_tokens_seen": 154749775, "step": 7150, "time_per_iteration": 2.5961899757385254 }, { "auxiliary_loss_clip": 0.01115746, "auxiliary_loss_mlp": 0.01019331, "balance_loss_clip": 1.03978515, "balance_loss_mlp": 1.01245558, "epoch": 0.8598569109601395, "flos": 21287738211840.0, "grad_norm": 2.2297157319955887, "language_loss": 0.72871649, "learning_rate": 2.023815548172567e-07, "loss": 0.75006723, "num_input_tokens_seen": 154769845, "step": 7151, "time_per_iteration": 2.5622732639312744 }, { "auxiliary_loss_clip": 0.01150406, "auxiliary_loss_mlp": 0.01023706, "balance_loss_clip": 1.04482269, "balance_loss_mlp": 1.01644015, "epoch": 0.8599771538507786, "flos": 25447809720960.0, "grad_norm": 1.6101062331573892, "language_loss": 0.6585561, "learning_rate": 2.0204023671974267e-07, "loss": 0.68029726, "num_input_tokens_seen": 154789230, "step": 7152, "time_per_iteration": 2.4884986877441406 }, { "auxiliary_loss_clip": 0.01143532, "auxiliary_loss_mlp": 0.01023821, "balance_loss_clip": 1.04296422, "balance_loss_mlp": 1.01630533, "epoch": 0.8600973967414177, "flos": 16723958768640.0, "grad_norm": 2.0342596018549015, "language_loss": 0.81103659, "learning_rate": 2.0169919137129532e-07, "loss": 0.83271015, "num_input_tokens_seen": 154807670, "step": 7153, "time_per_iteration": 2.436661720275879 }, { "auxiliary_loss_clip": 0.01151922, "auxiliary_loss_mlp": 0.01026136, "balance_loss_clip": 1.04653025, "balance_loss_mlp": 1.01864386, "epoch": 0.8602176396320568, "flos": 25227928615680.0, "grad_norm": 3.207644420848581, "language_loss": 0.70785671, "learning_rate": 2.013584188236508e-07, "loss": 0.72963732, "num_input_tokens_seen": 154825575, "step": 7154, "time_per_iteration": 2.4701919555664062 }, { "auxiliary_loss_clip": 0.01166557, "auxiliary_loss_mlp": 0.01024054, "balance_loss_clip": 1.04713452, "balance_loss_mlp": 1.0166328, "epoch": 0.8603378825226958, "flos": 20412236113920.0, "grad_norm": 1.8171104162711655, "language_loss": 0.7927283, "learning_rate": 2.0101791912850396e-07, "loss": 0.81463444, "num_input_tokens_seen": 154845115, "step": 7155, "time_per_iteration": 2.4896609783172607 }, { "auxiliary_loss_clip": 0.01136658, "auxiliary_loss_mlp": 0.01018456, "balance_loss_clip": 1.04521883, "balance_loss_mlp": 1.01131868, "epoch": 0.8604581254133349, "flos": 34930201109760.0, "grad_norm": 2.06850491880919, "language_loss": 0.64094603, "learning_rate": 2.006776923375082e-07, "loss": 0.66249716, "num_input_tokens_seen": 154866770, "step": 7156, "time_per_iteration": 2.5888118743896484 }, { "auxiliary_loss_clip": 0.01164964, "auxiliary_loss_mlp": 0.01020246, "balance_loss_clip": 1.04633975, "balance_loss_mlp": 1.01316762, "epoch": 0.860578368303974, "flos": 22596538072320.0, "grad_norm": 1.615905795665432, "language_loss": 0.71076477, "learning_rate": 2.003377385022764e-07, "loss": 0.7326169, "num_input_tokens_seen": 154885595, "step": 7157, "time_per_iteration": 2.442112445831299 }, { "auxiliary_loss_clip": 0.01136359, "auxiliary_loss_mlp": 0.0102615, "balance_loss_clip": 1.04194939, "balance_loss_mlp": 1.01952505, "epoch": 0.8606986111946131, "flos": 21324331192320.0, "grad_norm": 2.7986868252528163, "language_loss": 0.77131236, "learning_rate": 1.9999805767437826e-07, "loss": 0.7929374, "num_input_tokens_seen": 154904485, "step": 7158, "time_per_iteration": 2.4537353515625 }, { "auxiliary_loss_clip": 0.01130876, "auxiliary_loss_mlp": 0.01022342, "balance_loss_clip": 1.04168081, "balance_loss_mlp": 1.01549327, "epoch": 0.8608188540852522, "flos": 28877206769280.0, "grad_norm": 1.6227244744477838, "language_loss": 0.71838284, "learning_rate": 1.9965864990534386e-07, "loss": 0.73991501, "num_input_tokens_seen": 154925010, "step": 7159, "time_per_iteration": 2.5383620262145996 }, { "auxiliary_loss_clip": 0.01114805, "auxiliary_loss_mlp": 0.01022494, "balance_loss_clip": 1.03851306, "balance_loss_mlp": 1.01589882, "epoch": 0.8609390969758913, "flos": 29716187713920.0, "grad_norm": 1.7403032928859041, "language_loss": 0.77594042, "learning_rate": 1.9931951524666092e-07, "loss": 0.79731339, "num_input_tokens_seen": 154946100, "step": 7160, "time_per_iteration": 2.657500743865967 }, { "auxiliary_loss_clip": 0.01153075, "auxiliary_loss_mlp": 0.00761612, "balance_loss_clip": 1.04609132, "balance_loss_mlp": 1.00046122, "epoch": 0.8610593398665304, "flos": 21249349551360.0, "grad_norm": 1.6085437579817292, "language_loss": 0.81116873, "learning_rate": 1.9898065374977534e-07, "loss": 0.83031559, "num_input_tokens_seen": 154966305, "step": 7161, "time_per_iteration": 3.298006772994995 }, { "auxiliary_loss_clip": 0.01118675, "auxiliary_loss_mlp": 0.01018596, "balance_loss_clip": 1.04061389, "balance_loss_mlp": 1.01259685, "epoch": 0.8611795827571694, "flos": 14830102183680.0, "grad_norm": 2.334776530974076, "language_loss": 0.72395498, "learning_rate": 1.9864206546609342e-07, "loss": 0.74532765, "num_input_tokens_seen": 154985145, "step": 7162, "time_per_iteration": 2.4654252529144287 }, { "auxiliary_loss_clip": 0.01162847, "auxiliary_loss_mlp": 0.01021165, "balance_loss_clip": 1.04496765, "balance_loss_mlp": 1.01436126, "epoch": 0.8612998256478086, "flos": 24243258107520.0, "grad_norm": 1.7584485460681074, "language_loss": 0.84370685, "learning_rate": 1.983037504469771e-07, "loss": 0.86554694, "num_input_tokens_seen": 155003855, "step": 7163, "time_per_iteration": 2.439310312271118 }, { "auxiliary_loss_clip": 0.01154268, "auxiliary_loss_mlp": 0.01023178, "balance_loss_clip": 1.04794371, "balance_loss_mlp": 1.01624954, "epoch": 0.8614200685384477, "flos": 21252653602560.0, "grad_norm": 1.7550456793430336, "language_loss": 0.6641385, "learning_rate": 1.9796570874374984e-07, "loss": 0.68591297, "num_input_tokens_seen": 155023960, "step": 7164, "time_per_iteration": 3.2776477336883545 }, { "auxiliary_loss_clip": 0.01138227, "auxiliary_loss_mlp": 0.01019308, "balance_loss_clip": 1.04431295, "balance_loss_mlp": 1.01208401, "epoch": 0.8615403114290867, "flos": 20007738080640.0, "grad_norm": 1.9202443777637868, "language_loss": 0.77560365, "learning_rate": 1.976279404076917e-07, "loss": 0.79717898, "num_input_tokens_seen": 155043360, "step": 7165, "time_per_iteration": 2.489232301712036 }, { "auxiliary_loss_clip": 0.01121536, "auxiliary_loss_mlp": 0.01024967, "balance_loss_clip": 1.04301095, "balance_loss_mlp": 1.01811802, "epoch": 0.8616605543197259, "flos": 29789373674880.0, "grad_norm": 2.3160453414399793, "language_loss": 0.76121736, "learning_rate": 1.9729044549004193e-07, "loss": 0.78268236, "num_input_tokens_seen": 155064745, "step": 7166, "time_per_iteration": 2.5777416229248047 }, { "auxiliary_loss_clip": 0.01149603, "auxiliary_loss_mlp": 0.01021178, "balance_loss_clip": 1.04632354, "balance_loss_mlp": 1.01432967, "epoch": 0.8617807972103649, "flos": 28911609020160.0, "grad_norm": 2.392063385774023, "language_loss": 0.70326626, "learning_rate": 1.9695322404199822e-07, "loss": 0.72497404, "num_input_tokens_seen": 155086790, "step": 7167, "time_per_iteration": 2.4902288913726807 }, { "auxiliary_loss_clip": 0.01139383, "auxiliary_loss_mlp": 0.01025531, "balance_loss_clip": 1.04743254, "balance_loss_mlp": 1.01831341, "epoch": 0.861901040101004, "flos": 27673804391040.0, "grad_norm": 2.8428791164739757, "language_loss": 0.82057333, "learning_rate": 1.9661627611471654e-07, "loss": 0.84222245, "num_input_tokens_seen": 155106585, "step": 7168, "time_per_iteration": 2.53017520904541 }, { "auxiliary_loss_clip": 0.01143684, "auxiliary_loss_mlp": 0.01021376, "balance_loss_clip": 1.04487944, "balance_loss_mlp": 1.01368058, "epoch": 0.8620212829916432, "flos": 49748056755840.0, "grad_norm": 1.9479253952305635, "language_loss": 0.7032724, "learning_rate": 1.9627960175931246e-07, "loss": 0.72492301, "num_input_tokens_seen": 155131285, "step": 7169, "time_per_iteration": 3.5001347064971924 }, { "auxiliary_loss_clip": 0.01150959, "auxiliary_loss_mlp": 0.01027892, "balance_loss_clip": 1.04737639, "balance_loss_mlp": 1.02125502, "epoch": 0.8621415258822822, "flos": 21138672769920.0, "grad_norm": 1.9796452057515024, "language_loss": 0.74033833, "learning_rate": 1.9594320102685847e-07, "loss": 0.7621268, "num_input_tokens_seen": 155150555, "step": 7170, "time_per_iteration": 2.4481654167175293 }, { "auxiliary_loss_clip": 0.01125506, "auxiliary_loss_mlp": 0.00761345, "balance_loss_clip": 1.04096925, "balance_loss_mlp": 1.00041819, "epoch": 0.8622617687729213, "flos": 21689039934720.0, "grad_norm": 1.8623388644017276, "language_loss": 0.63945937, "learning_rate": 1.956070739683864e-07, "loss": 0.65832794, "num_input_tokens_seen": 155169890, "step": 7171, "time_per_iteration": 2.482297897338867 }, { "auxiliary_loss_clip": 0.01107715, "auxiliary_loss_mlp": 0.01020904, "balance_loss_clip": 1.03871894, "balance_loss_mlp": 1.01387656, "epoch": 0.8623820116635604, "flos": 26250592734720.0, "grad_norm": 1.4883773715103235, "language_loss": 0.74448526, "learning_rate": 1.9527122063488678e-07, "loss": 0.76577145, "num_input_tokens_seen": 155191005, "step": 7172, "time_per_iteration": 2.5603301525115967 }, { "auxiliary_loss_clip": 0.01132689, "auxiliary_loss_mlp": 0.01019137, "balance_loss_clip": 1.03947949, "balance_loss_mlp": 1.01248479, "epoch": 0.8625022545541995, "flos": 19647554451840.0, "grad_norm": 1.7552696607859688, "language_loss": 0.8015672, "learning_rate": 1.9493564107730755e-07, "loss": 0.82308543, "num_input_tokens_seen": 155211005, "step": 7173, "time_per_iteration": 2.484396457672119 }, { "auxiliary_loss_clip": 0.01130122, "auxiliary_loss_mlp": 0.01024789, "balance_loss_clip": 1.04005027, "balance_loss_mlp": 1.01823866, "epoch": 0.8626224974448385, "flos": 21908382336000.0, "grad_norm": 1.8993551317961008, "language_loss": 0.61120141, "learning_rate": 1.9460033534655684e-07, "loss": 0.63275051, "num_input_tokens_seen": 155230365, "step": 7174, "time_per_iteration": 2.513669967651367 }, { "auxiliary_loss_clip": 0.011275, "auxiliary_loss_mlp": 0.01024181, "balance_loss_clip": 1.03730571, "balance_loss_mlp": 1.01709414, "epoch": 0.8627427403354777, "flos": 23331198942720.0, "grad_norm": 2.01607008177014, "language_loss": 0.84329581, "learning_rate": 1.9426530349349978e-07, "loss": 0.86481261, "num_input_tokens_seen": 155250815, "step": 7175, "time_per_iteration": 2.4828202724456787 }, { "auxiliary_loss_clip": 0.01148406, "auxiliary_loss_mlp": 0.00761486, "balance_loss_clip": 1.04410505, "balance_loss_mlp": 1.00044036, "epoch": 0.8628629832261168, "flos": 16362877299840.0, "grad_norm": 1.8052870818472062, "language_loss": 0.64746886, "learning_rate": 1.9393054556896038e-07, "loss": 0.6665678, "num_input_tokens_seen": 155268515, "step": 7176, "time_per_iteration": 2.498678684234619 }, { "auxiliary_loss_clip": 0.01118835, "auxiliary_loss_mlp": 0.01023025, "balance_loss_clip": 1.03963065, "balance_loss_mlp": 1.01558614, "epoch": 0.8629832261167558, "flos": 28103941756800.0, "grad_norm": 2.2220566490366784, "language_loss": 0.69286215, "learning_rate": 1.9359606162372133e-07, "loss": 0.71428072, "num_input_tokens_seen": 155290120, "step": 7177, "time_per_iteration": 2.6455132961273193 }, { "auxiliary_loss_clip": 0.01163659, "auxiliary_loss_mlp": 0.01021851, "balance_loss_clip": 1.0466615, "balance_loss_mlp": 1.01476979, "epoch": 0.863103469007395, "flos": 20230061310720.0, "grad_norm": 1.6731357986143505, "language_loss": 0.70456815, "learning_rate": 1.9326185170852293e-07, "loss": 0.72642326, "num_input_tokens_seen": 155309085, "step": 7178, "time_per_iteration": 2.419935941696167 }, { "auxiliary_loss_clip": 0.01148719, "auxiliary_loss_mlp": 0.01022166, "balance_loss_clip": 1.0447228, "balance_loss_mlp": 1.01529956, "epoch": 0.863223711898034, "flos": 24498547044480.0, "grad_norm": 2.2492280640774993, "language_loss": 0.72275078, "learning_rate": 1.9292791587406598e-07, "loss": 0.74445969, "num_input_tokens_seen": 155327945, "step": 7179, "time_per_iteration": 2.484071969985962 }, { "auxiliary_loss_clip": 0.01148161, "auxiliary_loss_mlp": 0.00761599, "balance_loss_clip": 1.04366326, "balance_loss_mlp": 1.00045872, "epoch": 0.8633439547886731, "flos": 17675376261120.0, "grad_norm": 1.9709774704506708, "language_loss": 0.86926478, "learning_rate": 1.9259425417100661e-07, "loss": 0.88836235, "num_input_tokens_seen": 155344060, "step": 7180, "time_per_iteration": 2.4398000240325928 }, { "auxiliary_loss_clip": 0.01089203, "auxiliary_loss_mlp": 0.01028645, "balance_loss_clip": 1.03364253, "balance_loss_mlp": 1.02115881, "epoch": 0.8634641976793123, "flos": 12895055677440.0, "grad_norm": 2.273902149310325, "language_loss": 0.75118923, "learning_rate": 1.9226086664996234e-07, "loss": 0.7723676, "num_input_tokens_seen": 155362305, "step": 7181, "time_per_iteration": 2.5962767601013184 }, { "auxiliary_loss_clip": 0.01140575, "auxiliary_loss_mlp": 0.01025767, "balance_loss_clip": 1.04629219, "balance_loss_mlp": 1.01902294, "epoch": 0.8635844405699513, "flos": 23878980328320.0, "grad_norm": 2.1145054036243787, "language_loss": 0.74558616, "learning_rate": 1.9192775336150712e-07, "loss": 0.76724958, "num_input_tokens_seen": 155382605, "step": 7182, "time_per_iteration": 2.522078037261963 }, { "auxiliary_loss_clip": 0.01047765, "auxiliary_loss_mlp": 0.01001411, "balance_loss_clip": 1.00723147, "balance_loss_mlp": 1.00045192, "epoch": 0.8637046834605904, "flos": 60453387521280.0, "grad_norm": 0.7791210614873482, "language_loss": 0.56297916, "learning_rate": 1.915949143561739e-07, "loss": 0.58347094, "num_input_tokens_seen": 155437280, "step": 7183, "time_per_iteration": 3.0353150367736816 }, { "auxiliary_loss_clip": 0.01150454, "auxiliary_loss_mlp": 0.01025263, "balance_loss_clip": 1.04654932, "balance_loss_mlp": 1.01849771, "epoch": 0.8638249263512295, "flos": 20558751690240.0, "grad_norm": 1.7818436930383539, "language_loss": 0.78101188, "learning_rate": 1.9126234968445498e-07, "loss": 0.80276906, "num_input_tokens_seen": 155456970, "step": 7184, "time_per_iteration": 2.4756267070770264 }, { "auxiliary_loss_clip": 0.01166765, "auxiliary_loss_mlp": 0.01029052, "balance_loss_clip": 1.04843652, "balance_loss_mlp": 1.02172065, "epoch": 0.8639451692418686, "flos": 26615768353920.0, "grad_norm": 1.4424492467860675, "language_loss": 0.67526162, "learning_rate": 1.9093005939679884e-07, "loss": 0.69721973, "num_input_tokens_seen": 155478925, "step": 7185, "time_per_iteration": 2.46785044670105 }, { "auxiliary_loss_clip": 0.01151267, "auxiliary_loss_mlp": 0.01027334, "balance_loss_clip": 1.04599619, "balance_loss_mlp": 1.01983833, "epoch": 0.8640654121325076, "flos": 15122450977920.0, "grad_norm": 1.7826463810661388, "language_loss": 0.76235586, "learning_rate": 1.9059804354361452e-07, "loss": 0.78414184, "num_input_tokens_seen": 155496700, "step": 7186, "time_per_iteration": 2.3923046588897705 }, { "auxiliary_loss_clip": 0.0112752, "auxiliary_loss_mlp": 0.01022135, "balance_loss_clip": 1.03850758, "balance_loss_mlp": 1.01474404, "epoch": 0.8641856550231467, "flos": 31869068250240.0, "grad_norm": 2.7666134160528597, "language_loss": 0.70189577, "learning_rate": 1.902663021752684e-07, "loss": 0.72339237, "num_input_tokens_seen": 155518130, "step": 7187, "time_per_iteration": 2.5652294158935547 }, { "auxiliary_loss_clip": 0.01167919, "auxiliary_loss_mlp": 0.0102202, "balance_loss_clip": 1.04863822, "balance_loss_mlp": 1.01492715, "epoch": 0.8643058979137859, "flos": 14976545932800.0, "grad_norm": 2.0883023640443947, "language_loss": 0.82624209, "learning_rate": 1.8993483534208556e-07, "loss": 0.84814155, "num_input_tokens_seen": 155537040, "step": 7188, "time_per_iteration": 3.165694236755371 }, { "auxiliary_loss_clip": 0.01132931, "auxiliary_loss_mlp": 0.01025959, "balance_loss_clip": 1.04535341, "balance_loss_mlp": 1.0183475, "epoch": 0.8644261408044249, "flos": 13115726881920.0, "grad_norm": 2.3793782609293195, "language_loss": 0.74976712, "learning_rate": 1.8960364309434884e-07, "loss": 0.77135599, "num_input_tokens_seen": 155554535, "step": 7189, "time_per_iteration": 3.3004629611968994 }, { "auxiliary_loss_clip": 0.01089953, "auxiliary_loss_mlp": 0.00761578, "balance_loss_clip": 1.03765702, "balance_loss_mlp": 1.00045216, "epoch": 0.864546383695064, "flos": 20850920916480.0, "grad_norm": 1.6633452739898058, "language_loss": 0.78486216, "learning_rate": 1.8927272548229967e-07, "loss": 0.80337751, "num_input_tokens_seen": 155574225, "step": 7190, "time_per_iteration": 3.425333261489868 }, { "auxiliary_loss_clip": 0.0110905, "auxiliary_loss_mlp": 0.01024414, "balance_loss_clip": 1.04110312, "balance_loss_mlp": 1.01752329, "epoch": 0.8646666265857031, "flos": 21324582587520.0, "grad_norm": 1.5625766720170955, "language_loss": 0.83032167, "learning_rate": 1.8894208255613876e-07, "loss": 0.85165632, "num_input_tokens_seen": 155593540, "step": 7191, "time_per_iteration": 2.5528724193573 }, { "auxiliary_loss_clip": 0.01164949, "auxiliary_loss_mlp": 0.01021209, "balance_loss_clip": 1.04750764, "balance_loss_mlp": 1.01422906, "epoch": 0.8647868694763422, "flos": 19750833031680.0, "grad_norm": 2.0039131677404303, "language_loss": 0.77587777, "learning_rate": 1.8861171436602397e-07, "loss": 0.79773939, "num_input_tokens_seen": 155610655, "step": 7192, "time_per_iteration": 2.401355028152466 }, { "auxiliary_loss_clip": 0.01153267, "auxiliary_loss_mlp": 0.01026935, "balance_loss_clip": 1.0469681, "balance_loss_mlp": 1.01959729, "epoch": 0.8649071123669813, "flos": 26176760328960.0, "grad_norm": 2.2980797467456395, "language_loss": 0.80351007, "learning_rate": 1.882816209620719e-07, "loss": 0.82531208, "num_input_tokens_seen": 155627365, "step": 7193, "time_per_iteration": 2.4719905853271484 }, { "auxiliary_loss_clip": 0.01143419, "auxiliary_loss_mlp": 0.01026071, "balance_loss_clip": 1.05009413, "balance_loss_mlp": 1.0185132, "epoch": 0.8650273552576204, "flos": 20302888135680.0, "grad_norm": 4.460382097870268, "language_loss": 0.76593405, "learning_rate": 1.8795180239435738e-07, "loss": 0.78762901, "num_input_tokens_seen": 155646220, "step": 7194, "time_per_iteration": 2.4722025394439697 }, { "auxiliary_loss_clip": 0.01143013, "auxiliary_loss_mlp": 0.01025932, "balance_loss_clip": 1.045964, "balance_loss_mlp": 1.01846695, "epoch": 0.8651475981482595, "flos": 23951088881280.0, "grad_norm": 3.0105802304119966, "language_loss": 0.7641834, "learning_rate": 1.8762225871291348e-07, "loss": 0.78587282, "num_input_tokens_seen": 155662095, "step": 7195, "time_per_iteration": 2.491901397705078 }, { "auxiliary_loss_clip": 0.01165321, "auxiliary_loss_mlp": 0.0076158, "balance_loss_clip": 1.0470469, "balance_loss_mlp": 1.00047779, "epoch": 0.8652678410388985, "flos": 21684622561920.0, "grad_norm": 1.664628418272885, "language_loss": 0.80965567, "learning_rate": 1.8729298996773201e-07, "loss": 0.82892466, "num_input_tokens_seen": 155680845, "step": 7196, "time_per_iteration": 3.188472270965576 }, { "auxiliary_loss_clip": 0.01047082, "auxiliary_loss_mlp": 0.01002488, "balance_loss_clip": 1.00778794, "balance_loss_mlp": 1.00148618, "epoch": 0.8653880839295377, "flos": 65224660855680.0, "grad_norm": 0.8355544559823973, "language_loss": 0.61004162, "learning_rate": 1.8696399620876301e-07, "loss": 0.63053727, "num_input_tokens_seen": 155737875, "step": 7197, "time_per_iteration": 2.9900219440460205 }, { "auxiliary_loss_clip": 0.01120454, "auxiliary_loss_mlp": 0.01024241, "balance_loss_clip": 1.03924894, "balance_loss_mlp": 1.01658797, "epoch": 0.8655083268201768, "flos": 17749172753280.0, "grad_norm": 2.3259976094731494, "language_loss": 0.78793573, "learning_rate": 1.866352774859141e-07, "loss": 0.80938268, "num_input_tokens_seen": 155753100, "step": 7198, "time_per_iteration": 2.471815824508667 }, { "auxiliary_loss_clip": 0.0112523, "auxiliary_loss_mlp": 0.01022292, "balance_loss_clip": 1.0403744, "balance_loss_mlp": 1.01574779, "epoch": 0.8656285697108158, "flos": 20703974376960.0, "grad_norm": 2.6370103477383484, "language_loss": 0.69603854, "learning_rate": 1.8630683384905188e-07, "loss": 0.71751374, "num_input_tokens_seen": 155772430, "step": 7199, "time_per_iteration": 2.489495277404785 }, { "auxiliary_loss_clip": 0.01167, "auxiliary_loss_mlp": 0.00761697, "balance_loss_clip": 1.04912877, "balance_loss_mlp": 1.00046921, "epoch": 0.865748812601455, "flos": 18653833716480.0, "grad_norm": 2.1157043708059677, "language_loss": 0.88626057, "learning_rate": 1.8597866534800045e-07, "loss": 0.90554756, "num_input_tokens_seen": 155787545, "step": 7200, "time_per_iteration": 2.3899731636047363 }, { "auxiliary_loss_clip": 0.01153518, "auxiliary_loss_mlp": 0.00762037, "balance_loss_clip": 1.04650259, "balance_loss_mlp": 1.00039446, "epoch": 0.865869055492094, "flos": 70652554807680.0, "grad_norm": 2.4030041371908037, "language_loss": 0.74556565, "learning_rate": 1.8565077203254398e-07, "loss": 0.76472116, "num_input_tokens_seen": 155813005, "step": 7201, "time_per_iteration": 2.8548152446746826 }, { "auxiliary_loss_clip": 0.01125996, "auxiliary_loss_mlp": 0.01027375, "balance_loss_clip": 1.04730594, "balance_loss_mlp": 1.0201304, "epoch": 0.8659892983827331, "flos": 17383961220480.0, "grad_norm": 2.8187071375143, "language_loss": 0.72664392, "learning_rate": 1.8532315395242203e-07, "loss": 0.74817753, "num_input_tokens_seen": 155829455, "step": 7202, "time_per_iteration": 2.478963851928711 }, { "auxiliary_loss_clip": 0.01123371, "auxiliary_loss_mlp": 0.01021681, "balance_loss_clip": 1.04116428, "balance_loss_mlp": 1.01486802, "epoch": 0.8661095412733723, "flos": 17895221452800.0, "grad_norm": 2.187291378597507, "language_loss": 0.7256676, "learning_rate": 1.849958111573353e-07, "loss": 0.74711818, "num_input_tokens_seen": 155848060, "step": 7203, "time_per_iteration": 2.4842655658721924 }, { "auxiliary_loss_clip": 0.01162012, "auxiliary_loss_mlp": 0.01021479, "balance_loss_clip": 1.0460043, "balance_loss_mlp": 1.01482391, "epoch": 0.8662297841640113, "flos": 18224163227520.0, "grad_norm": 1.69114508267875, "language_loss": 0.64192033, "learning_rate": 1.8466874369694074e-07, "loss": 0.66375524, "num_input_tokens_seen": 155865755, "step": 7204, "time_per_iteration": 2.408764362335205 }, { "auxiliary_loss_clip": 0.01123255, "auxiliary_loss_mlp": 0.01023464, "balance_loss_clip": 1.03958607, "balance_loss_mlp": 1.01678586, "epoch": 0.8663500270546504, "flos": 16362159027840.0, "grad_norm": 2.5639042246858006, "language_loss": 0.7051791, "learning_rate": 1.843419516208542e-07, "loss": 0.7266463, "num_input_tokens_seen": 155882680, "step": 7205, "time_per_iteration": 2.4768755435943604 }, { "auxiliary_loss_clip": 0.01153133, "auxiliary_loss_mlp": 0.01023827, "balance_loss_clip": 1.04743052, "balance_loss_mlp": 1.01622105, "epoch": 0.8664702699452895, "flos": 17894431353600.0, "grad_norm": 2.30049604041738, "language_loss": 0.79530966, "learning_rate": 1.8401543497865047e-07, "loss": 0.81707931, "num_input_tokens_seen": 155900680, "step": 7206, "time_per_iteration": 2.4018285274505615 }, { "auxiliary_loss_clip": 0.0115393, "auxiliary_loss_mlp": 0.00761717, "balance_loss_clip": 1.0466845, "balance_loss_mlp": 1.00041735, "epoch": 0.8665905128359286, "flos": 30736373794560.0, "grad_norm": 1.8444418785065984, "language_loss": 0.64331782, "learning_rate": 1.836891938198608e-07, "loss": 0.66247427, "num_input_tokens_seen": 155921105, "step": 7207, "time_per_iteration": 2.519784450531006 }, { "auxiliary_loss_clip": 0.01135419, "auxiliary_loss_mlp": 0.01026907, "balance_loss_clip": 1.04418874, "balance_loss_mlp": 1.01985538, "epoch": 0.8667107557265676, "flos": 18656419495680.0, "grad_norm": 2.112179523624969, "language_loss": 0.71469545, "learning_rate": 1.8336322819397677e-07, "loss": 0.73631871, "num_input_tokens_seen": 155938640, "step": 7208, "time_per_iteration": 2.4382028579711914 }, { "auxiliary_loss_clip": 0.01127471, "auxiliary_loss_mlp": 0.01024617, "balance_loss_clip": 1.04024673, "balance_loss_mlp": 1.01731586, "epoch": 0.8668309986172068, "flos": 20083725302400.0, "grad_norm": 3.634784215260924, "language_loss": 0.6269623, "learning_rate": 1.8303753815044654e-07, "loss": 0.64848322, "num_input_tokens_seen": 155957945, "step": 7209, "time_per_iteration": 2.5275588035583496 }, { "auxiliary_loss_clip": 0.01143566, "auxiliary_loss_mlp": 0.01023894, "balance_loss_clip": 1.04285336, "balance_loss_mlp": 1.01640725, "epoch": 0.8669512415078459, "flos": 21615099788160.0, "grad_norm": 21.689983554009228, "language_loss": 0.70910311, "learning_rate": 1.827121237386773e-07, "loss": 0.73077774, "num_input_tokens_seen": 155975390, "step": 7210, "time_per_iteration": 2.5133891105651855 }, { "auxiliary_loss_clip": 0.01138843, "auxiliary_loss_mlp": 0.01026775, "balance_loss_clip": 1.04371071, "balance_loss_mlp": 1.01966405, "epoch": 0.8670714843984849, "flos": 17703601372800.0, "grad_norm": 2.5703796765308438, "language_loss": 0.74971163, "learning_rate": 1.8238698500803374e-07, "loss": 0.77136779, "num_input_tokens_seen": 155988155, "step": 7211, "time_per_iteration": 2.404639482498169 }, { "auxiliary_loss_clip": 0.01052786, "auxiliary_loss_mlp": 0.01001665, "balance_loss_clip": 1.00803161, "balance_loss_mlp": 1.00067604, "epoch": 0.8671917272891241, "flos": 60705483125760.0, "grad_norm": 0.7177639008424767, "language_loss": 0.56275952, "learning_rate": 1.820621220078391e-07, "loss": 0.58330399, "num_input_tokens_seen": 156052065, "step": 7212, "time_per_iteration": 3.0944032669067383 }, { "auxiliary_loss_clip": 0.01163049, "auxiliary_loss_mlp": 0.01018756, "balance_loss_clip": 1.04521549, "balance_loss_mlp": 1.01137066, "epoch": 0.8673119701797631, "flos": 20451881750400.0, "grad_norm": 1.6343438347074417, "language_loss": 0.67810798, "learning_rate": 1.8173753478737553e-07, "loss": 0.69992602, "num_input_tokens_seen": 156072500, "step": 7213, "time_per_iteration": 2.422515392303467 }, { "auxiliary_loss_clip": 0.01168366, "auxiliary_loss_mlp": 0.01026591, "balance_loss_clip": 1.04770541, "balance_loss_mlp": 1.01931643, "epoch": 0.8674322130704022, "flos": 19647410797440.0, "grad_norm": 2.129531267401255, "language_loss": 0.79784894, "learning_rate": 1.8141322339588205e-07, "loss": 0.81979847, "num_input_tokens_seen": 156089840, "step": 7214, "time_per_iteration": 2.3884010314941406 }, { "auxiliary_loss_clip": 0.01165535, "auxiliary_loss_mlp": 0.01025992, "balance_loss_clip": 1.04775238, "balance_loss_mlp": 1.01889634, "epoch": 0.8675524559610414, "flos": 26025001367040.0, "grad_norm": 1.9739554735825269, "language_loss": 0.70256305, "learning_rate": 1.810891878825569e-07, "loss": 0.72447836, "num_input_tokens_seen": 156109815, "step": 7215, "time_per_iteration": 3.151298999786377 }, { "auxiliary_loss_clip": 0.01132954, "auxiliary_loss_mlp": 0.0102297, "balance_loss_clip": 1.04019213, "balance_loss_mlp": 1.01597226, "epoch": 0.8676726988516804, "flos": 15049444584960.0, "grad_norm": 2.3792088920886827, "language_loss": 0.7182318, "learning_rate": 1.8076542829655561e-07, "loss": 0.73979104, "num_input_tokens_seen": 156128620, "step": 7216, "time_per_iteration": 3.5483880043029785 }, { "auxiliary_loss_clip": 0.01140362, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.0467881, "balance_loss_mlp": 1.01904202, "epoch": 0.8677929417423195, "flos": 16288111140480.0, "grad_norm": 1.8380978494496327, "language_loss": 0.79279149, "learning_rate": 1.8044194468699203e-07, "loss": 0.81446517, "num_input_tokens_seen": 156145930, "step": 7217, "time_per_iteration": 3.3012497425079346 }, { "auxiliary_loss_clip": 0.01136375, "auxiliary_loss_mlp": 0.0102343, "balance_loss_clip": 1.04667044, "balance_loss_mlp": 1.01642644, "epoch": 0.8679131846329585, "flos": 18844160906880.0, "grad_norm": 2.6265625464221776, "language_loss": 0.76021826, "learning_rate": 1.8011873710293912e-07, "loss": 0.7818163, "num_input_tokens_seen": 156164435, "step": 7218, "time_per_iteration": 2.518362522125244 }, { "auxiliary_loss_clip": 0.01147029, "auxiliary_loss_mlp": 0.01023731, "balance_loss_clip": 1.0453763, "balance_loss_mlp": 1.01647723, "epoch": 0.8680334275235977, "flos": 33620718890880.0, "grad_norm": 1.7668753068412386, "language_loss": 0.69502074, "learning_rate": 1.7979580559342677e-07, "loss": 0.71672833, "num_input_tokens_seen": 156185165, "step": 7219, "time_per_iteration": 2.539588451385498 }, { "auxiliary_loss_clip": 0.0113732, "auxiliary_loss_mlp": 0.01024839, "balance_loss_clip": 1.04524732, "balance_loss_mlp": 1.01791036, "epoch": 0.8681536704142367, "flos": 24681152810880.0, "grad_norm": 1.661279573481478, "language_loss": 0.66628891, "learning_rate": 1.7947315020744358e-07, "loss": 0.6879105, "num_input_tokens_seen": 156206260, "step": 7220, "time_per_iteration": 2.5168042182922363 }, { "auxiliary_loss_clip": 0.01133822, "auxiliary_loss_mlp": 0.01020396, "balance_loss_clip": 1.04179478, "balance_loss_mlp": 1.01354706, "epoch": 0.8682739133048758, "flos": 20011042131840.0, "grad_norm": 1.7472550506304882, "language_loss": 0.8028605, "learning_rate": 1.7915077099393594e-07, "loss": 0.82440269, "num_input_tokens_seen": 156222860, "step": 7221, "time_per_iteration": 2.4560141563415527 }, { "auxiliary_loss_clip": 0.01153419, "auxiliary_loss_mlp": 0.01025319, "balance_loss_clip": 1.04447746, "balance_loss_mlp": 1.01802301, "epoch": 0.868394156195515, "flos": 16654759217280.0, "grad_norm": 1.8490439489410155, "language_loss": 0.73337972, "learning_rate": 1.788286680018083e-07, "loss": 0.75516713, "num_input_tokens_seen": 156241570, "step": 7222, "time_per_iteration": 2.440154552459717 }, { "auxiliary_loss_clip": 0.01141224, "auxiliary_loss_mlp": 0.01025568, "balance_loss_clip": 1.04470885, "balance_loss_mlp": 1.01887417, "epoch": 0.868514399086154, "flos": 28001381448960.0, "grad_norm": 1.8230106415532108, "language_loss": 0.72670174, "learning_rate": 1.7850684127992443e-07, "loss": 0.74836957, "num_input_tokens_seen": 156261315, "step": 7223, "time_per_iteration": 3.2229413986206055 }, { "auxiliary_loss_clip": 0.01123506, "auxiliary_loss_mlp": 0.01024615, "balance_loss_clip": 1.04491425, "balance_loss_mlp": 1.0175817, "epoch": 0.8686346419767931, "flos": 20084587228800.0, "grad_norm": 1.735764597549995, "language_loss": 0.70108593, "learning_rate": 1.7818529087710378e-07, "loss": 0.72256708, "num_input_tokens_seen": 156281670, "step": 7224, "time_per_iteration": 2.5162742137908936 }, { "auxiliary_loss_clip": 0.01147513, "auxiliary_loss_mlp": 0.00761875, "balance_loss_clip": 1.04344988, "balance_loss_mlp": 1.00043964, "epoch": 0.8687548848674322, "flos": 18223516782720.0, "grad_norm": 1.6537983379543508, "language_loss": 0.84135675, "learning_rate": 1.7786401684212637e-07, "loss": 0.86045063, "num_input_tokens_seen": 156300500, "step": 7225, "time_per_iteration": 2.427837610244751 }, { "auxiliary_loss_clip": 0.01030253, "auxiliary_loss_mlp": 0.01003754, "balance_loss_clip": 1.01050448, "balance_loss_mlp": 1.00300622, "epoch": 0.8688751277580713, "flos": 70457885049600.0, "grad_norm": 0.7448156500959605, "language_loss": 0.55952418, "learning_rate": 1.7754301922372883e-07, "loss": 0.57986426, "num_input_tokens_seen": 156350145, "step": 7226, "time_per_iteration": 2.960059881210327 }, { "auxiliary_loss_clip": 0.01101348, "auxiliary_loss_mlp": 0.01023442, "balance_loss_clip": 1.04107022, "balance_loss_mlp": 1.01597977, "epoch": 0.8689953706487104, "flos": 26906788344960.0, "grad_norm": 1.9821852930127382, "language_loss": 0.80675328, "learning_rate": 1.7722229807060617e-07, "loss": 0.82800114, "num_input_tokens_seen": 156368725, "step": 7227, "time_per_iteration": 2.626340389251709 }, { "auxiliary_loss_clip": 0.01113914, "auxiliary_loss_mlp": 0.01023528, "balance_loss_clip": 1.03940463, "balance_loss_mlp": 1.01679015, "epoch": 0.8691156135393495, "flos": 34637385438720.0, "grad_norm": 2.6854853968846255, "language_loss": 0.82057846, "learning_rate": 1.7690185343141172e-07, "loss": 0.84195292, "num_input_tokens_seen": 156388640, "step": 7228, "time_per_iteration": 2.6248438358306885 }, { "auxiliary_loss_clip": 0.01136533, "auxiliary_loss_mlp": 0.01020992, "balance_loss_clip": 1.04294658, "balance_loss_mlp": 1.01432788, "epoch": 0.8692358564299886, "flos": 18989814556800.0, "grad_norm": 2.2716592416477717, "language_loss": 0.69993812, "learning_rate": 1.7658168535475615e-07, "loss": 0.72151327, "num_input_tokens_seen": 156406425, "step": 7229, "time_per_iteration": 2.4600648880004883 }, { "auxiliary_loss_clip": 0.0114353, "auxiliary_loss_mlp": 0.01026314, "balance_loss_clip": 1.04729962, "balance_loss_mlp": 1.0191381, "epoch": 0.8693560993206276, "flos": 30370839039360.0, "grad_norm": 1.6102992224663388, "language_loss": 0.64323294, "learning_rate": 1.7626179388920948e-07, "loss": 0.66493136, "num_input_tokens_seen": 156427705, "step": 7230, "time_per_iteration": 2.569403886795044 }, { "auxiliary_loss_clip": 0.01137886, "auxiliary_loss_mlp": 0.00761504, "balance_loss_clip": 1.04577625, "balance_loss_mlp": 1.00051188, "epoch": 0.8694763422112668, "flos": 27200430028800.0, "grad_norm": 1.6627130175782878, "language_loss": 0.80660558, "learning_rate": 1.7594217908329866e-07, "loss": 0.82559949, "num_input_tokens_seen": 156449890, "step": 7231, "time_per_iteration": 2.549978494644165 }, { "auxiliary_loss_clip": 0.01129133, "auxiliary_loss_mlp": 0.01020288, "balance_loss_clip": 1.04360998, "balance_loss_mlp": 1.01343369, "epoch": 0.8695965851019059, "flos": 26139161767680.0, "grad_norm": 2.1040868894728995, "language_loss": 0.74063492, "learning_rate": 1.7562284098550895e-07, "loss": 0.76212919, "num_input_tokens_seen": 156469600, "step": 7232, "time_per_iteration": 2.504659652709961 }, { "auxiliary_loss_clip": 0.01036728, "auxiliary_loss_mlp": 0.01001842, "balance_loss_clip": 1.00906968, "balance_loss_mlp": 1.00084066, "epoch": 0.8697168279925449, "flos": 67332616456320.0, "grad_norm": 0.8363226528400701, "language_loss": 0.62288058, "learning_rate": 1.753037796442838e-07, "loss": 0.64326626, "num_input_tokens_seen": 156529040, "step": 7233, "time_per_iteration": 3.0336718559265137 }, { "auxiliary_loss_clip": 0.0116483, "auxiliary_loss_mlp": 0.0102388, "balance_loss_clip": 1.04627991, "balance_loss_mlp": 1.0163815, "epoch": 0.8698370708831841, "flos": 19718693337600.0, "grad_norm": 2.1924795882386596, "language_loss": 0.75254858, "learning_rate": 1.74984995108024e-07, "loss": 0.77443564, "num_input_tokens_seen": 156546970, "step": 7234, "time_per_iteration": 2.4014999866485596 }, { "auxiliary_loss_clip": 0.01152445, "auxiliary_loss_mlp": 0.01020582, "balance_loss_clip": 1.04538691, "balance_loss_mlp": 1.01382589, "epoch": 0.8699573137738231, "flos": 12859971068160.0, "grad_norm": 2.0490979757867835, "language_loss": 0.83430582, "learning_rate": 1.7466648742508981e-07, "loss": 0.85603607, "num_input_tokens_seen": 156563155, "step": 7235, "time_per_iteration": 2.3954050540924072 }, { "auxiliary_loss_clip": 0.01134735, "auxiliary_loss_mlp": 0.01028024, "balance_loss_clip": 1.04438365, "balance_loss_mlp": 1.02059102, "epoch": 0.8700775566644622, "flos": 17420733768960.0, "grad_norm": 1.8509984896470082, "language_loss": 0.84744728, "learning_rate": 1.7434825664379837e-07, "loss": 0.86907488, "num_input_tokens_seen": 156581660, "step": 7236, "time_per_iteration": 2.4437103271484375 }, { "auxiliary_loss_clip": 0.01152251, "auxiliary_loss_mlp": 0.01020869, "balance_loss_clip": 1.04583621, "balance_loss_mlp": 1.01381803, "epoch": 0.8701977995551013, "flos": 13735221770880.0, "grad_norm": 2.472246871219972, "language_loss": 0.86005932, "learning_rate": 1.740303028124246e-07, "loss": 0.88179052, "num_input_tokens_seen": 156597720, "step": 7237, "time_per_iteration": 2.4325177669525146 }, { "auxiliary_loss_clip": 0.01087302, "auxiliary_loss_mlp": 0.01022647, "balance_loss_clip": 1.03820395, "balance_loss_mlp": 1.01557159, "epoch": 0.8703180424457404, "flos": 30555707362560.0, "grad_norm": 2.1474210848133737, "language_loss": 0.75847697, "learning_rate": 1.7371262597920212e-07, "loss": 0.77957654, "num_input_tokens_seen": 156619780, "step": 7238, "time_per_iteration": 2.6330466270446777 }, { "auxiliary_loss_clip": 0.01110035, "auxiliary_loss_mlp": 0.01031193, "balance_loss_clip": 1.04637206, "balance_loss_mlp": 1.02401686, "epoch": 0.8704382853363795, "flos": 19608986223360.0, "grad_norm": 1.4507670531626606, "language_loss": 0.76313674, "learning_rate": 1.7339522619232195e-07, "loss": 0.784549, "num_input_tokens_seen": 156638160, "step": 7239, "time_per_iteration": 2.5429558753967285 }, { "auxiliary_loss_clip": 0.01144011, "auxiliary_loss_mlp": 0.01025196, "balance_loss_clip": 1.04311478, "balance_loss_mlp": 1.01756704, "epoch": 0.8705585282270186, "flos": 26613900846720.0, "grad_norm": 1.8145665293701223, "language_loss": 0.7527864, "learning_rate": 1.730781034999338e-07, "loss": 0.77447844, "num_input_tokens_seen": 156659740, "step": 7240, "time_per_iteration": 2.5225019454956055 }, { "auxiliary_loss_clip": 0.01162477, "auxiliary_loss_mlp": 0.01025346, "balance_loss_clip": 1.04853749, "balance_loss_mlp": 1.01852775, "epoch": 0.8706787711176577, "flos": 34090465979520.0, "grad_norm": 2.189082264317166, "language_loss": 0.7326014, "learning_rate": 1.7276125795014497e-07, "loss": 0.75447965, "num_input_tokens_seen": 156678190, "step": 7241, "time_per_iteration": 3.258859872817993 }, { "auxiliary_loss_clip": 0.0113959, "auxiliary_loss_mlp": 0.01024554, "balance_loss_clip": 1.04195809, "balance_loss_mlp": 1.01709104, "epoch": 0.8707990140082967, "flos": 14611513968000.0, "grad_norm": 1.8861881373261027, "language_loss": 0.67529809, "learning_rate": 1.7244468959102054e-07, "loss": 0.69693953, "num_input_tokens_seen": 156695245, "step": 7242, "time_per_iteration": 3.2692439556121826 }, { "auxiliary_loss_clip": 0.01152093, "auxiliary_loss_mlp": 0.0102106, "balance_loss_clip": 1.04796839, "balance_loss_mlp": 1.01383018, "epoch": 0.8709192568989359, "flos": 20084156265600.0, "grad_norm": 2.162124046544784, "language_loss": 0.85106719, "learning_rate": 1.7212839847058348e-07, "loss": 0.87279868, "num_input_tokens_seen": 156710375, "step": 7243, "time_per_iteration": 3.285090923309326 }, { "auxiliary_loss_clip": 0.01102517, "auxiliary_loss_mlp": 0.01021966, "balance_loss_clip": 1.03975677, "balance_loss_mlp": 1.01538289, "epoch": 0.871039499789575, "flos": 16727083251840.0, "grad_norm": 1.9652953171757017, "language_loss": 0.73743653, "learning_rate": 1.718123846368147e-07, "loss": 0.7586813, "num_input_tokens_seen": 156729420, "step": 7244, "time_per_iteration": 2.563595771789551 }, { "auxiliary_loss_clip": 0.01137063, "auxiliary_loss_mlp": 0.0076149, "balance_loss_clip": 1.04633033, "balance_loss_mlp": 1.00042486, "epoch": 0.871159742680214, "flos": 21068790860160.0, "grad_norm": 1.6732765310763076, "language_loss": 0.71722603, "learning_rate": 1.714966481376543e-07, "loss": 0.7362116, "num_input_tokens_seen": 156746100, "step": 7245, "time_per_iteration": 2.4658870697021484 }, { "auxiliary_loss_clip": 0.01149281, "auxiliary_loss_mlp": 0.01023633, "balance_loss_clip": 1.04388595, "balance_loss_mlp": 1.0166893, "epoch": 0.8712799855708532, "flos": 28256526731520.0, "grad_norm": 1.9632740234594193, "language_loss": 0.83133149, "learning_rate": 1.7118118902099797e-07, "loss": 0.8530606, "num_input_tokens_seen": 156764185, "step": 7246, "time_per_iteration": 2.474360466003418 }, { "auxiliary_loss_clip": 0.01150039, "auxiliary_loss_mlp": 0.01029283, "balance_loss_clip": 1.04477, "balance_loss_mlp": 1.02224934, "epoch": 0.8714002284614922, "flos": 22236677665920.0, "grad_norm": 1.5796608125500102, "language_loss": 0.8056609, "learning_rate": 1.7086600733470146e-07, "loss": 0.82745409, "num_input_tokens_seen": 156784855, "step": 7247, "time_per_iteration": 2.445566177368164 }, { "auxiliary_loss_clip": 0.01147125, "auxiliary_loss_mlp": 0.01026997, "balance_loss_clip": 1.04464507, "balance_loss_mlp": 1.02038074, "epoch": 0.8715204713521313, "flos": 21431919404160.0, "grad_norm": 1.935767532876687, "language_loss": 0.77335012, "learning_rate": 1.7055110312657738e-07, "loss": 0.79509139, "num_input_tokens_seen": 156804350, "step": 7248, "time_per_iteration": 2.4370718002319336 }, { "auxiliary_loss_clip": 0.01132677, "auxiliary_loss_mlp": 0.01027483, "balance_loss_clip": 1.04434276, "balance_loss_mlp": 1.01993704, "epoch": 0.8716407142427703, "flos": 23440439180160.0, "grad_norm": 2.5153054770464633, "language_loss": 0.74408627, "learning_rate": 1.702364764443962e-07, "loss": 0.76568788, "num_input_tokens_seen": 156823425, "step": 7249, "time_per_iteration": 3.2174181938171387 }, { "auxiliary_loss_clip": 0.01089067, "auxiliary_loss_mlp": 0.01021527, "balance_loss_clip": 1.03563094, "balance_loss_mlp": 1.0139842, "epoch": 0.8717609571334095, "flos": 27958683156480.0, "grad_norm": 1.8951449523988524, "language_loss": 0.72447848, "learning_rate": 1.6992212733588685e-07, "loss": 0.74558449, "num_input_tokens_seen": 156843090, "step": 7250, "time_per_iteration": 2.640659809112549 }, { "auxiliary_loss_clip": 0.01133421, "auxiliary_loss_mlp": 0.01024201, "balance_loss_clip": 1.04252803, "balance_loss_mlp": 1.01718819, "epoch": 0.8718812000240486, "flos": 25479482538240.0, "grad_norm": 1.7649682355255958, "language_loss": 0.75195658, "learning_rate": 1.6960805584873538e-07, "loss": 0.77353281, "num_input_tokens_seen": 156861090, "step": 7251, "time_per_iteration": 2.500795841217041 }, { "auxiliary_loss_clip": 0.01111381, "auxiliary_loss_mlp": 0.01025281, "balance_loss_clip": 1.04045248, "balance_loss_mlp": 1.01842391, "epoch": 0.8720014429146876, "flos": 23403056100480.0, "grad_norm": 2.812243738197062, "language_loss": 0.78263247, "learning_rate": 1.6929426203058684e-07, "loss": 0.80399907, "num_input_tokens_seen": 156881515, "step": 7252, "time_per_iteration": 2.5852882862091064 }, { "auxiliary_loss_clip": 0.01167568, "auxiliary_loss_mlp": 0.00762262, "balance_loss_clip": 1.04532886, "balance_loss_mlp": 1.00051188, "epoch": 0.8721216858053268, "flos": 24352821567360.0, "grad_norm": 2.770436258071441, "language_loss": 0.80376977, "learning_rate": 1.689807459290431e-07, "loss": 0.82306808, "num_input_tokens_seen": 156900170, "step": 7253, "time_per_iteration": 2.434401512145996 }, { "auxiliary_loss_clip": 0.01137395, "auxiliary_loss_mlp": 0.01023688, "balance_loss_clip": 1.04412496, "balance_loss_mlp": 1.01732874, "epoch": 0.8722419286959658, "flos": 33869687034240.0, "grad_norm": 1.9828134672737794, "language_loss": 0.70552158, "learning_rate": 1.6866750759166437e-07, "loss": 0.72713244, "num_input_tokens_seen": 156920150, "step": 7254, "time_per_iteration": 2.590894937515259 }, { "auxiliary_loss_clip": 0.01117179, "auxiliary_loss_mlp": 0.01021353, "balance_loss_clip": 1.03890204, "balance_loss_mlp": 1.0138483, "epoch": 0.8723621715866049, "flos": 18369385914240.0, "grad_norm": 2.325752208452468, "language_loss": 0.77237034, "learning_rate": 1.6835454706596865e-07, "loss": 0.79375571, "num_input_tokens_seen": 156937980, "step": 7255, "time_per_iteration": 2.482780933380127 }, { "auxiliary_loss_clip": 0.01165945, "auxiliary_loss_mlp": 0.01026586, "balance_loss_clip": 1.04769075, "balance_loss_mlp": 1.01935589, "epoch": 0.8724824144772441, "flos": 22013348855040.0, "grad_norm": 2.354655660311847, "language_loss": 0.737064, "learning_rate": 1.680418643994317e-07, "loss": 0.75898921, "num_input_tokens_seen": 156956550, "step": 7256, "time_per_iteration": 2.4169082641601562 }, { "auxiliary_loss_clip": 0.01062112, "auxiliary_loss_mlp": 0.01001034, "balance_loss_clip": 1.00829339, "balance_loss_mlp": 1.00016332, "epoch": 0.8726026573678831, "flos": 66698720213760.0, "grad_norm": 0.8921880064192208, "language_loss": 0.64547169, "learning_rate": 1.6772945963948738e-07, "loss": 0.66610312, "num_input_tokens_seen": 157014715, "step": 7257, "time_per_iteration": 3.026306629180908 }, { "auxiliary_loss_clip": 0.01131867, "auxiliary_loss_mlp": 0.01023775, "balance_loss_clip": 1.04413319, "balance_loss_mlp": 1.01699543, "epoch": 0.8727229002585222, "flos": 13370908078080.0, "grad_norm": 2.9417976147240394, "language_loss": 0.77364326, "learning_rate": 1.6741733283352733e-07, "loss": 0.79519969, "num_input_tokens_seen": 157032320, "step": 7258, "time_per_iteration": 2.4624507427215576 }, { "auxiliary_loss_clip": 0.01116296, "auxiliary_loss_mlp": 0.01026885, "balance_loss_clip": 1.0433048, "balance_loss_mlp": 1.01999807, "epoch": 0.8728431431491613, "flos": 21796987282560.0, "grad_norm": 1.5145087344391084, "language_loss": 0.8384738, "learning_rate": 1.6710548402890102e-07, "loss": 0.8599056, "num_input_tokens_seen": 157052845, "step": 7259, "time_per_iteration": 2.553818941116333 }, { "auxiliary_loss_clip": 0.01168785, "auxiliary_loss_mlp": 0.01024988, "balance_loss_clip": 1.04685569, "balance_loss_mlp": 1.01740909, "epoch": 0.8729633860398004, "flos": 36173823742080.0, "grad_norm": 1.8175940698179587, "language_loss": 0.66873902, "learning_rate": 1.6679391327291527e-07, "loss": 0.69067675, "num_input_tokens_seen": 157074050, "step": 7260, "time_per_iteration": 2.5198378562927246 }, { "auxiliary_loss_clip": 0.01135614, "auxiliary_loss_mlp": 0.01023822, "balance_loss_clip": 1.04127288, "balance_loss_mlp": 1.01706612, "epoch": 0.8730836289304394, "flos": 16359680989440.0, "grad_norm": 4.247101649933086, "language_loss": 0.67918015, "learning_rate": 1.6648262061283492e-07, "loss": 0.70077455, "num_input_tokens_seen": 157089350, "step": 7261, "time_per_iteration": 2.42807936668396 }, { "auxiliary_loss_clip": 0.01121677, "auxiliary_loss_mlp": 0.01024106, "balance_loss_clip": 1.03930736, "balance_loss_mlp": 1.01723695, "epoch": 0.8732038718210786, "flos": 21215126868480.0, "grad_norm": 1.877826568790639, "language_loss": 0.73299396, "learning_rate": 1.6617160609588353e-07, "loss": 0.75445175, "num_input_tokens_seen": 157108525, "step": 7262, "time_per_iteration": 2.4914398193359375 }, { "auxiliary_loss_clip": 0.01142879, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 1.04569161, "balance_loss_mlp": 1.01995111, "epoch": 0.8733241147117177, "flos": 16610696208000.0, "grad_norm": 4.5376051386047065, "language_loss": 0.71651846, "learning_rate": 1.6586086976924163e-07, "loss": 0.73822308, "num_input_tokens_seen": 157124025, "step": 7263, "time_per_iteration": 2.4247589111328125 }, { "auxiliary_loss_clip": 0.01150642, "auxiliary_loss_mlp": 0.01021705, "balance_loss_clip": 1.04396796, "balance_loss_mlp": 1.01519895, "epoch": 0.8734443576023567, "flos": 20193935207040.0, "grad_norm": 1.936928895674092, "language_loss": 0.7832166, "learning_rate": 1.6555041168004747e-07, "loss": 0.80494004, "num_input_tokens_seen": 157143345, "step": 7264, "time_per_iteration": 2.4338228702545166 }, { "auxiliary_loss_clip": 0.01130214, "auxiliary_loss_mlp": 0.01022533, "balance_loss_clip": 1.04194391, "balance_loss_mlp": 1.01580715, "epoch": 0.8735646004929959, "flos": 18041162411520.0, "grad_norm": 1.7579067613630046, "language_loss": 0.69390976, "learning_rate": 1.6524023187539715e-07, "loss": 0.71543723, "num_input_tokens_seen": 157161630, "step": 7265, "time_per_iteration": 2.4513816833496094 }, { "auxiliary_loss_clip": 0.01137891, "auxiliary_loss_mlp": 0.01023288, "balance_loss_clip": 1.04442573, "balance_loss_mlp": 1.01650167, "epoch": 0.873684843383635, "flos": 20262344659200.0, "grad_norm": 2.0068267346133606, "language_loss": 0.74740982, "learning_rate": 1.649303304023446e-07, "loss": 0.76902163, "num_input_tokens_seen": 157181385, "step": 7266, "time_per_iteration": 2.471242666244507 }, { "auxiliary_loss_clip": 0.011177, "auxiliary_loss_mlp": 0.01022204, "balance_loss_clip": 1.04360771, "balance_loss_mlp": 1.01524258, "epoch": 0.873805086274274, "flos": 16947287579520.0, "grad_norm": 1.8667044926312626, "language_loss": 0.78711295, "learning_rate": 1.6462070730790246e-07, "loss": 0.80851197, "num_input_tokens_seen": 157200545, "step": 7267, "time_per_iteration": 2.478363275527954 }, { "auxiliary_loss_clip": 0.01132664, "auxiliary_loss_mlp": 0.01024071, "balance_loss_clip": 1.0407896, "balance_loss_mlp": 1.01688588, "epoch": 0.8739253291649132, "flos": 18041270152320.0, "grad_norm": 2.6493317700626657, "language_loss": 0.7863695, "learning_rate": 1.6431136263903912e-07, "loss": 0.80793685, "num_input_tokens_seen": 157219545, "step": 7268, "time_per_iteration": 4.352779865264893 }, { "auxiliary_loss_clip": 0.01153717, "auxiliary_loss_mlp": 0.00761389, "balance_loss_clip": 1.04387498, "balance_loss_mlp": 1.00046408, "epoch": 0.8740455720555522, "flos": 21325085377920.0, "grad_norm": 2.452105333380181, "language_loss": 0.73743081, "learning_rate": 1.6400229644268282e-07, "loss": 0.7565819, "num_input_tokens_seen": 157237900, "step": 7269, "time_per_iteration": 2.4299886226654053 }, { "auxiliary_loss_clip": 0.01118299, "auxiliary_loss_mlp": 0.01028769, "balance_loss_clip": 1.04583395, "balance_loss_mlp": 1.0210557, "epoch": 0.8741658149461913, "flos": 15158684822400.0, "grad_norm": 1.8136251130857342, "language_loss": 0.8119356, "learning_rate": 1.6369350876571852e-07, "loss": 0.83340633, "num_input_tokens_seen": 157256055, "step": 7270, "time_per_iteration": 3.328937292098999 }, { "auxiliary_loss_clip": 0.01105456, "auxiliary_loss_mlp": 0.01023493, "balance_loss_clip": 1.0391233, "balance_loss_mlp": 1.01657021, "epoch": 0.8742860578368304, "flos": 23039855729280.0, "grad_norm": 1.8416639093662597, "language_loss": 0.81421393, "learning_rate": 1.6338499965498874e-07, "loss": 0.83550346, "num_input_tokens_seen": 157274785, "step": 7271, "time_per_iteration": 2.5355899333953857 }, { "auxiliary_loss_clip": 0.01120467, "auxiliary_loss_mlp": 0.01030033, "balance_loss_clip": 1.04358613, "balance_loss_mlp": 1.02230835, "epoch": 0.8744063007274695, "flos": 28145347159680.0, "grad_norm": 1.439658537946952, "language_loss": 0.77447951, "learning_rate": 1.630767691572943e-07, "loss": 0.79598451, "num_input_tokens_seen": 157294805, "step": 7272, "time_per_iteration": 2.5608465671539307 }, { "auxiliary_loss_clip": 0.01042944, "auxiliary_loss_mlp": 0.0100128, "balance_loss_clip": 1.00839281, "balance_loss_mlp": 1.00034416, "epoch": 0.8745265436181086, "flos": 64034076654720.0, "grad_norm": 0.7358329192011749, "language_loss": 0.53526986, "learning_rate": 1.6276881731939306e-07, "loss": 0.5557121, "num_input_tokens_seen": 157356695, "step": 7273, "time_per_iteration": 3.1215240955352783 }, { "auxiliary_loss_clip": 0.01146961, "auxiliary_loss_mlp": 0.0102357, "balance_loss_clip": 1.04521716, "balance_loss_mlp": 1.01659966, "epoch": 0.8746467865087477, "flos": 28658618553600.0, "grad_norm": 1.852533439932542, "language_loss": 0.75572109, "learning_rate": 1.6246114418800193e-07, "loss": 0.77742648, "num_input_tokens_seen": 157376975, "step": 7274, "time_per_iteration": 2.499825954437256 }, { "auxiliary_loss_clip": 0.01145811, "auxiliary_loss_mlp": 0.01026986, "balance_loss_clip": 1.04466081, "balance_loss_mlp": 1.01937413, "epoch": 0.8747670293993868, "flos": 23985850268160.0, "grad_norm": 1.7093377120631499, "language_loss": 0.76604038, "learning_rate": 1.6215374980979423e-07, "loss": 0.78776836, "num_input_tokens_seen": 157397385, "step": 7275, "time_per_iteration": 2.4932079315185547 }, { "auxiliary_loss_clip": 0.01145528, "auxiliary_loss_mlp": 0.01027972, "balance_loss_clip": 1.04585576, "balance_loss_mlp": 1.02143073, "epoch": 0.8748872722900258, "flos": 45221624478720.0, "grad_norm": 1.9740980187618742, "language_loss": 0.68492776, "learning_rate": 1.6184663423140133e-07, "loss": 0.70666277, "num_input_tokens_seen": 157417685, "step": 7276, "time_per_iteration": 3.3338396549224854 }, { "auxiliary_loss_clip": 0.01112649, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.04239416, "balance_loss_mlp": 1.02246654, "epoch": 0.875007515180665, "flos": 19754280737280.0, "grad_norm": 1.7860318129655117, "language_loss": 0.63926178, "learning_rate": 1.615397974994126e-07, "loss": 0.66068155, "num_input_tokens_seen": 157435490, "step": 7277, "time_per_iteration": 2.5295870304107666 }, { "auxiliary_loss_clip": 0.01162282, "auxiliary_loss_mlp": 0.01022699, "balance_loss_clip": 1.04633284, "balance_loss_mlp": 1.01594031, "epoch": 0.875127758071304, "flos": 22710734386560.0, "grad_norm": 1.4724612628082139, "language_loss": 0.80776554, "learning_rate": 1.6123323966037438e-07, "loss": 0.82961535, "num_input_tokens_seen": 157454010, "step": 7278, "time_per_iteration": 2.4248545169830322 }, { "auxiliary_loss_clip": 0.01165152, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.04795313, "balance_loss_mlp": 1.02484655, "epoch": 0.8752480009619431, "flos": 23403846199680.0, "grad_norm": 2.220548201688487, "language_loss": 0.78453571, "learning_rate": 1.6092696076079216e-07, "loss": 0.80650693, "num_input_tokens_seen": 157472385, "step": 7279, "time_per_iteration": 2.405405044555664 }, { "auxiliary_loss_clip": 0.01111073, "auxiliary_loss_mlp": 0.01022329, "balance_loss_clip": 1.04066515, "balance_loss_mlp": 1.01545322, "epoch": 0.8753682438525822, "flos": 26213101914240.0, "grad_norm": 1.5701699245583525, "language_loss": 0.74108648, "learning_rate": 1.6062096084712785e-07, "loss": 0.76242048, "num_input_tokens_seen": 157493735, "step": 7280, "time_per_iteration": 2.5235345363616943 }, { "auxiliary_loss_clip": 0.01127196, "auxiliary_loss_mlp": 0.00761459, "balance_loss_clip": 1.03945374, "balance_loss_mlp": 1.00047183, "epoch": 0.8754884867432213, "flos": 23326745656320.0, "grad_norm": 2.105885957597929, "language_loss": 0.70599437, "learning_rate": 1.6031523996580098e-07, "loss": 0.72488093, "num_input_tokens_seen": 157511295, "step": 7281, "time_per_iteration": 2.472104549407959 }, { "auxiliary_loss_clip": 0.01131526, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.04429686, "balance_loss_mlp": 1.01880455, "epoch": 0.8756087296338604, "flos": 12495226412160.0, "grad_norm": 2.5097686155391026, "language_loss": 0.66421473, "learning_rate": 1.6000979816318981e-07, "loss": 0.68579155, "num_input_tokens_seen": 157529760, "step": 7282, "time_per_iteration": 2.4813733100891113 }, { "auxiliary_loss_clip": 0.01144003, "auxiliary_loss_mlp": 0.01020992, "balance_loss_clip": 1.04466677, "balance_loss_mlp": 1.01388156, "epoch": 0.8757289725244994, "flos": 18952898353920.0, "grad_norm": 2.1019711633317932, "language_loss": 0.75089264, "learning_rate": 1.5970463548562886e-07, "loss": 0.7725426, "num_input_tokens_seen": 157548915, "step": 7283, "time_per_iteration": 2.4103548526763916 }, { "auxiliary_loss_clip": 0.01135534, "auxiliary_loss_mlp": 0.01023007, "balance_loss_clip": 1.0453856, "balance_loss_mlp": 1.01614642, "epoch": 0.8758492154151386, "flos": 25265958140160.0, "grad_norm": 1.794976923770229, "language_loss": 0.70912701, "learning_rate": 1.5939975197941192e-07, "loss": 0.73071241, "num_input_tokens_seen": 157570570, "step": 7284, "time_per_iteration": 2.501824378967285 }, { "auxiliary_loss_clip": 0.01044147, "auxiliary_loss_mlp": 0.01000643, "balance_loss_clip": 1.00908566, "balance_loss_mlp": 0.99974883, "epoch": 0.8759694583057777, "flos": 65571664193280.0, "grad_norm": 0.8480058578704652, "language_loss": 0.53354669, "learning_rate": 1.5909514769078892e-07, "loss": 0.55399466, "num_input_tokens_seen": 157635675, "step": 7285, "time_per_iteration": 3.11342453956604 }, { "auxiliary_loss_clip": 0.01115919, "auxiliary_loss_mlp": 0.01025474, "balance_loss_clip": 1.04525328, "balance_loss_mlp": 1.01875901, "epoch": 0.8760897011964167, "flos": 25446193608960.0, "grad_norm": 1.5350151270816048, "language_loss": 0.77772117, "learning_rate": 1.5879082266596867e-07, "loss": 0.79913509, "num_input_tokens_seen": 157657015, "step": 7286, "time_per_iteration": 2.5437259674072266 }, { "auxiliary_loss_clip": 0.01128602, "auxiliary_loss_mlp": 0.01022283, "balance_loss_clip": 1.03911042, "balance_loss_mlp": 1.01504064, "epoch": 0.8762099440870559, "flos": 28984830894720.0, "grad_norm": 1.832594416387367, "language_loss": 0.71956927, "learning_rate": 1.5848677695111645e-07, "loss": 0.74107808, "num_input_tokens_seen": 157678615, "step": 7287, "time_per_iteration": 2.5251834392547607 }, { "auxiliary_loss_clip": 0.01127496, "auxiliary_loss_mlp": 0.01024736, "balance_loss_clip": 1.04283404, "balance_loss_mlp": 1.01744914, "epoch": 0.8763301869776949, "flos": 21609461352960.0, "grad_norm": 2.254454354289077, "language_loss": 0.69463563, "learning_rate": 1.5818301059235562e-07, "loss": 0.71615803, "num_input_tokens_seen": 157693790, "step": 7288, "time_per_iteration": 2.4859535694122314 }, { "auxiliary_loss_clip": 0.01137619, "auxiliary_loss_mlp": 0.01020425, "balance_loss_clip": 1.04543161, "balance_loss_mlp": 1.01324582, "epoch": 0.876450429868334, "flos": 24644416176000.0, "grad_norm": 1.8075034109766241, "language_loss": 0.813995, "learning_rate": 1.578795236357684e-07, "loss": 0.83557546, "num_input_tokens_seen": 157715255, "step": 7289, "time_per_iteration": 2.508932590484619 }, { "auxiliary_loss_clip": 0.0113726, "auxiliary_loss_mlp": 0.01022899, "balance_loss_clip": 1.04508615, "balance_loss_mlp": 1.01613379, "epoch": 0.8765706727589732, "flos": 20260046188800.0, "grad_norm": 3.5040755570939806, "language_loss": 0.85885584, "learning_rate": 1.5757631612739218e-07, "loss": 0.8804574, "num_input_tokens_seen": 157728800, "step": 7290, "time_per_iteration": 2.4742484092712402 }, { "auxiliary_loss_clip": 0.01061895, "auxiliary_loss_mlp": 0.01001848, "balance_loss_clip": 1.0079484, "balance_loss_mlp": 1.00092435, "epoch": 0.8766909156496122, "flos": 71371165276800.0, "grad_norm": 0.7815669485717385, "language_loss": 0.61447883, "learning_rate": 1.572733881132242e-07, "loss": 0.63511622, "num_input_tokens_seen": 157789445, "step": 7291, "time_per_iteration": 3.063943386077881 }, { "auxiliary_loss_clip": 0.01028644, "auxiliary_loss_mlp": 0.01002972, "balance_loss_clip": 1.01076031, "balance_loss_mlp": 1.00190496, "epoch": 0.8768111585402513, "flos": 69523490603520.0, "grad_norm": 0.7819245990748084, "language_loss": 0.58509946, "learning_rate": 1.5697073963921814e-07, "loss": 0.60541558, "num_input_tokens_seen": 157848685, "step": 7292, "time_per_iteration": 3.016230344772339 }, { "auxiliary_loss_clip": 0.01151952, "auxiliary_loss_mlp": 0.01018753, "balance_loss_clip": 1.04669309, "balance_loss_mlp": 1.01141024, "epoch": 0.8769314014308904, "flos": 18838558385280.0, "grad_norm": 2.264413964133642, "language_loss": 0.84785247, "learning_rate": 1.566683707512857e-07, "loss": 0.86955953, "num_input_tokens_seen": 157866360, "step": 7293, "time_per_iteration": 2.421010971069336 }, { "auxiliary_loss_clip": 0.0113355, "auxiliary_loss_mlp": 0.01028772, "balance_loss_clip": 1.04351854, "balance_loss_mlp": 1.02105916, "epoch": 0.8770516443215295, "flos": 14976402278400.0, "grad_norm": 1.9243735443994014, "language_loss": 0.79577112, "learning_rate": 1.5636628149529553e-07, "loss": 0.81739438, "num_input_tokens_seen": 157884150, "step": 7294, "time_per_iteration": 3.179569959640503 }, { "auxiliary_loss_clip": 0.01133557, "auxiliary_loss_mlp": 0.01022716, "balance_loss_clip": 1.04266167, "balance_loss_mlp": 1.01615024, "epoch": 0.8771718872121685, "flos": 31649654021760.0, "grad_norm": 2.573076373108681, "language_loss": 0.79259431, "learning_rate": 1.560644719170743e-07, "loss": 0.81415707, "num_input_tokens_seen": 157905020, "step": 7295, "time_per_iteration": 3.405374765396118 }, { "auxiliary_loss_clip": 0.01120247, "auxiliary_loss_mlp": 0.01025591, "balance_loss_clip": 1.04078662, "balance_loss_mlp": 1.01785469, "epoch": 0.8772921301028077, "flos": 36095466222720.0, "grad_norm": 1.7139196897832787, "language_loss": 0.72097123, "learning_rate": 1.5576294206240692e-07, "loss": 0.74242961, "num_input_tokens_seen": 157924545, "step": 7296, "time_per_iteration": 2.63200306892395 }, { "auxiliary_loss_clip": 0.01134486, "auxiliary_loss_mlp": 0.01024436, "balance_loss_clip": 1.04464006, "balance_loss_mlp": 1.01770353, "epoch": 0.8774123729934468, "flos": 57116961849600.0, "grad_norm": 1.599752716293309, "language_loss": 0.67424309, "learning_rate": 1.5546169197703507e-07, "loss": 0.69583237, "num_input_tokens_seen": 157950820, "step": 7297, "time_per_iteration": 3.6132898330688477 }, { "auxiliary_loss_clip": 0.01141044, "auxiliary_loss_mlp": 0.01026423, "balance_loss_clip": 1.04180646, "balance_loss_mlp": 1.0195744, "epoch": 0.8775326158840858, "flos": 23914495900800.0, "grad_norm": 2.819588830314426, "language_loss": 0.77361566, "learning_rate": 1.5516072170665774e-07, "loss": 0.79529035, "num_input_tokens_seen": 157968790, "step": 7298, "time_per_iteration": 2.4898064136505127 }, { "auxiliary_loss_clip": 0.01151527, "auxiliary_loss_mlp": 0.01020035, "balance_loss_clip": 1.04496181, "balance_loss_mlp": 1.01342487, "epoch": 0.877652858774725, "flos": 17123285243520.0, "grad_norm": 1.9497699785209313, "language_loss": 0.86900139, "learning_rate": 1.5486003129693214e-07, "loss": 0.89071703, "num_input_tokens_seen": 157986155, "step": 7299, "time_per_iteration": 2.4063479900360107 }, { "auxiliary_loss_clip": 0.01151676, "auxiliary_loss_mlp": 0.01021452, "balance_loss_clip": 1.04569602, "balance_loss_mlp": 1.01438928, "epoch": 0.877773101665364, "flos": 16508961912960.0, "grad_norm": 1.9450436910258881, "language_loss": 0.78161234, "learning_rate": 1.545596207934725e-07, "loss": 0.80334359, "num_input_tokens_seen": 158004640, "step": 7300, "time_per_iteration": 2.410071611404419 }, { "auxiliary_loss_clip": 0.01128689, "auxiliary_loss_mlp": 0.0102387, "balance_loss_clip": 1.04172814, "balance_loss_mlp": 1.01659501, "epoch": 0.8778933445560031, "flos": 22053209973120.0, "grad_norm": 1.7879151151134898, "language_loss": 0.77681673, "learning_rate": 1.5425949024185147e-07, "loss": 0.79834229, "num_input_tokens_seen": 158024665, "step": 7301, "time_per_iteration": 2.486607313156128 }, { "auxiliary_loss_clip": 0.01138102, "auxiliary_loss_mlp": 0.01025856, "balance_loss_clip": 1.04291797, "balance_loss_mlp": 1.01881361, "epoch": 0.8780135874466423, "flos": 22564757514240.0, "grad_norm": 2.075972349259541, "language_loss": 0.67669159, "learning_rate": 1.5395963968759818e-07, "loss": 0.69833124, "num_input_tokens_seen": 158044940, "step": 7302, "time_per_iteration": 3.2607688903808594 }, { "auxiliary_loss_clip": 0.01137313, "auxiliary_loss_mlp": 0.01020815, "balance_loss_clip": 1.04224372, "balance_loss_mlp": 1.01370716, "epoch": 0.8781338303372813, "flos": 61531999073280.0, "grad_norm": 1.5629112086878276, "language_loss": 0.64401269, "learning_rate": 1.536600691761998e-07, "loss": 0.66559398, "num_input_tokens_seen": 158070770, "step": 7303, "time_per_iteration": 2.8303260803222656 }, { "auxiliary_loss_clip": 0.01127785, "auxiliary_loss_mlp": 0.0102383, "balance_loss_clip": 1.04722524, "balance_loss_mlp": 1.01747036, "epoch": 0.8782540732279204, "flos": 22674751937280.0, "grad_norm": 2.908706747026936, "language_loss": 0.71378052, "learning_rate": 1.5336077875310084e-07, "loss": 0.73529673, "num_input_tokens_seen": 158089995, "step": 7304, "time_per_iteration": 2.513988971710205 }, { "auxiliary_loss_clip": 0.01112788, "auxiliary_loss_mlp": 0.01021601, "balance_loss_clip": 1.04157197, "balance_loss_mlp": 1.01475227, "epoch": 0.8783743161185595, "flos": 16070348937600.0, "grad_norm": 1.8643091032824237, "language_loss": 0.74091262, "learning_rate": 1.5306176846370321e-07, "loss": 0.7622565, "num_input_tokens_seen": 158108140, "step": 7305, "time_per_iteration": 2.500054121017456 }, { "auxiliary_loss_clip": 0.01144422, "auxiliary_loss_mlp": 0.01032515, "balance_loss_clip": 1.04369247, "balance_loss_mlp": 1.0251832, "epoch": 0.8784945590091986, "flos": 26067879227520.0, "grad_norm": 2.006700364624822, "language_loss": 0.74064058, "learning_rate": 1.5276303835336712e-07, "loss": 0.76240993, "num_input_tokens_seen": 158128680, "step": 7306, "time_per_iteration": 2.5232291221618652 }, { "auxiliary_loss_clip": 0.01053913, "auxiliary_loss_mlp": 0.01000644, "balance_loss_clip": 1.00853157, "balance_loss_mlp": 0.99977398, "epoch": 0.8786148018998376, "flos": 62720643939840.0, "grad_norm": 0.7645305825577966, "language_loss": 0.53551519, "learning_rate": 1.524645884674094e-07, "loss": 0.55606073, "num_input_tokens_seen": 158185610, "step": 7307, "time_per_iteration": 3.014658212661743 }, { "auxiliary_loss_clip": 0.01164924, "auxiliary_loss_mlp": 0.00762419, "balance_loss_clip": 1.04495406, "balance_loss_mlp": 1.00043738, "epoch": 0.8787350447904768, "flos": 21652734263040.0, "grad_norm": 3.863340952209873, "language_loss": 0.79001743, "learning_rate": 1.521664188511047e-07, "loss": 0.80929089, "num_input_tokens_seen": 158205635, "step": 7308, "time_per_iteration": 2.4349019527435303 }, { "auxiliary_loss_clip": 0.01139687, "auxiliary_loss_mlp": 0.00761752, "balance_loss_clip": 1.04875147, "balance_loss_mlp": 1.00054598, "epoch": 0.8788552876811159, "flos": 25478476957440.0, "grad_norm": 1.8669475001719467, "language_loss": 0.80641675, "learning_rate": 1.518685295496851e-07, "loss": 0.82543111, "num_input_tokens_seen": 158223495, "step": 7309, "time_per_iteration": 2.5166094303131104 }, { "auxiliary_loss_clip": 0.0115075, "auxiliary_loss_mlp": 0.01022878, "balance_loss_clip": 1.04389, "balance_loss_mlp": 1.01619339, "epoch": 0.8789755305717549, "flos": 22310222762880.0, "grad_norm": 1.5892033441673044, "language_loss": 0.85107481, "learning_rate": 1.5157092060833975e-07, "loss": 0.87281114, "num_input_tokens_seen": 158243145, "step": 7310, "time_per_iteration": 2.471359968185425 }, { "auxiliary_loss_clip": 0.01134144, "auxiliary_loss_mlp": 0.01018158, "balance_loss_clip": 1.04197431, "balance_loss_mlp": 1.01142299, "epoch": 0.879095773462394, "flos": 29310971408640.0, "grad_norm": 1.524385952489216, "language_loss": 0.65837979, "learning_rate": 1.5127359207221658e-07, "loss": 0.67990279, "num_input_tokens_seen": 158262625, "step": 7311, "time_per_iteration": 2.5396535396575928 }, { "auxiliary_loss_clip": 0.01083491, "auxiliary_loss_mlp": 0.0102234, "balance_loss_clip": 1.03510821, "balance_loss_mlp": 1.01461518, "epoch": 0.8792160163530331, "flos": 16690023394560.0, "grad_norm": 1.8527610637499978, "language_loss": 0.73353505, "learning_rate": 1.5097654398641923e-07, "loss": 0.75459343, "num_input_tokens_seen": 158280530, "step": 7312, "time_per_iteration": 2.576570987701416 }, { "auxiliary_loss_clip": 0.01155488, "auxiliary_loss_mlp": 0.01025056, "balance_loss_clip": 1.04616594, "balance_loss_mlp": 1.01764083, "epoch": 0.8793362592436722, "flos": 24499301230080.0, "grad_norm": 1.5507347272198129, "language_loss": 0.73067212, "learning_rate": 1.5067977639601014e-07, "loss": 0.75247753, "num_input_tokens_seen": 158303290, "step": 7313, "time_per_iteration": 2.491948366165161 }, { "auxiliary_loss_clip": 0.0113365, "auxiliary_loss_mlp": 0.01024339, "balance_loss_clip": 1.04443944, "balance_loss_mlp": 1.01734138, "epoch": 0.8794565021343113, "flos": 14538399834240.0, "grad_norm": 3.035761442661135, "language_loss": 0.71191573, "learning_rate": 1.5038328934600864e-07, "loss": 0.73349553, "num_input_tokens_seen": 158319925, "step": 7314, "time_per_iteration": 2.4762117862701416 }, { "auxiliary_loss_clip": 0.01134273, "auxiliary_loss_mlp": 0.01025895, "balance_loss_clip": 1.04428303, "balance_loss_mlp": 1.01915979, "epoch": 0.8795767450249504, "flos": 39530286224640.0, "grad_norm": 2.275108579227439, "language_loss": 0.70001376, "learning_rate": 1.5008708288139161e-07, "loss": 0.72161543, "num_input_tokens_seen": 158342285, "step": 7315, "time_per_iteration": 2.642104387283325 }, { "auxiliary_loss_clip": 0.01151542, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.04667854, "balance_loss_mlp": 1.01880705, "epoch": 0.8796969879155895, "flos": 22960672197120.0, "grad_norm": 1.907857171125672, "language_loss": 0.73190546, "learning_rate": 1.497911570470931e-07, "loss": 0.75368077, "num_input_tokens_seen": 158362290, "step": 7316, "time_per_iteration": 2.471937894821167 }, { "auxiliary_loss_clip": 0.01112257, "auxiliary_loss_mlp": 0.01028597, "balance_loss_clip": 1.04162276, "balance_loss_mlp": 1.02133727, "epoch": 0.8798172308062285, "flos": 28362427004160.0, "grad_norm": 1.7013615199981986, "language_loss": 0.85545564, "learning_rate": 1.494955118880048e-07, "loss": 0.87686414, "num_input_tokens_seen": 158383275, "step": 7317, "time_per_iteration": 2.5632169246673584 }, { "auxiliary_loss_clip": 0.01150999, "auxiliary_loss_mlp": 0.01025349, "balance_loss_clip": 1.04445624, "balance_loss_mlp": 1.01834583, "epoch": 0.8799374736968677, "flos": 23988974751360.0, "grad_norm": 1.6072975066153066, "language_loss": 0.72797132, "learning_rate": 1.4920014744897634e-07, "loss": 0.74973476, "num_input_tokens_seen": 158402690, "step": 7318, "time_per_iteration": 2.5129966735839844 }, { "auxiliary_loss_clip": 0.01127804, "auxiliary_loss_mlp": 0.01018451, "balance_loss_clip": 1.04282677, "balance_loss_mlp": 1.01139379, "epoch": 0.8800577165875068, "flos": 25630271832960.0, "grad_norm": 1.9919640693505132, "language_loss": 0.86248398, "learning_rate": 1.4890506377481392e-07, "loss": 0.88394654, "num_input_tokens_seen": 158421780, "step": 7319, "time_per_iteration": 2.5378336906433105 }, { "auxiliary_loss_clip": 0.01094993, "auxiliary_loss_mlp": 0.01030529, "balance_loss_clip": 1.04215813, "balance_loss_mlp": 1.02373707, "epoch": 0.8801779594781458, "flos": 23440331439360.0, "grad_norm": 1.598586831054208, "language_loss": 0.6399948, "learning_rate": 1.486102609102815e-07, "loss": 0.66125, "num_input_tokens_seen": 158442330, "step": 7320, "time_per_iteration": 2.613802433013916 }, { "auxiliary_loss_clip": 0.01129238, "auxiliary_loss_mlp": 0.01023758, "balance_loss_clip": 1.04291725, "balance_loss_mlp": 1.01688814, "epoch": 0.880298202368785, "flos": 11508580656000.0, "grad_norm": 2.4365163959647145, "language_loss": 0.8557207, "learning_rate": 1.483157389001004e-07, "loss": 0.87725067, "num_input_tokens_seen": 158459890, "step": 7321, "time_per_iteration": 3.2902450561523438 }, { "auxiliary_loss_clip": 0.01136308, "auxiliary_loss_mlp": 0.01022919, "balance_loss_clip": 1.04140711, "balance_loss_mlp": 1.01521778, "epoch": 0.880418445259424, "flos": 22671447886080.0, "grad_norm": 2.066468582129408, "language_loss": 0.78696632, "learning_rate": 1.4802149778894933e-07, "loss": 0.80855858, "num_input_tokens_seen": 158478680, "step": 7322, "time_per_iteration": 2.4976484775543213 }, { "auxiliary_loss_clip": 0.01140318, "auxiliary_loss_mlp": 0.0102217, "balance_loss_clip": 1.04095674, "balance_loss_mlp": 1.01589322, "epoch": 0.8805386881500631, "flos": 20522158709760.0, "grad_norm": 1.7618588769863543, "language_loss": 0.87580609, "learning_rate": 1.4772753762146484e-07, "loss": 0.89743096, "num_input_tokens_seen": 158497935, "step": 7323, "time_per_iteration": 2.436676502227783 }, { "auxiliary_loss_clip": 0.01145624, "auxiliary_loss_mlp": 0.01019832, "balance_loss_clip": 1.0443821, "balance_loss_mlp": 1.01225281, "epoch": 0.8806589310407023, "flos": 36538891620480.0, "grad_norm": 1.591509823588181, "language_loss": 0.70549273, "learning_rate": 1.474338584422401e-07, "loss": 0.72714734, "num_input_tokens_seen": 158523145, "step": 7324, "time_per_iteration": 3.442779779434204 }, { "auxiliary_loss_clip": 0.01145237, "auxiliary_loss_mlp": 0.01020684, "balance_loss_clip": 1.04453063, "balance_loss_mlp": 1.01381409, "epoch": 0.8807791739313413, "flos": 23440187784960.0, "grad_norm": 1.6276190756949633, "language_loss": 0.75873756, "learning_rate": 1.4714046029582595e-07, "loss": 0.78039676, "num_input_tokens_seen": 158542210, "step": 7325, "time_per_iteration": 2.455404758453369 }, { "auxiliary_loss_clip": 0.01124952, "auxiliary_loss_mlp": 0.01022688, "balance_loss_clip": 1.0425694, "balance_loss_mlp": 1.01545227, "epoch": 0.8808994168219804, "flos": 25956843310080.0, "grad_norm": 1.818360019062367, "language_loss": 0.75533503, "learning_rate": 1.46847343226731e-07, "loss": 0.77681148, "num_input_tokens_seen": 158563250, "step": 7326, "time_per_iteration": 2.552887439727783 }, { "auxiliary_loss_clip": 0.01151537, "auxiliary_loss_mlp": 0.01023485, "balance_loss_clip": 1.04391599, "balance_loss_mlp": 1.01612389, "epoch": 0.8810196597126195, "flos": 17092079303040.0, "grad_norm": 2.0285178791236946, "language_loss": 0.69577301, "learning_rate": 1.465545072794203e-07, "loss": 0.71752322, "num_input_tokens_seen": 158581125, "step": 7327, "time_per_iteration": 2.424048900604248 }, { "auxiliary_loss_clip": 0.01107508, "auxiliary_loss_mlp": 0.01023891, "balance_loss_clip": 1.04375744, "balance_loss_mlp": 1.01673532, "epoch": 0.8811399026032586, "flos": 23002831785600.0, "grad_norm": 2.5769298564056418, "language_loss": 0.75683004, "learning_rate": 1.4626195249831774e-07, "loss": 0.778144, "num_input_tokens_seen": 158602025, "step": 7328, "time_per_iteration": 2.5792641639709473 }, { "auxiliary_loss_clip": 0.01148546, "auxiliary_loss_mlp": 0.01021431, "balance_loss_clip": 1.04422379, "balance_loss_mlp": 1.0146066, "epoch": 0.8812601454938976, "flos": 14463813242880.0, "grad_norm": 1.9528449440150601, "language_loss": 0.71760786, "learning_rate": 1.4596967892780244e-07, "loss": 0.73930764, "num_input_tokens_seen": 158618355, "step": 7329, "time_per_iteration": 3.194915771484375 }, { "auxiliary_loss_clip": 0.01162686, "auxiliary_loss_mlp": 0.01023766, "balance_loss_clip": 1.04621303, "balance_loss_mlp": 1.01692367, "epoch": 0.8813803883845368, "flos": 22493223578880.0, "grad_norm": 1.6494222605649067, "language_loss": 0.74884796, "learning_rate": 1.4567768661221314e-07, "loss": 0.77071249, "num_input_tokens_seen": 158638925, "step": 7330, "time_per_iteration": 2.4850776195526123 }, { "auxiliary_loss_clip": 0.01155302, "auxiliary_loss_mlp": 0.00761926, "balance_loss_clip": 1.04727781, "balance_loss_mlp": 1.00046337, "epoch": 0.8815006312751759, "flos": 21506901045120.0, "grad_norm": 1.9400371862733354, "language_loss": 0.74573803, "learning_rate": 1.4538597559584442e-07, "loss": 0.76491034, "num_input_tokens_seen": 158656715, "step": 7331, "time_per_iteration": 2.478764295578003 }, { "auxiliary_loss_clip": 0.01133124, "auxiliary_loss_mlp": 0.01025317, "balance_loss_clip": 1.04287696, "balance_loss_mlp": 1.01793778, "epoch": 0.8816208741658149, "flos": 22784566792320.0, "grad_norm": 2.3234348253468395, "language_loss": 0.79236937, "learning_rate": 1.4509454592294823e-07, "loss": 0.81395376, "num_input_tokens_seen": 158677200, "step": 7332, "time_per_iteration": 2.5042765140533447 }, { "auxiliary_loss_clip": 0.01126054, "auxiliary_loss_mlp": 0.00761949, "balance_loss_clip": 1.04570723, "balance_loss_mlp": 1.00043428, "epoch": 0.8817411170564541, "flos": 17779409026560.0, "grad_norm": 1.9214944616600578, "language_loss": 0.79064822, "learning_rate": 1.448033976377354e-07, "loss": 0.80952829, "num_input_tokens_seen": 158692185, "step": 7333, "time_per_iteration": 2.497204303741455 }, { "auxiliary_loss_clip": 0.01151412, "auxiliary_loss_mlp": 0.01020055, "balance_loss_clip": 1.0434835, "balance_loss_mlp": 1.01328373, "epoch": 0.8818613599470931, "flos": 18551812112640.0, "grad_norm": 1.8108215662815614, "language_loss": 0.7412011, "learning_rate": 1.445125307843713e-07, "loss": 0.76291573, "num_input_tokens_seen": 158710410, "step": 7334, "time_per_iteration": 2.422093629837036 }, { "auxiliary_loss_clip": 0.01149064, "auxiliary_loss_mlp": 0.01021286, "balance_loss_clip": 1.04654717, "balance_loss_mlp": 1.01507831, "epoch": 0.8819816028377322, "flos": 27599792417280.0, "grad_norm": 1.5808760505769883, "language_loss": 0.75867236, "learning_rate": 1.442219454069813e-07, "loss": 0.78037584, "num_input_tokens_seen": 158731435, "step": 7335, "time_per_iteration": 2.5133044719696045 }, { "auxiliary_loss_clip": 0.01110849, "auxiliary_loss_mlp": 0.01026464, "balance_loss_clip": 1.0421983, "balance_loss_mlp": 1.0197438, "epoch": 0.8821018457283714, "flos": 23404600385280.0, "grad_norm": 2.1464745733324277, "language_loss": 0.66530502, "learning_rate": 1.4393164154964676e-07, "loss": 0.68667817, "num_input_tokens_seen": 158750965, "step": 7336, "time_per_iteration": 2.555781364440918 }, { "auxiliary_loss_clip": 0.01150348, "auxiliary_loss_mlp": 0.01025165, "balance_loss_clip": 1.04768014, "balance_loss_mlp": 1.01837564, "epoch": 0.8822220886190104, "flos": 29132459792640.0, "grad_norm": 2.126371595911774, "language_loss": 0.94086659, "learning_rate": 1.4364161925640649e-07, "loss": 0.96262175, "num_input_tokens_seen": 158772365, "step": 7337, "time_per_iteration": 2.5190610885620117 }, { "auxiliary_loss_clip": 0.01163945, "auxiliary_loss_mlp": 0.01022252, "balance_loss_clip": 1.04698277, "balance_loss_mlp": 1.01548684, "epoch": 0.8823423315096495, "flos": 20485422074880.0, "grad_norm": 1.8771424008116642, "language_loss": 0.85013425, "learning_rate": 1.4335187857125663e-07, "loss": 0.87199628, "num_input_tokens_seen": 158791065, "step": 7338, "time_per_iteration": 2.4469211101531982 }, { "auxiliary_loss_clip": 0.01151542, "auxiliary_loss_mlp": 0.01020202, "balance_loss_clip": 1.04482007, "balance_loss_mlp": 1.01351452, "epoch": 0.8824625744002886, "flos": 24206377818240.0, "grad_norm": 1.707140800204605, "language_loss": 0.75492656, "learning_rate": 1.4306241953815023e-07, "loss": 0.77664405, "num_input_tokens_seen": 158812125, "step": 7339, "time_per_iteration": 2.481071949005127 }, { "auxiliary_loss_clip": 0.01152356, "auxiliary_loss_mlp": 0.01021534, "balance_loss_clip": 1.04634786, "balance_loss_mlp": 1.01454782, "epoch": 0.8825828172909277, "flos": 24679500785280.0, "grad_norm": 1.8240121142624837, "language_loss": 0.70930111, "learning_rate": 1.4277324220099862e-07, "loss": 0.73104006, "num_input_tokens_seen": 158834035, "step": 7340, "time_per_iteration": 2.4850611686706543 }, { "auxiliary_loss_clip": 0.01116506, "auxiliary_loss_mlp": 0.010231, "balance_loss_clip": 1.04049945, "balance_loss_mlp": 1.01630521, "epoch": 0.8827030601815667, "flos": 22456163721600.0, "grad_norm": 1.9610634645826541, "language_loss": 0.7435261, "learning_rate": 1.4248434660366938e-07, "loss": 0.76492214, "num_input_tokens_seen": 158853510, "step": 7341, "time_per_iteration": 2.502406358718872 }, { "auxiliary_loss_clip": 0.01134861, "auxiliary_loss_mlp": 0.01025517, "balance_loss_clip": 1.04488158, "balance_loss_mlp": 1.01854336, "epoch": 0.8828233030722058, "flos": 19865639877120.0, "grad_norm": 2.2529110446295153, "language_loss": 0.70449144, "learning_rate": 1.4219573278998808e-07, "loss": 0.7260952, "num_input_tokens_seen": 158871970, "step": 7342, "time_per_iteration": 2.465952157974243 }, { "auxiliary_loss_clip": 0.01134606, "auxiliary_loss_mlp": 0.01023129, "balance_loss_clip": 1.04135799, "balance_loss_mlp": 1.01561308, "epoch": 0.882943545962845, "flos": 39347213581440.0, "grad_norm": 2.4144026349215286, "language_loss": 0.65100288, "learning_rate": 1.4190740080373685e-07, "loss": 0.67258024, "num_input_tokens_seen": 158892250, "step": 7343, "time_per_iteration": 2.651212453842163 }, { "auxiliary_loss_clip": 0.01108483, "auxiliary_loss_mlp": 0.01024688, "balance_loss_clip": 1.04243755, "balance_loss_mlp": 1.01709437, "epoch": 0.883063788853484, "flos": 19054524908160.0, "grad_norm": 1.9286691086263699, "language_loss": 0.84236878, "learning_rate": 1.4161935068865538e-07, "loss": 0.86370045, "num_input_tokens_seen": 158907395, "step": 7344, "time_per_iteration": 2.551642656326294 }, { "auxiliary_loss_clip": 0.01164032, "auxiliary_loss_mlp": 0.01021445, "balance_loss_clip": 1.0461843, "balance_loss_mlp": 1.01421499, "epoch": 0.8831840317441231, "flos": 18733196816640.0, "grad_norm": 2.1069957650828313, "language_loss": 0.75574142, "learning_rate": 1.4133158248844113e-07, "loss": 0.77759618, "num_input_tokens_seen": 158926300, "step": 7345, "time_per_iteration": 2.3864638805389404 }, { "auxiliary_loss_clip": 0.01126088, "auxiliary_loss_mlp": 0.01023549, "balance_loss_clip": 1.04333997, "balance_loss_mlp": 1.01577067, "epoch": 0.8833042746347622, "flos": 26827712553600.0, "grad_norm": 2.282310599232799, "language_loss": 0.73526591, "learning_rate": 1.4104409624674785e-07, "loss": 0.75676233, "num_input_tokens_seen": 158946085, "step": 7346, "time_per_iteration": 2.5429930686950684 }, { "auxiliary_loss_clip": 0.01152918, "auxiliary_loss_mlp": 0.01019155, "balance_loss_clip": 1.04880488, "balance_loss_mlp": 1.01244402, "epoch": 0.8834245175254013, "flos": 26104077158400.0, "grad_norm": 1.8701023507103074, "language_loss": 0.78414667, "learning_rate": 1.407568920071873e-07, "loss": 0.80586743, "num_input_tokens_seen": 158964950, "step": 7347, "time_per_iteration": 2.4870550632476807 }, { "auxiliary_loss_clip": 0.01169903, "auxiliary_loss_mlp": 0.0102551, "balance_loss_clip": 1.04780555, "balance_loss_mlp": 1.01800549, "epoch": 0.8835447604160404, "flos": 30629036977920.0, "grad_norm": 1.8127838562903202, "language_loss": 0.68040216, "learning_rate": 1.4046996981332782e-07, "loss": 0.70235622, "num_input_tokens_seen": 158984835, "step": 7348, "time_per_iteration": 3.5452449321746826 }, { "auxiliary_loss_clip": 0.01125377, "auxiliary_loss_mlp": 0.01022474, "balance_loss_clip": 1.04293323, "balance_loss_mlp": 1.01482034, "epoch": 0.8836650033066795, "flos": 24718356322560.0, "grad_norm": 1.8635928875697585, "language_loss": 0.77988631, "learning_rate": 1.4018332970869516e-07, "loss": 0.80136484, "num_input_tokens_seen": 159002775, "step": 7349, "time_per_iteration": 2.5430126190185547 }, { "auxiliary_loss_clip": 0.01132065, "auxiliary_loss_mlp": 0.0102571, "balance_loss_clip": 1.04517722, "balance_loss_mlp": 1.01832163, "epoch": 0.8837852461973186, "flos": 25413371556480.0, "grad_norm": 1.8206539258283447, "language_loss": 0.85138249, "learning_rate": 1.398969717367733e-07, "loss": 0.87296033, "num_input_tokens_seen": 159024100, "step": 7350, "time_per_iteration": 3.366276264190674 }, { "auxiliary_loss_clip": 0.01108142, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.04441047, "balance_loss_mlp": 1.01903009, "epoch": 0.8839054890879576, "flos": 17822574195840.0, "grad_norm": 2.448918069540013, "language_loss": 0.75964421, "learning_rate": 1.396108959410014e-07, "loss": 0.78098363, "num_input_tokens_seen": 159043315, "step": 7351, "time_per_iteration": 2.516375780105591 }, { "auxiliary_loss_clip": 0.0115221, "auxiliary_loss_mlp": 0.00761962, "balance_loss_clip": 1.04712796, "balance_loss_mlp": 1.00036573, "epoch": 0.8840257319785968, "flos": 23769021818880.0, "grad_norm": 1.7945940777075668, "language_loss": 0.81374007, "learning_rate": 1.3932510236477745e-07, "loss": 0.83288181, "num_input_tokens_seen": 159063985, "step": 7352, "time_per_iteration": 2.462893486022949 }, { "auxiliary_loss_clip": 0.0114889, "auxiliary_loss_mlp": 0.01025624, "balance_loss_clip": 1.04266739, "balance_loss_mlp": 1.01796532, "epoch": 0.8841459748692359, "flos": 29059776622080.0, "grad_norm": 1.7131172184681667, "language_loss": 0.5630846, "learning_rate": 1.3903959105145636e-07, "loss": 0.58482969, "num_input_tokens_seen": 159084475, "step": 7353, "time_per_iteration": 2.5107421875 }, { "auxiliary_loss_clip": 0.01164539, "auxiliary_loss_mlp": 0.01023206, "balance_loss_clip": 1.04688239, "balance_loss_mlp": 1.01627707, "epoch": 0.8842662177598749, "flos": 24311523905280.0, "grad_norm": 2.3033963809454545, "language_loss": 0.83300853, "learning_rate": 1.387543620443492e-07, "loss": 0.85488594, "num_input_tokens_seen": 159101320, "step": 7354, "time_per_iteration": 2.418693780899048 }, { "auxiliary_loss_clip": 0.01164544, "auxiliary_loss_mlp": 0.01024318, "balance_loss_clip": 1.04751563, "balance_loss_mlp": 1.01733816, "epoch": 0.8843864606505141, "flos": 25007867942400.0, "grad_norm": 1.6967419519665319, "language_loss": 0.84051484, "learning_rate": 1.3846941538672606e-07, "loss": 0.86240351, "num_input_tokens_seen": 159120025, "step": 7355, "time_per_iteration": 2.4436275959014893 }, { "auxiliary_loss_clip": 0.01112892, "auxiliary_loss_mlp": 0.01023019, "balance_loss_clip": 1.04326117, "balance_loss_mlp": 1.01636386, "epoch": 0.8845067035411531, "flos": 28183915388160.0, "grad_norm": 2.1474783124500516, "language_loss": 0.80969095, "learning_rate": 1.3818475112181193e-07, "loss": 0.8310501, "num_input_tokens_seen": 159138820, "step": 7356, "time_per_iteration": 3.369537830352783 }, { "auxiliary_loss_clip": 0.01135514, "auxiliary_loss_mlp": 0.01023688, "balance_loss_clip": 1.04359782, "balance_loss_mlp": 1.01730442, "epoch": 0.8846269464317922, "flos": 12853219311360.0, "grad_norm": 2.0950300963596704, "language_loss": 0.80003589, "learning_rate": 1.3790036929279091e-07, "loss": 0.82162786, "num_input_tokens_seen": 159155975, "step": 7357, "time_per_iteration": 2.47066593170166 }, { "auxiliary_loss_clip": 0.01154914, "auxiliary_loss_mlp": 0.00762031, "balance_loss_clip": 1.04783964, "balance_loss_mlp": 1.00032699, "epoch": 0.8847471893224313, "flos": 18624351628800.0, "grad_norm": 2.2044592804261054, "language_loss": 0.5883671, "learning_rate": 1.3761626994280363e-07, "loss": 0.60753655, "num_input_tokens_seen": 159173445, "step": 7358, "time_per_iteration": 2.4363293647766113 }, { "auxiliary_loss_clip": 0.01129126, "auxiliary_loss_mlp": 0.01023427, "balance_loss_clip": 1.04412818, "balance_loss_mlp": 1.01628006, "epoch": 0.8848674322130704, "flos": 35769433449600.0, "grad_norm": 1.698533001841016, "language_loss": 0.73515761, "learning_rate": 1.3733245311494735e-07, "loss": 0.75668311, "num_input_tokens_seen": 159196100, "step": 7359, "time_per_iteration": 2.6968295574188232 }, { "auxiliary_loss_clip": 0.01153576, "auxiliary_loss_mlp": 0.01024562, "balance_loss_clip": 1.04804122, "balance_loss_mlp": 1.01734066, "epoch": 0.8849876751037095, "flos": 24243760897920.0, "grad_norm": 3.9645384812960938, "language_loss": 0.70654458, "learning_rate": 1.3704891885227676e-07, "loss": 0.72832602, "num_input_tokens_seen": 159216145, "step": 7360, "time_per_iteration": 2.474438428878784 }, { "auxiliary_loss_clip": 0.0112333, "auxiliary_loss_mlp": 0.01029616, "balance_loss_clip": 1.04123974, "balance_loss_mlp": 1.02166462, "epoch": 0.8851079179943486, "flos": 21500580251520.0, "grad_norm": 1.8429630333122524, "language_loss": 0.78026068, "learning_rate": 1.367656671978037e-07, "loss": 0.80179018, "num_input_tokens_seen": 159233610, "step": 7361, "time_per_iteration": 2.5289735794067383 }, { "auxiliary_loss_clip": 0.01142969, "auxiliary_loss_mlp": 0.01025879, "balance_loss_clip": 1.04348564, "balance_loss_mlp": 1.01915514, "epoch": 0.8852281608849877, "flos": 15300711198720.0, "grad_norm": 1.9542397882552758, "language_loss": 0.73432469, "learning_rate": 1.36482698194498e-07, "loss": 0.75601315, "num_input_tokens_seen": 159250155, "step": 7362, "time_per_iteration": 2.4433975219726562 }, { "auxiliary_loss_clip": 0.01136594, "auxiliary_loss_mlp": 0.01025033, "balance_loss_clip": 1.04306769, "balance_loss_mlp": 1.01749301, "epoch": 0.8853484037756267, "flos": 23295719283840.0, "grad_norm": 1.7797861929175878, "language_loss": 0.71842694, "learning_rate": 1.3620001188528506e-07, "loss": 0.74004328, "num_input_tokens_seen": 159270875, "step": 7363, "time_per_iteration": 2.519090414047241 }, { "auxiliary_loss_clip": 0.01153476, "auxiliary_loss_mlp": 0.01024235, "balance_loss_clip": 1.04402423, "balance_loss_mlp": 1.01638174, "epoch": 0.8854686466662659, "flos": 25114773795840.0, "grad_norm": 2.482945675780781, "language_loss": 0.73656273, "learning_rate": 1.3591760831304865e-07, "loss": 0.75833976, "num_input_tokens_seen": 159288565, "step": 7364, "time_per_iteration": 2.475139617919922 }, { "auxiliary_loss_clip": 0.01163315, "auxiliary_loss_mlp": 0.01022319, "balance_loss_clip": 1.04613352, "balance_loss_mlp": 1.01521993, "epoch": 0.885588889556905, "flos": 21390873137280.0, "grad_norm": 2.3943866196458363, "language_loss": 0.7963419, "learning_rate": 1.356354875206287e-07, "loss": 0.8181982, "num_input_tokens_seen": 159306400, "step": 7365, "time_per_iteration": 2.425713300704956 }, { "auxiliary_loss_clip": 0.01120097, "auxiliary_loss_mlp": 0.01021032, "balance_loss_clip": 1.04411805, "balance_loss_mlp": 1.01428771, "epoch": 0.885709132447544, "flos": 26906752431360.0, "grad_norm": 2.4027470743625323, "language_loss": 0.70194697, "learning_rate": 1.3535364955082296e-07, "loss": 0.72335827, "num_input_tokens_seen": 159326250, "step": 7366, "time_per_iteration": 2.545585870742798 }, { "auxiliary_loss_clip": 0.01161817, "auxiliary_loss_mlp": 0.01022819, "balance_loss_clip": 1.04649854, "balance_loss_mlp": 1.01589322, "epoch": 0.8858293753381832, "flos": 26103394800000.0, "grad_norm": 1.7741755039733227, "language_loss": 0.64465559, "learning_rate": 1.3507209444638613e-07, "loss": 0.66650188, "num_input_tokens_seen": 159348250, "step": 7367, "time_per_iteration": 2.4805407524108887 }, { "auxiliary_loss_clip": 0.01151003, "auxiliary_loss_mlp": 0.01023902, "balance_loss_clip": 1.04648042, "balance_loss_mlp": 1.01688075, "epoch": 0.8859496182288222, "flos": 23292810282240.0, "grad_norm": 2.0168229738428836, "language_loss": 0.74196815, "learning_rate": 1.347908222500298e-07, "loss": 0.76371723, "num_input_tokens_seen": 159368325, "step": 7368, "time_per_iteration": 2.452939510345459 }, { "auxiliary_loss_clip": 0.01112959, "auxiliary_loss_mlp": 0.01026246, "balance_loss_clip": 1.04356933, "balance_loss_mlp": 1.01949, "epoch": 0.8860698611194613, "flos": 16872916469760.0, "grad_norm": 1.9161946428212586, "language_loss": 0.69581848, "learning_rate": 1.3450983300442276e-07, "loss": 0.71721053, "num_input_tokens_seen": 159387555, "step": 7369, "time_per_iteration": 2.5182132720947266 }, { "auxiliary_loss_clip": 0.01151776, "auxiliary_loss_mlp": 0.0102322, "balance_loss_clip": 1.04609036, "balance_loss_mlp": 1.01636243, "epoch": 0.8861901040101005, "flos": 24681404206080.0, "grad_norm": 2.098912052055472, "language_loss": 0.73768014, "learning_rate": 1.3422912675219068e-07, "loss": 0.75943005, "num_input_tokens_seen": 159407310, "step": 7370, "time_per_iteration": 2.4848122596740723 }, { "auxiliary_loss_clip": 0.01161121, "auxiliary_loss_mlp": 0.01020819, "balance_loss_clip": 1.04725075, "balance_loss_mlp": 1.01467657, "epoch": 0.8863103469007395, "flos": 24423026699520.0, "grad_norm": 1.51897907599744, "language_loss": 0.79161704, "learning_rate": 1.339487035359166e-07, "loss": 0.81343639, "num_input_tokens_seen": 159427680, "step": 7371, "time_per_iteration": 2.488556385040283 }, { "auxiliary_loss_clip": 0.0113892, "auxiliary_loss_mlp": 0.00761439, "balance_loss_clip": 1.04668522, "balance_loss_mlp": 1.00043988, "epoch": 0.8864305897913786, "flos": 22053964158720.0, "grad_norm": 1.5583586568860697, "language_loss": 0.84675568, "learning_rate": 1.336685633981409e-07, "loss": 0.86575925, "num_input_tokens_seen": 159448765, "step": 7372, "time_per_iteration": 2.502751111984253 }, { "auxiliary_loss_clip": 0.01152084, "auxiliary_loss_mlp": 0.01023228, "balance_loss_clip": 1.04411387, "balance_loss_mlp": 1.01605463, "epoch": 0.8865508326820177, "flos": 19099449843840.0, "grad_norm": 1.759198767596091, "language_loss": 0.75007427, "learning_rate": 1.333887063813597e-07, "loss": 0.77182746, "num_input_tokens_seen": 159466870, "step": 7373, "time_per_iteration": 2.4545724391937256 }, { "auxiliary_loss_clip": 0.01136512, "auxiliary_loss_mlp": 0.01019296, "balance_loss_clip": 1.0419085, "balance_loss_mlp": 1.01268888, "epoch": 0.8866710755726568, "flos": 15414189240960.0, "grad_norm": 1.8081956465789568, "language_loss": 0.65971494, "learning_rate": 1.331091325280278e-07, "loss": 0.68127298, "num_input_tokens_seen": 159485840, "step": 7374, "time_per_iteration": 2.477057933807373 }, { "auxiliary_loss_clip": 0.01101656, "auxiliary_loss_mlp": 0.01024392, "balance_loss_clip": 1.04003167, "balance_loss_mlp": 1.01719809, "epoch": 0.8867913184632958, "flos": 20083689388800.0, "grad_norm": 1.629645969604547, "language_loss": 0.78244472, "learning_rate": 1.3282984188055625e-07, "loss": 0.80370522, "num_input_tokens_seen": 159505630, "step": 7375, "time_per_iteration": 4.1883156299591064 }, { "auxiliary_loss_clip": 0.01162492, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.04465771, "balance_loss_mlp": 1.02260971, "epoch": 0.8869115613539349, "flos": 23365852588800.0, "grad_norm": 1.8117834368326777, "language_loss": 0.79558909, "learning_rate": 1.3255083448131288e-07, "loss": 0.81750679, "num_input_tokens_seen": 159524675, "step": 7376, "time_per_iteration": 2.4247028827667236 }, { "auxiliary_loss_clip": 0.01152487, "auxiliary_loss_mlp": 0.010239, "balance_loss_clip": 1.04326463, "balance_loss_mlp": 1.01706338, "epoch": 0.8870318042445741, "flos": 21286840371840.0, "grad_norm": 1.9723421291595973, "language_loss": 0.78709328, "learning_rate": 1.3227211037262365e-07, "loss": 0.80885714, "num_input_tokens_seen": 159541915, "step": 7377, "time_per_iteration": 3.265744924545288 }, { "auxiliary_loss_clip": 0.01110145, "auxiliary_loss_mlp": 0.01027215, "balance_loss_clip": 1.03944755, "balance_loss_mlp": 1.01971054, "epoch": 0.8871520471352131, "flos": 20010862563840.0, "grad_norm": 2.6128971238669627, "language_loss": 0.85398757, "learning_rate": 1.319936695967696e-07, "loss": 0.8753612, "num_input_tokens_seen": 159559740, "step": 7378, "time_per_iteration": 2.5648369789123535 }, { "auxiliary_loss_clip": 0.01172738, "auxiliary_loss_mlp": 0.01023659, "balance_loss_clip": 1.04862738, "balance_loss_mlp": 1.01571369, "epoch": 0.8872722900258522, "flos": 22601422321920.0, "grad_norm": 2.4603420899076847, "language_loss": 0.82059735, "learning_rate": 1.3171551219599097e-07, "loss": 0.84256136, "num_input_tokens_seen": 159578265, "step": 7379, "time_per_iteration": 2.429309368133545 }, { "auxiliary_loss_clip": 0.01166356, "auxiliary_loss_mlp": 0.01023926, "balance_loss_clip": 1.04935789, "balance_loss_mlp": 1.01661849, "epoch": 0.8873925329164913, "flos": 22163276223360.0, "grad_norm": 10.800634261934333, "language_loss": 0.78081715, "learning_rate": 1.3143763821248377e-07, "loss": 0.80271995, "num_input_tokens_seen": 159595350, "step": 7380, "time_per_iteration": 2.423342704772949 }, { "auxiliary_loss_clip": 0.01162399, "auxiliary_loss_mlp": 0.01025604, "balance_loss_clip": 1.04613316, "balance_loss_mlp": 1.01890492, "epoch": 0.8875127758071304, "flos": 19208223204480.0, "grad_norm": 2.039672831047613, "language_loss": 0.72376847, "learning_rate": 1.3116004768840118e-07, "loss": 0.7456485, "num_input_tokens_seen": 159613725, "step": 7381, "time_per_iteration": 2.395111560821533 }, { "auxiliary_loss_clip": 0.01165582, "auxiliary_loss_mlp": 0.01027356, "balance_loss_clip": 1.04605055, "balance_loss_mlp": 1.02000403, "epoch": 0.8876330186977694, "flos": 18110900666880.0, "grad_norm": 1.605186338971091, "language_loss": 0.73751259, "learning_rate": 1.3088274066585348e-07, "loss": 0.75944197, "num_input_tokens_seen": 159631335, "step": 7382, "time_per_iteration": 2.3889708518981934 }, { "auxiliary_loss_clip": 0.01129574, "auxiliary_loss_mlp": 0.01022226, "balance_loss_clip": 1.04249358, "balance_loss_mlp": 1.01566339, "epoch": 0.8877532615884086, "flos": 22009434272640.0, "grad_norm": 2.141240953957513, "language_loss": 0.90490758, "learning_rate": 1.3060571718690749e-07, "loss": 0.92642558, "num_input_tokens_seen": 159648830, "step": 7383, "time_per_iteration": 3.247978687286377 }, { "auxiliary_loss_clip": 0.01030585, "auxiliary_loss_mlp": 0.00753116, "balance_loss_clip": 1.00662804, "balance_loss_mlp": 0.99996281, "epoch": 0.8878735044790477, "flos": 72136924346880.0, "grad_norm": 0.740767475928398, "language_loss": 0.56882465, "learning_rate": 1.3032897729358805e-07, "loss": 0.5866617, "num_input_tokens_seen": 159709785, "step": 7384, "time_per_iteration": 3.098950147628784 }, { "auxiliary_loss_clip": 0.01083905, "auxiliary_loss_mlp": 0.00762132, "balance_loss_clip": 1.03628862, "balance_loss_mlp": 1.00048435, "epoch": 0.8879937473696867, "flos": 27526355061120.0, "grad_norm": 1.8490682792470905, "language_loss": 0.799389, "learning_rate": 1.3005252102787645e-07, "loss": 0.81784934, "num_input_tokens_seen": 159728725, "step": 7385, "time_per_iteration": 2.620107412338257 }, { "auxiliary_loss_clip": 0.01153436, "auxiliary_loss_mlp": 0.01022024, "balance_loss_clip": 1.04546702, "balance_loss_mlp": 1.01478839, "epoch": 0.8881139902603259, "flos": 22234091886720.0, "grad_norm": 1.5884687384779326, "language_loss": 0.73328096, "learning_rate": 1.297763484317105e-07, "loss": 0.75503558, "num_input_tokens_seen": 159747020, "step": 7386, "time_per_iteration": 2.456625461578369 }, { "auxiliary_loss_clip": 0.0110674, "auxiliary_loss_mlp": 0.00762024, "balance_loss_clip": 1.03916097, "balance_loss_mlp": 1.00045383, "epoch": 0.888234233150965, "flos": 20299548170880.0, "grad_norm": 2.4678131581554066, "language_loss": 0.7072646, "learning_rate": 1.2950045954698551e-07, "loss": 0.72595221, "num_input_tokens_seen": 159764855, "step": 7387, "time_per_iteration": 2.5199553966522217 }, { "auxiliary_loss_clip": 0.01116769, "auxiliary_loss_mlp": 0.01023508, "balance_loss_clip": 1.04328775, "balance_loss_mlp": 1.0166651, "epoch": 0.888354476041604, "flos": 18147996437760.0, "grad_norm": 2.566538141348987, "language_loss": 0.75402319, "learning_rate": 1.2922485441555343e-07, "loss": 0.77542597, "num_input_tokens_seen": 159783935, "step": 7388, "time_per_iteration": 2.495147705078125 }, { "auxiliary_loss_clip": 0.01162016, "auxiliary_loss_mlp": 0.0102299, "balance_loss_clip": 1.04497838, "balance_loss_mlp": 1.01596832, "epoch": 0.8884747189322432, "flos": 22014282608640.0, "grad_norm": 1.746468268030024, "language_loss": 0.81769919, "learning_rate": 1.2894953307922363e-07, "loss": 0.8395493, "num_input_tokens_seen": 159802895, "step": 7389, "time_per_iteration": 2.4161171913146973 }, { "auxiliary_loss_clip": 0.01118811, "auxiliary_loss_mlp": 0.01023653, "balance_loss_clip": 1.04299176, "balance_loss_mlp": 1.01690578, "epoch": 0.8885949618228822, "flos": 19786779567360.0, "grad_norm": 2.2308530063595366, "language_loss": 0.84037381, "learning_rate": 1.2867449557976208e-07, "loss": 0.86179847, "num_input_tokens_seen": 159820995, "step": 7390, "time_per_iteration": 2.4809858798980713 }, { "auxiliary_loss_clip": 0.01150604, "auxiliary_loss_mlp": 0.01023462, "balance_loss_clip": 1.04704762, "balance_loss_mlp": 1.01650918, "epoch": 0.8887152047135213, "flos": 20047599198720.0, "grad_norm": 2.4767584532694107, "language_loss": 0.7560463, "learning_rate": 1.283997419588916e-07, "loss": 0.77778697, "num_input_tokens_seen": 159840465, "step": 7391, "time_per_iteration": 2.4775102138519287 }, { "auxiliary_loss_clip": 0.01154936, "auxiliary_loss_mlp": 0.01024917, "balance_loss_clip": 1.04622376, "balance_loss_mlp": 1.01793742, "epoch": 0.8888354476041604, "flos": 18588117784320.0, "grad_norm": 2.9011225206470748, "language_loss": 0.61898756, "learning_rate": 1.2812527225829216e-07, "loss": 0.64078605, "num_input_tokens_seen": 159858690, "step": 7392, "time_per_iteration": 2.428845167160034 }, { "auxiliary_loss_clip": 0.01158261, "auxiliary_loss_mlp": 0.01022825, "balance_loss_clip": 1.04874802, "balance_loss_mlp": 1.01490343, "epoch": 0.8889556904947995, "flos": 21689794120320.0, "grad_norm": 1.8064572492466093, "language_loss": 0.76296711, "learning_rate": 1.2785108651960052e-07, "loss": 0.784778, "num_input_tokens_seen": 159880325, "step": 7393, "time_per_iteration": 2.478649616241455 }, { "auxiliary_loss_clip": 0.01154083, "auxiliary_loss_mlp": 0.01024215, "balance_loss_clip": 1.0462513, "balance_loss_mlp": 1.01708913, "epoch": 0.8890759333854386, "flos": 27381204201600.0, "grad_norm": 1.8777480353814533, "language_loss": 0.80651295, "learning_rate": 1.2757718478441094e-07, "loss": 0.82829589, "num_input_tokens_seen": 159901070, "step": 7394, "time_per_iteration": 2.494816303253174 }, { "auxiliary_loss_clip": 0.01135999, "auxiliary_loss_mlp": 0.01020188, "balance_loss_clip": 1.04309082, "balance_loss_mlp": 1.01359856, "epoch": 0.8891961762760777, "flos": 24498834353280.0, "grad_norm": 1.698131686081044, "language_loss": 0.77360392, "learning_rate": 1.2730356709427302e-07, "loss": 0.79516578, "num_input_tokens_seen": 159919750, "step": 7395, "time_per_iteration": 2.5131702423095703 }, { "auxiliary_loss_clip": 0.01151857, "auxiliary_loss_mlp": 0.01030085, "balance_loss_clip": 1.04872251, "balance_loss_mlp": 1.02248871, "epoch": 0.8893164191667168, "flos": 41499770895360.0, "grad_norm": 1.5232726930304756, "language_loss": 0.59716618, "learning_rate": 1.2703023349069542e-07, "loss": 0.61898559, "num_input_tokens_seen": 159944600, "step": 7396, "time_per_iteration": 2.6196460723876953 }, { "auxiliary_loss_clip": 0.01149084, "auxiliary_loss_mlp": 0.01022189, "balance_loss_clip": 1.04570353, "balance_loss_mlp": 1.0152247, "epoch": 0.8894366620573558, "flos": 33583623120000.0, "grad_norm": 2.018056443920704, "language_loss": 0.61910081, "learning_rate": 1.2675718401514223e-07, "loss": 0.64081353, "num_input_tokens_seen": 159968780, "step": 7397, "time_per_iteration": 2.5523598194122314 }, { "auxiliary_loss_clip": 0.01136144, "auxiliary_loss_mlp": 0.01023348, "balance_loss_clip": 1.04382122, "balance_loss_mlp": 1.01604617, "epoch": 0.889556904947995, "flos": 16909832672640.0, "grad_norm": 1.876765995548863, "language_loss": 0.74329406, "learning_rate": 1.264844187090346e-07, "loss": 0.764889, "num_input_tokens_seen": 159985905, "step": 7398, "time_per_iteration": 2.4495253562927246 }, { "auxiliary_loss_clip": 0.01130571, "auxiliary_loss_mlp": 0.01022667, "balance_loss_clip": 1.0420146, "balance_loss_mlp": 1.01581502, "epoch": 0.889677147838634, "flos": 26030855283840.0, "grad_norm": 1.6358880314610367, "language_loss": 0.75075173, "learning_rate": 1.262119376137516e-07, "loss": 0.77228409, "num_input_tokens_seen": 160006965, "step": 7399, "time_per_iteration": 2.5144355297088623 }, { "auxiliary_loss_clip": 0.01140077, "auxiliary_loss_mlp": 0.0102124, "balance_loss_clip": 1.04162705, "balance_loss_mlp": 1.01450479, "epoch": 0.8897973907292731, "flos": 26468283110400.0, "grad_norm": 1.7458554696152109, "language_loss": 0.85329747, "learning_rate": 1.2593974077062707e-07, "loss": 0.87491059, "num_input_tokens_seen": 160028585, "step": 7400, "time_per_iteration": 2.476206064224243 }, { "auxiliary_loss_clip": 0.01113747, "auxiliary_loss_mlp": 0.01024551, "balance_loss_clip": 1.04120493, "balance_loss_mlp": 1.01775885, "epoch": 0.8899176336199123, "flos": 26249694894720.0, "grad_norm": 7.562010722876379, "language_loss": 0.63592076, "learning_rate": 1.2566782822095423e-07, "loss": 0.65730375, "num_input_tokens_seen": 160048840, "step": 7401, "time_per_iteration": 4.126861333847046 }, { "auxiliary_loss_clip": 0.01129818, "auxiliary_loss_mlp": 0.0102283, "balance_loss_clip": 1.04625154, "balance_loss_mlp": 1.01578498, "epoch": 0.8900378765105513, "flos": 20811742156800.0, "grad_norm": 2.8760480489099813, "language_loss": 0.71339417, "learning_rate": 1.2539620000598162e-07, "loss": 0.73492062, "num_input_tokens_seen": 160068175, "step": 7402, "time_per_iteration": 2.5083558559417725 }, { "auxiliary_loss_clip": 0.01164046, "auxiliary_loss_mlp": 0.01025839, "balance_loss_clip": 1.04726446, "balance_loss_mlp": 1.01882672, "epoch": 0.8901581194011904, "flos": 16472333018880.0, "grad_norm": 1.883859289469901, "language_loss": 0.79626667, "learning_rate": 1.2512485616691492e-07, "loss": 0.81816554, "num_input_tokens_seen": 160085230, "step": 7403, "time_per_iteration": 3.2173001766204834 }, { "auxiliary_loss_clip": 0.01125272, "auxiliary_loss_mlp": 0.01030064, "balance_loss_clip": 1.04240394, "balance_loss_mlp": 1.02227354, "epoch": 0.8902783622918296, "flos": 35155253773440.0, "grad_norm": 1.5119369748728388, "language_loss": 0.81071472, "learning_rate": 1.2485379674491681e-07, "loss": 0.83226812, "num_input_tokens_seen": 160111425, "step": 7404, "time_per_iteration": 2.6411101818084717 }, { "auxiliary_loss_clip": 0.01135224, "auxiliary_loss_mlp": 0.01026599, "balance_loss_clip": 1.0448761, "balance_loss_mlp": 1.01967311, "epoch": 0.8903986051824686, "flos": 17201068145280.0, "grad_norm": 2.6336134624395586, "language_loss": 0.79408747, "learning_rate": 1.2458302178110657e-07, "loss": 0.81570572, "num_input_tokens_seen": 160129790, "step": 7405, "time_per_iteration": 2.427273988723755 }, { "auxiliary_loss_clip": 0.01111209, "auxiliary_loss_mlp": 0.01019656, "balance_loss_clip": 1.04037714, "balance_loss_mlp": 1.01325107, "epoch": 0.8905188480731077, "flos": 25483863997440.0, "grad_norm": 1.941480803882672, "language_loss": 0.82519531, "learning_rate": 1.2431253131656118e-07, "loss": 0.84650397, "num_input_tokens_seen": 160149265, "step": 7406, "time_per_iteration": 2.5330002307891846 }, { "auxiliary_loss_clip": 0.01129471, "auxiliary_loss_mlp": 0.01024975, "balance_loss_clip": 1.04289412, "balance_loss_mlp": 1.01754248, "epoch": 0.8906390909637467, "flos": 23365888502400.0, "grad_norm": 3.7995477910424658, "language_loss": 0.76877761, "learning_rate": 1.240423253923133e-07, "loss": 0.79032207, "num_input_tokens_seen": 160168870, "step": 7407, "time_per_iteration": 2.478048086166382 }, { "auxiliary_loss_clip": 0.01150761, "auxiliary_loss_mlp": 0.01026083, "balance_loss_clip": 1.0448072, "balance_loss_mlp": 1.01868892, "epoch": 0.8907593338543859, "flos": 21068790860160.0, "grad_norm": 1.8226502738910912, "language_loss": 0.69473112, "learning_rate": 1.237724040493533e-07, "loss": 0.71649951, "num_input_tokens_seen": 160187495, "step": 7408, "time_per_iteration": 2.421055793762207 }, { "auxiliary_loss_clip": 0.01170912, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.05000138, "balance_loss_mlp": 1.02120709, "epoch": 0.8908795767450249, "flos": 21869562712320.0, "grad_norm": 3.31065782897963, "language_loss": 0.73056811, "learning_rate": 1.2350276732862773e-07, "loss": 0.75256515, "num_input_tokens_seen": 160208520, "step": 7409, "time_per_iteration": 2.4612503051757812 }, { "auxiliary_loss_clip": 0.01052676, "auxiliary_loss_mlp": 0.01000968, "balance_loss_clip": 1.0082022, "balance_loss_mlp": 1.00007427, "epoch": 0.890999819635664, "flos": 66307869348480.0, "grad_norm": 0.8374006158886216, "language_loss": 0.56670988, "learning_rate": 1.2323341527103993e-07, "loss": 0.5872463, "num_input_tokens_seen": 160263720, "step": 7410, "time_per_iteration": 3.659691572189331 }, { "auxiliary_loss_clip": 0.01162823, "auxiliary_loss_mlp": 0.01023182, "balance_loss_clip": 1.04615211, "balance_loss_mlp": 1.01632476, "epoch": 0.8911200625263032, "flos": 26869908055680.0, "grad_norm": 2.1003972524876815, "language_loss": 0.85460377, "learning_rate": 1.2296434791745135e-07, "loss": 0.87646377, "num_input_tokens_seen": 160282170, "step": 7411, "time_per_iteration": 2.4414825439453125 }, { "auxiliary_loss_clip": 0.01154789, "auxiliary_loss_mlp": 0.01026548, "balance_loss_clip": 1.04659259, "balance_loss_mlp": 1.01940739, "epoch": 0.8912403054169422, "flos": 20885825957760.0, "grad_norm": 2.557252075666105, "language_loss": 0.76739734, "learning_rate": 1.2269556530867875e-07, "loss": 0.78921074, "num_input_tokens_seen": 160300725, "step": 7412, "time_per_iteration": 2.450429677963257 }, { "auxiliary_loss_clip": 0.01170689, "auxiliary_loss_mlp": 0.01025854, "balance_loss_clip": 1.04911482, "balance_loss_mlp": 1.01763499, "epoch": 0.8913605483075813, "flos": 27016567286400.0, "grad_norm": 2.022584928617387, "language_loss": 0.82289821, "learning_rate": 1.2242706748549614e-07, "loss": 0.84486365, "num_input_tokens_seen": 160318720, "step": 7413, "time_per_iteration": 2.451805830001831 }, { "auxiliary_loss_clip": 0.01136557, "auxiliary_loss_mlp": 0.01019489, "balance_loss_clip": 1.04030132, "balance_loss_mlp": 1.01242292, "epoch": 0.8914807911982204, "flos": 23621500661760.0, "grad_norm": 1.7612669472426261, "language_loss": 0.82317173, "learning_rate": 1.2215885448863473e-07, "loss": 0.84473217, "num_input_tokens_seen": 160339595, "step": 7414, "time_per_iteration": 2.521181106567383 }, { "auxiliary_loss_clip": 0.01136878, "auxiliary_loss_mlp": 0.01025779, "balance_loss_clip": 1.04524505, "balance_loss_mlp": 1.01901364, "epoch": 0.8916010340888595, "flos": 24462277286400.0, "grad_norm": 1.9553151334583236, "language_loss": 0.8039254, "learning_rate": 1.2189092635878152e-07, "loss": 0.82555199, "num_input_tokens_seen": 160361045, "step": 7415, "time_per_iteration": 2.5448262691497803 }, { "auxiliary_loss_clip": 0.01113019, "auxiliary_loss_mlp": 0.01023014, "balance_loss_clip": 1.0408926, "balance_loss_mlp": 1.01594472, "epoch": 0.8917212769794985, "flos": 21215773313280.0, "grad_norm": 1.9104440364754791, "language_loss": 0.7771312, "learning_rate": 1.216232831365822e-07, "loss": 0.7984916, "num_input_tokens_seen": 160379990, "step": 7416, "time_per_iteration": 2.550075054168701 }, { "auxiliary_loss_clip": 0.01143079, "auxiliary_loss_mlp": 0.01026576, "balance_loss_clip": 1.04576087, "balance_loss_mlp": 1.01917338, "epoch": 0.8918415198701377, "flos": 25513992529920.0, "grad_norm": 1.9466832616763416, "language_loss": 0.80975842, "learning_rate": 1.2135592486263678e-07, "loss": 0.83145499, "num_input_tokens_seen": 160399240, "step": 7417, "time_per_iteration": 2.5348081588745117 }, { "auxiliary_loss_clip": 0.01135294, "auxiliary_loss_mlp": 0.01025101, "balance_loss_clip": 1.042979, "balance_loss_mlp": 1.01838028, "epoch": 0.8919617627607768, "flos": 37853006693760.0, "grad_norm": 1.6980455305116617, "language_loss": 0.61257374, "learning_rate": 1.2108885157750415e-07, "loss": 0.63417768, "num_input_tokens_seen": 160421600, "step": 7418, "time_per_iteration": 2.607163190841675 }, { "auxiliary_loss_clip": 0.01118372, "auxiliary_loss_mlp": 0.00761162, "balance_loss_clip": 1.04372573, "balance_loss_mlp": 1.00039625, "epoch": 0.8920820056514158, "flos": 26213676531840.0, "grad_norm": 1.8163406681216865, "language_loss": 0.79795241, "learning_rate": 1.2082206332169897e-07, "loss": 0.81674778, "num_input_tokens_seen": 160441695, "step": 7419, "time_per_iteration": 2.5493650436401367 }, { "auxiliary_loss_clip": 0.01133754, "auxiliary_loss_mlp": 0.01027934, "balance_loss_clip": 1.04627526, "balance_loss_mlp": 1.02039957, "epoch": 0.892202248542055, "flos": 17383135207680.0, "grad_norm": 4.941490706673702, "language_loss": 0.73491853, "learning_rate": 1.2055556013569225e-07, "loss": 0.75653541, "num_input_tokens_seen": 160457205, "step": 7420, "time_per_iteration": 2.424400568008423 }, { "auxiliary_loss_clip": 0.0113947, "auxiliary_loss_mlp": 0.01023454, "balance_loss_clip": 1.04440367, "balance_loss_mlp": 1.01654625, "epoch": 0.892322491432694, "flos": 21324223451520.0, "grad_norm": 1.8439221813257884, "language_loss": 0.82086712, "learning_rate": 1.2028934205991315e-07, "loss": 0.84249634, "num_input_tokens_seen": 160476525, "step": 7421, "time_per_iteration": 2.496135950088501 }, { "auxiliary_loss_clip": 0.01149567, "auxiliary_loss_mlp": 0.01021889, "balance_loss_clip": 1.04336703, "balance_loss_mlp": 1.01447713, "epoch": 0.8924427343233331, "flos": 24029374573440.0, "grad_norm": 1.7799896893532616, "language_loss": 0.76575053, "learning_rate": 1.2002340913474607e-07, "loss": 0.7874651, "num_input_tokens_seen": 160500160, "step": 7422, "time_per_iteration": 2.5181031227111816 }, { "auxiliary_loss_clip": 0.01165782, "auxiliary_loss_mlp": 0.01029137, "balance_loss_clip": 1.04667592, "balance_loss_mlp": 1.02164483, "epoch": 0.8925629772139723, "flos": 30008069631360.0, "grad_norm": 11.415048182270004, "language_loss": 0.74153155, "learning_rate": 1.1975776140053317e-07, "loss": 0.76348078, "num_input_tokens_seen": 160520130, "step": 7423, "time_per_iteration": 2.4831578731536865 }, { "auxiliary_loss_clip": 0.01111058, "auxiliary_loss_mlp": 0.01025745, "balance_loss_clip": 1.04205298, "balance_loss_mlp": 1.01827919, "epoch": 0.8926832201046113, "flos": 22601709630720.0, "grad_norm": 1.8767466371757187, "language_loss": 0.73635966, "learning_rate": 1.194923988975729e-07, "loss": 0.75772768, "num_input_tokens_seen": 160539730, "step": 7424, "time_per_iteration": 2.5377964973449707 }, { "auxiliary_loss_clip": 0.01119216, "auxiliary_loss_mlp": 0.01020395, "balance_loss_clip": 1.04280376, "balance_loss_mlp": 1.01238418, "epoch": 0.8928034629952504, "flos": 13297722117120.0, "grad_norm": 2.266639274637357, "language_loss": 0.74034125, "learning_rate": 1.192273216661206e-07, "loss": 0.76173735, "num_input_tokens_seen": 160557820, "step": 7425, "time_per_iteration": 2.517336130142212 }, { "auxiliary_loss_clip": 0.01011198, "auxiliary_loss_mlp": 0.01000785, "balance_loss_clip": 1.00823212, "balance_loss_mlp": 0.99978322, "epoch": 0.8929237058858895, "flos": 54854556744960.0, "grad_norm": 0.774418217978881, "language_loss": 0.57487988, "learning_rate": 1.189625297463881e-07, "loss": 0.59499979, "num_input_tokens_seen": 160619510, "step": 7426, "time_per_iteration": 3.2216598987579346 }, { "auxiliary_loss_clip": 0.01089477, "auxiliary_loss_mlp": 0.01025038, "balance_loss_clip": 1.03863728, "balance_loss_mlp": 1.01832664, "epoch": 0.8930439487765286, "flos": 28883850785280.0, "grad_norm": 1.5840368515953966, "language_loss": 0.79691482, "learning_rate": 1.1869802317854394e-07, "loss": 0.81805998, "num_input_tokens_seen": 160643295, "step": 7427, "time_per_iteration": 3.826235055923462 }, { "auxiliary_loss_clip": 0.01113734, "auxiliary_loss_mlp": 0.01023256, "balance_loss_clip": 1.04221451, "balance_loss_mlp": 1.0160886, "epoch": 0.8931641916671677, "flos": 22419283432320.0, "grad_norm": 1.9062757888674386, "language_loss": 0.71996498, "learning_rate": 1.1843380200271425e-07, "loss": 0.74133492, "num_input_tokens_seen": 160662495, "step": 7428, "time_per_iteration": 3.3114731311798096 }, { "auxiliary_loss_clip": 0.01115727, "auxiliary_loss_mlp": 0.01018786, "balance_loss_clip": 1.04142535, "balance_loss_mlp": 1.01155257, "epoch": 0.8932844345578068, "flos": 25843149786240.0, "grad_norm": 2.1047395284678694, "language_loss": 0.80183971, "learning_rate": 1.181698662589805e-07, "loss": 0.82318485, "num_input_tokens_seen": 160682080, "step": 7429, "time_per_iteration": 3.39693284034729 }, { "auxiliary_loss_clip": 0.01150058, "auxiliary_loss_mlp": 0.01025833, "balance_loss_clip": 1.04470217, "balance_loss_mlp": 1.01841831, "epoch": 0.8934046774484459, "flos": 22925803069440.0, "grad_norm": 2.0712411145507152, "language_loss": 0.7615875, "learning_rate": 1.1790621598738249e-07, "loss": 0.78334641, "num_input_tokens_seen": 160700395, "step": 7430, "time_per_iteration": 2.5288875102996826 }, { "auxiliary_loss_clip": 0.01162498, "auxiliary_loss_mlp": 0.01023594, "balance_loss_clip": 1.04783988, "balance_loss_mlp": 1.01703143, "epoch": 0.8935249203390849, "flos": 24462097718400.0, "grad_norm": 2.0024294129715092, "language_loss": 0.75163984, "learning_rate": 1.1764285122791461e-07, "loss": 0.77350074, "num_input_tokens_seen": 160721115, "step": 7431, "time_per_iteration": 2.4550232887268066 }, { "auxiliary_loss_clip": 0.01148549, "auxiliary_loss_mlp": 0.0102207, "balance_loss_clip": 1.04208839, "balance_loss_mlp": 1.01517642, "epoch": 0.8936451632297241, "flos": 15742735966080.0, "grad_norm": 2.513241487722935, "language_loss": 0.77320063, "learning_rate": 1.173797720205294e-07, "loss": 0.79490674, "num_input_tokens_seen": 160739150, "step": 7432, "time_per_iteration": 2.4107601642608643 }, { "auxiliary_loss_clip": 0.0115387, "auxiliary_loss_mlp": 0.0102771, "balance_loss_clip": 1.04704237, "balance_loss_mlp": 1.0200119, "epoch": 0.8937654061203631, "flos": 35115500396160.0, "grad_norm": 3.5361902286749323, "language_loss": 0.71541786, "learning_rate": 1.1711697840513602e-07, "loss": 0.73723364, "num_input_tokens_seen": 160758585, "step": 7433, "time_per_iteration": 2.547919273376465 }, { "auxiliary_loss_clip": 0.01142744, "auxiliary_loss_mlp": 0.01022179, "balance_loss_clip": 1.04243076, "balance_loss_mlp": 1.01492846, "epoch": 0.8938856490110022, "flos": 16107444708480.0, "grad_norm": 2.8434040265186833, "language_loss": 0.71168488, "learning_rate": 1.1685447042160012e-07, "loss": 0.73333406, "num_input_tokens_seen": 160776620, "step": 7434, "time_per_iteration": 2.403050184249878 }, { "auxiliary_loss_clip": 0.01166433, "auxiliary_loss_mlp": 0.01028259, "balance_loss_clip": 1.0472002, "balance_loss_mlp": 1.02115083, "epoch": 0.8940058919016414, "flos": 20704189858560.0, "grad_norm": 2.2754156304127213, "language_loss": 0.71512991, "learning_rate": 1.1659224810974367e-07, "loss": 0.73707676, "num_input_tokens_seen": 160796580, "step": 7435, "time_per_iteration": 2.403956413269043 }, { "auxiliary_loss_clip": 0.01138612, "auxiliary_loss_mlp": 0.0102581, "balance_loss_clip": 1.047207, "balance_loss_mlp": 1.01860118, "epoch": 0.8941261347922804, "flos": 25229041937280.0, "grad_norm": 2.186319305615022, "language_loss": 0.68190753, "learning_rate": 1.1633031150934591e-07, "loss": 0.70355177, "num_input_tokens_seen": 160819610, "step": 7436, "time_per_iteration": 2.550081491470337 }, { "auxiliary_loss_clip": 0.0115303, "auxiliary_loss_mlp": 0.01031275, "balance_loss_clip": 1.04694581, "balance_loss_mlp": 1.02384853, "epoch": 0.8942463776829195, "flos": 19537236806400.0, "grad_norm": 2.0891362424621764, "language_loss": 0.80209744, "learning_rate": 1.1606866066014176e-07, "loss": 0.82394052, "num_input_tokens_seen": 160838660, "step": 7437, "time_per_iteration": 3.192620277404785 }, { "auxiliary_loss_clip": 0.0111977, "auxiliary_loss_mlp": 0.01022264, "balance_loss_clip": 1.04246688, "balance_loss_mlp": 1.01493287, "epoch": 0.8943666205735585, "flos": 22301567585280.0, "grad_norm": 2.570170749072981, "language_loss": 0.75530273, "learning_rate": 1.1580729560182434e-07, "loss": 0.77672309, "num_input_tokens_seen": 160854515, "step": 7438, "time_per_iteration": 2.481064558029175 }, { "auxiliary_loss_clip": 0.01164953, "auxiliary_loss_mlp": 0.00761766, "balance_loss_clip": 1.0469346, "balance_loss_mlp": 1.0004034, "epoch": 0.8944868634641977, "flos": 18912893581440.0, "grad_norm": 2.0853067501221054, "language_loss": 0.70763063, "learning_rate": 1.1554621637404171e-07, "loss": 0.72689784, "num_input_tokens_seen": 160872605, "step": 7439, "time_per_iteration": 2.3844475746154785 }, { "auxiliary_loss_clip": 0.0115437, "auxiliary_loss_mlp": 0.01019545, "balance_loss_clip": 1.04668391, "balance_loss_mlp": 1.01252007, "epoch": 0.8946071063548368, "flos": 14460904241280.0, "grad_norm": 3.424561241713183, "language_loss": 0.61309856, "learning_rate": 1.1528542301639999e-07, "loss": 0.63483769, "num_input_tokens_seen": 160889395, "step": 7440, "time_per_iteration": 2.398780584335327 }, { "auxiliary_loss_clip": 0.01124041, "auxiliary_loss_mlp": 0.01020349, "balance_loss_clip": 1.04062712, "balance_loss_mlp": 1.01352429, "epoch": 0.8947273492454758, "flos": 20084084438400.0, "grad_norm": 2.939457369216438, "language_loss": 0.82780707, "learning_rate": 1.1502491556846105e-07, "loss": 0.84925097, "num_input_tokens_seen": 160907890, "step": 7441, "time_per_iteration": 2.5046281814575195 }, { "auxiliary_loss_clip": 0.01136078, "auxiliary_loss_mlp": 0.01023901, "balance_loss_clip": 1.0440073, "balance_loss_mlp": 1.01705587, "epoch": 0.894847592136115, "flos": 18550555136640.0, "grad_norm": 2.7401060281557363, "language_loss": 0.81608999, "learning_rate": 1.1476469406974331e-07, "loss": 0.83768976, "num_input_tokens_seen": 160923490, "step": 7442, "time_per_iteration": 2.4371020793914795 }, { "auxiliary_loss_clip": 0.01163353, "auxiliary_loss_mlp": 0.01025048, "balance_loss_clip": 1.04734921, "balance_loss_mlp": 1.01837802, "epoch": 0.894967835026754, "flos": 23478468704640.0, "grad_norm": 1.7182839172039632, "language_loss": 0.77063191, "learning_rate": 1.1450475855972341e-07, "loss": 0.79251593, "num_input_tokens_seen": 160944280, "step": 7443, "time_per_iteration": 2.4304957389831543 }, { "auxiliary_loss_clip": 0.01135476, "auxiliary_loss_mlp": 0.00762009, "balance_loss_clip": 1.04205918, "balance_loss_mlp": 1.00045168, "epoch": 0.8950880779173931, "flos": 15188310564480.0, "grad_norm": 3.074678388272662, "language_loss": 0.71294785, "learning_rate": 1.1424510907783158e-07, "loss": 0.73192275, "num_input_tokens_seen": 160961560, "step": 7444, "time_per_iteration": 2.4330248832702637 }, { "auxiliary_loss_clip": 0.01140329, "auxiliary_loss_mlp": 0.01026176, "balance_loss_clip": 1.04253697, "balance_loss_mlp": 1.01941133, "epoch": 0.8952083208080323, "flos": 22091957769600.0, "grad_norm": 1.5495937147365109, "language_loss": 0.82725537, "learning_rate": 1.1398574566345787e-07, "loss": 0.84892035, "num_input_tokens_seen": 160982195, "step": 7445, "time_per_iteration": 2.4831669330596924 }, { "auxiliary_loss_clip": 0.01141352, "auxiliary_loss_mlp": 0.01022911, "balance_loss_clip": 1.04245603, "balance_loss_mlp": 1.0154959, "epoch": 0.8953285636986713, "flos": 23254026572160.0, "grad_norm": 2.426159385548367, "language_loss": 0.82391077, "learning_rate": 1.1372666835594702e-07, "loss": 0.8455534, "num_input_tokens_seen": 161000520, "step": 7446, "time_per_iteration": 2.4716262817382812 }, { "auxiliary_loss_clip": 0.0113505, "auxiliary_loss_mlp": 0.0101974, "balance_loss_clip": 1.04381502, "balance_loss_mlp": 1.01338887, "epoch": 0.8954488065893104, "flos": 16362661818240.0, "grad_norm": 2.000137099003427, "language_loss": 0.71833777, "learning_rate": 1.1346787719460071e-07, "loss": 0.73988569, "num_input_tokens_seen": 161019405, "step": 7447, "time_per_iteration": 2.4457385540008545 }, { "auxiliary_loss_clip": 0.01136824, "auxiliary_loss_mlp": 0.01025025, "balance_loss_clip": 1.04591799, "balance_loss_mlp": 1.01820016, "epoch": 0.8955690494799495, "flos": 18257883120000.0, "grad_norm": 1.7287385889543345, "language_loss": 0.71899825, "learning_rate": 1.1320937221867732e-07, "loss": 0.7406168, "num_input_tokens_seen": 161036985, "step": 7448, "time_per_iteration": 2.464522361755371 }, { "auxiliary_loss_clip": 0.01134856, "auxiliary_loss_mlp": 0.01024953, "balance_loss_clip": 1.04284477, "balance_loss_mlp": 1.01863527, "epoch": 0.8956892923705886, "flos": 25447486498560.0, "grad_norm": 1.7557887997380401, "language_loss": 0.79429418, "learning_rate": 1.1295115346739192e-07, "loss": 0.81589228, "num_input_tokens_seen": 161056985, "step": 7449, "time_per_iteration": 2.5046324729919434 }, { "auxiliary_loss_clip": 0.01140015, "auxiliary_loss_mlp": 0.01025745, "balance_loss_clip": 1.04442084, "balance_loss_mlp": 1.01857412, "epoch": 0.8958095352612276, "flos": 52661883939840.0, "grad_norm": 5.6498850965287355, "language_loss": 0.72896147, "learning_rate": 1.1269322097991629e-07, "loss": 0.75061899, "num_input_tokens_seen": 161080270, "step": 7450, "time_per_iteration": 2.739476203918457 }, { "auxiliary_loss_clip": 0.0115662, "auxiliary_loss_mlp": 0.01025377, "balance_loss_clip": 1.0491457, "balance_loss_mlp": 1.0177176, "epoch": 0.8959297781518668, "flos": 23186335392000.0, "grad_norm": 2.5594880084113467, "language_loss": 0.67722237, "learning_rate": 1.1243557479537846e-07, "loss": 0.69904232, "num_input_tokens_seen": 161100160, "step": 7451, "time_per_iteration": 2.4521005153656006 }, { "auxiliary_loss_clip": 0.01162095, "auxiliary_loss_mlp": 0.01019687, "balance_loss_clip": 1.04424345, "balance_loss_mlp": 1.01251972, "epoch": 0.8960500210425059, "flos": 20334309557760.0, "grad_norm": 2.1946308636044383, "language_loss": 0.68573368, "learning_rate": 1.121782149528634e-07, "loss": 0.70755148, "num_input_tokens_seen": 161117260, "step": 7452, "time_per_iteration": 2.383960485458374 }, { "auxiliary_loss_clip": 0.01142041, "auxiliary_loss_mlp": 0.01020068, "balance_loss_clip": 1.04624486, "balance_loss_mlp": 1.01320195, "epoch": 0.8961702639331449, "flos": 19901694153600.0, "grad_norm": 2.1996692613356705, "language_loss": 0.78960574, "learning_rate": 1.1192114149141208e-07, "loss": 0.81122684, "num_input_tokens_seen": 161136895, "step": 7453, "time_per_iteration": 2.463416576385498 }, { "auxiliary_loss_clip": 0.01140509, "auxiliary_loss_mlp": 0.01027593, "balance_loss_clip": 1.04319167, "balance_loss_mlp": 1.01975524, "epoch": 0.8962905068237841, "flos": 12896348567040.0, "grad_norm": 3.0133785671726088, "language_loss": 0.6505838, "learning_rate": 1.1166435445002197e-07, "loss": 0.67226481, "num_input_tokens_seen": 161154565, "step": 7454, "time_per_iteration": 3.442310333251953 }, { "auxiliary_loss_clip": 0.0115446, "auxiliary_loss_mlp": 0.01026608, "balance_loss_clip": 1.04701924, "balance_loss_mlp": 1.01890111, "epoch": 0.8964107497144231, "flos": 23440331439360.0, "grad_norm": 2.124418998365666, "language_loss": 0.68864495, "learning_rate": 1.1140785386764818e-07, "loss": 0.71045566, "num_input_tokens_seen": 161173265, "step": 7455, "time_per_iteration": 3.191255807876587 }, { "auxiliary_loss_clip": 0.0114625, "auxiliary_loss_mlp": 0.01027711, "balance_loss_clip": 1.04445922, "balance_loss_mlp": 1.02039468, "epoch": 0.8965309926050622, "flos": 19500176949120.0, "grad_norm": 2.1232624020422164, "language_loss": 0.69580305, "learning_rate": 1.1115163978320153e-07, "loss": 0.71754265, "num_input_tokens_seen": 161191995, "step": 7456, "time_per_iteration": 3.2421813011169434 }, { "auxiliary_loss_clip": 0.01156426, "auxiliary_loss_mlp": 0.00762208, "balance_loss_clip": 1.04639804, "balance_loss_mlp": 1.00046539, "epoch": 0.8966512354957014, "flos": 28658008022400.0, "grad_norm": 1.8422661718315823, "language_loss": 0.82507384, "learning_rate": 1.1089571223554917e-07, "loss": 0.84426022, "num_input_tokens_seen": 161212880, "step": 7457, "time_per_iteration": 2.504429340362549 }, { "auxiliary_loss_clip": 0.01151697, "auxiliary_loss_mlp": 0.01024351, "balance_loss_clip": 1.04404759, "balance_loss_mlp": 1.01708531, "epoch": 0.8967714783863404, "flos": 23370916406400.0, "grad_norm": 2.0165449609390267, "language_loss": 0.85309386, "learning_rate": 1.1064007126351537e-07, "loss": 0.87485433, "num_input_tokens_seen": 161233595, "step": 7458, "time_per_iteration": 2.4564974308013916 }, { "auxiliary_loss_clip": 0.01131771, "auxiliary_loss_mlp": 0.01021595, "balance_loss_clip": 1.04406762, "balance_loss_mlp": 1.01449597, "epoch": 0.8968917212769795, "flos": 24535175938560.0, "grad_norm": 2.126901697992628, "language_loss": 0.76233041, "learning_rate": 1.1038471690588003e-07, "loss": 0.78386408, "num_input_tokens_seen": 161252740, "step": 7459, "time_per_iteration": 2.479123592376709 }, { "auxiliary_loss_clip": 0.01112005, "auxiliary_loss_mlp": 0.01025745, "balance_loss_clip": 1.04642045, "balance_loss_mlp": 1.01862216, "epoch": 0.8970119641676186, "flos": 23475416048640.0, "grad_norm": 2.3379079766910373, "language_loss": 0.79907155, "learning_rate": 1.1012964920138145e-07, "loss": 0.82044905, "num_input_tokens_seen": 161272325, "step": 7460, "time_per_iteration": 2.550973892211914 }, { "auxiliary_loss_clip": 0.01130086, "auxiliary_loss_mlp": 0.01024242, "balance_loss_clip": 1.03987694, "balance_loss_mlp": 1.01748919, "epoch": 0.8971322070582577, "flos": 24538192680960.0, "grad_norm": 1.8724472127650784, "language_loss": 0.75860697, "learning_rate": 1.0987486818871205e-07, "loss": 0.78015023, "num_input_tokens_seen": 161295915, "step": 7461, "time_per_iteration": 2.5437662601470947 }, { "auxiliary_loss_clip": 0.01151035, "auxiliary_loss_mlp": 0.00761777, "balance_loss_clip": 1.04521227, "balance_loss_mlp": 1.00043297, "epoch": 0.8972524499488967, "flos": 21797454159360.0, "grad_norm": 2.5814274669859403, "language_loss": 0.73589694, "learning_rate": 1.0962037390652245e-07, "loss": 0.75502509, "num_input_tokens_seen": 161314935, "step": 7462, "time_per_iteration": 2.455146551132202 }, { "auxiliary_loss_clip": 0.01138681, "auxiliary_loss_mlp": 0.01026467, "balance_loss_clip": 1.04517114, "balance_loss_mlp": 1.01901698, "epoch": 0.8973726928395359, "flos": 21726243446400.0, "grad_norm": 1.785756433750136, "language_loss": 0.71740371, "learning_rate": 1.0936616639341911e-07, "loss": 0.73905522, "num_input_tokens_seen": 161335225, "step": 7463, "time_per_iteration": 2.479999303817749 }, { "auxiliary_loss_clip": 0.01046846, "auxiliary_loss_mlp": 0.01002208, "balance_loss_clip": 1.0100919, "balance_loss_mlp": 1.00145423, "epoch": 0.897492935730175, "flos": 53837100097920.0, "grad_norm": 0.7421691002946087, "language_loss": 0.54748058, "learning_rate": 1.0911224568796473e-07, "loss": 0.56797123, "num_input_tokens_seen": 161393420, "step": 7464, "time_per_iteration": 3.8243470191955566 }, { "auxiliary_loss_clip": 0.01149507, "auxiliary_loss_mlp": 0.01027458, "balance_loss_clip": 1.04633892, "balance_loss_mlp": 1.02052569, "epoch": 0.897613178620814, "flos": 18290346036480.0, "grad_norm": 3.4304111658655776, "language_loss": 0.70918477, "learning_rate": 1.0885861182867984e-07, "loss": 0.73095441, "num_input_tokens_seen": 161411525, "step": 7465, "time_per_iteration": 2.428588628768921 }, { "auxiliary_loss_clip": 0.01140001, "auxiliary_loss_mlp": 0.01022244, "balance_loss_clip": 1.04350233, "balance_loss_mlp": 1.01524067, "epoch": 0.8977334215114532, "flos": 32993718059520.0, "grad_norm": 2.1091594877299604, "language_loss": 0.70983779, "learning_rate": 1.0860526485403942e-07, "loss": 0.73146021, "num_input_tokens_seen": 161432800, "step": 7466, "time_per_iteration": 2.5627036094665527 }, { "auxiliary_loss_clip": 0.01164526, "auxiliary_loss_mlp": 0.01024297, "balance_loss_clip": 1.0464983, "balance_loss_mlp": 1.01779079, "epoch": 0.8978536644020922, "flos": 15195636938880.0, "grad_norm": 1.668742687822235, "language_loss": 0.77024531, "learning_rate": 1.0835220480247675e-07, "loss": 0.79213357, "num_input_tokens_seen": 161451295, "step": 7467, "time_per_iteration": 2.377856492996216 }, { "auxiliary_loss_clip": 0.0113386, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 1.04409337, "balance_loss_mlp": 1.0176661, "epoch": 0.8979739072927313, "flos": 18004389863040.0, "grad_norm": 2.1487084897035733, "language_loss": 0.8361212, "learning_rate": 1.0809943171238067e-07, "loss": 0.85770667, "num_input_tokens_seen": 161469220, "step": 7468, "time_per_iteration": 2.4430463314056396 }, { "auxiliary_loss_clip": 0.01144633, "auxiliary_loss_mlp": 0.01029538, "balance_loss_clip": 1.04544282, "balance_loss_mlp": 1.02128887, "epoch": 0.8980941501833704, "flos": 22271546793600.0, "grad_norm": 2.3138926097217705, "language_loss": 0.62957704, "learning_rate": 1.078469456220965e-07, "loss": 0.65131873, "num_input_tokens_seen": 161489375, "step": 7469, "time_per_iteration": 2.4690537452697754 }, { "auxiliary_loss_clip": 0.01150439, "auxiliary_loss_mlp": 0.01024784, "balance_loss_clip": 1.04356158, "balance_loss_mlp": 1.01764941, "epoch": 0.8982143930740095, "flos": 37560729726720.0, "grad_norm": 2.2147316442639955, "language_loss": 0.69598716, "learning_rate": 1.0759474656992606e-07, "loss": 0.71773946, "num_input_tokens_seen": 161512145, "step": 7470, "time_per_iteration": 2.5764031410217285 }, { "auxiliary_loss_clip": 0.01140976, "auxiliary_loss_mlp": 0.0102563, "balance_loss_clip": 1.04351974, "balance_loss_mlp": 1.01827812, "epoch": 0.8983346359646486, "flos": 18076893465600.0, "grad_norm": 2.144831081515994, "language_loss": 0.77924454, "learning_rate": 1.0734283459412785e-07, "loss": 0.80091059, "num_input_tokens_seen": 161528995, "step": 7471, "time_per_iteration": 2.447983741760254 }, { "auxiliary_loss_clip": 0.01111957, "auxiliary_loss_mlp": 0.01026433, "balance_loss_clip": 1.04047716, "balance_loss_mlp": 1.01849985, "epoch": 0.8984548788552876, "flos": 20558895344640.0, "grad_norm": 1.855930994033701, "language_loss": 0.80479497, "learning_rate": 1.0709120973291707e-07, "loss": 0.82617891, "num_input_tokens_seen": 161548775, "step": 7472, "time_per_iteration": 2.5386130809783936 }, { "auxiliary_loss_clip": 0.01166358, "auxiliary_loss_mlp": 0.01026877, "balance_loss_clip": 1.04730916, "balance_loss_mlp": 1.01941466, "epoch": 0.8985751217459268, "flos": 17785442511360.0, "grad_norm": 2.0362015033547927, "language_loss": 0.78058845, "learning_rate": 1.0683987202446475e-07, "loss": 0.80252075, "num_input_tokens_seen": 161566960, "step": 7473, "time_per_iteration": 2.4053902626037598 }, { "auxiliary_loss_clip": 0.01153978, "auxiliary_loss_mlp": 0.01022581, "balance_loss_clip": 1.04515266, "balance_loss_mlp": 1.01513028, "epoch": 0.8986953646365659, "flos": 21617003208960.0, "grad_norm": 1.873774541766139, "language_loss": 0.69974357, "learning_rate": 1.0658882150689862e-07, "loss": 0.72150916, "num_input_tokens_seen": 161585820, "step": 7474, "time_per_iteration": 2.445111036300659 }, { "auxiliary_loss_clip": 0.01127501, "auxiliary_loss_mlp": 0.01021578, "balance_loss_clip": 1.04371154, "balance_loss_mlp": 1.01436627, "epoch": 0.8988156075272049, "flos": 14027355083520.0, "grad_norm": 2.9974118179394447, "language_loss": 0.78034985, "learning_rate": 1.0633805821830288e-07, "loss": 0.80184066, "num_input_tokens_seen": 161602505, "step": 7475, "time_per_iteration": 2.4873292446136475 }, { "auxiliary_loss_clip": 0.01139418, "auxiliary_loss_mlp": 0.01025284, "balance_loss_clip": 1.04479635, "balance_loss_mlp": 1.01764822, "epoch": 0.8989358504178441, "flos": 29059202004480.0, "grad_norm": 6.770366517517771, "language_loss": 0.82814127, "learning_rate": 1.0608758219671753e-07, "loss": 0.84978831, "num_input_tokens_seen": 161621545, "step": 7476, "time_per_iteration": 2.529716968536377 }, { "auxiliary_loss_clip": 0.01141771, "auxiliary_loss_mlp": 0.01021794, "balance_loss_clip": 1.04437053, "balance_loss_mlp": 1.01490045, "epoch": 0.8990560933084831, "flos": 20230420446720.0, "grad_norm": 1.6617175867881215, "language_loss": 0.70759833, "learning_rate": 1.0583739348014065e-07, "loss": 0.72923398, "num_input_tokens_seen": 161642630, "step": 7477, "time_per_iteration": 2.4870245456695557 }, { "auxiliary_loss_clip": 0.01168127, "auxiliary_loss_mlp": 0.01021755, "balance_loss_clip": 1.05057144, "balance_loss_mlp": 1.01488304, "epoch": 0.8991763361991222, "flos": 25520672459520.0, "grad_norm": 2.1802925975292387, "language_loss": 0.8453145, "learning_rate": 1.0558749210652518e-07, "loss": 0.86721325, "num_input_tokens_seen": 161662560, "step": 7478, "time_per_iteration": 2.4409868717193604 }, { "auxiliary_loss_clip": 0.01128589, "auxiliary_loss_mlp": 0.01024721, "balance_loss_clip": 1.04435384, "balance_loss_mlp": 1.01808703, "epoch": 0.8992965790897613, "flos": 25119191168640.0, "grad_norm": 1.8246832863465052, "language_loss": 0.85857689, "learning_rate": 1.053378781137808e-07, "loss": 0.88011003, "num_input_tokens_seen": 161683480, "step": 7479, "time_per_iteration": 2.546184778213501 }, { "auxiliary_loss_clip": 0.01140802, "auxiliary_loss_mlp": 0.01026357, "balance_loss_clip": 1.04472256, "balance_loss_mlp": 1.01917768, "epoch": 0.8994168219804004, "flos": 16070815814400.0, "grad_norm": 1.9466141023063341, "language_loss": 0.77657908, "learning_rate": 1.0508855153977392e-07, "loss": 0.79825068, "num_input_tokens_seen": 161699945, "step": 7480, "time_per_iteration": 2.4394664764404297 }, { "auxiliary_loss_clip": 0.01152279, "auxiliary_loss_mlp": 0.01026968, "balance_loss_clip": 1.04386854, "balance_loss_mlp": 1.01962423, "epoch": 0.8995370648710395, "flos": 24825764966400.0, "grad_norm": 2.3094500020391857, "language_loss": 0.67153329, "learning_rate": 1.0483951242232669e-07, "loss": 0.69332576, "num_input_tokens_seen": 161720420, "step": 7481, "time_per_iteration": 3.387246608734131 }, { "auxiliary_loss_clip": 0.01061128, "auxiliary_loss_mlp": 0.01000908, "balance_loss_clip": 1.00747323, "balance_loss_mlp": 1.00005579, "epoch": 0.8996573077616786, "flos": 63116238378240.0, "grad_norm": 0.9730907483693916, "language_loss": 0.57671976, "learning_rate": 1.0459076079921936e-07, "loss": 0.59734011, "num_input_tokens_seen": 161773080, "step": 7482, "time_per_iteration": 3.7595181465148926 }, { "auxiliary_loss_clip": 0.01132041, "auxiliary_loss_mlp": 0.0102983, "balance_loss_clip": 1.04411364, "balance_loss_mlp": 1.02226901, "epoch": 0.8997775506523177, "flos": 18219674027520.0, "grad_norm": 2.167037524681334, "language_loss": 0.85043657, "learning_rate": 1.0434229670818618e-07, "loss": 0.87205529, "num_input_tokens_seen": 161789755, "step": 7483, "time_per_iteration": 3.267090320587158 }, { "auxiliary_loss_clip": 0.01129703, "auxiliary_loss_mlp": 0.01022938, "balance_loss_clip": 1.04197001, "balance_loss_mlp": 1.01560652, "epoch": 0.8998977935429567, "flos": 24166768095360.0, "grad_norm": 1.8458140546701203, "language_loss": 0.79887593, "learning_rate": 1.0409412018691944e-07, "loss": 0.82040238, "num_input_tokens_seen": 161810220, "step": 7484, "time_per_iteration": 2.478732109069824 }, { "auxiliary_loss_clip": 0.01134222, "auxiliary_loss_mlp": 0.010253, "balance_loss_clip": 1.04471016, "balance_loss_mlp": 1.01820457, "epoch": 0.9000180364335959, "flos": 20773030273920.0, "grad_norm": 2.04131354953962, "language_loss": 0.74898016, "learning_rate": 1.0384623127306724e-07, "loss": 0.77057534, "num_input_tokens_seen": 161827565, "step": 7485, "time_per_iteration": 2.4748785495758057 }, { "auxiliary_loss_clip": 0.01120198, "auxiliary_loss_mlp": 0.01019777, "balance_loss_clip": 1.04142404, "balance_loss_mlp": 1.01309812, "epoch": 0.900138279324235, "flos": 19205745166080.0, "grad_norm": 1.7764018029115372, "language_loss": 0.79428732, "learning_rate": 1.0359863000423397e-07, "loss": 0.81568706, "num_input_tokens_seen": 161845700, "step": 7486, "time_per_iteration": 2.4852969646453857 }, { "auxiliary_loss_clip": 0.01166592, "auxiliary_loss_mlp": 0.01024787, "balance_loss_clip": 1.04826248, "balance_loss_mlp": 1.01786065, "epoch": 0.900258522214874, "flos": 28731158069760.0, "grad_norm": 1.7360435892239923, "language_loss": 0.71823859, "learning_rate": 1.0335131641798112e-07, "loss": 0.74015236, "num_input_tokens_seen": 161867660, "step": 7487, "time_per_iteration": 2.4582371711730957 }, { "auxiliary_loss_clip": 0.01041264, "auxiliary_loss_mlp": 0.01001337, "balance_loss_clip": 1.00787663, "balance_loss_mlp": 1.00045443, "epoch": 0.9003787651055132, "flos": 58280685655680.0, "grad_norm": 0.9647917465211733, "language_loss": 0.55606437, "learning_rate": 1.0310429055182512e-07, "loss": 0.57649028, "num_input_tokens_seen": 161921980, "step": 7488, "time_per_iteration": 2.888615131378174 }, { "auxiliary_loss_clip": 0.01123851, "auxiliary_loss_mlp": 0.01028221, "balance_loss_clip": 1.04195583, "balance_loss_mlp": 1.02096105, "epoch": 0.9004990079961522, "flos": 25556475340800.0, "grad_norm": 1.6885474998831453, "language_loss": 0.74030489, "learning_rate": 1.0285755244324024e-07, "loss": 0.76182562, "num_input_tokens_seen": 161942725, "step": 7489, "time_per_iteration": 2.5354793071746826 }, { "auxiliary_loss_clip": 0.01139014, "auxiliary_loss_mlp": 0.00761441, "balance_loss_clip": 1.04192805, "balance_loss_mlp": 1.0004847, "epoch": 0.9006192508867913, "flos": 23335185352320.0, "grad_norm": 2.9019924795602194, "language_loss": 0.68757826, "learning_rate": 1.0261110212965629e-07, "loss": 0.70658278, "num_input_tokens_seen": 161964520, "step": 7490, "time_per_iteration": 3.296309471130371 }, { "auxiliary_loss_clip": 0.01137709, "auxiliary_loss_mlp": 0.01025712, "balance_loss_clip": 1.04443598, "balance_loss_mlp": 1.01904547, "epoch": 0.9007394937774305, "flos": 18040300485120.0, "grad_norm": 1.889246220983641, "language_loss": 0.7922442, "learning_rate": 1.023649396484596e-07, "loss": 0.81387842, "num_input_tokens_seen": 161983575, "step": 7491, "time_per_iteration": 2.449601173400879 }, { "auxiliary_loss_clip": 0.01164072, "auxiliary_loss_mlp": 0.01024508, "balance_loss_clip": 1.04584169, "balance_loss_mlp": 1.01798701, "epoch": 0.9008597366680695, "flos": 43068456633600.0, "grad_norm": 1.8900207930832382, "language_loss": 0.67653573, "learning_rate": 1.0211906503699275e-07, "loss": 0.69842148, "num_input_tokens_seen": 162006550, "step": 7492, "time_per_iteration": 2.5934319496154785 }, { "auxiliary_loss_clip": 0.01154813, "auxiliary_loss_mlp": 0.01027674, "balance_loss_clip": 1.04832292, "balance_loss_mlp": 1.01970506, "epoch": 0.9009799795587086, "flos": 14939055112320.0, "grad_norm": 2.452430364775128, "language_loss": 0.82468253, "learning_rate": 1.0187347833255455e-07, "loss": 0.84650743, "num_input_tokens_seen": 162022455, "step": 7493, "time_per_iteration": 2.398282527923584 }, { "auxiliary_loss_clip": 0.01162646, "auxiliary_loss_mlp": 0.01024284, "balance_loss_clip": 1.04701686, "balance_loss_mlp": 1.0173043, "epoch": 0.9011002224493477, "flos": 21579584215680.0, "grad_norm": 1.635963031346961, "language_loss": 0.79221272, "learning_rate": 1.0162817957240056e-07, "loss": 0.81408203, "num_input_tokens_seen": 162042350, "step": 7494, "time_per_iteration": 2.407040596008301 }, { "auxiliary_loss_clip": 0.0105174, "auxiliary_loss_mlp": 0.01001423, "balance_loss_clip": 1.00783718, "balance_loss_mlp": 1.0005703, "epoch": 0.9012204653399868, "flos": 71166367883520.0, "grad_norm": 0.8835582868496925, "language_loss": 0.63025457, "learning_rate": 1.0138316879374253e-07, "loss": 0.65078616, "num_input_tokens_seen": 162111640, "step": 7495, "time_per_iteration": 3.1716079711914062 }, { "auxiliary_loss_clip": 0.01141133, "auxiliary_loss_mlp": 0.01021399, "balance_loss_clip": 1.04807615, "balance_loss_mlp": 1.01397514, "epoch": 0.9013407082306258, "flos": 15594963413760.0, "grad_norm": 3.6988235192551513, "language_loss": 0.74481285, "learning_rate": 1.0113844603374833e-07, "loss": 0.76643813, "num_input_tokens_seen": 162128165, "step": 7496, "time_per_iteration": 2.4523158073425293 }, { "auxiliary_loss_clip": 0.01136963, "auxiliary_loss_mlp": 0.01024187, "balance_loss_clip": 1.04221725, "balance_loss_mlp": 1.01631284, "epoch": 0.901460951121265, "flos": 15049157276160.0, "grad_norm": 3.226879365709535, "language_loss": 0.72016931, "learning_rate": 1.0089401132954178e-07, "loss": 0.74178082, "num_input_tokens_seen": 162146145, "step": 7497, "time_per_iteration": 2.4674620628356934 }, { "auxiliary_loss_clip": 0.01139322, "auxiliary_loss_mlp": 0.0102361, "balance_loss_clip": 1.04646587, "balance_loss_mlp": 1.01690114, "epoch": 0.9015811940119041, "flos": 22236857233920.0, "grad_norm": 1.580516604419315, "language_loss": 0.72286189, "learning_rate": 1.006498647182037e-07, "loss": 0.74449122, "num_input_tokens_seen": 162164800, "step": 7498, "time_per_iteration": 2.483145236968994 }, { "auxiliary_loss_clip": 0.01092436, "auxiliary_loss_mlp": 0.01027796, "balance_loss_clip": 1.03913879, "balance_loss_mlp": 1.0205512, "epoch": 0.9017014369025431, "flos": 24973824827520.0, "grad_norm": 2.358434843356064, "language_loss": 0.7148248, "learning_rate": 1.004060062367713e-07, "loss": 0.73602712, "num_input_tokens_seen": 162185895, "step": 7499, "time_per_iteration": 2.6162989139556885 }, { "auxiliary_loss_clip": 0.0115358, "auxiliary_loss_mlp": 0.01023289, "balance_loss_clip": 1.04602504, "balance_loss_mlp": 1.01587462, "epoch": 0.9018216797931822, "flos": 18114168804480.0, "grad_norm": 1.739161401098802, "language_loss": 0.69203043, "learning_rate": 1.0016243592223728e-07, "loss": 0.71379912, "num_input_tokens_seen": 162206295, "step": 7500, "time_per_iteration": 2.4434964656829834 }, { "auxiliary_loss_clip": 0.01094081, "auxiliary_loss_mlp": 0.01024004, "balance_loss_clip": 1.04049206, "balance_loss_mlp": 1.01689661, "epoch": 0.9019419226838213, "flos": 37268452759680.0, "grad_norm": 1.856609138793378, "language_loss": 0.65798825, "learning_rate": 9.991915381155114e-08, "loss": 0.67916918, "num_input_tokens_seen": 162229275, "step": 7501, "time_per_iteration": 2.699352502822876 }, { "auxiliary_loss_clip": 0.01154714, "auxiliary_loss_mlp": 0.01023093, "balance_loss_clip": 1.04588509, "balance_loss_mlp": 1.01587462, "epoch": 0.9020621655744604, "flos": 23441121538560.0, "grad_norm": 2.0112876345868647, "language_loss": 0.74872202, "learning_rate": 9.967615994161871e-08, "loss": 0.77050006, "num_input_tokens_seen": 162248935, "step": 7502, "time_per_iteration": 2.4711592197418213 }, { "auxiliary_loss_clip": 0.0116393, "auxiliary_loss_mlp": 0.01021335, "balance_loss_clip": 1.04577315, "balance_loss_mlp": 1.01435566, "epoch": 0.9021824084650995, "flos": 22857465444480.0, "grad_norm": 1.7030176325353608, "language_loss": 0.78332675, "learning_rate": 9.943345434930161e-08, "loss": 0.80517936, "num_input_tokens_seen": 162269185, "step": 7503, "time_per_iteration": 2.4451146125793457 }, { "auxiliary_loss_clip": 0.01123429, "auxiliary_loss_mlp": 0.01023997, "balance_loss_clip": 1.04454303, "balance_loss_mlp": 1.01692176, "epoch": 0.9023026513557386, "flos": 22127581082880.0, "grad_norm": 1.8910205186235143, "language_loss": 0.68888879, "learning_rate": 9.919103707141885e-08, "loss": 0.71036303, "num_input_tokens_seen": 162288065, "step": 7504, "time_per_iteration": 2.504936456680298 }, { "auxiliary_loss_clip": 0.01149092, "auxiliary_loss_mlp": 0.01025003, "balance_loss_clip": 1.04483533, "balance_loss_mlp": 1.01733744, "epoch": 0.9024228942463777, "flos": 24199087357440.0, "grad_norm": 1.8683829897163502, "language_loss": 0.76707155, "learning_rate": 9.89489081447441e-08, "loss": 0.78881252, "num_input_tokens_seen": 162305265, "step": 7505, "time_per_iteration": 2.436614751815796 }, { "auxiliary_loss_clip": 0.01136553, "auxiliary_loss_mlp": 0.01021817, "balance_loss_clip": 1.04272437, "balance_loss_mlp": 1.0144136, "epoch": 0.9025431371370167, "flos": 25008262992000.0, "grad_norm": 1.8462170707015797, "language_loss": 0.83278984, "learning_rate": 9.870706760600844e-08, "loss": 0.85437357, "num_input_tokens_seen": 162325215, "step": 7506, "time_per_iteration": 2.4989330768585205 }, { "auxiliary_loss_clip": 0.01118712, "auxiliary_loss_mlp": 0.01026842, "balance_loss_clip": 1.04765582, "balance_loss_mlp": 1.01949906, "epoch": 0.9026633800276559, "flos": 18952862440320.0, "grad_norm": 2.492504751861586, "language_loss": 0.72674954, "learning_rate": 9.846551549189918e-08, "loss": 0.74820513, "num_input_tokens_seen": 162344820, "step": 7507, "time_per_iteration": 3.3484420776367188 }, { "auxiliary_loss_clip": 0.01135015, "auxiliary_loss_mlp": 0.01023458, "balance_loss_clip": 1.04397464, "balance_loss_mlp": 1.01621294, "epoch": 0.902783622918295, "flos": 32416059536640.0, "grad_norm": 1.9638969202479812, "language_loss": 0.68712258, "learning_rate": 9.822425183905902e-08, "loss": 0.70870733, "num_input_tokens_seen": 162365345, "step": 7508, "time_per_iteration": 2.557969808578491 }, { "auxiliary_loss_clip": 0.01032407, "auxiliary_loss_mlp": 0.01001098, "balance_loss_clip": 1.00843966, "balance_loss_mlp": 1.00021541, "epoch": 0.902903865808934, "flos": 63717453244800.0, "grad_norm": 0.9300813243559274, "language_loss": 0.7522254, "learning_rate": 9.798327668408823e-08, "loss": 0.77256048, "num_input_tokens_seen": 162426980, "step": 7509, "time_per_iteration": 3.8911752700805664 }, { "auxiliary_loss_clip": 0.0116887, "auxiliary_loss_mlp": 0.01025138, "balance_loss_clip": 1.0476203, "balance_loss_mlp": 1.01756859, "epoch": 0.9030241086995732, "flos": 23804034600960.0, "grad_norm": 2.3033024802018858, "language_loss": 0.68901622, "learning_rate": 9.774259006354158e-08, "loss": 0.71095628, "num_input_tokens_seen": 162447050, "step": 7510, "time_per_iteration": 3.165647268295288 }, { "auxiliary_loss_clip": 0.0114119, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.0435791, "balance_loss_mlp": 1.01893759, "epoch": 0.9031443515902122, "flos": 26395887248640.0, "grad_norm": 1.9847206849650965, "language_loss": 0.75995255, "learning_rate": 9.750219201393184e-08, "loss": 0.78162432, "num_input_tokens_seen": 162467015, "step": 7511, "time_per_iteration": 2.5126726627349854 }, { "auxiliary_loss_clip": 0.0114898, "auxiliary_loss_mlp": 0.01019428, "balance_loss_clip": 1.0442965, "balance_loss_mlp": 1.0123018, "epoch": 0.9032645944808513, "flos": 24939350749440.0, "grad_norm": 1.922756282202372, "language_loss": 0.77448583, "learning_rate": 9.726208257172697e-08, "loss": 0.79616994, "num_input_tokens_seen": 162488710, "step": 7512, "time_per_iteration": 2.4924213886260986 }, { "auxiliary_loss_clip": 0.01164692, "auxiliary_loss_mlp": 0.01022105, "balance_loss_clip": 1.04710519, "balance_loss_mlp": 1.0151639, "epoch": 0.9033848373714904, "flos": 21178821196800.0, "grad_norm": 2.6589378993722392, "language_loss": 0.74869305, "learning_rate": 9.702226177335115e-08, "loss": 0.77056104, "num_input_tokens_seen": 162507205, "step": 7513, "time_per_iteration": 2.4249424934387207 }, { "auxiliary_loss_clip": 0.01136297, "auxiliary_loss_mlp": 0.0103205, "balance_loss_clip": 1.04524136, "balance_loss_mlp": 1.02432549, "epoch": 0.9035050802621295, "flos": 26286359702400.0, "grad_norm": 1.7505175252544227, "language_loss": 0.72428823, "learning_rate": 9.67827296551853e-08, "loss": 0.7459718, "num_input_tokens_seen": 162528490, "step": 7514, "time_per_iteration": 2.5159690380096436 }, { "auxiliary_loss_clip": 0.01129064, "auxiliary_loss_mlp": 0.00761706, "balance_loss_clip": 1.040609, "balance_loss_mlp": 1.00048304, "epoch": 0.9036253231527686, "flos": 24204546224640.0, "grad_norm": 2.1967319204783156, "language_loss": 0.68626791, "learning_rate": 9.65434862535659e-08, "loss": 0.70517558, "num_input_tokens_seen": 162547860, "step": 7515, "time_per_iteration": 2.486316204071045 }, { "auxiliary_loss_clip": 0.01138782, "auxiliary_loss_mlp": 0.01029733, "balance_loss_clip": 1.04319763, "balance_loss_mlp": 1.0222944, "epoch": 0.9037455660434077, "flos": 18072655660800.0, "grad_norm": 3.026203917082258, "language_loss": 0.64671063, "learning_rate": 9.630453160478635e-08, "loss": 0.66839582, "num_input_tokens_seen": 162563215, "step": 7516, "time_per_iteration": 2.4332194328308105 }, { "auxiliary_loss_clip": 0.01111823, "auxiliary_loss_mlp": 0.01022925, "balance_loss_clip": 1.04214573, "balance_loss_mlp": 1.01568878, "epoch": 0.9038658089340468, "flos": 24060795995520.0, "grad_norm": 1.6804954566547874, "language_loss": 0.82410437, "learning_rate": 9.60658657450959e-08, "loss": 0.84545183, "num_input_tokens_seen": 162583515, "step": 7517, "time_per_iteration": 3.349963426589966 }, { "auxiliary_loss_clip": 0.01122738, "auxiliary_loss_mlp": 0.01024807, "balance_loss_clip": 1.04015434, "balance_loss_mlp": 1.01776767, "epoch": 0.9039860518246858, "flos": 21834298535040.0, "grad_norm": 1.5632127066299444, "language_loss": 0.79289317, "learning_rate": 9.582748871069979e-08, "loss": 0.81436861, "num_input_tokens_seen": 162602955, "step": 7518, "time_per_iteration": 2.5092217922210693 }, { "auxiliary_loss_clip": 0.01138744, "auxiliary_loss_mlp": 0.00761296, "balance_loss_clip": 1.04273462, "balance_loss_mlp": 1.00042844, "epoch": 0.904106294715325, "flos": 26614870513920.0, "grad_norm": 2.177561591943742, "language_loss": 0.82648271, "learning_rate": 9.558940053775954e-08, "loss": 0.84548306, "num_input_tokens_seen": 162621595, "step": 7519, "time_per_iteration": 2.578040599822998 }, { "auxiliary_loss_clip": 0.01149752, "auxiliary_loss_mlp": 0.01026148, "balance_loss_clip": 1.04571056, "balance_loss_mlp": 1.01901889, "epoch": 0.904226537605964, "flos": 17785693906560.0, "grad_norm": 6.918367209100176, "language_loss": 0.68074358, "learning_rate": 9.535160126239294e-08, "loss": 0.70250249, "num_input_tokens_seen": 162638220, "step": 7520, "time_per_iteration": 2.4742648601531982 }, { "auxiliary_loss_clip": 0.01149248, "auxiliary_loss_mlp": 0.01025002, "balance_loss_clip": 1.04616082, "balance_loss_mlp": 1.01804328, "epoch": 0.9043467804966031, "flos": 24790428961920.0, "grad_norm": 1.5518936283260256, "language_loss": 0.70606309, "learning_rate": 9.511409092067424e-08, "loss": 0.72780561, "num_input_tokens_seen": 162658575, "step": 7521, "time_per_iteration": 2.55526065826416 }, { "auxiliary_loss_clip": 0.01139797, "auxiliary_loss_mlp": 0.01022345, "balance_loss_clip": 1.0469507, "balance_loss_mlp": 1.01537108, "epoch": 0.9044670233872423, "flos": 22632125472000.0, "grad_norm": 1.7966911876723506, "language_loss": 0.67536676, "learning_rate": 9.487686954863327e-08, "loss": 0.69698817, "num_input_tokens_seen": 162678295, "step": 7522, "time_per_iteration": 2.4851834774017334 }, { "auxiliary_loss_clip": 0.01149355, "auxiliary_loss_mlp": 0.01023135, "balance_loss_clip": 1.04549479, "balance_loss_mlp": 1.01616693, "epoch": 0.9045872662778813, "flos": 23771320289280.0, "grad_norm": 5.003024867350654, "language_loss": 0.76886868, "learning_rate": 9.46399371822566e-08, "loss": 0.79059362, "num_input_tokens_seen": 162698070, "step": 7523, "time_per_iteration": 2.4811840057373047 }, { "auxiliary_loss_clip": 0.01165438, "auxiliary_loss_mlp": 0.01023687, "balance_loss_clip": 1.04738045, "balance_loss_mlp": 1.01633167, "epoch": 0.9047075091685204, "flos": 15191039998080.0, "grad_norm": 3.0669154921102857, "language_loss": 0.72337472, "learning_rate": 9.440329385748657e-08, "loss": 0.74526596, "num_input_tokens_seen": 162715140, "step": 7524, "time_per_iteration": 2.378437042236328 }, { "auxiliary_loss_clip": 0.0112303, "auxiliary_loss_mlp": 0.01017748, "balance_loss_clip": 1.04415119, "balance_loss_mlp": 1.01148367, "epoch": 0.9048277520591596, "flos": 18003707504640.0, "grad_norm": 1.6884552421013848, "language_loss": 0.70513642, "learning_rate": 9.416693961022137e-08, "loss": 0.72654414, "num_input_tokens_seen": 162733390, "step": 7525, "time_per_iteration": 2.4754233360290527 }, { "auxiliary_loss_clip": 0.01083976, "auxiliary_loss_mlp": 0.0102422, "balance_loss_clip": 1.03895688, "balance_loss_mlp": 1.01720452, "epoch": 0.9049479949497986, "flos": 21872471713920.0, "grad_norm": 1.6828772868579742, "language_loss": 0.77079493, "learning_rate": 9.393087447631654e-08, "loss": 0.79187691, "num_input_tokens_seen": 162751670, "step": 7526, "time_per_iteration": 2.560612678527832 }, { "auxiliary_loss_clip": 0.01137852, "auxiliary_loss_mlp": 0.01019729, "balance_loss_clip": 1.04315889, "balance_loss_mlp": 1.01331544, "epoch": 0.9050682378404377, "flos": 20773928113920.0, "grad_norm": 1.8653675147424515, "language_loss": 0.72918928, "learning_rate": 9.36950984915823e-08, "loss": 0.75076509, "num_input_tokens_seen": 162770025, "step": 7527, "time_per_iteration": 2.4717085361480713 }, { "auxiliary_loss_clip": 0.01167262, "auxiliary_loss_mlp": 0.01025271, "balance_loss_clip": 1.0491178, "balance_loss_mlp": 1.01799929, "epoch": 0.9051884807310768, "flos": 21580015178880.0, "grad_norm": 1.7162262224571077, "language_loss": 0.6927138, "learning_rate": 9.345961169178607e-08, "loss": 0.71463913, "num_input_tokens_seen": 162789710, "step": 7528, "time_per_iteration": 2.4101996421813965 }, { "auxiliary_loss_clip": 0.01113461, "auxiliary_loss_mlp": 0.01026406, "balance_loss_clip": 1.04762328, "balance_loss_mlp": 1.01943541, "epoch": 0.9053087236217159, "flos": 21908059113600.0, "grad_norm": 1.4055299782988353, "language_loss": 0.7284615, "learning_rate": 9.322441411265081e-08, "loss": 0.74986017, "num_input_tokens_seen": 162810695, "step": 7529, "time_per_iteration": 2.531101703643799 }, { "auxiliary_loss_clip": 0.01133867, "auxiliary_loss_mlp": 0.01024464, "balance_loss_clip": 1.04462218, "balance_loss_mlp": 1.01745772, "epoch": 0.9054289665123549, "flos": 17055809544960.0, "grad_norm": 1.9134742899752655, "language_loss": 0.72953081, "learning_rate": 9.298950578985554e-08, "loss": 0.75111413, "num_input_tokens_seen": 162827770, "step": 7530, "time_per_iteration": 2.532728672027588 }, { "auxiliary_loss_clip": 0.01146184, "auxiliary_loss_mlp": 0.00762121, "balance_loss_clip": 1.04526711, "balance_loss_mlp": 1.00035715, "epoch": 0.905549209402994, "flos": 20777268078720.0, "grad_norm": 1.6917842593366994, "language_loss": 0.70836216, "learning_rate": 9.275488675903665e-08, "loss": 0.72744524, "num_input_tokens_seen": 162846715, "step": 7531, "time_per_iteration": 2.448601722717285 }, { "auxiliary_loss_clip": 0.01109131, "auxiliary_loss_mlp": 0.0102543, "balance_loss_clip": 1.04351771, "balance_loss_mlp": 1.01837349, "epoch": 0.9056694522936332, "flos": 21686813291520.0, "grad_norm": 1.9192459311768664, "language_loss": 0.73923755, "learning_rate": 9.252055705578454e-08, "loss": 0.76058316, "num_input_tokens_seen": 162866215, "step": 7532, "time_per_iteration": 2.5558061599731445 }, { "auxiliary_loss_clip": 0.01149128, "auxiliary_loss_mlp": 0.01024256, "balance_loss_clip": 1.04382646, "balance_loss_mlp": 1.017717, "epoch": 0.9057896951842722, "flos": 29569133433600.0, "grad_norm": 1.6143226344728039, "language_loss": 0.71910238, "learning_rate": 9.228651671564747e-08, "loss": 0.74083626, "num_input_tokens_seen": 162888245, "step": 7533, "time_per_iteration": 2.512439012527466 }, { "auxiliary_loss_clip": 0.01104851, "auxiliary_loss_mlp": 0.01024418, "balance_loss_clip": 1.04348254, "balance_loss_mlp": 1.01763821, "epoch": 0.9059099380749113, "flos": 27892248952320.0, "grad_norm": 1.567067747710581, "language_loss": 0.77892238, "learning_rate": 9.205276577412901e-08, "loss": 0.80021507, "num_input_tokens_seen": 162911025, "step": 7534, "time_per_iteration": 3.405759572982788 }, { "auxiliary_loss_clip": 0.0114276, "auxiliary_loss_mlp": 0.00761577, "balance_loss_clip": 1.0430876, "balance_loss_mlp": 1.00043666, "epoch": 0.9060301809655504, "flos": 17748993185280.0, "grad_norm": 2.6138948860132687, "language_loss": 0.77097189, "learning_rate": 9.181930426668905e-08, "loss": 0.79001522, "num_input_tokens_seen": 162927820, "step": 7535, "time_per_iteration": 2.4517784118652344 }, { "auxiliary_loss_clip": 0.01108211, "auxiliary_loss_mlp": 0.01023992, "balance_loss_clip": 1.04355836, "balance_loss_mlp": 1.01737297, "epoch": 0.9061504238561895, "flos": 31759432963200.0, "grad_norm": 1.5695747644015576, "language_loss": 0.67939436, "learning_rate": 9.158613222874346e-08, "loss": 0.70071638, "num_input_tokens_seen": 162949445, "step": 7536, "time_per_iteration": 4.1521477699279785 }, { "auxiliary_loss_clip": 0.01134582, "auxiliary_loss_mlp": 0.01020546, "balance_loss_clip": 1.04255736, "balance_loss_mlp": 1.0136584, "epoch": 0.9062706667468285, "flos": 20048066075520.0, "grad_norm": 1.625523697874035, "language_loss": 0.81722605, "learning_rate": 9.135324969566394e-08, "loss": 0.8387773, "num_input_tokens_seen": 162968945, "step": 7537, "time_per_iteration": 2.476557493209839 }, { "auxiliary_loss_clip": 0.0115614, "auxiliary_loss_mlp": 0.01025756, "balance_loss_clip": 1.0471549, "balance_loss_mlp": 1.01867819, "epoch": 0.9063909096374677, "flos": 18437292576000.0, "grad_norm": 1.785844906890705, "language_loss": 0.75357199, "learning_rate": 9.112065670277913e-08, "loss": 0.77539092, "num_input_tokens_seen": 162985310, "step": 7538, "time_per_iteration": 2.420950174331665 }, { "auxiliary_loss_clip": 0.01134283, "auxiliary_loss_mlp": 0.01022287, "balance_loss_clip": 1.04285014, "balance_loss_mlp": 1.01562071, "epoch": 0.9065111525281068, "flos": 33547353361920.0, "grad_norm": 2.1834346437163754, "language_loss": 0.72660363, "learning_rate": 9.088835328537303e-08, "loss": 0.7481693, "num_input_tokens_seen": 163006900, "step": 7539, "time_per_iteration": 2.586735248565674 }, { "auxiliary_loss_clip": 0.0114094, "auxiliary_loss_mlp": 0.01023267, "balance_loss_clip": 1.04530478, "balance_loss_mlp": 1.01602483, "epoch": 0.9066313954187458, "flos": 23367863750400.0, "grad_norm": 11.837360681386912, "language_loss": 0.71457005, "learning_rate": 9.065633947868568e-08, "loss": 0.73621213, "num_input_tokens_seen": 163026505, "step": 7540, "time_per_iteration": 2.5033652782440186 }, { "auxiliary_loss_clip": 0.01123452, "auxiliary_loss_mlp": 0.00761718, "balance_loss_clip": 1.04577088, "balance_loss_mlp": 1.00037241, "epoch": 0.906751638309385, "flos": 26249623067520.0, "grad_norm": 2.3554558908120646, "language_loss": 0.79849142, "learning_rate": 9.042461531791379e-08, "loss": 0.81734312, "num_input_tokens_seen": 163044925, "step": 7541, "time_per_iteration": 2.55106520652771 }, { "auxiliary_loss_clip": 0.01161096, "auxiliary_loss_mlp": 0.01022355, "balance_loss_clip": 1.04542935, "balance_loss_mlp": 1.01575983, "epoch": 0.906871881200024, "flos": 16544477485440.0, "grad_norm": 1.7459686267517114, "language_loss": 0.78215909, "learning_rate": 9.019318083820903e-08, "loss": 0.80399358, "num_input_tokens_seen": 163063505, "step": 7542, "time_per_iteration": 2.4267797470092773 }, { "auxiliary_loss_clip": 0.01150141, "auxiliary_loss_mlp": 0.01029084, "balance_loss_clip": 1.04700756, "balance_loss_mlp": 1.02212477, "epoch": 0.9069921240906631, "flos": 24605129675520.0, "grad_norm": 1.6561832999381751, "language_loss": 0.85298896, "learning_rate": 8.996203607468045e-08, "loss": 0.87478125, "num_input_tokens_seen": 163082505, "step": 7543, "time_per_iteration": 2.4794650077819824 }, { "auxiliary_loss_clip": 0.01147579, "auxiliary_loss_mlp": 0.01025188, "balance_loss_clip": 1.04332185, "balance_loss_mlp": 1.01804733, "epoch": 0.9071123669813023, "flos": 25374731500800.0, "grad_norm": 1.503951133656035, "language_loss": 0.75568414, "learning_rate": 8.973118106239241e-08, "loss": 0.77741194, "num_input_tokens_seen": 163105110, "step": 7544, "time_per_iteration": 3.316143274307251 }, { "auxiliary_loss_clip": 0.01092663, "auxiliary_loss_mlp": 0.01027414, "balance_loss_clip": 1.03725302, "balance_loss_mlp": 1.02018106, "epoch": 0.9072326098719413, "flos": 26725798690560.0, "grad_norm": 1.9916631437492198, "language_loss": 0.94762319, "learning_rate": 8.95006158363656e-08, "loss": 0.96882391, "num_input_tokens_seen": 163125295, "step": 7545, "time_per_iteration": 2.6222267150878906 }, { "auxiliary_loss_clip": 0.01152857, "auxiliary_loss_mlp": 0.0102901, "balance_loss_clip": 1.04871011, "balance_loss_mlp": 1.02144623, "epoch": 0.9073528527625804, "flos": 23878800760320.0, "grad_norm": 1.8083816990157908, "language_loss": 0.77342892, "learning_rate": 8.9270340431576e-08, "loss": 0.79524761, "num_input_tokens_seen": 163144385, "step": 7546, "time_per_iteration": 2.461282253265381 }, { "auxiliary_loss_clip": 0.01153047, "auxiliary_loss_mlp": 0.01025931, "balance_loss_clip": 1.04460895, "balance_loss_mlp": 1.01890039, "epoch": 0.9074730956532195, "flos": 37852144767360.0, "grad_norm": 2.0768832205644254, "language_loss": 0.7363615, "learning_rate": 8.904035488295658e-08, "loss": 0.75815123, "num_input_tokens_seen": 163163885, "step": 7547, "time_per_iteration": 2.592989683151245 }, { "auxiliary_loss_clip": 0.01051382, "auxiliary_loss_mlp": 0.00752984, "balance_loss_clip": 1.00768936, "balance_loss_mlp": 0.99991649, "epoch": 0.9075933385438586, "flos": 65173307385600.0, "grad_norm": 0.7052276313477072, "language_loss": 0.53276253, "learning_rate": 8.881065922539632e-08, "loss": 0.55080622, "num_input_tokens_seen": 163224325, "step": 7548, "time_per_iteration": 2.9635889530181885 }, { "auxiliary_loss_clip": 0.01115727, "auxiliary_loss_mlp": 0.01018214, "balance_loss_clip": 1.04232168, "balance_loss_mlp": 1.0118897, "epoch": 0.9077135814344977, "flos": 19931571290880.0, "grad_norm": 2.2110591409263756, "language_loss": 0.73235798, "learning_rate": 8.85812534937389e-08, "loss": 0.7536974, "num_input_tokens_seen": 163242425, "step": 7549, "time_per_iteration": 2.503549337387085 }, { "auxiliary_loss_clip": 0.01157539, "auxiliary_loss_mlp": 0.01026804, "balance_loss_clip": 1.04628873, "balance_loss_mlp": 1.01960421, "epoch": 0.9078338243251368, "flos": 17529650784000.0, "grad_norm": 2.5991638556843792, "language_loss": 0.67745119, "learning_rate": 8.835213772278583e-08, "loss": 0.69929463, "num_input_tokens_seen": 163259280, "step": 7550, "time_per_iteration": 2.405254602432251 }, { "auxiliary_loss_clip": 0.01114218, "auxiliary_loss_mlp": 0.01021718, "balance_loss_clip": 1.04352665, "balance_loss_mlp": 1.0150156, "epoch": 0.9079540672157759, "flos": 28803410277120.0, "grad_norm": 1.7372053746226181, "language_loss": 0.78826368, "learning_rate": 8.812331194729373e-08, "loss": 0.809623, "num_input_tokens_seen": 163278925, "step": 7551, "time_per_iteration": 2.5577621459960938 }, { "auxiliary_loss_clip": 0.01172598, "auxiliary_loss_mlp": 0.01024896, "balance_loss_clip": 1.05264294, "balance_loss_mlp": 1.01746345, "epoch": 0.9080743101064149, "flos": 23513840622720.0, "grad_norm": 1.8450358039537986, "language_loss": 0.72374737, "learning_rate": 8.789477620197461e-08, "loss": 0.74572229, "num_input_tokens_seen": 163298450, "step": 7552, "time_per_iteration": 2.4289121627807617 }, { "auxiliary_loss_clip": 0.0113478, "auxiliary_loss_mlp": 0.01025533, "balance_loss_clip": 1.04257774, "balance_loss_mlp": 1.01837099, "epoch": 0.9081945529970541, "flos": 22778102344320.0, "grad_norm": 2.3035225049834023, "language_loss": 0.79151344, "learning_rate": 8.766653052149831e-08, "loss": 0.81311661, "num_input_tokens_seen": 163313635, "step": 7553, "time_per_iteration": 2.4660696983337402 }, { "auxiliary_loss_clip": 0.0113367, "auxiliary_loss_mlp": 0.01026882, "balance_loss_clip": 1.04270899, "balance_loss_mlp": 1.01934838, "epoch": 0.9083147958876931, "flos": 18873714821760.0, "grad_norm": 2.1813363136202804, "language_loss": 0.74388784, "learning_rate": 8.743857494048823e-08, "loss": 0.76549333, "num_input_tokens_seen": 163330450, "step": 7554, "time_per_iteration": 2.4595232009887695 }, { "auxiliary_loss_clip": 0.01122732, "auxiliary_loss_mlp": 0.01027159, "balance_loss_clip": 1.04260564, "balance_loss_mlp": 1.01972318, "epoch": 0.9084350387783322, "flos": 18909374048640.0, "grad_norm": 2.3014308346613634, "language_loss": 0.62604713, "learning_rate": 8.721090949352605e-08, "loss": 0.64754605, "num_input_tokens_seen": 163346690, "step": 7555, "time_per_iteration": 2.4897663593292236 }, { "auxiliary_loss_clip": 0.01160267, "auxiliary_loss_mlp": 0.01025307, "balance_loss_clip": 1.04845309, "balance_loss_mlp": 1.0175879, "epoch": 0.9085552816689714, "flos": 20595488325120.0, "grad_norm": 2.959404170864298, "language_loss": 0.73229825, "learning_rate": 8.698353421514793e-08, "loss": 0.75415397, "num_input_tokens_seen": 163365065, "step": 7556, "time_per_iteration": 2.4372453689575195 }, { "auxiliary_loss_clip": 0.01152229, "auxiliary_loss_mlp": 0.01025786, "balance_loss_clip": 1.04782963, "balance_loss_mlp": 1.01907396, "epoch": 0.9086755245596104, "flos": 18113163223680.0, "grad_norm": 2.5466287973693644, "language_loss": 0.80263793, "learning_rate": 8.67564491398467e-08, "loss": 0.82441807, "num_input_tokens_seen": 163382070, "step": 7557, "time_per_iteration": 2.4207801818847656 }, { "auxiliary_loss_clip": 0.01152376, "auxiliary_loss_mlp": 0.01025431, "balance_loss_clip": 1.04449224, "balance_loss_mlp": 1.01765227, "epoch": 0.9087957674502495, "flos": 19129793857920.0, "grad_norm": 1.829358981837066, "language_loss": 0.74024117, "learning_rate": 8.652965430207104e-08, "loss": 0.76201928, "num_input_tokens_seen": 163399975, "step": 7558, "time_per_iteration": 2.4300026893615723 }, { "auxiliary_loss_clip": 0.01154412, "auxiliary_loss_mlp": 0.01027573, "balance_loss_clip": 1.04613686, "balance_loss_mlp": 1.01985383, "epoch": 0.9089160103408886, "flos": 18109930999680.0, "grad_norm": 2.7621351215237477, "language_loss": 0.65728498, "learning_rate": 8.630314973622521e-08, "loss": 0.6791048, "num_input_tokens_seen": 163417520, "step": 7559, "time_per_iteration": 2.4251675605773926 }, { "auxiliary_loss_clip": 0.01148022, "auxiliary_loss_mlp": 0.01023092, "balance_loss_clip": 1.0470233, "balance_loss_mlp": 1.01640701, "epoch": 0.9090362532315277, "flos": 33364855336320.0, "grad_norm": 6.0647576622088, "language_loss": 0.7082597, "learning_rate": 8.607693547666995e-08, "loss": 0.72997075, "num_input_tokens_seen": 163440060, "step": 7560, "time_per_iteration": 3.3738601207733154 }, { "auxiliary_loss_clip": 0.01032142, "auxiliary_loss_mlp": 0.01002092, "balance_loss_clip": 1.00794005, "balance_loss_mlp": 1.00115621, "epoch": 0.9091564961221668, "flos": 71480585082240.0, "grad_norm": 0.945148805249054, "language_loss": 0.58006293, "learning_rate": 8.585101155772201e-08, "loss": 0.60040528, "num_input_tokens_seen": 163502180, "step": 7561, "time_per_iteration": 3.174197196960449 }, { "auxiliary_loss_clip": 0.01128535, "auxiliary_loss_mlp": 0.01025505, "balance_loss_clip": 1.03911686, "balance_loss_mlp": 1.01817942, "epoch": 0.9092767390128058, "flos": 24712574232960.0, "grad_norm": 1.871363762117446, "language_loss": 0.68608797, "learning_rate": 8.562537801365377e-08, "loss": 0.70762837, "num_input_tokens_seen": 163521915, "step": 7562, "time_per_iteration": 3.236593246459961 }, { "auxiliary_loss_clip": 0.0116584, "auxiliary_loss_mlp": 0.01027418, "balance_loss_clip": 1.04670584, "balance_loss_mlp": 1.02020562, "epoch": 0.909396981903445, "flos": 23586487879680.0, "grad_norm": 1.7726976425605123, "language_loss": 0.70005274, "learning_rate": 8.540003487869362e-08, "loss": 0.72198522, "num_input_tokens_seen": 163543585, "step": 7563, "time_per_iteration": 3.2606747150421143 }, { "auxiliary_loss_clip": 0.0111132, "auxiliary_loss_mlp": 0.0102229, "balance_loss_clip": 1.04052711, "balance_loss_mlp": 1.01507771, "epoch": 0.909517224794084, "flos": 23404169422080.0, "grad_norm": 1.758883586674906, "language_loss": 0.80005574, "learning_rate": 8.517498218702557e-08, "loss": 0.82139182, "num_input_tokens_seen": 163561515, "step": 7564, "time_per_iteration": 2.5211315155029297 }, { "auxiliary_loss_clip": 0.0111875, "auxiliary_loss_mlp": 0.01020598, "balance_loss_clip": 1.0425452, "balance_loss_mlp": 1.01372004, "epoch": 0.9096374676847231, "flos": 19208618254080.0, "grad_norm": 1.602903414177587, "language_loss": 0.69473499, "learning_rate": 8.49502199727905e-08, "loss": 0.71612847, "num_input_tokens_seen": 163579540, "step": 7565, "time_per_iteration": 2.50368070602417 }, { "auxiliary_loss_clip": 0.01146143, "auxiliary_loss_mlp": 0.01025401, "balance_loss_clip": 1.04178071, "balance_loss_mlp": 1.01820064, "epoch": 0.9097577105753623, "flos": 33292495388160.0, "grad_norm": 2.8865628899563283, "language_loss": 0.66459376, "learning_rate": 8.472574827008428e-08, "loss": 0.68630922, "num_input_tokens_seen": 163600425, "step": 7566, "time_per_iteration": 2.5603184700012207 }, { "auxiliary_loss_clip": 0.01149132, "auxiliary_loss_mlp": 0.01023433, "balance_loss_clip": 1.04352951, "balance_loss_mlp": 1.01647758, "epoch": 0.9098779534660013, "flos": 21906443001600.0, "grad_norm": 1.6950286568803576, "language_loss": 0.84148777, "learning_rate": 8.450156711295942e-08, "loss": 0.86321342, "num_input_tokens_seen": 163620595, "step": 7567, "time_per_iteration": 2.449101448059082 }, { "auxiliary_loss_clip": 0.01137527, "auxiliary_loss_mlp": 0.01025478, "balance_loss_clip": 1.04732513, "balance_loss_mlp": 1.01843262, "epoch": 0.9099981963566404, "flos": 25730354102400.0, "grad_norm": 1.9698055534687635, "language_loss": 0.86512679, "learning_rate": 8.427767653542383e-08, "loss": 0.88675684, "num_input_tokens_seen": 163635765, "step": 7568, "time_per_iteration": 2.5017755031585693 }, { "auxiliary_loss_clip": 0.01102267, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 1.03823709, "balance_loss_mlp": 1.0188688, "epoch": 0.9101184392472795, "flos": 21069437304960.0, "grad_norm": 2.4978980834833613, "language_loss": 0.70341295, "learning_rate": 8.405407657144125e-08, "loss": 0.72468925, "num_input_tokens_seen": 163654925, "step": 7569, "time_per_iteration": 2.5462863445281982 }, { "auxiliary_loss_clip": 0.01132208, "auxiliary_loss_mlp": 0.01023143, "balance_loss_clip": 1.04384458, "balance_loss_mlp": 1.016011, "epoch": 0.9102386821379186, "flos": 24752614919040.0, "grad_norm": 1.688640509720927, "language_loss": 0.72256434, "learning_rate": 8.383076725493232e-08, "loss": 0.74411786, "num_input_tokens_seen": 163672245, "step": 7570, "time_per_iteration": 3.277507781982422 }, { "auxiliary_loss_clip": 0.01152124, "auxiliary_loss_mlp": 0.01019254, "balance_loss_clip": 1.04558885, "balance_loss_mlp": 1.0124532, "epoch": 0.9103589250285576, "flos": 22562818179840.0, "grad_norm": 1.8136145324130062, "language_loss": 0.67824489, "learning_rate": 8.360774861977216e-08, "loss": 0.69995862, "num_input_tokens_seen": 163691365, "step": 7571, "time_per_iteration": 2.4458084106445312 }, { "auxiliary_loss_clip": 0.01135282, "auxiliary_loss_mlp": 0.01020508, "balance_loss_clip": 1.04011559, "balance_loss_mlp": 1.01348901, "epoch": 0.9104791679191968, "flos": 25373474524800.0, "grad_norm": 2.718233018534829, "language_loss": 0.74644589, "learning_rate": 8.338502069979281e-08, "loss": 0.76800382, "num_input_tokens_seen": 163711675, "step": 7572, "time_per_iteration": 2.529071807861328 }, { "auxiliary_loss_clip": 0.01153382, "auxiliary_loss_mlp": 0.01024591, "balance_loss_clip": 1.04505289, "balance_loss_mlp": 1.01723576, "epoch": 0.9105994108098359, "flos": 14426681558400.0, "grad_norm": 4.686721507726226, "language_loss": 0.80009401, "learning_rate": 8.316258352878214e-08, "loss": 0.82187378, "num_input_tokens_seen": 163728095, "step": 7573, "time_per_iteration": 2.409842014312744 }, { "auxiliary_loss_clip": 0.01155798, "auxiliary_loss_mlp": 0.0102672, "balance_loss_clip": 1.04461288, "balance_loss_mlp": 1.01912093, "epoch": 0.9107196537004749, "flos": 26718292748160.0, "grad_norm": 1.8767521077486224, "language_loss": 0.71043646, "learning_rate": 8.294043714048338e-08, "loss": 0.73226166, "num_input_tokens_seen": 163747175, "step": 7574, "time_per_iteration": 2.5055599212646484 }, { "auxiliary_loss_clip": 0.01041189, "auxiliary_loss_mlp": 0.01000876, "balance_loss_clip": 1.00678039, "balance_loss_mlp": 0.99991018, "epoch": 0.9108398965911141, "flos": 66532634703360.0, "grad_norm": 0.7496199689679823, "language_loss": 0.60471404, "learning_rate": 8.271858156859624e-08, "loss": 0.62513471, "num_input_tokens_seen": 163812545, "step": 7575, "time_per_iteration": 3.124560594558716 }, { "auxiliary_loss_clip": 0.01162867, "auxiliary_loss_mlp": 0.01018015, "balance_loss_clip": 1.04664505, "balance_loss_mlp": 1.010988, "epoch": 0.9109601394817531, "flos": 25411073086080.0, "grad_norm": 1.591850634327226, "language_loss": 0.73872173, "learning_rate": 8.249701684677557e-08, "loss": 0.76053047, "num_input_tokens_seen": 163833870, "step": 7576, "time_per_iteration": 2.4611191749572754 }, { "auxiliary_loss_clip": 0.01151638, "auxiliary_loss_mlp": 0.0102314, "balance_loss_clip": 1.04795575, "balance_loss_mlp": 1.01612711, "epoch": 0.9110803823723922, "flos": 22747794243840.0, "grad_norm": 1.786722172825009, "language_loss": 0.81441593, "learning_rate": 8.227574300863294e-08, "loss": 0.83616364, "num_input_tokens_seen": 163854040, "step": 7577, "time_per_iteration": 2.471400260925293 }, { "auxiliary_loss_clip": 0.01141203, "auxiliary_loss_mlp": 0.01021569, "balance_loss_clip": 1.04657054, "balance_loss_mlp": 1.01424098, "epoch": 0.9112006252630314, "flos": 48469924131840.0, "grad_norm": 1.819111450387425, "language_loss": 0.69685501, "learning_rate": 8.205476008773548e-08, "loss": 0.71848273, "num_input_tokens_seen": 163878040, "step": 7578, "time_per_iteration": 2.7352585792541504 }, { "auxiliary_loss_clip": 0.01115899, "auxiliary_loss_mlp": 0.01026756, "balance_loss_clip": 1.04291379, "balance_loss_mlp": 1.01962113, "epoch": 0.9113208681536704, "flos": 30009649829760.0, "grad_norm": 2.2898339699230763, "language_loss": 0.82572925, "learning_rate": 8.183406811760596e-08, "loss": 0.84715581, "num_input_tokens_seen": 163897770, "step": 7579, "time_per_iteration": 2.6108853816986084 }, { "auxiliary_loss_clip": 0.01111025, "auxiliary_loss_mlp": 0.01019578, "balance_loss_clip": 1.04013908, "balance_loss_mlp": 1.01264334, "epoch": 0.9114411110443095, "flos": 25594971742080.0, "grad_norm": 1.9316176250182733, "language_loss": 0.74161077, "learning_rate": 8.161366713172313e-08, "loss": 0.76291674, "num_input_tokens_seen": 163920160, "step": 7580, "time_per_iteration": 2.600269079208374 }, { "auxiliary_loss_clip": 0.01128745, "auxiliary_loss_mlp": 0.01025573, "balance_loss_clip": 1.04281259, "balance_loss_mlp": 1.01824141, "epoch": 0.9115613539349486, "flos": 18399729928320.0, "grad_norm": 2.692894319953287, "language_loss": 0.84322506, "learning_rate": 8.139355716352137e-08, "loss": 0.86476827, "num_input_tokens_seen": 163935000, "step": 7581, "time_per_iteration": 2.497633695602417 }, { "auxiliary_loss_clip": 0.01139455, "auxiliary_loss_mlp": 0.01026491, "balance_loss_clip": 1.04306817, "balance_loss_mlp": 1.01909161, "epoch": 0.9116815968255877, "flos": 21726171619200.0, "grad_norm": 2.2415526055878705, "language_loss": 0.70173478, "learning_rate": 8.117373824639196e-08, "loss": 0.72339422, "num_input_tokens_seen": 163955265, "step": 7582, "time_per_iteration": 2.486483573913574 }, { "auxiliary_loss_clip": 0.01060359, "auxiliary_loss_mlp": 0.01000657, "balance_loss_clip": 1.00708246, "balance_loss_mlp": 0.99976271, "epoch": 0.9118018397162267, "flos": 65363526835200.0, "grad_norm": 0.7236309186387634, "language_loss": 0.59293932, "learning_rate": 8.095421041368067e-08, "loss": 0.61354953, "num_input_tokens_seen": 164014680, "step": 7583, "time_per_iteration": 2.9243624210357666 }, { "auxiliary_loss_clip": 0.01134847, "auxiliary_loss_mlp": 0.00761967, "balance_loss_clip": 1.04453897, "balance_loss_mlp": 1.00047135, "epoch": 0.9119220826068659, "flos": 20922885815040.0, "grad_norm": 2.65117118910255, "language_loss": 0.70748264, "learning_rate": 8.073497369868999e-08, "loss": 0.72645074, "num_input_tokens_seen": 164033140, "step": 7584, "time_per_iteration": 2.4762933254241943 }, { "auxiliary_loss_clip": 0.0114606, "auxiliary_loss_mlp": 0.01023689, "balance_loss_clip": 1.04572368, "balance_loss_mlp": 1.0164355, "epoch": 0.912042325497505, "flos": 28366449327360.0, "grad_norm": 2.0049272295103755, "language_loss": 0.75278604, "learning_rate": 8.051602813467772e-08, "loss": 0.77448344, "num_input_tokens_seen": 164054995, "step": 7585, "time_per_iteration": 2.5583436489105225 }, { "auxiliary_loss_clip": 0.01155188, "auxiliary_loss_mlp": 0.01022027, "balance_loss_clip": 1.04712009, "balance_loss_mlp": 1.01530313, "epoch": 0.912162568388144, "flos": 17566782468480.0, "grad_norm": 5.615056663456039, "language_loss": 0.71047747, "learning_rate": 8.029737375485756e-08, "loss": 0.73224968, "num_input_tokens_seen": 164074225, "step": 7586, "time_per_iteration": 2.4545328617095947 }, { "auxiliary_loss_clip": 0.01165329, "auxiliary_loss_mlp": 0.01022721, "balance_loss_clip": 1.047382, "balance_loss_mlp": 1.01607215, "epoch": 0.9122828112787832, "flos": 19827897661440.0, "grad_norm": 1.737784491086537, "language_loss": 0.72438973, "learning_rate": 8.007901059239986e-08, "loss": 0.7462703, "num_input_tokens_seen": 164093505, "step": 7587, "time_per_iteration": 3.2481539249420166 }, { "auxiliary_loss_clip": 0.01136636, "auxiliary_loss_mlp": 0.01020159, "balance_loss_clip": 1.04129672, "balance_loss_mlp": 1.01310825, "epoch": 0.9124030541694222, "flos": 20813789232000.0, "grad_norm": 1.70199400515549, "language_loss": 0.80106831, "learning_rate": 7.986093868042964e-08, "loss": 0.82263625, "num_input_tokens_seen": 164113750, "step": 7588, "time_per_iteration": 2.4724841117858887 }, { "auxiliary_loss_clip": 0.0114988, "auxiliary_loss_mlp": 0.01028608, "balance_loss_clip": 1.04561639, "balance_loss_mlp": 1.02177155, "epoch": 0.9125232970600613, "flos": 25192305302400.0, "grad_norm": 2.0762520832926286, "language_loss": 0.67812824, "learning_rate": 7.964315805202826e-08, "loss": 0.69991308, "num_input_tokens_seen": 164134330, "step": 7589, "time_per_iteration": 3.219475269317627 }, { "auxiliary_loss_clip": 0.01136878, "auxiliary_loss_mlp": 0.01022026, "balance_loss_clip": 1.04485464, "balance_loss_mlp": 1.01427102, "epoch": 0.9126435399507005, "flos": 19719591177600.0, "grad_norm": 1.867345436542811, "language_loss": 0.72886568, "learning_rate": 7.942566874023304e-08, "loss": 0.75045466, "num_input_tokens_seen": 164153515, "step": 7590, "time_per_iteration": 3.209393262863159 }, { "auxiliary_loss_clip": 0.01132917, "auxiliary_loss_mlp": 0.01025876, "balance_loss_clip": 1.0413537, "balance_loss_mlp": 1.01860976, "epoch": 0.9127637828413395, "flos": 19573614305280.0, "grad_norm": 2.0016724650280557, "language_loss": 0.69446194, "learning_rate": 7.920847077803649e-08, "loss": 0.71604991, "num_input_tokens_seen": 164171305, "step": 7591, "time_per_iteration": 2.452043294906616 }, { "auxiliary_loss_clip": 0.01097696, "auxiliary_loss_mlp": 0.01024446, "balance_loss_clip": 1.03641629, "balance_loss_mlp": 1.01745701, "epoch": 0.9128840257319786, "flos": 20230635928320.0, "grad_norm": 2.003938756046427, "language_loss": 0.82037741, "learning_rate": 7.899156419838826e-08, "loss": 0.84159881, "num_input_tokens_seen": 164190275, "step": 7592, "time_per_iteration": 2.545992612838745 }, { "auxiliary_loss_clip": 0.01119745, "auxiliary_loss_mlp": 0.01023781, "balance_loss_clip": 1.04175746, "balance_loss_mlp": 1.01712632, "epoch": 0.9130042686226177, "flos": 24858658846080.0, "grad_norm": 1.837050280725676, "language_loss": 0.65535933, "learning_rate": 7.87749490341918e-08, "loss": 0.67679459, "num_input_tokens_seen": 164210550, "step": 7593, "time_per_iteration": 2.547178268432617 }, { "auxiliary_loss_clip": 0.01168361, "auxiliary_loss_mlp": 0.01024578, "balance_loss_clip": 1.0488019, "balance_loss_mlp": 1.01726437, "epoch": 0.9131245115132568, "flos": 23581747284480.0, "grad_norm": 2.1144910283486245, "language_loss": 0.8319335, "learning_rate": 7.855862531830836e-08, "loss": 0.85386288, "num_input_tokens_seen": 164226660, "step": 7594, "time_per_iteration": 2.405717134475708 }, { "auxiliary_loss_clip": 0.01148084, "auxiliary_loss_mlp": 0.01021432, "balance_loss_clip": 1.04354155, "balance_loss_mlp": 1.01450872, "epoch": 0.9132447544038959, "flos": 19931607204480.0, "grad_norm": 1.5790664591105037, "language_loss": 0.72917855, "learning_rate": 7.834259308355373e-08, "loss": 0.75087368, "num_input_tokens_seen": 164245425, "step": 7595, "time_per_iteration": 2.4460058212280273 }, { "auxiliary_loss_clip": 0.01078176, "auxiliary_loss_mlp": 0.01024884, "balance_loss_clip": 1.03630245, "balance_loss_mlp": 1.01792192, "epoch": 0.9133649972945349, "flos": 21981747864960.0, "grad_norm": 2.0350844150389253, "language_loss": 0.75049615, "learning_rate": 7.812685236269989e-08, "loss": 0.77152681, "num_input_tokens_seen": 164264085, "step": 7596, "time_per_iteration": 2.6115055084228516 }, { "auxiliary_loss_clip": 0.01026284, "auxiliary_loss_mlp": 0.01002656, "balance_loss_clip": 1.00898743, "balance_loss_mlp": 1.00182152, "epoch": 0.9134852401851741, "flos": 71240523511680.0, "grad_norm": 0.7881695381492786, "language_loss": 0.58615732, "learning_rate": 7.791140318847445e-08, "loss": 0.60644674, "num_input_tokens_seen": 164322220, "step": 7597, "time_per_iteration": 3.8404805660247803 }, { "auxiliary_loss_clip": 0.01134291, "auxiliary_loss_mlp": 0.01019556, "balance_loss_clip": 1.04687822, "balance_loss_mlp": 1.01305056, "epoch": 0.9136054830758131, "flos": 23626923615360.0, "grad_norm": 1.4800515167768031, "language_loss": 0.80263603, "learning_rate": 7.769624559356081e-08, "loss": 0.82417446, "num_input_tokens_seen": 164345615, "step": 7598, "time_per_iteration": 2.5635030269622803 }, { "auxiliary_loss_clip": 0.01149581, "auxiliary_loss_mlp": 0.01024296, "balance_loss_clip": 1.04385662, "balance_loss_mlp": 1.01632106, "epoch": 0.9137257259664522, "flos": 23438858981760.0, "grad_norm": 3.999870129167285, "language_loss": 0.75410414, "learning_rate": 7.748137961059842e-08, "loss": 0.77584291, "num_input_tokens_seen": 164359595, "step": 7599, "time_per_iteration": 2.4458670616149902 }, { "auxiliary_loss_clip": 0.0116198, "auxiliary_loss_mlp": 0.01024343, "balance_loss_clip": 1.04683089, "balance_loss_mlp": 1.01709819, "epoch": 0.9138459688570914, "flos": 19127854523520.0, "grad_norm": 2.5979285564238825, "language_loss": 0.65739489, "learning_rate": 7.726680527218211e-08, "loss": 0.67925811, "num_input_tokens_seen": 164376635, "step": 7600, "time_per_iteration": 2.3873424530029297 }, { "auxiliary_loss_clip": 0.01164609, "auxiliary_loss_mlp": 0.01022347, "balance_loss_clip": 1.04517794, "balance_loss_mlp": 1.01536131, "epoch": 0.9139662117477304, "flos": 46281240714240.0, "grad_norm": 1.6979897915311042, "language_loss": 0.75865489, "learning_rate": 7.70525226108627e-08, "loss": 0.78052443, "num_input_tokens_seen": 164400305, "step": 7601, "time_per_iteration": 2.619725465774536 }, { "auxiliary_loss_clip": 0.01152321, "auxiliary_loss_mlp": 0.01026129, "balance_loss_clip": 1.04807508, "balance_loss_mlp": 1.01919091, "epoch": 0.9140864546383695, "flos": 22273198819200.0, "grad_norm": 2.168382931186185, "language_loss": 0.79799485, "learning_rate": 7.683853165914666e-08, "loss": 0.81977934, "num_input_tokens_seen": 164418075, "step": 7602, "time_per_iteration": 2.4384236335754395 }, { "auxiliary_loss_clip": 0.01110692, "auxiliary_loss_mlp": 0.01025256, "balance_loss_clip": 1.04312313, "balance_loss_mlp": 1.01855612, "epoch": 0.9142066975290086, "flos": 17530009920000.0, "grad_norm": 1.9916114291657985, "language_loss": 0.77349365, "learning_rate": 7.662483244949602e-08, "loss": 0.79485315, "num_input_tokens_seen": 164435335, "step": 7603, "time_per_iteration": 2.5221846103668213 }, { "auxiliary_loss_clip": 0.01115678, "auxiliary_loss_mlp": 0.01019379, "balance_loss_clip": 1.04257107, "balance_loss_mlp": 1.01239026, "epoch": 0.9143269404196477, "flos": 17712148809600.0, "grad_norm": 2.980415360896527, "language_loss": 0.80810434, "learning_rate": 7.641142501432951e-08, "loss": 0.8294549, "num_input_tokens_seen": 164451530, "step": 7604, "time_per_iteration": 2.4473040103912354 }, { "auxiliary_loss_clip": 0.01131348, "auxiliary_loss_mlp": 0.01023347, "balance_loss_clip": 1.04267859, "balance_loss_mlp": 1.01647472, "epoch": 0.9144471833102867, "flos": 33323414019840.0, "grad_norm": 1.8313251872224725, "language_loss": 0.73798144, "learning_rate": 7.619830938602013e-08, "loss": 0.7595284, "num_input_tokens_seen": 164472755, "step": 7605, "time_per_iteration": 2.581111431121826 }, { "auxiliary_loss_clip": 0.01145574, "auxiliary_loss_mlp": 0.01022238, "balance_loss_clip": 1.04453278, "balance_loss_mlp": 1.01508856, "epoch": 0.9145674262009259, "flos": 21068970428160.0, "grad_norm": 2.413515252292556, "language_loss": 0.82065606, "learning_rate": 7.598548559689777e-08, "loss": 0.84233415, "num_input_tokens_seen": 164491155, "step": 7606, "time_per_iteration": 2.441040277481079 }, { "auxiliary_loss_clip": 0.01117324, "auxiliary_loss_mlp": 0.01022258, "balance_loss_clip": 1.04175425, "balance_loss_mlp": 1.01544833, "epoch": 0.914687669091565, "flos": 16800269212800.0, "grad_norm": 6.175178509466248, "language_loss": 0.80802983, "learning_rate": 7.577295367924751e-08, "loss": 0.82942563, "num_input_tokens_seen": 164507555, "step": 7607, "time_per_iteration": 2.474886655807495 }, { "auxiliary_loss_clip": 0.01141552, "auxiliary_loss_mlp": 0.0102126, "balance_loss_clip": 1.04582965, "balance_loss_mlp": 1.0139823, "epoch": 0.914807911982204, "flos": 25773627012480.0, "grad_norm": 1.7403130139530418, "language_loss": 0.82291836, "learning_rate": 7.556071366531002e-08, "loss": 0.8445465, "num_input_tokens_seen": 164528525, "step": 7608, "time_per_iteration": 2.517787218093872 }, { "auxiliary_loss_clip": 0.01150925, "auxiliary_loss_mlp": 0.01024437, "balance_loss_clip": 1.04643726, "balance_loss_mlp": 1.01704049, "epoch": 0.9149281548728432, "flos": 19208043636480.0, "grad_norm": 4.017934441258296, "language_loss": 0.79343176, "learning_rate": 7.53487655872822e-08, "loss": 0.81518531, "num_input_tokens_seen": 164547695, "step": 7609, "time_per_iteration": 2.4374399185180664 }, { "auxiliary_loss_clip": 0.01110623, "auxiliary_loss_mlp": 0.01022692, "balance_loss_clip": 1.03855669, "balance_loss_mlp": 1.01514578, "epoch": 0.9150483977634822, "flos": 26870554500480.0, "grad_norm": 1.8176322488121996, "language_loss": 0.73922813, "learning_rate": 7.513710947731656e-08, "loss": 0.76056129, "num_input_tokens_seen": 164568905, "step": 7610, "time_per_iteration": 2.56787109375 }, { "auxiliary_loss_clip": 0.01129543, "auxiliary_loss_mlp": 0.01024964, "balance_loss_clip": 1.04334033, "balance_loss_mlp": 1.01781785, "epoch": 0.9151686406541213, "flos": 21908956953600.0, "grad_norm": 1.950405755254037, "language_loss": 0.85082835, "learning_rate": 7.492574536752095e-08, "loss": 0.87237346, "num_input_tokens_seen": 164588895, "step": 7611, "time_per_iteration": 2.4774608612060547 }, { "auxiliary_loss_clip": 0.01136622, "auxiliary_loss_mlp": 0.0102584, "balance_loss_clip": 1.04433525, "balance_loss_mlp": 1.01925111, "epoch": 0.9152888835447605, "flos": 27308556944640.0, "grad_norm": 1.9405805394408722, "language_loss": 0.78302896, "learning_rate": 7.471467328995907e-08, "loss": 0.80465358, "num_input_tokens_seen": 164607705, "step": 7612, "time_per_iteration": 2.5159993171691895 }, { "auxiliary_loss_clip": 0.0108466, "auxiliary_loss_mlp": 0.0102145, "balance_loss_clip": 1.0383029, "balance_loss_mlp": 1.01409447, "epoch": 0.9154091264353995, "flos": 13370728510080.0, "grad_norm": 2.288525583068321, "language_loss": 0.6045481, "learning_rate": 7.450389327665018e-08, "loss": 0.62560916, "num_input_tokens_seen": 164625540, "step": 7613, "time_per_iteration": 2.595487117767334 }, { "auxiliary_loss_clip": 0.01127667, "auxiliary_loss_mlp": 0.01027012, "balance_loss_clip": 1.0492301, "balance_loss_mlp": 1.01963925, "epoch": 0.9155293693260386, "flos": 20193037367040.0, "grad_norm": 3.6124289286554423, "language_loss": 0.67764872, "learning_rate": 7.429340535957029e-08, "loss": 0.6991955, "num_input_tokens_seen": 164640735, "step": 7614, "time_per_iteration": 3.337618589401245 }, { "auxiliary_loss_clip": 0.01136766, "auxiliary_loss_mlp": 0.01025526, "balance_loss_clip": 1.04304767, "balance_loss_mlp": 1.01893091, "epoch": 0.9156496122166777, "flos": 19354990176000.0, "grad_norm": 2.188030475064479, "language_loss": 0.70678705, "learning_rate": 7.40832095706494e-08, "loss": 0.72841001, "num_input_tokens_seen": 164657430, "step": 7615, "time_per_iteration": 3.2225139141082764 }, { "auxiliary_loss_clip": 0.01129649, "auxiliary_loss_mlp": 0.01028783, "balance_loss_clip": 1.04544044, "balance_loss_mlp": 1.02209473, "epoch": 0.9157698551073168, "flos": 21107287261440.0, "grad_norm": 1.8147881568414153, "language_loss": 0.79950982, "learning_rate": 7.387330594177443e-08, "loss": 0.82109416, "num_input_tokens_seen": 164679505, "step": 7616, "time_per_iteration": 3.3735921382904053 }, { "auxiliary_loss_clip": 0.01118296, "auxiliary_loss_mlp": 0.01027594, "balance_loss_clip": 1.04283071, "balance_loss_mlp": 1.02089691, "epoch": 0.9158900979979558, "flos": 25193167228800.0, "grad_norm": 1.6284707456835912, "language_loss": 0.79017949, "learning_rate": 7.366369450478749e-08, "loss": 0.81163836, "num_input_tokens_seen": 164700615, "step": 7617, "time_per_iteration": 2.54510235786438 }, { "auxiliary_loss_clip": 0.01118042, "auxiliary_loss_mlp": 0.01026774, "balance_loss_clip": 1.04187059, "balance_loss_mlp": 1.01980066, "epoch": 0.916010340888595, "flos": 30146648302080.0, "grad_norm": 1.5856731105762434, "language_loss": 0.66581762, "learning_rate": 7.345437529148646e-08, "loss": 0.68726575, "num_input_tokens_seen": 164719625, "step": 7618, "time_per_iteration": 2.5736329555511475 }, { "auxiliary_loss_clip": 0.01121513, "auxiliary_loss_mlp": 0.01030444, "balance_loss_clip": 1.04237258, "balance_loss_mlp": 1.02339554, "epoch": 0.9161305837792341, "flos": 17091827907840.0, "grad_norm": 1.8377405237655395, "language_loss": 0.72814631, "learning_rate": 7.324534833362483e-08, "loss": 0.74966586, "num_input_tokens_seen": 164737200, "step": 7619, "time_per_iteration": 2.4841952323913574 }, { "auxiliary_loss_clip": 0.01137303, "auxiliary_loss_mlp": 0.01023393, "balance_loss_clip": 1.04526663, "balance_loss_mlp": 1.01624596, "epoch": 0.9162508266698731, "flos": 22893699288960.0, "grad_norm": 1.78582716112665, "language_loss": 0.68454885, "learning_rate": 7.303661366291192e-08, "loss": 0.70615578, "num_input_tokens_seen": 164757870, "step": 7620, "time_per_iteration": 2.5334620475769043 }, { "auxiliary_loss_clip": 0.01107017, "auxiliary_loss_mlp": 0.01022593, "balance_loss_clip": 1.04087758, "balance_loss_mlp": 1.0158217, "epoch": 0.9163710695605123, "flos": 19974808287360.0, "grad_norm": 1.7482009389320294, "language_loss": 0.81431764, "learning_rate": 7.28281713110126e-08, "loss": 0.83561373, "num_input_tokens_seen": 164775945, "step": 7621, "time_per_iteration": 2.546293258666992 }, { "auxiliary_loss_clip": 0.01134049, "auxiliary_loss_mlp": 0.01024958, "balance_loss_clip": 1.0453943, "balance_loss_mlp": 1.0179249, "epoch": 0.9164913124511513, "flos": 22783812606720.0, "grad_norm": 1.9266749360005215, "language_loss": 0.77157038, "learning_rate": 7.262002130954759e-08, "loss": 0.79316044, "num_input_tokens_seen": 164794400, "step": 7622, "time_per_iteration": 2.507571220397949 }, { "auxiliary_loss_clip": 0.01113115, "auxiliary_loss_mlp": 0.01027043, "balance_loss_clip": 1.04219127, "balance_loss_mlp": 1.01950264, "epoch": 0.9166115553417904, "flos": 24900854348160.0, "grad_norm": 1.6920187560768574, "language_loss": 0.78658438, "learning_rate": 7.241216369009296e-08, "loss": 0.80798596, "num_input_tokens_seen": 164814585, "step": 7623, "time_per_iteration": 3.3467843532562256 }, { "auxiliary_loss_clip": 0.01162854, "auxiliary_loss_mlp": 0.010208, "balance_loss_clip": 1.0452739, "balance_loss_mlp": 1.01393938, "epoch": 0.9167317982324296, "flos": 25702919089920.0, "grad_norm": 2.8370637099199048, "language_loss": 0.66368234, "learning_rate": 7.220459848418037e-08, "loss": 0.68551886, "num_input_tokens_seen": 164834660, "step": 7624, "time_per_iteration": 2.4610095024108887 }, { "auxiliary_loss_clip": 0.0116212, "auxiliary_loss_mlp": 0.01021668, "balance_loss_clip": 1.04705691, "balance_loss_mlp": 1.01495612, "epoch": 0.9168520411230686, "flos": 15632813370240.0, "grad_norm": 1.6681159869818696, "language_loss": 0.79513001, "learning_rate": 7.199732572329708e-08, "loss": 0.8169679, "num_input_tokens_seen": 164852560, "step": 7625, "time_per_iteration": 2.41252064704895 }, { "auxiliary_loss_clip": 0.01126649, "auxiliary_loss_mlp": 0.01027555, "balance_loss_clip": 1.04299486, "balance_loss_mlp": 1.02026534, "epoch": 0.9169722840137077, "flos": 30258151096320.0, "grad_norm": 2.2095812584423653, "language_loss": 0.75729418, "learning_rate": 7.179034543888684e-08, "loss": 0.77883625, "num_input_tokens_seen": 164872065, "step": 7626, "time_per_iteration": 2.5752599239349365 }, { "auxiliary_loss_clip": 0.011522, "auxiliary_loss_mlp": 0.01025421, "balance_loss_clip": 1.04423654, "balance_loss_mlp": 1.01856041, "epoch": 0.9170925269043467, "flos": 22491643380480.0, "grad_norm": 2.582222675900588, "language_loss": 0.77464998, "learning_rate": 7.158365766234808e-08, "loss": 0.79642618, "num_input_tokens_seen": 164890915, "step": 7627, "time_per_iteration": 2.4673123359680176 }, { "auxiliary_loss_clip": 0.01113003, "auxiliary_loss_mlp": 0.0102329, "balance_loss_clip": 1.03824651, "balance_loss_mlp": 1.0156486, "epoch": 0.9172127697949859, "flos": 22893914770560.0, "grad_norm": 1.9622509135071753, "language_loss": 0.72362083, "learning_rate": 7.137726242503527e-08, "loss": 0.74498373, "num_input_tokens_seen": 164909835, "step": 7628, "time_per_iteration": 2.513026714324951 }, { "auxiliary_loss_clip": 0.01149513, "auxiliary_loss_mlp": 0.00762049, "balance_loss_clip": 1.04580176, "balance_loss_mlp": 1.00047135, "epoch": 0.917333012685625, "flos": 17451867882240.0, "grad_norm": 2.3189796566703813, "language_loss": 0.77865136, "learning_rate": 7.11711597582585e-08, "loss": 0.79776692, "num_input_tokens_seen": 164927195, "step": 7629, "time_per_iteration": 2.4811604022979736 }, { "auxiliary_loss_clip": 0.01121428, "auxiliary_loss_mlp": 0.01020991, "balance_loss_clip": 1.03957272, "balance_loss_mlp": 1.01472926, "epoch": 0.917453255576264, "flos": 14318949692160.0, "grad_norm": 1.819354923664696, "language_loss": 0.79972935, "learning_rate": 7.096534969328271e-08, "loss": 0.82115358, "num_input_tokens_seen": 164944640, "step": 7630, "time_per_iteration": 2.476349115371704 }, { "auxiliary_loss_clip": 0.01140394, "auxiliary_loss_mlp": 0.01021659, "balance_loss_clip": 1.04276109, "balance_loss_mlp": 1.01481354, "epoch": 0.9175734984669032, "flos": 20741177888640.0, "grad_norm": 2.535046132391288, "language_loss": 0.83909428, "learning_rate": 7.075983226132987e-08, "loss": 0.86071479, "num_input_tokens_seen": 164963570, "step": 7631, "time_per_iteration": 2.4864015579223633 }, { "auxiliary_loss_clip": 0.01139387, "auxiliary_loss_mlp": 0.0076235, "balance_loss_clip": 1.04189718, "balance_loss_mlp": 1.00040793, "epoch": 0.9176937413575422, "flos": 14830497233280.0, "grad_norm": 2.752170995558212, "language_loss": 0.7917909, "learning_rate": 7.055460749357656e-08, "loss": 0.8108083, "num_input_tokens_seen": 164979850, "step": 7632, "time_per_iteration": 2.43985915184021 }, { "auxiliary_loss_clip": 0.01137293, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.04539037, "balance_loss_mlp": 1.0190413, "epoch": 0.9178139842481813, "flos": 18474603828480.0, "grad_norm": 1.6268610325364654, "language_loss": 0.70351678, "learning_rate": 7.034967542115521e-08, "loss": 0.7251544, "num_input_tokens_seen": 164998115, "step": 7633, "time_per_iteration": 2.4819447994232178 }, { "auxiliary_loss_clip": 0.01140377, "auxiliary_loss_mlp": 0.00761638, "balance_loss_clip": 1.04300833, "balance_loss_mlp": 1.00050187, "epoch": 0.9179342271388204, "flos": 20047455544320.0, "grad_norm": 2.1005244022234226, "language_loss": 0.75454772, "learning_rate": 7.014503607515388e-08, "loss": 0.77356791, "num_input_tokens_seen": 165017420, "step": 7634, "time_per_iteration": 2.4153895378112793 }, { "auxiliary_loss_clip": 0.01137467, "auxiliary_loss_mlp": 0.01028589, "balance_loss_clip": 1.04749775, "balance_loss_mlp": 1.02157021, "epoch": 0.9180544700294595, "flos": 24676232647680.0, "grad_norm": 2.333512025323974, "language_loss": 0.68532926, "learning_rate": 6.994068948661592e-08, "loss": 0.70698977, "num_input_tokens_seen": 165035575, "step": 7635, "time_per_iteration": 2.519930362701416 }, { "auxiliary_loss_clip": 0.01150302, "auxiliary_loss_mlp": 0.01024838, "balance_loss_clip": 1.04553819, "balance_loss_mlp": 1.01688635, "epoch": 0.9181747129200986, "flos": 16727478301440.0, "grad_norm": 2.31083351932169, "language_loss": 0.7637673, "learning_rate": 6.973663568654142e-08, "loss": 0.78551865, "num_input_tokens_seen": 165053280, "step": 7636, "time_per_iteration": 2.4259276390075684 }, { "auxiliary_loss_clip": 0.01163957, "auxiliary_loss_mlp": 0.0102963, "balance_loss_clip": 1.0475173, "balance_loss_mlp": 1.02247739, "epoch": 0.9182949558107377, "flos": 24271626873600.0, "grad_norm": 2.4403909420445866, "language_loss": 0.65316081, "learning_rate": 6.953287470588386e-08, "loss": 0.67509669, "num_input_tokens_seen": 165071235, "step": 7637, "time_per_iteration": 2.438974380493164 }, { "auxiliary_loss_clip": 0.01154247, "auxiliary_loss_mlp": 0.01024416, "balance_loss_clip": 1.04480255, "balance_loss_mlp": 1.01719177, "epoch": 0.9184151987013768, "flos": 22082117443200.0, "grad_norm": 2.2356865436449294, "language_loss": 0.86101222, "learning_rate": 6.932940657555452e-08, "loss": 0.88279891, "num_input_tokens_seen": 165087365, "step": 7638, "time_per_iteration": 2.4101572036743164 }, { "auxiliary_loss_clip": 0.01158337, "auxiliary_loss_mlp": 0.01021286, "balance_loss_clip": 1.04486382, "balance_loss_mlp": 1.0150187, "epoch": 0.9185354415920158, "flos": 32166732257280.0, "grad_norm": 1.3898957626803734, "language_loss": 0.76542276, "learning_rate": 6.912623132641938e-08, "loss": 0.78721899, "num_input_tokens_seen": 165112455, "step": 7639, "time_per_iteration": 2.5528690814971924 }, { "auxiliary_loss_clip": 0.01138677, "auxiliary_loss_mlp": 0.01027123, "balance_loss_clip": 1.04500747, "balance_loss_mlp": 1.01977324, "epoch": 0.918655684482655, "flos": 20997831542400.0, "grad_norm": 1.7388803268277686, "language_loss": 0.76233828, "learning_rate": 6.892334898929952e-08, "loss": 0.78399622, "num_input_tokens_seen": 165132700, "step": 7640, "time_per_iteration": 3.331432819366455 }, { "auxiliary_loss_clip": 0.01144526, "auxiliary_loss_mlp": 0.01024127, "balance_loss_clip": 1.04361367, "balance_loss_mlp": 1.01712942, "epoch": 0.918775927373294, "flos": 15560704817280.0, "grad_norm": 4.224198701075811, "language_loss": 0.84922671, "learning_rate": 6.872075959497236e-08, "loss": 0.87091327, "num_input_tokens_seen": 165151475, "step": 7641, "time_per_iteration": 2.4294159412384033 }, { "auxiliary_loss_clip": 0.01151841, "auxiliary_loss_mlp": 0.0102423, "balance_loss_clip": 1.04352629, "balance_loss_mlp": 1.01765573, "epoch": 0.9188961702639331, "flos": 29934057657600.0, "grad_norm": 1.834512383712307, "language_loss": 0.82921076, "learning_rate": 6.85184631741702e-08, "loss": 0.85097146, "num_input_tokens_seen": 165172040, "step": 7642, "time_per_iteration": 3.274141550064087 }, { "auxiliary_loss_clip": 0.01149807, "auxiliary_loss_mlp": 0.01022051, "balance_loss_clip": 1.0444268, "balance_loss_mlp": 1.01463604, "epoch": 0.9190164131545723, "flos": 20701244943360.0, "grad_norm": 2.053465059566003, "language_loss": 0.77197266, "learning_rate": 6.831645975758161e-08, "loss": 0.79369128, "num_input_tokens_seen": 165189980, "step": 7643, "time_per_iteration": 3.158571243286133 }, { "auxiliary_loss_clip": 0.01131273, "auxiliary_loss_mlp": 0.01025614, "balance_loss_clip": 1.04436064, "balance_loss_mlp": 1.0187242, "epoch": 0.9191366560452113, "flos": 25629912696960.0, "grad_norm": 1.8956614170926007, "language_loss": 0.67449749, "learning_rate": 6.811474937585026e-08, "loss": 0.69606632, "num_input_tokens_seen": 165209770, "step": 7644, "time_per_iteration": 2.4951059818267822 }, { "auxiliary_loss_clip": 0.0111854, "auxiliary_loss_mlp": 0.01022489, "balance_loss_clip": 1.04157495, "balance_loss_mlp": 1.01566386, "epoch": 0.9192568989358504, "flos": 21434325615360.0, "grad_norm": 1.7229902207045182, "language_loss": 0.79311526, "learning_rate": 6.79133320595755e-08, "loss": 0.81452554, "num_input_tokens_seen": 165229690, "step": 7645, "time_per_iteration": 2.5088160037994385 }, { "auxiliary_loss_clip": 0.01122449, "auxiliary_loss_mlp": 0.00761736, "balance_loss_clip": 1.04321325, "balance_loss_mlp": 1.00047684, "epoch": 0.9193771418264896, "flos": 23185078416000.0, "grad_norm": 1.6575532127398032, "language_loss": 0.75533366, "learning_rate": 6.771220783931198e-08, "loss": 0.77417552, "num_input_tokens_seen": 165249850, "step": 7646, "time_per_iteration": 2.5735199451446533 }, { "auxiliary_loss_clip": 0.01003497, "auxiliary_loss_mlp": 0.0100263, "balance_loss_clip": 1.02024174, "balance_loss_mlp": 1.00112808, "epoch": 0.9194973847171286, "flos": 70582963184640.0, "grad_norm": 0.9037482616793509, "language_loss": 0.64680159, "learning_rate": 6.751137674556994e-08, "loss": 0.66686285, "num_input_tokens_seen": 165310235, "step": 7647, "time_per_iteration": 3.3206405639648438 }, { "auxiliary_loss_clip": 0.01151934, "auxiliary_loss_mlp": 0.01020698, "balance_loss_clip": 1.04307282, "balance_loss_mlp": 1.01377749, "epoch": 0.9196176276077677, "flos": 14720682378240.0, "grad_norm": 2.176927481258253, "language_loss": 0.77329636, "learning_rate": 6.731083880881572e-08, "loss": 0.79502273, "num_input_tokens_seen": 165326455, "step": 7648, "time_per_iteration": 2.616607666015625 }, { "auxiliary_loss_clip": 0.01136327, "auxiliary_loss_mlp": 0.01021587, "balance_loss_clip": 1.04371262, "balance_loss_mlp": 1.01500916, "epoch": 0.9197378704984068, "flos": 23294893271040.0, "grad_norm": 2.090158422838104, "language_loss": 0.8082872, "learning_rate": 6.711059405947072e-08, "loss": 0.82986629, "num_input_tokens_seen": 165344645, "step": 7649, "time_per_iteration": 2.494624137878418 }, { "auxiliary_loss_clip": 0.01119597, "auxiliary_loss_mlp": 0.01022311, "balance_loss_clip": 1.0427438, "balance_loss_mlp": 1.01512527, "epoch": 0.9198581133890459, "flos": 20302564913280.0, "grad_norm": 2.082361382738563, "language_loss": 0.77008837, "learning_rate": 6.691064252791156e-08, "loss": 0.79150748, "num_input_tokens_seen": 165364120, "step": 7650, "time_per_iteration": 3.2656569480895996 }, { "auxiliary_loss_clip": 0.01102726, "auxiliary_loss_mlp": 0.01024218, "balance_loss_clip": 1.0415051, "balance_loss_mlp": 1.01702416, "epoch": 0.9199783562796849, "flos": 17675663569920.0, "grad_norm": 1.6669751251756852, "language_loss": 0.77895582, "learning_rate": 6.67109842444713e-08, "loss": 0.80022526, "num_input_tokens_seen": 165383050, "step": 7651, "time_per_iteration": 2.512842893600464 }, { "auxiliary_loss_clip": 0.01153345, "auxiliary_loss_mlp": 0.00762696, "balance_loss_clip": 1.05007184, "balance_loss_mlp": 1.00045204, "epoch": 0.9200985991703241, "flos": 17676022705920.0, "grad_norm": 1.991096165951223, "language_loss": 0.76562107, "learning_rate": 6.651161923943704e-08, "loss": 0.7847814, "num_input_tokens_seen": 165400955, "step": 7652, "time_per_iteration": 2.4329683780670166 }, { "auxiliary_loss_clip": 0.01146242, "auxiliary_loss_mlp": 0.01027349, "balance_loss_clip": 1.04374027, "balance_loss_mlp": 1.0198952, "epoch": 0.9202188420609632, "flos": 20996574566400.0, "grad_norm": 2.1668702760713856, "language_loss": 0.77489841, "learning_rate": 6.631254754305326e-08, "loss": 0.79663432, "num_input_tokens_seen": 165420415, "step": 7653, "time_per_iteration": 2.442662477493286 }, { "auxiliary_loss_clip": 0.01167091, "auxiliary_loss_mlp": 0.0102357, "balance_loss_clip": 1.04695463, "balance_loss_mlp": 1.01641774, "epoch": 0.9203390849516022, "flos": 13918222586880.0, "grad_norm": 2.497854279128939, "language_loss": 0.77975565, "learning_rate": 6.611376918551848e-08, "loss": 0.80166227, "num_input_tokens_seen": 165439200, "step": 7654, "time_per_iteration": 2.3864758014678955 }, { "auxiliary_loss_clip": 0.01121223, "auxiliary_loss_mlp": 0.00762089, "balance_loss_clip": 1.041278, "balance_loss_mlp": 1.00042582, "epoch": 0.9204593278422414, "flos": 21175912195200.0, "grad_norm": 2.0443332873869093, "language_loss": 0.79564846, "learning_rate": 6.591528419698744e-08, "loss": 0.81448162, "num_input_tokens_seen": 165458985, "step": 7655, "time_per_iteration": 2.5202560424804688 }, { "auxiliary_loss_clip": 0.01139449, "auxiliary_loss_mlp": 0.01025383, "balance_loss_clip": 1.04314375, "balance_loss_mlp": 1.01906824, "epoch": 0.9205795707328804, "flos": 14501375890560.0, "grad_norm": 2.36874502198062, "language_loss": 0.83454204, "learning_rate": 6.571709260756986e-08, "loss": 0.85619038, "num_input_tokens_seen": 165475630, "step": 7656, "time_per_iteration": 2.436110734939575 }, { "auxiliary_loss_clip": 0.01155873, "auxiliary_loss_mlp": 0.01028997, "balance_loss_clip": 1.05001628, "balance_loss_mlp": 1.02145672, "epoch": 0.9206998136235195, "flos": 22417559579520.0, "grad_norm": 2.944361512186727, "language_loss": 0.76342273, "learning_rate": 6.551919444733122e-08, "loss": 0.78527141, "num_input_tokens_seen": 165493445, "step": 7657, "time_per_iteration": 2.427248954772949 }, { "auxiliary_loss_clip": 0.01136374, "auxiliary_loss_mlp": 0.01025271, "balance_loss_clip": 1.04570186, "balance_loss_mlp": 1.01783276, "epoch": 0.9208200565141585, "flos": 53358407544960.0, "grad_norm": 3.6060789658169665, "language_loss": 0.65963018, "learning_rate": 6.53215897462931e-08, "loss": 0.68124658, "num_input_tokens_seen": 165517200, "step": 7658, "time_per_iteration": 2.7555553913116455 }, { "auxiliary_loss_clip": 0.01148493, "auxiliary_loss_mlp": 0.01027063, "balance_loss_clip": 1.04461527, "balance_loss_mlp": 1.01982403, "epoch": 0.9209402994047977, "flos": 30589139946240.0, "grad_norm": 2.2499708108735508, "language_loss": 0.74821383, "learning_rate": 6.512427853443103e-08, "loss": 0.76996934, "num_input_tokens_seen": 165539280, "step": 7659, "time_per_iteration": 2.517777919769287 }, { "auxiliary_loss_clip": 0.0115339, "auxiliary_loss_mlp": 0.01019071, "balance_loss_clip": 1.04559326, "balance_loss_mlp": 1.01173377, "epoch": 0.9210605422954368, "flos": 29132711187840.0, "grad_norm": 1.6276872320456355, "language_loss": 0.75560385, "learning_rate": 6.492726084167799e-08, "loss": 0.77732849, "num_input_tokens_seen": 165561395, "step": 7660, "time_per_iteration": 2.506317138671875 }, { "auxiliary_loss_clip": 0.01060779, "auxiliary_loss_mlp": 0.0100092, "balance_loss_clip": 1.00725389, "balance_loss_mlp": 1.00006723, "epoch": 0.9211807851860758, "flos": 54853838472960.0, "grad_norm": 0.7772790524315556, "language_loss": 0.57487422, "learning_rate": 6.473053669792072e-08, "loss": 0.59549117, "num_input_tokens_seen": 165616085, "step": 7661, "time_per_iteration": 2.8749241828918457 }, { "auxiliary_loss_clip": 0.01148598, "auxiliary_loss_mlp": 0.0102397, "balance_loss_clip": 1.04341173, "balance_loss_mlp": 1.01619124, "epoch": 0.921301028076715, "flos": 19201974238080.0, "grad_norm": 6.1458698094312405, "language_loss": 0.72845161, "learning_rate": 6.453410613300248e-08, "loss": 0.75017726, "num_input_tokens_seen": 165634015, "step": 7662, "time_per_iteration": 2.4140779972076416 }, { "auxiliary_loss_clip": 0.01094105, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.04023457, "balance_loss_mlp": 1.01992822, "epoch": 0.921421270967354, "flos": 27526893765120.0, "grad_norm": 1.671496699512302, "language_loss": 0.58453703, "learning_rate": 6.43379691767214e-08, "loss": 0.60575104, "num_input_tokens_seen": 165653220, "step": 7663, "time_per_iteration": 2.6053354740142822 }, { "auxiliary_loss_clip": 0.01022273, "auxiliary_loss_mlp": 0.01001594, "balance_loss_clip": 1.0084753, "balance_loss_mlp": 1.0007118, "epoch": 0.9215415138579931, "flos": 70209311955840.0, "grad_norm": 0.7166928675433318, "language_loss": 0.55127835, "learning_rate": 6.414212585883105e-08, "loss": 0.57151705, "num_input_tokens_seen": 165715850, "step": 7664, "time_per_iteration": 3.1619186401367188 }, { "auxiliary_loss_clip": 0.01140502, "auxiliary_loss_mlp": 0.01020532, "balance_loss_clip": 1.04497957, "balance_loss_mlp": 1.01301932, "epoch": 0.9216617567486323, "flos": 35553107790720.0, "grad_norm": 1.6033462203750817, "language_loss": 0.7004081, "learning_rate": 6.394657620904143e-08, "loss": 0.72201842, "num_input_tokens_seen": 165738960, "step": 7665, "time_per_iteration": 2.588714838027954 }, { "auxiliary_loss_clip": 0.01169377, "auxiliary_loss_mlp": 0.01026027, "balance_loss_clip": 1.04820788, "balance_loss_mlp": 1.01867187, "epoch": 0.9217819996392713, "flos": 29533330552320.0, "grad_norm": 1.7149657571500336, "language_loss": 0.72017533, "learning_rate": 6.375132025701657e-08, "loss": 0.74212939, "num_input_tokens_seen": 165761260, "step": 7666, "time_per_iteration": 2.499300956726074 }, { "auxiliary_loss_clip": 0.0116913, "auxiliary_loss_mlp": 0.01024412, "balance_loss_clip": 1.04929817, "balance_loss_mlp": 1.01709533, "epoch": 0.9219022425299104, "flos": 14574669592320.0, "grad_norm": 2.151956093164181, "language_loss": 0.69253838, "learning_rate": 6.355635803237724e-08, "loss": 0.71447384, "num_input_tokens_seen": 165776960, "step": 7667, "time_per_iteration": 3.2311604022979736 }, { "auxiliary_loss_clip": 0.01148691, "auxiliary_loss_mlp": 0.01024621, "balance_loss_clip": 1.04342127, "balance_loss_mlp": 1.01731324, "epoch": 0.9220224854205495, "flos": 18077503996800.0, "grad_norm": 1.9775197839352552, "language_loss": 0.79580146, "learning_rate": 6.336168956469867e-08, "loss": 0.81753457, "num_input_tokens_seen": 165795435, "step": 7668, "time_per_iteration": 2.414461851119995 }, { "auxiliary_loss_clip": 0.01129756, "auxiliary_loss_mlp": 0.0102533, "balance_loss_clip": 1.04336774, "balance_loss_mlp": 1.01898789, "epoch": 0.9221427283111886, "flos": 24790464875520.0, "grad_norm": 1.7744680438508844, "language_loss": 0.72014236, "learning_rate": 6.316731488351168e-08, "loss": 0.74169326, "num_input_tokens_seen": 165816625, "step": 7669, "time_per_iteration": 3.2863781452178955 }, { "auxiliary_loss_clip": 0.01150566, "auxiliary_loss_mlp": 0.01022543, "balance_loss_clip": 1.04579663, "balance_loss_mlp": 1.01539671, "epoch": 0.9222629712018277, "flos": 13845036625920.0, "grad_norm": 1.793424297577179, "language_loss": 0.635225, "learning_rate": 6.297323401830334e-08, "loss": 0.65695614, "num_input_tokens_seen": 165835410, "step": 7670, "time_per_iteration": 3.237065315246582 }, { "auxiliary_loss_clip": 0.01153341, "auxiliary_loss_mlp": 0.0102241, "balance_loss_clip": 1.04544127, "balance_loss_mlp": 1.01544249, "epoch": 0.9223832140924668, "flos": 21616177196160.0, "grad_norm": 2.8107892334190265, "language_loss": 0.69219255, "learning_rate": 6.277944699851523e-08, "loss": 0.71395004, "num_input_tokens_seen": 165854930, "step": 7671, "time_per_iteration": 2.4299607276916504 }, { "auxiliary_loss_clip": 0.01162791, "auxiliary_loss_mlp": 0.01025976, "balance_loss_clip": 1.04550004, "balance_loss_mlp": 1.01891541, "epoch": 0.9225034569831059, "flos": 21142084561920.0, "grad_norm": 2.3558633985658775, "language_loss": 0.73348355, "learning_rate": 6.25859538535447e-08, "loss": 0.75537121, "num_input_tokens_seen": 165875725, "step": 7672, "time_per_iteration": 2.3961212635040283 }, { "auxiliary_loss_clip": 0.01136135, "auxiliary_loss_mlp": 0.01022535, "balance_loss_clip": 1.04473019, "balance_loss_mlp": 1.01548386, "epoch": 0.9226236998737449, "flos": 12495046844160.0, "grad_norm": 2.4901182051713526, "language_loss": 0.78354412, "learning_rate": 6.239275461274474e-08, "loss": 0.80513084, "num_input_tokens_seen": 165892100, "step": 7673, "time_per_iteration": 2.426987886428833 }, { "auxiliary_loss_clip": 0.01152056, "auxiliary_loss_mlp": 0.01027172, "balance_loss_clip": 1.04599643, "balance_loss_mlp": 1.02035928, "epoch": 0.9227439427643841, "flos": 26214071581440.0, "grad_norm": 1.9298177639148921, "language_loss": 0.85972744, "learning_rate": 6.219984930542299e-08, "loss": 0.88151968, "num_input_tokens_seen": 165912840, "step": 7674, "time_per_iteration": 2.4786314964294434 }, { "auxiliary_loss_clip": 0.01152311, "auxiliary_loss_mlp": 0.01027124, "balance_loss_clip": 1.04498756, "balance_loss_mlp": 1.02031684, "epoch": 0.9228641856550232, "flos": 17967581400960.0, "grad_norm": 2.4581083088160924, "language_loss": 0.75962865, "learning_rate": 6.200723796084383e-08, "loss": 0.78142297, "num_input_tokens_seen": 165930935, "step": 7675, "time_per_iteration": 2.3962435722351074 }, { "auxiliary_loss_clip": 0.01035324, "auxiliary_loss_mlp": 0.01001526, "balance_loss_clip": 1.00832105, "balance_loss_mlp": 1.00054836, "epoch": 0.9229844285456622, "flos": 70420609710720.0, "grad_norm": 0.7649790507259957, "language_loss": 0.63101447, "learning_rate": 6.181492060822546e-08, "loss": 0.65138292, "num_input_tokens_seen": 165991110, "step": 7676, "time_per_iteration": 2.991464138031006 }, { "auxiliary_loss_clip": 0.01107256, "auxiliary_loss_mlp": 0.0102243, "balance_loss_clip": 1.04085457, "balance_loss_mlp": 1.01518166, "epoch": 0.9231046714363014, "flos": 17967832796160.0, "grad_norm": 2.1056106856434056, "language_loss": 0.81498748, "learning_rate": 6.162289727674274e-08, "loss": 0.8362844, "num_input_tokens_seen": 166008790, "step": 7677, "time_per_iteration": 3.2753589153289795 }, { "auxiliary_loss_clip": 0.01122257, "auxiliary_loss_mlp": 0.0102265, "balance_loss_clip": 1.04182434, "balance_loss_mlp": 1.01609945, "epoch": 0.9232249143269404, "flos": 17858233422720.0, "grad_norm": 2.154044540981909, "language_loss": 0.87877572, "learning_rate": 6.143116799552527e-08, "loss": 0.9002248, "num_input_tokens_seen": 166025035, "step": 7678, "time_per_iteration": 2.524458885192871 }, { "auxiliary_loss_clip": 0.01155175, "auxiliary_loss_mlp": 0.01022785, "balance_loss_clip": 1.04737461, "balance_loss_mlp": 1.01561451, "epoch": 0.9233451572175795, "flos": 23404384903680.0, "grad_norm": 2.3157741603411695, "language_loss": 0.55908358, "learning_rate": 6.123973279365802e-08, "loss": 0.58086324, "num_input_tokens_seen": 166044010, "step": 7679, "time_per_iteration": 2.468770742416382 }, { "auxiliary_loss_clip": 0.01155081, "auxiliary_loss_mlp": 0.01023463, "balance_loss_clip": 1.04675853, "balance_loss_mlp": 1.01700437, "epoch": 0.9234654001082186, "flos": 17999326045440.0, "grad_norm": 2.0193275203489454, "language_loss": 0.7778011, "learning_rate": 6.10485917001824e-08, "loss": 0.79958653, "num_input_tokens_seen": 166061865, "step": 7680, "time_per_iteration": 2.422494649887085 }, { "auxiliary_loss_clip": 0.0114014, "auxiliary_loss_mlp": 0.01021268, "balance_loss_clip": 1.04387641, "balance_loss_mlp": 1.01465154, "epoch": 0.9235856429988577, "flos": 24750747411840.0, "grad_norm": 1.6685948186289121, "language_loss": 0.81034184, "learning_rate": 6.085774474409322e-08, "loss": 0.83195591, "num_input_tokens_seen": 166082425, "step": 7681, "time_per_iteration": 2.503793716430664 }, { "auxiliary_loss_clip": 0.01138869, "auxiliary_loss_mlp": 0.01028445, "balance_loss_clip": 1.04790807, "balance_loss_mlp": 1.02164137, "epoch": 0.9237058858894968, "flos": 14099894599680.0, "grad_norm": 2.1480405752097687, "language_loss": 0.70128691, "learning_rate": 6.066719195434267e-08, "loss": 0.72296011, "num_input_tokens_seen": 166100225, "step": 7682, "time_per_iteration": 2.4714064598083496 }, { "auxiliary_loss_clip": 0.01153123, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.04660714, "balance_loss_mlp": 1.01844883, "epoch": 0.9238261287801359, "flos": 28694529175680.0, "grad_norm": 2.3167486633050283, "language_loss": 0.66617912, "learning_rate": 6.047693335983717e-08, "loss": 0.68796605, "num_input_tokens_seen": 166122570, "step": 7683, "time_per_iteration": 2.499300241470337 }, { "auxiliary_loss_clip": 0.01153306, "auxiliary_loss_mlp": 0.01023919, "balance_loss_clip": 1.04448271, "balance_loss_mlp": 1.01668024, "epoch": 0.923946371670775, "flos": 23111856541440.0, "grad_norm": 2.5498969938233595, "language_loss": 0.82253832, "learning_rate": 6.028696898943853e-08, "loss": 0.84431058, "num_input_tokens_seen": 166141630, "step": 7684, "time_per_iteration": 2.450551748275757 }, { "auxiliary_loss_clip": 0.01136665, "auxiliary_loss_mlp": 0.00762501, "balance_loss_clip": 1.04158926, "balance_loss_mlp": 1.00044954, "epoch": 0.924066614561414, "flos": 21867120587520.0, "grad_norm": 1.9420646668544417, "language_loss": 0.70732617, "learning_rate": 6.00972988719648e-08, "loss": 0.72631782, "num_input_tokens_seen": 166159865, "step": 7685, "time_per_iteration": 2.4805076122283936 }, { "auxiliary_loss_clip": 0.01126211, "auxiliary_loss_mlp": 0.00762283, "balance_loss_clip": 1.04354739, "balance_loss_mlp": 1.00046277, "epoch": 0.9241868574520532, "flos": 28511887495680.0, "grad_norm": 2.324000694529363, "language_loss": 0.70573151, "learning_rate": 5.990792303618807e-08, "loss": 0.72461641, "num_input_tokens_seen": 166179445, "step": 7686, "time_per_iteration": 2.560917615890503 }, { "auxiliary_loss_clip": 0.01123514, "auxiliary_loss_mlp": 0.01019981, "balance_loss_clip": 1.04523516, "balance_loss_mlp": 1.01284671, "epoch": 0.9243071003426923, "flos": 30518324282880.0, "grad_norm": 1.5605261130800718, "language_loss": 0.69285089, "learning_rate": 5.971884151083695e-08, "loss": 0.71428585, "num_input_tokens_seen": 166201855, "step": 7687, "time_per_iteration": 2.5913567543029785 }, { "auxiliary_loss_clip": 0.01138238, "auxiliary_loss_mlp": 0.01023886, "balance_loss_clip": 1.04310644, "balance_loss_mlp": 1.01737392, "epoch": 0.9244273432333313, "flos": 28658331244800.0, "grad_norm": 1.761702590115966, "language_loss": 0.74576336, "learning_rate": 5.9530054324595124e-08, "loss": 0.76738453, "num_input_tokens_seen": 166221970, "step": 7688, "time_per_iteration": 2.5443081855773926 }, { "auxiliary_loss_clip": 0.01046539, "auxiliary_loss_mlp": 0.0075287, "balance_loss_clip": 1.00763559, "balance_loss_mlp": 0.99982464, "epoch": 0.9245475861239704, "flos": 66230589237120.0, "grad_norm": 0.7171431574459651, "language_loss": 0.5757215, "learning_rate": 5.934156150610103e-08, "loss": 0.59371555, "num_input_tokens_seen": 166279335, "step": 7689, "time_per_iteration": 3.082620143890381 }, { "auxiliary_loss_clip": 0.01134944, "auxiliary_loss_mlp": 0.01024706, "balance_loss_clip": 1.04298222, "balance_loss_mlp": 1.0172013, "epoch": 0.9246678290146095, "flos": 24239918142720.0, "grad_norm": 3.624306164854581, "language_loss": 0.78735125, "learning_rate": 5.915336308394914e-08, "loss": 0.80894774, "num_input_tokens_seen": 166298170, "step": 7690, "time_per_iteration": 2.485715866088867 }, { "auxiliary_loss_clip": 0.01145328, "auxiliary_loss_mlp": 0.01022568, "balance_loss_clip": 1.0442158, "balance_loss_mlp": 1.01650369, "epoch": 0.9247880719052486, "flos": 18988808976000.0, "grad_norm": 1.5879498549657893, "language_loss": 0.76919603, "learning_rate": 5.89654590866886e-08, "loss": 0.79087508, "num_input_tokens_seen": 166317670, "step": 7691, "time_per_iteration": 2.4459073543548584 }, { "auxiliary_loss_clip": 0.01102379, "auxiliary_loss_mlp": 0.0102443, "balance_loss_clip": 1.04512763, "balance_loss_mlp": 1.01674128, "epoch": 0.9249083147958876, "flos": 24024095274240.0, "grad_norm": 1.9296176075650382, "language_loss": 0.88285249, "learning_rate": 5.877784954282483e-08, "loss": 0.90412056, "num_input_tokens_seen": 166337010, "step": 7692, "time_per_iteration": 2.593445301055908 }, { "auxiliary_loss_clip": 0.01154752, "auxiliary_loss_mlp": 0.01022227, "balance_loss_clip": 1.04630804, "balance_loss_mlp": 1.01444268, "epoch": 0.9250285576865268, "flos": 30773972355840.0, "grad_norm": 1.9143729784944223, "language_loss": 0.72114557, "learning_rate": 5.8590534480817963e-08, "loss": 0.74291539, "num_input_tokens_seen": 166358735, "step": 7693, "time_per_iteration": 3.345564126968384 }, { "auxiliary_loss_clip": 0.01166166, "auxiliary_loss_mlp": 0.01026565, "balance_loss_clip": 1.04837728, "balance_loss_mlp": 1.01949334, "epoch": 0.9251488005771659, "flos": 10633581348480.0, "grad_norm": 2.2040602087679444, "language_loss": 0.72239274, "learning_rate": 5.840351392908349e-08, "loss": 0.74432003, "num_input_tokens_seen": 166374455, "step": 7694, "time_per_iteration": 2.3825693130493164 }, { "auxiliary_loss_clip": 0.01144394, "auxiliary_loss_mlp": 0.00762162, "balance_loss_clip": 1.04419816, "balance_loss_mlp": 1.00046086, "epoch": 0.9252690434678049, "flos": 23586416052480.0, "grad_norm": 2.525139122086702, "language_loss": 0.70718002, "learning_rate": 5.821678791599205e-08, "loss": 0.72624552, "num_input_tokens_seen": 166393900, "step": 7695, "time_per_iteration": 3.231208086013794 }, { "auxiliary_loss_clip": 0.0113516, "auxiliary_loss_mlp": 0.01022443, "balance_loss_clip": 1.04476511, "balance_loss_mlp": 1.01569295, "epoch": 0.9253892863584441, "flos": 21469158829440.0, "grad_norm": 2.467006232156446, "language_loss": 0.80672932, "learning_rate": 5.803035646986965e-08, "loss": 0.82830536, "num_input_tokens_seen": 166413235, "step": 7696, "time_per_iteration": 3.293334484100342 }, { "auxiliary_loss_clip": 0.01167685, "auxiliary_loss_mlp": 0.0102406, "balance_loss_clip": 1.04764295, "balance_loss_mlp": 1.01640689, "epoch": 0.9255095292490831, "flos": 17456680304640.0, "grad_norm": 2.690685647825443, "language_loss": 0.67256463, "learning_rate": 5.7844219618998766e-08, "loss": 0.69448209, "num_input_tokens_seen": 166427560, "step": 7697, "time_per_iteration": 2.397186756134033 }, { "auxiliary_loss_clip": 0.01107582, "auxiliary_loss_mlp": 0.01022265, "balance_loss_clip": 1.03805733, "balance_loss_mlp": 1.01510406, "epoch": 0.9256297721397222, "flos": 24750675584640.0, "grad_norm": 1.7160769054601068, "language_loss": 0.71809733, "learning_rate": 5.765837739161505e-08, "loss": 0.7393958, "num_input_tokens_seen": 166446680, "step": 7698, "time_per_iteration": 2.542025566101074 }, { "auxiliary_loss_clip": 0.01123132, "auxiliary_loss_mlp": 0.01021095, "balance_loss_clip": 1.04255581, "balance_loss_mlp": 1.01413357, "epoch": 0.9257500150303614, "flos": 23112215677440.0, "grad_norm": 2.253933801140475, "language_loss": 0.74176347, "learning_rate": 5.7472829815911504e-08, "loss": 0.76320571, "num_input_tokens_seen": 166465505, "step": 7699, "time_per_iteration": 2.5108723640441895 }, { "auxiliary_loss_clip": 0.01133904, "auxiliary_loss_mlp": 0.01025839, "balance_loss_clip": 1.04401207, "balance_loss_mlp": 1.01847148, "epoch": 0.9258702579210004, "flos": 22564685687040.0, "grad_norm": 1.7266540174413394, "language_loss": 0.81365013, "learning_rate": 5.7287576920035164e-08, "loss": 0.83524758, "num_input_tokens_seen": 166484520, "step": 7700, "time_per_iteration": 2.488548517227173 }, { "auxiliary_loss_clip": 0.01120688, "auxiliary_loss_mlp": 0.01021268, "balance_loss_clip": 1.04389286, "balance_loss_mlp": 1.01470518, "epoch": 0.9259905008116395, "flos": 30004298703360.0, "grad_norm": 2.234269878052166, "language_loss": 0.76880634, "learning_rate": 5.7102618732088435e-08, "loss": 0.79022586, "num_input_tokens_seen": 166503850, "step": 7701, "time_per_iteration": 2.5677061080932617 }, { "auxiliary_loss_clip": 0.01142892, "auxiliary_loss_mlp": 0.01027352, "balance_loss_clip": 1.04523194, "balance_loss_mlp": 1.02100372, "epoch": 0.9261107437022786, "flos": 24572128055040.0, "grad_norm": 1.5401736708876554, "language_loss": 0.74428922, "learning_rate": 5.6917955280130216e-08, "loss": 0.76599169, "num_input_tokens_seen": 166525330, "step": 7702, "time_per_iteration": 2.5289223194122314 }, { "auxiliary_loss_clip": 0.01149794, "auxiliary_loss_mlp": 0.01023195, "balance_loss_clip": 1.04583275, "balance_loss_mlp": 1.01612616, "epoch": 0.9262309865929177, "flos": 22018448586240.0, "grad_norm": 2.1804399549653306, "language_loss": 0.72180104, "learning_rate": 5.6733586592172755e-08, "loss": 0.74353099, "num_input_tokens_seen": 166544825, "step": 7703, "time_per_iteration": 2.436396360397339 }, { "auxiliary_loss_clip": 0.01129411, "auxiliary_loss_mlp": 0.00761033, "balance_loss_clip": 1.04053009, "balance_loss_mlp": 1.00041819, "epoch": 0.9263512294835567, "flos": 20339481116160.0, "grad_norm": 2.078291786574832, "language_loss": 0.80020046, "learning_rate": 5.6549512696185244e-08, "loss": 0.81910491, "num_input_tokens_seen": 166563325, "step": 7704, "time_per_iteration": 3.258502960205078 }, { "auxiliary_loss_clip": 0.01162358, "auxiliary_loss_mlp": 0.01021805, "balance_loss_clip": 1.04607201, "balance_loss_mlp": 1.01470888, "epoch": 0.9264714723741959, "flos": 21215378263680.0, "grad_norm": 1.6007703747131554, "language_loss": 0.68425059, "learning_rate": 5.636573362009156e-08, "loss": 0.70609224, "num_input_tokens_seen": 166583385, "step": 7705, "time_per_iteration": 2.415381669998169 }, { "auxiliary_loss_clip": 0.01167858, "auxiliary_loss_mlp": 0.01026402, "balance_loss_clip": 1.04826832, "balance_loss_mlp": 1.01919281, "epoch": 0.926591715264835, "flos": 18004964480640.0, "grad_norm": 3.776885144725662, "language_loss": 0.77157074, "learning_rate": 5.618224939177074e-08, "loss": 0.79351336, "num_input_tokens_seen": 166601290, "step": 7706, "time_per_iteration": 2.475008249282837 }, { "auxiliary_loss_clip": 0.01125939, "auxiliary_loss_mlp": 0.0102428, "balance_loss_clip": 1.04210711, "balance_loss_mlp": 1.01675177, "epoch": 0.926711958155474, "flos": 36167969825280.0, "grad_norm": 1.7146859937973897, "language_loss": 0.70432246, "learning_rate": 5.599906003905719e-08, "loss": 0.72582465, "num_input_tokens_seen": 166623835, "step": 7707, "time_per_iteration": 2.598785877227783 }, { "auxiliary_loss_clip": 0.01149999, "auxiliary_loss_mlp": 0.01024928, "balance_loss_clip": 1.04927945, "balance_loss_mlp": 1.01750767, "epoch": 0.9268322010461132, "flos": 21032736583680.0, "grad_norm": 2.7246534353093868, "language_loss": 0.82195169, "learning_rate": 5.581616558974023e-08, "loss": 0.843701, "num_input_tokens_seen": 166642400, "step": 7708, "time_per_iteration": 2.4794108867645264 }, { "auxiliary_loss_clip": 0.01157854, "auxiliary_loss_mlp": 0.00762039, "balance_loss_clip": 1.0469873, "balance_loss_mlp": 1.00044227, "epoch": 0.9269524439367522, "flos": 22964838174720.0, "grad_norm": 2.5884560524548634, "language_loss": 0.79461956, "learning_rate": 5.5633566071565444e-08, "loss": 0.81381845, "num_input_tokens_seen": 166661640, "step": 7709, "time_per_iteration": 2.44502854347229 }, { "auxiliary_loss_clip": 0.01098035, "auxiliary_loss_mlp": 0.01021495, "balance_loss_clip": 1.04028416, "balance_loss_mlp": 1.01492405, "epoch": 0.9270726868273913, "flos": 41975551468800.0, "grad_norm": 2.2718387141297023, "language_loss": 0.70256007, "learning_rate": 5.5451261512232896e-08, "loss": 0.72375536, "num_input_tokens_seen": 166684320, "step": 7710, "time_per_iteration": 2.740999937057495 }, { "auxiliary_loss_clip": 0.01155226, "auxiliary_loss_mlp": 0.01024191, "balance_loss_clip": 1.0435636, "balance_loss_mlp": 1.01687133, "epoch": 0.9271929297180305, "flos": 19791771557760.0, "grad_norm": 2.1713689590747838, "language_loss": 0.62382555, "learning_rate": 5.5269251939397576e-08, "loss": 0.64561969, "num_input_tokens_seen": 166703835, "step": 7711, "time_per_iteration": 2.4188764095306396 }, { "auxiliary_loss_clip": 0.01121206, "auxiliary_loss_mlp": 0.01022366, "balance_loss_clip": 1.03927302, "balance_loss_mlp": 1.01551783, "epoch": 0.9273131726086695, "flos": 19968343839360.0, "grad_norm": 1.9894028545136337, "language_loss": 0.76486635, "learning_rate": 5.508753738067073e-08, "loss": 0.78630203, "num_input_tokens_seen": 166723375, "step": 7712, "time_per_iteration": 2.510390043258667 }, { "auxiliary_loss_clip": 0.01152427, "auxiliary_loss_mlp": 0.01023906, "balance_loss_clip": 1.0429548, "balance_loss_mlp": 1.01665831, "epoch": 0.9274334154993086, "flos": 23258587599360.0, "grad_norm": 2.1511972505550183, "language_loss": 0.79094118, "learning_rate": 5.4906117863617875e-08, "loss": 0.81270456, "num_input_tokens_seen": 166742760, "step": 7713, "time_per_iteration": 2.449094533920288 }, { "auxiliary_loss_clip": 0.01119578, "auxiliary_loss_mlp": 0.01019551, "balance_loss_clip": 1.04015279, "balance_loss_mlp": 1.01269639, "epoch": 0.9275536583899477, "flos": 31795343585280.0, "grad_norm": 1.9971500394771942, "language_loss": 0.77926266, "learning_rate": 5.4724993415760533e-08, "loss": 0.80065393, "num_input_tokens_seen": 166761115, "step": 7714, "time_per_iteration": 2.574300527572632 }, { "auxiliary_loss_clip": 0.01131738, "auxiliary_loss_mlp": 0.0076222, "balance_loss_clip": 1.04331052, "balance_loss_mlp": 1.00049901, "epoch": 0.9276739012805868, "flos": 18696998885760.0, "grad_norm": 2.2041691207075473, "language_loss": 0.74647439, "learning_rate": 5.454416406457496e-08, "loss": 0.765414, "num_input_tokens_seen": 166780210, "step": 7715, "time_per_iteration": 2.484652519226074 }, { "auxiliary_loss_clip": 0.01150329, "auxiliary_loss_mlp": 0.01024749, "balance_loss_clip": 1.04465127, "balance_loss_mlp": 1.01838326, "epoch": 0.9277941441712259, "flos": 13879079740800.0, "grad_norm": 3.8223549304381845, "language_loss": 0.74234211, "learning_rate": 5.436362983749299e-08, "loss": 0.76409286, "num_input_tokens_seen": 166795380, "step": 7716, "time_per_iteration": 2.382547378540039 }, { "auxiliary_loss_clip": 0.0111638, "auxiliary_loss_mlp": 0.01025423, "balance_loss_clip": 1.04309928, "balance_loss_mlp": 1.01898551, "epoch": 0.927914387061865, "flos": 23258659426560.0, "grad_norm": 1.9003039239133317, "language_loss": 0.64127475, "learning_rate": 5.418339076190137e-08, "loss": 0.66269279, "num_input_tokens_seen": 166814890, "step": 7717, "time_per_iteration": 2.507246732711792 }, { "auxiliary_loss_clip": 0.01130186, "auxiliary_loss_mlp": 0.01019055, "balance_loss_clip": 1.04356813, "balance_loss_mlp": 1.01203942, "epoch": 0.9280346299525041, "flos": 18073733068800.0, "grad_norm": 1.824236892508934, "language_loss": 0.88926315, "learning_rate": 5.400344686514202e-08, "loss": 0.91075552, "num_input_tokens_seen": 166832475, "step": 7718, "time_per_iteration": 2.4689576625823975 }, { "auxiliary_loss_clip": 0.01151405, "auxiliary_loss_mlp": 0.01020594, "balance_loss_clip": 1.0477742, "balance_loss_mlp": 1.01370335, "epoch": 0.9281548728431431, "flos": 22342901160960.0, "grad_norm": 1.928529545349362, "language_loss": 0.66944981, "learning_rate": 5.38237981745131e-08, "loss": 0.69116986, "num_input_tokens_seen": 166850590, "step": 7719, "time_per_iteration": 2.4414563179016113 }, { "auxiliary_loss_clip": 0.01154081, "auxiliary_loss_mlp": 0.00761852, "balance_loss_clip": 1.04626405, "balance_loss_mlp": 1.00044453, "epoch": 0.9282751157337822, "flos": 18843765857280.0, "grad_norm": 1.7148912425862641, "language_loss": 0.81268364, "learning_rate": 5.364444471726592e-08, "loss": 0.83184296, "num_input_tokens_seen": 166869795, "step": 7720, "time_per_iteration": 3.2681591510772705 }, { "auxiliary_loss_clip": 0.01150214, "auxiliary_loss_mlp": 0.01022147, "balance_loss_clip": 1.04457581, "balance_loss_mlp": 1.01527441, "epoch": 0.9283953586244214, "flos": 25556834476800.0, "grad_norm": 2.1036630140560133, "language_loss": 0.80129731, "learning_rate": 5.346538652060939e-08, "loss": 0.82302094, "num_input_tokens_seen": 166891150, "step": 7721, "time_per_iteration": 3.2041115760803223 }, { "auxiliary_loss_clip": 0.0113554, "auxiliary_loss_mlp": 0.01023081, "balance_loss_clip": 1.04560232, "balance_loss_mlp": 1.0163902, "epoch": 0.9285156015150604, "flos": 18223480869120.0, "grad_norm": 1.8075976976897448, "language_loss": 0.70257974, "learning_rate": 5.3286623611705994e-08, "loss": 0.72416592, "num_input_tokens_seen": 166909195, "step": 7722, "time_per_iteration": 2.441241979598999 }, { "auxiliary_loss_clip": 0.0106071, "auxiliary_loss_mlp": 0.01000679, "balance_loss_clip": 1.00722241, "balance_loss_mlp": 0.99983847, "epoch": 0.9286358444056995, "flos": 66400017690240.0, "grad_norm": 0.8209562883633076, "language_loss": 0.60578537, "learning_rate": 5.3108156017673824e-08, "loss": 0.62639928, "num_input_tokens_seen": 166970955, "step": 7723, "time_per_iteration": 3.8582022190093994 }, { "auxiliary_loss_clip": 0.01143751, "auxiliary_loss_mlp": 0.01024558, "balance_loss_clip": 1.04489994, "balance_loss_mlp": 1.01667273, "epoch": 0.9287560872963386, "flos": 22345630594560.0, "grad_norm": 1.7369532673867514, "language_loss": 0.71586514, "learning_rate": 5.2929983765586775e-08, "loss": 0.73754817, "num_input_tokens_seen": 166989735, "step": 7724, "time_per_iteration": 2.4824748039245605 }, { "auxiliary_loss_clip": 0.01168027, "auxiliary_loss_mlp": 0.01025992, "balance_loss_clip": 1.05047107, "balance_loss_mlp": 1.01937258, "epoch": 0.9288763301869777, "flos": 25700225569920.0, "grad_norm": 1.9292333302726883, "language_loss": 0.62602824, "learning_rate": 5.275210688247278e-08, "loss": 0.64796841, "num_input_tokens_seen": 167010060, "step": 7725, "time_per_iteration": 2.4439094066619873 }, { "auxiliary_loss_clip": 0.01110409, "auxiliary_loss_mlp": 0.01024357, "balance_loss_clip": 1.04322231, "balance_loss_mlp": 1.01732993, "epoch": 0.9289965730776167, "flos": 12312046028160.0, "grad_norm": 2.5179880367599026, "language_loss": 0.84967697, "learning_rate": 5.257452539531604e-08, "loss": 0.87102473, "num_input_tokens_seen": 167027130, "step": 7726, "time_per_iteration": 2.5316436290740967 }, { "auxiliary_loss_clip": 0.01151051, "auxiliary_loss_mlp": 0.01025356, "balance_loss_clip": 1.04416776, "balance_loss_mlp": 1.01818252, "epoch": 0.9291168159682559, "flos": 26685973486080.0, "grad_norm": 1.712772720054803, "language_loss": 0.68287933, "learning_rate": 5.2397239331055445e-08, "loss": 0.70464337, "num_input_tokens_seen": 167049130, "step": 7727, "time_per_iteration": 2.495070219039917 }, { "auxiliary_loss_clip": 0.01133389, "auxiliary_loss_mlp": 0.01022613, "balance_loss_clip": 1.04475236, "balance_loss_mlp": 1.01545489, "epoch": 0.929237058858895, "flos": 14538256179840.0, "grad_norm": 1.9723114897677214, "language_loss": 0.81500828, "learning_rate": 5.2220248716585036e-08, "loss": 0.8365683, "num_input_tokens_seen": 167066810, "step": 7728, "time_per_iteration": 2.4856009483337402 }, { "auxiliary_loss_clip": 0.01142898, "auxiliary_loss_mlp": 0.01026097, "balance_loss_clip": 1.04326773, "balance_loss_mlp": 1.01881921, "epoch": 0.929357301749534, "flos": 23835456023040.0, "grad_norm": 2.0918458255568253, "language_loss": 0.75484776, "learning_rate": 5.204355357875445e-08, "loss": 0.77653766, "num_input_tokens_seen": 167085155, "step": 7729, "time_per_iteration": 2.4698424339294434 }, { "auxiliary_loss_clip": 0.01134139, "auxiliary_loss_mlp": 0.01021903, "balance_loss_clip": 1.0421741, "balance_loss_mlp": 1.01469111, "epoch": 0.9294775446401732, "flos": 12969319046400.0, "grad_norm": 3.819865613437238, "language_loss": 0.70565271, "learning_rate": 5.1867153944367584e-08, "loss": 0.72721314, "num_input_tokens_seen": 167101545, "step": 7730, "time_per_iteration": 3.2055585384368896 }, { "auxiliary_loss_clip": 0.01129575, "auxiliary_loss_mlp": 0.0102842, "balance_loss_clip": 1.04517806, "balance_loss_mlp": 1.02137816, "epoch": 0.9295977875308122, "flos": 26211809024640.0, "grad_norm": 1.9733551291917715, "language_loss": 0.73428869, "learning_rate": 5.16910498401848e-08, "loss": 0.75586867, "num_input_tokens_seen": 167120995, "step": 7731, "time_per_iteration": 2.55090594291687 }, { "auxiliary_loss_clip": 0.01164265, "auxiliary_loss_mlp": 0.01022809, "balance_loss_clip": 1.04812241, "balance_loss_mlp": 1.01623213, "epoch": 0.9297180304214513, "flos": 16472297105280.0, "grad_norm": 2.761499483817806, "language_loss": 0.83834797, "learning_rate": 5.151524129292073e-08, "loss": 0.86021876, "num_input_tokens_seen": 167138890, "step": 7732, "time_per_iteration": 2.360590934753418 }, { "auxiliary_loss_clip": 0.01148188, "auxiliary_loss_mlp": 0.01025616, "balance_loss_clip": 1.04436302, "balance_loss_mlp": 1.01897931, "epoch": 0.9298382733120905, "flos": 24060436859520.0, "grad_norm": 1.8887326872437653, "language_loss": 0.66394758, "learning_rate": 5.1339728329245155e-08, "loss": 0.68568558, "num_input_tokens_seen": 167159455, "step": 7733, "time_per_iteration": 2.4520535469055176 }, { "auxiliary_loss_clip": 0.01170393, "auxiliary_loss_mlp": 0.01027838, "balance_loss_clip": 1.04811764, "balance_loss_mlp": 1.02027082, "epoch": 0.9299585162027295, "flos": 22127652910080.0, "grad_norm": 8.35413144269687, "language_loss": 0.78955173, "learning_rate": 5.116451097578367e-08, "loss": 0.81153399, "num_input_tokens_seen": 167178495, "step": 7734, "time_per_iteration": 2.397625684738159 }, { "auxiliary_loss_clip": 0.01121863, "auxiliary_loss_mlp": 0.01024547, "balance_loss_clip": 1.04243064, "balance_loss_mlp": 1.01761496, "epoch": 0.9300787590933686, "flos": 21471780522240.0, "grad_norm": 1.605495500273465, "language_loss": 0.74439311, "learning_rate": 5.0989589259115895e-08, "loss": 0.76585722, "num_input_tokens_seen": 167199380, "step": 7735, "time_per_iteration": 2.513848066329956 }, { "auxiliary_loss_clip": 0.01148072, "auxiliary_loss_mlp": 0.01026779, "balance_loss_clip": 1.04247499, "balance_loss_mlp": 1.01885128, "epoch": 0.9301990019840077, "flos": 17779588594560.0, "grad_norm": 1.8065433676061218, "language_loss": 0.71542984, "learning_rate": 5.081496320577816e-08, "loss": 0.73717839, "num_input_tokens_seen": 167216500, "step": 7736, "time_per_iteration": 2.395390272140503 }, { "auxiliary_loss_clip": 0.01045717, "auxiliary_loss_mlp": 0.01002243, "balance_loss_clip": 1.01703501, "balance_loss_mlp": 1.00104511, "epoch": 0.9303192448746468, "flos": 58896122307840.0, "grad_norm": 0.9145884418444209, "language_loss": 0.61160982, "learning_rate": 5.0640632842260835e-08, "loss": 0.63208944, "num_input_tokens_seen": 167276760, "step": 7737, "time_per_iteration": 3.1183815002441406 }, { "auxiliary_loss_clip": 0.01120504, "auxiliary_loss_mlp": 0.00761968, "balance_loss_clip": 1.04490495, "balance_loss_mlp": 1.00042558, "epoch": 0.9304394877652858, "flos": 57663522172800.0, "grad_norm": 1.5205629772131883, "language_loss": 0.72742152, "learning_rate": 5.0466598195009426e-08, "loss": 0.74624628, "num_input_tokens_seen": 167303630, "step": 7738, "time_per_iteration": 2.83030366897583 }, { "auxiliary_loss_clip": 0.01123498, "auxiliary_loss_mlp": 0.01022881, "balance_loss_clip": 1.04342103, "balance_loss_mlp": 1.01574373, "epoch": 0.930559730655925, "flos": 20996143603200.0, "grad_norm": 1.885378797414096, "language_loss": 0.70368576, "learning_rate": 5.0292859290425036e-08, "loss": 0.72514963, "num_input_tokens_seen": 167321500, "step": 7739, "time_per_iteration": 2.476635456085205 }, { "auxiliary_loss_clip": 0.01164687, "auxiliary_loss_mlp": 0.01022429, "balance_loss_clip": 1.04824924, "balance_loss_mlp": 1.01592886, "epoch": 0.9306799735465641, "flos": 23258264376960.0, "grad_norm": 1.907589577368997, "language_loss": 0.7770347, "learning_rate": 5.011941615486348e-08, "loss": 0.79890585, "num_input_tokens_seen": 167340615, "step": 7740, "time_per_iteration": 2.418637990951538 }, { "auxiliary_loss_clip": 0.0116328, "auxiliary_loss_mlp": 0.01020981, "balance_loss_clip": 1.0453999, "balance_loss_mlp": 1.01398671, "epoch": 0.9308002164372031, "flos": 15231547560960.0, "grad_norm": 2.223850965521068, "language_loss": 0.84600842, "learning_rate": 4.994626881463659e-08, "loss": 0.86785102, "num_input_tokens_seen": 167356870, "step": 7741, "time_per_iteration": 2.373375415802002 }, { "auxiliary_loss_clip": 0.0109311, "auxiliary_loss_mlp": 0.01023941, "balance_loss_clip": 1.03836513, "balance_loss_mlp": 1.01694906, "epoch": 0.9309204593278423, "flos": 30847481539200.0, "grad_norm": 2.4769621717711185, "language_loss": 0.71428066, "learning_rate": 4.9773417296009814e-08, "loss": 0.73545122, "num_input_tokens_seen": 167378390, "step": 7742, "time_per_iteration": 2.5885632038116455 }, { "auxiliary_loss_clip": 0.01156992, "auxiliary_loss_mlp": 0.01028758, "balance_loss_clip": 1.04703808, "balance_loss_mlp": 1.02171302, "epoch": 0.9310407022184813, "flos": 23037269950080.0, "grad_norm": 3.556905768030258, "language_loss": 0.65484577, "learning_rate": 4.960086162520527e-08, "loss": 0.67670321, "num_input_tokens_seen": 167398480, "step": 7743, "time_per_iteration": 2.469297409057617 }, { "auxiliary_loss_clip": 0.01117885, "auxiliary_loss_mlp": 0.01024157, "balance_loss_clip": 1.04314637, "balance_loss_mlp": 1.01715684, "epoch": 0.9311609451091204, "flos": 22127976132480.0, "grad_norm": 2.0493586048885253, "language_loss": 0.82337803, "learning_rate": 4.942860182839936e-08, "loss": 0.84479845, "num_input_tokens_seen": 167416825, "step": 7744, "time_per_iteration": 2.5263147354125977 }, { "auxiliary_loss_clip": 0.01135616, "auxiliary_loss_mlp": 0.010244, "balance_loss_clip": 1.04458964, "balance_loss_mlp": 1.01740253, "epoch": 0.9312811879997596, "flos": 21099206701440.0, "grad_norm": 1.8076055369687123, "language_loss": 0.79525733, "learning_rate": 4.925663793172341e-08, "loss": 0.81685758, "num_input_tokens_seen": 167434785, "step": 7745, "time_per_iteration": 2.477511167526245 }, { "auxiliary_loss_clip": 0.01034751, "auxiliary_loss_mlp": 0.00752947, "balance_loss_clip": 1.00645757, "balance_loss_mlp": 0.99990386, "epoch": 0.9314014308903986, "flos": 67148179096320.0, "grad_norm": 0.7989216417284355, "language_loss": 0.5653435, "learning_rate": 4.908496996126477e-08, "loss": 0.58322048, "num_input_tokens_seen": 167498245, "step": 7746, "time_per_iteration": 3.1061158180236816 }, { "auxiliary_loss_clip": 0.01153219, "auxiliary_loss_mlp": 0.01028678, "balance_loss_clip": 1.050313, "balance_loss_mlp": 1.02121568, "epoch": 0.9315216737810377, "flos": 22565583527040.0, "grad_norm": 1.4371260626113949, "language_loss": 0.76294762, "learning_rate": 4.89135979430646e-08, "loss": 0.78476655, "num_input_tokens_seen": 167518290, "step": 7747, "time_per_iteration": 3.307936191558838 }, { "auxiliary_loss_clip": 0.01166726, "auxiliary_loss_mlp": 0.01023054, "balance_loss_clip": 1.04971361, "balance_loss_mlp": 1.01529002, "epoch": 0.9316419166716768, "flos": 23984054588160.0, "grad_norm": 1.7163494940621176, "language_loss": 0.85623944, "learning_rate": 4.874252190312078e-08, "loss": 0.87813723, "num_input_tokens_seen": 167538675, "step": 7748, "time_per_iteration": 3.2668607234954834 }, { "auxiliary_loss_clip": 0.01154245, "auxiliary_loss_mlp": 0.0102253, "balance_loss_clip": 1.04525566, "balance_loss_mlp": 1.01544046, "epoch": 0.9317621595623159, "flos": 30230464688640.0, "grad_norm": 1.8446139642175259, "language_loss": 0.64664161, "learning_rate": 4.857174186738477e-08, "loss": 0.66840935, "num_input_tokens_seen": 167562025, "step": 7749, "time_per_iteration": 3.336543560028076 }, { "auxiliary_loss_clip": 0.01167269, "auxiliary_loss_mlp": 0.01024198, "balance_loss_clip": 1.04961932, "balance_loss_mlp": 1.0169208, "epoch": 0.931882402452955, "flos": 15742735966080.0, "grad_norm": 2.2914542483161298, "language_loss": 0.73243737, "learning_rate": 4.840125786176408e-08, "loss": 0.75435209, "num_input_tokens_seen": 167578230, "step": 7750, "time_per_iteration": 2.376410484313965 }, { "auxiliary_loss_clip": 0.01134074, "auxiliary_loss_mlp": 0.01024268, "balance_loss_clip": 1.04345489, "balance_loss_mlp": 1.01722527, "epoch": 0.932002645343594, "flos": 28366521154560.0, "grad_norm": 1.8739850468914647, "language_loss": 0.77088964, "learning_rate": 4.823106991212067e-08, "loss": 0.79247302, "num_input_tokens_seen": 167597470, "step": 7751, "time_per_iteration": 2.5340423583984375 }, { "auxiliary_loss_clip": 0.01151699, "auxiliary_loss_mlp": 0.01023895, "balance_loss_clip": 1.04476213, "balance_loss_mlp": 1.01700163, "epoch": 0.9321228882342332, "flos": 15341146934400.0, "grad_norm": 2.216654090081813, "language_loss": 0.83282322, "learning_rate": 4.806117804427212e-08, "loss": 0.85457915, "num_input_tokens_seen": 167615405, "step": 7752, "time_per_iteration": 2.4378795623779297 }, { "auxiliary_loss_clip": 0.01144995, "auxiliary_loss_mlp": 0.01027329, "balance_loss_clip": 1.0427556, "balance_loss_mlp": 1.01986051, "epoch": 0.9322431311248722, "flos": 17895365107200.0, "grad_norm": 3.744604978197083, "language_loss": 0.63993376, "learning_rate": 4.7891582283990926e-08, "loss": 0.66165698, "num_input_tokens_seen": 167634130, "step": 7753, "time_per_iteration": 2.406168222427368 }, { "auxiliary_loss_clip": 0.01122858, "auxiliary_loss_mlp": 0.01019465, "balance_loss_clip": 1.0418663, "balance_loss_mlp": 1.01253879, "epoch": 0.9323633740155113, "flos": 24169713010560.0, "grad_norm": 1.5018631277508483, "language_loss": 0.72707492, "learning_rate": 4.772228265700473e-08, "loss": 0.74849814, "num_input_tokens_seen": 167654990, "step": 7754, "time_per_iteration": 2.5253841876983643 }, { "auxiliary_loss_clip": 0.01155114, "auxiliary_loss_mlp": 0.01025631, "balance_loss_clip": 1.0465281, "balance_loss_mlp": 1.0185976, "epoch": 0.9324836169061504, "flos": 15043482927360.0, "grad_norm": 2.2490912748123106, "language_loss": 0.7558254, "learning_rate": 4.75532791889961e-08, "loss": 0.77763277, "num_input_tokens_seen": 167671690, "step": 7755, "time_per_iteration": 2.393376350402832 }, { "auxiliary_loss_clip": 0.01148558, "auxiliary_loss_mlp": 0.01024901, "balance_loss_clip": 1.04360056, "balance_loss_mlp": 1.01761401, "epoch": 0.9326038597967895, "flos": 18624890332800.0, "grad_norm": 1.9383887156947166, "language_loss": 0.65799683, "learning_rate": 4.738457190560252e-08, "loss": 0.67973143, "num_input_tokens_seen": 167690800, "step": 7756, "time_per_iteration": 3.1505818367004395 }, { "auxiliary_loss_clip": 0.01110736, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.04478145, "balance_loss_mlp": 1.0183084, "epoch": 0.9327241026874286, "flos": 18952646958720.0, "grad_norm": 2.0157851272316516, "language_loss": 0.78822607, "learning_rate": 4.721616083241664e-08, "loss": 0.80958921, "num_input_tokens_seen": 167709055, "step": 7757, "time_per_iteration": 2.5190184116363525 }, { "auxiliary_loss_clip": 0.01147471, "auxiliary_loss_mlp": 0.01023643, "balance_loss_clip": 1.04522109, "balance_loss_mlp": 1.016747, "epoch": 0.9328443455780677, "flos": 29570282668800.0, "grad_norm": 1.6793944576780386, "language_loss": 0.77681106, "learning_rate": 4.7048045994986684e-08, "loss": 0.79852223, "num_input_tokens_seen": 167729915, "step": 7758, "time_per_iteration": 2.4970245361328125 }, { "auxiliary_loss_clip": 0.01157731, "auxiliary_loss_mlp": 0.01023386, "balance_loss_clip": 1.04786968, "balance_loss_mlp": 1.01630521, "epoch": 0.9329645884687068, "flos": 30081722469120.0, "grad_norm": 2.9284981467544413, "language_loss": 0.90691423, "learning_rate": 4.688022741881559e-08, "loss": 0.92872536, "num_input_tokens_seen": 167750440, "step": 7759, "time_per_iteration": 2.4959499835968018 }, { "auxiliary_loss_clip": 0.01147888, "auxiliary_loss_mlp": 0.01022146, "balance_loss_clip": 1.04418719, "balance_loss_mlp": 1.01595926, "epoch": 0.9330848313593458, "flos": 21867982513920.0, "grad_norm": 1.5600949313376908, "language_loss": 0.75029552, "learning_rate": 4.671270512936076e-08, "loss": 0.77199578, "num_input_tokens_seen": 167769600, "step": 7760, "time_per_iteration": 2.4216272830963135 }, { "auxiliary_loss_clip": 0.01115762, "auxiliary_loss_mlp": 0.01021622, "balance_loss_clip": 1.04108334, "balance_loss_mlp": 1.01483846, "epoch": 0.933205074249985, "flos": 22127221946880.0, "grad_norm": 1.6774029024005355, "language_loss": 0.82507986, "learning_rate": 4.6545479152035884e-08, "loss": 0.84645379, "num_input_tokens_seen": 167788770, "step": 7761, "time_per_iteration": 2.50838565826416 }, { "auxiliary_loss_clip": 0.01150832, "auxiliary_loss_mlp": 0.01020741, "balance_loss_clip": 1.0461936, "balance_loss_mlp": 1.01398754, "epoch": 0.9333253171406241, "flos": 15341254675200.0, "grad_norm": 1.902626518266492, "language_loss": 0.7608161, "learning_rate": 4.637854951220821e-08, "loss": 0.78253186, "num_input_tokens_seen": 167805555, "step": 7762, "time_per_iteration": 2.3953194618225098 }, { "auxiliary_loss_clip": 0.01118119, "auxiliary_loss_mlp": 0.01021795, "balance_loss_clip": 1.04127622, "balance_loss_mlp": 1.01502419, "epoch": 0.9334455600312631, "flos": 15706142985600.0, "grad_norm": 2.2045194435547866, "language_loss": 0.74879515, "learning_rate": 4.621191623520171e-08, "loss": 0.77019429, "num_input_tokens_seen": 167823985, "step": 7763, "time_per_iteration": 2.4767022132873535 }, { "auxiliary_loss_clip": 0.01109057, "auxiliary_loss_mlp": 0.01025436, "balance_loss_clip": 1.04244483, "balance_loss_mlp": 1.01818228, "epoch": 0.9335658029219023, "flos": 22163563532160.0, "grad_norm": 2.4869808445278814, "language_loss": 0.84675682, "learning_rate": 4.604557934629372e-08, "loss": 0.86810172, "num_input_tokens_seen": 167843060, "step": 7764, "time_per_iteration": 2.5723752975463867 }, { "auxiliary_loss_clip": 0.01133255, "auxiliary_loss_mlp": 0.01021582, "balance_loss_clip": 1.04558575, "balance_loss_mlp": 1.01515388, "epoch": 0.9336860458125413, "flos": 20266833859200.0, "grad_norm": 1.8042832881352566, "language_loss": 0.80361527, "learning_rate": 4.587953887071805e-08, "loss": 0.82516366, "num_input_tokens_seen": 167862880, "step": 7765, "time_per_iteration": 2.459326982498169 }, { "auxiliary_loss_clip": 0.01133867, "auxiliary_loss_mlp": 0.01026673, "balance_loss_clip": 1.04234624, "balance_loss_mlp": 1.019485, "epoch": 0.9338062887031804, "flos": 20919689504640.0, "grad_norm": 15.34083718671415, "language_loss": 0.85835814, "learning_rate": 4.5713794833662554e-08, "loss": 0.87996364, "num_input_tokens_seen": 167882095, "step": 7766, "time_per_iteration": 2.469886064529419 }, { "auxiliary_loss_clip": 0.01165673, "auxiliary_loss_mlp": 0.01027064, "balance_loss_clip": 1.04746377, "balance_loss_mlp": 1.01961958, "epoch": 0.9339265315938196, "flos": 23221635482880.0, "grad_norm": 1.8208497749642827, "language_loss": 0.63362467, "learning_rate": 4.5548347260270236e-08, "loss": 0.65555209, "num_input_tokens_seen": 167901385, "step": 7767, "time_per_iteration": 2.42073392868042 }, { "auxiliary_loss_clip": 0.01119422, "auxiliary_loss_mlp": 0.01021985, "balance_loss_clip": 1.04212189, "balance_loss_mlp": 1.01518703, "epoch": 0.9340467744844586, "flos": 22820261932800.0, "grad_norm": 1.7006399955007752, "language_loss": 0.69158447, "learning_rate": 4.538319617564012e-08, "loss": 0.71299851, "num_input_tokens_seen": 167920405, "step": 7768, "time_per_iteration": 2.503188133239746 }, { "auxiliary_loss_clip": 0.01132618, "auxiliary_loss_mlp": 0.01022276, "balance_loss_clip": 1.04027438, "balance_loss_mlp": 1.01530838, "epoch": 0.9341670173750977, "flos": 23660428026240.0, "grad_norm": 1.8903753494492697, "language_loss": 0.7408365, "learning_rate": 4.521834160482485e-08, "loss": 0.76238549, "num_input_tokens_seen": 167939145, "step": 7769, "time_per_iteration": 2.5169825553894043 }, { "auxiliary_loss_clip": 0.01152735, "auxiliary_loss_mlp": 0.01026104, "balance_loss_clip": 1.0453434, "balance_loss_mlp": 1.0189997, "epoch": 0.9342872602657368, "flos": 24824256595200.0, "grad_norm": 1.5718700654669397, "language_loss": 0.81908619, "learning_rate": 4.5053783572832846e-08, "loss": 0.84087461, "num_input_tokens_seen": 167959325, "step": 7770, "time_per_iteration": 2.4633185863494873 }, { "auxiliary_loss_clip": 0.01152006, "auxiliary_loss_mlp": 0.01024606, "balance_loss_clip": 1.04681134, "balance_loss_mlp": 1.01768017, "epoch": 0.9344075031563759, "flos": 25771831332480.0, "grad_norm": 1.666095821348059, "language_loss": 0.76537269, "learning_rate": 4.488952210462771e-08, "loss": 0.78713882, "num_input_tokens_seen": 167979530, "step": 7771, "time_per_iteration": 2.487454414367676 }, { "auxiliary_loss_clip": 0.01162954, "auxiliary_loss_mlp": 0.01021831, "balance_loss_clip": 1.04672909, "balance_loss_mlp": 1.01490474, "epoch": 0.9345277460470149, "flos": 25551303782400.0, "grad_norm": 2.3990158235627224, "language_loss": 0.8547321, "learning_rate": 4.4725557225127495e-08, "loss": 0.87658, "num_input_tokens_seen": 167997870, "step": 7772, "time_per_iteration": 2.4244980812072754 }, { "auxiliary_loss_clip": 0.01152041, "auxiliary_loss_mlp": 0.01026955, "balance_loss_clip": 1.0475527, "balance_loss_mlp": 1.02057409, "epoch": 0.9346479889376541, "flos": 34313112432000.0, "grad_norm": 1.8285645013147327, "language_loss": 0.79216981, "learning_rate": 4.456188895920565e-08, "loss": 0.81395984, "num_input_tokens_seen": 168019625, "step": 7773, "time_per_iteration": 2.5352118015289307 }, { "auxiliary_loss_clip": 0.011659, "auxiliary_loss_mlp": 0.01024555, "balance_loss_clip": 1.0481379, "balance_loss_mlp": 1.01721811, "epoch": 0.9347682318282932, "flos": 19093739581440.0, "grad_norm": 1.969581702050408, "language_loss": 0.85555845, "learning_rate": 4.439851733169031e-08, "loss": 0.87746298, "num_input_tokens_seen": 168037415, "step": 7774, "time_per_iteration": 3.2043185234069824 }, { "auxiliary_loss_clip": 0.01123122, "auxiliary_loss_mlp": 0.01029402, "balance_loss_clip": 1.04230046, "balance_loss_mlp": 1.02273536, "epoch": 0.9348884747189322, "flos": 26249587153920.0, "grad_norm": 2.5685939524587074, "language_loss": 0.69176096, "learning_rate": 4.4235442367365204e-08, "loss": 0.71328622, "num_input_tokens_seen": 168057725, "step": 7775, "time_per_iteration": 3.276679039001465 }, { "auxiliary_loss_clip": 0.01131988, "auxiliary_loss_mlp": 0.01023976, "balance_loss_clip": 1.04113317, "balance_loss_mlp": 1.01660895, "epoch": 0.9350087176095714, "flos": 18333080242560.0, "grad_norm": 2.3176809293446397, "language_loss": 0.79683447, "learning_rate": 4.4072664090968545e-08, "loss": 0.81839418, "num_input_tokens_seen": 168076110, "step": 7776, "time_per_iteration": 3.250821590423584 }, { "auxiliary_loss_clip": 0.01136684, "auxiliary_loss_mlp": 0.01025387, "balance_loss_clip": 1.04198039, "balance_loss_mlp": 1.01855874, "epoch": 0.9351289605002104, "flos": 19318253541120.0, "grad_norm": 1.8559219383452683, "language_loss": 0.84946561, "learning_rate": 4.391018252719347e-08, "loss": 0.8710863, "num_input_tokens_seen": 168095905, "step": 7777, "time_per_iteration": 2.4717464447021484 }, { "auxiliary_loss_clip": 0.01137661, "auxiliary_loss_mlp": 0.01029292, "balance_loss_clip": 1.0426228, "balance_loss_mlp": 1.02189183, "epoch": 0.9352492033908495, "flos": 18799990156800.0, "grad_norm": 1.9409895288930221, "language_loss": 0.6928851, "learning_rate": 4.374799770068849e-08, "loss": 0.71455467, "num_input_tokens_seen": 168112580, "step": 7778, "time_per_iteration": 2.4392635822296143 }, { "auxiliary_loss_clip": 0.01148175, "auxiliary_loss_mlp": 0.01023359, "balance_loss_clip": 1.04526949, "balance_loss_mlp": 1.01598263, "epoch": 0.9353694462814887, "flos": 29530134241920.0, "grad_norm": 1.9265869928941481, "language_loss": 0.74665433, "learning_rate": 4.358610963605658e-08, "loss": 0.76836967, "num_input_tokens_seen": 168133030, "step": 7779, "time_per_iteration": 2.4740631580352783 }, { "auxiliary_loss_clip": 0.01168087, "auxiliary_loss_mlp": 0.01033546, "balance_loss_clip": 1.04889739, "balance_loss_mlp": 1.0265367, "epoch": 0.9354896891721277, "flos": 30665450390400.0, "grad_norm": 2.273553290300527, "language_loss": 0.68731302, "learning_rate": 4.342451835785677e-08, "loss": 0.70932937, "num_input_tokens_seen": 168153940, "step": 7780, "time_per_iteration": 2.4637398719787598 }, { "auxiliary_loss_clip": 0.01134468, "auxiliary_loss_mlp": 0.01022129, "balance_loss_clip": 1.04410577, "balance_loss_mlp": 1.01531363, "epoch": 0.9356099320627668, "flos": 19463907191040.0, "grad_norm": 1.5673099519592728, "language_loss": 0.74924242, "learning_rate": 4.3263223890601665e-08, "loss": 0.7708084, "num_input_tokens_seen": 168172650, "step": 7781, "time_per_iteration": 2.440971612930298 }, { "auxiliary_loss_clip": 0.0114648, "auxiliary_loss_mlp": 0.00761531, "balance_loss_clip": 1.04738081, "balance_loss_mlp": 1.00047398, "epoch": 0.9357301749534058, "flos": 19098156954240.0, "grad_norm": 1.935911680435023, "language_loss": 0.79572177, "learning_rate": 4.31022262587597e-08, "loss": 0.81480187, "num_input_tokens_seen": 168191325, "step": 7782, "time_per_iteration": 2.4253463745117188 }, { "auxiliary_loss_clip": 0.01150224, "auxiliary_loss_mlp": 0.01028048, "balance_loss_clip": 1.04578006, "balance_loss_mlp": 1.02001333, "epoch": 0.935850417844045, "flos": 23550361776000.0, "grad_norm": 1.580729245362894, "language_loss": 0.6575495, "learning_rate": 4.2941525486754225e-08, "loss": 0.67933214, "num_input_tokens_seen": 168211645, "step": 7783, "time_per_iteration": 3.1668505668640137 }, { "auxiliary_loss_clip": 0.01116851, "auxiliary_loss_mlp": 0.01022312, "balance_loss_clip": 1.0418303, "balance_loss_mlp": 1.01605356, "epoch": 0.935970660734684, "flos": 18588333265920.0, "grad_norm": 1.7964190534449191, "language_loss": 0.7951709, "learning_rate": 4.278112159896286e-08, "loss": 0.81656253, "num_input_tokens_seen": 168229485, "step": 7784, "time_per_iteration": 2.4852676391601562 }, { "auxiliary_loss_clip": 0.01128844, "auxiliary_loss_mlp": 0.01021092, "balance_loss_clip": 1.0398128, "balance_loss_mlp": 1.0146699, "epoch": 0.9360909036253231, "flos": 20631255292800.0, "grad_norm": 1.6579822404683744, "language_loss": 0.67743784, "learning_rate": 4.2621014619719896e-08, "loss": 0.69893718, "num_input_tokens_seen": 168247250, "step": 7785, "time_per_iteration": 2.4690158367156982 }, { "auxiliary_loss_clip": 0.01038029, "auxiliary_loss_mlp": 0.01001393, "balance_loss_clip": 1.0067811, "balance_loss_mlp": 1.00048733, "epoch": 0.9362111465159623, "flos": 61791421052160.0, "grad_norm": 0.7231168886449887, "language_loss": 0.58630222, "learning_rate": 4.246120457331215e-08, "loss": 0.60669643, "num_input_tokens_seen": 168309425, "step": 7786, "time_per_iteration": 3.085767984390259 }, { "auxiliary_loss_clip": 0.01132813, "auxiliary_loss_mlp": 0.01024115, "balance_loss_clip": 1.04543233, "balance_loss_mlp": 1.01677513, "epoch": 0.9363313894066013, "flos": 24170395368960.0, "grad_norm": 2.7735232215143557, "language_loss": 0.72310555, "learning_rate": 4.2301691483983325e-08, "loss": 0.74467486, "num_input_tokens_seen": 168329545, "step": 7787, "time_per_iteration": 2.5015017986297607 }, { "auxiliary_loss_clip": 0.01152399, "auxiliary_loss_mlp": 0.01022984, "balance_loss_clip": 1.04464877, "balance_loss_mlp": 1.01555109, "epoch": 0.9364516322972404, "flos": 20120354196480.0, "grad_norm": 2.5073143857702576, "language_loss": 0.75994349, "learning_rate": 4.214247537593163e-08, "loss": 0.78169727, "num_input_tokens_seen": 168348795, "step": 7788, "time_per_iteration": 2.435617208480835 }, { "auxiliary_loss_clip": 0.01136672, "auxiliary_loss_mlp": 0.01033454, "balance_loss_clip": 1.04271424, "balance_loss_mlp": 1.02665269, "epoch": 0.9365718751878795, "flos": 20703758895360.0, "grad_norm": 1.8076671171808698, "language_loss": 0.80504346, "learning_rate": 4.1983556273309293e-08, "loss": 0.82674468, "num_input_tokens_seen": 168367545, "step": 7789, "time_per_iteration": 2.4675133228302 }, { "auxiliary_loss_clip": 0.01167386, "auxiliary_loss_mlp": 0.01028656, "balance_loss_clip": 1.04741704, "balance_loss_mlp": 1.02112162, "epoch": 0.9366921180785186, "flos": 18655270260480.0, "grad_norm": 3.11434642827236, "language_loss": 0.6876986, "learning_rate": 4.182493420022526e-08, "loss": 0.70965904, "num_input_tokens_seen": 168383215, "step": 7790, "time_per_iteration": 2.3460826873779297 }, { "auxiliary_loss_clip": 0.01126054, "auxiliary_loss_mlp": 0.01024757, "balance_loss_clip": 1.04309714, "balance_loss_mlp": 1.01791477, "epoch": 0.9368123609691577, "flos": 25774955815680.0, "grad_norm": 1.7006037433620487, "language_loss": 0.78611624, "learning_rate": 4.166660918074139e-08, "loss": 0.8076244, "num_input_tokens_seen": 168403120, "step": 7791, "time_per_iteration": 2.5708913803100586 }, { "auxiliary_loss_clip": 0.01118965, "auxiliary_loss_mlp": 0.01024519, "balance_loss_clip": 1.04160142, "balance_loss_mlp": 1.01745915, "epoch": 0.9369326038597968, "flos": 25553386771200.0, "grad_norm": 1.4620012960436688, "language_loss": 0.73728013, "learning_rate": 4.15085812388758e-08, "loss": 0.75871497, "num_input_tokens_seen": 168425340, "step": 7792, "time_per_iteration": 2.5659966468811035 }, { "auxiliary_loss_clip": 0.0113498, "auxiliary_loss_mlp": 0.01025806, "balance_loss_clip": 1.04341435, "balance_loss_mlp": 1.01883793, "epoch": 0.9370528467504359, "flos": 23220019370880.0, "grad_norm": 1.67810545195276, "language_loss": 0.78653908, "learning_rate": 4.135085039860153e-08, "loss": 0.80814695, "num_input_tokens_seen": 168444740, "step": 7793, "time_per_iteration": 2.4897918701171875 }, { "auxiliary_loss_clip": 0.01138718, "auxiliary_loss_mlp": 0.0102116, "balance_loss_clip": 1.04795122, "balance_loss_mlp": 1.01391828, "epoch": 0.9371730896410749, "flos": 24967468120320.0, "grad_norm": 2.3897843632486113, "language_loss": 0.78454274, "learning_rate": 4.1193416683845906e-08, "loss": 0.8061415, "num_input_tokens_seen": 168463670, "step": 7794, "time_per_iteration": 2.4905483722686768 }, { "auxiliary_loss_clip": 0.01126918, "auxiliary_loss_mlp": 0.01025202, "balance_loss_clip": 1.04585826, "balance_loss_mlp": 1.01877379, "epoch": 0.9372933325317141, "flos": 15553091134080.0, "grad_norm": 2.325795915003717, "language_loss": 0.83366895, "learning_rate": 4.103628011849136e-08, "loss": 0.85519016, "num_input_tokens_seen": 168479030, "step": 7795, "time_per_iteration": 2.4702987670898438 }, { "auxiliary_loss_clip": 0.01139317, "auxiliary_loss_mlp": 0.01023624, "balance_loss_clip": 1.04500806, "balance_loss_mlp": 1.01660562, "epoch": 0.9374135754223532, "flos": 21871861182720.0, "grad_norm": 1.9432192115945677, "language_loss": 0.76117641, "learning_rate": 4.0879440726375506e-08, "loss": 0.7828058, "num_input_tokens_seen": 168496815, "step": 7796, "time_per_iteration": 2.4747061729431152 }, { "auxiliary_loss_clip": 0.01133299, "auxiliary_loss_mlp": 0.01020199, "balance_loss_clip": 1.04025006, "balance_loss_mlp": 1.01305223, "epoch": 0.9375338183129922, "flos": 22631048064000.0, "grad_norm": 2.74369260518212, "language_loss": 0.56226516, "learning_rate": 4.0722898531291074e-08, "loss": 0.58380014, "num_input_tokens_seen": 168514055, "step": 7797, "time_per_iteration": 2.475250244140625 }, { "auxiliary_loss_clip": 0.01142699, "auxiliary_loss_mlp": 0.01023708, "balance_loss_clip": 1.0441041, "balance_loss_mlp": 1.0164547, "epoch": 0.9376540612036314, "flos": 26104292640000.0, "grad_norm": 1.942390645137417, "language_loss": 0.766204, "learning_rate": 4.0566653556985295e-08, "loss": 0.78786808, "num_input_tokens_seen": 168534600, "step": 7798, "time_per_iteration": 2.516303777694702 }, { "auxiliary_loss_clip": 0.01083056, "auxiliary_loss_mlp": 0.01030433, "balance_loss_clip": 1.03850102, "balance_loss_mlp": 1.02238905, "epoch": 0.9377743040942704, "flos": 19717580016000.0, "grad_norm": 2.6680341454378937, "language_loss": 0.81443667, "learning_rate": 4.0410705827159886e-08, "loss": 0.83557159, "num_input_tokens_seen": 168551895, "step": 7799, "time_per_iteration": 2.5737733840942383 }, { "auxiliary_loss_clip": 0.01133304, "auxiliary_loss_mlp": 0.01024367, "balance_loss_clip": 1.04146504, "balance_loss_mlp": 1.01711297, "epoch": 0.9378945469849095, "flos": 15267530010240.0, "grad_norm": 1.9570826450293635, "language_loss": 0.7145617, "learning_rate": 4.0255055365472356e-08, "loss": 0.7361384, "num_input_tokens_seen": 168569990, "step": 7800, "time_per_iteration": 2.4544239044189453 }, { "auxiliary_loss_clip": 0.01095665, "auxiliary_loss_mlp": 0.01027194, "balance_loss_clip": 1.03767526, "balance_loss_mlp": 1.02018118, "epoch": 0.9380147898755486, "flos": 20591394174720.0, "grad_norm": 2.9220867427633412, "language_loss": 0.74910402, "learning_rate": 4.009970219553471e-08, "loss": 0.77033257, "num_input_tokens_seen": 168586940, "step": 7801, "time_per_iteration": 3.3849499225616455 }, { "auxiliary_loss_clip": 0.01154351, "auxiliary_loss_mlp": 0.01024841, "balance_loss_clip": 1.04489481, "balance_loss_mlp": 1.01719368, "epoch": 0.9381350327661877, "flos": 26281116316800.0, "grad_norm": 2.7744341475645493, "language_loss": 0.77005196, "learning_rate": 3.99446463409141e-08, "loss": 0.79184389, "num_input_tokens_seen": 168604795, "step": 7802, "time_per_iteration": 3.2243008613586426 }, { "auxiliary_loss_clip": 0.01154225, "auxiliary_loss_mlp": 0.01024454, "balance_loss_clip": 1.04296434, "balance_loss_mlp": 1.01724207, "epoch": 0.9382552756568268, "flos": 23586344225280.0, "grad_norm": 2.002154923716347, "language_loss": 0.687397, "learning_rate": 3.978988782513215e-08, "loss": 0.70918369, "num_input_tokens_seen": 168622290, "step": 7803, "time_per_iteration": 3.318350315093994 }, { "auxiliary_loss_clip": 0.01153139, "auxiliary_loss_mlp": 0.01020312, "balance_loss_clip": 1.04404259, "balance_loss_mlp": 1.01345742, "epoch": 0.9383755185474659, "flos": 28438809275520.0, "grad_norm": 1.7941594912290635, "language_loss": 0.76412523, "learning_rate": 3.963542667166586e-08, "loss": 0.7858597, "num_input_tokens_seen": 168642395, "step": 7804, "time_per_iteration": 2.520297050476074 }, { "auxiliary_loss_clip": 0.01128937, "auxiliary_loss_mlp": 0.01025573, "balance_loss_clip": 1.04824924, "balance_loss_mlp": 1.0187602, "epoch": 0.938495761438105, "flos": 20449583280000.0, "grad_norm": 1.6813638873015362, "language_loss": 0.68237638, "learning_rate": 3.9481262903946486e-08, "loss": 0.70392156, "num_input_tokens_seen": 168661840, "step": 7805, "time_per_iteration": 2.534358501434326 }, { "auxiliary_loss_clip": 0.01024006, "auxiliary_loss_mlp": 0.01001157, "balance_loss_clip": 1.00830948, "balance_loss_mlp": 1.00019741, "epoch": 0.938616004328744, "flos": 69302711658240.0, "grad_norm": 0.7719606455094693, "language_loss": 0.54471767, "learning_rate": 3.932739654536066e-08, "loss": 0.5649693, "num_input_tokens_seen": 168724540, "step": 7806, "time_per_iteration": 3.108773946762085 }, { "auxiliary_loss_clip": 0.01150496, "auxiliary_loss_mlp": 0.0102358, "balance_loss_clip": 1.04647648, "balance_loss_mlp": 1.01726782, "epoch": 0.9387362472193832, "flos": 18911636605440.0, "grad_norm": 2.13495686068401, "language_loss": 0.74022591, "learning_rate": 3.917382761925014e-08, "loss": 0.76196665, "num_input_tokens_seen": 168740375, "step": 7807, "time_per_iteration": 2.4531922340393066 }, { "auxiliary_loss_clip": 0.01147488, "auxiliary_loss_mlp": 0.01026001, "balance_loss_clip": 1.04545498, "balance_loss_mlp": 1.01929867, "epoch": 0.9388564901100223, "flos": 26501967089280.0, "grad_norm": 1.7085085274545841, "language_loss": 0.79436463, "learning_rate": 3.9020556148910754e-08, "loss": 0.81609952, "num_input_tokens_seen": 168759730, "step": 7808, "time_per_iteration": 2.5222485065460205 }, { "auxiliary_loss_clip": 0.01044044, "auxiliary_loss_mlp": 0.01000411, "balance_loss_clip": 1.00848341, "balance_loss_mlp": 0.99955845, "epoch": 0.9389767330006613, "flos": 58941083157120.0, "grad_norm": 0.7353404238872535, "language_loss": 0.5674243, "learning_rate": 3.8867582157593895e-08, "loss": 0.58786893, "num_input_tokens_seen": 168813935, "step": 7809, "time_per_iteration": 2.924941301345825 }, { "auxiliary_loss_clip": 0.01151369, "auxiliary_loss_mlp": 0.01021866, "balance_loss_clip": 1.04829741, "balance_loss_mlp": 1.0151453, "epoch": 0.9390969758913005, "flos": 31102554994560.0, "grad_norm": 2.5688877658802576, "language_loss": 0.76330429, "learning_rate": 3.871490566850544e-08, "loss": 0.78503668, "num_input_tokens_seen": 168838145, "step": 7810, "time_per_iteration": 3.3108325004577637 }, { "auxiliary_loss_clip": 0.01133487, "auxiliary_loss_mlp": 0.0102335, "balance_loss_clip": 1.04414129, "balance_loss_mlp": 1.01631927, "epoch": 0.9392172187819395, "flos": 22419391173120.0, "grad_norm": 1.8026362078527254, "language_loss": 0.70803982, "learning_rate": 3.856252670480642e-08, "loss": 0.72960818, "num_input_tokens_seen": 168856805, "step": 7811, "time_per_iteration": 2.4905502796173096 }, { "auxiliary_loss_clip": 0.01133843, "auxiliary_loss_mlp": 0.01026459, "balance_loss_clip": 1.0414331, "balance_loss_mlp": 1.01903772, "epoch": 0.9393374616725786, "flos": 19719483436800.0, "grad_norm": 1.8284271896453064, "language_loss": 0.81367469, "learning_rate": 3.841044528961279e-08, "loss": 0.83527768, "num_input_tokens_seen": 168874600, "step": 7812, "time_per_iteration": 2.485079050064087 }, { "auxiliary_loss_clip": 0.01164374, "auxiliary_loss_mlp": 0.01022322, "balance_loss_clip": 1.04488468, "balance_loss_mlp": 1.01506829, "epoch": 0.9394577045632178, "flos": 24170215800960.0, "grad_norm": 1.9090386087029503, "language_loss": 0.78516585, "learning_rate": 3.825866144599477e-08, "loss": 0.80703282, "num_input_tokens_seen": 168893655, "step": 7813, "time_per_iteration": 2.4292402267456055 }, { "auxiliary_loss_clip": 0.01136229, "auxiliary_loss_mlp": 0.01021705, "balance_loss_clip": 1.04269695, "balance_loss_mlp": 1.01446617, "epoch": 0.9395779474538568, "flos": 19023929498880.0, "grad_norm": 1.9382582998029014, "language_loss": 0.75169373, "learning_rate": 3.8107175196978145e-08, "loss": 0.77327305, "num_input_tokens_seen": 168909960, "step": 7814, "time_per_iteration": 2.447415590286255 }, { "auxiliary_loss_clip": 0.01121879, "auxiliary_loss_mlp": 0.01026275, "balance_loss_clip": 1.04426503, "balance_loss_mlp": 1.01953089, "epoch": 0.9396981903444959, "flos": 14319129260160.0, "grad_norm": 4.050239831428167, "language_loss": 0.76650649, "learning_rate": 3.7955986565542996e-08, "loss": 0.78798807, "num_input_tokens_seen": 168928040, "step": 7815, "time_per_iteration": 2.4909706115722656 }, { "auxiliary_loss_clip": 0.01122221, "auxiliary_loss_mlp": 0.01029132, "balance_loss_clip": 1.04187632, "balance_loss_mlp": 1.02242899, "epoch": 0.9398184332351349, "flos": 34787564202240.0, "grad_norm": 1.8837717056348675, "language_loss": 0.68207049, "learning_rate": 3.780509557462497e-08, "loss": 0.70358396, "num_input_tokens_seen": 168948240, "step": 7816, "time_per_iteration": 2.6253855228424072 }, { "auxiliary_loss_clip": 0.01132494, "auxiliary_loss_mlp": 0.01022974, "balance_loss_clip": 1.04111564, "balance_loss_mlp": 1.01553535, "epoch": 0.9399386761257741, "flos": 25372253462400.0, "grad_norm": 1.5061363765502904, "language_loss": 0.75301248, "learning_rate": 3.765450224711375e-08, "loss": 0.77456713, "num_input_tokens_seen": 168968745, "step": 7817, "time_per_iteration": 2.5172901153564453 }, { "auxiliary_loss_clip": 0.0113344, "auxiliary_loss_mlp": 0.01023986, "balance_loss_clip": 1.04583824, "balance_loss_mlp": 1.01706004, "epoch": 0.9400589190164131, "flos": 27304965584640.0, "grad_norm": 1.7555954865867462, "language_loss": 0.79811132, "learning_rate": 3.750420660585396e-08, "loss": 0.81968558, "num_input_tokens_seen": 168990685, "step": 7818, "time_per_iteration": 2.539062261581421 }, { "auxiliary_loss_clip": 0.01164381, "auxiliary_loss_mlp": 0.0102412, "balance_loss_clip": 1.04792666, "balance_loss_mlp": 1.01726007, "epoch": 0.9401791619070522, "flos": 23399859790080.0, "grad_norm": 2.0165620934241266, "language_loss": 0.7980516, "learning_rate": 3.735420867364603e-08, "loss": 0.81993663, "num_input_tokens_seen": 169011665, "step": 7819, "time_per_iteration": 2.435166835784912 }, { "auxiliary_loss_clip": 0.01085759, "auxiliary_loss_mlp": 0.01020414, "balance_loss_clip": 1.03537977, "balance_loss_mlp": 1.01383698, "epoch": 0.9402994047976914, "flos": 35881403120640.0, "grad_norm": 1.6147118527315714, "language_loss": 0.6174134, "learning_rate": 3.7204508473244186e-08, "loss": 0.63847518, "num_input_tokens_seen": 169035290, "step": 7820, "time_per_iteration": 2.6953890323638916 }, { "auxiliary_loss_clip": 0.01075353, "auxiliary_loss_mlp": 0.01022725, "balance_loss_clip": 1.0386436, "balance_loss_mlp": 1.01636803, "epoch": 0.9404196476883304, "flos": 22236821320320.0, "grad_norm": 3.2616484913639825, "language_loss": 0.69448584, "learning_rate": 3.7055106027357395e-08, "loss": 0.71546662, "num_input_tokens_seen": 169055155, "step": 7821, "time_per_iteration": 2.607365608215332 }, { "auxiliary_loss_clip": 0.011494, "auxiliary_loss_mlp": 0.01022732, "balance_loss_clip": 1.04647636, "balance_loss_mlp": 1.015275, "epoch": 0.9405398905789695, "flos": 18915802583040.0, "grad_norm": 1.9504624223105993, "language_loss": 0.71733874, "learning_rate": 3.690600135865063e-08, "loss": 0.73906004, "num_input_tokens_seen": 169072080, "step": 7822, "time_per_iteration": 2.414651870727539 }, { "auxiliary_loss_clip": 0.01020244, "auxiliary_loss_mlp": 0.01001222, "balance_loss_clip": 1.00700426, "balance_loss_mlp": 1.00032222, "epoch": 0.9406601334696086, "flos": 70274130048000.0, "grad_norm": 0.7955488122472639, "language_loss": 0.58150923, "learning_rate": 3.675719448974246e-08, "loss": 0.60172391, "num_input_tokens_seen": 169137170, "step": 7823, "time_per_iteration": 3.236582040786743 }, { "auxiliary_loss_clip": 0.01106744, "auxiliary_loss_mlp": 0.00761564, "balance_loss_clip": 1.04196453, "balance_loss_mlp": 1.00043607, "epoch": 0.9407803763602477, "flos": 22165071903360.0, "grad_norm": 1.9763820737392357, "language_loss": 0.60037947, "learning_rate": 3.6608685443207054e-08, "loss": 0.6190626, "num_input_tokens_seen": 169156320, "step": 7824, "time_per_iteration": 2.839423418045044 }, { "auxiliary_loss_clip": 0.01125414, "auxiliary_loss_mlp": 0.0102409, "balance_loss_clip": 1.04265451, "balance_loss_mlp": 1.01742303, "epoch": 0.9409006192508867, "flos": 18879496911360.0, "grad_norm": 2.4829460325040564, "language_loss": 0.66655159, "learning_rate": 3.646047424157306e-08, "loss": 0.68804657, "num_input_tokens_seen": 169173295, "step": 7825, "time_per_iteration": 2.4948480129241943 }, { "auxiliary_loss_clip": 0.01137966, "auxiliary_loss_mlp": 0.01026241, "balance_loss_clip": 1.04580545, "balance_loss_mlp": 1.01870084, "epoch": 0.9410208621415259, "flos": 23368258800000.0, "grad_norm": 2.5389447741234665, "language_loss": 0.68397832, "learning_rate": 3.631256090732382e-08, "loss": 0.70562041, "num_input_tokens_seen": 169193755, "step": 7826, "time_per_iteration": 2.4857218265533447 }, { "auxiliary_loss_clip": 0.01124, "auxiliary_loss_mlp": 0.01024014, "balance_loss_clip": 1.04449153, "balance_loss_mlp": 1.01751721, "epoch": 0.941141105032165, "flos": 22742227635840.0, "grad_norm": 2.0194203161119013, "language_loss": 0.82629824, "learning_rate": 3.6164945462897833e-08, "loss": 0.84777832, "num_input_tokens_seen": 169213045, "step": 7827, "time_per_iteration": 2.547708034515381 }, { "auxiliary_loss_clip": 0.01149818, "auxiliary_loss_mlp": 0.00761317, "balance_loss_clip": 1.04693913, "balance_loss_mlp": 1.00037503, "epoch": 0.941261347922804, "flos": 20704908130560.0, "grad_norm": 1.65488498100959, "language_loss": 0.75595284, "learning_rate": 3.6017627930687856e-08, "loss": 0.77506417, "num_input_tokens_seen": 169232870, "step": 7828, "time_per_iteration": 3.2857019901275635 }, { "auxiliary_loss_clip": 0.01105444, "auxiliary_loss_mlp": 0.01021398, "balance_loss_clip": 1.03908086, "balance_loss_mlp": 1.01471066, "epoch": 0.9413815908134432, "flos": 19421998997760.0, "grad_norm": 2.0088585946240354, "language_loss": 0.77042317, "learning_rate": 3.587060833304267e-08, "loss": 0.79169154, "num_input_tokens_seen": 169251060, "step": 7829, "time_per_iteration": 4.3125159740448 }, { "auxiliary_loss_clip": 0.01153841, "auxiliary_loss_mlp": 0.0102507, "balance_loss_clip": 1.04700339, "balance_loss_mlp": 1.01793277, "epoch": 0.9415018337040822, "flos": 17493452853120.0, "grad_norm": 1.9794684696201283, "language_loss": 0.64035976, "learning_rate": 3.5723886692264225e-08, "loss": 0.66214889, "num_input_tokens_seen": 169268600, "step": 7830, "time_per_iteration": 2.4527854919433594 }, { "auxiliary_loss_clip": 0.01131958, "auxiliary_loss_mlp": 0.01025191, "balance_loss_clip": 1.04079461, "balance_loss_mlp": 1.01860428, "epoch": 0.9416220765947213, "flos": 31831613343360.0, "grad_norm": 2.0541751213589077, "language_loss": 0.62432235, "learning_rate": 3.557746303061071e-08, "loss": 0.64589387, "num_input_tokens_seen": 169290355, "step": 7831, "time_per_iteration": 2.593571901321411 }, { "auxiliary_loss_clip": 0.01133843, "auxiliary_loss_mlp": 0.01021214, "balance_loss_clip": 1.04275656, "balance_loss_mlp": 1.01470542, "epoch": 0.9417423194853605, "flos": 23511973115520.0, "grad_norm": 1.5962669450845155, "language_loss": 0.72410691, "learning_rate": 3.543133737029391e-08, "loss": 0.74565744, "num_input_tokens_seen": 169310865, "step": 7832, "time_per_iteration": 2.5236294269561768 }, { "auxiliary_loss_clip": 0.01154959, "auxiliary_loss_mlp": 0.01022886, "balance_loss_clip": 1.04571009, "balance_loss_mlp": 1.01582038, "epoch": 0.9418625623759995, "flos": 23915106432000.0, "grad_norm": 2.718070526543077, "language_loss": 0.69199193, "learning_rate": 3.5285509733481214e-08, "loss": 0.71377039, "num_input_tokens_seen": 169330590, "step": 7833, "time_per_iteration": 2.481441020965576 }, { "auxiliary_loss_clip": 0.01147231, "auxiliary_loss_mlp": 0.01027193, "balance_loss_clip": 1.04372609, "balance_loss_mlp": 1.01955175, "epoch": 0.9419828052666386, "flos": 18076965292800.0, "grad_norm": 2.151817133250693, "language_loss": 0.76551831, "learning_rate": 3.513998014229469e-08, "loss": 0.78726256, "num_input_tokens_seen": 169349540, "step": 7834, "time_per_iteration": 2.424168825149536 }, { "auxiliary_loss_clip": 0.01137357, "auxiliary_loss_mlp": 0.01023545, "balance_loss_clip": 1.04495227, "balance_loss_mlp": 1.01679182, "epoch": 0.9421030481572777, "flos": 17712328377600.0, "grad_norm": 2.578003550508524, "language_loss": 0.86075974, "learning_rate": 3.499474861881069e-08, "loss": 0.88236868, "num_input_tokens_seen": 169366765, "step": 7835, "time_per_iteration": 2.4340837001800537 }, { "auxiliary_loss_clip": 0.01095575, "auxiliary_loss_mlp": 0.01019496, "balance_loss_clip": 1.04073191, "balance_loss_mlp": 1.01264429, "epoch": 0.9422232910479168, "flos": 20194114775040.0, "grad_norm": 1.8416254845304234, "language_loss": 0.67910206, "learning_rate": 3.4849815185061136e-08, "loss": 0.70025283, "num_input_tokens_seen": 169386655, "step": 7836, "time_per_iteration": 2.6137242317199707 }, { "auxiliary_loss_clip": 0.01147591, "auxiliary_loss_mlp": 0.01021292, "balance_loss_clip": 1.04196286, "balance_loss_mlp": 1.0149473, "epoch": 0.9423435339385559, "flos": 18442571875200.0, "grad_norm": 1.9351711929411826, "language_loss": 0.76200539, "learning_rate": 3.470517986303223e-08, "loss": 0.78369421, "num_input_tokens_seen": 169405640, "step": 7837, "time_per_iteration": 3.2082364559173584 }, { "auxiliary_loss_clip": 0.0112106, "auxiliary_loss_mlp": 0.01033876, "balance_loss_clip": 1.04501939, "balance_loss_mlp": 1.02684546, "epoch": 0.942463776829195, "flos": 20080636732800.0, "grad_norm": 1.8018896076547664, "language_loss": 0.79175133, "learning_rate": 3.4560842674664856e-08, "loss": 0.81330073, "num_input_tokens_seen": 169424155, "step": 7838, "time_per_iteration": 2.509373188018799 }, { "auxiliary_loss_clip": 0.01152592, "auxiliary_loss_mlp": 0.01020774, "balance_loss_clip": 1.04382777, "balance_loss_mlp": 1.01363909, "epoch": 0.9425840197198341, "flos": 22636255536000.0, "grad_norm": 1.7856425887286214, "language_loss": 0.75214523, "learning_rate": 3.441680364185506e-08, "loss": 0.77387887, "num_input_tokens_seen": 169444025, "step": 7839, "time_per_iteration": 2.45835280418396 }, { "auxiliary_loss_clip": 0.01141236, "auxiliary_loss_mlp": 0.01028442, "balance_loss_clip": 1.0474081, "balance_loss_mlp": 1.0212183, "epoch": 0.9427042626104731, "flos": 19937892084480.0, "grad_norm": 2.1279493500966087, "language_loss": 0.75157106, "learning_rate": 3.427306278645314e-08, "loss": 0.77326781, "num_input_tokens_seen": 169462480, "step": 7840, "time_per_iteration": 2.4605257511138916 }, { "auxiliary_loss_clip": 0.01108333, "auxiliary_loss_mlp": 0.01021506, "balance_loss_clip": 1.04143429, "balance_loss_mlp": 1.01475585, "epoch": 0.9428245055011123, "flos": 22856998567680.0, "grad_norm": 3.1156319889505286, "language_loss": 0.72698045, "learning_rate": 3.4129620130264767e-08, "loss": 0.74827886, "num_input_tokens_seen": 169480840, "step": 7841, "time_per_iteration": 2.539158821105957 }, { "auxiliary_loss_clip": 0.01142726, "auxiliary_loss_mlp": 0.00761415, "balance_loss_clip": 1.04810536, "balance_loss_mlp": 1.00046575, "epoch": 0.9429447483917514, "flos": 20951757371520.0, "grad_norm": 2.2051420720485986, "language_loss": 0.78080136, "learning_rate": 3.398647569505009e-08, "loss": 0.79984283, "num_input_tokens_seen": 169498265, "step": 7842, "time_per_iteration": 2.4850659370422363 }, { "auxiliary_loss_clip": 0.01127832, "auxiliary_loss_mlp": 0.0102496, "balance_loss_clip": 1.04265666, "balance_loss_mlp": 1.01764941, "epoch": 0.9430649912823904, "flos": 18843658116480.0, "grad_norm": 2.458175718296991, "language_loss": 0.74562967, "learning_rate": 3.384362950252373e-08, "loss": 0.76715755, "num_input_tokens_seen": 169515235, "step": 7843, "time_per_iteration": 2.502018928527832 }, { "auxiliary_loss_clip": 0.01132468, "auxiliary_loss_mlp": 0.01021218, "balance_loss_clip": 1.04088688, "balance_loss_mlp": 1.01443481, "epoch": 0.9431852341730296, "flos": 32556038837760.0, "grad_norm": 2.1438351596002847, "language_loss": 0.56783378, "learning_rate": 3.3701081574355473e-08, "loss": 0.58937061, "num_input_tokens_seen": 169537195, "step": 7844, "time_per_iteration": 2.6157724857330322 }, { "auxiliary_loss_clip": 0.01044108, "auxiliary_loss_mlp": 0.01000561, "balance_loss_clip": 1.00895619, "balance_loss_mlp": 0.99973881, "epoch": 0.9433054770636686, "flos": 66904490252160.0, "grad_norm": 0.6389719999319686, "language_loss": 0.51683843, "learning_rate": 3.3558831932169796e-08, "loss": 0.53728509, "num_input_tokens_seen": 169605865, "step": 7845, "time_per_iteration": 3.1352052688598633 }, { "auxiliary_loss_clip": 0.01147098, "auxiliary_loss_mlp": 0.01022537, "balance_loss_clip": 1.0436132, "balance_loss_mlp": 1.01576877, "epoch": 0.9434257199543077, "flos": 26140346916480.0, "grad_norm": 1.8783722097149476, "language_loss": 0.88667786, "learning_rate": 3.341688059754588e-08, "loss": 0.90837425, "num_input_tokens_seen": 169621520, "step": 7846, "time_per_iteration": 2.4822561740875244 }, { "auxiliary_loss_clip": 0.01128338, "auxiliary_loss_mlp": 0.00761133, "balance_loss_clip": 1.0414772, "balance_loss_mlp": 1.00044942, "epoch": 0.9435459628449467, "flos": 25003486483200.0, "grad_norm": 2.487402269255585, "language_loss": 0.7776705, "learning_rate": 3.327522759201762e-08, "loss": 0.79656523, "num_input_tokens_seen": 169641390, "step": 7847, "time_per_iteration": 2.557314872741699 }, { "auxiliary_loss_clip": 0.01123359, "auxiliary_loss_mlp": 0.01027031, "balance_loss_clip": 1.04450023, "balance_loss_mlp": 1.01948535, "epoch": 0.9436662057355859, "flos": 22163240309760.0, "grad_norm": 2.1591998272621566, "language_loss": 0.67063391, "learning_rate": 3.313387293707359e-08, "loss": 0.69213784, "num_input_tokens_seen": 169660095, "step": 7848, "time_per_iteration": 2.5176422595977783 }, { "auxiliary_loss_clip": 0.011214, "auxiliary_loss_mlp": 0.01028111, "balance_loss_clip": 1.04487526, "balance_loss_mlp": 1.02023673, "epoch": 0.943786448626225, "flos": 20118522602880.0, "grad_norm": 2.23324976364037, "language_loss": 0.68399274, "learning_rate": 3.29928166541571e-08, "loss": 0.70548785, "num_input_tokens_seen": 169679050, "step": 7849, "time_per_iteration": 2.5070457458496094 }, { "auxiliary_loss_clip": 0.0112696, "auxiliary_loss_mlp": 0.01022574, "balance_loss_clip": 1.04282618, "balance_loss_mlp": 1.01555312, "epoch": 0.943906691516864, "flos": 22090808534400.0, "grad_norm": 2.0133440353510057, "language_loss": 0.80231166, "learning_rate": 3.2852058764666346e-08, "loss": 0.823807, "num_input_tokens_seen": 169698150, "step": 7850, "time_per_iteration": 2.4874753952026367 }, { "auxiliary_loss_clip": 0.01111345, "auxiliary_loss_mlp": 0.0102702, "balance_loss_clip": 1.04403138, "balance_loss_mlp": 1.02015352, "epoch": 0.9440269344075032, "flos": 35298501212160.0, "grad_norm": 2.561723524048758, "language_loss": 0.68113899, "learning_rate": 3.2711599289954264e-08, "loss": 0.70252264, "num_input_tokens_seen": 169722185, "step": 7851, "time_per_iteration": 2.6302433013916016 }, { "auxiliary_loss_clip": 0.01095275, "auxiliary_loss_mlp": 0.01029302, "balance_loss_clip": 1.03962159, "balance_loss_mlp": 1.02248001, "epoch": 0.9441471772981422, "flos": 19238136255360.0, "grad_norm": 1.8332428705005133, "language_loss": 0.7770437, "learning_rate": 3.257143825132847e-08, "loss": 0.79828948, "num_input_tokens_seen": 169740355, "step": 7852, "time_per_iteration": 2.5932705402374268 }, { "auxiliary_loss_clip": 0.01136856, "auxiliary_loss_mlp": 0.01022315, "balance_loss_clip": 1.04454064, "balance_loss_mlp": 1.01548481, "epoch": 0.9442674201887813, "flos": 25739799379200.0, "grad_norm": 4.394625244606911, "language_loss": 0.76086712, "learning_rate": 3.243157567005106e-08, "loss": 0.78245878, "num_input_tokens_seen": 169758535, "step": 7853, "time_per_iteration": 2.51511287689209 }, { "auxiliary_loss_clip": 0.01170208, "auxiliary_loss_mlp": 0.01026276, "balance_loss_clip": 1.05040717, "balance_loss_mlp": 1.01894498, "epoch": 0.9443876630794205, "flos": 15523321737600.0, "grad_norm": 4.034205100257239, "language_loss": 0.63952571, "learning_rate": 3.2292011567339296e-08, "loss": 0.66149056, "num_input_tokens_seen": 169776340, "step": 7854, "time_per_iteration": 3.224992036819458 }, { "auxiliary_loss_clip": 0.01151649, "auxiliary_loss_mlp": 0.00761541, "balance_loss_clip": 1.04486895, "balance_loss_mlp": 1.00043523, "epoch": 0.9445079059700595, "flos": 13400821128960.0, "grad_norm": 2.1489651399515006, "language_loss": 0.55808181, "learning_rate": 3.21527459643649e-08, "loss": 0.5772137, "num_input_tokens_seen": 169793225, "step": 7855, "time_per_iteration": 3.1556334495544434 }, { "auxiliary_loss_clip": 0.01153853, "auxiliary_loss_mlp": 0.01023941, "balance_loss_clip": 1.04605937, "balance_loss_mlp": 1.01671994, "epoch": 0.9446281488606986, "flos": 23659242877440.0, "grad_norm": 2.310149782181084, "language_loss": 0.74146456, "learning_rate": 3.2013778882254536e-08, "loss": 0.76324248, "num_input_tokens_seen": 169812020, "step": 7856, "time_per_iteration": 3.2867541313171387 }, { "auxiliary_loss_clip": 0.01142727, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.04364061, "balance_loss_mlp": 1.02255809, "epoch": 0.9447483917513377, "flos": 25557337267200.0, "grad_norm": 1.7366303690542395, "language_loss": 0.75807232, "learning_rate": 3.1875110342088676e-08, "loss": 0.7797935, "num_input_tokens_seen": 169833470, "step": 7857, "time_per_iteration": 2.5026652812957764 }, { "auxiliary_loss_clip": 0.01133024, "auxiliary_loss_mlp": 0.01020985, "balance_loss_clip": 1.04511595, "balance_loss_mlp": 1.01433003, "epoch": 0.9448686346419768, "flos": 24535463247360.0, "grad_norm": 1.6502779326658825, "language_loss": 0.65774155, "learning_rate": 3.1736740364904035e-08, "loss": 0.67928159, "num_input_tokens_seen": 169854000, "step": 7858, "time_per_iteration": 2.4973976612091064 }, { "auxiliary_loss_clip": 0.01104734, "auxiliary_loss_mlp": 0.0076186, "balance_loss_clip": 1.04017067, "balance_loss_mlp": 1.00051117, "epoch": 0.9449888775326158, "flos": 14721256995840.0, "grad_norm": 2.04362269263793, "language_loss": 0.77462316, "learning_rate": 3.159866897169094e-08, "loss": 0.79328907, "num_input_tokens_seen": 169872200, "step": 7859, "time_per_iteration": 2.543001174926758 }, { "auxiliary_loss_clip": 0.01130372, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.04449677, "balance_loss_mlp": 1.0182246, "epoch": 0.945109120423255, "flos": 15447873219840.0, "grad_norm": 1.7353632625093676, "language_loss": 0.75522542, "learning_rate": 3.146089618339487e-08, "loss": 0.77678096, "num_input_tokens_seen": 169889055, "step": 7860, "time_per_iteration": 2.4711670875549316 }, { "auxiliary_loss_clip": 0.01124224, "auxiliary_loss_mlp": 0.0102003, "balance_loss_clip": 1.04256558, "balance_loss_mlp": 1.01294041, "epoch": 0.9452293633138941, "flos": 25448097029760.0, "grad_norm": 2.402187216531958, "language_loss": 0.67991292, "learning_rate": 3.132342202091554e-08, "loss": 0.70135552, "num_input_tokens_seen": 169909280, "step": 7861, "time_per_iteration": 2.54388427734375 }, { "auxiliary_loss_clip": 0.01165571, "auxiliary_loss_mlp": 0.01024379, "balance_loss_clip": 1.04651487, "balance_loss_mlp": 1.01716661, "epoch": 0.9453496062045331, "flos": 21215342350080.0, "grad_norm": 2.0322010002819706, "language_loss": 0.68738711, "learning_rate": 3.1186246505107595e-08, "loss": 0.70928663, "num_input_tokens_seen": 169928420, "step": 7862, "time_per_iteration": 2.4076900482177734 }, { "auxiliary_loss_clip": 0.01152482, "auxiliary_loss_mlp": 0.01024305, "balance_loss_clip": 1.04783988, "balance_loss_mlp": 1.01694083, "epoch": 0.9454698490951723, "flos": 20010898477440.0, "grad_norm": 1.751808450621398, "language_loss": 0.8378222, "learning_rate": 3.104936965678084e-08, "loss": 0.85959011, "num_input_tokens_seen": 169946750, "step": 7863, "time_per_iteration": 3.22268009185791 }, { "auxiliary_loss_clip": 0.01149984, "auxiliary_loss_mlp": 0.01021535, "balance_loss_clip": 1.04435408, "balance_loss_mlp": 1.01422191, "epoch": 0.9455900919858113, "flos": 21069652786560.0, "grad_norm": 1.9975630592408675, "language_loss": 0.81775296, "learning_rate": 3.091279149669956e-08, "loss": 0.83946812, "num_input_tokens_seen": 169965540, "step": 7864, "time_per_iteration": 2.4439079761505127 }, { "auxiliary_loss_clip": 0.01150629, "auxiliary_loss_mlp": 0.00761415, "balance_loss_clip": 1.04563642, "balance_loss_mlp": 1.00042129, "epoch": 0.9457103348764504, "flos": 20740854666240.0, "grad_norm": 1.759538006397204, "language_loss": 0.73269653, "learning_rate": 3.0776512045581624e-08, "loss": 0.75181699, "num_input_tokens_seen": 169984330, "step": 7865, "time_per_iteration": 2.4579951763153076 }, { "auxiliary_loss_clip": 0.01131592, "auxiliary_loss_mlp": 0.01027437, "balance_loss_clip": 1.04380536, "balance_loss_mlp": 1.02014744, "epoch": 0.9458305777670896, "flos": 21428363957760.0, "grad_norm": 2.1695022760122167, "language_loss": 0.77613556, "learning_rate": 3.0640531324101384e-08, "loss": 0.79772586, "num_input_tokens_seen": 170002095, "step": 7866, "time_per_iteration": 2.462996244430542 }, { "auxiliary_loss_clip": 0.01155181, "auxiliary_loss_mlp": 0.01024206, "balance_loss_clip": 1.04993081, "balance_loss_mlp": 1.01644576, "epoch": 0.9459508206577286, "flos": 20011185786240.0, "grad_norm": 1.8162068436731986, "language_loss": 0.76079047, "learning_rate": 3.0504849352886554e-08, "loss": 0.78258437, "num_input_tokens_seen": 170020240, "step": 7867, "time_per_iteration": 2.4411990642547607 }, { "auxiliary_loss_clip": 0.01151334, "auxiliary_loss_mlp": 0.01021391, "balance_loss_clip": 1.04655886, "balance_loss_mlp": 1.01462269, "epoch": 0.9460710635483677, "flos": 12166428291840.0, "grad_norm": 9.210517222510402, "language_loss": 0.712843, "learning_rate": 3.036946615252023e-08, "loss": 0.73457026, "num_input_tokens_seen": 170035770, "step": 7868, "time_per_iteration": 2.4180634021759033 }, { "auxiliary_loss_clip": 0.01140638, "auxiliary_loss_mlp": 0.01024854, "balance_loss_clip": 1.04395854, "balance_loss_mlp": 1.01774037, "epoch": 0.9461913064390068, "flos": 34276196229120.0, "grad_norm": 2.3525004973140993, "language_loss": 0.67115986, "learning_rate": 3.0234381743539984e-08, "loss": 0.69281471, "num_input_tokens_seen": 170053385, "step": 7869, "time_per_iteration": 2.5811891555786133 }, { "auxiliary_loss_clip": 0.01142579, "auxiliary_loss_mlp": 0.01021928, "balance_loss_clip": 1.04408407, "balance_loss_mlp": 1.0148561, "epoch": 0.9463115493296459, "flos": 19463763536640.0, "grad_norm": 2.0340607844145624, "language_loss": 0.80144501, "learning_rate": 3.0099596146437863e-08, "loss": 0.82309002, "num_input_tokens_seen": 170070490, "step": 7870, "time_per_iteration": 2.476649522781372 }, { "auxiliary_loss_clip": 0.01060806, "auxiliary_loss_mlp": 0.01000792, "balance_loss_clip": 1.00727773, "balance_loss_mlp": 0.99993342, "epoch": 0.946431792220285, "flos": 70570824387840.0, "grad_norm": 0.7681569221726348, "language_loss": 0.60079873, "learning_rate": 2.996510938166086e-08, "loss": 0.62141472, "num_input_tokens_seen": 170133465, "step": 7871, "time_per_iteration": 3.0822348594665527 }, { "auxiliary_loss_clip": 0.01148955, "auxiliary_loss_mlp": 0.0102205, "balance_loss_clip": 1.04755247, "balance_loss_mlp": 1.01548183, "epoch": 0.9465520351109241, "flos": 18947906363520.0, "grad_norm": 1.8692398209276266, "language_loss": 0.7330935, "learning_rate": 2.983092146960997e-08, "loss": 0.75480354, "num_input_tokens_seen": 170150810, "step": 7872, "time_per_iteration": 2.435206413269043 }, { "auxiliary_loss_clip": 0.01137302, "auxiliary_loss_mlp": 0.01028445, "balance_loss_clip": 1.04230714, "balance_loss_mlp": 1.02062798, "epoch": 0.9466722780015632, "flos": 19135647774720.0, "grad_norm": 1.9942096704682952, "language_loss": 0.80014908, "learning_rate": 2.9697032430642256e-08, "loss": 0.82180655, "num_input_tokens_seen": 170169025, "step": 7873, "time_per_iteration": 2.4758474826812744 }, { "auxiliary_loss_clip": 0.01161119, "auxiliary_loss_mlp": 0.0102198, "balance_loss_clip": 1.04604065, "balance_loss_mlp": 1.01557302, "epoch": 0.9467925208922022, "flos": 17237912520960.0, "grad_norm": 2.3568464725218714, "language_loss": 0.73452997, "learning_rate": 2.9563442285067906e-08, "loss": 0.75636089, "num_input_tokens_seen": 170186070, "step": 7874, "time_per_iteration": 2.370945692062378 }, { "auxiliary_loss_clip": 0.01153819, "auxiliary_loss_mlp": 0.01023831, "balance_loss_clip": 1.0470686, "balance_loss_mlp": 1.0165503, "epoch": 0.9469127637828414, "flos": 29169016859520.0, "grad_norm": 2.236254873262255, "language_loss": 0.79644465, "learning_rate": 2.943015105315294e-08, "loss": 0.81822109, "num_input_tokens_seen": 170206265, "step": 7875, "time_per_iteration": 2.5071771144866943 }, { "auxiliary_loss_clip": 0.01108762, "auxiliary_loss_mlp": 0.01023267, "balance_loss_clip": 1.03932202, "balance_loss_mlp": 1.0153513, "epoch": 0.9470330066734804, "flos": 26030460234240.0, "grad_norm": 3.049599271476982, "language_loss": 0.6647172, "learning_rate": 2.929715875511718e-08, "loss": 0.68603754, "num_input_tokens_seen": 170225300, "step": 7876, "time_per_iteration": 2.5589890480041504 }, { "auxiliary_loss_clip": 0.0115107, "auxiliary_loss_mlp": 0.01022888, "balance_loss_clip": 1.04243565, "balance_loss_mlp": 1.01573563, "epoch": 0.9471532495641195, "flos": 23440906056960.0, "grad_norm": 1.815602305408838, "language_loss": 0.69892883, "learning_rate": 2.9164465411135375e-08, "loss": 0.72066844, "num_input_tokens_seen": 170245070, "step": 7877, "time_per_iteration": 2.4514646530151367 }, { "auxiliary_loss_clip": 0.011531, "auxiliary_loss_mlp": 0.01021538, "balance_loss_clip": 1.04796863, "balance_loss_mlp": 1.01454651, "epoch": 0.9472734924547586, "flos": 15815850099840.0, "grad_norm": 1.7370293766497769, "language_loss": 0.80699301, "learning_rate": 2.9032071041337426e-08, "loss": 0.82873929, "num_input_tokens_seen": 170263305, "step": 7878, "time_per_iteration": 2.4093966484069824 }, { "auxiliary_loss_clip": 0.01130405, "auxiliary_loss_mlp": 0.01027202, "balance_loss_clip": 1.04324424, "balance_loss_mlp": 1.0203445, "epoch": 0.9473937353453977, "flos": 11181793697280.0, "grad_norm": 1.8808750636263978, "language_loss": 0.73073232, "learning_rate": 2.889997566580704e-08, "loss": 0.75230843, "num_input_tokens_seen": 170281460, "step": 7879, "time_per_iteration": 2.4499850273132324 }, { "auxiliary_loss_clip": 0.01165363, "auxiliary_loss_mlp": 0.0102494, "balance_loss_clip": 1.04604936, "balance_loss_mlp": 1.01741755, "epoch": 0.9475139782360368, "flos": 25775530433280.0, "grad_norm": 1.5926766350699328, "language_loss": 0.7053048, "learning_rate": 2.8768179304583086e-08, "loss": 0.72720784, "num_input_tokens_seen": 170303515, "step": 7880, "time_per_iteration": 2.4577770233154297 }, { "auxiliary_loss_clip": 0.01124808, "auxiliary_loss_mlp": 0.01026758, "balance_loss_clip": 1.04575443, "balance_loss_mlp": 1.01992762, "epoch": 0.9476342211266758, "flos": 22820046451200.0, "grad_norm": 1.833557670925971, "language_loss": 0.73398244, "learning_rate": 2.8636681977659117e-08, "loss": 0.75549817, "num_input_tokens_seen": 170323165, "step": 7881, "time_per_iteration": 3.3875198364257812 }, { "auxiliary_loss_clip": 0.01107102, "auxiliary_loss_mlp": 0.01026972, "balance_loss_clip": 1.04396105, "balance_loss_mlp": 1.01984334, "epoch": 0.947754464017315, "flos": 20193611984640.0, "grad_norm": 2.6093908905250554, "language_loss": 0.78018767, "learning_rate": 2.850548370498318e-08, "loss": 0.80152845, "num_input_tokens_seen": 170341005, "step": 7882, "time_per_iteration": 3.291841983795166 }, { "auxiliary_loss_clip": 0.01147573, "auxiliary_loss_mlp": 0.01020507, "balance_loss_clip": 1.04203844, "balance_loss_mlp": 1.01399851, "epoch": 0.9478747069079541, "flos": 24717925359360.0, "grad_norm": 4.0760837896489095, "language_loss": 0.71137774, "learning_rate": 2.8374584506457798e-08, "loss": 0.73305857, "num_input_tokens_seen": 170362280, "step": 7883, "time_per_iteration": 3.295448064804077 }, { "auxiliary_loss_clip": 0.01135331, "auxiliary_loss_mlp": 0.01020674, "balance_loss_clip": 1.04536188, "balance_loss_mlp": 1.01327085, "epoch": 0.9479949497985931, "flos": 21361355136000.0, "grad_norm": 2.475995640123661, "language_loss": 0.67193711, "learning_rate": 2.824398440193998e-08, "loss": 0.69349718, "num_input_tokens_seen": 170381080, "step": 7884, "time_per_iteration": 2.469330072402954 }, { "auxiliary_loss_clip": 0.01106307, "auxiliary_loss_mlp": 0.01025589, "balance_loss_clip": 1.04240775, "balance_loss_mlp": 1.01794779, "epoch": 0.9481151926892323, "flos": 18148606968960.0, "grad_norm": 3.7627254362232887, "language_loss": 0.71908408, "learning_rate": 2.811368341124232e-08, "loss": 0.74040306, "num_input_tokens_seen": 170400150, "step": 7885, "time_per_iteration": 2.54787015914917 }, { "auxiliary_loss_clip": 0.0113517, "auxiliary_loss_mlp": 0.01029065, "balance_loss_clip": 1.04399848, "balance_loss_mlp": 1.02203727, "epoch": 0.9482354355798713, "flos": 22128012046080.0, "grad_norm": 2.1640753261137786, "language_loss": 0.68272167, "learning_rate": 2.7983681554131222e-08, "loss": 0.70436406, "num_input_tokens_seen": 170420410, "step": 7886, "time_per_iteration": 2.4963207244873047 }, { "auxiliary_loss_clip": 0.01135129, "auxiliary_loss_mlp": 0.01024581, "balance_loss_clip": 1.04341221, "balance_loss_mlp": 1.01740503, "epoch": 0.9483556784705104, "flos": 19063072344960.0, "grad_norm": 3.9878115921925037, "language_loss": 0.708947, "learning_rate": 2.7853978850327365e-08, "loss": 0.73054409, "num_input_tokens_seen": 170439580, "step": 7887, "time_per_iteration": 2.463390827178955 }, { "auxiliary_loss_clip": 0.0112247, "auxiliary_loss_mlp": 0.01022628, "balance_loss_clip": 1.04718244, "balance_loss_mlp": 1.01565993, "epoch": 0.9484759213611496, "flos": 25777110631680.0, "grad_norm": 1.8949550063773881, "language_loss": 0.87284803, "learning_rate": 2.7724575319507225e-08, "loss": 0.89429903, "num_input_tokens_seen": 170459290, "step": 7888, "time_per_iteration": 2.5543770790100098 }, { "auxiliary_loss_clip": 0.01149039, "auxiliary_loss_mlp": 0.01022585, "balance_loss_clip": 1.04298997, "balance_loss_mlp": 1.01584625, "epoch": 0.9485961642517886, "flos": 20667740532480.0, "grad_norm": 1.7886601044194215, "language_loss": 0.77062273, "learning_rate": 2.759547098130044e-08, "loss": 0.79233897, "num_input_tokens_seen": 170478020, "step": 7889, "time_per_iteration": 2.4523630142211914 }, { "auxiliary_loss_clip": 0.01161788, "auxiliary_loss_mlp": 0.01023784, "balance_loss_clip": 1.04602659, "balance_loss_mlp": 1.01664329, "epoch": 0.9487164071424277, "flos": 22674069578880.0, "grad_norm": 2.415688712473607, "language_loss": 0.76696026, "learning_rate": 2.746666585529267e-08, "loss": 0.78881598, "num_input_tokens_seen": 170498295, "step": 7890, "time_per_iteration": 3.210886240005493 }, { "auxiliary_loss_clip": 0.01142561, "auxiliary_loss_mlp": 0.01025073, "balance_loss_clip": 1.04321933, "balance_loss_mlp": 1.0180279, "epoch": 0.9488366500330668, "flos": 38726461716480.0, "grad_norm": 2.0665936921282975, "language_loss": 0.74251044, "learning_rate": 2.73381599610234e-08, "loss": 0.76418686, "num_input_tokens_seen": 170518695, "step": 7891, "time_per_iteration": 2.6543033123016357 }, { "auxiliary_loss_clip": 0.01144506, "auxiliary_loss_mlp": 0.0102551, "balance_loss_clip": 1.04194331, "balance_loss_mlp": 1.01799703, "epoch": 0.9489568929237059, "flos": 27890920149120.0, "grad_norm": 1.6947393843977296, "language_loss": 0.71408939, "learning_rate": 2.7209953317987033e-08, "loss": 0.73578954, "num_input_tokens_seen": 170539735, "step": 7892, "time_per_iteration": 2.508558511734009 }, { "auxiliary_loss_clip": 0.01151589, "auxiliary_loss_mlp": 0.01021677, "balance_loss_clip": 1.04643822, "balance_loss_mlp": 1.01466143, "epoch": 0.9490771358143449, "flos": 33580642291200.0, "grad_norm": 3.0216657475812747, "language_loss": 0.78105861, "learning_rate": 2.7082045945631793e-08, "loss": 0.80279124, "num_input_tokens_seen": 170561950, "step": 7893, "time_per_iteration": 2.5604779720306396 }, { "auxiliary_loss_clip": 0.01114849, "auxiliary_loss_mlp": 0.01022422, "balance_loss_clip": 1.04189968, "balance_loss_mlp": 1.01517975, "epoch": 0.9491973787049841, "flos": 14793796512000.0, "grad_norm": 2.18425932258128, "language_loss": 0.69260025, "learning_rate": 2.6954437863361712e-08, "loss": 0.71397299, "num_input_tokens_seen": 170579865, "step": 7894, "time_per_iteration": 2.504115104675293 }, { "auxiliary_loss_clip": 0.01095179, "auxiliary_loss_mlp": 0.01021642, "balance_loss_clip": 1.03972673, "balance_loss_mlp": 1.01520801, "epoch": 0.9493176215956232, "flos": 25332535998720.0, "grad_norm": 2.1189384699708183, "language_loss": 0.70841491, "learning_rate": 2.6827129090534862e-08, "loss": 0.72958314, "num_input_tokens_seen": 170600165, "step": 7895, "time_per_iteration": 2.644534111022949 }, { "auxiliary_loss_clip": 0.01135989, "auxiliary_loss_mlp": 0.01026754, "balance_loss_clip": 1.04632664, "balance_loss_mlp": 1.019279, "epoch": 0.9494378644862622, "flos": 21029971236480.0, "grad_norm": 2.0963763626508687, "language_loss": 0.77955866, "learning_rate": 2.670011964646335e-08, "loss": 0.80118608, "num_input_tokens_seen": 170618845, "step": 7896, "time_per_iteration": 2.4762024879455566 }, { "auxiliary_loss_clip": 0.01083493, "auxiliary_loss_mlp": 0.01023583, "balance_loss_clip": 1.03318751, "balance_loss_mlp": 1.01596892, "epoch": 0.9495581073769014, "flos": 15195134148480.0, "grad_norm": 3.246791396021892, "language_loss": 0.68356919, "learning_rate": 2.657340955041487e-08, "loss": 0.70463997, "num_input_tokens_seen": 170637620, "step": 7897, "time_per_iteration": 2.6020145416259766 }, { "auxiliary_loss_clip": 0.01138134, "auxiliary_loss_mlp": 0.01026416, "balance_loss_clip": 1.04704404, "balance_loss_mlp": 1.01871181, "epoch": 0.9496783502675404, "flos": 28616566705920.0, "grad_norm": 1.8595902935398658, "language_loss": 0.7189703, "learning_rate": 2.6446998821611167e-08, "loss": 0.74061584, "num_input_tokens_seen": 170657815, "step": 7898, "time_per_iteration": 2.5292317867279053 }, { "auxiliary_loss_clip": 0.01109144, "auxiliary_loss_mlp": 0.0102707, "balance_loss_clip": 1.0407455, "balance_loss_mlp": 1.01994395, "epoch": 0.9497985931581795, "flos": 14866874732160.0, "grad_norm": 2.51988940484905, "language_loss": 0.71779197, "learning_rate": 2.6320887479228228e-08, "loss": 0.7391541, "num_input_tokens_seen": 170674415, "step": 7899, "time_per_iteration": 2.5008280277252197 }, { "auxiliary_loss_clip": 0.0113983, "auxiliary_loss_mlp": 0.01028155, "balance_loss_clip": 1.04374695, "balance_loss_mlp": 1.02104425, "epoch": 0.9499188360488187, "flos": 27193319136000.0, "grad_norm": 2.3891901605304007, "language_loss": 0.726197, "learning_rate": 2.619507554239786e-08, "loss": 0.74787682, "num_input_tokens_seen": 170692975, "step": 7900, "time_per_iteration": 2.5016028881073 }, { "auxiliary_loss_clip": 0.01136778, "auxiliary_loss_mlp": 0.01027396, "balance_loss_clip": 1.04434419, "balance_loss_mlp": 1.02016592, "epoch": 0.9500390789394577, "flos": 24316479982080.0, "grad_norm": 1.7727170261963179, "language_loss": 0.6947031, "learning_rate": 2.606956303020502e-08, "loss": 0.71634483, "num_input_tokens_seen": 170713780, "step": 7901, "time_per_iteration": 2.5260939598083496 }, { "auxiliary_loss_clip": 0.0115266, "auxiliary_loss_mlp": 0.01022778, "balance_loss_clip": 1.04780269, "balance_loss_mlp": 1.01538396, "epoch": 0.9501593218300968, "flos": 14354752573440.0, "grad_norm": 1.7317561122193883, "language_loss": 0.8402015, "learning_rate": 2.5944349961690036e-08, "loss": 0.86195588, "num_input_tokens_seen": 170730800, "step": 7902, "time_per_iteration": 2.4129440784454346 }, { "auxiliary_loss_clip": 0.0112145, "auxiliary_loss_mlp": 0.01021667, "balance_loss_clip": 1.04260767, "balance_loss_mlp": 1.01452649, "epoch": 0.9502795647207359, "flos": 38728113742080.0, "grad_norm": 1.5990768469515926, "language_loss": 0.73021787, "learning_rate": 2.581943635584749e-08, "loss": 0.75164902, "num_input_tokens_seen": 170753630, "step": 7903, "time_per_iteration": 2.6594784259796143 }, { "auxiliary_loss_clip": 0.01128696, "auxiliary_loss_mlp": 0.01019697, "balance_loss_clip": 1.04357338, "balance_loss_mlp": 1.01336074, "epoch": 0.950399807611375, "flos": 40808023799040.0, "grad_norm": 1.510223638928864, "language_loss": 0.65172625, "learning_rate": 2.569482223162689e-08, "loss": 0.67321008, "num_input_tokens_seen": 170777605, "step": 7904, "time_per_iteration": 2.660789966583252 }, { "auxiliary_loss_clip": 0.01150816, "auxiliary_loss_mlp": 0.01021235, "balance_loss_clip": 1.04420209, "balance_loss_mlp": 1.01404631, "epoch": 0.950520050502014, "flos": 23440403266560.0, "grad_norm": 1.7361663881273115, "language_loss": 0.72268748, "learning_rate": 2.5570507607932e-08, "loss": 0.74440795, "num_input_tokens_seen": 170797520, "step": 7905, "time_per_iteration": 2.4628560543060303 }, { "auxiliary_loss_clip": 0.01155819, "auxiliary_loss_mlp": 0.01025197, "balance_loss_clip": 1.04578066, "balance_loss_mlp": 1.01793456, "epoch": 0.9506402933926532, "flos": 17783718658560.0, "grad_norm": 3.220973105537502, "language_loss": 0.63576317, "learning_rate": 2.54464925036213e-08, "loss": 0.65757334, "num_input_tokens_seen": 170814810, "step": 7906, "time_per_iteration": 2.4090638160705566 }, { "auxiliary_loss_clip": 0.01149802, "auxiliary_loss_mlp": 0.0102232, "balance_loss_clip": 1.04518294, "balance_loss_mlp": 1.01466048, "epoch": 0.9507605362832923, "flos": 32561928668160.0, "grad_norm": 1.8310937891401635, "language_loss": 0.60720944, "learning_rate": 2.532277693750773e-08, "loss": 0.62893069, "num_input_tokens_seen": 170835735, "step": 7907, "time_per_iteration": 3.38250470161438 }, { "auxiliary_loss_clip": 0.01109587, "auxiliary_loss_mlp": 0.0102353, "balance_loss_clip": 1.04533446, "balance_loss_mlp": 1.01631212, "epoch": 0.9508807791739313, "flos": 19602054898560.0, "grad_norm": 2.0829247431788755, "language_loss": 0.76211607, "learning_rate": 2.5199360928358948e-08, "loss": 0.78344727, "num_input_tokens_seen": 170852970, "step": 7908, "time_per_iteration": 3.339094400405884 }, { "auxiliary_loss_clip": 0.01139686, "auxiliary_loss_mlp": 0.00761169, "balance_loss_clip": 1.0419178, "balance_loss_mlp": 1.00043821, "epoch": 0.9510010220645704, "flos": 21471852349440.0, "grad_norm": 1.778389177432278, "language_loss": 0.86937213, "learning_rate": 2.507624449489665e-08, "loss": 0.88838065, "num_input_tokens_seen": 170871600, "step": 7909, "time_per_iteration": 3.2919118404388428 }, { "auxiliary_loss_clip": 0.01137541, "auxiliary_loss_mlp": 0.01026501, "balance_loss_clip": 1.04638052, "balance_loss_mlp": 1.01918125, "epoch": 0.9511212649552095, "flos": 18879999701760.0, "grad_norm": 1.8383305851584208, "language_loss": 0.65043491, "learning_rate": 2.495342765579811e-08, "loss": 0.67207533, "num_input_tokens_seen": 170890260, "step": 7910, "time_per_iteration": 2.4687838554382324 }, { "auxiliary_loss_clip": 0.01108182, "auxiliary_loss_mlp": 0.01020726, "balance_loss_clip": 1.04540312, "balance_loss_mlp": 1.01413095, "epoch": 0.9512415078458486, "flos": 20810521094400.0, "grad_norm": 1.7067830872009233, "language_loss": 0.70883018, "learning_rate": 2.4830910429693984e-08, "loss": 0.73011923, "num_input_tokens_seen": 170910220, "step": 7911, "time_per_iteration": 2.5718703269958496 }, { "auxiliary_loss_clip": 0.01163859, "auxiliary_loss_mlp": 0.01028249, "balance_loss_clip": 1.04575384, "balance_loss_mlp": 1.02097416, "epoch": 0.9513617507364877, "flos": 18369565482240.0, "grad_norm": 2.249737197616849, "language_loss": 0.79921162, "learning_rate": 2.470869283517052e-08, "loss": 0.82113266, "num_input_tokens_seen": 170928255, "step": 7912, "time_per_iteration": 2.3896517753601074 }, { "auxiliary_loss_clip": 0.01143383, "auxiliary_loss_mlp": 0.0102455, "balance_loss_clip": 1.04266667, "balance_loss_mlp": 1.0173403, "epoch": 0.9514819936271268, "flos": 25010166412800.0, "grad_norm": 1.5636190551080271, "language_loss": 0.76957893, "learning_rate": 2.458677489076777e-08, "loss": 0.79125828, "num_input_tokens_seen": 170949265, "step": 7913, "time_per_iteration": 2.507042407989502 }, { "auxiliary_loss_clip": 0.01140404, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.04278743, "balance_loss_mlp": 1.02054405, "epoch": 0.9516022365177659, "flos": 18662129758080.0, "grad_norm": 1.8812950454134774, "language_loss": 0.82991415, "learning_rate": 2.446515661498072e-08, "loss": 0.85159069, "num_input_tokens_seen": 170968595, "step": 7914, "time_per_iteration": 2.4373035430908203 }, { "auxiliary_loss_clip": 0.01093446, "auxiliary_loss_mlp": 0.01026178, "balance_loss_clip": 1.04129684, "balance_loss_mlp": 1.01921654, "epoch": 0.9517224794084049, "flos": 25372109808000.0, "grad_norm": 3.3668795184327522, "language_loss": 0.74233401, "learning_rate": 2.434383802625861e-08, "loss": 0.76353025, "num_input_tokens_seen": 170987550, "step": 7915, "time_per_iteration": 2.6068801879882812 }, { "auxiliary_loss_clip": 0.0112181, "auxiliary_loss_mlp": 0.01024353, "balance_loss_clip": 1.03996992, "balance_loss_mlp": 1.01738262, "epoch": 0.9518427222990441, "flos": 21470918595840.0, "grad_norm": 1.7502016370595734, "language_loss": 0.73787653, "learning_rate": 2.4222819143005168e-08, "loss": 0.75933814, "num_input_tokens_seen": 171007145, "step": 7916, "time_per_iteration": 3.2857820987701416 }, { "auxiliary_loss_clip": 0.01162166, "auxiliary_loss_mlp": 0.01023244, "balance_loss_clip": 1.04657066, "balance_loss_mlp": 1.01639533, "epoch": 0.9519629651896832, "flos": 21033634423680.0, "grad_norm": 4.0626144362388175, "language_loss": 0.81006205, "learning_rate": 2.4102099983579706e-08, "loss": 0.83191621, "num_input_tokens_seen": 171026295, "step": 7917, "time_per_iteration": 2.430516481399536 }, { "auxiliary_loss_clip": 0.01149456, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.04395151, "balance_loss_mlp": 1.01835704, "epoch": 0.9520832080803222, "flos": 21689219502720.0, "grad_norm": 1.8973443977570357, "language_loss": 0.77114999, "learning_rate": 2.3981680566294236e-08, "loss": 0.79290581, "num_input_tokens_seen": 171045895, "step": 7918, "time_per_iteration": 2.4527502059936523 }, { "auxiliary_loss_clip": 0.01163303, "auxiliary_loss_mlp": 0.01025004, "balance_loss_clip": 1.04807699, "balance_loss_mlp": 1.01848936, "epoch": 0.9522034509709614, "flos": 23145289125120.0, "grad_norm": 1.812190183045559, "language_loss": 0.73185623, "learning_rate": 2.3861560909416822e-08, "loss": 0.75373924, "num_input_tokens_seen": 171065445, "step": 7919, "time_per_iteration": 2.4244539737701416 }, { "auxiliary_loss_clip": 0.01111961, "auxiliary_loss_mlp": 0.01026699, "balance_loss_clip": 1.04474926, "balance_loss_mlp": 1.01994562, "epoch": 0.9523236938616004, "flos": 24679428958080.0, "grad_norm": 1.6848474862599423, "language_loss": 0.82475185, "learning_rate": 2.3741741031169325e-08, "loss": 0.84613848, "num_input_tokens_seen": 171085015, "step": 7920, "time_per_iteration": 2.5710854530334473 }, { "auxiliary_loss_clip": 0.01103872, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.03927994, "balance_loss_mlp": 1.02136779, "epoch": 0.9524439367522395, "flos": 22672309812480.0, "grad_norm": 1.8287716841394348, "language_loss": 0.71649098, "learning_rate": 2.3622220949728544e-08, "loss": 0.73780882, "num_input_tokens_seen": 171103900, "step": 7921, "time_per_iteration": 2.547158718109131 }, { "auxiliary_loss_clip": 0.0114276, "auxiliary_loss_mlp": 0.01027537, "balance_loss_clip": 1.04337382, "balance_loss_mlp": 1.01993132, "epoch": 0.9525641796428787, "flos": 34055525024640.0, "grad_norm": 2.8791901054580578, "language_loss": 0.61022198, "learning_rate": 2.3503000683225526e-08, "loss": 0.63192499, "num_input_tokens_seen": 171121615, "step": 7922, "time_per_iteration": 2.545583963394165 }, { "auxiliary_loss_clip": 0.01165562, "auxiliary_loss_mlp": 0.01024035, "balance_loss_clip": 1.04637206, "balance_loss_mlp": 1.01675129, "epoch": 0.9526844225335177, "flos": 16727083251840.0, "grad_norm": 2.132282022322332, "language_loss": 0.84494841, "learning_rate": 2.3384080249745585e-08, "loss": 0.86684436, "num_input_tokens_seen": 171139505, "step": 7923, "time_per_iteration": 2.4152626991271973 }, { "auxiliary_loss_clip": 0.01111939, "auxiliary_loss_mlp": 0.01024284, "balance_loss_clip": 1.04118335, "balance_loss_mlp": 1.01775384, "epoch": 0.9528046654241568, "flos": 36939367330560.0, "grad_norm": 2.3650445255666943, "language_loss": 0.82840222, "learning_rate": 2.3265459667329178e-08, "loss": 0.84976447, "num_input_tokens_seen": 171158995, "step": 7924, "time_per_iteration": 2.6985020637512207 }, { "auxiliary_loss_clip": 0.01137995, "auxiliary_loss_mlp": 0.01019972, "balance_loss_clip": 1.04471958, "balance_loss_mlp": 1.0129447, "epoch": 0.9529249083147959, "flos": 18255010032000.0, "grad_norm": 2.0093128489894374, "language_loss": 0.85960305, "learning_rate": 2.31471389539708e-08, "loss": 0.88118267, "num_input_tokens_seen": 171176120, "step": 7925, "time_per_iteration": 2.4927144050598145 }, { "auxiliary_loss_clip": 0.01151615, "auxiliary_loss_mlp": 0.00760929, "balance_loss_clip": 1.0464139, "balance_loss_mlp": 1.0004046, "epoch": 0.953045151205435, "flos": 28658438985600.0, "grad_norm": 2.239207463529204, "language_loss": 0.7294575, "learning_rate": 2.3029118127619872e-08, "loss": 0.74858296, "num_input_tokens_seen": 171195835, "step": 7926, "time_per_iteration": 2.5613765716552734 }, { "auxiliary_loss_clip": 0.01128845, "auxiliary_loss_mlp": 0.01022349, "balance_loss_clip": 1.04283845, "balance_loss_mlp": 1.01494622, "epoch": 0.953165394096074, "flos": 21835232288640.0, "grad_norm": 2.4410223684445755, "language_loss": 0.87269485, "learning_rate": 2.2911397206179628e-08, "loss": 0.89420676, "num_input_tokens_seen": 171212585, "step": 7927, "time_per_iteration": 2.4757931232452393 }, { "auxiliary_loss_clip": 0.01161736, "auxiliary_loss_mlp": 0.01024627, "balance_loss_clip": 1.04604971, "balance_loss_mlp": 1.01794589, "epoch": 0.9532856369867132, "flos": 19975059682560.0, "grad_norm": 6.581115793409175, "language_loss": 0.62622267, "learning_rate": 2.279397620750845e-08, "loss": 0.64808631, "num_input_tokens_seen": 171231630, "step": 7928, "time_per_iteration": 2.4068655967712402 }, { "auxiliary_loss_clip": 0.01133473, "auxiliary_loss_mlp": 0.01024837, "balance_loss_clip": 1.04270196, "balance_loss_mlp": 1.01843214, "epoch": 0.9534058798773523, "flos": 15049588239360.0, "grad_norm": 2.1314840585865342, "language_loss": 0.78899407, "learning_rate": 2.2676855149419195e-08, "loss": 0.81057721, "num_input_tokens_seen": 171248800, "step": 7929, "time_per_iteration": 2.44342303276062 }, { "auxiliary_loss_clip": 0.0113582, "auxiliary_loss_mlp": 0.0102509, "balance_loss_clip": 1.04875267, "balance_loss_mlp": 1.01795495, "epoch": 0.9535261227679913, "flos": 17602800831360.0, "grad_norm": 2.6539089806052143, "language_loss": 0.75727117, "learning_rate": 2.2560034049678988e-08, "loss": 0.77888036, "num_input_tokens_seen": 171263150, "step": 7930, "time_per_iteration": 2.443641424179077 }, { "auxiliary_loss_clip": 0.01169942, "auxiliary_loss_mlp": 0.01024312, "balance_loss_clip": 1.0495379, "balance_loss_mlp": 1.0171833, "epoch": 0.9536463656586305, "flos": 23142954741120.0, "grad_norm": 1.780541748263086, "language_loss": 0.75321794, "learning_rate": 2.2443512926008988e-08, "loss": 0.77516055, "num_input_tokens_seen": 171282480, "step": 7931, "time_per_iteration": 2.4090731143951416 }, { "auxiliary_loss_clip": 0.01123183, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.04190397, "balance_loss_mlp": 1.01912355, "epoch": 0.9537666085492695, "flos": 18625033987200.0, "grad_norm": 2.6809850364366263, "language_loss": 0.69789481, "learning_rate": 2.2327291796085946e-08, "loss": 0.71938813, "num_input_tokens_seen": 171300840, "step": 7932, "time_per_iteration": 2.4717390537261963 }, { "auxiliary_loss_clip": 0.01164471, "auxiliary_loss_mlp": 0.01025609, "balance_loss_clip": 1.0455513, "balance_loss_mlp": 1.01853418, "epoch": 0.9538868514399086, "flos": 18989347680000.0, "grad_norm": 3.124842473277786, "language_loss": 0.77398485, "learning_rate": 2.2211370677540197e-08, "loss": 0.79588568, "num_input_tokens_seen": 171317365, "step": 7933, "time_per_iteration": 3.242696762084961 }, { "auxiliary_loss_clip": 0.0116536, "auxiliary_loss_mlp": 0.01026135, "balance_loss_clip": 1.0467689, "balance_loss_mlp": 1.01891732, "epoch": 0.9540070943305478, "flos": 16800556521600.0, "grad_norm": 3.348007318913691, "language_loss": 0.7835173, "learning_rate": 2.2095749587957012e-08, "loss": 0.8054322, "num_input_tokens_seen": 171335270, "step": 7934, "time_per_iteration": 2.389634609222412 }, { "auxiliary_loss_clip": 0.01131952, "auxiliary_loss_mlp": 0.01026264, "balance_loss_clip": 1.04043746, "balance_loss_mlp": 1.01871836, "epoch": 0.9541273372211868, "flos": 20156911263360.0, "grad_norm": 2.830768016858018, "language_loss": 0.69362223, "learning_rate": 2.1980428544876138e-08, "loss": 0.71520436, "num_input_tokens_seen": 171353910, "step": 7935, "time_per_iteration": 3.239562511444092 }, { "auxiliary_loss_clip": 0.01100154, "auxiliary_loss_mlp": 0.01024546, "balance_loss_clip": 1.03607368, "balance_loss_mlp": 1.01690483, "epoch": 0.9542475801118259, "flos": 26725511381760.0, "grad_norm": 1.5325552328243468, "language_loss": 0.73867846, "learning_rate": 2.1865407565791584e-08, "loss": 0.75992548, "num_input_tokens_seen": 171375480, "step": 7936, "time_per_iteration": 3.4552974700927734 }, { "auxiliary_loss_clip": 0.01135908, "auxiliary_loss_mlp": 0.01022589, "balance_loss_clip": 1.0413959, "balance_loss_mlp": 1.01513267, "epoch": 0.954367823002465, "flos": 23330911633920.0, "grad_norm": 6.233755643949018, "language_loss": 0.77534211, "learning_rate": 2.175068666815183e-08, "loss": 0.79692709, "num_input_tokens_seen": 171396320, "step": 7937, "time_per_iteration": 2.5032925605773926 }, { "auxiliary_loss_clip": 0.01124115, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.04264879, "balance_loss_mlp": 1.02337813, "epoch": 0.9544880658931041, "flos": 14902713527040.0, "grad_norm": 2.1005712454665537, "language_loss": 0.78787315, "learning_rate": 2.163626586935985e-08, "loss": 0.80942065, "num_input_tokens_seen": 171412860, "step": 7938, "time_per_iteration": 2.511671781539917 }, { "auxiliary_loss_clip": 0.01146452, "auxiliary_loss_mlp": 0.0102855, "balance_loss_clip": 1.04323411, "balance_loss_mlp": 1.02118242, "epoch": 0.9546083087837431, "flos": 29095902725760.0, "grad_norm": 2.280249562238388, "language_loss": 0.63033187, "learning_rate": 2.1522145186773755e-08, "loss": 0.65208197, "num_input_tokens_seen": 171431780, "step": 7939, "time_per_iteration": 2.5599637031555176 }, { "auxiliary_loss_clip": 0.0113347, "auxiliary_loss_mlp": 0.01025053, "balance_loss_clip": 1.04318857, "balance_loss_mlp": 1.01823378, "epoch": 0.9547285516743822, "flos": 21142335957120.0, "grad_norm": 2.3914949473883462, "language_loss": 0.85148418, "learning_rate": 2.140832463770481e-08, "loss": 0.87306941, "num_input_tokens_seen": 171450975, "step": 7940, "time_per_iteration": 2.4612669944763184 }, { "auxiliary_loss_clip": 0.0113996, "auxiliary_loss_mlp": 0.01021349, "balance_loss_clip": 1.04350042, "balance_loss_mlp": 1.01423788, "epoch": 0.9548487945650214, "flos": 27490157130240.0, "grad_norm": 2.594106354653213, "language_loss": 0.75912237, "learning_rate": 2.129480423941987e-08, "loss": 0.78073543, "num_input_tokens_seen": 171467645, "step": 7941, "time_per_iteration": 2.5190694332122803 }, { "auxiliary_loss_clip": 0.01138088, "auxiliary_loss_mlp": 0.01022245, "balance_loss_clip": 1.04352808, "balance_loss_mlp": 1.01566744, "epoch": 0.9549690374556604, "flos": 22273198819200.0, "grad_norm": 1.628924507148828, "language_loss": 0.80196249, "learning_rate": 2.1181584009140052e-08, "loss": 0.82356584, "num_input_tokens_seen": 171487185, "step": 7942, "time_per_iteration": 3.177783250808716 }, { "auxiliary_loss_clip": 0.01131414, "auxiliary_loss_mlp": 0.01025015, "balance_loss_clip": 1.04435813, "balance_loss_mlp": 1.01856613, "epoch": 0.9550892803462995, "flos": 17595294888960.0, "grad_norm": 2.2168011145549005, "language_loss": 0.83744979, "learning_rate": 2.10686639640405e-08, "loss": 0.85901415, "num_input_tokens_seen": 171501275, "step": 7943, "time_per_iteration": 2.4626402854919434 }, { "auxiliary_loss_clip": 0.01153053, "auxiliary_loss_mlp": 0.01024652, "balance_loss_clip": 1.04559457, "balance_loss_mlp": 1.01724911, "epoch": 0.9552095232369386, "flos": 24353144789760.0, "grad_norm": 1.7419552880218059, "language_loss": 0.81155145, "learning_rate": 2.0956044121251294e-08, "loss": 0.83332855, "num_input_tokens_seen": 171520060, "step": 7944, "time_per_iteration": 2.489194393157959 }, { "auxiliary_loss_clip": 0.01123822, "auxiliary_loss_mlp": 0.01024473, "balance_loss_clip": 1.04558134, "balance_loss_mlp": 1.01712394, "epoch": 0.9553297661275777, "flos": 22746860490240.0, "grad_norm": 1.7479338731445038, "language_loss": 0.81214035, "learning_rate": 2.084372449785654e-08, "loss": 0.83362329, "num_input_tokens_seen": 171539895, "step": 7945, "time_per_iteration": 2.5234341621398926 }, { "auxiliary_loss_clip": 0.01131955, "auxiliary_loss_mlp": 0.01025876, "balance_loss_clip": 1.04187047, "balance_loss_mlp": 1.01883042, "epoch": 0.9554500090182168, "flos": 15413866018560.0, "grad_norm": 1.7917656836217764, "language_loss": 0.68747103, "learning_rate": 2.0731705110895282e-08, "loss": 0.70904934, "num_input_tokens_seen": 171557385, "step": 7946, "time_per_iteration": 2.4476161003112793 }, { "auxiliary_loss_clip": 0.01155151, "auxiliary_loss_mlp": 0.01024817, "balance_loss_clip": 1.0483501, "balance_loss_mlp": 1.01714623, "epoch": 0.9555702519088559, "flos": 23513517400320.0, "grad_norm": 2.0195917611910446, "language_loss": 0.86830354, "learning_rate": 2.0619985977360587e-08, "loss": 0.89010322, "num_input_tokens_seen": 171575705, "step": 7947, "time_per_iteration": 2.4610509872436523 }, { "auxiliary_loss_clip": 0.01107197, "auxiliary_loss_mlp": 0.0102604, "balance_loss_clip": 1.03770041, "balance_loss_mlp": 1.01914978, "epoch": 0.955690494799495, "flos": 22962072827520.0, "grad_norm": 1.853082052047393, "language_loss": 0.76820534, "learning_rate": 2.0508567114200237e-08, "loss": 0.78953773, "num_input_tokens_seen": 171595620, "step": 7948, "time_per_iteration": 2.5383851528167725 }, { "auxiliary_loss_clip": 0.01139389, "auxiliary_loss_mlp": 0.01022111, "balance_loss_clip": 1.04455066, "balance_loss_mlp": 1.0154798, "epoch": 0.955810737690134, "flos": 26031250333440.0, "grad_norm": 3.7066386509107794, "language_loss": 0.78812921, "learning_rate": 2.0397448538316485e-08, "loss": 0.80974424, "num_input_tokens_seen": 171616660, "step": 7949, "time_per_iteration": 2.549020290374756 }, { "auxiliary_loss_clip": 0.01117263, "auxiliary_loss_mlp": 0.01023825, "balance_loss_clip": 1.04193544, "balance_loss_mlp": 1.01698875, "epoch": 0.9559309805807732, "flos": 20849951249280.0, "grad_norm": 2.79569559823281, "language_loss": 0.66580039, "learning_rate": 2.028663026656563e-08, "loss": 0.68721128, "num_input_tokens_seen": 171635515, "step": 7950, "time_per_iteration": 2.505826473236084 }, { "auxiliary_loss_clip": 0.01162852, "auxiliary_loss_mlp": 0.00762192, "balance_loss_clip": 1.04665995, "balance_loss_mlp": 1.00042915, "epoch": 0.9560512234714122, "flos": 21578219498880.0, "grad_norm": 1.9237452955086372, "language_loss": 0.71839231, "learning_rate": 2.0176112315758885e-08, "loss": 0.73764277, "num_input_tokens_seen": 171653305, "step": 7951, "time_per_iteration": 2.4165947437286377 }, { "auxiliary_loss_clip": 0.01116675, "auxiliary_loss_mlp": 0.01028879, "balance_loss_clip": 1.04335201, "balance_loss_mlp": 1.02158952, "epoch": 0.9561714663620513, "flos": 17450144029440.0, "grad_norm": 3.264957688021787, "language_loss": 0.69134521, "learning_rate": 2.0065894702661957e-08, "loss": 0.71280074, "num_input_tokens_seen": 171669980, "step": 7952, "time_per_iteration": 2.504631519317627 }, { "auxiliary_loss_clip": 0.01115792, "auxiliary_loss_mlp": 0.00762011, "balance_loss_clip": 1.04038048, "balance_loss_mlp": 1.00043249, "epoch": 0.9562917092526905, "flos": 26098510550400.0, "grad_norm": 1.9200275155092934, "language_loss": 0.7782023, "learning_rate": 1.9955977443994577e-08, "loss": 0.79698032, "num_input_tokens_seen": 171689970, "step": 7953, "time_per_iteration": 2.5528924465179443 }, { "auxiliary_loss_clip": 0.01139017, "auxiliary_loss_mlp": 0.0102832, "balance_loss_clip": 1.04606366, "balance_loss_mlp": 1.02061868, "epoch": 0.9564119521433295, "flos": 24096742531200.0, "grad_norm": 1.9844726399692598, "language_loss": 0.61818546, "learning_rate": 1.9846360556430965e-08, "loss": 0.63985884, "num_input_tokens_seen": 171708270, "step": 7954, "time_per_iteration": 2.486854076385498 }, { "auxiliary_loss_clip": 0.01161706, "auxiliary_loss_mlp": 0.01020885, "balance_loss_clip": 1.0447855, "balance_loss_mlp": 1.01412845, "epoch": 0.9565321950339686, "flos": 32008903896960.0, "grad_norm": 2.1461605526826943, "language_loss": 0.61596388, "learning_rate": 1.973704405660004e-08, "loss": 0.63778973, "num_input_tokens_seen": 171729385, "step": 7955, "time_per_iteration": 2.4807586669921875 }, { "auxiliary_loss_clip": 0.0109332, "auxiliary_loss_mlp": 0.01024786, "balance_loss_clip": 1.04128313, "balance_loss_mlp": 1.01813412, "epoch": 0.9566524379246077, "flos": 23588642695680.0, "grad_norm": 1.4773026871911497, "language_loss": 0.78230816, "learning_rate": 1.9628027961085203e-08, "loss": 0.80348927, "num_input_tokens_seen": 171752615, "step": 7956, "time_per_iteration": 2.6084582805633545 }, { "auxiliary_loss_clip": 0.01109845, "auxiliary_loss_mlp": 0.01021198, "balance_loss_clip": 1.03786778, "balance_loss_mlp": 1.01438785, "epoch": 0.9567726808152468, "flos": 38067716240640.0, "grad_norm": 1.7272869933097705, "language_loss": 0.83818829, "learning_rate": 1.9519312286423894e-08, "loss": 0.85949874, "num_input_tokens_seen": 171775810, "step": 7957, "time_per_iteration": 2.6442482471466064 }, { "auxiliary_loss_clip": 0.01147918, "auxiliary_loss_mlp": 0.01020035, "balance_loss_clip": 1.04619336, "balance_loss_mlp": 1.0128262, "epoch": 0.9568929237058859, "flos": 22744059229440.0, "grad_norm": 2.0668731747494506, "language_loss": 0.77823257, "learning_rate": 1.9410897049108255e-08, "loss": 0.79991215, "num_input_tokens_seen": 171795090, "step": 7958, "time_per_iteration": 2.4371297359466553 }, { "auxiliary_loss_clip": 0.01172366, "auxiliary_loss_mlp": 0.01024694, "balance_loss_clip": 1.05144978, "balance_loss_mlp": 1.01723456, "epoch": 0.957013166596525, "flos": 23841633162240.0, "grad_norm": 2.1492120647910298, "language_loss": 0.91172862, "learning_rate": 1.9302782265584905e-08, "loss": 0.93369925, "num_input_tokens_seen": 171815755, "step": 7959, "time_per_iteration": 3.2843074798583984 }, { "auxiliary_loss_clip": 0.01098423, "auxiliary_loss_mlp": 0.01026196, "balance_loss_clip": 1.04239273, "balance_loss_mlp": 1.01895714, "epoch": 0.9571334094871641, "flos": 17639286071040.0, "grad_norm": 2.1963974297954882, "language_loss": 0.87164724, "learning_rate": 1.9194967952254282e-08, "loss": 0.89289337, "num_input_tokens_seen": 171834330, "step": 7960, "time_per_iteration": 2.5030741691589355 }, { "auxiliary_loss_clip": 0.0115075, "auxiliary_loss_mlp": 0.0102257, "balance_loss_clip": 1.04653263, "balance_loss_mlp": 1.01517904, "epoch": 0.9572536523778031, "flos": 15369623441280.0, "grad_norm": 2.1855534926130233, "language_loss": 0.81008863, "learning_rate": 1.9087454125472635e-08, "loss": 0.83182186, "num_input_tokens_seen": 171848805, "step": 7961, "time_per_iteration": 2.4061107635498047 }, { "auxiliary_loss_clip": 0.01164972, "auxiliary_loss_mlp": 0.01021859, "balance_loss_clip": 1.04690921, "balance_loss_mlp": 1.01451898, "epoch": 0.9573738952684423, "flos": 24969838417920.0, "grad_norm": 2.024653760470153, "language_loss": 0.78619277, "learning_rate": 1.8980240801548696e-08, "loss": 0.80806106, "num_input_tokens_seen": 171867995, "step": 7962, "time_per_iteration": 3.1779263019561768 }, { "auxiliary_loss_clip": 0.01135594, "auxiliary_loss_mlp": 0.01022001, "balance_loss_clip": 1.04747093, "balance_loss_mlp": 1.01507747, "epoch": 0.9574941381590814, "flos": 25769461034880.0, "grad_norm": 1.6299165619662728, "language_loss": 0.74275267, "learning_rate": 1.8873327996747458e-08, "loss": 0.7643286, "num_input_tokens_seen": 171886495, "step": 7963, "time_per_iteration": 3.326136589050293 }, { "auxiliary_loss_clip": 0.01152431, "auxiliary_loss_mlp": 0.01024561, "balance_loss_clip": 1.0443151, "balance_loss_mlp": 1.01774478, "epoch": 0.9576143810497204, "flos": 32307178435200.0, "grad_norm": 1.9668565514878975, "language_loss": 0.65918481, "learning_rate": 1.8766715727287053e-08, "loss": 0.68095475, "num_input_tokens_seen": 171908200, "step": 7964, "time_per_iteration": 2.525831699371338 }, { "auxiliary_loss_clip": 0.01153872, "auxiliary_loss_mlp": 0.00761995, "balance_loss_clip": 1.04485583, "balance_loss_mlp": 1.00050509, "epoch": 0.9577346239403596, "flos": 27745733376000.0, "grad_norm": 1.5841434800486671, "language_loss": 0.79479587, "learning_rate": 1.8660404009340546e-08, "loss": 0.81395447, "num_input_tokens_seen": 171928650, "step": 7965, "time_per_iteration": 2.4738192558288574 }, { "auxiliary_loss_clip": 0.01052641, "auxiliary_loss_mlp": 0.01001325, "balance_loss_clip": 1.00757861, "balance_loss_mlp": 1.00046718, "epoch": 0.9578548668309986, "flos": 57468313710720.0, "grad_norm": 0.8629714234026357, "language_loss": 0.59536701, "learning_rate": 1.8554392859035485e-08, "loss": 0.61590672, "num_input_tokens_seen": 171986400, "step": 7966, "time_per_iteration": 3.0143535137176514 }, { "auxiliary_loss_clip": 0.01082606, "auxiliary_loss_mlp": 0.01023758, "balance_loss_clip": 1.03738642, "balance_loss_mlp": 1.01682019, "epoch": 0.9579751097216377, "flos": 19756040503680.0, "grad_norm": 1.7255162415298613, "language_loss": 0.78941423, "learning_rate": 1.8448682292453444e-08, "loss": 0.81047785, "num_input_tokens_seen": 172005475, "step": 7967, "time_per_iteration": 2.575777530670166 }, { "auxiliary_loss_clip": 0.011644, "auxiliary_loss_mlp": 0.00761294, "balance_loss_clip": 1.04726803, "balance_loss_mlp": 1.00043714, "epoch": 0.9580953526122769, "flos": 18041270152320.0, "grad_norm": 1.9599523951132716, "language_loss": 0.65948665, "learning_rate": 1.8343272325631154e-08, "loss": 0.6787436, "num_input_tokens_seen": 172024420, "step": 7968, "time_per_iteration": 2.3758771419525146 }, { "auxiliary_loss_clip": 0.01085485, "auxiliary_loss_mlp": 0.01028224, "balance_loss_clip": 1.03934121, "balance_loss_mlp": 1.02048755, "epoch": 0.9582155955029159, "flos": 24270154416000.0, "grad_norm": 2.2049259503429677, "language_loss": 0.77987635, "learning_rate": 1.8238162974558492e-08, "loss": 0.80101335, "num_input_tokens_seen": 172038350, "step": 7969, "time_per_iteration": 3.3561575412750244 }, { "auxiliary_loss_clip": 0.01134892, "auxiliary_loss_mlp": 0.0102909, "balance_loss_clip": 1.04508424, "balance_loss_mlp": 1.02177644, "epoch": 0.958335838393555, "flos": 22783309816320.0, "grad_norm": 2.2488678942268163, "language_loss": 0.74685216, "learning_rate": 1.8133354255181144e-08, "loss": 0.76849198, "num_input_tokens_seen": 172058665, "step": 7970, "time_per_iteration": 2.4849183559417725 }, { "auxiliary_loss_clip": 0.01143761, "auxiliary_loss_mlp": 0.01025063, "balance_loss_clip": 1.04262257, "balance_loss_mlp": 1.01827741, "epoch": 0.958456081284194, "flos": 16911484698240.0, "grad_norm": 2.037698247234785, "language_loss": 0.74285889, "learning_rate": 1.802884618339795e-08, "loss": 0.76454711, "num_input_tokens_seen": 172077470, "step": 7971, "time_per_iteration": 2.4473037719726562 }, { "auxiliary_loss_clip": 0.01152308, "auxiliary_loss_mlp": 0.01022579, "balance_loss_clip": 1.04821038, "balance_loss_mlp": 1.01518214, "epoch": 0.9585763241748332, "flos": 19974951941760.0, "grad_norm": 2.000170344588468, "language_loss": 0.81331193, "learning_rate": 1.7924638775062894e-08, "loss": 0.83506083, "num_input_tokens_seen": 172096590, "step": 7972, "time_per_iteration": 2.4401140213012695 }, { "auxiliary_loss_clip": 0.01118317, "auxiliary_loss_mlp": 0.01025462, "balance_loss_clip": 1.04457879, "balance_loss_mlp": 1.01850927, "epoch": 0.9586965670654722, "flos": 21395649646080.0, "grad_norm": 1.967503584092124, "language_loss": 0.81185436, "learning_rate": 1.7820732045984444e-08, "loss": 0.83329213, "num_input_tokens_seen": 172116735, "step": 7973, "time_per_iteration": 2.5631072521209717 }, { "auxiliary_loss_clip": 0.01148353, "auxiliary_loss_mlp": 0.01024575, "balance_loss_clip": 1.04366922, "balance_loss_mlp": 1.01722538, "epoch": 0.9588168099561113, "flos": 21435115714560.0, "grad_norm": 1.712006301599308, "language_loss": 0.74138802, "learning_rate": 1.7717126011924655e-08, "loss": 0.76311725, "num_input_tokens_seen": 172138320, "step": 7974, "time_per_iteration": 2.505289316177368 }, { "auxiliary_loss_clip": 0.01101998, "auxiliary_loss_mlp": 0.01024133, "balance_loss_clip": 1.0366025, "balance_loss_mlp": 1.01701307, "epoch": 0.9589370528467505, "flos": 11763761852160.0, "grad_norm": 2.674116235276764, "language_loss": 0.76672012, "learning_rate": 1.7613820688600957e-08, "loss": 0.78798145, "num_input_tokens_seen": 172154225, "step": 7975, "time_per_iteration": 2.5380561351776123 }, { "auxiliary_loss_clip": 0.01142381, "auxiliary_loss_mlp": 0.01024792, "balance_loss_clip": 1.0422163, "balance_loss_mlp": 1.01819932, "epoch": 0.9590572957373895, "flos": 23441516588160.0, "grad_norm": 1.8174654296468669, "language_loss": 0.7863487, "learning_rate": 1.7510816091684588e-08, "loss": 0.80802035, "num_input_tokens_seen": 172174150, "step": 7976, "time_per_iteration": 2.4980061054229736 }, { "auxiliary_loss_clip": 0.01138007, "auxiliary_loss_mlp": 0.01026886, "balance_loss_clip": 1.04491901, "balance_loss_mlp": 1.01980507, "epoch": 0.9591775386280286, "flos": 22528272274560.0, "grad_norm": 2.996606761601389, "language_loss": 0.78698194, "learning_rate": 1.740811223680083e-08, "loss": 0.80863088, "num_input_tokens_seen": 172191005, "step": 7977, "time_per_iteration": 2.467432737350464 }, { "auxiliary_loss_clip": 0.01164301, "auxiliary_loss_mlp": 0.0102541, "balance_loss_clip": 1.04647398, "balance_loss_mlp": 1.018013, "epoch": 0.9592977815186677, "flos": 18186959715840.0, "grad_norm": 2.3649816352065645, "language_loss": 0.7385416, "learning_rate": 1.7305709139530334e-08, "loss": 0.7604388, "num_input_tokens_seen": 172209785, "step": 7978, "time_per_iteration": 2.3876655101776123 }, { "auxiliary_loss_clip": 0.01143611, "auxiliary_loss_mlp": 0.0102323, "balance_loss_clip": 1.04281366, "balance_loss_mlp": 1.01604724, "epoch": 0.9594180244093068, "flos": 16537797555840.0, "grad_norm": 2.297310040866127, "language_loss": 0.74602473, "learning_rate": 1.7203606815407334e-08, "loss": 0.76769316, "num_input_tokens_seen": 172224380, "step": 7979, "time_per_iteration": 2.3791959285736084 }, { "auxiliary_loss_clip": 0.01143854, "auxiliary_loss_mlp": 0.0102526, "balance_loss_clip": 1.04835653, "balance_loss_mlp": 1.0182147, "epoch": 0.9595382672999458, "flos": 20554334317440.0, "grad_norm": 1.6140123118052505, "language_loss": 0.79366875, "learning_rate": 1.7101805279920557e-08, "loss": 0.81535983, "num_input_tokens_seen": 172242540, "step": 7980, "time_per_iteration": 2.459242105484009 }, { "auxiliary_loss_clip": 0.01164608, "auxiliary_loss_mlp": 0.01021678, "balance_loss_clip": 1.04711461, "balance_loss_mlp": 1.01416826, "epoch": 0.959658510190585, "flos": 22638266697600.0, "grad_norm": 2.1376761115108267, "language_loss": 0.80899394, "learning_rate": 1.7000304548513643e-08, "loss": 0.8308568, "num_input_tokens_seen": 172262645, "step": 7981, "time_per_iteration": 2.4031434059143066 }, { "auxiliary_loss_clip": 0.0111909, "auxiliary_loss_mlp": 0.01025055, "balance_loss_clip": 1.04051685, "balance_loss_mlp": 1.0180366, "epoch": 0.9597787530812241, "flos": 19135252725120.0, "grad_norm": 2.354070304963403, "language_loss": 0.82593018, "learning_rate": 1.6899104636583394e-08, "loss": 0.84737158, "num_input_tokens_seen": 172280695, "step": 7982, "time_per_iteration": 2.5189952850341797 }, { "auxiliary_loss_clip": 0.01052758, "auxiliary_loss_mlp": 0.01000838, "balance_loss_clip": 1.00722671, "balance_loss_mlp": 1.0000155, "epoch": 0.9598989959718631, "flos": 60098124055680.0, "grad_norm": 0.7247936510611176, "language_loss": 0.61941171, "learning_rate": 1.6798205559482638e-08, "loss": 0.63994765, "num_input_tokens_seen": 172343075, "step": 7983, "time_per_iteration": 3.1807339191436768 }, { "auxiliary_loss_clip": 0.01126898, "auxiliary_loss_mlp": 0.01027383, "balance_loss_clip": 1.04594326, "balance_loss_mlp": 1.0198071, "epoch": 0.9600192388625023, "flos": 20886795624960.0, "grad_norm": 2.5727226969359105, "language_loss": 0.76545107, "learning_rate": 1.669760733251713e-08, "loss": 0.78699386, "num_input_tokens_seen": 172361950, "step": 7984, "time_per_iteration": 2.5094847679138184 }, { "auxiliary_loss_clip": 0.01104054, "auxiliary_loss_mlp": 0.01023181, "balance_loss_clip": 1.04141057, "balance_loss_mlp": 1.0167048, "epoch": 0.9601394817531413, "flos": 20445740524800.0, "grad_norm": 1.6676778808878976, "language_loss": 0.8231622, "learning_rate": 1.659730997094755e-08, "loss": 0.84443456, "num_input_tokens_seen": 172380440, "step": 7985, "time_per_iteration": 2.5675392150878906 }, { "auxiliary_loss_clip": 0.01143451, "auxiliary_loss_mlp": 0.01023737, "balance_loss_clip": 1.04309511, "balance_loss_mlp": 1.01680207, "epoch": 0.9602597246437804, "flos": 21507152440320.0, "grad_norm": 2.189371102896242, "language_loss": 0.62065828, "learning_rate": 1.6497313489989283e-08, "loss": 0.64233017, "num_input_tokens_seen": 172400265, "step": 7986, "time_per_iteration": 3.309988021850586 }, { "auxiliary_loss_clip": 0.0110739, "auxiliary_loss_mlp": 0.01024528, "balance_loss_clip": 1.0347755, "balance_loss_mlp": 1.01708341, "epoch": 0.9603799675344196, "flos": 29935099152000.0, "grad_norm": 2.176662426371312, "language_loss": 0.70074391, "learning_rate": 1.639761790481131e-08, "loss": 0.72206306, "num_input_tokens_seen": 172421145, "step": 7987, "time_per_iteration": 2.610603094100952 }, { "auxiliary_loss_clip": 0.01153397, "auxiliary_loss_mlp": 0.01022966, "balance_loss_clip": 1.04622841, "balance_loss_mlp": 1.01599848, "epoch": 0.9605002104250586, "flos": 28001525103360.0, "grad_norm": 1.8062192392037126, "language_loss": 0.79249227, "learning_rate": 1.6298223230537754e-08, "loss": 0.81425589, "num_input_tokens_seen": 172438945, "step": 7988, "time_per_iteration": 2.484236717224121 }, { "auxiliary_loss_clip": 0.01133696, "auxiliary_loss_mlp": 0.00761984, "balance_loss_clip": 1.04295492, "balance_loss_mlp": 1.00039208, "epoch": 0.9606204533156977, "flos": 35590490870400.0, "grad_norm": 2.976650739555826, "language_loss": 0.69689834, "learning_rate": 1.619912948224611e-08, "loss": 0.71585512, "num_input_tokens_seen": 172460150, "step": 7989, "time_per_iteration": 4.188363552093506 }, { "auxiliary_loss_clip": 0.01118726, "auxiliary_loss_mlp": 0.01029061, "balance_loss_clip": 1.04359126, "balance_loss_mlp": 1.0212115, "epoch": 0.9607406962063368, "flos": 26574614346240.0, "grad_norm": 2.5082978190502794, "language_loss": 0.6079818, "learning_rate": 1.6100336674969682e-08, "loss": 0.62945962, "num_input_tokens_seen": 172478990, "step": 7990, "time_per_iteration": 2.54133939743042 }, { "auxiliary_loss_clip": 0.01112353, "auxiliary_loss_mlp": 0.01030608, "balance_loss_clip": 1.04011869, "balance_loss_mlp": 1.02298164, "epoch": 0.9608609390969759, "flos": 25331781813120.0, "grad_norm": 1.9375775869444478, "language_loss": 0.76695067, "learning_rate": 1.600184482369449e-08, "loss": 0.78838027, "num_input_tokens_seen": 172498905, "step": 7991, "time_per_iteration": 2.5657451152801514 }, { "auxiliary_loss_clip": 0.01126856, "auxiliary_loss_mlp": 0.01022371, "balance_loss_clip": 1.04248345, "balance_loss_mlp": 1.01446128, "epoch": 0.960981181987615, "flos": 21069114082560.0, "grad_norm": 2.6783739872364594, "language_loss": 0.8941471, "learning_rate": 1.5903653943362126e-08, "loss": 0.9156394, "num_input_tokens_seen": 172517900, "step": 7992, "time_per_iteration": 2.4946417808532715 }, { "auxiliary_loss_clip": 0.01137278, "auxiliary_loss_mlp": 0.01022231, "balance_loss_clip": 1.04416943, "balance_loss_mlp": 1.01561487, "epoch": 0.9611014248782541, "flos": 17823256554240.0, "grad_norm": 1.9655420976840847, "language_loss": 0.76993406, "learning_rate": 1.580576404886802e-08, "loss": 0.79152912, "num_input_tokens_seen": 172536430, "step": 7993, "time_per_iteration": 2.4922561645507812 }, { "auxiliary_loss_clip": 0.01149915, "auxiliary_loss_mlp": 0.01025593, "balance_loss_clip": 1.04450226, "balance_loss_mlp": 1.01886058, "epoch": 0.9612216677688932, "flos": 19354631040000.0, "grad_norm": 2.9965339583526482, "language_loss": 0.79640782, "learning_rate": 1.570817515506162e-08, "loss": 0.81816292, "num_input_tokens_seen": 172555120, "step": 7994, "time_per_iteration": 2.429800033569336 }, { "auxiliary_loss_clip": 0.01162984, "auxiliary_loss_mlp": 0.01023004, "balance_loss_clip": 1.04747438, "balance_loss_mlp": 1.01651645, "epoch": 0.9613419106595322, "flos": 15808739207040.0, "grad_norm": 2.1654653277555913, "language_loss": 0.81549096, "learning_rate": 1.561088727674753e-08, "loss": 0.83735085, "num_input_tokens_seen": 172569330, "step": 7995, "time_per_iteration": 3.1712207794189453 }, { "auxiliary_loss_clip": 0.01124153, "auxiliary_loss_mlp": 0.01031186, "balance_loss_clip": 1.04307365, "balance_loss_mlp": 1.02313077, "epoch": 0.9614621535501714, "flos": 25702488126720.0, "grad_norm": 4.480018090819754, "language_loss": 0.71564418, "learning_rate": 1.551390042868417e-08, "loss": 0.73719752, "num_input_tokens_seen": 172591100, "step": 7996, "time_per_iteration": 2.6296262741088867 }, { "auxiliary_loss_clip": 0.0115296, "auxiliary_loss_mlp": 0.01022448, "balance_loss_clip": 1.04740715, "balance_loss_mlp": 1.01515555, "epoch": 0.9615823964408104, "flos": 17819054663040.0, "grad_norm": 2.4070389326038573, "language_loss": 0.70827353, "learning_rate": 1.5417214625584207e-08, "loss": 0.73002762, "num_input_tokens_seen": 172608755, "step": 7997, "time_per_iteration": 2.4281885623931885 }, { "auxiliary_loss_clip": 0.01145111, "auxiliary_loss_mlp": 0.01021469, "balance_loss_clip": 1.04231751, "balance_loss_mlp": 1.01415002, "epoch": 0.9617026393314495, "flos": 20190020624640.0, "grad_norm": 1.7035342154509898, "language_loss": 0.8509208, "learning_rate": 1.5320829882114806e-08, "loss": 0.87258661, "num_input_tokens_seen": 172626830, "step": 7998, "time_per_iteration": 2.4530656337738037 }, { "auxiliary_loss_clip": 0.01162587, "auxiliary_loss_mlp": 0.01024893, "balance_loss_clip": 1.04414797, "balance_loss_mlp": 1.01784742, "epoch": 0.9618228822220887, "flos": 20267013427200.0, "grad_norm": 2.103792161629396, "language_loss": 0.7945298, "learning_rate": 1.5224746212897378e-08, "loss": 0.81640458, "num_input_tokens_seen": 172646125, "step": 7999, "time_per_iteration": 2.397569179534912 }, { "auxiliary_loss_clip": 0.01161861, "auxiliary_loss_mlp": 0.01022478, "balance_loss_clip": 1.04578233, "balance_loss_mlp": 1.01566219, "epoch": 0.9619431251127277, "flos": 21031300039680.0, "grad_norm": 1.7070868867199418, "language_loss": 0.77420163, "learning_rate": 1.512896363250804e-08, "loss": 0.79604501, "num_input_tokens_seen": 172666235, "step": 8000, "time_per_iteration": 2.4012913703918457 }, { "auxiliary_loss_clip": 0.01151927, "auxiliary_loss_mlp": 0.01024658, "balance_loss_clip": 1.04453051, "balance_loss_mlp": 1.01761556, "epoch": 0.9620633680033668, "flos": 22382654538240.0, "grad_norm": 2.0752732430047525, "language_loss": 0.75432396, "learning_rate": 1.503348215547673e-08, "loss": 0.77608979, "num_input_tokens_seen": 172687325, "step": 8001, "time_per_iteration": 2.463216781616211 }, { "auxiliary_loss_clip": 0.01133512, "auxiliary_loss_mlp": 0.01024969, "balance_loss_clip": 1.04337692, "balance_loss_mlp": 1.01809025, "epoch": 0.962183610894006, "flos": 18471730740480.0, "grad_norm": 1.7598745730702738, "language_loss": 0.80655944, "learning_rate": 1.4938301796288078e-08, "loss": 0.82814425, "num_input_tokens_seen": 172703895, "step": 8002, "time_per_iteration": 2.439070701599121 }, { "auxiliary_loss_clip": 0.01163782, "auxiliary_loss_mlp": 0.01022461, "balance_loss_clip": 1.04655981, "balance_loss_mlp": 1.01488554, "epoch": 0.962303853784645, "flos": 18435245500800.0, "grad_norm": 3.8070569528311458, "language_loss": 0.81698322, "learning_rate": 1.4843422569380537e-08, "loss": 0.83884573, "num_input_tokens_seen": 172720650, "step": 8003, "time_per_iteration": 2.3881473541259766 }, { "auxiliary_loss_clip": 0.01105355, "auxiliary_loss_mlp": 0.01022258, "balance_loss_clip": 1.03931236, "balance_loss_mlp": 1.01543045, "epoch": 0.9624240966752841, "flos": 26391074826240.0, "grad_norm": 2.2307647353505273, "language_loss": 0.82862902, "learning_rate": 1.4748844489147483e-08, "loss": 0.84990513, "num_input_tokens_seen": 172737640, "step": 8004, "time_per_iteration": 2.5483381748199463 }, { "auxiliary_loss_clip": 0.01135586, "auxiliary_loss_mlp": 0.0102131, "balance_loss_clip": 1.04193461, "balance_loss_mlp": 1.01512051, "epoch": 0.9625443395659231, "flos": 14647675985280.0, "grad_norm": 1.819601697079657, "language_loss": 0.71162355, "learning_rate": 1.4654567569936326e-08, "loss": 0.7331925, "num_input_tokens_seen": 172755215, "step": 8005, "time_per_iteration": 2.44250750541687 }, { "auxiliary_loss_clip": 0.01103186, "auxiliary_loss_mlp": 0.01028933, "balance_loss_clip": 1.03991902, "balance_loss_mlp": 1.02194762, "epoch": 0.9626645824565623, "flos": 18367626147840.0, "grad_norm": 1.866163557316775, "language_loss": 0.83024216, "learning_rate": 1.456059182604874e-08, "loss": 0.85156333, "num_input_tokens_seen": 172774020, "step": 8006, "time_per_iteration": 2.5249311923980713 }, { "auxiliary_loss_clip": 0.01165618, "auxiliary_loss_mlp": 0.01027199, "balance_loss_clip": 1.04809225, "balance_loss_mlp": 1.01969528, "epoch": 0.9627848253472013, "flos": 16580424021120.0, "grad_norm": 2.0115270960916143, "language_loss": 0.76493579, "learning_rate": 1.4466917271740653e-08, "loss": 0.78686398, "num_input_tokens_seen": 172792220, "step": 8007, "time_per_iteration": 2.3745837211608887 }, { "auxiliary_loss_clip": 0.01132993, "auxiliary_loss_mlp": 0.01025764, "balance_loss_clip": 1.0425992, "balance_loss_mlp": 1.01844716, "epoch": 0.9629050682378404, "flos": 20886867452160.0, "grad_norm": 1.8869264242336798, "language_loss": 0.68009061, "learning_rate": 1.4373543921222697e-08, "loss": 0.70167816, "num_input_tokens_seen": 172811805, "step": 8008, "time_per_iteration": 2.478329658508301 }, { "auxiliary_loss_clip": 0.01134216, "auxiliary_loss_mlp": 0.01025379, "balance_loss_clip": 1.04455376, "balance_loss_mlp": 1.01825011, "epoch": 0.9630253111284796, "flos": 17019252478080.0, "grad_norm": 1.819200845425801, "language_loss": 0.78233594, "learning_rate": 1.428047178865932e-08, "loss": 0.80393189, "num_input_tokens_seen": 172828595, "step": 8009, "time_per_iteration": 2.444878578186035 }, { "auxiliary_loss_clip": 0.01134872, "auxiliary_loss_mlp": 0.01025375, "balance_loss_clip": 1.04145074, "balance_loss_mlp": 1.01797211, "epoch": 0.9631455540191186, "flos": 20338942412160.0, "grad_norm": 1.6557753227576513, "language_loss": 0.74411094, "learning_rate": 1.4187700888169451e-08, "loss": 0.76571333, "num_input_tokens_seen": 172847770, "step": 8010, "time_per_iteration": 2.4842169284820557 }, { "auxiliary_loss_clip": 0.01050607, "auxiliary_loss_mlp": 0.01000932, "balance_loss_clip": 1.00816846, "balance_loss_mlp": 1.00010931, "epoch": 0.9632657969097577, "flos": 65956700033280.0, "grad_norm": 0.7521184666515475, "language_loss": 0.57041264, "learning_rate": 1.40952312338265e-08, "loss": 0.59092808, "num_input_tokens_seen": 172912415, "step": 8011, "time_per_iteration": 3.1016392707824707 }, { "auxiliary_loss_clip": 0.01123823, "auxiliary_loss_mlp": 0.01024639, "balance_loss_clip": 1.04143739, "balance_loss_mlp": 1.01772463, "epoch": 0.9633860398003968, "flos": 44419523823360.0, "grad_norm": 1.8663555050622491, "language_loss": 0.68743664, "learning_rate": 1.4003062839657909e-08, "loss": 0.70892125, "num_input_tokens_seen": 172934895, "step": 8012, "time_per_iteration": 3.6033921241760254 }, { "auxiliary_loss_clip": 0.01124424, "auxiliary_loss_mlp": 0.0101895, "balance_loss_clip": 1.04278994, "balance_loss_mlp": 1.01215243, "epoch": 0.9635062826910359, "flos": 24827704300800.0, "grad_norm": 1.6342624660680392, "language_loss": 0.79802448, "learning_rate": 1.391119571964583e-08, "loss": 0.81945819, "num_input_tokens_seen": 172955835, "step": 8013, "time_per_iteration": 2.540250778198242 }, { "auxiliary_loss_clip": 0.01150097, "auxiliary_loss_mlp": 0.01026722, "balance_loss_clip": 1.04688573, "balance_loss_mlp": 1.01936686, "epoch": 0.9636265255816749, "flos": 15961360095360.0, "grad_norm": 1.7553171824376277, "language_loss": 0.72552949, "learning_rate": 1.3819629887726225e-08, "loss": 0.74729764, "num_input_tokens_seen": 172973925, "step": 8014, "time_per_iteration": 2.42315936088562 }, { "auxiliary_loss_clip": 0.01142499, "auxiliary_loss_mlp": 0.01023382, "balance_loss_clip": 1.04757643, "balance_loss_mlp": 1.01628637, "epoch": 0.9637467684723141, "flos": 22601781457920.0, "grad_norm": 2.148582884711216, "language_loss": 0.76414245, "learning_rate": 1.3728365357789317e-08, "loss": 0.78580129, "num_input_tokens_seen": 172993290, "step": 8015, "time_per_iteration": 3.309800386428833 }, { "auxiliary_loss_clip": 0.01086686, "auxiliary_loss_mlp": 0.01021758, "balance_loss_clip": 1.0387907, "balance_loss_mlp": 1.01396489, "epoch": 0.9638670113629532, "flos": 17565812801280.0, "grad_norm": 2.9348131178571517, "language_loss": 0.76806539, "learning_rate": 1.3637402143680254e-08, "loss": 0.78914988, "num_input_tokens_seen": 173008190, "step": 8016, "time_per_iteration": 3.4087624549865723 }, { "auxiliary_loss_clip": 0.01027878, "auxiliary_loss_mlp": 0.01005326, "balance_loss_clip": 1.01040268, "balance_loss_mlp": 1.00456893, "epoch": 0.9639872542535922, "flos": 55072139379840.0, "grad_norm": 0.7240401755084293, "language_loss": 0.55069923, "learning_rate": 1.3546740259197998e-08, "loss": 0.57103133, "num_input_tokens_seen": 173061000, "step": 8017, "time_per_iteration": 3.0416760444641113 }, { "auxiliary_loss_clip": 0.01136952, "auxiliary_loss_mlp": 0.01027191, "balance_loss_clip": 1.04455602, "balance_loss_mlp": 1.01946592, "epoch": 0.9641074971442314, "flos": 24134484746880.0, "grad_norm": 2.3182985443941777, "language_loss": 0.70437837, "learning_rate": 1.3456379718095989e-08, "loss": 0.7260198, "num_input_tokens_seen": 173081415, "step": 8018, "time_per_iteration": 2.493027448654175 }, { "auxiliary_loss_clip": 0.01038254, "auxiliary_loss_mlp": 0.010025, "balance_loss_clip": 1.0074898, "balance_loss_mlp": 1.00148702, "epoch": 0.9642277400348704, "flos": 66747416077440.0, "grad_norm": 0.8401229538323925, "language_loss": 0.6205194, "learning_rate": 1.3366320534081487e-08, "loss": 0.64092696, "num_input_tokens_seen": 173144095, "step": 8019, "time_per_iteration": 3.0780186653137207 }, { "auxiliary_loss_clip": 0.01150467, "auxiliary_loss_mlp": 0.01022583, "balance_loss_clip": 1.04524851, "balance_loss_mlp": 1.01557374, "epoch": 0.9643479829255095, "flos": 30920272450560.0, "grad_norm": 2.3020841670504706, "language_loss": 0.762299, "learning_rate": 1.3276562720816675e-08, "loss": 0.78402954, "num_input_tokens_seen": 173165605, "step": 8020, "time_per_iteration": 2.5196034908294678 }, { "auxiliary_loss_clip": 0.01164528, "auxiliary_loss_mlp": 0.01024522, "balance_loss_clip": 1.04593754, "balance_loss_mlp": 1.0171932, "epoch": 0.9644682258161487, "flos": 20048245643520.0, "grad_norm": 2.2740153508614322, "language_loss": 0.82784462, "learning_rate": 1.3187106291917549e-08, "loss": 0.84973514, "num_input_tokens_seen": 173182595, "step": 8021, "time_per_iteration": 2.4315030574798584 }, { "auxiliary_loss_clip": 0.0114628, "auxiliary_loss_mlp": 0.01020621, "balance_loss_clip": 1.04464972, "balance_loss_mlp": 1.01437151, "epoch": 0.9645884687067877, "flos": 21178713456000.0, "grad_norm": 1.6772075495342211, "language_loss": 0.70642328, "learning_rate": 1.309795126095503e-08, "loss": 0.72809225, "num_input_tokens_seen": 173200895, "step": 8022, "time_per_iteration": 3.219061851501465 }, { "auxiliary_loss_clip": 0.01077201, "auxiliary_loss_mlp": 0.01023354, "balance_loss_clip": 1.0384481, "balance_loss_mlp": 1.01615071, "epoch": 0.9647087115974268, "flos": 18945967029120.0, "grad_norm": 2.0479811731810837, "language_loss": 0.80459267, "learning_rate": 1.3009097641453192e-08, "loss": 0.82559824, "num_input_tokens_seen": 173218745, "step": 8023, "time_per_iteration": 2.6150705814361572 }, { "auxiliary_loss_clip": 0.01138224, "auxiliary_loss_mlp": 0.01020576, "balance_loss_clip": 1.0463717, "balance_loss_mlp": 1.01352477, "epoch": 0.9648289544880659, "flos": 16545088016640.0, "grad_norm": 1.6444242924151706, "language_loss": 0.75677907, "learning_rate": 1.2920545446891474e-08, "loss": 0.77836704, "num_input_tokens_seen": 173235465, "step": 8024, "time_per_iteration": 2.4677605628967285 }, { "auxiliary_loss_clip": 0.01138745, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.04642177, "balance_loss_mlp": 1.02527535, "epoch": 0.964949197378705, "flos": 24057527857920.0, "grad_norm": 1.731194863134548, "language_loss": 0.70507318, "learning_rate": 1.2832294690703127e-08, "loss": 0.72678733, "num_input_tokens_seen": 173254440, "step": 8025, "time_per_iteration": 2.5037741661071777 }, { "auxiliary_loss_clip": 0.01151583, "auxiliary_loss_mlp": 0.01024383, "balance_loss_clip": 1.04662836, "balance_loss_mlp": 1.01701283, "epoch": 0.965069440269344, "flos": 23365565280000.0, "grad_norm": 2.622304409622533, "language_loss": 0.77384508, "learning_rate": 1.2744345386275668e-08, "loss": 0.79560471, "num_input_tokens_seen": 173273980, "step": 8026, "time_per_iteration": 2.452924966812134 }, { "auxiliary_loss_clip": 0.01146549, "auxiliary_loss_mlp": 0.01023452, "balance_loss_clip": 1.04961061, "balance_loss_mlp": 1.01627266, "epoch": 0.9651896831599832, "flos": 25374875155200.0, "grad_norm": 2.094194159590921, "language_loss": 0.78486365, "learning_rate": 1.265669754695109e-08, "loss": 0.80656362, "num_input_tokens_seen": 173293550, "step": 8027, "time_per_iteration": 2.525113344192505 }, { "auxiliary_loss_clip": 0.01097247, "auxiliary_loss_mlp": 0.01026168, "balance_loss_clip": 1.0392282, "balance_loss_mlp": 1.01872373, "epoch": 0.9653099260506223, "flos": 22272875596800.0, "grad_norm": 1.9654110756514116, "language_loss": 0.8197763, "learning_rate": 1.2569351186025201e-08, "loss": 0.84101051, "num_input_tokens_seen": 173312005, "step": 8028, "time_per_iteration": 2.5802760124206543 }, { "auxiliary_loss_clip": 0.01109185, "auxiliary_loss_mlp": 0.01022687, "balance_loss_clip": 1.03895581, "balance_loss_mlp": 1.01583862, "epoch": 0.9654301689412613, "flos": 26760847386240.0, "grad_norm": 1.4557781139830313, "language_loss": 0.75236416, "learning_rate": 1.2482306316748737e-08, "loss": 0.77368289, "num_input_tokens_seen": 173332450, "step": 8029, "time_per_iteration": 2.558187961578369 }, { "auxiliary_loss_clip": 0.01155324, "auxiliary_loss_mlp": 0.01021752, "balance_loss_clip": 1.04522073, "balance_loss_mlp": 1.01528478, "epoch": 0.9655504118319005, "flos": 17412689122560.0, "grad_norm": 2.8016751496578234, "language_loss": 0.78271544, "learning_rate": 1.2395562952326021e-08, "loss": 0.80448627, "num_input_tokens_seen": 173349610, "step": 8030, "time_per_iteration": 2.42197585105896 }, { "auxiliary_loss_clip": 0.01146571, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.04655778, "balance_loss_mlp": 1.02334261, "epoch": 0.9656706547225395, "flos": 22126970551680.0, "grad_norm": 2.689096297879985, "language_loss": 0.81186926, "learning_rate": 1.2309121105916309e-08, "loss": 0.83365124, "num_input_tokens_seen": 173367900, "step": 8031, "time_per_iteration": 2.510634422302246 }, { "auxiliary_loss_clip": 0.01153724, "auxiliary_loss_mlp": 0.01022617, "balance_loss_clip": 1.0461762, "balance_loss_mlp": 1.01580405, "epoch": 0.9657908976131786, "flos": 37049289926400.0, "grad_norm": 1.942218903475128, "language_loss": 0.69271475, "learning_rate": 1.222298079063222e-08, "loss": 0.71447814, "num_input_tokens_seen": 173389040, "step": 8032, "time_per_iteration": 2.573167085647583 }, { "auxiliary_loss_clip": 0.01148338, "auxiliary_loss_mlp": 0.0102345, "balance_loss_clip": 1.04495907, "balance_loss_mlp": 1.01681042, "epoch": 0.9659111405038178, "flos": 24389809597440.0, "grad_norm": 2.367894053191486, "language_loss": 0.7275871, "learning_rate": 1.2137142019541524e-08, "loss": 0.74930501, "num_input_tokens_seen": 173407595, "step": 8033, "time_per_iteration": 2.483229637145996 }, { "auxiliary_loss_clip": 0.0114312, "auxiliary_loss_mlp": 0.01025484, "balance_loss_clip": 1.04368556, "balance_loss_mlp": 1.01894808, "epoch": 0.9660313833944568, "flos": 25009412227200.0, "grad_norm": 2.0519206501182703, "language_loss": 0.73214149, "learning_rate": 1.2051604805666027e-08, "loss": 0.75382757, "num_input_tokens_seen": 173424720, "step": 8034, "time_per_iteration": 2.508850574493408 }, { "auxiliary_loss_clip": 0.01164758, "auxiliary_loss_mlp": 0.00761615, "balance_loss_clip": 1.04720926, "balance_loss_mlp": 1.0004679, "epoch": 0.9661516262850959, "flos": 11801575895040.0, "grad_norm": 2.2018230336157836, "language_loss": 0.78638655, "learning_rate": 1.196636916198135e-08, "loss": 0.80565029, "num_input_tokens_seen": 173442260, "step": 8035, "time_per_iteration": 2.434105634689331 }, { "auxiliary_loss_clip": 0.01166037, "auxiliary_loss_mlp": 0.01021356, "balance_loss_clip": 1.04735994, "balance_loss_mlp": 1.01456451, "epoch": 0.9662718691757349, "flos": 20047778766720.0, "grad_norm": 2.3415810863217192, "language_loss": 0.76836479, "learning_rate": 1.1881435101418036e-08, "loss": 0.79023874, "num_input_tokens_seen": 173461675, "step": 8036, "time_per_iteration": 2.4042298793792725 }, { "auxiliary_loss_clip": 0.01040999, "auxiliary_loss_mlp": 0.01000853, "balance_loss_clip": 1.0084275, "balance_loss_mlp": 1.00004792, "epoch": 0.9663921120663741, "flos": 68027703517440.0, "grad_norm": 0.7314223204400929, "language_loss": 0.65539575, "learning_rate": 1.1796802636860003e-08, "loss": 0.67581427, "num_input_tokens_seen": 173530205, "step": 8037, "time_per_iteration": 3.1086549758911133 }, { "auxiliary_loss_clip": 0.01164308, "auxiliary_loss_mlp": 0.01025713, "balance_loss_clip": 1.04581904, "balance_loss_mlp": 1.01864982, "epoch": 0.9665123549570132, "flos": 26322916769280.0, "grad_norm": 2.6093817005600024, "language_loss": 0.73843199, "learning_rate": 1.1712471781146316e-08, "loss": 0.76033217, "num_input_tokens_seen": 173549540, "step": 8038, "time_per_iteration": 2.4813179969787598 }, { "auxiliary_loss_clip": 0.01162232, "auxiliary_loss_mlp": 0.01024857, "balance_loss_clip": 1.04458714, "balance_loss_mlp": 1.01766288, "epoch": 0.9666325978476522, "flos": 43941121557120.0, "grad_norm": 2.4006004273797945, "language_loss": 0.66401631, "learning_rate": 1.1628442547069628e-08, "loss": 0.68588722, "num_input_tokens_seen": 173571740, "step": 8039, "time_per_iteration": 3.4404666423797607 }, { "auxiliary_loss_clip": 0.01154698, "auxiliary_loss_mlp": 0.00762184, "balance_loss_clip": 1.04512525, "balance_loss_mlp": 1.00049806, "epoch": 0.9667528407382914, "flos": 21543422198400.0, "grad_norm": 1.869458742041927, "language_loss": 0.77185112, "learning_rate": 1.1544714947377521e-08, "loss": 0.79101992, "num_input_tokens_seen": 173589425, "step": 8040, "time_per_iteration": 2.4575209617614746 }, { "auxiliary_loss_clip": 0.011663, "auxiliary_loss_mlp": 0.0102747, "balance_loss_clip": 1.04730177, "balance_loss_mlp": 1.01940536, "epoch": 0.9668730836289304, "flos": 23878585278720.0, "grad_norm": 2.0660478233787587, "language_loss": 0.70098913, "learning_rate": 1.1461288994770945e-08, "loss": 0.72292686, "num_input_tokens_seen": 173608500, "step": 8041, "time_per_iteration": 2.4305317401885986 }, { "auxiliary_loss_clip": 0.01165788, "auxiliary_loss_mlp": 0.0102665, "balance_loss_clip": 1.04506731, "balance_loss_mlp": 1.01915205, "epoch": 0.9669933265195695, "flos": 28293011971200.0, "grad_norm": 1.8540090468184476, "language_loss": 0.77420712, "learning_rate": 1.1378164701906002e-08, "loss": 0.79613149, "num_input_tokens_seen": 173630265, "step": 8042, "time_per_iteration": 3.2934224605560303 }, { "auxiliary_loss_clip": 0.0116739, "auxiliary_loss_mlp": 0.0102494, "balance_loss_clip": 1.04746318, "balance_loss_mlp": 1.01774251, "epoch": 0.9671135694102087, "flos": 22454763091200.0, "grad_norm": 1.7582208785185078, "language_loss": 0.67034984, "learning_rate": 1.1295342081392156e-08, "loss": 0.69227314, "num_input_tokens_seen": 173649625, "step": 8043, "time_per_iteration": 3.2975029945373535 }, { "auxiliary_loss_clip": 0.01138482, "auxiliary_loss_mlp": 0.01021146, "balance_loss_clip": 1.04324889, "balance_loss_mlp": 1.01383877, "epoch": 0.9672338123008477, "flos": 20155941596160.0, "grad_norm": 1.6501541485930686, "language_loss": 0.69209087, "learning_rate": 1.1212821145793804e-08, "loss": 0.71368718, "num_input_tokens_seen": 173669240, "step": 8044, "time_per_iteration": 2.472506284713745 }, { "auxiliary_loss_clip": 0.01137483, "auxiliary_loss_mlp": 0.01025928, "balance_loss_clip": 1.04343355, "balance_loss_mlp": 1.01865637, "epoch": 0.9673540551914868, "flos": 16977487939200.0, "grad_norm": 1.965621934423793, "language_loss": 0.78626651, "learning_rate": 1.1130601907629156e-08, "loss": 0.80790061, "num_input_tokens_seen": 173686970, "step": 8045, "time_per_iteration": 2.465703010559082 }, { "auxiliary_loss_clip": 0.01052286, "auxiliary_loss_mlp": 0.01001207, "balance_loss_clip": 1.00705481, "balance_loss_mlp": 1.00024748, "epoch": 0.9674742980821259, "flos": 61892903952000.0, "grad_norm": 0.8310061385023739, "language_loss": 0.64818013, "learning_rate": 1.1048684379370899e-08, "loss": 0.668715, "num_input_tokens_seen": 173747655, "step": 8046, "time_per_iteration": 3.0365893840789795 }, { "auxiliary_loss_clip": 0.01127874, "auxiliary_loss_mlp": 0.01023229, "balance_loss_clip": 1.04432428, "balance_loss_mlp": 1.0167743, "epoch": 0.967594540972765, "flos": 18697824898560.0, "grad_norm": 1.975165150900015, "language_loss": 0.74594229, "learning_rate": 1.0967068573445759e-08, "loss": 0.76745331, "num_input_tokens_seen": 173765140, "step": 8047, "time_per_iteration": 2.4526827335357666 }, { "auxiliary_loss_clip": 0.01131824, "auxiliary_loss_mlp": 0.01024695, "balance_loss_clip": 1.04140198, "balance_loss_mlp": 1.01767612, "epoch": 0.967714783863404, "flos": 20777411733120.0, "grad_norm": 2.2523430456295594, "language_loss": 0.65201139, "learning_rate": 1.0885754502234945e-08, "loss": 0.67357659, "num_input_tokens_seen": 173784800, "step": 8048, "time_per_iteration": 2.486752510070801 }, { "auxiliary_loss_clip": 0.01120488, "auxiliary_loss_mlp": 0.01022743, "balance_loss_clip": 1.04309964, "balance_loss_mlp": 1.01585615, "epoch": 0.9678350267540432, "flos": 23185473465600.0, "grad_norm": 2.930101495036063, "language_loss": 0.77860063, "learning_rate": 1.08047421780737e-08, "loss": 0.80003291, "num_input_tokens_seen": 173803990, "step": 8049, "time_per_iteration": 3.294926166534424 }, { "auxiliary_loss_clip": 0.01143411, "auxiliary_loss_mlp": 0.00761413, "balance_loss_clip": 1.04477239, "balance_loss_mlp": 1.00044656, "epoch": 0.9679552696446823, "flos": 21726063878400.0, "grad_norm": 2.39012814248842, "language_loss": 0.73707503, "learning_rate": 1.0724031613251305e-08, "loss": 0.75612324, "num_input_tokens_seen": 173821890, "step": 8050, "time_per_iteration": 2.4826231002807617 }, { "auxiliary_loss_clip": 0.01158448, "auxiliary_loss_mlp": 0.010281, "balance_loss_clip": 1.04752302, "balance_loss_mlp": 1.02058959, "epoch": 0.9680755125353213, "flos": 26869046129280.0, "grad_norm": 2.1964798174937377, "language_loss": 0.66212511, "learning_rate": 1.0643622820011744e-08, "loss": 0.6839906, "num_input_tokens_seen": 173842945, "step": 8051, "time_per_iteration": 2.506913423538208 }, { "auxiliary_loss_clip": 0.01167649, "auxiliary_loss_mlp": 0.01027458, "balance_loss_clip": 1.04657817, "balance_loss_mlp": 1.01967049, "epoch": 0.9681957554259605, "flos": 28325008010880.0, "grad_norm": 3.250580618000372, "language_loss": 0.67942607, "learning_rate": 1.0563515810552814e-08, "loss": 0.70137715, "num_input_tokens_seen": 173859915, "step": 8052, "time_per_iteration": 2.4511044025421143 }, { "auxiliary_loss_clip": 0.01167943, "auxiliary_loss_mlp": 0.01024931, "balance_loss_clip": 1.05024862, "balance_loss_mlp": 1.01860392, "epoch": 0.9683159983165995, "flos": 20557674282240.0, "grad_norm": 1.545115350126205, "language_loss": 0.73504066, "learning_rate": 1.0483710597026795e-08, "loss": 0.75696933, "num_input_tokens_seen": 173879775, "step": 8053, "time_per_iteration": 2.428269386291504 }, { "auxiliary_loss_clip": 0.01121768, "auxiliary_loss_mlp": 0.01027398, "balance_loss_clip": 1.0420028, "balance_loss_mlp": 1.02030206, "epoch": 0.9684362412072386, "flos": 24207958016640.0, "grad_norm": 1.9586683093633246, "language_loss": 0.74320185, "learning_rate": 1.0404207191540227e-08, "loss": 0.76469362, "num_input_tokens_seen": 173900230, "step": 8054, "time_per_iteration": 2.552485704421997 }, { "auxiliary_loss_clip": 0.01163779, "auxiliary_loss_mlp": 0.01023646, "balance_loss_clip": 1.04628396, "balance_loss_mlp": 1.016675, "epoch": 0.9685564840978778, "flos": 22346241125760.0, "grad_norm": 1.9054769218514387, "language_loss": 0.74835038, "learning_rate": 1.0325005606153236e-08, "loss": 0.77022457, "num_input_tokens_seen": 173919690, "step": 8055, "time_per_iteration": 2.4212632179260254 }, { "auxiliary_loss_clip": 0.01112163, "auxiliary_loss_mlp": 0.01024409, "balance_loss_clip": 1.04104924, "balance_loss_mlp": 1.01762271, "epoch": 0.9686767269885168, "flos": 14386389477120.0, "grad_norm": 2.4766435932876942, "language_loss": 0.78819972, "learning_rate": 1.0246105852881104e-08, "loss": 0.80956542, "num_input_tokens_seen": 173934790, "step": 8056, "time_per_iteration": 2.5299460887908936 }, { "auxiliary_loss_clip": 0.01165878, "auxiliary_loss_mlp": 0.0102007, "balance_loss_clip": 1.0464952, "balance_loss_mlp": 1.01280379, "epoch": 0.9687969698791559, "flos": 21287630471040.0, "grad_norm": 1.9109274534879837, "language_loss": 0.78610182, "learning_rate": 1.0167507943692476e-08, "loss": 0.80796129, "num_input_tokens_seen": 173953875, "step": 8057, "time_per_iteration": 2.458423137664795 }, { "auxiliary_loss_clip": 0.01148511, "auxiliary_loss_mlp": 0.01029773, "balance_loss_clip": 1.04701614, "balance_loss_mlp": 1.02248335, "epoch": 0.968917212769795, "flos": 19828328624640.0, "grad_norm": 2.2689689022155544, "language_loss": 0.71478963, "learning_rate": 1.008921189051093e-08, "loss": 0.73657244, "num_input_tokens_seen": 173971220, "step": 8058, "time_per_iteration": 2.424872875213623 }, { "auxiliary_loss_clip": 0.01166278, "auxiliary_loss_mlp": 0.01023702, "balance_loss_clip": 1.04768538, "balance_loss_mlp": 1.01653421, "epoch": 0.9690374556604341, "flos": 21681749473920.0, "grad_norm": 1.9392450467574212, "language_loss": 0.77505368, "learning_rate": 1.0011217705213848e-08, "loss": 0.79695344, "num_input_tokens_seen": 173989095, "step": 8059, "time_per_iteration": 2.43770432472229 }, { "auxiliary_loss_clip": 0.01148331, "auxiliary_loss_mlp": 0.01022937, "balance_loss_clip": 1.04623878, "balance_loss_mlp": 1.01673484, "epoch": 0.9691576985510731, "flos": 32635437851520.0, "grad_norm": 2.123343916845261, "language_loss": 0.74618512, "learning_rate": 9.933525399632658e-09, "loss": 0.76789784, "num_input_tokens_seen": 174007330, "step": 8060, "time_per_iteration": 2.5370001792907715 }, { "auxiliary_loss_clip": 0.01135209, "auxiliary_loss_mlp": 0.01024343, "balance_loss_clip": 1.0440166, "balance_loss_mlp": 1.01672876, "epoch": 0.9692779414417123, "flos": 35663174040960.0, "grad_norm": 1.7216145954454074, "language_loss": 0.65012652, "learning_rate": 9.856134985553488e-09, "loss": 0.67172205, "num_input_tokens_seen": 174027055, "step": 8061, "time_per_iteration": 2.6319658756256104 }, { "auxiliary_loss_clip": 0.01165094, "auxiliary_loss_mlp": 0.01023794, "balance_loss_clip": 1.04694629, "balance_loss_mlp": 1.01640892, "epoch": 0.9693981843323514, "flos": 28366952117760.0, "grad_norm": 1.8314184513888254, "language_loss": 0.73636782, "learning_rate": 9.77904647471628e-09, "loss": 0.75825667, "num_input_tokens_seen": 174050235, "step": 8062, "time_per_iteration": 2.462043523788452 }, { "auxiliary_loss_clip": 0.01100896, "auxiliary_loss_mlp": 0.01023935, "balance_loss_clip": 1.03848267, "balance_loss_mlp": 1.01683605, "epoch": 0.9695184272229904, "flos": 23622865378560.0, "grad_norm": 1.627487402718025, "language_loss": 0.73876345, "learning_rate": 9.702259878815454e-09, "loss": 0.76001179, "num_input_tokens_seen": 174070560, "step": 8063, "time_per_iteration": 2.5861976146698 }, { "auxiliary_loss_clip": 0.01154809, "auxiliary_loss_mlp": 0.01025747, "balance_loss_clip": 1.04831481, "balance_loss_mlp": 1.01801002, "epoch": 0.9696386701136296, "flos": 23294677789440.0, "grad_norm": 2.2755134982597247, "language_loss": 0.74469686, "learning_rate": 9.625775209499254e-09, "loss": 0.76650244, "num_input_tokens_seen": 174090565, "step": 8064, "time_per_iteration": 2.446993827819824 }, { "auxiliary_loss_clip": 0.01117169, "auxiliary_loss_mlp": 0.01026692, "balance_loss_clip": 1.04014075, "balance_loss_mlp": 1.01973653, "epoch": 0.9697589130042686, "flos": 15121876360320.0, "grad_norm": 2.1073407652064695, "language_loss": 0.74444348, "learning_rate": 9.549592478370172e-09, "loss": 0.76588207, "num_input_tokens_seen": 174108745, "step": 8065, "time_per_iteration": 2.489091396331787 }, { "auxiliary_loss_clip": 0.01151399, "auxiliary_loss_mlp": 0.01022232, "balance_loss_clip": 1.04410863, "balance_loss_mlp": 1.0153296, "epoch": 0.9698791558949077, "flos": 18879532824960.0, "grad_norm": 1.8332367474606057, "language_loss": 0.79838645, "learning_rate": 9.473711696985632e-09, "loss": 0.82012278, "num_input_tokens_seen": 174128075, "step": 8066, "time_per_iteration": 3.286348581314087 }, { "auxiliary_loss_clip": 0.01134805, "auxiliary_loss_mlp": 0.01026192, "balance_loss_clip": 1.04295421, "balance_loss_mlp": 1.01897371, "epoch": 0.9699993987855468, "flos": 17931455297280.0, "grad_norm": 2.5280725916669637, "language_loss": 0.75695115, "learning_rate": 9.398132876856201e-09, "loss": 0.77856117, "num_input_tokens_seen": 174147040, "step": 8067, "time_per_iteration": 2.44856858253479 }, { "auxiliary_loss_clip": 0.01019933, "auxiliary_loss_mlp": 0.01001354, "balance_loss_clip": 1.00769925, "balance_loss_mlp": 1.00036418, "epoch": 0.9701196416761859, "flos": 67182186297600.0, "grad_norm": 0.7725819647919476, "language_loss": 0.60848546, "learning_rate": 9.322856029447379e-09, "loss": 0.62869823, "num_input_tokens_seen": 174208225, "step": 8068, "time_per_iteration": 3.8124983310699463 }, { "auxiliary_loss_clip": 0.01163216, "auxiliary_loss_mlp": 0.01027093, "balance_loss_clip": 1.04721701, "balance_loss_mlp": 1.02021766, "epoch": 0.970239884566825, "flos": 24277804012800.0, "grad_norm": 3.0484449822882316, "language_loss": 0.80546355, "learning_rate": 9.247881166178695e-09, "loss": 0.82736659, "num_input_tokens_seen": 174226935, "step": 8069, "time_per_iteration": 2.439831018447876 }, { "auxiliary_loss_clip": 0.0113341, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 1.04554296, "balance_loss_mlp": 1.01879644, "epoch": 0.970360127457464, "flos": 25301689194240.0, "grad_norm": 2.2817928502613936, "language_loss": 0.76568025, "learning_rate": 9.173208298423274e-09, "loss": 0.78727186, "num_input_tokens_seen": 174248140, "step": 8070, "time_per_iteration": 3.4088246822357178 }, { "auxiliary_loss_clip": 0.01107803, "auxiliary_loss_mlp": 0.00762062, "balance_loss_clip": 1.04301524, "balance_loss_mlp": 1.00040603, "epoch": 0.9704803703481032, "flos": 29572473398400.0, "grad_norm": 1.603832583363965, "language_loss": 0.76314199, "learning_rate": 9.09883743750961e-09, "loss": 0.78184056, "num_input_tokens_seen": 174271030, "step": 8071, "time_per_iteration": 2.653918504714966 }, { "auxiliary_loss_clip": 0.0113509, "auxiliary_loss_mlp": 0.01023769, "balance_loss_clip": 1.04407382, "balance_loss_mlp": 1.0167892, "epoch": 0.9706006132387422, "flos": 17380046638080.0, "grad_norm": 1.5501400718443472, "language_loss": 0.83708215, "learning_rate": 9.024768594719124e-09, "loss": 0.85867071, "num_input_tokens_seen": 174289410, "step": 8072, "time_per_iteration": 2.5123648643493652 }, { "auxiliary_loss_clip": 0.01127061, "auxiliary_loss_mlp": 0.0101999, "balance_loss_clip": 1.04702282, "balance_loss_mlp": 1.01304591, "epoch": 0.9707208561293813, "flos": 18186421011840.0, "grad_norm": 2.3703793545808134, "language_loss": 0.72509736, "learning_rate": 8.95100178128816e-09, "loss": 0.74656785, "num_input_tokens_seen": 174308550, "step": 8073, "time_per_iteration": 2.4899332523345947 }, { "auxiliary_loss_clip": 0.01137115, "auxiliary_loss_mlp": 0.01024758, "balance_loss_clip": 1.0438652, "balance_loss_mlp": 1.01708102, "epoch": 0.9708410990200205, "flos": 31248388212480.0, "grad_norm": 2.188654683360268, "language_loss": 0.69983995, "learning_rate": 8.877537008407321e-09, "loss": 0.72145867, "num_input_tokens_seen": 174328600, "step": 8074, "time_per_iteration": 2.5641233921051025 }, { "auxiliary_loss_clip": 0.01140478, "auxiliary_loss_mlp": 0.01022969, "balance_loss_clip": 1.04446626, "balance_loss_mlp": 1.01608467, "epoch": 0.9709613419106595, "flos": 30554450386560.0, "grad_norm": 1.7964615062313898, "language_loss": 0.68543047, "learning_rate": 8.804374287221028e-09, "loss": 0.70706493, "num_input_tokens_seen": 174349835, "step": 8075, "time_per_iteration": 2.560720920562744 }, { "auxiliary_loss_clip": 0.01114021, "auxiliary_loss_mlp": 0.01023088, "balance_loss_clip": 1.03740501, "balance_loss_mlp": 1.01593816, "epoch": 0.9710815848012986, "flos": 23730166281600.0, "grad_norm": 1.621147413992952, "language_loss": 0.84582198, "learning_rate": 8.731513628827958e-09, "loss": 0.8671931, "num_input_tokens_seen": 174369200, "step": 8076, "time_per_iteration": 3.271825075149536 }, { "auxiliary_loss_clip": 0.01152095, "auxiliary_loss_mlp": 0.01023799, "balance_loss_clip": 1.04534686, "balance_loss_mlp": 1.0165956, "epoch": 0.9712018276919377, "flos": 23761875012480.0, "grad_norm": 3.96040113086316, "language_loss": 0.82598138, "learning_rate": 8.658955044280825e-09, "loss": 0.84774029, "num_input_tokens_seen": 174388125, "step": 8077, "time_per_iteration": 2.47666335105896 }, { "auxiliary_loss_clip": 0.01150589, "auxiliary_loss_mlp": 0.01021192, "balance_loss_clip": 1.04697752, "balance_loss_mlp": 1.01399779, "epoch": 0.9713220705825768, "flos": 23330983461120.0, "grad_norm": 2.1279930362772497, "language_loss": 0.77493244, "learning_rate": 8.586698544587268e-09, "loss": 0.79665029, "num_input_tokens_seen": 174409735, "step": 8078, "time_per_iteration": 2.4621527194976807 }, { "auxiliary_loss_clip": 0.01130648, "auxiliary_loss_mlp": 0.01025276, "balance_loss_clip": 1.04271126, "balance_loss_mlp": 1.01736045, "epoch": 0.9714423134732159, "flos": 22200946611840.0, "grad_norm": 1.942505467808045, "language_loss": 0.74173582, "learning_rate": 8.514744140707853e-09, "loss": 0.76329511, "num_input_tokens_seen": 174428875, "step": 8079, "time_per_iteration": 2.49058198928833 }, { "auxiliary_loss_clip": 0.01161932, "auxiliary_loss_mlp": 0.01021949, "balance_loss_clip": 1.04512179, "balance_loss_mlp": 1.01533628, "epoch": 0.971562556363855, "flos": 20229917656320.0, "grad_norm": 1.6707515383438474, "language_loss": 0.76542616, "learning_rate": 8.443091843558515e-09, "loss": 0.78726494, "num_input_tokens_seen": 174447960, "step": 8080, "time_per_iteration": 2.4033408164978027 }, { "auxiliary_loss_clip": 0.01131959, "auxiliary_loss_mlp": 0.01020976, "balance_loss_clip": 1.04291368, "balance_loss_mlp": 1.01365054, "epoch": 0.9716827992544941, "flos": 24970197553920.0, "grad_norm": 2.096770904796048, "language_loss": 0.64655209, "learning_rate": 8.37174166400878e-09, "loss": 0.66808146, "num_input_tokens_seen": 174463535, "step": 8081, "time_per_iteration": 2.5080575942993164 }, { "auxiliary_loss_clip": 0.01164134, "auxiliary_loss_mlp": 0.0102404, "balance_loss_clip": 1.04750323, "balance_loss_mlp": 1.01714945, "epoch": 0.9718030421451331, "flos": 24681476033280.0, "grad_norm": 2.005648810817171, "language_loss": 0.84900779, "learning_rate": 8.300693612881992e-09, "loss": 0.87088954, "num_input_tokens_seen": 174483600, "step": 8082, "time_per_iteration": 2.4283270835876465 }, { "auxiliary_loss_clip": 0.01148927, "auxiliary_loss_mlp": 0.0076156, "balance_loss_clip": 1.0455029, "balance_loss_mlp": 1.00043702, "epoch": 0.9719232850357723, "flos": 22090700793600.0, "grad_norm": 1.8154833613872918, "language_loss": 0.81318843, "learning_rate": 8.22994770095664e-09, "loss": 0.83229339, "num_input_tokens_seen": 174502175, "step": 8083, "time_per_iteration": 2.4316601753234863 }, { "auxiliary_loss_clip": 0.0113981, "auxiliary_loss_mlp": 0.01023543, "balance_loss_clip": 1.04910612, "balance_loss_mlp": 1.01608944, "epoch": 0.9720435279264114, "flos": 23656908493440.0, "grad_norm": 2.195695438138768, "language_loss": 0.75365877, "learning_rate": 8.159503938964585e-09, "loss": 0.77529228, "num_input_tokens_seen": 174519495, "step": 8084, "time_per_iteration": 2.486135959625244 }, { "auxiliary_loss_clip": 0.01115123, "auxiliary_loss_mlp": 0.01017586, "balance_loss_clip": 1.04169464, "balance_loss_mlp": 1.01093674, "epoch": 0.9721637708170504, "flos": 28365910623360.0, "grad_norm": 1.8123352055315165, "language_loss": 0.70398134, "learning_rate": 8.089362337592164e-09, "loss": 0.72530842, "num_input_tokens_seen": 174543120, "step": 8085, "time_per_iteration": 2.5561342239379883 }, { "auxiliary_loss_clip": 0.0113299, "auxiliary_loss_mlp": 0.01025978, "balance_loss_clip": 1.04344547, "balance_loss_mlp": 1.01916194, "epoch": 0.9722840137076896, "flos": 29130807767040.0, "grad_norm": 1.5550398862774735, "language_loss": 0.72033215, "learning_rate": 8.019522907479536e-09, "loss": 0.74192178, "num_input_tokens_seen": 174563480, "step": 8086, "time_per_iteration": 2.519136428833008 }, { "auxiliary_loss_clip": 0.01154088, "auxiliary_loss_mlp": 0.01026556, "balance_loss_clip": 1.04679084, "balance_loss_mlp": 1.01915598, "epoch": 0.9724042565983286, "flos": 19243954258560.0, "grad_norm": 2.362611294194622, "language_loss": 0.7724455, "learning_rate": 7.949985659221558e-09, "loss": 0.79425192, "num_input_tokens_seen": 174580745, "step": 8087, "time_per_iteration": 2.426196336746216 }, { "auxiliary_loss_clip": 0.01140333, "auxiliary_loss_mlp": 0.01025646, "balance_loss_clip": 1.04531229, "balance_loss_mlp": 1.01915216, "epoch": 0.9725244994889677, "flos": 23039676161280.0, "grad_norm": 2.0301704482002187, "language_loss": 0.78899974, "learning_rate": 7.880750603366904e-09, "loss": 0.81065953, "num_input_tokens_seen": 174599615, "step": 8088, "time_per_iteration": 2.466291666030884 }, { "auxiliary_loss_clip": 0.01130884, "auxiliary_loss_mlp": 0.01025933, "balance_loss_clip": 1.04219651, "balance_loss_mlp": 1.01794887, "epoch": 0.9726447423796069, "flos": 23367468700800.0, "grad_norm": 1.8461772697387901, "language_loss": 0.79450363, "learning_rate": 7.811817750418282e-09, "loss": 0.81607181, "num_input_tokens_seen": 174618375, "step": 8089, "time_per_iteration": 2.5140490531921387 }, { "auxiliary_loss_clip": 0.0112053, "auxiliary_loss_mlp": 0.0102358, "balance_loss_clip": 1.04447043, "balance_loss_mlp": 1.01606369, "epoch": 0.9727649852702459, "flos": 26541648639360.0, "grad_norm": 1.7699347020096867, "language_loss": 0.80101824, "learning_rate": 7.743187110833105e-09, "loss": 0.82245934, "num_input_tokens_seen": 174641135, "step": 8090, "time_per_iteration": 2.5541646480560303 }, { "auxiliary_loss_clip": 0.01138747, "auxiliary_loss_mlp": 0.01019088, "balance_loss_clip": 1.04279661, "balance_loss_mlp": 1.01243615, "epoch": 0.972885228160885, "flos": 20522338277760.0, "grad_norm": 1.4327316647404809, "language_loss": 0.80602801, "learning_rate": 7.674858695022602e-09, "loss": 0.82760644, "num_input_tokens_seen": 174659490, "step": 8091, "time_per_iteration": 2.4772720336914062 }, { "auxiliary_loss_clip": 0.01167419, "auxiliary_loss_mlp": 0.01025288, "balance_loss_clip": 1.04871655, "balance_loss_mlp": 1.0179683, "epoch": 0.9730054710515241, "flos": 17566064196480.0, "grad_norm": 4.171080513387283, "language_loss": 0.75368786, "learning_rate": 7.606832513351591e-09, "loss": 0.77561498, "num_input_tokens_seen": 174677440, "step": 8092, "time_per_iteration": 2.4062042236328125 }, { "auxiliary_loss_clip": 0.01060855, "auxiliary_loss_mlp": 0.00752819, "balance_loss_clip": 1.00727057, "balance_loss_mlp": 0.99990046, "epoch": 0.9731257139421632, "flos": 68972010117120.0, "grad_norm": 0.8387898086433941, "language_loss": 0.63925081, "learning_rate": 7.539108576140264e-09, "loss": 0.65738755, "num_input_tokens_seen": 174741550, "step": 8093, "time_per_iteration": 3.9284298419952393 }, { "auxiliary_loss_clip": 0.01108176, "auxiliary_loss_mlp": 0.01019434, "balance_loss_clip": 1.04169524, "balance_loss_mlp": 1.01318407, "epoch": 0.9732459568328022, "flos": 18478841633280.0, "grad_norm": 7.059130983022169, "language_loss": 0.70622194, "learning_rate": 7.471686893661732e-09, "loss": 0.72749805, "num_input_tokens_seen": 174759845, "step": 8094, "time_per_iteration": 3.3531012535095215 }, { "auxiliary_loss_clip": 0.01133073, "auxiliary_loss_mlp": 0.01024277, "balance_loss_clip": 1.04347253, "balance_loss_mlp": 1.01741314, "epoch": 0.9733661997234414, "flos": 20883886623360.0, "grad_norm": 1.8451418180629455, "language_loss": 0.64421296, "learning_rate": 7.4045674761442636e-09, "loss": 0.6657865, "num_input_tokens_seen": 174777175, "step": 8095, "time_per_iteration": 2.484976291656494 }, { "auxiliary_loss_clip": 0.01163274, "auxiliary_loss_mlp": 0.00761438, "balance_loss_clip": 1.0464251, "balance_loss_mlp": 1.00037682, "epoch": 0.9734864426140805, "flos": 23766795175680.0, "grad_norm": 2.013372456560244, "language_loss": 0.74053681, "learning_rate": 7.337750333769488e-09, "loss": 0.75978392, "num_input_tokens_seen": 174796980, "step": 8096, "time_per_iteration": 3.1700375080108643 }, { "auxiliary_loss_clip": 0.01141496, "auxiliary_loss_mlp": 0.01023873, "balance_loss_clip": 1.04051638, "balance_loss_mlp": 1.01596034, "epoch": 0.9736066855047195, "flos": 35042422176000.0, "grad_norm": 1.8499675575263224, "language_loss": 0.72811949, "learning_rate": 7.2712354766737425e-09, "loss": 0.7497732, "num_input_tokens_seen": 174817310, "step": 8097, "time_per_iteration": 2.590702533721924 }, { "auxiliary_loss_clip": 0.01118651, "auxiliary_loss_mlp": 0.01025073, "balance_loss_clip": 1.04600024, "balance_loss_mlp": 1.01694345, "epoch": 0.9737269283953586, "flos": 20410620001920.0, "grad_norm": 1.5692223122004738, "language_loss": 0.80446506, "learning_rate": 7.2050229149469565e-09, "loss": 0.82590234, "num_input_tokens_seen": 174837320, "step": 8098, "time_per_iteration": 2.4967474937438965 }, { "auxiliary_loss_clip": 0.01125675, "auxiliary_loss_mlp": 0.01024111, "balance_loss_clip": 1.04091692, "balance_loss_mlp": 1.01717913, "epoch": 0.9738471712859977, "flos": 28911680847360.0, "grad_norm": 1.9796723327975578, "language_loss": 0.6377998, "learning_rate": 7.139112658633984e-09, "loss": 0.6592977, "num_input_tokens_seen": 174857470, "step": 8099, "time_per_iteration": 2.5598456859588623 }, { "auxiliary_loss_clip": 0.01122661, "auxiliary_loss_mlp": 0.01022095, "balance_loss_clip": 1.04465377, "balance_loss_mlp": 1.01537728, "epoch": 0.9739674141766368, "flos": 27782326356480.0, "grad_norm": 1.9990668795700717, "language_loss": 0.70322263, "learning_rate": 7.073504717733048e-09, "loss": 0.72467023, "num_input_tokens_seen": 174877035, "step": 8100, "time_per_iteration": 2.5454845428466797 }, { "auxiliary_loss_clip": 0.0101923, "auxiliary_loss_mlp": 0.01002455, "balance_loss_clip": 1.00994813, "balance_loss_mlp": 1.00165641, "epoch": 0.9740876570672758, "flos": 68863057188480.0, "grad_norm": 0.7397545835754016, "language_loss": 0.57183361, "learning_rate": 7.008199102196855e-09, "loss": 0.59205049, "num_input_tokens_seen": 174938460, "step": 8101, "time_per_iteration": 3.064176321029663 }, { "auxiliary_loss_clip": 0.01034016, "auxiliary_loss_mlp": 0.01002128, "balance_loss_clip": 1.00816333, "balance_loss_mlp": 1.00116813, "epoch": 0.974207899957915, "flos": 58236622646400.0, "grad_norm": 0.8218522176226631, "language_loss": 0.58975828, "learning_rate": 6.9431958219321464e-09, "loss": 0.6101197, "num_input_tokens_seen": 174994625, "step": 8102, "time_per_iteration": 3.7765049934387207 }, { "auxiliary_loss_clip": 0.01136935, "auxiliary_loss_mlp": 0.01024942, "balance_loss_clip": 1.04401708, "balance_loss_mlp": 1.01757753, "epoch": 0.9743281428485541, "flos": 22600057605120.0, "grad_norm": 1.492971481254201, "language_loss": 0.77770996, "learning_rate": 6.878494886800146e-09, "loss": 0.79932868, "num_input_tokens_seen": 175015400, "step": 8103, "time_per_iteration": 2.4827070236206055 }, { "auxiliary_loss_clip": 0.01138482, "auxiliary_loss_mlp": 0.01021934, "balance_loss_clip": 1.04606843, "balance_loss_mlp": 1.01478493, "epoch": 0.9744483857391931, "flos": 20008815488640.0, "grad_norm": 2.1230928173914614, "language_loss": 0.76486599, "learning_rate": 6.814096306615669e-09, "loss": 0.78647017, "num_input_tokens_seen": 175033540, "step": 8104, "time_per_iteration": 2.4556937217712402 }, { "auxiliary_loss_clip": 0.01142051, "auxiliary_loss_mlp": 0.01024623, "balance_loss_clip": 1.04281402, "balance_loss_mlp": 1.01721144, "epoch": 0.9745686286298323, "flos": 17675268520320.0, "grad_norm": 2.080499863953652, "language_loss": 0.65418673, "learning_rate": 6.750000091148011e-09, "loss": 0.67585349, "num_input_tokens_seen": 175050835, "step": 8105, "time_per_iteration": 2.442660331726074 }, { "auxiliary_loss_clip": 0.01165537, "auxiliary_loss_mlp": 0.01024805, "balance_loss_clip": 1.04791081, "balance_loss_mlp": 1.01741982, "epoch": 0.9746888715204713, "flos": 29460252332160.0, "grad_norm": 3.059365685756993, "language_loss": 0.72483301, "learning_rate": 6.686206250120729e-09, "loss": 0.74673647, "num_input_tokens_seen": 175072330, "step": 8106, "time_per_iteration": 2.4667716026306152 }, { "auxiliary_loss_clip": 0.01128273, "auxiliary_loss_mlp": 0.0101988, "balance_loss_clip": 1.04172897, "balance_loss_mlp": 1.01302528, "epoch": 0.9748091144111104, "flos": 18479308510080.0, "grad_norm": 2.012738753033083, "language_loss": 0.74808598, "learning_rate": 6.622714793210749e-09, "loss": 0.76956749, "num_input_tokens_seen": 175091250, "step": 8107, "time_per_iteration": 2.486872434616089 }, { "auxiliary_loss_clip": 0.01165791, "auxiliary_loss_mlp": 0.01021233, "balance_loss_clip": 1.04698598, "balance_loss_mlp": 1.01446748, "epoch": 0.9749293573017496, "flos": 20665154753280.0, "grad_norm": 1.6697969042145406, "language_loss": 0.79010618, "learning_rate": 6.559525730050364e-09, "loss": 0.81197643, "num_input_tokens_seen": 175111350, "step": 8108, "time_per_iteration": 2.4031989574432373 }, { "auxiliary_loss_clip": 0.01126333, "auxiliary_loss_mlp": 0.01021914, "balance_loss_clip": 1.04463696, "balance_loss_mlp": 1.01506567, "epoch": 0.9750496001923886, "flos": 18478590238080.0, "grad_norm": 2.100066966205352, "language_loss": 0.75925088, "learning_rate": 6.496639070224574e-09, "loss": 0.78073335, "num_input_tokens_seen": 175129835, "step": 8109, "time_per_iteration": 2.4959845542907715 }, { "auxiliary_loss_clip": 0.01154928, "auxiliary_loss_mlp": 0.01022647, "balance_loss_clip": 1.04728138, "balance_loss_mlp": 1.01635861, "epoch": 0.9751698430830277, "flos": 19572967860480.0, "grad_norm": 2.285181785990845, "language_loss": 0.84143889, "learning_rate": 6.4340548232739714e-09, "loss": 0.86321467, "num_input_tokens_seen": 175146035, "step": 8110, "time_per_iteration": 2.4160165786743164 }, { "auxiliary_loss_clip": 0.01127199, "auxiliary_loss_mlp": 0.01023324, "balance_loss_clip": 1.0425216, "balance_loss_mlp": 1.01586461, "epoch": 0.9752900859736668, "flos": 23550325862400.0, "grad_norm": 2.7302618905670695, "language_loss": 0.79359519, "learning_rate": 6.371772998692071e-09, "loss": 0.81510043, "num_input_tokens_seen": 175165290, "step": 8111, "time_per_iteration": 2.5457630157470703 }, { "auxiliary_loss_clip": 0.01128015, "auxiliary_loss_mlp": 0.01020181, "balance_loss_clip": 1.04192674, "balance_loss_mlp": 1.01324296, "epoch": 0.9754103288643059, "flos": 20303211358080.0, "grad_norm": 6.202481461481704, "language_loss": 0.64976633, "learning_rate": 6.309793605927094e-09, "loss": 0.67124832, "num_input_tokens_seen": 175183610, "step": 8112, "time_per_iteration": 2.5083487033843994 }, { "auxiliary_loss_clip": 0.01141585, "auxiliary_loss_mlp": 0.01021796, "balance_loss_clip": 1.04339075, "balance_loss_mlp": 1.0145781, "epoch": 0.975530571754945, "flos": 19350680544000.0, "grad_norm": 1.6971518476172494, "language_loss": 0.80095458, "learning_rate": 6.248116654381297e-09, "loss": 0.82258844, "num_input_tokens_seen": 175202080, "step": 8113, "time_per_iteration": 2.4608726501464844 }, { "auxiliary_loss_clip": 0.01137997, "auxiliary_loss_mlp": 0.01022572, "balance_loss_clip": 1.04163122, "balance_loss_mlp": 1.01568747, "epoch": 0.9756508146455841, "flos": 23583399310080.0, "grad_norm": 1.669579803985765, "language_loss": 0.72641063, "learning_rate": 6.186742153410751e-09, "loss": 0.74801636, "num_input_tokens_seen": 175221575, "step": 8114, "time_per_iteration": 2.497535228729248 }, { "auxiliary_loss_clip": 0.01136944, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.04501879, "balance_loss_mlp": 1.02047586, "epoch": 0.9757710575362232, "flos": 22966921163520.0, "grad_norm": 2.371762953044903, "language_loss": 0.87399918, "learning_rate": 6.125670112326453e-09, "loss": 0.89565158, "num_input_tokens_seen": 175240835, "step": 8115, "time_per_iteration": 2.492361068725586 }, { "auxiliary_loss_clip": 0.01150661, "auxiliary_loss_mlp": 0.01023932, "balance_loss_clip": 1.04295027, "balance_loss_mlp": 1.01676512, "epoch": 0.9758913004268622, "flos": 27966009530880.0, "grad_norm": 1.6151264758519213, "language_loss": 0.70200801, "learning_rate": 6.064900540392548e-09, "loss": 0.72375393, "num_input_tokens_seen": 175262930, "step": 8116, "time_per_iteration": 2.5042977333068848 }, { "auxiliary_loss_clip": 0.01131119, "auxiliary_loss_mlp": 0.0102034, "balance_loss_clip": 1.0444541, "balance_loss_mlp": 1.0135746, "epoch": 0.9760115433175014, "flos": 22200156512640.0, "grad_norm": 1.9972200295645908, "language_loss": 0.78850317, "learning_rate": 6.0044334468278835e-09, "loss": 0.81001776, "num_input_tokens_seen": 175282275, "step": 8117, "time_per_iteration": 2.4768154621124268 }, { "auxiliary_loss_clip": 0.01112618, "auxiliary_loss_mlp": 0.01027179, "balance_loss_clip": 1.04240966, "balance_loss_mlp": 1.01954341, "epoch": 0.9761317862081405, "flos": 26250736389120.0, "grad_norm": 1.7051864042213905, "language_loss": 0.71447563, "learning_rate": 5.944268840805345e-09, "loss": 0.73587358, "num_input_tokens_seen": 175303020, "step": 8118, "time_per_iteration": 2.5904552936553955 }, { "auxiliary_loss_clip": 0.01114685, "auxiliary_loss_mlp": 0.01022792, "balance_loss_clip": 1.03993487, "balance_loss_mlp": 1.01632166, "epoch": 0.9762520290987795, "flos": 26575440359040.0, "grad_norm": 3.337457256799733, "language_loss": 0.63808692, "learning_rate": 5.88440673145163e-09, "loss": 0.65946174, "num_input_tokens_seen": 175324070, "step": 8119, "time_per_iteration": 3.382863759994507 }, { "auxiliary_loss_clip": 0.01151053, "auxiliary_loss_mlp": 0.01024807, "balance_loss_clip": 1.04823482, "balance_loss_mlp": 1.01774132, "epoch": 0.9763722719894187, "flos": 18005036307840.0, "grad_norm": 2.106207766736534, "language_loss": 0.82200015, "learning_rate": 5.824847127848142e-09, "loss": 0.84375882, "num_input_tokens_seen": 175342595, "step": 8120, "time_per_iteration": 2.4163408279418945 }, { "auxiliary_loss_clip": 0.01111372, "auxiliary_loss_mlp": 0.01020143, "balance_loss_clip": 1.04342175, "balance_loss_mlp": 1.01313317, "epoch": 0.9764925148800577, "flos": 22455660931200.0, "grad_norm": 2.017126212230986, "language_loss": 0.7852121, "learning_rate": 5.765590039029433e-09, "loss": 0.80652726, "num_input_tokens_seen": 175361915, "step": 8121, "time_per_iteration": 3.365196704864502 }, { "auxiliary_loss_clip": 0.01163146, "auxiliary_loss_mlp": 0.01025189, "balance_loss_clip": 1.04720449, "balance_loss_mlp": 1.01806903, "epoch": 0.9766127577706968, "flos": 36757084786560.0, "grad_norm": 1.5228147647626102, "language_loss": 0.7102589, "learning_rate": 5.706635473985422e-09, "loss": 0.73214221, "num_input_tokens_seen": 175385785, "step": 8122, "time_per_iteration": 2.569197416305542 }, { "auxiliary_loss_clip": 0.01147091, "auxiliary_loss_mlp": 0.01023305, "balance_loss_clip": 1.04369509, "balance_loss_mlp": 1.01640034, "epoch": 0.976733000661336, "flos": 22309971367680.0, "grad_norm": 1.778794603266535, "language_loss": 0.85213721, "learning_rate": 5.6479834416591764e-09, "loss": 0.87384117, "num_input_tokens_seen": 175405145, "step": 8123, "time_per_iteration": 3.242032051086426 }, { "auxiliary_loss_clip": 0.01151754, "auxiliary_loss_mlp": 0.00762177, "balance_loss_clip": 1.04720449, "balance_loss_mlp": 1.00044072, "epoch": 0.976853243551975, "flos": 25810938264960.0, "grad_norm": 4.051246936015088, "language_loss": 0.68507946, "learning_rate": 5.589633950947803e-09, "loss": 0.70421875, "num_input_tokens_seen": 175422645, "step": 8124, "time_per_iteration": 2.4661929607391357 }, { "auxiliary_loss_clip": 0.01136933, "auxiliary_loss_mlp": 0.01027225, "balance_loss_clip": 1.04482484, "balance_loss_mlp": 1.01977992, "epoch": 0.9769734864426141, "flos": 21397445326080.0, "grad_norm": 2.0128132042892366, "language_loss": 0.69590014, "learning_rate": 5.5315870107035535e-09, "loss": 0.71754169, "num_input_tokens_seen": 175440695, "step": 8125, "time_per_iteration": 2.4700679779052734 }, { "auxiliary_loss_clip": 0.01136227, "auxiliary_loss_mlp": 0.01025401, "balance_loss_clip": 1.04820156, "balance_loss_mlp": 1.01805735, "epoch": 0.9770937293332532, "flos": 13990977584640.0, "grad_norm": 1.7515072225288109, "language_loss": 0.78687161, "learning_rate": 5.473842629731607e-09, "loss": 0.80848789, "num_input_tokens_seen": 175459195, "step": 8126, "time_per_iteration": 2.467736005783081 }, { "auxiliary_loss_clip": 0.01144096, "auxiliary_loss_mlp": 0.0076178, "balance_loss_clip": 1.04396796, "balance_loss_mlp": 1.00050998, "epoch": 0.9772139722238923, "flos": 17931994001280.0, "grad_norm": 1.9751408344668755, "language_loss": 0.7812764, "learning_rate": 5.416400816792066e-09, "loss": 0.80033517, "num_input_tokens_seen": 175476710, "step": 8127, "time_per_iteration": 2.4525721073150635 }, { "auxiliary_loss_clip": 0.01162821, "auxiliary_loss_mlp": 0.01020739, "balance_loss_clip": 1.04615641, "balance_loss_mlp": 1.01363122, "epoch": 0.9773342151145313, "flos": 20446171488000.0, "grad_norm": 2.442607377215319, "language_loss": 0.78262478, "learning_rate": 5.359261580598407e-09, "loss": 0.80446041, "num_input_tokens_seen": 175492550, "step": 8128, "time_per_iteration": 2.3831400871276855 }, { "auxiliary_loss_clip": 0.01154127, "auxiliary_loss_mlp": 0.01023394, "balance_loss_clip": 1.04757047, "balance_loss_mlp": 1.01586878, "epoch": 0.9774544580051704, "flos": 11837306949120.0, "grad_norm": 2.3653080014836827, "language_loss": 0.77689081, "learning_rate": 5.302424929819027e-09, "loss": 0.798666, "num_input_tokens_seen": 175506560, "step": 8129, "time_per_iteration": 3.1712870597839355 }, { "auxiliary_loss_clip": 0.01154645, "auxiliary_loss_mlp": 0.01024732, "balance_loss_clip": 1.04369283, "balance_loss_mlp": 1.0176394, "epoch": 0.9775747008958096, "flos": 13479932833920.0, "grad_norm": 2.408888524014105, "language_loss": 0.73017269, "learning_rate": 5.24589087307592e-09, "loss": 0.75196648, "num_input_tokens_seen": 175524180, "step": 8130, "time_per_iteration": 2.410435914993286 }, { "auxiliary_loss_clip": 0.01165728, "auxiliary_loss_mlp": 0.01025072, "balance_loss_clip": 1.04668736, "balance_loss_mlp": 1.01816738, "epoch": 0.9776949437864486, "flos": 59532314042880.0, "grad_norm": 2.011909747498914, "language_loss": 0.64827871, "learning_rate": 5.189659418944891e-09, "loss": 0.67018676, "num_input_tokens_seen": 175554355, "step": 8131, "time_per_iteration": 2.8076958656311035 }, { "auxiliary_loss_clip": 0.01165473, "auxiliary_loss_mlp": 0.01026967, "balance_loss_clip": 1.04889047, "balance_loss_mlp": 1.02024364, "epoch": 0.9778151866770877, "flos": 21178605715200.0, "grad_norm": 2.1845730907548653, "language_loss": 0.78398108, "learning_rate": 5.133730575956674e-09, "loss": 0.80590552, "num_input_tokens_seen": 175574025, "step": 8132, "time_per_iteration": 2.3939788341522217 }, { "auxiliary_loss_clip": 0.01139669, "auxiliary_loss_mlp": 0.01019615, "balance_loss_clip": 1.04460788, "balance_loss_mlp": 1.01264966, "epoch": 0.9779354295677268, "flos": 20886795624960.0, "grad_norm": 1.8914406519125866, "language_loss": 0.7174443, "learning_rate": 5.0781043525953696e-09, "loss": 0.73903716, "num_input_tokens_seen": 175592090, "step": 8133, "time_per_iteration": 2.4702401161193848 }, { "auxiliary_loss_clip": 0.01131888, "auxiliary_loss_mlp": 0.01027151, "balance_loss_clip": 1.04608107, "balance_loss_mlp": 1.01998973, "epoch": 0.9780556724583659, "flos": 23440618748160.0, "grad_norm": 1.7668006370257898, "language_loss": 0.73815346, "learning_rate": 5.0227807572995605e-09, "loss": 0.75974387, "num_input_tokens_seen": 175614065, "step": 8134, "time_per_iteration": 2.4858009815216064 }, { "auxiliary_loss_clip": 0.01138725, "auxiliary_loss_mlp": 0.01019099, "balance_loss_clip": 1.04271317, "balance_loss_mlp": 1.01222396, "epoch": 0.9781759153490049, "flos": 20923244951040.0, "grad_norm": 2.150014382063035, "language_loss": 0.66863084, "learning_rate": 4.967759798461646e-09, "loss": 0.69020909, "num_input_tokens_seen": 175632410, "step": 8135, "time_per_iteration": 2.4722399711608887 }, { "auxiliary_loss_clip": 0.01163006, "auxiliary_loss_mlp": 0.01021028, "balance_loss_clip": 1.0472796, "balance_loss_mlp": 1.01430786, "epoch": 0.9782961582396441, "flos": 28293191539200.0, "grad_norm": 1.9720769277041839, "language_loss": 0.74722046, "learning_rate": 4.913041484428282e-09, "loss": 0.76906079, "num_input_tokens_seen": 175652885, "step": 8136, "time_per_iteration": 2.4538471698760986 }, { "auxiliary_loss_clip": 0.011533, "auxiliary_loss_mlp": 0.01021275, "balance_loss_clip": 1.04742908, "balance_loss_mlp": 1.01448333, "epoch": 0.9784164011302832, "flos": 25552955808000.0, "grad_norm": 1.8485643289530442, "language_loss": 0.74049205, "learning_rate": 4.858625823500384e-09, "loss": 0.76223785, "num_input_tokens_seen": 175670585, "step": 8137, "time_per_iteration": 2.462751626968384 }, { "auxiliary_loss_clip": 0.01153439, "auxiliary_loss_mlp": 0.01025347, "balance_loss_clip": 1.04546785, "balance_loss_mlp": 1.01767635, "epoch": 0.9785366440209222, "flos": 29965945956480.0, "grad_norm": 1.8551775818326384, "language_loss": 0.73223931, "learning_rate": 4.80451282393246e-09, "loss": 0.75402713, "num_input_tokens_seen": 175690570, "step": 8138, "time_per_iteration": 2.526431083679199 }, { "auxiliary_loss_clip": 0.01135768, "auxiliary_loss_mlp": 0.01019203, "balance_loss_clip": 1.04343545, "balance_loss_mlp": 1.01248288, "epoch": 0.9786568869115614, "flos": 32343591847680.0, "grad_norm": 1.8344639634645163, "language_loss": 0.6755361, "learning_rate": 4.750702493933722e-09, "loss": 0.69708586, "num_input_tokens_seen": 175710455, "step": 8139, "time_per_iteration": 2.5411572456359863 }, { "auxiliary_loss_clip": 0.0113968, "auxiliary_loss_mlp": 0.0076186, "balance_loss_clip": 1.04781199, "balance_loss_mlp": 1.0004015, "epoch": 0.9787771298022004, "flos": 23331414424320.0, "grad_norm": 1.8296538899820265, "language_loss": 0.85251313, "learning_rate": 4.697194841666974e-09, "loss": 0.87152851, "num_input_tokens_seen": 175729380, "step": 8140, "time_per_iteration": 2.482389211654663 }, { "auxiliary_loss_clip": 0.01153549, "auxiliary_loss_mlp": 0.01026866, "balance_loss_clip": 1.04599369, "balance_loss_mlp": 1.01856875, "epoch": 0.9788973726928395, "flos": 21468548298240.0, "grad_norm": 2.3121388636969593, "language_loss": 0.81939673, "learning_rate": 4.6439898752492764e-09, "loss": 0.84120089, "num_input_tokens_seen": 175749520, "step": 8141, "time_per_iteration": 2.4324891567230225 }, { "auxiliary_loss_clip": 0.01052235, "auxiliary_loss_mlp": 0.00752834, "balance_loss_clip": 1.00818765, "balance_loss_mlp": 0.99988747, "epoch": 0.9790176155834787, "flos": 68897459439360.0, "grad_norm": 0.747806976167841, "language_loss": 0.63677412, "learning_rate": 4.591087602751731e-09, "loss": 0.65482479, "num_input_tokens_seen": 175811380, "step": 8142, "time_per_iteration": 3.103113889694214 }, { "auxiliary_loss_clip": 0.01151411, "auxiliary_loss_mlp": 0.01025138, "balance_loss_clip": 1.04546499, "balance_loss_mlp": 1.0182898, "epoch": 0.9791378584741177, "flos": 21430877909760.0, "grad_norm": 1.6713511878043568, "language_loss": 0.71668696, "learning_rate": 4.538488032199916e-09, "loss": 0.73845243, "num_input_tokens_seen": 175829480, "step": 8143, "time_per_iteration": 2.4319615364074707 }, { "auxiliary_loss_clip": 0.01153742, "auxiliary_loss_mlp": 0.01028185, "balance_loss_clip": 1.04391646, "balance_loss_mlp": 1.02075255, "epoch": 0.9792581013647568, "flos": 20153032594560.0, "grad_norm": 1.9393341764512604, "language_loss": 0.68740761, "learning_rate": 4.486191171572784e-09, "loss": 0.70922685, "num_input_tokens_seen": 175846750, "step": 8144, "time_per_iteration": 2.4169678688049316 }, { "auxiliary_loss_clip": 0.01153707, "auxiliary_loss_mlp": 0.01021738, "balance_loss_clip": 1.04774129, "balance_loss_mlp": 1.01483262, "epoch": 0.9793783442553959, "flos": 23728191033600.0, "grad_norm": 1.6116398258788225, "language_loss": 0.77950466, "learning_rate": 4.434197028803766e-09, "loss": 0.8012591, "num_input_tokens_seen": 175865975, "step": 8145, "time_per_iteration": 2.491436243057251 }, { "auxiliary_loss_clip": 0.0112927, "auxiliary_loss_mlp": 0.01026079, "balance_loss_clip": 1.04270816, "balance_loss_mlp": 1.01811552, "epoch": 0.979498587146035, "flos": 23038742407680.0, "grad_norm": 2.120129183388703, "language_loss": 0.81977075, "learning_rate": 4.3825056117805514e-09, "loss": 0.84132421, "num_input_tokens_seen": 175881860, "step": 8146, "time_per_iteration": 3.350921630859375 }, { "auxiliary_loss_clip": 0.01164236, "auxiliary_loss_mlp": 0.01017539, "balance_loss_clip": 1.04483521, "balance_loss_mlp": 1.01034176, "epoch": 0.979618830036674, "flos": 14318841951360.0, "grad_norm": 3.371658584094465, "language_loss": 0.79727459, "learning_rate": 4.331116928344425e-09, "loss": 0.81909239, "num_input_tokens_seen": 175898175, "step": 8147, "time_per_iteration": 2.37263560295105 }, { "auxiliary_loss_clip": 0.01140426, "auxiliary_loss_mlp": 0.00761766, "balance_loss_clip": 1.04245234, "balance_loss_mlp": 1.00039959, "epoch": 0.9797390729273132, "flos": 16727514215040.0, "grad_norm": 2.061560084937971, "language_loss": 0.62689418, "learning_rate": 4.28003098629115e-09, "loss": 0.6459161, "num_input_tokens_seen": 175914310, "step": 8148, "time_per_iteration": 3.3335866928100586 }, { "auxiliary_loss_clip": 0.01120241, "auxiliary_loss_mlp": 0.01018853, "balance_loss_clip": 1.03794074, "balance_loss_mlp": 1.01156902, "epoch": 0.9798593158179523, "flos": 24532661986560.0, "grad_norm": 1.8677481259793705, "language_loss": 0.78492945, "learning_rate": 4.229247793370305e-09, "loss": 0.80632043, "num_input_tokens_seen": 175933435, "step": 8149, "time_per_iteration": 2.5447299480438232 }, { "auxiliary_loss_clip": 0.01168353, "auxiliary_loss_mlp": 0.01023926, "balance_loss_clip": 1.04859984, "balance_loss_mlp": 1.01657724, "epoch": 0.9799795587085913, "flos": 27308808339840.0, "grad_norm": 1.5736061605973455, "language_loss": 0.70442384, "learning_rate": 4.178767357285951e-09, "loss": 0.72634661, "num_input_tokens_seen": 175955065, "step": 8150, "time_per_iteration": 3.272921323776245 }, { "auxiliary_loss_clip": 0.01151484, "auxiliary_loss_mlp": 0.00761873, "balance_loss_clip": 1.04627144, "balance_loss_mlp": 1.00048232, "epoch": 0.9800998015992305, "flos": 26286575184000.0, "grad_norm": 1.8535681547593335, "language_loss": 0.71334445, "learning_rate": 4.128589685695516e-09, "loss": 0.73247802, "num_input_tokens_seen": 175975490, "step": 8151, "time_per_iteration": 2.495699167251587 }, { "auxiliary_loss_clip": 0.01164658, "auxiliary_loss_mlp": 0.01023643, "balance_loss_clip": 1.04765797, "balance_loss_mlp": 1.01717293, "epoch": 0.9802200444898695, "flos": 16723635546240.0, "grad_norm": 1.9045521649027974, "language_loss": 0.84778726, "learning_rate": 4.078714786211135e-09, "loss": 0.86967027, "num_input_tokens_seen": 175991340, "step": 8152, "time_per_iteration": 2.3678603172302246 }, { "auxiliary_loss_clip": 0.01147414, "auxiliary_loss_mlp": 0.01019576, "balance_loss_clip": 1.04514623, "balance_loss_mlp": 1.01288581, "epoch": 0.9803402873805086, "flos": 24900459298560.0, "grad_norm": 1.6416467066632476, "language_loss": 0.76966894, "learning_rate": 4.029142666398977e-09, "loss": 0.79133886, "num_input_tokens_seen": 176011505, "step": 8153, "time_per_iteration": 2.4981791973114014 }, { "auxiliary_loss_clip": 0.01163146, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.04716539, "balance_loss_mlp": 1.01903391, "epoch": 0.9804605302711478, "flos": 22564937082240.0, "grad_norm": 1.767689089059547, "language_loss": 0.80228394, "learning_rate": 3.979873333778805e-09, "loss": 0.8241725, "num_input_tokens_seen": 176029680, "step": 8154, "time_per_iteration": 2.404353141784668 }, { "auxiliary_loss_clip": 0.01141301, "auxiliary_loss_mlp": 0.01025651, "balance_loss_clip": 1.0459156, "balance_loss_mlp": 1.01857567, "epoch": 0.9805807731617868, "flos": 38905368382080.0, "grad_norm": 1.803885339464823, "language_loss": 0.73662245, "learning_rate": 3.930906795824862e-09, "loss": 0.75829196, "num_input_tokens_seen": 176050355, "step": 8155, "time_per_iteration": 2.6353776454925537 }, { "auxiliary_loss_clip": 0.0114837, "auxiliary_loss_mlp": 0.01019828, "balance_loss_clip": 1.04474437, "balance_loss_mlp": 1.0128839, "epoch": 0.9807010160524259, "flos": 17821999578240.0, "grad_norm": 2.0487655471343174, "language_loss": 0.76738417, "learning_rate": 3.882243059965207e-09, "loss": 0.78906608, "num_input_tokens_seen": 176068070, "step": 8156, "time_per_iteration": 3.191441297531128 }, { "auxiliary_loss_clip": 0.0114346, "auxiliary_loss_mlp": 0.01021056, "balance_loss_clip": 1.04243064, "balance_loss_mlp": 1.01410329, "epoch": 0.980821258943065, "flos": 13552975140480.0, "grad_norm": 2.508938664063899, "language_loss": 0.66036391, "learning_rate": 3.833882133582156e-09, "loss": 0.6820091, "num_input_tokens_seen": 176083730, "step": 8157, "time_per_iteration": 2.411853790283203 }, { "auxiliary_loss_clip": 0.01152159, "auxiliary_loss_mlp": 0.01021613, "balance_loss_clip": 1.04498196, "balance_loss_mlp": 1.01479387, "epoch": 0.9809415018337041, "flos": 21689794120320.0, "grad_norm": 1.8332722094206069, "language_loss": 0.78118157, "learning_rate": 3.785824024012285e-09, "loss": 0.80291933, "num_input_tokens_seen": 176102730, "step": 8158, "time_per_iteration": 2.4413275718688965 }, { "auxiliary_loss_clip": 0.01129618, "auxiliary_loss_mlp": 0.01024136, "balance_loss_clip": 1.0465486, "balance_loss_mlp": 1.01722431, "epoch": 0.9810617447243432, "flos": 23294857357440.0, "grad_norm": 1.8396009466613221, "language_loss": 0.78398681, "learning_rate": 3.738068738545541e-09, "loss": 0.80552435, "num_input_tokens_seen": 176121815, "step": 8159, "time_per_iteration": 2.4800519943237305 }, { "auxiliary_loss_clip": 0.01155248, "auxiliary_loss_mlp": 0.01029246, "balance_loss_clip": 1.0465827, "balance_loss_mlp": 1.02221584, "epoch": 0.9811819876149822, "flos": 18332038748160.0, "grad_norm": 3.4786211851779782, "language_loss": 0.7845642, "learning_rate": 3.6906162844265733e-09, "loss": 0.80640912, "num_input_tokens_seen": 176138900, "step": 8160, "time_per_iteration": 2.401216506958008 }, { "auxiliary_loss_clip": 0.01130173, "auxiliary_loss_mlp": 0.01026881, "balance_loss_clip": 1.04174781, "balance_loss_mlp": 1.01934052, "epoch": 0.9813022305056214, "flos": 22601961025920.0, "grad_norm": 1.9317683172812767, "language_loss": 0.70888704, "learning_rate": 3.643466668853845e-09, "loss": 0.73045754, "num_input_tokens_seen": 176156925, "step": 8161, "time_per_iteration": 2.465182065963745 }, { "auxiliary_loss_clip": 0.01138091, "auxiliary_loss_mlp": 0.01019916, "balance_loss_clip": 1.04395604, "balance_loss_mlp": 1.01228404, "epoch": 0.9814224733962604, "flos": 25413335642880.0, "grad_norm": 4.715763493552389, "language_loss": 0.75653255, "learning_rate": 3.59661989898008e-09, "loss": 0.77811265, "num_input_tokens_seen": 176177980, "step": 8162, "time_per_iteration": 2.513110637664795 }, { "auxiliary_loss_clip": 0.01117367, "auxiliary_loss_mlp": 0.01024265, "balance_loss_clip": 1.04372442, "balance_loss_mlp": 1.01737475, "epoch": 0.9815427162868995, "flos": 25007185584000.0, "grad_norm": 1.6625366758979532, "language_loss": 0.76566422, "learning_rate": 3.5500759819115934e-09, "loss": 0.78708053, "num_input_tokens_seen": 176198345, "step": 8163, "time_per_iteration": 2.5160861015319824 }, { "auxiliary_loss_clip": 0.01166516, "auxiliary_loss_mlp": 0.01021826, "balance_loss_clip": 1.04857266, "balance_loss_mlp": 1.01467061, "epoch": 0.9816629591775387, "flos": 20662604887680.0, "grad_norm": 2.4644786885744705, "language_loss": 0.80745482, "learning_rate": 3.5038349247094034e-09, "loss": 0.82933831, "num_input_tokens_seen": 176215605, "step": 8164, "time_per_iteration": 2.3838396072387695 }, { "auxiliary_loss_clip": 0.01134614, "auxiliary_loss_mlp": 0.0102223, "balance_loss_clip": 1.0405457, "balance_loss_mlp": 1.01488686, "epoch": 0.9817832020681777, "flos": 17712220636800.0, "grad_norm": 2.0773076081398094, "language_loss": 0.77310395, "learning_rate": 3.4578967343878994e-09, "loss": 0.79467249, "num_input_tokens_seen": 176231810, "step": 8165, "time_per_iteration": 2.4234228134155273 }, { "auxiliary_loss_clip": 0.01134869, "auxiliary_loss_mlp": 0.01023098, "balance_loss_clip": 1.04334927, "balance_loss_mlp": 1.01638103, "epoch": 0.9819034449588168, "flos": 22530032040960.0, "grad_norm": 1.8230582597765592, "language_loss": 0.80955255, "learning_rate": 3.4122614179161733e-09, "loss": 0.83113223, "num_input_tokens_seen": 176251770, "step": 8166, "time_per_iteration": 2.4711897373199463 }, { "auxiliary_loss_clip": 0.01113284, "auxiliary_loss_mlp": 0.01021099, "balance_loss_clip": 1.04131031, "balance_loss_mlp": 1.01410198, "epoch": 0.9820236878494559, "flos": 20011221699840.0, "grad_norm": 1.7171763175084334, "language_loss": 0.78283048, "learning_rate": 3.36692898221691e-09, "loss": 0.8041743, "num_input_tokens_seen": 176270135, "step": 8167, "time_per_iteration": 2.4911482334136963 }, { "auxiliary_loss_clip": 0.01151389, "auxiliary_loss_mlp": 0.01023163, "balance_loss_clip": 1.0445013, "balance_loss_mlp": 1.01653206, "epoch": 0.982143930740095, "flos": 18807316531200.0, "grad_norm": 1.92690498703883, "language_loss": 0.73083019, "learning_rate": 3.3218994341668305e-09, "loss": 0.75257564, "num_input_tokens_seen": 176289065, "step": 8168, "time_per_iteration": 2.413191318511963 }, { "auxiliary_loss_clip": 0.01164056, "auxiliary_loss_mlp": 0.01021881, "balance_loss_clip": 1.04855669, "balance_loss_mlp": 1.01537204, "epoch": 0.982264173630734, "flos": 26578026138240.0, "grad_norm": 1.5357757619413042, "language_loss": 0.75578237, "learning_rate": 3.2771727805971373e-09, "loss": 0.77764171, "num_input_tokens_seen": 176310450, "step": 8169, "time_per_iteration": 2.4397590160369873 }, { "auxiliary_loss_clip": 0.01102818, "auxiliary_loss_mlp": 0.01020651, "balance_loss_clip": 1.03774548, "balance_loss_mlp": 1.0137248, "epoch": 0.9823844165213732, "flos": 22014462176640.0, "grad_norm": 1.8060366368054204, "language_loss": 0.77218843, "learning_rate": 3.232749028292847e-09, "loss": 0.79342312, "num_input_tokens_seen": 176327415, "step": 8170, "time_per_iteration": 2.5204882621765137 }, { "auxiliary_loss_clip": 0.01165422, "auxiliary_loss_mlp": 0.01025064, "balance_loss_clip": 1.04566145, "balance_loss_mlp": 1.01802504, "epoch": 0.9825046594120123, "flos": 21908166854400.0, "grad_norm": 1.9804211903255802, "language_loss": 0.88267243, "learning_rate": 3.188628183992792e-09, "loss": 0.90457731, "num_input_tokens_seen": 176347680, "step": 8171, "time_per_iteration": 2.4180612564086914 }, { "auxiliary_loss_clip": 0.01051799, "auxiliary_loss_mlp": 0.01000784, "balance_loss_clip": 1.00713468, "balance_loss_mlp": 0.99993753, "epoch": 0.9826249023026513, "flos": 59494610718720.0, "grad_norm": 0.7387942025385362, "language_loss": 0.62611729, "learning_rate": 3.1448102543902844e-09, "loss": 0.64664316, "num_input_tokens_seen": 176411595, "step": 8172, "time_per_iteration": 3.0106372833251953 }, { "auxiliary_loss_clip": 0.01129078, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.0434792, "balance_loss_mlp": 1.02008486, "epoch": 0.9827451451932905, "flos": 16071031296000.0, "grad_norm": 1.821035821082583, "language_loss": 0.67597413, "learning_rate": 3.1012952461324515e-09, "loss": 0.69754052, "num_input_tokens_seen": 176430570, "step": 8173, "time_per_iteration": 3.326709270477295 }, { "auxiliary_loss_clip": 0.01148025, "auxiliary_loss_mlp": 0.01026059, "balance_loss_clip": 1.04706907, "balance_loss_mlp": 1.01877487, "epoch": 0.9828653880839295, "flos": 20262775622400.0, "grad_norm": 3.6820413674070247, "language_loss": 0.73906165, "learning_rate": 3.0580831658204575e-09, "loss": 0.76080251, "num_input_tokens_seen": 176448150, "step": 8174, "time_per_iteration": 2.43330454826355 }, { "auxiliary_loss_clip": 0.01149705, "auxiliary_loss_mlp": 0.01021411, "balance_loss_clip": 1.047019, "balance_loss_mlp": 1.0150249, "epoch": 0.9829856309745686, "flos": 21616141282560.0, "grad_norm": 1.5922564052528934, "language_loss": 0.77869344, "learning_rate": 3.015174020009281e-09, "loss": 0.80040467, "num_input_tokens_seen": 176467475, "step": 8175, "time_per_iteration": 3.263957977294922 }, { "auxiliary_loss_clip": 0.01126519, "auxiliary_loss_mlp": 0.01023605, "balance_loss_clip": 1.04298818, "balance_loss_mlp": 1.01717329, "epoch": 0.9831058738652078, "flos": 23764209396480.0, "grad_norm": 2.0496633053446307, "language_loss": 0.74972546, "learning_rate": 2.9725678152086043e-09, "loss": 0.77122664, "num_input_tokens_seen": 176486045, "step": 8176, "time_per_iteration": 3.312959909439087 }, { "auxiliary_loss_clip": 0.01123988, "auxiliary_loss_mlp": 0.01022397, "balance_loss_clip": 1.0419023, "balance_loss_mlp": 1.0152328, "epoch": 0.9832261167558468, "flos": 11320911072000.0, "grad_norm": 2.5278741537232836, "language_loss": 0.82737422, "learning_rate": 2.930264557881257e-09, "loss": 0.84883809, "num_input_tokens_seen": 176501230, "step": 8177, "time_per_iteration": 2.418728828430176 }, { "auxiliary_loss_clip": 0.0106086, "auxiliary_loss_mlp": 0.01000673, "balance_loss_clip": 1.0073384, "balance_loss_mlp": 0.99982661, "epoch": 0.9833463596464859, "flos": 60000304343040.0, "grad_norm": 0.8349583636822246, "language_loss": 0.58210409, "learning_rate": 2.8882642544452163e-09, "loss": 0.60271943, "num_input_tokens_seen": 176565955, "step": 8178, "time_per_iteration": 3.0307512283325195 }, { "auxiliary_loss_clip": 0.01125384, "auxiliary_loss_mlp": 0.01027846, "balance_loss_clip": 1.04159105, "balance_loss_mlp": 1.0206666, "epoch": 0.983466602537125, "flos": 13626699805440.0, "grad_norm": 2.2724603447668277, "language_loss": 0.74419934, "learning_rate": 2.8465669112716083e-09, "loss": 0.76573163, "num_input_tokens_seen": 176583480, "step": 8179, "time_per_iteration": 2.4320054054260254 }, { "auxiliary_loss_clip": 0.01151471, "auxiliary_loss_mlp": 0.0076166, "balance_loss_clip": 1.04367423, "balance_loss_mlp": 1.00044596, "epoch": 0.9835868454277641, "flos": 22926844563840.0, "grad_norm": 1.938651430601561, "language_loss": 0.76275772, "learning_rate": 2.8051725346858177e-09, "loss": 0.78188902, "num_input_tokens_seen": 176603740, "step": 8180, "time_per_iteration": 2.453141927719116 }, { "auxiliary_loss_clip": 0.01167265, "auxiliary_loss_mlp": 0.01025723, "balance_loss_clip": 1.04661274, "balance_loss_mlp": 1.01874924, "epoch": 0.9837070883184031, "flos": 27673409341440.0, "grad_norm": 2.052482446452464, "language_loss": 0.7064414, "learning_rate": 2.7640811309674883e-09, "loss": 0.72837126, "num_input_tokens_seen": 176623240, "step": 8181, "time_per_iteration": 2.4584624767303467 }, { "auxiliary_loss_clip": 0.01112655, "auxiliary_loss_mlp": 0.01020801, "balance_loss_clip": 1.04154074, "balance_loss_mlp": 1.01393723, "epoch": 0.9838273312090423, "flos": 29241951425280.0, "grad_norm": 1.6232823629135043, "language_loss": 0.80797291, "learning_rate": 2.7232927063498557e-09, "loss": 0.82930744, "num_input_tokens_seen": 176643615, "step": 8182, "time_per_iteration": 3.3102962970733643 }, { "auxiliary_loss_clip": 0.01153901, "auxiliary_loss_mlp": 0.01021006, "balance_loss_clip": 1.04618919, "balance_loss_mlp": 1.01408613, "epoch": 0.9839475740996814, "flos": 40110207304320.0, "grad_norm": 6.551770927331249, "language_loss": 0.68979466, "learning_rate": 2.682807267020859e-09, "loss": 0.71154368, "num_input_tokens_seen": 176666375, "step": 8183, "time_per_iteration": 2.5814456939697266 }, { "auxiliary_loss_clip": 0.01150134, "auxiliary_loss_mlp": 0.01022598, "balance_loss_clip": 1.04517221, "balance_loss_mlp": 1.01510835, "epoch": 0.9840678169903204, "flos": 24169389788160.0, "grad_norm": 1.522742496874214, "language_loss": 0.62137973, "learning_rate": 2.642624819121808e-09, "loss": 0.643107, "num_input_tokens_seen": 176686525, "step": 8184, "time_per_iteration": 2.433030605316162 }, { "auxiliary_loss_clip": 0.01134684, "auxiliary_loss_mlp": 0.01025279, "balance_loss_clip": 1.04492819, "balance_loss_mlp": 1.0182457, "epoch": 0.9841880598809596, "flos": 14684484447360.0, "grad_norm": 1.9074219501131964, "language_loss": 0.61766517, "learning_rate": 2.6027453687487154e-09, "loss": 0.63926476, "num_input_tokens_seen": 176703615, "step": 8185, "time_per_iteration": 2.453526258468628 }, { "auxiliary_loss_clip": 0.0113783, "auxiliary_loss_mlp": 0.01022139, "balance_loss_clip": 1.04499722, "balance_loss_mlp": 1.01523411, "epoch": 0.9843083027715986, "flos": 22344768668160.0, "grad_norm": 2.3130294478795603, "language_loss": 0.54243672, "learning_rate": 2.5631689219509643e-09, "loss": 0.56403637, "num_input_tokens_seen": 176722295, "step": 8186, "time_per_iteration": 2.466492176055908 }, { "auxiliary_loss_clip": 0.01139525, "auxiliary_loss_mlp": 0.01021497, "balance_loss_clip": 1.04729521, "balance_loss_mlp": 1.0150423, "epoch": 0.9844285456622377, "flos": 21800111765760.0, "grad_norm": 1.7542054970354624, "language_loss": 0.83476424, "learning_rate": 2.523895484732197e-09, "loss": 0.8563745, "num_input_tokens_seen": 176741750, "step": 8187, "time_per_iteration": 2.4721720218658447 }, { "auxiliary_loss_clip": 0.01155893, "auxiliary_loss_mlp": 0.01024094, "balance_loss_clip": 1.04543436, "balance_loss_mlp": 1.01610994, "epoch": 0.9845487885528769, "flos": 18035380321920.0, "grad_norm": 1.9993578250037793, "language_loss": 0.74915093, "learning_rate": 2.4849250630505357e-09, "loss": 0.77095079, "num_input_tokens_seen": 176759995, "step": 8188, "time_per_iteration": 2.4326727390289307 }, { "auxiliary_loss_clip": 0.0106834, "auxiliary_loss_mlp": 0.01025149, "balance_loss_clip": 1.03526163, "balance_loss_mlp": 1.01820219, "epoch": 0.9846690314435159, "flos": 25228610974080.0, "grad_norm": 1.6861320855085145, "language_loss": 0.73389876, "learning_rate": 2.4462576628172528e-09, "loss": 0.75483364, "num_input_tokens_seen": 176778625, "step": 8189, "time_per_iteration": 2.6228549480438232 }, { "auxiliary_loss_clip": 0.01149123, "auxiliary_loss_mlp": 0.01028448, "balance_loss_clip": 1.04652023, "balance_loss_mlp": 1.02082777, "epoch": 0.984789274334155, "flos": 18552171248640.0, "grad_norm": 2.8843873994560543, "language_loss": 0.74140555, "learning_rate": 2.407893289898766e-09, "loss": 0.76318121, "num_input_tokens_seen": 176797655, "step": 8190, "time_per_iteration": 2.4206769466400146 }, { "auxiliary_loss_clip": 0.01113867, "auxiliary_loss_mlp": 0.01018266, "balance_loss_clip": 1.03964412, "balance_loss_mlp": 1.01115489, "epoch": 0.984909517224794, "flos": 27345437233920.0, "grad_norm": 1.7752209905871499, "language_loss": 0.84196013, "learning_rate": 2.3698319501144202e-09, "loss": 0.86328149, "num_input_tokens_seen": 176818640, "step": 8191, "time_per_iteration": 2.5583198070526123 }, { "auxiliary_loss_clip": 0.01156865, "auxiliary_loss_mlp": 0.01025094, "balance_loss_clip": 1.0452652, "balance_loss_mlp": 1.01747036, "epoch": 0.9850297601154332, "flos": 18734058743040.0, "grad_norm": 1.5630931861897777, "language_loss": 0.73526227, "learning_rate": 2.3320736492382644e-09, "loss": 0.75708187, "num_input_tokens_seen": 176837475, "step": 8192, "time_per_iteration": 2.4372599124908447 }, { "auxiliary_loss_clip": 0.01162336, "auxiliary_loss_mlp": 0.01026282, "balance_loss_clip": 1.04683065, "balance_loss_mlp": 1.01958275, "epoch": 0.9851500030060723, "flos": 22308247514880.0, "grad_norm": 1.5268476826951913, "language_loss": 0.68155867, "learning_rate": 2.29461839299816e-09, "loss": 0.70344484, "num_input_tokens_seen": 176857190, "step": 8193, "time_per_iteration": 2.4199471473693848 }, { "auxiliary_loss_clip": 0.01124452, "auxiliary_loss_mlp": 0.01017563, "balance_loss_clip": 1.0418613, "balance_loss_mlp": 1.01095009, "epoch": 0.9852702458967113, "flos": 26353691746560.0, "grad_norm": 1.5149358253887673, "language_loss": 0.80068064, "learning_rate": 2.257466187076229e-09, "loss": 0.82210076, "num_input_tokens_seen": 176876395, "step": 8194, "time_per_iteration": 2.5408670902252197 }, { "auxiliary_loss_clip": 0.01154064, "auxiliary_loss_mlp": 0.00761708, "balance_loss_clip": 1.04399788, "balance_loss_mlp": 1.00036049, "epoch": 0.9853904887873505, "flos": 20883599314560.0, "grad_norm": 2.049029767752672, "language_loss": 0.71050161, "learning_rate": 2.2206170371081854e-09, "loss": 0.72965932, "num_input_tokens_seen": 176894980, "step": 8195, "time_per_iteration": 2.443204402923584 }, { "auxiliary_loss_clip": 0.01138112, "auxiliary_loss_mlp": 0.0102752, "balance_loss_clip": 1.04334724, "balance_loss_mlp": 1.01947975, "epoch": 0.9855107316779895, "flos": 25263444188160.0, "grad_norm": 1.6394022493108442, "language_loss": 0.85019565, "learning_rate": 2.1840709486842247e-09, "loss": 0.87185192, "num_input_tokens_seen": 176914600, "step": 8196, "time_per_iteration": 2.506575107574463 }, { "auxiliary_loss_clip": 0.01127804, "auxiliary_loss_mlp": 0.01027435, "balance_loss_clip": 1.04220319, "balance_loss_mlp": 1.02022243, "epoch": 0.9856309745686286, "flos": 19062102677760.0, "grad_norm": 1.9332385512072892, "language_loss": 0.79136419, "learning_rate": 2.1478279273481335e-09, "loss": 0.81291664, "num_input_tokens_seen": 176933085, "step": 8197, "time_per_iteration": 2.462186098098755 }, { "auxiliary_loss_clip": 0.01150696, "auxiliary_loss_mlp": 0.01025565, "balance_loss_clip": 1.04712677, "balance_loss_mlp": 1.01867497, "epoch": 0.9857512174592677, "flos": 34130758060800.0, "grad_norm": 2.1952292721048243, "language_loss": 0.79617262, "learning_rate": 2.1118879785981815e-09, "loss": 0.81793523, "num_input_tokens_seen": 176953225, "step": 8198, "time_per_iteration": 2.538031816482544 }, { "auxiliary_loss_clip": 0.01133297, "auxiliary_loss_mlp": 0.01022241, "balance_loss_clip": 1.04383445, "balance_loss_mlp": 1.01555085, "epoch": 0.9858714603499068, "flos": 25994693266560.0, "grad_norm": 1.5733570963846992, "language_loss": 0.79581743, "learning_rate": 2.0762511078862288e-09, "loss": 0.8173728, "num_input_tokens_seen": 176973570, "step": 8199, "time_per_iteration": 3.349174737930298 }, { "auxiliary_loss_clip": 0.01142479, "auxiliary_loss_mlp": 0.01020954, "balance_loss_clip": 1.0434587, "balance_loss_mlp": 1.01426649, "epoch": 0.9859917032405459, "flos": 23696230907520.0, "grad_norm": 2.171100552167529, "language_loss": 0.65155751, "learning_rate": 2.0409173206186183e-09, "loss": 0.67319179, "num_input_tokens_seen": 176992810, "step": 8200, "time_per_iteration": 2.4816854000091553 }, { "auxiliary_loss_clip": 0.01121187, "auxiliary_loss_mlp": 0.01023869, "balance_loss_clip": 1.04552507, "balance_loss_mlp": 1.01708949, "epoch": 0.986111946131185, "flos": 19938287134080.0, "grad_norm": 4.955463796134859, "language_loss": 0.87317407, "learning_rate": 2.0058866221550617e-09, "loss": 0.89462465, "num_input_tokens_seen": 177011050, "step": 8201, "time_per_iteration": 3.305004596710205 }, { "auxiliary_loss_clip": 0.01163263, "auxiliary_loss_mlp": 0.0102051, "balance_loss_clip": 1.04446673, "balance_loss_mlp": 1.01359296, "epoch": 0.9862321890218241, "flos": 19828831415040.0, "grad_norm": 2.128078414798084, "language_loss": 0.75860846, "learning_rate": 1.971159017809976e-09, "loss": 0.78044623, "num_input_tokens_seen": 177029340, "step": 8202, "time_per_iteration": 2.389552354812622 }, { "auxiliary_loss_clip": 0.01149284, "auxiliary_loss_mlp": 0.01024006, "balance_loss_clip": 1.04506397, "balance_loss_mlp": 1.01650786, "epoch": 0.9863524319124631, "flos": 21652051904640.0, "grad_norm": 23.582536999610802, "language_loss": 0.77674824, "learning_rate": 1.93673451285159e-09, "loss": 0.79848117, "num_input_tokens_seen": 177048390, "step": 8203, "time_per_iteration": 3.1873929500579834 }, { "auxiliary_loss_clip": 0.0104363, "auxiliary_loss_mlp": 0.0100119, "balance_loss_clip": 1.00771856, "balance_loss_mlp": 1.00029624, "epoch": 0.9864726748031023, "flos": 52769977920000.0, "grad_norm": 0.7331525055409038, "language_loss": 0.56509274, "learning_rate": 1.9026131125019495e-09, "loss": 0.58554095, "num_input_tokens_seen": 177105760, "step": 8204, "time_per_iteration": 2.98915958404541 }, { "auxiliary_loss_clip": 0.01146008, "auxiliary_loss_mlp": 0.01023233, "balance_loss_clip": 1.04641187, "balance_loss_mlp": 1.01645613, "epoch": 0.9865929176937414, "flos": 23364631526400.0, "grad_norm": 1.6207546569319238, "language_loss": 0.86899006, "learning_rate": 1.8687948219371363e-09, "loss": 0.89068246, "num_input_tokens_seen": 177124985, "step": 8205, "time_per_iteration": 2.4355862140655518 }, { "auxiliary_loss_clip": 0.01169421, "auxiliary_loss_mlp": 0.01025656, "balance_loss_clip": 1.04695034, "balance_loss_mlp": 1.01823783, "epoch": 0.9867131605843804, "flos": 21616679986560.0, "grad_norm": 1.9949169528993096, "language_loss": 0.88505238, "learning_rate": 1.835279646287491e-09, "loss": 0.90700316, "num_input_tokens_seen": 177142995, "step": 8206, "time_per_iteration": 2.3916306495666504 }, { "auxiliary_loss_clip": 0.01157556, "auxiliary_loss_mlp": 0.01029037, "balance_loss_clip": 1.04673064, "balance_loss_mlp": 1.02163994, "epoch": 0.9868334034750196, "flos": 22271403139200.0, "grad_norm": 1.8474755622821888, "language_loss": 0.76665807, "learning_rate": 1.8020675906371685e-09, "loss": 0.78852397, "num_input_tokens_seen": 177162390, "step": 8207, "time_per_iteration": 2.433635950088501 }, { "auxiliary_loss_clip": 0.01103672, "auxiliary_loss_mlp": 0.01022467, "balance_loss_clip": 1.03899336, "balance_loss_mlp": 1.01545727, "epoch": 0.9869536463656586, "flos": 25809573548160.0, "grad_norm": 2.0798946815339803, "language_loss": 0.75366139, "learning_rate": 1.7691586600243612e-09, "loss": 0.77492273, "num_input_tokens_seen": 177181290, "step": 8208, "time_per_iteration": 2.557999610900879 }, { "auxiliary_loss_clip": 0.01137628, "auxiliary_loss_mlp": 0.01024275, "balance_loss_clip": 1.04889417, "balance_loss_mlp": 1.01703644, "epoch": 0.9870738892562977, "flos": 16398500613120.0, "grad_norm": 5.2309106437645605, "language_loss": 0.86625189, "learning_rate": 1.7365528594415202e-09, "loss": 0.88787091, "num_input_tokens_seen": 177195360, "step": 8209, "time_per_iteration": 3.213785171508789 }, { "auxiliary_loss_clip": 0.01153724, "auxiliary_loss_mlp": 0.00762035, "balance_loss_clip": 1.04468632, "balance_loss_mlp": 1.00045824, "epoch": 0.9871941321469369, "flos": 35481358373760.0, "grad_norm": 9.45133272130327, "language_loss": 0.67614359, "learning_rate": 1.7042501938346888e-09, "loss": 0.69530118, "num_input_tokens_seen": 177218090, "step": 8210, "time_per_iteration": 2.5642542839050293 }, { "auxiliary_loss_clip": 0.01124516, "auxiliary_loss_mlp": 0.01021806, "balance_loss_clip": 1.03919399, "balance_loss_mlp": 1.01470113, "epoch": 0.9873143750375759, "flos": 21434217874560.0, "grad_norm": 1.975584357556349, "language_loss": 0.76455963, "learning_rate": 1.6722506681043913e-09, "loss": 0.7860229, "num_input_tokens_seen": 177237050, "step": 8211, "time_per_iteration": 2.476716995239258 }, { "auxiliary_loss_clip": 0.01140849, "auxiliary_loss_mlp": 0.01027585, "balance_loss_clip": 1.04503632, "balance_loss_mlp": 1.0207485, "epoch": 0.987434617928215, "flos": 16326499800960.0, "grad_norm": 2.979844503968214, "language_loss": 0.68847156, "learning_rate": 1.640554287104745e-09, "loss": 0.71015596, "num_input_tokens_seen": 177255325, "step": 8212, "time_per_iteration": 2.4656195640563965 }, { "auxiliary_loss_clip": 0.01123628, "auxiliary_loss_mlp": 0.01023534, "balance_loss_clip": 1.03853714, "balance_loss_mlp": 1.01629519, "epoch": 0.9875548608188541, "flos": 17851984456320.0, "grad_norm": 2.073369511657995, "language_loss": 0.79677486, "learning_rate": 1.609161055644348e-09, "loss": 0.81824642, "num_input_tokens_seen": 177271250, "step": 8213, "time_per_iteration": 2.4907920360565186 }, { "auxiliary_loss_clip": 0.01158763, "auxiliary_loss_mlp": 0.01023699, "balance_loss_clip": 1.04591489, "balance_loss_mlp": 1.01667762, "epoch": 0.9876751037094932, "flos": 26132876887680.0, "grad_norm": 2.0447305519179375, "language_loss": 0.68622923, "learning_rate": 1.5780709784849467e-09, "loss": 0.70805389, "num_input_tokens_seen": 177288270, "step": 8214, "time_per_iteration": 2.4699149131774902 }, { "auxiliary_loss_clip": 0.01100066, "auxiliary_loss_mlp": 0.01025625, "balance_loss_clip": 1.0441165, "balance_loss_mlp": 1.0185796, "epoch": 0.9877953466001322, "flos": 15991344973440.0, "grad_norm": 2.0068672353271833, "language_loss": 0.82483274, "learning_rate": 1.5472840603436565e-09, "loss": 0.8460896, "num_input_tokens_seen": 177305500, "step": 8215, "time_per_iteration": 2.543728828430176 }, { "auxiliary_loss_clip": 0.01141856, "auxiliary_loss_mlp": 0.01024319, "balance_loss_clip": 1.04618812, "balance_loss_mlp": 1.0172441, "epoch": 0.9879155894907714, "flos": 18806777827200.0, "grad_norm": 1.9405801073787239, "language_loss": 0.77950031, "learning_rate": 1.5168003058900757e-09, "loss": 0.80116206, "num_input_tokens_seen": 177323500, "step": 8216, "time_per_iteration": 2.4423890113830566 }, { "auxiliary_loss_clip": 0.01121892, "auxiliary_loss_mlp": 0.01026682, "balance_loss_clip": 1.04090095, "balance_loss_mlp": 1.01993763, "epoch": 0.9880358323814105, "flos": 22382044007040.0, "grad_norm": 2.0017821861851766, "language_loss": 0.9196167, "learning_rate": 1.4866197197491715e-09, "loss": 0.94110239, "num_input_tokens_seen": 177342860, "step": 8217, "time_per_iteration": 2.5106611251831055 }, { "auxiliary_loss_clip": 0.01155507, "auxiliary_loss_mlp": 0.0076251, "balance_loss_clip": 1.04641223, "balance_loss_mlp": 1.00047112, "epoch": 0.9881560752720495, "flos": 15668831733120.0, "grad_norm": 2.9338304579527676, "language_loss": 0.7857607, "learning_rate": 1.4567423064988371e-09, "loss": 0.80494094, "num_input_tokens_seen": 177360210, "step": 8218, "time_per_iteration": 2.408687114715576 }, { "auxiliary_loss_clip": 0.01165686, "auxiliary_loss_mlp": 0.01024608, "balance_loss_clip": 1.04594207, "balance_loss_mlp": 1.01744914, "epoch": 0.9882763181626887, "flos": 21500113374720.0, "grad_norm": 1.8667621797794887, "language_loss": 0.77782595, "learning_rate": 1.4271680706718913e-09, "loss": 0.79972887, "num_input_tokens_seen": 177377885, "step": 8219, "time_per_iteration": 2.394237518310547 }, { "auxiliary_loss_clip": 0.01155444, "auxiliary_loss_mlp": 0.01025989, "balance_loss_clip": 1.04795313, "balance_loss_mlp": 1.01845527, "epoch": 0.9883965610533277, "flos": 28034598551040.0, "grad_norm": 1.6049216300576692, "language_loss": 0.82374293, "learning_rate": 1.3978970167543013e-09, "loss": 0.84555721, "num_input_tokens_seen": 177398065, "step": 8220, "time_per_iteration": 2.5051913261413574 }, { "auxiliary_loss_clip": 0.01128108, "auxiliary_loss_mlp": 0.01025634, "balance_loss_clip": 1.04279625, "balance_loss_mlp": 1.01836777, "epoch": 0.9885168039439668, "flos": 14098601710080.0, "grad_norm": 2.4830200864720564, "language_loss": 0.7796874, "learning_rate": 1.3689291491867372e-09, "loss": 0.80122483, "num_input_tokens_seen": 177416380, "step": 8221, "time_per_iteration": 2.443425178527832 }, { "auxiliary_loss_clip": 0.01165878, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.04621661, "balance_loss_mlp": 1.02078021, "epoch": 0.988637046834606, "flos": 26432013352320.0, "grad_norm": 2.1511575434542562, "language_loss": 0.73536474, "learning_rate": 1.3402644723636836e-09, "loss": 0.75730836, "num_input_tokens_seen": 177438410, "step": 8222, "time_per_iteration": 2.483613967895508 }, { "auxiliary_loss_clip": 0.01131337, "auxiliary_loss_mlp": 0.01023972, "balance_loss_clip": 1.04553604, "balance_loss_mlp": 1.01703703, "epoch": 0.988757289725245, "flos": 25229113764480.0, "grad_norm": 1.9001244244780227, "language_loss": 0.83645672, "learning_rate": 1.311902990633218e-09, "loss": 0.85800976, "num_input_tokens_seen": 177457375, "step": 8223, "time_per_iteration": 2.4874227046966553 }, { "auxiliary_loss_clip": 0.01130236, "auxiliary_loss_mlp": 0.01019633, "balance_loss_clip": 1.04057741, "balance_loss_mlp": 1.01319551, "epoch": 0.9888775326158841, "flos": 26359042872960.0, "grad_norm": 1.5520314851969768, "language_loss": 0.71581125, "learning_rate": 1.2838447082978987e-09, "loss": 0.73730993, "num_input_tokens_seen": 177478530, "step": 8224, "time_per_iteration": 2.5411577224731445 }, { "auxiliary_loss_clip": 0.01146979, "auxiliary_loss_mlp": 0.01022409, "balance_loss_clip": 1.04357994, "balance_loss_mlp": 1.01494384, "epoch": 0.9889977755065231, "flos": 24316120846080.0, "grad_norm": 3.907379924692396, "language_loss": 0.83664012, "learning_rate": 1.2560896296143208e-09, "loss": 0.85833395, "num_input_tokens_seen": 177496995, "step": 8225, "time_per_iteration": 2.4394185543060303 }, { "auxiliary_loss_clip": 0.01165436, "auxiliary_loss_mlp": 0.01024812, "balance_loss_clip": 1.04679155, "balance_loss_mlp": 1.01777577, "epoch": 0.9891180183971623, "flos": 18951066760320.0, "grad_norm": 2.118567691603205, "language_loss": 0.82306933, "learning_rate": 1.2286377587926722e-09, "loss": 0.84497184, "num_input_tokens_seen": 177513785, "step": 8226, "time_per_iteration": 3.2136430740356445 }, { "auxiliary_loss_clip": 0.01163463, "auxiliary_loss_mlp": 0.01022611, "balance_loss_clip": 1.04537535, "balance_loss_mlp": 1.01529193, "epoch": 0.9892382612878013, "flos": 26176580760960.0, "grad_norm": 1.9185948701520856, "language_loss": 0.74751675, "learning_rate": 1.2014890999973992e-09, "loss": 0.76937747, "num_input_tokens_seen": 177530705, "step": 8227, "time_per_iteration": 2.427504062652588 }, { "auxiliary_loss_clip": 0.01162274, "auxiliary_loss_mlp": 0.01023804, "balance_loss_clip": 1.04502368, "balance_loss_mlp": 1.01721191, "epoch": 0.9893585041784404, "flos": 25449605400960.0, "grad_norm": 1.5633670665264983, "language_loss": 0.78335214, "learning_rate": 1.1746436573472073e-09, "loss": 0.80521291, "num_input_tokens_seen": 177552440, "step": 8228, "time_per_iteration": 3.2761571407318115 }, { "auxiliary_loss_clip": 0.01145328, "auxiliary_loss_mlp": 0.01024164, "balance_loss_clip": 1.04438376, "balance_loss_mlp": 1.01721108, "epoch": 0.9894787470690796, "flos": 20189302352640.0, "grad_norm": 1.9098205785811575, "language_loss": 0.691966, "learning_rate": 1.1481014349141726e-09, "loss": 0.71366096, "num_input_tokens_seen": 177569660, "step": 8229, "time_per_iteration": 2.446530818939209 }, { "auxiliary_loss_clip": 0.01136167, "auxiliary_loss_mlp": 0.01024777, "balance_loss_clip": 1.0436995, "balance_loss_mlp": 1.01718926, "epoch": 0.9895989899597186, "flos": 24644308435200.0, "grad_norm": 1.822870311681976, "language_loss": 0.84340888, "learning_rate": 1.121862436724852e-09, "loss": 0.86501831, "num_input_tokens_seen": 177588500, "step": 8230, "time_per_iteration": 3.2739226818084717 }, { "auxiliary_loss_clip": 0.01153257, "auxiliary_loss_mlp": 0.01026766, "balance_loss_clip": 1.04843354, "balance_loss_mlp": 1.01961923, "epoch": 0.9897192328503577, "flos": 21799034357760.0, "grad_norm": 1.7222976887879071, "language_loss": 0.70558274, "learning_rate": 1.0959266667598388e-09, "loss": 0.72738296, "num_input_tokens_seen": 177607315, "step": 8231, "time_per_iteration": 2.500082015991211 }, { "auxiliary_loss_clip": 0.01127318, "auxiliary_loss_mlp": 0.01026287, "balance_loss_clip": 1.04540443, "balance_loss_mlp": 1.01870525, "epoch": 0.9898394757409968, "flos": 21325229032320.0, "grad_norm": 1.9228778199221135, "language_loss": 0.74942732, "learning_rate": 1.0702941289533196e-09, "loss": 0.77096337, "num_input_tokens_seen": 177625990, "step": 8232, "time_per_iteration": 2.558720827102661 }, { "auxiliary_loss_clip": 0.01120073, "auxiliary_loss_mlp": 0.01024423, "balance_loss_clip": 1.04184389, "balance_loss_mlp": 1.0182476, "epoch": 0.9899597186316359, "flos": 18545024442240.0, "grad_norm": 2.152609881846572, "language_loss": 0.88654989, "learning_rate": 1.0449648271939615e-09, "loss": 0.90799487, "num_input_tokens_seen": 177642335, "step": 8233, "time_per_iteration": 2.496023654937744 }, { "auxiliary_loss_clip": 0.01116254, "auxiliary_loss_mlp": 0.00762187, "balance_loss_clip": 1.04606092, "balance_loss_mlp": 1.00047374, "epoch": 0.990079961522275, "flos": 23766723348480.0, "grad_norm": 1.4953693311670235, "language_loss": 0.72706974, "learning_rate": 1.0199387653240243e-09, "loss": 0.7458542, "num_input_tokens_seen": 177662025, "step": 8234, "time_per_iteration": 2.562835454940796 }, { "auxiliary_loss_clip": 0.01131009, "auxiliary_loss_mlp": 0.01021564, "balance_loss_clip": 1.04393375, "balance_loss_mlp": 1.01464093, "epoch": 0.9902002044129141, "flos": 16399182971520.0, "grad_norm": 1.671507168851399, "language_loss": 0.70949006, "learning_rate": 9.952159471400267e-10, "loss": 0.73101586, "num_input_tokens_seen": 177679065, "step": 8235, "time_per_iteration": 3.1519935131073 }, { "auxiliary_loss_clip": 0.01153063, "auxiliary_loss_mlp": 0.00761591, "balance_loss_clip": 1.04518795, "balance_loss_mlp": 1.0004065, "epoch": 0.9903204473035532, "flos": 22559657783040.0, "grad_norm": 1.8935456654621574, "language_loss": 0.84576666, "learning_rate": 9.707963763923022e-10, "loss": 0.86491323, "num_input_tokens_seen": 177698115, "step": 8236, "time_per_iteration": 2.517108917236328 }, { "auxiliary_loss_clip": 0.01132951, "auxiliary_loss_mlp": 0.01021913, "balance_loss_clip": 1.04107857, "balance_loss_mlp": 1.01476312, "epoch": 0.9904406901941922, "flos": 16144001775360.0, "grad_norm": 1.9080028037985008, "language_loss": 0.79011548, "learning_rate": 9.466800567854427e-10, "loss": 0.8116641, "num_input_tokens_seen": 177716715, "step": 8237, "time_per_iteration": 2.46001935005188 }, { "auxiliary_loss_clip": 0.0112425, "auxiliary_loss_mlp": 0.0102061, "balance_loss_clip": 1.03991556, "balance_loss_mlp": 1.01293325, "epoch": 0.9905609330848314, "flos": 26651499408000.0, "grad_norm": 3.4816435553339224, "language_loss": 0.68279338, "learning_rate": 9.228669919778553e-10, "loss": 0.70424193, "num_input_tokens_seen": 177735640, "step": 8238, "time_per_iteration": 2.5612010955810547 }, { "auxiliary_loss_clip": 0.01131846, "auxiliary_loss_mlp": 0.01027475, "balance_loss_clip": 1.04183245, "balance_loss_mlp": 1.01966715, "epoch": 0.9906811759754705, "flos": 23111820627840.0, "grad_norm": 2.039818090992299, "language_loss": 0.79856187, "learning_rate": 8.993571855817617e-10, "loss": 0.82015508, "num_input_tokens_seen": 177754470, "step": 8239, "time_per_iteration": 2.4819586277008057 }, { "auxiliary_loss_clip": 0.01151675, "auxiliary_loss_mlp": 0.01023849, "balance_loss_clip": 1.04525828, "balance_loss_mlp": 1.01685715, "epoch": 0.9908014188661095, "flos": 22090593052800.0, "grad_norm": 3.4560413166188786, "language_loss": 0.751387, "learning_rate": 8.761506411638642e-10, "loss": 0.77314222, "num_input_tokens_seen": 177773935, "step": 8240, "time_per_iteration": 2.5067315101623535 }, { "auxiliary_loss_clip": 0.0113594, "auxiliary_loss_mlp": 0.01030578, "balance_loss_clip": 1.04533064, "balance_loss_mlp": 1.02366042, "epoch": 0.9909216617567487, "flos": 19242948677760.0, "grad_norm": 1.7315299882238075, "language_loss": 0.73853493, "learning_rate": 8.53247362244236e-10, "loss": 0.76020014, "num_input_tokens_seen": 177792745, "step": 8241, "time_per_iteration": 2.50022292137146 }, { "auxiliary_loss_clip": 0.01138599, "auxiliary_loss_mlp": 0.01023076, "balance_loss_clip": 1.04534411, "balance_loss_mlp": 1.01628959, "epoch": 0.9910419046473877, "flos": 23621213352960.0, "grad_norm": 1.728782356122078, "language_loss": 0.68237358, "learning_rate": 8.306473522976532e-10, "loss": 0.70399034, "num_input_tokens_seen": 177812150, "step": 8242, "time_per_iteration": 2.5199990272521973 }, { "auxiliary_loss_clip": 0.01165397, "auxiliary_loss_mlp": 0.01019958, "balance_loss_clip": 1.0478332, "balance_loss_mlp": 1.01259971, "epoch": 0.9911621475380268, "flos": 22711380831360.0, "grad_norm": 1.7422668945760342, "language_loss": 0.71756887, "learning_rate": 8.083506147522623e-10, "loss": 0.73942244, "num_input_tokens_seen": 177831545, "step": 8243, "time_per_iteration": 2.4368674755096436 }, { "auxiliary_loss_clip": 0.01144136, "auxiliary_loss_mlp": 0.01026857, "balance_loss_clip": 1.04296041, "balance_loss_mlp": 1.01975846, "epoch": 0.991282390428666, "flos": 13516956777600.0, "grad_norm": 2.071491621034792, "language_loss": 0.853513, "learning_rate": 7.863571529906909e-10, "loss": 0.87522292, "num_input_tokens_seen": 177847130, "step": 8244, "time_per_iteration": 2.428632974624634 }, { "auxiliary_loss_clip": 0.01034688, "auxiliary_loss_mlp": 0.0100149, "balance_loss_clip": 1.00752044, "balance_loss_mlp": 1.00058353, "epoch": 0.991402633319305, "flos": 61830492071040.0, "grad_norm": 0.7218832933836399, "language_loss": 0.59645915, "learning_rate": 7.646669703489372e-10, "loss": 0.61682093, "num_input_tokens_seen": 177911440, "step": 8245, "time_per_iteration": 3.1910464763641357 }, { "auxiliary_loss_clip": 0.01086141, "auxiliary_loss_mlp": 0.01021818, "balance_loss_clip": 1.04061437, "balance_loss_mlp": 1.01485324, "epoch": 0.9915228762099441, "flos": 18770148933120.0, "grad_norm": 1.8423509444893138, "language_loss": 0.57068819, "learning_rate": 7.432800701177023e-10, "loss": 0.59176779, "num_input_tokens_seen": 177929440, "step": 8246, "time_per_iteration": 2.581807851791382 }, { "auxiliary_loss_clip": 0.01043244, "auxiliary_loss_mlp": 0.01001444, "balance_loss_clip": 1.00897479, "balance_loss_mlp": 1.00056732, "epoch": 0.9916431191005832, "flos": 65936660244480.0, "grad_norm": 0.7936290014642606, "language_loss": 0.57828838, "learning_rate": 7.221964555415017e-10, "loss": 0.59873521, "num_input_tokens_seen": 177989100, "step": 8247, "time_per_iteration": 3.0172791481018066 }, { "auxiliary_loss_clip": 0.01135196, "auxiliary_loss_mlp": 0.01019479, "balance_loss_clip": 1.04408693, "balance_loss_mlp": 1.01303303, "epoch": 0.9917633619912223, "flos": 16581573256320.0, "grad_norm": 1.7519676958283414, "language_loss": 0.75069213, "learning_rate": 7.01416129818222e-10, "loss": 0.77223885, "num_input_tokens_seen": 178006720, "step": 8248, "time_per_iteration": 2.4533557891845703 }, { "auxiliary_loss_clip": 0.01129357, "auxiliary_loss_mlp": 0.01024999, "balance_loss_clip": 1.0436641, "balance_loss_mlp": 1.01781344, "epoch": 0.9918836048818613, "flos": 25411108999680.0, "grad_norm": 1.824356976086391, "language_loss": 0.58724999, "learning_rate": 6.809390961006745e-10, "loss": 0.6087935, "num_input_tokens_seen": 178026850, "step": 8249, "time_per_iteration": 2.563532590866089 }, { "auxiliary_loss_clip": 0.0113507, "auxiliary_loss_mlp": 0.01027173, "balance_loss_clip": 1.04441929, "balance_loss_mlp": 1.02013946, "epoch": 0.9920038477725005, "flos": 25046867134080.0, "grad_norm": 1.8895234176989288, "language_loss": 0.68826032, "learning_rate": 6.607653574948191e-10, "loss": 0.70988274, "num_input_tokens_seen": 178047630, "step": 8250, "time_per_iteration": 2.498539686203003 }, { "auxiliary_loss_clip": 0.01142, "auxiliary_loss_mlp": 0.01024422, "balance_loss_clip": 1.04175401, "balance_loss_mlp": 1.01758504, "epoch": 0.9921240906631396, "flos": 21829773421440.0, "grad_norm": 2.000025237290295, "language_loss": 0.81559461, "learning_rate": 6.408949170613187e-10, "loss": 0.83725882, "num_input_tokens_seen": 178066895, "step": 8251, "time_per_iteration": 2.4991910457611084 }, { "auxiliary_loss_clip": 0.01135389, "auxiliary_loss_mlp": 0.01024816, "balance_loss_clip": 1.04371572, "balance_loss_mlp": 1.01703203, "epoch": 0.9922443335537786, "flos": 24864225454080.0, "grad_norm": 2.615890616532026, "language_loss": 0.81753099, "learning_rate": 6.213277778144288e-10, "loss": 0.83913302, "num_input_tokens_seen": 178088540, "step": 8252, "time_per_iteration": 2.5331714153289795 }, { "auxiliary_loss_clip": 0.0109828, "auxiliary_loss_mlp": 0.01027662, "balance_loss_clip": 1.03925061, "balance_loss_mlp": 1.02021778, "epoch": 0.9923645764444178, "flos": 21613088626560.0, "grad_norm": 2.0226087257790306, "language_loss": 0.6694026, "learning_rate": 6.020639427224416e-10, "loss": 0.69066203, "num_input_tokens_seen": 178106185, "step": 8253, "time_per_iteration": 3.434929847717285 }, { "auxiliary_loss_clip": 0.01136079, "auxiliary_loss_mlp": 0.01021989, "balance_loss_clip": 1.04462194, "balance_loss_mlp": 1.01495004, "epoch": 0.9924848193350568, "flos": 25001798544000.0, "grad_norm": 2.0996007644077106, "language_loss": 0.72443128, "learning_rate": 5.831034147076864e-10, "loss": 0.74601185, "num_input_tokens_seen": 178123435, "step": 8254, "time_per_iteration": 2.504014730453491 }, { "auxiliary_loss_clip": 0.01049314, "auxiliary_loss_mlp": 0.01000677, "balance_loss_clip": 1.00832915, "balance_loss_mlp": 0.9997946, "epoch": 0.9926050622256959, "flos": 68912543151360.0, "grad_norm": 0.6875357353503534, "language_loss": 0.55787557, "learning_rate": 5.644461966463065e-10, "loss": 0.57837552, "num_input_tokens_seen": 178191045, "step": 8255, "time_per_iteration": 4.022282361984253 }, { "auxiliary_loss_clip": 0.01136333, "auxiliary_loss_mlp": 0.0102066, "balance_loss_clip": 1.0449394, "balance_loss_mlp": 1.01431227, "epoch": 0.9927253051163349, "flos": 20923675914240.0, "grad_norm": 1.7225020069692987, "language_loss": 0.75699162, "learning_rate": 5.460922913687049e-10, "loss": 0.77856159, "num_input_tokens_seen": 178210135, "step": 8256, "time_per_iteration": 3.2249083518981934 }, { "auxiliary_loss_clip": 0.01107072, "auxiliary_loss_mlp": 0.00762176, "balance_loss_clip": 1.03843856, "balance_loss_mlp": 1.00043488, "epoch": 0.9928455480069741, "flos": 22308211601280.0, "grad_norm": 2.286550897856297, "language_loss": 0.74978691, "learning_rate": 5.280417016593208e-10, "loss": 0.76847941, "num_input_tokens_seen": 178229925, "step": 8257, "time_per_iteration": 2.6291749477386475 }, { "auxiliary_loss_clip": 0.01148363, "auxiliary_loss_mlp": 0.00760879, "balance_loss_clip": 1.04695773, "balance_loss_mlp": 1.00039744, "epoch": 0.9929657908976132, "flos": 17383889393280.0, "grad_norm": 1.732674229424684, "language_loss": 0.74804699, "learning_rate": 5.102944302559642e-10, "loss": 0.76713943, "num_input_tokens_seen": 178247420, "step": 8258, "time_per_iteration": 2.442061424255371 }, { "auxiliary_loss_clip": 0.01104334, "auxiliary_loss_mlp": 0.01026245, "balance_loss_clip": 1.04263341, "balance_loss_mlp": 1.01882112, "epoch": 0.9930860337882522, "flos": 22674680110080.0, "grad_norm": 3.710278410150471, "language_loss": 0.79882371, "learning_rate": 4.9285047985137e-10, "loss": 0.82012951, "num_input_tokens_seen": 178266840, "step": 8259, "time_per_iteration": 2.6384880542755127 }, { "auxiliary_loss_clip": 0.01153456, "auxiliary_loss_mlp": 0.01028022, "balance_loss_clip": 1.04535997, "balance_loss_mlp": 1.02107477, "epoch": 0.9932062766788914, "flos": 28147789284480.0, "grad_norm": 1.6430070788512317, "language_loss": 0.74637204, "learning_rate": 4.757098530916436e-10, "loss": 0.76818681, "num_input_tokens_seen": 178287285, "step": 8260, "time_per_iteration": 2.5147809982299805 }, { "auxiliary_loss_clip": 0.01156336, "auxiliary_loss_mlp": 0.01025337, "balance_loss_clip": 1.04876614, "balance_loss_mlp": 1.0181278, "epoch": 0.9933265195695304, "flos": 20156659868160.0, "grad_norm": 3.782602774906576, "language_loss": 0.77083755, "learning_rate": 4.5887255257670563e-10, "loss": 0.79265428, "num_input_tokens_seen": 178304325, "step": 8261, "time_per_iteration": 2.4801785945892334 }, { "auxiliary_loss_clip": 0.01166238, "auxiliary_loss_mlp": 0.01027377, "balance_loss_clip": 1.04711235, "balance_loss_mlp": 1.0200752, "epoch": 0.9934467624601695, "flos": 21362037494400.0, "grad_norm": 2.6216314162661725, "language_loss": 0.7667163, "learning_rate": 4.4233858086117906e-10, "loss": 0.78865242, "num_input_tokens_seen": 178322850, "step": 8262, "time_per_iteration": 3.1451010704040527 }, { "auxiliary_loss_clip": 0.0110727, "auxiliary_loss_mlp": 0.01025509, "balance_loss_clip": 1.0444746, "balance_loss_mlp": 1.01830852, "epoch": 0.9935670053508087, "flos": 19756040503680.0, "grad_norm": 2.265969735330147, "language_loss": 0.67954516, "learning_rate": 4.261079404528356e-10, "loss": 0.70087296, "num_input_tokens_seen": 178342330, "step": 8263, "time_per_iteration": 2.5545506477355957 }, { "auxiliary_loss_clip": 0.01148542, "auxiliary_loss_mlp": 0.01020643, "balance_loss_clip": 1.04423273, "balance_loss_mlp": 1.01347888, "epoch": 0.9936872482414477, "flos": 21978838863360.0, "grad_norm": 3.2466274623058737, "language_loss": 0.69114614, "learning_rate": 4.1018063381437205e-10, "loss": 0.71283805, "num_input_tokens_seen": 178362715, "step": 8264, "time_per_iteration": 2.4658918380737305 }, { "auxiliary_loss_clip": 0.01047146, "auxiliary_loss_mlp": 0.01002243, "balance_loss_clip": 1.01047111, "balance_loss_mlp": 1.00147402, "epoch": 0.9938074911320868, "flos": 69810667839360.0, "grad_norm": 0.8778695489876152, "language_loss": 0.61184144, "learning_rate": 3.9455666336141167e-10, "loss": 0.63233531, "num_input_tokens_seen": 178426495, "step": 8265, "time_per_iteration": 3.1018972396850586 }, { "auxiliary_loss_clip": 0.01164182, "auxiliary_loss_mlp": 0.01023043, "balance_loss_clip": 1.04696429, "balance_loss_mlp": 1.01627243, "epoch": 0.9939277340227259, "flos": 15084170058240.0, "grad_norm": 2.5755021350693985, "language_loss": 0.83323324, "learning_rate": 3.7923603146450267e-10, "loss": 0.85510552, "num_input_tokens_seen": 178442555, "step": 8266, "time_per_iteration": 2.384077787399292 }, { "auxiliary_loss_clip": 0.01123074, "auxiliary_loss_mlp": 0.01024825, "balance_loss_clip": 1.03999043, "balance_loss_mlp": 1.01764226, "epoch": 0.994047976913365, "flos": 17712364291200.0, "grad_norm": 1.9986799115018, "language_loss": 0.81031752, "learning_rate": 3.642187404473418e-10, "loss": 0.83179653, "num_input_tokens_seen": 178460715, "step": 8267, "time_per_iteration": 2.5039939880371094 }, { "auxiliary_loss_clip": 0.01150662, "auxiliary_loss_mlp": 0.01020048, "balance_loss_clip": 1.0439105, "balance_loss_mlp": 1.0130173, "epoch": 0.994168219804004, "flos": 19171558396800.0, "grad_norm": 2.4422687537614896, "language_loss": 0.86333156, "learning_rate": 3.495047925885508e-10, "loss": 0.88503867, "num_input_tokens_seen": 178479050, "step": 8268, "time_per_iteration": 2.4145679473876953 }, { "auxiliary_loss_clip": 0.01135162, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 1.04222631, "balance_loss_mlp": 1.0183835, "epoch": 0.9942884626946432, "flos": 17851589406720.0, "grad_norm": 3.280059971402832, "language_loss": 0.82468426, "learning_rate": 3.350941901199e-10, "loss": 0.84628963, "num_input_tokens_seen": 178495970, "step": 8269, "time_per_iteration": 2.4775407314300537 }, { "auxiliary_loss_clip": 0.01139904, "auxiliary_loss_mlp": 0.01024556, "balance_loss_clip": 1.04442763, "balance_loss_mlp": 1.01753485, "epoch": 0.9944087055852823, "flos": 18796578364800.0, "grad_norm": 2.8330819347656204, "language_loss": 0.83172882, "learning_rate": 3.2098693522764066e-10, "loss": 0.85337341, "num_input_tokens_seen": 178509170, "step": 8270, "time_per_iteration": 2.444460868835449 }, { "auxiliary_loss_clip": 0.01143707, "auxiliary_loss_mlp": 0.00761732, "balance_loss_clip": 1.04585123, "balance_loss_mlp": 1.00047684, "epoch": 0.9945289484759213, "flos": 20996969616000.0, "grad_norm": 1.8870604872235957, "language_loss": 0.81237614, "learning_rate": 3.071830300516165e-10, "loss": 0.83143055, "num_input_tokens_seen": 178527000, "step": 8271, "time_per_iteration": 2.5649683475494385 }, { "auxiliary_loss_clip": 0.01155754, "auxiliary_loss_mlp": 0.01028625, "balance_loss_clip": 1.04585576, "balance_loss_mlp": 1.02126396, "epoch": 0.9946491913665605, "flos": 14756952136320.0, "grad_norm": 2.946529532764987, "language_loss": 0.70839703, "learning_rate": 2.9368247668615234e-10, "loss": 0.73024082, "num_input_tokens_seen": 178545590, "step": 8272, "time_per_iteration": 2.434903621673584 }, { "auxiliary_loss_clip": 0.01169771, "auxiliary_loss_mlp": 0.01026383, "balance_loss_clip": 1.04889834, "balance_loss_mlp": 1.01915324, "epoch": 0.9947694342571995, "flos": 12669931186560.0, "grad_norm": 2.834484945376903, "language_loss": 0.61655366, "learning_rate": 2.804852771789434e-10, "loss": 0.63851517, "num_input_tokens_seen": 178558890, "step": 8273, "time_per_iteration": 2.3944907188415527 }, { "auxiliary_loss_clip": 0.01162144, "auxiliary_loss_mlp": 0.01023541, "balance_loss_clip": 1.04530811, "balance_loss_mlp": 1.01653194, "epoch": 0.9948896771478386, "flos": 18843442634880.0, "grad_norm": 1.7999877857802915, "language_loss": 0.55636847, "learning_rate": 2.675914335321661e-10, "loss": 0.57822537, "num_input_tokens_seen": 178577645, "step": 8274, "time_per_iteration": 2.3823635578155518 }, { "auxiliary_loss_clip": 0.01157135, "auxiliary_loss_mlp": 0.01026259, "balance_loss_clip": 1.0468781, "balance_loss_mlp": 1.01828969, "epoch": 0.9950099200384778, "flos": 24900207903360.0, "grad_norm": 2.2975960007290586, "language_loss": 0.79673886, "learning_rate": 2.550009477018111e-10, "loss": 0.81857276, "num_input_tokens_seen": 178596415, "step": 8275, "time_per_iteration": 2.5290284156799316 }, { "auxiliary_loss_clip": 0.01137668, "auxiliary_loss_mlp": 0.00762007, "balance_loss_clip": 1.04595709, "balance_loss_mlp": 1.00047469, "epoch": 0.9951301629291168, "flos": 23733613987200.0, "grad_norm": 2.1993405447889796, "language_loss": 0.62824106, "learning_rate": 2.4271382159790634e-10, "loss": 0.64723784, "num_input_tokens_seen": 178613845, "step": 8276, "time_per_iteration": 2.50490665435791 }, { "auxiliary_loss_clip": 0.01107082, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.04100072, "balance_loss_mlp": 1.02220547, "epoch": 0.9952504058197559, "flos": 22236893147520.0, "grad_norm": 1.7755456709311237, "language_loss": 0.85643125, "learning_rate": 2.3073005708429406e-10, "loss": 0.87779218, "num_input_tokens_seen": 178633490, "step": 8277, "time_per_iteration": 2.58384370803833 }, { "auxiliary_loss_clip": 0.01122953, "auxiliary_loss_mlp": 0.01020328, "balance_loss_clip": 1.04578829, "balance_loss_mlp": 1.01383066, "epoch": 0.995370648710395, "flos": 21211032718080.0, "grad_norm": 1.7508562613524354, "language_loss": 0.72135162, "learning_rate": 2.190496559788535e-10, "loss": 0.74278438, "num_input_tokens_seen": 178651775, "step": 8278, "time_per_iteration": 2.5453786849975586 }, { "auxiliary_loss_clip": 0.01135067, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.04354143, "balance_loss_mlp": 1.02114177, "epoch": 0.9954908916010341, "flos": 14866731077760.0, "grad_norm": 2.746550380378938, "language_loss": 0.76418847, "learning_rate": 2.0767262005372265e-10, "loss": 0.78581727, "num_input_tokens_seen": 178669290, "step": 8279, "time_per_iteration": 3.3816354274749756 }, { "auxiliary_loss_clip": 0.01129969, "auxiliary_loss_mlp": 0.01023569, "balance_loss_clip": 1.04281509, "balance_loss_mlp": 1.01700974, "epoch": 0.9956111344916732, "flos": 19208259118080.0, "grad_norm": 2.048887141908911, "language_loss": 0.74988109, "learning_rate": 1.965989510346322e-10, "loss": 0.77141643, "num_input_tokens_seen": 178688410, "step": 8280, "time_per_iteration": 2.513462543487549 }, { "auxiliary_loss_clip": 0.01105228, "auxiliary_loss_mlp": 0.01026597, "balance_loss_clip": 1.04150772, "balance_loss_mlp": 1.01866937, "epoch": 0.9957313773823123, "flos": 20047060494720.0, "grad_norm": 1.9790593245790244, "language_loss": 0.71111608, "learning_rate": 1.8582865060134955e-10, "loss": 0.73243439, "num_input_tokens_seen": 178706600, "step": 8281, "time_per_iteration": 3.2828047275543213 }, { "auxiliary_loss_clip": 0.01060818, "auxiliary_loss_mlp": 0.0100096, "balance_loss_clip": 1.0073421, "balance_loss_mlp": 1.00010133, "epoch": 0.9958516202729514, "flos": 57483253768320.0, "grad_norm": 0.7859882120812638, "language_loss": 0.557284, "learning_rate": 1.7536172038790098e-10, "loss": 0.57790178, "num_input_tokens_seen": 178766910, "step": 8282, "time_per_iteration": 3.0593149662017822 }, { "auxiliary_loss_clip": 0.01137493, "auxiliary_loss_mlp": 0.01020369, "balance_loss_clip": 1.04611886, "balance_loss_mlp": 1.01371384, "epoch": 0.9959718631635904, "flos": 27782900974080.0, "grad_norm": 2.007162501087017, "language_loss": 0.69224477, "learning_rate": 1.651981619819054e-10, "loss": 0.71382344, "num_input_tokens_seen": 178784060, "step": 8283, "time_per_iteration": 3.3584096431732178 }, { "auxiliary_loss_clip": 0.01113905, "auxiliary_loss_mlp": 0.01023593, "balance_loss_clip": 1.04202104, "balance_loss_mlp": 1.0162971, "epoch": 0.9960921060542296, "flos": 24024095274240.0, "grad_norm": 2.3053812037058274, "language_loss": 0.71121198, "learning_rate": 1.5533797692546257e-10, "loss": 0.73258698, "num_input_tokens_seen": 178802795, "step": 8284, "time_per_iteration": 2.545639753341675 }, { "auxiliary_loss_clip": 0.01147523, "auxiliary_loss_mlp": 0.01019912, "balance_loss_clip": 1.0439508, "balance_loss_mlp": 1.01255369, "epoch": 0.9962123489448687, "flos": 18697393935360.0, "grad_norm": 1.9730891523720424, "language_loss": 0.84074718, "learning_rate": 1.4578116671404296e-10, "loss": 0.86242151, "num_input_tokens_seen": 178821075, "step": 8285, "time_per_iteration": 2.429304599761963 }, { "auxiliary_loss_clip": 0.01147004, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.04730105, "balance_loss_mlp": 1.02129698, "epoch": 0.9963325918355077, "flos": 20010754823040.0, "grad_norm": 2.318298631824008, "language_loss": 0.71324831, "learning_rate": 1.3652773279759777e-10, "loss": 0.7350018, "num_input_tokens_seen": 178837725, "step": 8286, "time_per_iteration": 2.428987503051758 }, { "auxiliary_loss_clip": 0.01149722, "auxiliary_loss_mlp": 0.0102428, "balance_loss_clip": 1.04419994, "balance_loss_mlp": 1.01704144, "epoch": 0.9964528347261468, "flos": 33108488991360.0, "grad_norm": 2.262626396535188, "language_loss": 0.62857044, "learning_rate": 1.2757767657989305e-10, "loss": 0.65031046, "num_input_tokens_seen": 178861515, "step": 8287, "time_per_iteration": 2.546614408493042 }, { "auxiliary_loss_clip": 0.01149698, "auxiliary_loss_mlp": 0.0101993, "balance_loss_clip": 1.04598641, "balance_loss_mlp": 1.01333213, "epoch": 0.9965730776167859, "flos": 23109342589440.0, "grad_norm": 1.900974434634456, "language_loss": 0.87091804, "learning_rate": 1.1893099941850948e-10, "loss": 0.89261436, "num_input_tokens_seen": 178880410, "step": 8288, "time_per_iteration": 2.4448535442352295 }, { "auxiliary_loss_clip": 0.01140624, "auxiliary_loss_mlp": 0.01024703, "balance_loss_clip": 1.04210591, "balance_loss_mlp": 1.01733637, "epoch": 0.996693320507425, "flos": 22965843755520.0, "grad_norm": 1.9484440571806665, "language_loss": 0.77200848, "learning_rate": 1.105877026252866e-10, "loss": 0.79366171, "num_input_tokens_seen": 178898740, "step": 8289, "time_per_iteration": 3.2521162033081055 }, { "auxiliary_loss_clip": 0.01165879, "auxiliary_loss_mlp": 0.0102603, "balance_loss_clip": 1.04586184, "balance_loss_mlp": 1.01809692, "epoch": 0.996813563398064, "flos": 13222740476160.0, "grad_norm": 1.9755491691134537, "language_loss": 0.71674532, "learning_rate": 1.0254778746565663e-10, "loss": 0.73866439, "num_input_tokens_seen": 178914015, "step": 8290, "time_per_iteration": 2.352820634841919 }, { "auxiliary_loss_clip": 0.01121647, "auxiliary_loss_mlp": 0.01026668, "balance_loss_clip": 1.04423678, "balance_loss_mlp": 1.01964045, "epoch": 0.9969338062887032, "flos": 14647855553280.0, "grad_norm": 2.1297054866249323, "language_loss": 0.7353223, "learning_rate": 9.481125515953259e-11, "loss": 0.75680548, "num_input_tokens_seen": 178932075, "step": 8291, "time_per_iteration": 2.52404522895813 }, { "auxiliary_loss_clip": 0.01108891, "auxiliary_loss_mlp": 0.01023996, "balance_loss_clip": 1.03809643, "balance_loss_mlp": 1.01663804, "epoch": 0.9970540491793423, "flos": 25735741142400.0, "grad_norm": 1.607387266526498, "language_loss": 0.79856205, "learning_rate": 8.737810688064228e-11, "loss": 0.81989086, "num_input_tokens_seen": 178951910, "step": 8292, "time_per_iteration": 2.5697712898254395 }, { "auxiliary_loss_clip": 0.01120101, "auxiliary_loss_mlp": 0.01030636, "balance_loss_clip": 1.04326797, "balance_loss_mlp": 1.02142727, "epoch": 0.9971742920699813, "flos": 21470236237440.0, "grad_norm": 2.0263255259910165, "language_loss": 0.79260868, "learning_rate": 8.024834375608414e-11, "loss": 0.814116, "num_input_tokens_seen": 178970500, "step": 8293, "time_per_iteration": 2.500955820083618 }, { "auxiliary_loss_clip": 0.01060692, "auxiliary_loss_mlp": 0.01000356, "balance_loss_clip": 1.00724888, "balance_loss_mlp": 0.99950927, "epoch": 0.9972945349606205, "flos": 72211223629440.0, "grad_norm": 0.8280346749141044, "language_loss": 0.6286484, "learning_rate": 7.342196686788149e-11, "loss": 0.64925897, "num_input_tokens_seen": 179023665, "step": 8294, "time_per_iteration": 2.9057698249816895 }, { "auxiliary_loss_clip": 0.01137953, "auxiliary_loss_mlp": 0.01026779, "balance_loss_clip": 1.04893553, "balance_loss_mlp": 1.01976371, "epoch": 0.9974147778512595, "flos": 19678293515520.0, "grad_norm": 1.9549901843514716, "language_loss": 0.69124424, "learning_rate": 6.689897725142834e-11, "loss": 0.71289158, "num_input_tokens_seen": 179043140, "step": 8295, "time_per_iteration": 2.4749598503112793 }, { "auxiliary_loss_clip": 0.01138681, "auxiliary_loss_mlp": 0.0102738, "balance_loss_clip": 1.04306006, "balance_loss_mlp": 1.020257, "epoch": 0.9975350207418986, "flos": 15960821391360.0, "grad_norm": 2.2458612826183146, "language_loss": 0.89006549, "learning_rate": 6.067937589615545e-11, "loss": 0.91172612, "num_input_tokens_seen": 179061215, "step": 8296, "time_per_iteration": 2.4623379707336426 }, { "auxiliary_loss_clip": 0.01044403, "auxiliary_loss_mlp": 0.01001086, "balance_loss_clip": 1.00851381, "balance_loss_mlp": 1.00013244, "epoch": 0.9976552636325378, "flos": 59961879768960.0, "grad_norm": 0.7528782873822731, "language_loss": 0.57673573, "learning_rate": 5.476316374575241e-11, "loss": 0.59719062, "num_input_tokens_seen": 179124700, "step": 8297, "time_per_iteration": 3.042626142501831 }, { "auxiliary_loss_clip": 0.011653, "auxiliary_loss_mlp": 0.01024709, "balance_loss_clip": 1.04595232, "balance_loss_mlp": 1.01693606, "epoch": 0.9977755065231768, "flos": 22487872452480.0, "grad_norm": 1.8741773739445933, "language_loss": 0.7285217, "learning_rate": 4.9150341697723476e-11, "loss": 0.7504217, "num_input_tokens_seen": 179144590, "step": 8298, "time_per_iteration": 2.4386112689971924 }, { "auxiliary_loss_clip": 0.01134111, "auxiliary_loss_mlp": 0.01029019, "balance_loss_clip": 1.04580164, "balance_loss_mlp": 1.02175307, "epoch": 0.9978957494138159, "flos": 26030280666240.0, "grad_norm": 1.5686091087729772, "language_loss": 0.66557115, "learning_rate": 4.384091060338768e-11, "loss": 0.68720245, "num_input_tokens_seen": 179165060, "step": 8299, "time_per_iteration": 2.5679140090942383 }, { "auxiliary_loss_clip": 0.01147877, "auxiliary_loss_mlp": 0.01024976, "balance_loss_clip": 1.04495907, "balance_loss_mlp": 1.01781726, "epoch": 0.998015992304455, "flos": 22637835734400.0, "grad_norm": 2.3049236335459375, "language_loss": 0.73332906, "learning_rate": 3.883487126810081e-11, "loss": 0.75505757, "num_input_tokens_seen": 179184320, "step": 8300, "time_per_iteration": 2.5333354473114014 }, { "auxiliary_loss_clip": 0.01140805, "auxiliary_loss_mlp": 0.01022428, "balance_loss_clip": 1.0424962, "balance_loss_mlp": 1.01512599, "epoch": 0.9981362351950941, "flos": 18223444955520.0, "grad_norm": 1.7308427971709435, "language_loss": 0.79488659, "learning_rate": 3.41322244516995e-11, "loss": 0.8165189, "num_input_tokens_seen": 179202265, "step": 8301, "time_per_iteration": 2.470193386077881 }, { "auxiliary_loss_clip": 0.01097787, "auxiliary_loss_mlp": 0.0102471, "balance_loss_clip": 1.04121625, "balance_loss_mlp": 1.0175364, "epoch": 0.9982564780857331, "flos": 33474095573760.0, "grad_norm": 1.6369820164721491, "language_loss": 0.63092965, "learning_rate": 2.9732970866946925e-11, "loss": 0.65215462, "num_input_tokens_seen": 179222145, "step": 8302, "time_per_iteration": 2.629645347595215 }, { "auxiliary_loss_clip": 0.0110876, "auxiliary_loss_mlp": 0.01023539, "balance_loss_clip": 1.03783226, "balance_loss_mlp": 1.01604652, "epoch": 0.9983767209763723, "flos": 15523465392000.0, "grad_norm": 2.0318882216207843, "language_loss": 0.78223205, "learning_rate": 2.563711118175327e-11, "loss": 0.80355501, "num_input_tokens_seen": 179239030, "step": 8303, "time_per_iteration": 2.491415023803711 }, { "auxiliary_loss_clip": 0.01119852, "auxiliary_loss_mlp": 0.01024339, "balance_loss_clip": 1.04409909, "balance_loss_mlp": 1.01711535, "epoch": 0.9984969638670114, "flos": 19974377324160.0, "grad_norm": 1.7846632600815349, "language_loss": 0.83492184, "learning_rate": 2.184464601717728e-11, "loss": 0.85636377, "num_input_tokens_seen": 179257345, "step": 8304, "time_per_iteration": 2.490250587463379 }, { "auxiliary_loss_clip": 0.01156084, "auxiliary_loss_mlp": 0.01023918, "balance_loss_clip": 1.04848409, "balance_loss_mlp": 1.01642227, "epoch": 0.9986172067576504, "flos": 20375750874240.0, "grad_norm": 2.4936858363745684, "language_loss": 0.78070009, "learning_rate": 1.8355575948758585e-11, "loss": 0.80250013, "num_input_tokens_seen": 179275330, "step": 8305, "time_per_iteration": 2.4227540493011475 }, { "auxiliary_loss_clip": 0.01134825, "auxiliary_loss_mlp": 0.01024285, "balance_loss_clip": 1.04143059, "balance_loss_mlp": 1.01687062, "epoch": 0.9987374496482896, "flos": 23727903724800.0, "grad_norm": 2.092696406343103, "language_loss": 0.73691463, "learning_rate": 1.5169901505407424e-11, "loss": 0.75850576, "num_input_tokens_seen": 179292395, "step": 8306, "time_per_iteration": 3.3225760459899902 }, { "auxiliary_loss_clip": 0.01135571, "auxiliary_loss_mlp": 0.01021153, "balance_loss_clip": 1.04548454, "balance_loss_mlp": 1.0143795, "epoch": 0.9988576925389286, "flos": 25044029959680.0, "grad_norm": 3.150302437516206, "language_loss": 0.74091977, "learning_rate": 1.228762317073695e-11, "loss": 0.76248705, "num_input_tokens_seen": 179311225, "step": 8307, "time_per_iteration": 2.500269889831543 }, { "auxiliary_loss_clip": 0.01138507, "auxiliary_loss_mlp": 0.01023328, "balance_loss_clip": 1.04582012, "balance_loss_mlp": 1.01616979, "epoch": 0.9989779354295677, "flos": 31285627637760.0, "grad_norm": 2.6241618388627823, "language_loss": 0.79009187, "learning_rate": 9.70874138195299e-12, "loss": 0.81171024, "num_input_tokens_seen": 179333135, "step": 8308, "time_per_iteration": 3.370873212814331 }, { "auxiliary_loss_clip": 0.01165362, "auxiliary_loss_mlp": 0.01026353, "balance_loss_clip": 1.04553401, "balance_loss_mlp": 1.01942658, "epoch": 0.9990981783202069, "flos": 19573398823680.0, "grad_norm": 2.2544950448494077, "language_loss": 0.74613214, "learning_rate": 7.433256530076093e-12, "loss": 0.76804924, "num_input_tokens_seen": 179353090, "step": 8309, "time_per_iteration": 2.4445178508758545 }, { "auxiliary_loss_clip": 0.0111342, "auxiliary_loss_mlp": 0.01021633, "balance_loss_clip": 1.04081964, "balance_loss_mlp": 1.01496065, "epoch": 0.9992184212108459, "flos": 17199667514880.0, "grad_norm": 2.1625161308300807, "language_loss": 0.75373125, "learning_rate": 5.46116896038562e-12, "loss": 0.77508187, "num_input_tokens_seen": 179367500, "step": 8310, "time_per_iteration": 3.2368834018707275 }, { "auxiliary_loss_clip": 0.0113511, "auxiliary_loss_mlp": 0.01026915, "balance_loss_clip": 1.04501009, "balance_loss_mlp": 1.01980472, "epoch": 0.999338664101485, "flos": 46497853681920.0, "grad_norm": 1.9497580178527667, "language_loss": 0.61725223, "learning_rate": 3.792478972197699e-12, "loss": 0.63887244, "num_input_tokens_seen": 179388085, "step": 8311, "time_per_iteration": 2.6791017055511475 }, { "auxiliary_loss_clip": 0.01162768, "auxiliary_loss_mlp": 0.01019195, "balance_loss_clip": 1.04503465, "balance_loss_mlp": 1.01223326, "epoch": 0.9994589069921241, "flos": 15158253859200.0, "grad_norm": 3.0613174986916745, "language_loss": 0.70076668, "learning_rate": 2.4271868181990895e-12, "loss": 0.72258627, "num_input_tokens_seen": 179405250, "step": 8312, "time_per_iteration": 2.390583038330078 }, { "auxiliary_loss_clip": 0.01152469, "auxiliary_loss_mlp": 0.01023196, "balance_loss_clip": 1.04528689, "balance_loss_mlp": 1.01573968, "epoch": 0.9995791498827632, "flos": 12531460256640.0, "grad_norm": 2.1285603506656257, "language_loss": 0.81137228, "learning_rate": 1.3652927060014973e-12, "loss": 0.83312893, "num_input_tokens_seen": 179420845, "step": 8313, "time_per_iteration": 2.4048800468444824 }, { "auxiliary_loss_clip": 0.01129273, "auxiliary_loss_mlp": 0.01027272, "balance_loss_clip": 1.04499984, "balance_loss_mlp": 1.01942825, "epoch": 0.9996993927734023, "flos": 19245175320960.0, "grad_norm": 2.1174869671760064, "language_loss": 0.6384474, "learning_rate": 6.067967965872612e-13, "loss": 0.6600129, "num_input_tokens_seen": 179440455, "step": 8314, "time_per_iteration": 2.47021222114563 }, { "auxiliary_loss_clip": 0.01121787, "auxiliary_loss_mlp": 0.01026695, "balance_loss_clip": 1.04453707, "balance_loss_mlp": 1.01979291, "epoch": 0.9998196356640414, "flos": 62952804518400.0, "grad_norm": 3.3841141806213604, "language_loss": 0.77099526, "learning_rate": 1.5169920497548615e-13, "loss": 0.79248011, "num_input_tokens_seen": 179465075, "step": 8315, "time_per_iteration": 3.634526014328003 }, { "auxiliary_loss_clip": 0.01104206, "auxiliary_loss_mlp": 0.01015116, "balance_loss_clip": 1.02576852, "balance_loss_mlp": 1.01107836, "epoch": 0.9999398785546805, "flos": 50922375073920.0, "grad_norm": 1.0914396496542906, "language_loss": 0.54964697, "learning_rate": 0.0, "loss": 0.57084024, "num_input_tokens_seen": 179513955, "step": 8316, "time_per_iteration": 3.0176162719726562 }, { "epoch": 0.9999398785546805, "num_input_tokens_seen": 179513955, "step": 8316, "total_flos": 6.996749092776837e+17, "train_loss": 0.7892607923968014, "train_runtime": 23405.2918, "train_samples_per_second": 14.213, "train_steps_per_second": 0.355 } ], "logging_steps": 1.0, "max_steps": 8316, "num_input_tokens_seen": 179513955, "num_train_epochs": 1, "save_steps": 1664, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.996749092776837e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }