diff --git "a/CompeteSMoE/competesmoe_versions/MAX_8competesmoev32/trainer_state.json" "b/CompeteSMoE/competesmoe_versions/MAX_8competesmoev32/trainer_state.json" new file mode 100644--- /dev/null +++ "b/CompeteSMoE/competesmoe_versions/MAX_8competesmoev32/trainer_state.json" @@ -0,0 +1,124783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999398785546805, + "eval_steps": 500, + "global_step": 8316, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.04592296, + "auxiliary_loss_mlp": 0.0257779, + "balance_loss_clip": 2.47145319, + "balance_loss_mlp": 2.09008121, + "epoch": 0.00012024289063909097, + "flos": 24932483919360.0, + "grad_norm": 40.30592831483485, + "language_loss": 2.5798173, + "learning_rate": 0.0, + "loss": 1.90189219, + "num_input_tokens_seen": 20375, + "step": 1, + "time_per_iteration": 12.973398685455322 + }, + { + "auxiliary_loss_clip": 0.03096366, + "auxiliary_loss_mlp": 0.01642735, + "balance_loss_clip": 1.65442955, + "balance_loss_mlp": 1.31867695, + "epoch": 0.00024048578127818193, + "flos": 30664624377600.0, + "grad_norm": 54.530462462352105, + "language_loss": 1.88843608, + "learning_rate": 5.021476677069823e-07, + "loss": 1.93582726, + "num_input_tokens_seen": 39035, + "step": 2, + "time_per_iteration": 2.4862096309661865 + }, + { + "auxiliary_loss_clip": 0.03069804, + "auxiliary_loss_mlp": 0.0166964, + "balance_loss_clip": 1.65269542, + "balance_loss_mlp": 1.34806049, + "epoch": 0.0003607286719172729, + "flos": 19026227969280.0, + "grad_norm": 40.11147186677424, + "language_loss": 1.61379004, + "learning_rate": 7.958852231401551e-07, + "loss": 1.66118455, + "num_input_tokens_seen": 57600, + "step": 3, + "time_per_iteration": 2.3074429035186768 + }, + { + "auxiliary_loss_clip": 0.03078047, + "auxiliary_loss_mlp": 0.01738445, + "balance_loss_clip": 1.65200186, + "balance_loss_mlp": 1.41495919, + "epoch": 0.00048097156255636386, + "flos": 19316314206720.0, + "grad_norm": 36.82747997568199, + "language_loss": 1.64347053, + "learning_rate": 1.0042953354139647e-06, + "loss": 1.69163549, + "num_input_tokens_seen": 76465, + "step": 4, + "time_per_iteration": 2.339784860610962 + }, + { + "auxiliary_loss_clip": 0.0307353, + "auxiliary_loss_mlp": 0.01667204, + "balance_loss_clip": 1.65349627, + "balance_loss_mlp": 1.35020256, + "epoch": 0.0006012144531954548, + "flos": 13991264893440.0, + "grad_norm": 55.3589190232194, + "language_loss": 1.93605816, + "learning_rate": 1.1659507774310057e-06, + "loss": 1.98346543, + "num_input_tokens_seen": 94350, + "step": 5, + "time_per_iteration": 2.588155746459961 + }, + { + "auxiliary_loss_clip": 0.03084577, + "auxiliary_loss_mlp": 0.01685274, + "balance_loss_clip": 1.65829551, + "balance_loss_mlp": 1.36350429, + "epoch": 0.0007214573438345458, + "flos": 23148988225920.0, + "grad_norm": 45.45357605432344, + "language_loss": 1.61244512, + "learning_rate": 1.2980328908471373e-06, + "loss": 1.66014361, + "num_input_tokens_seen": 114595, + "step": 6, + "time_per_iteration": 2.610466241836548 + }, + { + "auxiliary_loss_clip": 0.03137068, + "auxiliary_loss_mlp": 0.01618546, + "balance_loss_clip": 1.79350352, + "balance_loss_mlp": 1.41712987, + "epoch": 0.0008417002344736367, + "flos": 67663246170240.0, + "grad_norm": 4.61251129694427, + "language_loss": 0.81483877, + "learning_rate": 1.4097067265369432e-06, + "loss": 0.86239493, + "num_input_tokens_seen": 179590, + "step": 7, + "time_per_iteration": 3.0786995887756348 + }, + { + "auxiliary_loss_clip": 0.03049939, + "auxiliary_loss_mlp": 0.01707117, + "balance_loss_clip": 1.64220679, + "balance_loss_mlp": 1.39431167, + "epoch": 0.0009619431251127277, + "flos": 21281381504640.0, + "grad_norm": 40.86469656632831, + "language_loss": 1.58314562, + "learning_rate": 1.506443003120947e-06, + "loss": 1.6307162, + "num_input_tokens_seen": 195090, + "step": 8, + "time_per_iteration": 2.569751262664795 + }, + { + "auxiliary_loss_clip": 0.03055521, + "auxiliary_loss_mlp": 0.01697552, + "balance_loss_clip": 1.64976835, + "balance_loss_mlp": 1.37940598, + "epoch": 0.0010821860157518186, + "flos": 23331342597120.0, + "grad_norm": 17.557884827290334, + "language_loss": 1.48055208, + "learning_rate": 1.5917704462803102e-06, + "loss": 1.52808285, + "num_input_tokens_seen": 211635, + "step": 9, + "time_per_iteration": 2.630746364593506 + }, + { + "auxiliary_loss_clip": 0.03042232, + "auxiliary_loss_mlp": 0.0165756, + "balance_loss_clip": 1.64873385, + "balance_loss_mlp": 1.34093976, + "epoch": 0.0012024289063909096, + "flos": 17010166337280.0, + "grad_norm": 13.264859313460732, + "language_loss": 1.53033519, + "learning_rate": 1.6680984451379884e-06, + "loss": 1.57733321, + "num_input_tokens_seen": 224705, + "step": 10, + "time_per_iteration": 2.549220561981201 + }, + { + "auxiliary_loss_clip": 0.03050448, + "auxiliary_loss_mlp": 0.01683857, + "balance_loss_clip": 1.64478493, + "balance_loss_mlp": 1.37982523, + "epoch": 0.0013226717970300007, + "flos": 21288133261440.0, + "grad_norm": 18.351262216430698, + "language_loss": 1.32663512, + "learning_rate": 1.7371455188905097e-06, + "loss": 1.37397814, + "num_input_tokens_seen": 244635, + "step": 11, + "time_per_iteration": 2.594905376434326 + }, + { + "auxiliary_loss_clip": 0.03064348, + "auxiliary_loss_mlp": 0.01705926, + "balance_loss_clip": 1.64955437, + "balance_loss_mlp": 1.37767088, + "epoch": 0.0014429146876690916, + "flos": 27237884935680.0, + "grad_norm": 10.57016698859891, + "language_loss": 1.25267851, + "learning_rate": 1.8001805585541196e-06, + "loss": 1.30038142, + "num_input_tokens_seen": 265765, + "step": 12, + "time_per_iteration": 2.7384464740753174 + }, + { + "auxiliary_loss_clip": 0.03043681, + "auxiliary_loss_mlp": 0.01664609, + "balance_loss_clip": 1.64175451, + "balance_loss_mlp": 1.35714459, + "epoch": 0.0015631575783081825, + "flos": 19062174504960.0, + "grad_norm": 6.6469347218709744, + "language_loss": 1.29183769, + "learning_rate": 1.8581671739548328e-06, + "loss": 1.33892059, + "num_input_tokens_seen": 283500, + "step": 13, + "time_per_iteration": 2.5889413356781006 + }, + { + "auxiliary_loss_clip": 0.03038597, + "auxiliary_loss_mlp": 0.01618205, + "balance_loss_clip": 1.63765264, + "balance_loss_mlp": 1.30578113, + "epoch": 0.0016834004689472734, + "flos": 48139473985920.0, + "grad_norm": 6.61038063923488, + "language_loss": 1.13568783, + "learning_rate": 1.9118543942439254e-06, + "loss": 1.18225574, + "num_input_tokens_seen": 305685, + "step": 14, + "time_per_iteration": 2.8433737754821777 + }, + { + "auxiliary_loss_clip": 0.03015297, + "auxiliary_loss_mlp": 0.01677781, + "balance_loss_clip": 1.63290501, + "balance_loss_mlp": 1.3615427, + "epoch": 0.0018036433595863645, + "flos": 34970026314240.0, + "grad_norm": 5.648299709965761, + "language_loss": 1.1277442, + "learning_rate": 1.961836000571161e-06, + "loss": 1.17467487, + "num_input_tokens_seen": 327340, + "step": 15, + "time_per_iteration": 2.709066867828369 + }, + { + "auxiliary_loss_clip": 0.03029608, + "auxiliary_loss_mlp": 0.01441161, + "balance_loss_clip": 1.77048922, + "balance_loss_mlp": 1.25347769, + "epoch": 0.0019238862502254555, + "flos": 59768284440960.0, + "grad_norm": 3.790747894423256, + "language_loss": 0.64638621, + "learning_rate": 2.0085906708279293e-06, + "loss": 0.69109386, + "num_input_tokens_seen": 382710, + "step": 16, + "time_per_iteration": 5.4371256828308105 + }, + { + "auxiliary_loss_clip": 0.03002673, + "auxiliary_loss_mlp": 0.01635118, + "balance_loss_clip": 1.63701117, + "balance_loss_mlp": 1.32422042, + "epoch": 0.0020441291408645466, + "flos": 20814543417600.0, + "grad_norm": 4.358314224621205, + "language_loss": 1.16002595, + "learning_rate": 2.0525099325728135e-06, + "loss": 1.20640373, + "num_input_tokens_seen": 400890, + "step": 17, + "time_per_iteration": 2.6080520153045654 + }, + { + "auxiliary_loss_clip": 0.02992871, + "auxiliary_loss_mlp": 0.01408738, + "balance_loss_clip": 1.76264501, + "balance_loss_mlp": 1.22334337, + "epoch": 0.0021643720315036373, + "flos": 63857001582720.0, + "grad_norm": 3.5415361815598376, + "language_loss": 0.72182769, + "learning_rate": 2.0939181139872922e-06, + "loss": 0.76584375, + "num_input_tokens_seen": 462605, + "step": 18, + "time_per_iteration": 3.0414412021636963 + }, + { + "auxiliary_loss_clip": 0.0297091, + "auxiliary_loss_mlp": 0.01583854, + "balance_loss_clip": 1.63004708, + "balance_loss_mlp": 1.28306484, + "epoch": 0.0022846149221427284, + "flos": 31284981192960.0, + "grad_norm": 5.008540326195495, + "language_loss": 1.01585436, + "learning_rate": 2.1330868934640175e-06, + "loss": 1.06140208, + "num_input_tokens_seen": 483280, + "step": 19, + "time_per_iteration": 2.6710667610168457 + }, + { + "auxiliary_loss_clip": 0.02946908, + "auxiliary_loss_mlp": 0.01370672, + "balance_loss_clip": 1.75090122, + "balance_loss_mlp": 1.18909228, + "epoch": 0.002404857812781819, + "flos": 51083648161920.0, + "grad_norm": 3.558218944716869, + "language_loss": 0.76430988, + "learning_rate": 2.170246112844971e-06, + "loss": 0.8074857, + "num_input_tokens_seen": 537620, + "step": 20, + "time_per_iteration": 2.8559646606445312 + }, + { + "auxiliary_loss_clip": 0.02916723, + "auxiliary_loss_mlp": 0.01538374, + "balance_loss_clip": 1.61860657, + "balance_loss_mlp": 1.23872912, + "epoch": 0.0025251007034209102, + "flos": 15815347309440.0, + "grad_norm": 4.168200166493397, + "language_loss": 1.01579821, + "learning_rate": 2.2055919496770983e-06, + "loss": 1.06034923, + "num_input_tokens_seen": 555760, + "step": 21, + "time_per_iteration": 2.593475341796875 + }, + { + "auxiliary_loss_clip": 0.02902704, + "auxiliary_loss_mlp": 0.01524855, + "balance_loss_clip": 1.61413932, + "balance_loss_mlp": 1.22692752, + "epoch": 0.0026453435940600014, + "flos": 37851857458560.0, + "grad_norm": 14.115339426981507, + "language_loss": 0.89570194, + "learning_rate": 2.2392931865974923e-06, + "loss": 0.93997753, + "num_input_tokens_seen": 578450, + "step": 22, + "time_per_iteration": 2.7723278999328613 + }, + { + "auxiliary_loss_clip": 0.02864187, + "auxiliary_loss_mlp": 0.01505727, + "balance_loss_clip": 1.6054498, + "balance_loss_mlp": 1.20951533, + "epoch": 0.002765586484699092, + "flos": 21141976821120.0, + "grad_norm": 4.968711832150228, + "language_loss": 1.02135181, + "learning_rate": 2.271496085962064e-06, + "loss": 1.06505084, + "num_input_tokens_seen": 596145, + "step": 23, + "time_per_iteration": 2.595005989074707 + }, + { + "auxiliary_loss_clip": 0.02837522, + "auxiliary_loss_mlp": 0.01486034, + "balance_loss_clip": 1.5925765, + "balance_loss_mlp": 1.19230223, + "epoch": 0.002885829375338183, + "flos": 20667381396480.0, + "grad_norm": 3.9298066881701703, + "language_loss": 1.02688789, + "learning_rate": 2.3023282262611022e-06, + "loss": 1.07012343, + "num_input_tokens_seen": 614920, + "step": 24, + "time_per_iteration": 2.6063718795776367 + }, + { + "auxiliary_loss_clip": 0.02847328, + "auxiliary_loss_mlp": 0.01485847, + "balance_loss_clip": 1.60034382, + "balance_loss_mlp": 1.20031679, + "epoch": 0.003006072265977274, + "flos": 34823869873920.0, + "grad_norm": 3.274699957567405, + "language_loss": 0.92701113, + "learning_rate": 2.3319015548620114e-06, + "loss": 0.97034287, + "num_input_tokens_seen": 636060, + "step": 25, + "time_per_iteration": 2.7200188636779785 + }, + { + "auxiliary_loss_clip": 0.02804452, + "auxiliary_loss_mlp": 0.01455079, + "balance_loss_clip": 1.58764672, + "balance_loss_mlp": 1.17832291, + "epoch": 0.003126315156616365, + "flos": 24422021118720.0, + "grad_norm": 2.348708505032503, + "language_loss": 0.93200314, + "learning_rate": 2.3603148416618152e-06, + "loss": 0.97459847, + "num_input_tokens_seen": 655575, + "step": 26, + "time_per_iteration": 2.655686378479004 + }, + { + "auxiliary_loss_clip": 0.02812132, + "auxiliary_loss_mlp": 0.01435621, + "balance_loss_clip": 1.58872533, + "balance_loss_mlp": 1.16210747, + "epoch": 0.003246558047255456, + "flos": 23622326674560.0, + "grad_norm": 2.3704284149753097, + "language_loss": 1.01012504, + "learning_rate": 2.3876556694204647e-06, + "loss": 1.05260253, + "num_input_tokens_seen": 675730, + "step": 27, + "time_per_iteration": 2.6256227493286133 + }, + { + "auxiliary_loss_clip": 0.02772604, + "auxiliary_loss_mlp": 0.01439181, + "balance_loss_clip": 1.5820055, + "balance_loss_mlp": 1.15212476, + "epoch": 0.003366800937894547, + "flos": 17820275725440.0, + "grad_norm": 2.544138136512781, + "language_loss": 0.90868145, + "learning_rate": 2.414002061950908e-06, + "loss": 0.95079935, + "num_input_tokens_seen": 694605, + "step": 28, + "time_per_iteration": 2.5750157833099365 + }, + { + "auxiliary_loss_clip": 0.02756048, + "auxiliary_loss_mlp": 0.01412161, + "balance_loss_clip": 1.57506227, + "balance_loss_mlp": 1.1430341, + "epoch": 0.003487043828533638, + "flos": 24426115269120.0, + "grad_norm": 2.440302019985754, + "language_loss": 1.00170696, + "learning_rate": 2.4394238264681557e-06, + "loss": 1.04338908, + "num_input_tokens_seen": 714340, + "step": 29, + "time_per_iteration": 2.624516010284424 + }, + { + "auxiliary_loss_clip": 0.02730119, + "auxiliary_loss_mlp": 0.01411706, + "balance_loss_clip": 1.56739223, + "balance_loss_mlp": 1.13800144, + "epoch": 0.003607286719172729, + "flos": 26140311002880.0, + "grad_norm": 2.053741178708341, + "language_loss": 0.99668521, + "learning_rate": 2.4639836682781433e-06, + "loss": 1.03810346, + "num_input_tokens_seen": 734470, + "step": 30, + "time_per_iteration": 2.6385016441345215 + }, + { + "auxiliary_loss_clip": 0.02743457, + "auxiliary_loss_mlp": 0.0139929, + "balance_loss_clip": 1.58120334, + "balance_loss_mlp": 1.11814642, + "epoch": 0.00372752960981182, + "flos": 20593082113920.0, + "grad_norm": 2.6278743328151637, + "language_loss": 1.0030545, + "learning_rate": 2.487738122623307e-06, + "loss": 1.04448199, + "num_input_tokens_seen": 753380, + "step": 31, + "time_per_iteration": 2.591437816619873 + }, + { + "auxiliary_loss_clip": 0.02709206, + "auxiliary_loss_mlp": 0.01378023, + "balance_loss_clip": 1.5664413, + "balance_loss_mlp": 1.10450876, + "epoch": 0.003847772500450911, + "flos": 22674608282880.0, + "grad_norm": 2.419766182004324, + "language_loss": 0.9910568, + "learning_rate": 2.510738338534912e-06, + "loss": 1.03192902, + "num_input_tokens_seen": 772105, + "step": 32, + "time_per_iteration": 2.598050594329834 + }, + { + "auxiliary_loss_clip": 0.02569658, + "auxiliary_loss_mlp": 0.01362103, + "balance_loss_clip": 1.5228374, + "balance_loss_mlp": 1.09354854, + "epoch": 0.003968015391090002, + "flos": 17967796882560.0, + "grad_norm": 2.5580156819909803, + "language_loss": 1.02645445, + "learning_rate": 2.5330307420306648e-06, + "loss": 1.06577206, + "num_input_tokens_seen": 788955, + "step": 33, + "time_per_iteration": 2.5666611194610596 + }, + { + "auxiliary_loss_clip": 0.02525339, + "auxiliary_loss_mlp": 0.01346146, + "balance_loss_clip": 1.51616287, + "balance_loss_mlp": 1.10086036, + "epoch": 0.004088258281729093, + "flos": 27304103658240.0, + "grad_norm": 2.346209604328419, + "language_loss": 0.8810668, + "learning_rate": 2.554657600279796e-06, + "loss": 0.91978157, + "num_input_tokens_seen": 810230, + "step": 34, + "time_per_iteration": 2.678388833999634 + }, + { + "auxiliary_loss_clip": 0.02505123, + "auxiliary_loss_mlp": 0.01325481, + "balance_loss_clip": 1.50772309, + "balance_loss_mlp": 1.07275724, + "epoch": 0.004208501172368184, + "flos": 23258587599360.0, + "grad_norm": 3.4097778660950397, + "language_loss": 1.03323376, + "learning_rate": 2.5756575039679493e-06, + "loss": 1.07153964, + "num_input_tokens_seen": 829780, + "step": 35, + "time_per_iteration": 2.5957841873168945 + }, + { + "auxiliary_loss_clip": 0.02467491, + "auxiliary_loss_mlp": 0.01354623, + "balance_loss_clip": 1.49681664, + "balance_loss_mlp": 1.10170817, + "epoch": 0.0043287440630072746, + "flos": 17312104062720.0, + "grad_norm": 2.0325100402492935, + "language_loss": 0.95220184, + "learning_rate": 2.5960657816942747e-06, + "loss": 0.99042296, + "num_input_tokens_seen": 848695, + "step": 36, + "time_per_iteration": 2.58579421043396 + }, + { + "auxiliary_loss_clip": 0.02308794, + "auxiliary_loss_mlp": 0.01407153, + "balance_loss_clip": 1.57745957, + "balance_loss_mlp": 1.26524663, + "epoch": 0.004448986953646365, + "flos": 53092491160320.0, + "grad_norm": 1.3935582848370274, + "language_loss": 0.60953599, + "learning_rate": 2.6159148575788668e-06, + "loss": 0.64669544, + "num_input_tokens_seen": 906730, + "step": 37, + "time_per_iteration": 3.0579185485839844 + }, + { + "auxiliary_loss_clip": 0.02416703, + "auxiliary_loss_mlp": 0.01359864, + "balance_loss_clip": 1.48631597, + "balance_loss_mlp": 1.10942912, + "epoch": 0.004569229844285457, + "flos": 13444165866240.0, + "grad_norm": 3.903547067599738, + "language_loss": 0.98843646, + "learning_rate": 2.635234561171e-06, + "loss": 1.02620208, + "num_input_tokens_seen": 925125, + "step": 38, + "time_per_iteration": 2.5785717964172363 + }, + { + "auxiliary_loss_clip": 0.02392348, + "auxiliary_loss_mlp": 0.01329143, + "balance_loss_clip": 1.47850311, + "balance_loss_mlp": 1.09320402, + "epoch": 0.0046894727349245475, + "flos": 16209609966720.0, + "grad_norm": 2.318908176625905, + "language_loss": 0.94204128, + "learning_rate": 2.6540523970949877e-06, + "loss": 0.97925615, + "num_input_tokens_seen": 939970, + "step": 39, + "time_per_iteration": 2.5568325519561768 + }, + { + "auxiliary_loss_clip": 0.02361672, + "auxiliary_loss_mlp": 0.01336149, + "balance_loss_clip": 1.47741508, + "balance_loss_mlp": 1.09849334, + "epoch": 0.004809715625563638, + "flos": 23914244505600.0, + "grad_norm": 3.417243327216274, + "language_loss": 0.92579496, + "learning_rate": 2.6723937805519533e-06, + "loss": 0.9627732, + "num_input_tokens_seen": 957470, + "step": 40, + "time_per_iteration": 2.6221487522125244 + }, + { + "auxiliary_loss_clip": 0.02352536, + "auxiliary_loss_mlp": 0.01305043, + "balance_loss_clip": 1.46702111, + "balance_loss_mlp": 1.07864046, + "epoch": 0.00492995851620273, + "flos": 20773030273920.0, + "grad_norm": 2.2510982192542563, + "language_loss": 0.9307704, + "learning_rate": 2.690282243737839e-06, + "loss": 0.96734619, + "num_input_tokens_seen": 976405, + "step": 41, + "time_per_iteration": 2.6054177284240723 + }, + { + "auxiliary_loss_clip": 0.02316806, + "auxiliary_loss_mlp": 0.01329531, + "balance_loss_clip": 1.45504665, + "balance_loss_mlp": 1.09817004, + "epoch": 0.0050502014068418205, + "flos": 20338655103360.0, + "grad_norm": 3.3319108026448316, + "language_loss": 0.99413979, + "learning_rate": 2.7077396173840807e-06, + "loss": 1.03060329, + "num_input_tokens_seen": 994690, + "step": 42, + "time_per_iteration": 4.1634602546691895 + }, + { + "auxiliary_loss_clip": 0.02291487, + "auxiliary_loss_mlp": 0.01316486, + "balance_loss_clip": 1.44737756, + "balance_loss_mlp": 1.09485221, + "epoch": 0.005170444297480911, + "flos": 25994872834560.0, + "grad_norm": 2.2874607822359825, + "language_loss": 0.9274407, + "learning_rate": 2.7247861909342594e-06, + "loss": 0.96352041, + "num_input_tokens_seen": 1015615, + "step": 43, + "time_per_iteration": 3.416159152984619 + }, + { + "auxiliary_loss_clip": 0.022872, + "auxiliary_loss_mlp": 0.01312914, + "balance_loss_clip": 1.44599891, + "balance_loss_mlp": 1.09490407, + "epoch": 0.005290687188120003, + "flos": 20954055841920.0, + "grad_norm": 2.6131044720590255, + "language_loss": 0.831155, + "learning_rate": 2.7414408543044743e-06, + "loss": 0.86715615, + "num_input_tokens_seen": 1031255, + "step": 44, + "time_per_iteration": 2.6036062240600586 + }, + { + "auxiliary_loss_clip": 0.02236754, + "auxiliary_loss_mlp": 0.01334458, + "balance_loss_clip": 1.43135262, + "balance_loss_mlp": 1.11291921, + "epoch": 0.005410930078759093, + "flos": 15851401585920.0, + "grad_norm": 5.351912393565261, + "language_loss": 0.79350853, + "learning_rate": 2.7577212237113157e-06, + "loss": 0.82922065, + "num_input_tokens_seen": 1048295, + "step": 45, + "time_per_iteration": 2.568028211593628 + }, + { + "auxiliary_loss_clip": 0.02224554, + "auxiliary_loss_mlp": 0.0131116, + "balance_loss_clip": 1.42551792, + "balance_loss_mlp": 1.09505737, + "epoch": 0.005531172969398184, + "flos": 21104988791040.0, + "grad_norm": 1.905561835251974, + "language_loss": 1.03979588, + "learning_rate": 2.7736437536690466e-06, + "loss": 1.07515311, + "num_input_tokens_seen": 1067925, + "step": 46, + "time_per_iteration": 2.6655561923980713 + }, + { + "auxiliary_loss_clip": 0.02213421, + "auxiliary_loss_mlp": 0.01277628, + "balance_loss_clip": 1.42540312, + "balance_loss_mlp": 1.06629395, + "epoch": 0.005651415860037276, + "flos": 20844887431680.0, + "grad_norm": 2.3669999179711487, + "language_loss": 1.07952857, + "learning_rate": 2.789223836941131e-06, + "loss": 1.11443901, + "num_input_tokens_seen": 1088060, + "step": 47, + "time_per_iteration": 2.6151816844940186 + }, + { + "auxiliary_loss_clip": 0.02178406, + "auxiliary_loss_mlp": 0.01286408, + "balance_loss_clip": 1.41299033, + "balance_loss_mlp": 1.07955611, + "epoch": 0.005771658750676366, + "flos": 13260195383040.0, + "grad_norm": 2.1681333384994534, + "language_loss": 1.08764195, + "learning_rate": 2.8044758939680847e-06, + "loss": 1.12229002, + "num_input_tokens_seen": 1104130, + "step": 48, + "time_per_iteration": 2.5712850093841553 + }, + { + "auxiliary_loss_clip": 0.0215123, + "auxiliary_loss_mlp": 0.01283309, + "balance_loss_clip": 1.41177583, + "balance_loss_mlp": 1.07521725, + "epoch": 0.005891901641315457, + "flos": 24425396997120.0, + "grad_norm": 2.8822779316296283, + "language_loss": 1.02085197, + "learning_rate": 2.8194134530738863e-06, + "loss": 1.05519748, + "num_input_tokens_seen": 1122900, + "step": 49, + "time_per_iteration": 2.6097676753997803 + }, + { + "auxiliary_loss_clip": 0.02143398, + "auxiliary_loss_mlp": 0.01292998, + "balance_loss_clip": 1.40617967, + "balance_loss_mlp": 1.09472919, + "epoch": 0.006012144531954548, + "flos": 23076197314560.0, + "grad_norm": 3.9646797986535125, + "language_loss": 0.90268862, + "learning_rate": 2.834049222568994e-06, + "loss": 0.93705261, + "num_input_tokens_seen": 1140250, + "step": 50, + "time_per_iteration": 2.6013917922973633 + }, + { + "auxiliary_loss_clip": 0.02138275, + "auxiliary_loss_mlp": 0.01255863, + "balance_loss_clip": 1.40294933, + "balance_loss_mlp": 1.06302977, + "epoch": 0.006132387422593639, + "flos": 22528775064960.0, + "grad_norm": 2.244583146028269, + "language_loss": 0.92647684, + "learning_rate": 2.848395155712969e-06, + "loss": 0.96041822, + "num_input_tokens_seen": 1160470, + "step": 51, + "time_per_iteration": 2.585029363632202 + }, + { + "auxiliary_loss_clip": 0.02121068, + "auxiliary_loss_mlp": 0.01293397, + "balance_loss_clip": 1.40352368, + "balance_loss_mlp": 1.09693992, + "epoch": 0.00625263031323273, + "flos": 27628340751360.0, + "grad_norm": 2.0945650084428413, + "language_loss": 0.97697073, + "learning_rate": 2.8624625093687977e-06, + "loss": 1.01111531, + "num_input_tokens_seen": 1177605, + "step": 52, + "time_per_iteration": 2.6547396183013916 + }, + { + "auxiliary_loss_clip": 0.02102938, + "auxiliary_loss_mlp": 0.01261258, + "balance_loss_clip": 1.39373267, + "balance_loss_mlp": 1.07691312, + "epoch": 0.006372873203871821, + "flos": 23110671392640.0, + "grad_norm": 2.080223198832574, + "language_loss": 0.88914132, + "learning_rate": 2.876261897070029e-06, + "loss": 0.92278326, + "num_input_tokens_seen": 1197735, + "step": 53, + "time_per_iteration": 2.6076254844665527 + }, + { + "auxiliary_loss_clip": 0.02100085, + "auxiliary_loss_mlp": 0.01279491, + "balance_loss_clip": 1.39694059, + "balance_loss_mlp": 1.09238017, + "epoch": 0.006493116094510912, + "flos": 22856028900480.0, + "grad_norm": 3.2763776804340807, + "language_loss": 0.92631829, + "learning_rate": 2.889803337127447e-06, + "loss": 0.960114, + "num_input_tokens_seen": 1216335, + "step": 54, + "time_per_iteration": 2.581669330596924 + }, + { + "auxiliary_loss_clip": 0.02068879, + "auxiliary_loss_mlp": 0.01297448, + "balance_loss_clip": 1.38467956, + "balance_loss_mlp": 1.10032368, + "epoch": 0.006613358985150003, + "flos": 23071708114560.0, + "grad_norm": 3.3005048086977817, + "language_loss": 0.84689069, + "learning_rate": 2.903096296321516e-06, + "loss": 0.88055396, + "num_input_tokens_seen": 1234480, + "step": 55, + "time_per_iteration": 2.591900587081909 + }, + { + "auxiliary_loss_clip": 0.02069197, + "auxiliary_loss_mlp": 0.01251068, + "balance_loss_clip": 1.38638353, + "balance_loss_mlp": 1.07473373, + "epoch": 0.006733601875789094, + "flos": 26537662229760.0, + "grad_norm": 2.0096965143652934, + "language_loss": 0.91520768, + "learning_rate": 2.9161497296578907e-06, + "loss": 0.94841027, + "num_input_tokens_seen": 1253870, + "step": 56, + "time_per_iteration": 2.6639773845672607 + }, + { + "auxiliary_loss_clip": 0.02048511, + "auxiliary_loss_mlp": 0.01253555, + "balance_loss_clip": 1.37968683, + "balance_loss_mlp": 1.07493222, + "epoch": 0.006853844766428185, + "flos": 15523178083200.0, + "grad_norm": 2.3246443989327426, + "language_loss": 0.8600198, + "learning_rate": 2.928972116604173e-06, + "loss": 0.89304048, + "num_input_tokens_seen": 1270145, + "step": 57, + "time_per_iteration": 2.5760161876678467 + }, + { + "auxiliary_loss_clip": 0.02020285, + "auxiliary_loss_mlp": 0.01233107, + "balance_loss_clip": 1.3707037, + "balance_loss_mlp": 1.06583238, + "epoch": 0.006974087657067276, + "flos": 24243760897920.0, + "grad_norm": 2.1847461892956273, + "language_loss": 1.01862478, + "learning_rate": 2.9415714941751377e-06, + "loss": 1.05115867, + "num_input_tokens_seen": 1291365, + "step": 58, + "time_per_iteration": 2.6366748809814453 + }, + { + "auxiliary_loss_clip": 0.02035641, + "auxiliary_loss_mlp": 0.01254965, + "balance_loss_clip": 1.37243676, + "balance_loss_mlp": 1.08654642, + "epoch": 0.007094330547706367, + "flos": 25772513690880.0, + "grad_norm": 1.9095273418643135, + "language_loss": 0.9345569, + "learning_rate": 2.9539554871897396e-06, + "loss": 0.96746302, + "num_input_tokens_seen": 1311535, + "step": 59, + "time_per_iteration": 2.714545965194702 + }, + { + "auxiliary_loss_clip": 0.02002079, + "auxiliary_loss_mlp": 0.01241921, + "balance_loss_clip": 1.36373866, + "balance_loss_mlp": 1.07827067, + "epoch": 0.007214573438345458, + "flos": 21319015979520.0, + "grad_norm": 3.3977070902244786, + "language_loss": 0.97434664, + "learning_rate": 2.9661313359851253e-06, + "loss": 1.0067867, + "num_input_tokens_seen": 1329420, + "step": 60, + "time_per_iteration": 2.702277660369873 + }, + { + "auxiliary_loss_clip": 0.01979869, + "auxiliary_loss_mlp": 0.01237526, + "balance_loss_clip": 1.36084771, + "balance_loss_mlp": 1.0781666, + "epoch": 0.007334816328984549, + "flos": 24937088192640.0, + "grad_norm": 1.9795142517872977, + "language_loss": 0.93900704, + "learning_rate": 2.978105921839922e-06, + "loss": 0.97118098, + "num_input_tokens_seen": 1349965, + "step": 61, + "time_per_iteration": 2.613574504852295 + }, + { + "auxiliary_loss_clip": 0.01965701, + "auxiliary_loss_mlp": 0.01248457, + "balance_loss_clip": 1.35759425, + "balance_loss_mlp": 1.0889076, + "epoch": 0.00745505921962364, + "flos": 18510586277760.0, + "grad_norm": 2.2265460245745463, + "language_loss": 0.72166783, + "learning_rate": 2.9898857903302893e-06, + "loss": 0.75380939, + "num_input_tokens_seen": 1368915, + "step": 62, + "time_per_iteration": 2.594130039215088 + }, + { + "auxiliary_loss_clip": 0.01966738, + "auxiliary_loss_mlp": 0.01252594, + "balance_loss_clip": 1.35537457, + "balance_loss_mlp": 1.08903909, + "epoch": 0.007575302110262731, + "flos": 18477656484480.0, + "grad_norm": 2.842248495759108, + "language_loss": 0.87959313, + "learning_rate": 3.001477172817253e-06, + "loss": 0.91178644, + "num_input_tokens_seen": 1386805, + "step": 63, + "time_per_iteration": 2.5810022354125977 + }, + { + "auxiliary_loss_clip": 0.01941068, + "auxiliary_loss_mlp": 0.01225859, + "balance_loss_clip": 1.34662545, + "balance_loss_mlp": 1.07899344, + "epoch": 0.007695545000901822, + "flos": 24973178382720.0, + "grad_norm": 2.8049304616869004, + "language_loss": 0.9612003, + "learning_rate": 3.012886006241894e-06, + "loss": 0.99286962, + "num_input_tokens_seen": 1406190, + "step": 64, + "time_per_iteration": 2.62530517578125 + }, + { + "auxiliary_loss_clip": 0.0194591, + "auxiliary_loss_mlp": 0.01225819, + "balance_loss_clip": 1.34772635, + "balance_loss_mlp": 1.07141912, + "epoch": 0.007815787891540913, + "flos": 21324223451520.0, + "grad_norm": 1.9349188782883724, + "language_loss": 0.88213897, + "learning_rate": 3.0241179513858383e-06, + "loss": 0.91385627, + "num_input_tokens_seen": 1425500, + "step": 65, + "time_per_iteration": 2.5906243324279785 + }, + { + "auxiliary_loss_clip": 0.019255, + "auxiliary_loss_mlp": 0.01252022, + "balance_loss_clip": 1.3360672, + "balance_loss_mlp": 1.09046948, + "epoch": 0.007936030782180003, + "flos": 21575777374080.0, + "grad_norm": 2.990366064377798, + "language_loss": 0.87674552, + "learning_rate": 3.035178409737647e-06, + "loss": 0.9085207, + "num_input_tokens_seen": 1442950, + "step": 66, + "time_per_iteration": 2.581000328063965 + }, + { + "auxiliary_loss_clip": 0.01905269, + "auxiliary_loss_mlp": 0.01213104, + "balance_loss_clip": 1.32983994, + "balance_loss_mlp": 1.07949448, + "epoch": 0.008056273672819095, + "flos": 20120785159680.0, + "grad_norm": 2.6164892972706326, + "language_loss": 0.88762152, + "learning_rate": 3.046072539090907e-06, + "loss": 0.91880524, + "num_input_tokens_seen": 1460915, + "step": 67, + "time_per_iteration": 2.5937912464141846 + }, + { + "auxiliary_loss_clip": 0.01897565, + "auxiliary_loss_mlp": 0.01214614, + "balance_loss_clip": 1.3280766, + "balance_loss_mlp": 1.07490039, + "epoch": 0.008176516563458186, + "flos": 18333116156160.0, + "grad_norm": 2.439134358970867, + "language_loss": 1.04790306, + "learning_rate": 3.056805267986779e-06, + "loss": 1.07902479, + "num_input_tokens_seen": 1478385, + "step": 68, + "time_per_iteration": 2.5610482692718506 + }, + { + "auxiliary_loss_clip": 0.01880588, + "auxiliary_loss_mlp": 0.01216668, + "balance_loss_clip": 1.32177734, + "balance_loss_mlp": 1.08076966, + "epoch": 0.008296759454097276, + "flos": 21872076664320.0, + "grad_norm": 2.11425529632868, + "language_loss": 0.95354486, + "learning_rate": 3.0673813091022194e-06, + "loss": 0.98451734, + "num_input_tokens_seen": 1497605, + "step": 69, + "time_per_iteration": 4.13210391998291 + }, + { + "auxiliary_loss_clip": 0.01733297, + "auxiliary_loss_mlp": 0.01225022, + "balance_loss_clip": 1.33566403, + "balance_loss_mlp": 1.1594094, + "epoch": 0.008417002344736368, + "flos": 63408228036480.0, + "grad_norm": 1.3077501044447453, + "language_loss": 0.6204685, + "learning_rate": 3.0778051716749317e-06, + "loss": 0.65005171, + "num_input_tokens_seen": 1561150, + "step": 70, + "time_per_iteration": 4.660879611968994 + }, + { + "auxiliary_loss_clip": 0.01852118, + "auxiliary_loss_mlp": 0.01208634, + "balance_loss_clip": 1.30373967, + "balance_loss_mlp": 1.07502413, + "epoch": 0.008537245235375458, + "flos": 22966454286720.0, + "grad_norm": 2.752938498140635, + "language_loss": 0.90491068, + "learning_rate": 3.0880811730470094e-06, + "loss": 0.93551826, + "num_input_tokens_seen": 1580605, + "step": 71, + "time_per_iteration": 2.613633155822754 + }, + { + "auxiliary_loss_clip": 0.01702189, + "auxiliary_loss_mlp": 0.0117619, + "balance_loss_clip": 1.31741333, + "balance_loss_mlp": 1.11591816, + "epoch": 0.008657488126014549, + "flos": 61984046712960.0, + "grad_norm": 1.195995292344468, + "language_loss": 0.58630359, + "learning_rate": 3.098213449401257e-06, + "loss": 0.61508739, + "num_input_tokens_seen": 1647535, + "step": 72, + "time_per_iteration": 3.0908663272857666 + }, + { + "auxiliary_loss_clip": 0.01839998, + "auxiliary_loss_mlp": 0.01207614, + "balance_loss_clip": 1.30162024, + "balance_loss_mlp": 1.08048964, + "epoch": 0.00877773101665364, + "flos": 30296791152000.0, + "grad_norm": 3.220020153484265, + "language_loss": 0.99028701, + "learning_rate": 3.1082059657570015e-06, + "loss": 1.02076316, + "num_input_tokens_seen": 1666770, + "step": 73, + "time_per_iteration": 2.654447078704834 + }, + { + "auxiliary_loss_clip": 0.01810848, + "auxiliary_loss_mlp": 0.01195604, + "balance_loss_clip": 1.29256868, + "balance_loss_mlp": 1.06619, + "epoch": 0.00889797390729273, + "flos": 23514056104320.0, + "grad_norm": 3.158582003024304, + "language_loss": 0.96458066, + "learning_rate": 3.1180625252858496e-06, + "loss": 0.99464512, + "num_input_tokens_seen": 1685200, + "step": 74, + "time_per_iteration": 2.6013121604919434 + }, + { + "auxiliary_loss_clip": 0.01796212, + "auxiliary_loss_mlp": 0.01206966, + "balance_loss_clip": 1.28428733, + "balance_loss_mlp": 1.0849911, + "epoch": 0.009018216797931822, + "flos": 23075838178560.0, + "grad_norm": 3.1246334450160047, + "language_loss": 0.79981315, + "learning_rate": 3.1277867780021663e-06, + "loss": 0.82984495, + "num_input_tokens_seen": 1701835, + "step": 75, + "time_per_iteration": 2.5928139686584473 + }, + { + "auxiliary_loss_clip": 0.01774833, + "auxiliary_loss_mlp": 0.01175923, + "balance_loss_clip": 1.27673197, + "balance_loss_mlp": 1.06491494, + "epoch": 0.009138459688570914, + "flos": 15918877284480.0, + "grad_norm": 1.9991087524455453, + "language_loss": 0.95455825, + "learning_rate": 3.1373822288779824e-06, + "loss": 0.98406589, + "num_input_tokens_seen": 1718415, + "step": 76, + "time_per_iteration": 2.560398817062378 + }, + { + "auxiliary_loss_clip": 0.01774367, + "auxiliary_loss_mlp": 0.01205573, + "balance_loss_clip": 1.27877617, + "balance_loss_mlp": 1.08712697, + "epoch": 0.009258702579210003, + "flos": 27016531372800.0, + "grad_norm": 2.303352752182179, + "language_loss": 0.79563236, + "learning_rate": 3.1468522454274533e-06, + "loss": 0.82543176, + "num_input_tokens_seen": 1738770, + "step": 77, + "time_per_iteration": 2.6369876861572266 + }, + { + "auxiliary_loss_clip": 0.01765547, + "auxiliary_loss_mlp": 0.01190466, + "balance_loss_clip": 1.27327418, + "balance_loss_mlp": 1.07373595, + "epoch": 0.009378945469849095, + "flos": 26903196984960.0, + "grad_norm": 2.1110317024542784, + "language_loss": 0.91977882, + "learning_rate": 3.15620006480197e-06, + "loss": 0.94933891, + "num_input_tokens_seen": 1758040, + "step": 78, + "time_per_iteration": 2.6155006885528564 + }, + { + "auxiliary_loss_clip": 0.01762193, + "auxiliary_loss_mlp": 0.01187474, + "balance_loss_clip": 1.27047551, + "balance_loss_mlp": 1.07207918, + "epoch": 0.009499188360488187, + "flos": 35694236327040.0, + "grad_norm": 9.418198619268875, + "language_loss": 0.7480371, + "learning_rate": 3.1654288004333087e-06, + "loss": 0.77753377, + "num_input_tokens_seen": 1776705, + "step": 79, + "time_per_iteration": 2.678626775741577 + }, + { + "auxiliary_loss_clip": 0.01742163, + "auxiliary_loss_mlp": 0.01175921, + "balance_loss_clip": 1.26429605, + "balance_loss_mlp": 1.07044458, + "epoch": 0.009619431251127276, + "flos": 21503201944320.0, + "grad_norm": 4.353280550041497, + "language_loss": 0.76012665, + "learning_rate": 3.1745414482589353e-06, + "loss": 0.78930748, + "num_input_tokens_seen": 1795915, + "step": 80, + "time_per_iteration": 2.592921733856201 + }, + { + "auxiliary_loss_clip": 0.01732466, + "auxiliary_loss_mlp": 0.01168541, + "balance_loss_clip": 1.25981581, + "balance_loss_mlp": 1.06125247, + "epoch": 0.009739674141766368, + "flos": 17421056991360.0, + "grad_norm": 3.718579123802725, + "language_loss": 0.86938286, + "learning_rate": 3.1835408925606204e-06, + "loss": 0.89839292, + "num_input_tokens_seen": 1814055, + "step": 81, + "time_per_iteration": 2.5709149837493896 + }, + { + "auxiliary_loss_clip": 0.0171341, + "auxiliary_loss_mlp": 0.01184519, + "balance_loss_clip": 1.25318789, + "balance_loss_mlp": 1.0786134, + "epoch": 0.00985991703240546, + "flos": 27527109246720.0, + "grad_norm": 2.2822920651227814, + "language_loss": 0.89397371, + "learning_rate": 3.1924299114448214e-06, + "loss": 0.92295301, + "num_input_tokens_seen": 1834535, + "step": 82, + "time_per_iteration": 2.654970407485962 + }, + { + "auxiliary_loss_clip": 0.01722265, + "auxiliary_loss_mlp": 0.01184854, + "balance_loss_clip": 1.25749195, + "balance_loss_mlp": 1.07909179, + "epoch": 0.00998015992304455, + "flos": 13808084509440.0, + "grad_norm": 2.3867025389089993, + "language_loss": 0.83379078, + "learning_rate": 3.2012111819909055e-06, + "loss": 0.86286199, + "num_input_tokens_seen": 1851865, + "step": 83, + "time_per_iteration": 2.6232094764709473 + }, + { + "auxiliary_loss_clip": 0.01710508, + "auxiliary_loss_mlp": 0.01173847, + "balance_loss_clip": 1.25103104, + "balance_loss_mlp": 1.07084978, + "epoch": 0.010100402813683641, + "flos": 20191385341440.0, + "grad_norm": 2.2340209261991273, + "language_loss": 0.94921529, + "learning_rate": 3.2098872850910627e-06, + "loss": 0.97805882, + "num_input_tokens_seen": 1868540, + "step": 84, + "time_per_iteration": 2.5778613090515137 + }, + { + "auxiliary_loss_clip": 0.01707698, + "auxiliary_loss_mlp": 0.01178119, + "balance_loss_clip": 1.25227761, + "balance_loss_mlp": 1.07927012, + "epoch": 0.010220645704322733, + "flos": 17201642762880.0, + "grad_norm": 2.022174581628354, + "language_loss": 0.89185631, + "learning_rate": 3.2184607100038194e-06, + "loss": 0.9207145, + "num_input_tokens_seen": 1887180, + "step": 85, + "time_per_iteration": 2.5686352252960205 + }, + { + "auxiliary_loss_clip": 0.01706509, + "auxiliary_loss_mlp": 0.01179537, + "balance_loss_clip": 1.25297189, + "balance_loss_mlp": 1.08183265, + "epoch": 0.010340888594961822, + "flos": 21470415805440.0, + "grad_norm": 2.0693631771246843, + "language_loss": 0.93181765, + "learning_rate": 3.2269338586412414e-06, + "loss": 0.9606781, + "num_input_tokens_seen": 1904765, + "step": 86, + "time_per_iteration": 2.5833380222320557 + }, + { + "auxiliary_loss_clip": 0.01691739, + "auxiliary_loss_mlp": 0.01170414, + "balance_loss_clip": 1.24513865, + "balance_loss_mlp": 1.07909966, + "epoch": 0.010461131485600914, + "flos": 23002831785600.0, + "grad_norm": 2.3713113908674175, + "language_loss": 0.96356446, + "learning_rate": 3.2353090496083106e-06, + "loss": 0.99218601, + "num_input_tokens_seen": 1922600, + "step": 87, + "time_per_iteration": 2.592686891555786 + }, + { + "auxiliary_loss_clip": 0.01669849, + "auxiliary_loss_mlp": 0.01166239, + "balance_loss_clip": 1.23631072, + "balance_loss_mlp": 1.08064651, + "epoch": 0.010581374376240005, + "flos": 33546850571520.0, + "grad_norm": 2.3858480068184815, + "language_loss": 0.8118974, + "learning_rate": 3.2435885220114572e-06, + "loss": 0.84025824, + "num_input_tokens_seen": 1943950, + "step": 88, + "time_per_iteration": 2.685737371444702 + }, + { + "auxiliary_loss_clip": 0.01676988, + "auxiliary_loss_mlp": 0.01154448, + "balance_loss_clip": 1.2415725, + "balance_loss_mlp": 1.06556535, + "epoch": 0.010701617266879095, + "flos": 21763087822080.0, + "grad_norm": 2.344462122997114, + "language_loss": 0.93791831, + "learning_rate": 3.2517744390519113e-06, + "loss": 0.96623272, + "num_input_tokens_seen": 1962815, + "step": 89, + "time_per_iteration": 2.586205005645752 + }, + { + "auxiliary_loss_clip": 0.01663001, + "auxiliary_loss_mlp": 0.01151318, + "balance_loss_clip": 1.22787857, + "balance_loss_mlp": 1.06710911, + "epoch": 0.010821860157518187, + "flos": 19060199256960.0, + "grad_norm": 2.1677310165067603, + "language_loss": 0.74984062, + "learning_rate": 3.259868891418298e-06, + "loss": 0.77798378, + "num_input_tokens_seen": 1980580, + "step": 90, + "time_per_iteration": 2.5825257301330566 + }, + { + "auxiliary_loss_clip": 0.01671167, + "auxiliary_loss_mlp": 0.01186266, + "balance_loss_clip": 1.23762274, + "balance_loss_mlp": 1.09843302, + "epoch": 0.010942103048157278, + "flos": 25447378757760.0, + "grad_norm": 1.934414085066812, + "language_loss": 0.85021621, + "learning_rate": 3.2678739004917757e-06, + "loss": 0.8787905, + "num_input_tokens_seen": 2000315, + "step": 91, + "time_per_iteration": 2.600583791732788 + }, + { + "auxiliary_loss_clip": 0.01654083, + "auxiliary_loss_mlp": 0.01167234, + "balance_loss_clip": 1.23138618, + "balance_loss_mlp": 1.08550417, + "epoch": 0.011062345938796368, + "flos": 27493928058240.0, + "grad_norm": 1.8901112275220244, + "language_loss": 0.92056656, + "learning_rate": 3.275791421376029e-06, + "loss": 0.94877964, + "num_input_tokens_seen": 2023760, + "step": 92, + "time_per_iteration": 2.659606456756592 + }, + { + "auxiliary_loss_clip": 0.01642071, + "auxiliary_loss_mlp": 0.01145391, + "balance_loss_clip": 1.22293091, + "balance_loss_mlp": 1.07090902, + "epoch": 0.01118258882943546, + "flos": 16071210864000.0, + "grad_norm": 2.3527366811273853, + "language_loss": 0.96212232, + "learning_rate": 3.2836233457634622e-06, + "loss": 0.98999691, + "num_input_tokens_seen": 2041895, + "step": 93, + "time_per_iteration": 2.5673742294311523 + }, + { + "auxiliary_loss_clip": 0.01638822, + "auxiliary_loss_mlp": 0.01174562, + "balance_loss_clip": 1.22155738, + "balance_loss_mlp": 1.08505964, + "epoch": 0.011302831720074551, + "flos": 20668602458880.0, + "grad_norm": 2.198391126988277, + "language_loss": 0.85368729, + "learning_rate": 3.2913715046481135e-06, + "loss": 0.88182116, + "num_input_tokens_seen": 2061640, + "step": 94, + "time_per_iteration": 2.5757038593292236 + }, + { + "auxiliary_loss_clip": 0.01635234, + "auxiliary_loss_mlp": 0.01154606, + "balance_loss_clip": 1.21978712, + "balance_loss_mlp": 1.07936168, + "epoch": 0.011423074610713641, + "flos": 13072238490240.0, + "grad_norm": 3.308346582378893, + "language_loss": 0.88695258, + "learning_rate": 3.299037670895023e-06, + "loss": 0.91485101, + "num_input_tokens_seen": 2078255, + "step": 95, + "time_per_iteration": 2.553851842880249 + }, + { + "auxiliary_loss_clip": 0.01634962, + "auxiliary_loss_mlp": 0.01141578, + "balance_loss_clip": 1.22435939, + "balance_loss_mlp": 1.06423545, + "epoch": 0.011543317501352733, + "flos": 30335646689280.0, + "grad_norm": 2.106724821654539, + "language_loss": 0.80220473, + "learning_rate": 3.3066235616750667e-06, + "loss": 0.82997006, + "num_input_tokens_seen": 2099490, + "step": 96, + "time_per_iteration": 3.4257164001464844 + }, + { + "auxiliary_loss_clip": 0.0161484, + "auxiliary_loss_mlp": 0.01136486, + "balance_loss_clip": 1.21167731, + "balance_loss_mlp": 1.0634346, + "epoch": 0.011663560391991824, + "flos": 15522962601600.0, + "grad_norm": 2.1457459814915776, + "language_loss": 0.92438602, + "learning_rate": 3.3141308407736276e-06, + "loss": 0.95189929, + "num_input_tokens_seen": 2116125, + "step": 97, + "time_per_iteration": 4.154680967330933 + }, + { + "auxiliary_loss_clip": 0.01619825, + "auxiliary_loss_mlp": 0.01143809, + "balance_loss_clip": 1.2090559, + "balance_loss_mlp": 1.07180691, + "epoch": 0.011783803282630914, + "flos": 19902125116800.0, + "grad_norm": 3.4769828245807184, + "language_loss": 0.86598301, + "learning_rate": 3.321561120780869e-06, + "loss": 0.89361936, + "num_input_tokens_seen": 2134835, + "step": 98, + "time_per_iteration": 2.571636199951172 + }, + { + "auxiliary_loss_clip": 0.01609456, + "auxiliary_loss_mlp": 0.01137591, + "balance_loss_clip": 1.21195722, + "balance_loss_mlp": 1.0735991, + "epoch": 0.011904046173270006, + "flos": 22340674517760.0, + "grad_norm": 3.1009522640666796, + "language_loss": 1.01495814, + "learning_rate": 3.3289159651708192e-06, + "loss": 1.04242861, + "num_input_tokens_seen": 2152410, + "step": 99, + "time_per_iteration": 2.5761711597442627 + }, + { + "auxiliary_loss_clip": 0.01606472, + "auxiliary_loss_mlp": 0.01134673, + "balance_loss_clip": 1.20822096, + "balance_loss_mlp": 1.06391037, + "epoch": 0.012024289063909096, + "flos": 19100060375040.0, + "grad_norm": 1.9097169616611607, + "language_loss": 0.97610283, + "learning_rate": 3.3361968902759768e-06, + "loss": 1.00351429, + "num_input_tokens_seen": 2172090, + "step": 100, + "time_per_iteration": 2.5848615169525146 + }, + { + "auxiliary_loss_clip": 0.0159926, + "auxiliary_loss_mlp": 0.01123522, + "balance_loss_clip": 1.2057656, + "balance_loss_mlp": 1.06167603, + "epoch": 0.012144531954548187, + "flos": 15012205159680.0, + "grad_norm": 2.302898922823281, + "language_loss": 0.93713981, + "learning_rate": 3.343405367163663e-06, + "loss": 0.96436763, + "num_input_tokens_seen": 2189020, + "step": 101, + "time_per_iteration": 2.569058895111084 + }, + { + "auxiliary_loss_clip": 0.01602041, + "auxiliary_loss_mlp": 0.01131222, + "balance_loss_clip": 1.20525932, + "balance_loss_mlp": 1.06727839, + "epoch": 0.012264774845187279, + "flos": 15122020014720.0, + "grad_norm": 2.5068547797992613, + "language_loss": 0.80971867, + "learning_rate": 3.350542823419951e-06, + "loss": 0.83705133, + "num_input_tokens_seen": 2205620, + "step": 102, + "time_per_iteration": 2.529803991317749 + }, + { + "auxiliary_loss_clip": 0.01595624, + "auxiliary_loss_mlp": 0.01146278, + "balance_loss_clip": 1.19933033, + "balance_loss_mlp": 1.08285892, + "epoch": 0.012385017735826368, + "flos": 13949248959360.0, + "grad_norm": 4.232325771702045, + "language_loss": 0.87617886, + "learning_rate": 3.3576106448465615e-06, + "loss": 0.90359789, + "num_input_tokens_seen": 2219000, + "step": 103, + "time_per_iteration": 2.5215837955474854 + }, + { + "auxiliary_loss_clip": 0.01584435, + "auxiliary_loss_mlp": 0.01132815, + "balance_loss_clip": 1.19702041, + "balance_loss_mlp": 1.06870461, + "epoch": 0.01250526062646546, + "flos": 23623260428160.0, + "grad_norm": 2.0906451048764323, + "language_loss": 0.88140273, + "learning_rate": 3.3646101770757797e-06, + "loss": 0.90857518, + "num_input_tokens_seen": 2237790, + "step": 104, + "time_per_iteration": 2.5895588397979736 + }, + { + "auxiliary_loss_clip": 0.0157778, + "auxiliary_loss_mlp": 0.01132769, + "balance_loss_clip": 1.19366133, + "balance_loss_mlp": 1.06520116, + "epoch": 0.012625503517104552, + "flos": 34640078958720.0, + "grad_norm": 1.8691833116480119, + "language_loss": 0.85582542, + "learning_rate": 3.371542727108104e-06, + "loss": 0.88293093, + "num_input_tokens_seen": 2259965, + "step": 105, + "time_per_iteration": 2.7099413871765137 + }, + { + "auxiliary_loss_clip": 0.01579799, + "auxiliary_loss_mlp": 0.01173558, + "balance_loss_clip": 1.19605803, + "balance_loss_mlp": 1.10846996, + "epoch": 0.012745746407743641, + "flos": 17821891837440.0, + "grad_norm": 2.2430939595900643, + "language_loss": 0.89907062, + "learning_rate": 3.3784095647770114e-06, + "loss": 0.92660415, + "num_input_tokens_seen": 2278610, + "step": 106, + "time_per_iteration": 2.5534234046936035 + }, + { + "auxiliary_loss_clip": 0.01570918, + "auxiliary_loss_mlp": 0.01135381, + "balance_loss_clip": 1.18696356, + "balance_loss_mlp": 1.07091224, + "epoch": 0.012865989298382733, + "flos": 20595057361920.0, + "grad_norm": 1.955238009514176, + "language_loss": 0.88626301, + "learning_rate": 3.3852119241449547e-06, + "loss": 0.91332608, + "num_input_tokens_seen": 2297730, + "step": 107, + "time_per_iteration": 2.6147494316101074 + }, + { + "auxiliary_loss_clip": 0.01566498, + "auxiliary_loss_mlp": 0.01123507, + "balance_loss_clip": 1.18676448, + "balance_loss_mlp": 1.06194758, + "epoch": 0.012986232189021825, + "flos": 23948969978880.0, + "grad_norm": 3.4916264609340684, + "language_loss": 0.9632901, + "learning_rate": 3.3919510048344295e-06, + "loss": 0.99019015, + "num_input_tokens_seen": 2315740, + "step": 108, + "time_per_iteration": 2.5744357109069824 + }, + { + "auxiliary_loss_clip": 0.01556225, + "auxiliary_loss_mlp": 0.01127044, + "balance_loss_clip": 1.18250084, + "balance_loss_mlp": 1.07044399, + "epoch": 0.013106475079660914, + "flos": 23725425686400.0, + "grad_norm": 1.9626343219373188, + "language_loss": 0.86696678, + "learning_rate": 3.3986279732976907e-06, + "loss": 0.89379942, + "num_input_tokens_seen": 2334215, + "step": 109, + "time_per_iteration": 2.589395761489868 + }, + { + "auxiliary_loss_clip": 0.01551019, + "auxiliary_loss_mlp": 0.0110544, + "balance_loss_clip": 1.17968082, + "balance_loss_mlp": 1.05019867, + "epoch": 0.013226717970300006, + "flos": 21102438925440.0, + "grad_norm": 2.029088923413398, + "language_loss": 0.95304513, + "learning_rate": 3.4052439640284983e-06, + "loss": 0.97960973, + "num_input_tokens_seen": 2353130, + "step": 110, + "time_per_iteration": 2.558588743209839 + }, + { + "auxiliary_loss_clip": 0.01552122, + "auxiliary_loss_mlp": 0.01124658, + "balance_loss_clip": 1.18285251, + "balance_loss_mlp": 1.0672226, + "epoch": 0.013346960860939098, + "flos": 24863902231680.0, + "grad_norm": 1.947597124451499, + "language_loss": 0.81146818, + "learning_rate": 3.4118000807190217e-06, + "loss": 0.83823597, + "num_input_tokens_seen": 2374010, + "step": 111, + "time_per_iteration": 2.6177725791931152 + }, + { + "auxiliary_loss_clip": 0.01555606, + "auxiliary_loss_mlp": 0.01123984, + "balance_loss_clip": 1.18171406, + "balance_loss_mlp": 1.06848025, + "epoch": 0.013467203751578187, + "flos": 28181940140160.0, + "grad_norm": 2.0527272613748146, + "language_loss": 0.76018333, + "learning_rate": 3.4182973973648723e-06, + "loss": 0.7869792, + "num_input_tokens_seen": 2395220, + "step": 112, + "time_per_iteration": 2.6187939643859863 + }, + { + "auxiliary_loss_clip": 0.01543172, + "auxiliary_loss_mlp": 0.01143209, + "balance_loss_clip": 1.17763972, + "balance_loss_mlp": 1.08801532, + "epoch": 0.013587446642217279, + "flos": 18916233546240.0, + "grad_norm": 2.63957338676087, + "language_loss": 0.95100272, + "learning_rate": 3.424736959321014e-06, + "loss": 0.97786653, + "num_input_tokens_seen": 2413025, + "step": 113, + "time_per_iteration": 2.5657899379730225 + }, + { + "auxiliary_loss_clip": 0.01546462, + "auxiliary_loss_mlp": 0.01136929, + "balance_loss_clip": 1.17783368, + "balance_loss_mlp": 1.08028102, + "epoch": 0.01370768953285637, + "flos": 23988615615360.0, + "grad_norm": 2.0829172239638205, + "language_loss": 0.88596255, + "learning_rate": 3.431119784311155e-06, + "loss": 0.9127965, + "num_input_tokens_seen": 2432700, + "step": 114, + "time_per_iteration": 2.585623025894165 + }, + { + "auxiliary_loss_clip": 0.01532785, + "auxiliary_loss_mlp": 0.01122465, + "balance_loss_clip": 1.17347753, + "balance_loss_mlp": 1.07010818, + "epoch": 0.01382793242349546, + "flos": 39202565512320.0, + "grad_norm": 1.7419485992895507, + "language_loss": 0.77649581, + "learning_rate": 3.43744686339307e-06, + "loss": 0.80304837, + "num_input_tokens_seen": 2455020, + "step": 115, + "time_per_iteration": 2.7040534019470215 + }, + { + "auxiliary_loss_clip": 0.01528337, + "auxiliary_loss_mlp": 0.0108936, + "balance_loss_clip": 1.16759372, + "balance_loss_mlp": 1.04048419, + "epoch": 0.013948175314134552, + "flos": 41353506714240.0, + "grad_norm": 2.479831592446327, + "language_loss": 0.90831041, + "learning_rate": 3.44371916188212e-06, + "loss": 0.9344874, + "num_input_tokens_seen": 2475775, + "step": 116, + "time_per_iteration": 2.725522994995117 + }, + { + "auxiliary_loss_clip": 0.01524439, + "auxiliary_loss_mlp": 0.01105922, + "balance_loss_clip": 1.16801572, + "balance_loss_mlp": 1.0591917, + "epoch": 0.014068418204773643, + "flos": 22453542028800.0, + "grad_norm": 2.178544666561105, + "language_loss": 0.86177218, + "learning_rate": 3.449937620235143e-06, + "loss": 0.88807583, + "num_input_tokens_seen": 2496370, + "step": 117, + "time_per_iteration": 2.557469367980957 + }, + { + "auxiliary_loss_clip": 0.01526513, + "auxiliary_loss_mlp": 0.01108986, + "balance_loss_clip": 1.1692493, + "balance_loss_mlp": 1.05996704, + "epoch": 0.014188661095412733, + "flos": 23805147922560.0, + "grad_norm": 1.7787093196507968, + "language_loss": 0.89567256, + "learning_rate": 3.456103154896722e-06, + "loss": 0.92202759, + "num_input_tokens_seen": 2517645, + "step": 118, + "time_per_iteration": 2.5810985565185547 + }, + { + "auxiliary_loss_clip": 0.01513456, + "auxiliary_loss_mlp": 0.01118267, + "balance_loss_clip": 1.16169643, + "balance_loss_mlp": 1.07153726, + "epoch": 0.014308903986051825, + "flos": 23660248458240.0, + "grad_norm": 1.9652364782085645, + "language_loss": 0.9237023, + "learning_rate": 3.462216659109757e-06, + "loss": 0.95001954, + "num_input_tokens_seen": 2537825, + "step": 119, + "time_per_iteration": 2.5864460468292236 + }, + { + "auxiliary_loss_clip": 0.01532223, + "auxiliary_loss_mlp": 0.0112598, + "balance_loss_clip": 1.17054057, + "balance_loss_mlp": 1.07860649, + "epoch": 0.014429146876690916, + "flos": 20667991927680.0, + "grad_norm": 2.8809150514322264, + "language_loss": 0.85053694, + "learning_rate": 3.4682790036921077e-06, + "loss": 0.87711895, + "num_input_tokens_seen": 2556485, + "step": 120, + "time_per_iteration": 2.5462751388549805 + }, + { + "auxiliary_loss_clip": 0.01508496, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_clip": 1.16297102, + "balance_loss_mlp": 1.06280136, + "epoch": 0.014549389767330006, + "flos": 20229199384320.0, + "grad_norm": 1.8946942663062525, + "language_loss": 0.82924461, + "learning_rate": 3.4742910377810193e-06, + "loss": 0.85535026, + "num_input_tokens_seen": 2573945, + "step": 121, + "time_per_iteration": 2.6077053546905518 + }, + { + "auxiliary_loss_clip": 0.01507048, + "auxiliary_loss_mlp": 0.01113262, + "balance_loss_clip": 1.16140771, + "balance_loss_mlp": 1.07125258, + "epoch": 0.014669632657969098, + "flos": 18004174381440.0, + "grad_norm": 2.3511690066118107, + "language_loss": 0.88603497, + "learning_rate": 3.4802535895469042e-06, + "loss": 0.912238, + "num_input_tokens_seen": 2592695, + "step": 122, + "time_per_iteration": 2.5273358821868896 + }, + { + "auxiliary_loss_clip": 0.01509293, + "auxiliary_loss_mlp": 0.01106476, + "balance_loss_clip": 1.16028714, + "balance_loss_mlp": 1.06391859, + "epoch": 0.01478987554860819, + "flos": 22741796672640.0, + "grad_norm": 2.143852045472221, + "language_loss": 0.8964687, + "learning_rate": 3.4861674668779934e-06, + "loss": 0.92262638, + "num_input_tokens_seen": 2610925, + "step": 123, + "time_per_iteration": 3.3465163707733154 + }, + { + "auxiliary_loss_clip": 0.01501227, + "auxiliary_loss_mlp": 0.01103596, + "balance_loss_clip": 1.15584016, + "balance_loss_mlp": 1.05975103, + "epoch": 0.01491011843924728, + "flos": 17198590106880.0, + "grad_norm": 4.492649030481515, + "language_loss": 0.84242064, + "learning_rate": 3.492033458037272e-06, + "loss": 0.86846888, + "num_input_tokens_seen": 2629495, + "step": 124, + "time_per_iteration": 4.057400226593018 + }, + { + "auxiliary_loss_clip": 0.01497071, + "auxiliary_loss_mlp": 0.01103023, + "balance_loss_clip": 1.15339327, + "balance_loss_mlp": 1.06373191, + "epoch": 0.01503036132988637, + "flos": 17673867889920.0, + "grad_norm": 2.7690553558833346, + "language_loss": 0.87096393, + "learning_rate": 3.497852332293018e-06, + "loss": 0.89696491, + "num_input_tokens_seen": 2645070, + "step": 125, + "time_per_iteration": 3.3263142108917236 + }, + { + "auxiliary_loss_clip": 0.01495533, + "auxiliary_loss_mlp": 0.01107749, + "balance_loss_clip": 1.15484738, + "balance_loss_mlp": 1.0697211, + "epoch": 0.015150604220525462, + "flos": 18878239935360.0, + "grad_norm": 5.076934354947765, + "language_loss": 0.96391708, + "learning_rate": 3.5036248405242356e-06, + "loss": 0.98994994, + "num_input_tokens_seen": 2663825, + "step": 126, + "time_per_iteration": 2.512669563293457 + }, + { + "auxiliary_loss_clip": 0.01496993, + "auxiliary_loss_mlp": 0.01104651, + "balance_loss_clip": 1.15406203, + "balance_loss_mlp": 1.0626651, + "epoch": 0.015270847111164552, + "flos": 39420184060800.0, + "grad_norm": 2.1644598072395427, + "language_loss": 0.826913, + "learning_rate": 3.509351715802146e-06, + "loss": 0.85292947, + "num_input_tokens_seen": 2684710, + "step": 127, + "time_per_iteration": 2.7218220233917236 + }, + { + "auxiliary_loss_clip": 0.01494846, + "auxiliary_loss_mlp": 0.01118222, + "balance_loss_clip": 1.1521256, + "balance_loss_mlp": 1.07523537, + "epoch": 0.015391090001803644, + "flos": 43762466286720.0, + "grad_norm": 2.0322822328187238, + "language_loss": 0.78316963, + "learning_rate": 3.5150336739488763e-06, + "loss": 0.8093003, + "num_input_tokens_seen": 2706995, + "step": 128, + "time_per_iteration": 2.7172088623046875 + }, + { + "auxiliary_loss_clip": 0.01491146, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_clip": 1.15286744, + "balance_loss_mlp": 1.04973662, + "epoch": 0.015511332892442733, + "flos": 18916341287040.0, + "grad_norm": 2.0983978847198923, + "language_loss": 0.84055287, + "learning_rate": 3.5206714140744143e-06, + "loss": 0.86632049, + "num_input_tokens_seen": 2727050, + "step": 129, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.01493265, + "auxiliary_loss_mlp": 0.01111721, + "balance_loss_clip": 1.1550709, + "balance_loss_mlp": 1.07314491, + "epoch": 0.015631575783081827, + "flos": 24535283679360.0, + "grad_norm": 2.9974329057316584, + "language_loss": 0.87674659, + "learning_rate": 3.5262656190928208e-06, + "loss": 0.90279651, + "num_input_tokens_seen": 2745350, + "step": 130, + "time_per_iteration": 2.5502102375030518 + }, + { + "auxiliary_loss_clip": 0.01442274, + "auxiliary_loss_mlp": 0.01048806, + "balance_loss_clip": 1.16859031, + "balance_loss_mlp": 1.03173578, + "epoch": 0.015751818673720917, + "flos": 62328536098560.0, + "grad_norm": 1.051767882664357, + "language_loss": 0.71528602, + "learning_rate": 3.5318169562186737e-06, + "loss": 0.74019682, + "num_input_tokens_seen": 2814195, + "step": 131, + "time_per_iteration": 3.1636390686035156 + }, + { + "auxiliary_loss_clip": 0.01480426, + "auxiliary_loss_mlp": 0.01117686, + "balance_loss_clip": 1.14843965, + "balance_loss_mlp": 1.08161306, + "epoch": 0.015872061564360006, + "flos": 23878549365120.0, + "grad_norm": 2.1447394178867456, + "language_loss": 0.82098413, + "learning_rate": 3.5373260774446292e-06, + "loss": 0.84696525, + "num_input_tokens_seen": 2834645, + "step": 132, + "time_per_iteration": 2.5613608360290527 + }, + { + "auxiliary_loss_clip": 0.01478594, + "auxiliary_loss_mlp": 0.01109167, + "balance_loss_clip": 1.1474092, + "balance_loss_mlp": 1.07285607, + "epoch": 0.0159923044549991, + "flos": 23367899664000.0, + "grad_norm": 1.8482094297772338, + "language_loss": 0.90164125, + "learning_rate": 3.542793620000961e-06, + "loss": 0.92751884, + "num_input_tokens_seen": 2854120, + "step": 133, + "time_per_iteration": 2.557941436767578 + }, + { + "auxiliary_loss_clip": 0.01475577, + "auxiliary_loss_mlp": 0.01100854, + "balance_loss_clip": 1.14599049, + "balance_loss_mlp": 1.0638994, + "epoch": 0.01611254734563819, + "flos": 17858305249920.0, + "grad_norm": 2.4363006501052733, + "language_loss": 0.86906779, + "learning_rate": 3.5482202067978894e-06, + "loss": 0.89483207, + "num_input_tokens_seen": 2871330, + "step": 134, + "time_per_iteration": 2.521550416946411 + }, + { + "auxiliary_loss_clip": 0.01474334, + "auxiliary_loss_mlp": 0.01096474, + "balance_loss_clip": 1.14693415, + "balance_loss_mlp": 1.06025863, + "epoch": 0.01623279023627728, + "flos": 20954774113920.0, + "grad_norm": 2.4750540491373028, + "language_loss": 0.76133597, + "learning_rate": 3.553606446851471e-06, + "loss": 0.78704405, + "num_input_tokens_seen": 2888070, + "step": 135, + "time_per_iteration": 2.5556414127349854 + }, + { + "auxiliary_loss_clip": 0.01461536, + "auxiliary_loss_mlp": 0.01092203, + "balance_loss_clip": 1.13896465, + "balance_loss_mlp": 1.05701256, + "epoch": 0.016353033126916373, + "flos": 15742412743680.0, + "grad_norm": 2.928625393507346, + "language_loss": 0.83242512, + "learning_rate": 3.5589529356937613e-06, + "loss": 0.85796255, + "num_input_tokens_seen": 2906465, + "step": 136, + "time_per_iteration": 2.535094738006592 + }, + { + "auxiliary_loss_clip": 0.01470212, + "auxiliary_loss_mlp": 0.01096274, + "balance_loss_clip": 1.14211655, + "balance_loss_mlp": 1.06091619, + "epoch": 0.016473276017555463, + "flos": 18807280617600.0, + "grad_norm": 1.747330886608657, + "language_loss": 0.76972306, + "learning_rate": 3.5642602557679627e-06, + "loss": 0.79538798, + "num_input_tokens_seen": 2924915, + "step": 137, + "time_per_iteration": 2.543062686920166 + }, + { + "auxiliary_loss_clip": 0.01465126, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_clip": 1.14812791, + "balance_loss_mlp": 1.05671275, + "epoch": 0.016593518908194552, + "flos": 24352641999360.0, + "grad_norm": 1.9439834585816407, + "language_loss": 0.84278202, + "learning_rate": 3.569528976809202e-06, + "loss": 0.86829925, + "num_input_tokens_seen": 2942130, + "step": 138, + "time_per_iteration": 2.5556271076202393 + }, + { + "auxiliary_loss_clip": 0.0146564, + "auxiliary_loss_mlp": 0.01105538, + "balance_loss_clip": 1.14256382, + "balance_loss_mlp": 1.06938148, + "epoch": 0.016713761798833646, + "flos": 22346133384960.0, + "grad_norm": 1.7401386078919736, + "language_loss": 0.899616, + "learning_rate": 3.5747596562115522e-06, + "loss": 0.92532778, + "num_input_tokens_seen": 2962745, + "step": 139, + "time_per_iteration": 2.5523946285247803 + }, + { + "auxiliary_loss_clip": 0.01470027, + "auxiliary_loss_mlp": 0.01100337, + "balance_loss_clip": 1.14435399, + "balance_loss_mlp": 1.06621969, + "epoch": 0.016834004689472735, + "flos": 17821820010240.0, + "grad_norm": 2.462543700498006, + "language_loss": 0.90845287, + "learning_rate": 3.5799528393819138e-06, + "loss": 0.9341566, + "num_input_tokens_seen": 2981825, + "step": 140, + "time_per_iteration": 2.5202603340148926 + }, + { + "auxiliary_loss_clip": 0.01452626, + "auxiliary_loss_mlp": 0.0109801, + "balance_loss_clip": 1.13499069, + "balance_loss_mlp": 1.06584692, + "epoch": 0.016954247580111825, + "flos": 20519501103360.0, + "grad_norm": 1.868405311732462, + "language_loss": 0.88004827, + "learning_rate": 3.585109060081286e-06, + "loss": 0.90555459, + "num_input_tokens_seen": 3001625, + "step": 141, + "time_per_iteration": 2.5441956520080566 + }, + { + "auxiliary_loss_clip": 0.01458807, + "auxiliary_loss_mlp": 0.01095376, + "balance_loss_clip": 1.1388948, + "balance_loss_mlp": 1.06314182, + "epoch": 0.017074490470750915, + "flos": 22088869200000.0, + "grad_norm": 1.7452507799229682, + "language_loss": 0.78435934, + "learning_rate": 3.590228840753992e-06, + "loss": 0.80990124, + "num_input_tokens_seen": 3022055, + "step": 142, + "time_per_iteration": 2.544304847717285 + }, + { + "auxiliary_loss_clip": 0.01450217, + "auxiliary_loss_mlp": 0.01099896, + "balance_loss_clip": 1.13637674, + "balance_loss_mlp": 1.06947398, + "epoch": 0.01719473336139001, + "flos": 15997270717440.0, + "grad_norm": 2.4882814821347368, + "language_loss": 0.87446427, + "learning_rate": 3.5953126928453423e-06, + "loss": 0.89996541, + "num_input_tokens_seen": 3039605, + "step": 143, + "time_per_iteration": 2.5124332904815674 + }, + { + "auxiliary_loss_clip": 0.01446738, + "auxiliary_loss_mlp": 0.0108296, + "balance_loss_clip": 1.13231742, + "balance_loss_mlp": 1.05351567, + "epoch": 0.017314976252029098, + "flos": 22492038430080.0, + "grad_norm": 2.2052251191081726, + "language_loss": 0.80593991, + "learning_rate": 3.600361117108239e-06, + "loss": 0.8312369, + "num_input_tokens_seen": 3059405, + "step": 144, + "time_per_iteration": 2.6201119422912598 + }, + { + "auxiliary_loss_clip": 0.01452068, + "auxiliary_loss_mlp": 0.0108743, + "balance_loss_clip": 1.13415515, + "balance_loss_mlp": 1.05631661, + "epoch": 0.017435219142668188, + "flos": 22018053536640.0, + "grad_norm": 1.989866662987648, + "language_loss": 0.97189891, + "learning_rate": 3.6053746038991616e-06, + "loss": 0.99729383, + "num_input_tokens_seen": 3078490, + "step": 145, + "time_per_iteration": 2.5484704971313477 + }, + { + "auxiliary_loss_clip": 0.01405522, + "auxiliary_loss_mlp": 0.01021133, + "balance_loss_clip": 1.15632319, + "balance_loss_mlp": 1.00916469, + "epoch": 0.01755546203330728, + "flos": 72240526149120.0, + "grad_norm": 1.0586301608358775, + "language_loss": 0.58395016, + "learning_rate": 3.6103536334639843e-06, + "loss": 0.6082167, + "num_input_tokens_seen": 3131755, + "step": 146, + "time_per_iteration": 3.102198362350464 + }, + { + "auxiliary_loss_clip": 0.01441734, + "auxiliary_loss_mlp": 0.0108181, + "balance_loss_clip": 1.13129234, + "balance_loss_mlp": 1.05302119, + "epoch": 0.01767570492394637, + "flos": 25337061112320.0, + "grad_norm": 2.4421924424549575, + "language_loss": 0.85631436, + "learning_rate": 3.615298676214041e-06, + "loss": 0.88154984, + "num_input_tokens_seen": 3152035, + "step": 147, + "time_per_iteration": 2.573974370956421 + }, + { + "auxiliary_loss_clip": 0.01438709, + "auxiliary_loss_mlp": 0.01094995, + "balance_loss_clip": 1.1285032, + "balance_loss_mlp": 1.06625366, + "epoch": 0.01779594781458546, + "flos": 20449188230400.0, + "grad_norm": 2.1356795512314344, + "language_loss": 0.8868205, + "learning_rate": 3.6202101929928317e-06, + "loss": 0.91215754, + "num_input_tokens_seen": 3170625, + "step": 148, + "time_per_iteration": 2.540816307067871 + }, + { + "auxiliary_loss_clip": 0.01433388, + "auxiliary_loss_mlp": 0.01091953, + "balance_loss_clip": 1.1267786, + "balance_loss_mlp": 1.06434381, + "epoch": 0.017916190705224554, + "flos": 16253601148800.0, + "grad_norm": 2.337250624950447, + "language_loss": 0.88306475, + "learning_rate": 3.6250886353337413e-06, + "loss": 0.9083181, + "num_input_tokens_seen": 3188155, + "step": 149, + "time_per_iteration": 2.5211598873138428 + }, + { + "auxiliary_loss_clip": 0.01446542, + "auxiliary_loss_mlp": 0.01095693, + "balance_loss_clip": 1.13409865, + "balance_loss_mlp": 1.06825066, + "epoch": 0.018036433595863644, + "flos": 23330588411520.0, + "grad_norm": 1.9955333400507853, + "language_loss": 0.86260021, + "learning_rate": 3.6299344457091488e-06, + "loss": 0.88802254, + "num_input_tokens_seen": 3209015, + "step": 150, + "time_per_iteration": 3.3218226432800293 + }, + { + "auxiliary_loss_clip": 0.01440034, + "auxiliary_loss_mlp": 0.01086933, + "balance_loss_clip": 1.13143778, + "balance_loss_mlp": 1.06036091, + "epoch": 0.018156676486502734, + "flos": 18588010043520.0, + "grad_norm": 3.846093198631413, + "language_loss": 0.9381001, + "learning_rate": 3.634748057771256e-06, + "loss": 0.96336979, + "num_input_tokens_seen": 3224955, + "step": 151, + "time_per_iteration": 4.10172700881958 + }, + { + "auxiliary_loss_clip": 0.0143272, + "auxiliary_loss_mlp": 0.01087673, + "balance_loss_clip": 1.12906146, + "balance_loss_mlp": 1.06133997, + "epoch": 0.018276919377141827, + "flos": 25448707560960.0, + "grad_norm": 1.6840890794972185, + "language_loss": 0.85633469, + "learning_rate": 3.639529896584965e-06, + "loss": 0.88153863, + "num_input_tokens_seen": 3246330, + "step": 152, + "time_per_iteration": 2.5600790977478027 + }, + { + "auxiliary_loss_clip": 0.01434353, + "auxiliary_loss_mlp": 0.01079964, + "balance_loss_clip": 1.1283679, + "balance_loss_mlp": 1.05224836, + "epoch": 0.018397162267780917, + "flos": 20047311889920.0, + "grad_norm": 3.100628410585477, + "language_loss": 0.8880167, + "learning_rate": 3.6442803788531233e-06, + "loss": 0.91315985, + "num_input_tokens_seen": 3264290, + "step": 153, + "time_per_iteration": 2.4990718364715576 + }, + { + "auxiliary_loss_clip": 0.01437747, + "auxiliary_loss_mlp": 0.01090818, + "balance_loss_clip": 1.12873864, + "balance_loss_mlp": 1.06202853, + "epoch": 0.018517405158420007, + "flos": 27565282425600.0, + "grad_norm": 1.9896706519562288, + "language_loss": 0.95998096, + "learning_rate": 3.6489999131344357e-06, + "loss": 0.98526657, + "num_input_tokens_seen": 3287065, + "step": 154, + "time_per_iteration": 2.5866172313690186 + }, + { + "auxiliary_loss_clip": 0.01425608, + "auxiliary_loss_mlp": 0.01084961, + "balance_loss_clip": 1.12498331, + "balance_loss_mlp": 1.06021333, + "epoch": 0.0186376480490591, + "flos": 19354056422400.0, + "grad_norm": 1.9462282157906112, + "language_loss": 0.90695214, + "learning_rate": 3.653688900054313e-06, + "loss": 0.93205786, + "num_input_tokens_seen": 3305595, + "step": 155, + "time_per_iteration": 2.5321578979492188 + }, + { + "auxiliary_loss_clip": 0.0142819, + "auxiliary_loss_mlp": 0.01070836, + "balance_loss_clip": 1.12252116, + "balance_loss_mlp": 1.04443121, + "epoch": 0.01875789093969819, + "flos": 26687840993280.0, + "grad_norm": 2.4854410981356283, + "language_loss": 0.76075947, + "learning_rate": 3.6583477325089526e-06, + "loss": 0.78574973, + "num_input_tokens_seen": 3326135, + "step": 156, + "time_per_iteration": 2.587226152420044 + }, + { + "auxiliary_loss_clip": 0.01423365, + "auxiliary_loss_mlp": 0.01078624, + "balance_loss_clip": 1.12208652, + "balance_loss_mlp": 1.05281568, + "epoch": 0.01887813383033728, + "flos": 24353001135360.0, + "grad_norm": 2.1823199109729634, + "language_loss": 1.04159069, + "learning_rate": 3.6629767958628916e-06, + "loss": 1.06661057, + "num_input_tokens_seen": 3343510, + "step": 157, + "time_per_iteration": 2.5517029762268066 + }, + { + "auxiliary_loss_clip": 0.01420099, + "auxiliary_loss_mlp": 0.01078252, + "balance_loss_clip": 1.12351322, + "balance_loss_mlp": 1.05207324, + "epoch": 0.018998376720976373, + "flos": 14647532330880.0, + "grad_norm": 2.650914691546543, + "language_loss": 0.8559233, + "learning_rate": 3.667576468140291e-06, + "loss": 0.88090682, + "num_input_tokens_seen": 3361325, + "step": 158, + "time_per_iteration": 2.4924206733703613 + }, + { + "auxiliary_loss_clip": 0.0141424, + "auxiliary_loss_mlp": 0.01065997, + "balance_loss_clip": 1.11777139, + "balance_loss_mlp": 1.04188061, + "epoch": 0.019118619611615463, + "flos": 29305261146240.0, + "grad_norm": 3.0443549337388647, + "language_loss": 0.88940209, + "learning_rate": 3.672147120210184e-06, + "loss": 0.91420448, + "num_input_tokens_seen": 3377925, + "step": 159, + "time_per_iteration": 2.5959393978118896 + }, + { + "auxiliary_loss_clip": 0.01421356, + "auxiliary_loss_mlp": 0.01074298, + "balance_loss_clip": 1.12535644, + "balance_loss_mlp": 1.05074215, + "epoch": 0.019238862502254553, + "flos": 20886723797760.0, + "grad_norm": 2.0272908587336143, + "language_loss": 0.86431396, + "learning_rate": 3.6766891159659177e-06, + "loss": 0.88927042, + "num_input_tokens_seen": 3396335, + "step": 160, + "time_per_iteration": 2.51916241645813 + }, + { + "auxiliary_loss_clip": 0.0142213, + "auxiliary_loss_mlp": 0.01079834, + "balance_loss_clip": 1.12692046, + "balance_loss_mlp": 1.05592012, + "epoch": 0.019359105392893646, + "flos": 21360672777600.0, + "grad_norm": 3.4849225702478037, + "language_loss": 0.87722075, + "learning_rate": 3.6812028124990075e-06, + "loss": 0.9022404, + "num_input_tokens_seen": 3413605, + "step": 161, + "time_per_iteration": 2.5227601528167725 + }, + { + "auxiliary_loss_clip": 0.01415926, + "auxiliary_loss_mlp": 0.01078918, + "balance_loss_clip": 1.12217307, + "balance_loss_mlp": 1.05569565, + "epoch": 0.019479348283532736, + "flos": 16283729681280.0, + "grad_norm": 3.108502108828164, + "language_loss": 0.81496394, + "learning_rate": 3.6856885602676016e-06, + "loss": 0.83991247, + "num_input_tokens_seen": 3429640, + "step": 162, + "time_per_iteration": 2.4878756999969482 + }, + { + "auxiliary_loss_clip": 0.01415932, + "auxiliary_loss_mlp": 0.01081889, + "balance_loss_clip": 1.12264919, + "balance_loss_mlp": 1.05895281, + "epoch": 0.019599591174171826, + "flos": 22091239497600.0, + "grad_norm": 2.7520934861780146, + "language_loss": 0.94037157, + "learning_rate": 3.6901467032597733e-06, + "loss": 0.96534973, + "num_input_tokens_seen": 3448125, + "step": 163, + "time_per_iteration": 2.556347131729126 + }, + { + "auxiliary_loss_clip": 0.01418126, + "auxiliary_loss_mlp": 0.01067418, + "balance_loss_clip": 1.12142372, + "balance_loss_mlp": 1.04216945, + "epoch": 0.01971983406481092, + "flos": 19609668581760.0, + "grad_norm": 2.786632746894377, + "language_loss": 0.87170005, + "learning_rate": 3.694577579151804e-06, + "loss": 0.89655548, + "num_input_tokens_seen": 3466535, + "step": 164, + "time_per_iteration": 2.5096168518066406 + }, + { + "auxiliary_loss_clip": 0.01418073, + "auxiliary_loss_mlp": 0.01075373, + "balance_loss_clip": 1.12321091, + "balance_loss_mlp": 1.05116224, + "epoch": 0.01984007695545001, + "flos": 19099342103040.0, + "grad_norm": 2.2271091461607355, + "language_loss": 0.73455423, + "learning_rate": 3.6989815194616703e-06, + "loss": 0.7594887, + "num_input_tokens_seen": 3483730, + "step": 165, + "time_per_iteration": 2.54156231880188 + }, + { + "auxiliary_loss_clip": 0.01417512, + "auxiliary_loss_mlp": 0.01079657, + "balance_loss_clip": 1.11948693, + "balance_loss_mlp": 1.05363393, + "epoch": 0.0199603198460891, + "flos": 20848406964480.0, + "grad_norm": 2.1542935184906744, + "language_loss": 0.79787493, + "learning_rate": 3.703358849697888e-06, + "loss": 0.82284659, + "num_input_tokens_seen": 3503640, + "step": 166, + "time_per_iteration": 2.5200016498565674 + }, + { + "auxiliary_loss_clip": 0.01412842, + "auxiliary_loss_mlp": 0.01089529, + "balance_loss_clip": 1.12190962, + "balance_loss_mlp": 1.06717765, + "epoch": 0.020080562736728192, + "flos": 21870747861120.0, + "grad_norm": 1.9537558893015845, + "language_loss": 0.82575959, + "learning_rate": 3.7077098895038803e-06, + "loss": 0.85078329, + "num_input_tokens_seen": 3523010, + "step": 167, + "time_per_iteration": 2.5394539833068848 + }, + { + "auxiliary_loss_clip": 0.01412159, + "auxiliary_loss_mlp": 0.0107532, + "balance_loss_clip": 1.12004828, + "balance_loss_mlp": 1.05213404, + "epoch": 0.020200805627367282, + "flos": 21688788539520.0, + "grad_norm": 3.1443403830411465, + "language_loss": 0.96840394, + "learning_rate": 3.712034952798045e-06, + "loss": 0.99327868, + "num_input_tokens_seen": 3541125, + "step": 168, + "time_per_iteration": 2.5210137367248535 + }, + { + "auxiliary_loss_clip": 0.01408444, + "auxiliary_loss_mlp": 0.01081409, + "balance_loss_clip": 1.1151979, + "balance_loss_mlp": 1.05779326, + "epoch": 0.02032104851800637, + "flos": 33543043729920.0, + "grad_norm": 2.210201297573146, + "language_loss": 0.84755468, + "learning_rate": 3.7163343479096656e-06, + "loss": 0.87245321, + "num_input_tokens_seen": 3562700, + "step": 169, + "time_per_iteration": 2.6174745559692383 + }, + { + "auxiliary_loss_clip": 0.01405874, + "auxiliary_loss_mlp": 0.01075655, + "balance_loss_clip": 1.11800385, + "balance_loss_mlp": 1.05456734, + "epoch": 0.020441291408645465, + "flos": 31686965274240.0, + "grad_norm": 2.2677587444031584, + "language_loss": 0.82777423, + "learning_rate": 3.720608377710802e-06, + "loss": 0.85258955, + "num_input_tokens_seen": 3582790, + "step": 170, + "time_per_iteration": 2.590531587600708 + }, + { + "auxiliary_loss_clip": 0.01399944, + "auxiliary_loss_mlp": 0.01085831, + "balance_loss_clip": 1.11225176, + "balance_loss_mlp": 1.06221545, + "epoch": 0.020561534299284555, + "flos": 20886687884160.0, + "grad_norm": 3.337326635033776, + "language_loss": 0.86277342, + "learning_rate": 3.7248573397443277e-06, + "loss": 0.88763124, + "num_input_tokens_seen": 3601715, + "step": 171, + "time_per_iteration": 2.517613649368286 + }, + { + "auxiliary_loss_clip": 0.01404999, + "auxiliary_loss_mlp": 0.0108844, + "balance_loss_clip": 1.11810768, + "balance_loss_mlp": 1.06407332, + "epoch": 0.020681777189923645, + "flos": 20996610480000.0, + "grad_norm": 4.195163194034303, + "language_loss": 0.97541755, + "learning_rate": 3.729081526348224e-06, + "loss": 1.00035203, + "num_input_tokens_seen": 3620245, + "step": 172, + "time_per_iteration": 2.513267755508423 + }, + { + "auxiliary_loss_clip": 0.01404137, + "auxiliary_loss_mlp": 0.01068282, + "balance_loss_clip": 1.11566329, + "balance_loss_mlp": 1.0468961, + "epoch": 0.020802020080562738, + "flos": 28257532312320.0, + "grad_norm": 1.9307049387591249, + "language_loss": 0.84838766, + "learning_rate": 3.7332812247762777e-06, + "loss": 0.87311184, + "num_input_tokens_seen": 3641545, + "step": 173, + "time_per_iteration": 2.574241876602173 + }, + { + "auxiliary_loss_clip": 0.01405323, + "auxiliary_loss_mlp": 0.01064652, + "balance_loss_clip": 1.1193924, + "balance_loss_mlp": 1.04284906, + "epoch": 0.020922262971201828, + "flos": 19681274344320.0, + "grad_norm": 2.4782733661407943, + "language_loss": 0.95535183, + "learning_rate": 3.737456717315293e-06, + "loss": 0.98005152, + "num_input_tokens_seen": 3660510, + "step": 174, + "time_per_iteration": 2.5038774013519287 + }, + { + "auxiliary_loss_clip": 0.01393938, + "auxiliary_loss_mlp": 0.01083499, + "balance_loss_clip": 1.1146754, + "balance_loss_mlp": 1.0617907, + "epoch": 0.021042505861840918, + "flos": 15666353694720.0, + "grad_norm": 1.9364473831361835, + "language_loss": 0.90586734, + "learning_rate": 3.7416082813989552e-06, + "loss": 0.93064165, + "num_input_tokens_seen": 3677505, + "step": 175, + "time_per_iteration": 2.4928431510925293 + }, + { + "auxiliary_loss_clip": 0.01402004, + "auxiliary_loss_mlp": 0.01077178, + "balance_loss_clip": 1.11617398, + "balance_loss_mlp": 1.05450416, + "epoch": 0.02116274875248001, + "flos": 21142012734720.0, + "grad_norm": 2.154984338665581, + "language_loss": 0.89056826, + "learning_rate": 3.745736189718439e-06, + "loss": 0.91536003, + "num_input_tokens_seen": 3696760, + "step": 176, + "time_per_iteration": 2.520490884780884 + }, + { + "auxiliary_loss_clip": 0.01393433, + "auxiliary_loss_mlp": 0.01066693, + "balance_loss_clip": 1.11218047, + "balance_loss_mlp": 1.04505706, + "epoch": 0.0212829916431191, + "flos": 24715770543360.0, + "grad_norm": 3.136151688918174, + "language_loss": 0.72756231, + "learning_rate": 3.749840710329894e-06, + "loss": 0.75216353, + "num_input_tokens_seen": 3717465, + "step": 177, + "time_per_iteration": 3.317242383956909 + }, + { + "auxiliary_loss_clip": 0.01404227, + "auxiliary_loss_mlp": 0.01085725, + "balance_loss_clip": 1.11585331, + "balance_loss_mlp": 1.06188297, + "epoch": 0.02140323453375819, + "flos": 16645493508480.0, + "grad_norm": 3.7314567014156776, + "language_loss": 0.97910494, + "learning_rate": 3.7539221067588938e-06, + "loss": 1.00400436, + "num_input_tokens_seen": 3731440, + "step": 178, + "time_per_iteration": 3.257619857788086 + }, + { + "auxiliary_loss_clip": 0.01399474, + "auxiliary_loss_mlp": 0.01080372, + "balance_loss_clip": 1.11387014, + "balance_loss_mlp": 1.05723333, + "epoch": 0.021523477424397284, + "flos": 20299332689280.0, + "grad_norm": 5.015503341745836, + "language_loss": 0.93318528, + "learning_rate": 3.757980638101964e-06, + "loss": 0.95798379, + "num_input_tokens_seen": 3744935, + "step": 179, + "time_per_iteration": 3.262655735015869 + }, + { + "auxiliary_loss_clip": 0.01400968, + "auxiliary_loss_mlp": 0.01075122, + "balance_loss_clip": 1.1154542, + "balance_loss_mlp": 1.05081499, + "epoch": 0.021643720315036374, + "flos": 26104005331200.0, + "grad_norm": 2.0817292247131145, + "language_loss": 0.89443547, + "learning_rate": 3.7620165591252806e-06, + "loss": 0.91919637, + "num_input_tokens_seen": 3763035, + "step": 180, + "time_per_iteration": 2.5527048110961914 + }, + { + "auxiliary_loss_clip": 0.0139161, + "auxiliary_loss_mlp": 0.01070342, + "balance_loss_clip": 1.11411166, + "balance_loss_mlp": 1.04930186, + "epoch": 0.021763963205675464, + "flos": 24787663614720.0, + "grad_norm": 1.8951505005649283, + "language_loss": 0.942065, + "learning_rate": 3.766030120360636e-06, + "loss": 0.96668452, + "num_input_tokens_seen": 3782665, + "step": 181, + "time_per_iteration": 2.5362741947174072 + }, + { + "auxiliary_loss_clip": 0.01397386, + "auxiliary_loss_mlp": 0.010746, + "balance_loss_clip": 1.11405849, + "balance_loss_mlp": 1.05322552, + "epoch": 0.021884206096314557, + "flos": 25813559957760.0, + "grad_norm": 2.792262480531276, + "language_loss": 0.90175784, + "learning_rate": 3.7700215681987578e-06, + "loss": 0.92647779, + "num_input_tokens_seen": 3802435, + "step": 182, + "time_per_iteration": 2.5687003135681152 + }, + { + "auxiliary_loss_clip": 0.01390384, + "auxiliary_loss_mlp": 0.01084656, + "balance_loss_clip": 1.1116004, + "balance_loss_mlp": 1.06212544, + "epoch": 0.022004448986953647, + "flos": 20082719721600.0, + "grad_norm": 1.899866800993508, + "language_loss": 0.82202125, + "learning_rate": 3.7739911449800767e-06, + "loss": 0.8467716, + "num_input_tokens_seen": 3822490, + "step": 183, + "time_per_iteration": 2.523084878921509 + }, + { + "auxiliary_loss_clip": 0.01391195, + "auxiliary_loss_mlp": 0.01081675, + "balance_loss_clip": 1.11056018, + "balance_loss_mlp": 1.06137347, + "epoch": 0.022124691877592736, + "flos": 20480609652480.0, + "grad_norm": 2.3835292975657376, + "language_loss": 0.80617714, + "learning_rate": 3.7779390890830114e-06, + "loss": 0.8309058, + "num_input_tokens_seen": 3841140, + "step": 184, + "time_per_iteration": 2.522141456604004 + }, + { + "auxiliary_loss_clip": 0.01391726, + "auxiliary_loss_mlp": 0.01081403, + "balance_loss_clip": 1.11046088, + "balance_loss_mlp": 1.0590508, + "epoch": 0.02224493476823183, + "flos": 23586847015680.0, + "grad_norm": 1.9357115896971642, + "language_loss": 0.85874891, + "learning_rate": 3.7818656350098723e-06, + "loss": 0.88348025, + "num_input_tokens_seen": 3862090, + "step": 185, + "time_per_iteration": 2.551746368408203 + }, + { + "auxiliary_loss_clip": 0.0138681, + "auxiliary_loss_mlp": 0.01071429, + "balance_loss_clip": 1.10734677, + "balance_loss_mlp": 1.04826641, + "epoch": 0.02236517765887092, + "flos": 16909940413440.0, + "grad_norm": 5.913946688024179, + "language_loss": 0.77253032, + "learning_rate": 3.7857710134704447e-06, + "loss": 0.79711264, + "num_input_tokens_seen": 3881025, + "step": 186, + "time_per_iteration": 2.4852542877197266 + }, + { + "auxiliary_loss_clip": 0.01386538, + "auxiliary_loss_mlp": 0.01058114, + "balance_loss_clip": 1.11150527, + "balance_loss_mlp": 1.03757441, + "epoch": 0.02248542054951001, + "flos": 43508182930560.0, + "grad_norm": 2.932066493820842, + "language_loss": 0.79396552, + "learning_rate": 3.7896554514633234e-06, + "loss": 0.81841201, + "num_input_tokens_seen": 3905310, + "step": 187, + "time_per_iteration": 2.705152988433838 + }, + { + "auxiliary_loss_clip": 0.01385539, + "auxiliary_loss_mlp": 0.01065956, + "balance_loss_clip": 1.11024642, + "balance_loss_mlp": 1.04503489, + "epoch": 0.022605663440149103, + "flos": 23367648268800.0, + "grad_norm": 4.419222932230572, + "language_loss": 0.84175617, + "learning_rate": 3.7935191723550955e-06, + "loss": 0.86627114, + "num_input_tokens_seen": 3924265, + "step": 188, + "time_per_iteration": 2.522643804550171 + }, + { + "auxiliary_loss_clip": 0.01383346, + "auxiliary_loss_mlp": 0.01070883, + "balance_loss_clip": 1.1078099, + "balance_loss_mlp": 1.05084443, + "epoch": 0.022725906330788193, + "flos": 29019915504000.0, + "grad_norm": 2.224111780995197, + "language_loss": 0.8836152, + "learning_rate": 3.797362395957408e-06, + "loss": 0.90815747, + "num_input_tokens_seen": 3944830, + "step": 189, + "time_per_iteration": 2.5683798789978027 + }, + { + "auxiliary_loss_clip": 0.01393708, + "auxiliary_loss_mlp": 0.0106516, + "balance_loss_clip": 1.11458147, + "balance_loss_mlp": 1.04407167, + "epoch": 0.022846149221427282, + "flos": 24496176746880.0, + "grad_norm": 2.10573820985673, + "language_loss": 0.78279817, + "learning_rate": 3.8011853386020055e-06, + "loss": 0.80738688, + "num_input_tokens_seen": 3965735, + "step": 190, + "time_per_iteration": 2.5549910068511963 + }, + { + "auxiliary_loss_clip": 0.01389985, + "auxiliary_loss_mlp": 0.01081432, + "balance_loss_clip": 1.11216521, + "balance_loss_mlp": 1.0593667, + "epoch": 0.022966392112066376, + "flos": 15523537219200.0, + "grad_norm": 2.685610509251227, + "language_loss": 0.89632165, + "learning_rate": 3.804988213213804e-06, + "loss": 0.92103583, + "num_input_tokens_seen": 3983975, + "step": 191, + "time_per_iteration": 2.4739296436309814 + }, + { + "auxiliary_loss_clip": 0.01362589, + "auxiliary_loss_mlp": 0.01019276, + "balance_loss_clip": 1.1423471, + "balance_loss_mlp": 1.00821292, + "epoch": 0.023086635002705466, + "flos": 55650408433920.0, + "grad_norm": 1.0203103250227827, + "language_loss": 0.63168716, + "learning_rate": 3.808771229382049e-06, + "loss": 0.6555059, + "num_input_tokens_seen": 4043440, + "step": 192, + "time_per_iteration": 2.9965782165527344 + }, + { + "auxiliary_loss_clip": 0.01381559, + "auxiliary_loss_mlp": 0.01076545, + "balance_loss_clip": 1.10947418, + "balance_loss_mlp": 1.05663681, + "epoch": 0.023206877893344555, + "flos": 19313441118720.0, + "grad_norm": 2.1495516289333096, + "language_loss": 0.84398031, + "learning_rate": 3.8125345934296324e-06, + "loss": 0.86856127, + "num_input_tokens_seen": 4061750, + "step": 193, + "time_per_iteration": 2.5078327655792236 + }, + { + "auxiliary_loss_clip": 0.01382354, + "auxiliary_loss_mlp": 0.01075661, + "balance_loss_clip": 1.10815191, + "balance_loss_mlp": 1.05315447, + "epoch": 0.02332712078398365, + "flos": 23072965090560.0, + "grad_norm": 2.7411863410950303, + "language_loss": 0.88314342, + "learning_rate": 3.81627850848061e-06, + "loss": 0.90772361, + "num_input_tokens_seen": 4082345, + "step": 194, + "time_per_iteration": 2.5454564094543457 + }, + { + "auxiliary_loss_clip": 0.01377404, + "auxiliary_loss_mlp": 0.0106545, + "balance_loss_clip": 1.10466194, + "balance_loss_mlp": 1.04529142, + "epoch": 0.02344736367462274, + "flos": 24425971614720.0, + "grad_norm": 2.2489528499556553, + "language_loss": 0.86254656, + "learning_rate": 3.820003174525994e-06, + "loss": 0.88697511, + "num_input_tokens_seen": 4101770, + "step": 195, + "time_per_iteration": 2.5412087440490723 + }, + { + "auxiliary_loss_clip": 0.0138183, + "auxiliary_loss_mlp": 0.01078724, + "balance_loss_clip": 1.10957766, + "balance_loss_mlp": 1.05823147, + "epoch": 0.02356760656526183, + "flos": 21579799697280.0, + "grad_norm": 2.632520756520399, + "language_loss": 0.82722145, + "learning_rate": 3.823708788487851e-06, + "loss": 0.85182703, + "num_input_tokens_seen": 4118770, + "step": 196, + "time_per_iteration": 2.515359401702881 + }, + { + "auxiliary_loss_clip": 0.0137733, + "auxiliary_loss_mlp": 0.01080923, + "balance_loss_clip": 1.10626209, + "balance_loss_mlp": 1.06143212, + "epoch": 0.02368784945590092, + "flos": 25193598192000.0, + "grad_norm": 2.2785333733159283, + "language_loss": 0.84377313, + "learning_rate": 3.827395544281781e-06, + "loss": 0.86835569, + "num_input_tokens_seen": 4141110, + "step": 197, + "time_per_iteration": 2.559605360031128 + }, + { + "auxiliary_loss_clip": 0.0138553, + "auxiliary_loss_mlp": 0.01081797, + "balance_loss_clip": 1.11135197, + "balance_loss_mlp": 1.06097066, + "epoch": 0.02380809234654001, + "flos": 27562481164800.0, + "grad_norm": 1.8326088347018719, + "language_loss": 0.79306883, + "learning_rate": 3.831063632877802e-06, + "loss": 0.81774211, + "num_input_tokens_seen": 4161430, + "step": 198, + "time_per_iteration": 2.5714168548583984 + }, + { + "auxiliary_loss_clip": 0.01380766, + "auxiliary_loss_mlp": 0.01074323, + "balance_loss_clip": 1.11333597, + "balance_loss_mlp": 1.05532074, + "epoch": 0.0239283352371791, + "flos": 18259786540800.0, + "grad_norm": 2.375015258584147, + "language_loss": 0.76139212, + "learning_rate": 3.834713242359712e-06, + "loss": 0.78594297, + "num_input_tokens_seen": 4179260, + "step": 199, + "time_per_iteration": 2.524303436279297 + }, + { + "auxiliary_loss_clip": 0.01383205, + "auxiliary_loss_mlp": 0.01075962, + "balance_loss_clip": 1.10706353, + "balance_loss_mlp": 1.05382454, + "epoch": 0.02404857812781819, + "flos": 21395110942080.0, + "grad_norm": 1.9744315121640061, + "language_loss": 0.86993873, + "learning_rate": 3.838344557982959e-06, + "loss": 0.89453042, + "num_input_tokens_seen": 4200640, + "step": 200, + "time_per_iteration": 2.549511194229126 + }, + { + "auxiliary_loss_clip": 0.01377841, + "auxiliary_loss_mlp": 0.01076878, + "balance_loss_clip": 1.10672569, + "balance_loss_mlp": 1.05505061, + "epoch": 0.024168821018457284, + "flos": 16654256426880.0, + "grad_norm": 4.03999650084242, + "language_loss": 0.84832001, + "learning_rate": 3.841957762231063e-06, + "loss": 0.87286723, + "num_input_tokens_seen": 4218170, + "step": 201, + "time_per_iteration": 2.4958574771881104 + }, + { + "auxiliary_loss_clip": 0.01374627, + "auxiliary_loss_mlp": 0.01063248, + "balance_loss_clip": 1.1046145, + "balance_loss_mlp": 1.04283917, + "epoch": 0.024289063909096374, + "flos": 22820872464000.0, + "grad_norm": 2.2181017966171135, + "language_loss": 0.87492532, + "learning_rate": 3.8455530348706454e-06, + "loss": 0.89930403, + "num_input_tokens_seen": 4237770, + "step": 202, + "time_per_iteration": 2.5220320224761963 + }, + { + "auxiliary_loss_clip": 0.01375393, + "auxiliary_loss_mlp": 0.01072425, + "balance_loss_clip": 1.10619295, + "balance_loss_mlp": 1.05342305, + "epoch": 0.024409306799735464, + "flos": 17748598135680.0, + "grad_norm": 1.836895746598813, + "language_loss": 0.7707116, + "learning_rate": 3.849130553005099e-06, + "loss": 0.7951898, + "num_input_tokens_seen": 4255985, + "step": 203, + "time_per_iteration": 2.4904184341430664 + }, + { + "auxiliary_loss_clip": 0.01375367, + "auxiliary_loss_mlp": 0.01068239, + "balance_loss_clip": 1.10360968, + "balance_loss_mlp": 1.04896259, + "epoch": 0.024529549690374557, + "flos": 21616213109760.0, + "grad_norm": 2.517389688849972, + "language_loss": 0.83498913, + "learning_rate": 3.852690491126933e-06, + "loss": 0.85942519, + "num_input_tokens_seen": 4276035, + "step": 204, + "time_per_iteration": 2.514249563217163 + }, + { + "auxiliary_loss_clip": 0.01370376, + "auxiliary_loss_mlp": 0.01058018, + "balance_loss_clip": 1.10057807, + "balance_loss_mlp": 1.03726375, + "epoch": 0.024649792581013647, + "flos": 25551662918400.0, + "grad_norm": 4.735033265206218, + "language_loss": 0.91177219, + "learning_rate": 3.856233021168845e-06, + "loss": 0.93605614, + "num_input_tokens_seen": 4295730, + "step": 205, + "time_per_iteration": 3.334784984588623 + }, + { + "auxiliary_loss_clip": 0.01364783, + "auxiliary_loss_mlp": 0.01051772, + "balance_loss_clip": 1.10114348, + "balance_loss_mlp": 1.03387856, + "epoch": 0.024770035471652737, + "flos": 34495574544000.0, + "grad_norm": 2.5055646812729844, + "language_loss": 0.91217089, + "learning_rate": 3.859758312553544e-06, + "loss": 0.93633652, + "num_input_tokens_seen": 4317950, + "step": 206, + "time_per_iteration": 3.4021618366241455 + }, + { + "auxiliary_loss_clip": 0.0137437, + "auxiliary_loss_mlp": 0.01071442, + "balance_loss_clip": 1.10712922, + "balance_loss_mlp": 1.05264282, + "epoch": 0.02489027836229183, + "flos": 21505428587520.0, + "grad_norm": 1.8621466946119434, + "language_loss": 0.9155277, + "learning_rate": 3.8632665322423735e-06, + "loss": 0.93998575, + "num_input_tokens_seen": 4337605, + "step": 207, + "time_per_iteration": 2.513622283935547 + }, + { + "auxiliary_loss_clip": 0.01371598, + "auxiliary_loss_mlp": 0.01063143, + "balance_loss_clip": 1.10356951, + "balance_loss_mlp": 1.04367566, + "epoch": 0.02501052125293092, + "flos": 23219013790080.0, + "grad_norm": 2.072910785528738, + "language_loss": 0.85850835, + "learning_rate": 3.866757844782762e-06, + "loss": 0.88285571, + "num_input_tokens_seen": 4358110, + "step": 208, + "time_per_iteration": 2.5398149490356445 + }, + { + "auxiliary_loss_clip": 0.01371495, + "auxiliary_loss_mlp": 0.01065162, + "balance_loss_clip": 1.10533583, + "balance_loss_mlp": 1.04554021, + "epoch": 0.02513076414357001, + "flos": 26388920010240.0, + "grad_norm": 2.840763406444016, + "language_loss": 0.91151309, + "learning_rate": 3.870232412354527e-06, + "loss": 0.93587965, + "num_input_tokens_seen": 4374955, + "step": 209, + "time_per_iteration": 2.546449661254883 + }, + { + "auxiliary_loss_clip": 0.01367419, + "auxiliary_loss_mlp": 0.01062151, + "balance_loss_clip": 1.10205674, + "balance_loss_mlp": 1.04268384, + "epoch": 0.025251007034209103, + "flos": 13590430047360.0, + "grad_norm": 2.1100060992604726, + "language_loss": 0.9261682, + "learning_rate": 3.873690394815086e-06, + "loss": 0.95046395, + "num_input_tokens_seen": 4391535, + "step": 210, + "time_per_iteration": 2.4711389541625977 + }, + { + "auxiliary_loss_clip": 0.01366196, + "auxiliary_loss_mlp": 0.01060388, + "balance_loss_clip": 1.0996232, + "balance_loss_mlp": 1.04118323, + "epoch": 0.025371249924848193, + "flos": 15049229103360.0, + "grad_norm": 2.7115683084104343, + "language_loss": 0.90933937, + "learning_rate": 3.877131949743587e-06, + "loss": 0.93360519, + "num_input_tokens_seen": 4408400, + "step": 211, + "time_per_iteration": 2.4879870414733887 + }, + { + "auxiliary_loss_clip": 0.01366964, + "auxiliary_loss_mlp": 0.01078233, + "balance_loss_clip": 1.10272682, + "balance_loss_mlp": 1.05867076, + "epoch": 0.025491492815487283, + "flos": 25553853648000.0, + "grad_norm": 2.783800482044302, + "language_loss": 0.78288186, + "learning_rate": 3.880557232483993e-06, + "loss": 0.80733383, + "num_input_tokens_seen": 4427840, + "step": 212, + "time_per_iteration": 2.5724966526031494 + }, + { + "auxiliary_loss_clip": 0.01365516, + "auxiliary_loss_mlp": 0.01062439, + "balance_loss_clip": 1.09917068, + "balance_loss_mlp": 1.04304326, + "epoch": 0.025611735706126376, + "flos": 20630752502400.0, + "grad_norm": 2.0756576951114774, + "language_loss": 0.86463666, + "learning_rate": 3.883966396187164e-06, + "loss": 0.88891619, + "num_input_tokens_seen": 4447110, + "step": 213, + "time_per_iteration": 2.5140509605407715 + }, + { + "auxiliary_loss_clip": 0.0136789, + "auxiliary_loss_mlp": 0.01062164, + "balance_loss_clip": 1.10333276, + "balance_loss_mlp": 1.04376984, + "epoch": 0.025731978596765466, + "flos": 19062282245760.0, + "grad_norm": 2.2064631239945136, + "language_loss": 0.89837182, + "learning_rate": 3.887359591851937e-06, + "loss": 0.92267239, + "num_input_tokens_seen": 4464715, + "step": 214, + "time_per_iteration": 2.4943320751190186 + }, + { + "auxiliary_loss_clip": 0.01362564, + "auxiliary_loss_mlp": 0.01057097, + "balance_loss_clip": 1.10101032, + "balance_loss_mlp": 1.03770137, + "epoch": 0.025852221487404556, + "flos": 22163814927360.0, + "grad_norm": 1.686390338778976, + "language_loss": 0.92141807, + "learning_rate": 3.890736968365265e-06, + "loss": 0.9456147, + "num_input_tokens_seen": 4485030, + "step": 215, + "time_per_iteration": 2.5198683738708496 + }, + { + "auxiliary_loss_clip": 0.01362902, + "auxiliary_loss_mlp": 0.01063166, + "balance_loss_clip": 1.09894753, + "balance_loss_mlp": 1.04280543, + "epoch": 0.02597246437804365, + "flos": 26541971861760.0, + "grad_norm": 3.540330690650193, + "language_loss": 0.85054177, + "learning_rate": 3.894098672541412e-06, + "loss": 0.87480247, + "num_input_tokens_seen": 4505935, + "step": 216, + "time_per_iteration": 2.5463695526123047 + }, + { + "auxiliary_loss_clip": 0.01363874, + "auxiliary_loss_mlp": 0.01066384, + "balance_loss_clip": 1.0995574, + "balance_loss_mlp": 1.04548621, + "epoch": 0.02609270726868274, + "flos": 32671671696000.0, + "grad_norm": 2.148600112113667, + "language_loss": 0.75221992, + "learning_rate": 3.89744484916025e-06, + "loss": 0.77652252, + "num_input_tokens_seen": 4527045, + "step": 217, + "time_per_iteration": 2.595496654510498 + }, + { + "auxiliary_loss_clip": 0.01364411, + "auxiliary_loss_mlp": 0.01068723, + "balance_loss_clip": 1.10048914, + "balance_loss_mlp": 1.04815912, + "epoch": 0.02621295015932183, + "flos": 26243553669120.0, + "grad_norm": 1.9960407250565653, + "language_loss": 0.87137294, + "learning_rate": 3.900775641004673e-06, + "loss": 0.89570427, + "num_input_tokens_seen": 4546360, + "step": 218, + "time_per_iteration": 2.5429790019989014 + }, + { + "auxiliary_loss_clip": 0.01370181, + "auxiliary_loss_mlp": 0.01072358, + "balance_loss_clip": 1.10283589, + "balance_loss_mlp": 1.04929137, + "epoch": 0.026333193049960922, + "flos": 42921402353280.0, + "grad_norm": 2.5386048336969114, + "language_loss": 0.74267161, + "learning_rate": 3.904091188897156e-06, + "loss": 0.767097, + "num_input_tokens_seen": 4565495, + "step": 219, + "time_per_iteration": 2.6807854175567627 + }, + { + "auxiliary_loss_clip": 0.0136209, + "auxiliary_loss_mlp": 0.01074623, + "balance_loss_clip": 1.09893322, + "balance_loss_mlp": 1.05383253, + "epoch": 0.026453435940600012, + "flos": 17963846386560.0, + "grad_norm": 2.2367295938935983, + "language_loss": 0.82138014, + "learning_rate": 3.90739163173548e-06, + "loss": 0.84574729, + "num_input_tokens_seen": 4583330, + "step": 220, + "time_per_iteration": 2.4785163402557373 + }, + { + "auxiliary_loss_clip": 0.01359886, + "auxiliary_loss_mlp": 0.01070334, + "balance_loss_clip": 1.09797764, + "balance_loss_mlp": 1.0505811, + "epoch": 0.026573678831239102, + "flos": 18984319776000.0, + "grad_norm": 3.3468982252817656, + "language_loss": 0.88515902, + "learning_rate": 3.910677106527646e-06, + "loss": 0.9094612, + "num_input_tokens_seen": 4600520, + "step": 221, + "time_per_iteration": 2.4954025745391846 + }, + { + "auxiliary_loss_clip": 0.01356875, + "auxiliary_loss_mlp": 0.01070372, + "balance_loss_clip": 1.09769595, + "balance_loss_mlp": 1.05206144, + "epoch": 0.026693921721878195, + "flos": 29241448634880.0, + "grad_norm": 2.2065206660669365, + "language_loss": 0.8398757, + "learning_rate": 3.913947748426004e-06, + "loss": 0.8641482, + "num_input_tokens_seen": 4617340, + "step": 222, + "time_per_iteration": 2.5604944229125977 + }, + { + "auxiliary_loss_clip": 0.0136317, + "auxiliary_loss_mlp": 0.01066863, + "balance_loss_clip": 1.10153687, + "balance_loss_mlp": 1.04788446, + "epoch": 0.026814164612517285, + "flos": 14128083797760.0, + "grad_norm": 3.8542603265336037, + "language_loss": 0.76345778, + "learning_rate": 3.9172036907606136e-06, + "loss": 0.78775817, + "num_input_tokens_seen": 4630820, + "step": 223, + "time_per_iteration": 2.446089506149292 + }, + { + "auxiliary_loss_clip": 0.01361898, + "auxiliary_loss_mlp": 0.01065974, + "balance_loss_clip": 1.09795535, + "balance_loss_mlp": 1.04659081, + "epoch": 0.026934407503156375, + "flos": 23511973115520.0, + "grad_norm": 1.9374279305033544, + "language_loss": 0.94729817, + "learning_rate": 3.920445065071855e-06, + "loss": 0.97157693, + "num_input_tokens_seen": 4651985, + "step": 224, + "time_per_iteration": 2.5348546504974365 + }, + { + "auxiliary_loss_clip": 0.01357697, + "auxiliary_loss_mlp": 0.01070541, + "balance_loss_clip": 1.09738135, + "balance_loss_mlp": 1.05089545, + "epoch": 0.027054650393795468, + "flos": 28950356816640.0, + "grad_norm": 13.27893312861928, + "language_loss": 0.80045915, + "learning_rate": 3.923672001142322e-06, + "loss": 0.8247416, + "num_input_tokens_seen": 4672295, + "step": 225, + "time_per_iteration": 2.556966781616211 + }, + { + "auxiliary_loss_clip": 0.01354055, + "auxiliary_loss_mlp": 0.01077774, + "balance_loss_clip": 1.09526587, + "balance_loss_mlp": 1.0578537, + "epoch": 0.027174893284434558, + "flos": 31431568596480.0, + "grad_norm": 1.7739075324480815, + "language_loss": 0.84475338, + "learning_rate": 3.926884627027996e-06, + "loss": 0.8690716, + "num_input_tokens_seen": 4696065, + "step": 226, + "time_per_iteration": 2.619288206100464 + }, + { + "auxiliary_loss_clip": 0.01354678, + "auxiliary_loss_mlp": 0.01070977, + "balance_loss_clip": 1.09409118, + "balance_loss_mlp": 1.05204654, + "epoch": 0.027295136175073648, + "flos": 22054466949120.0, + "grad_norm": 1.840717014796817, + "language_loss": 0.77406526, + "learning_rate": 3.930083069088744e-06, + "loss": 0.79832178, + "num_input_tokens_seen": 4716065, + "step": 227, + "time_per_iteration": 2.504880428314209 + }, + { + "auxiliary_loss_clip": 0.01326296, + "auxiliary_loss_mlp": 0.01011763, + "balance_loss_clip": 1.12221646, + "balance_loss_mlp": 1.00127304, + "epoch": 0.02741537906571274, + "flos": 60800752972800.0, + "grad_norm": 0.9848593115977056, + "language_loss": 0.59300649, + "learning_rate": 3.933267452018137e-06, + "loss": 0.61638713, + "num_input_tokens_seen": 4775860, + "step": 228, + "time_per_iteration": 3.0803451538085938 + }, + { + "auxiliary_loss_clip": 0.01353012, + "auxiliary_loss_mlp": 0.01059487, + "balance_loss_clip": 1.09613943, + "balance_loss_mlp": 1.04006815, + "epoch": 0.02753562195635183, + "flos": 24606278910720.0, + "grad_norm": 2.1505540893560307, + "language_loss": 0.84399986, + "learning_rate": 3.936437898872622e-06, + "loss": 0.8681249, + "num_input_tokens_seen": 4795835, + "step": 229, + "time_per_iteration": 2.542393207550049 + }, + { + "auxiliary_loss_clip": 0.01354135, + "auxiliary_loss_mlp": 0.01057198, + "balance_loss_clip": 1.09551549, + "balance_loss_mlp": 1.03863668, + "epoch": 0.02765586484699092, + "flos": 34094236907520.0, + "grad_norm": 3.758842533514662, + "language_loss": 0.79765379, + "learning_rate": 3.9395945311000525e-06, + "loss": 0.82176709, + "num_input_tokens_seen": 4817460, + "step": 230, + "time_per_iteration": 2.6779978275299072 + }, + { + "auxiliary_loss_clip": 0.0135599, + "auxiliary_loss_mlp": 0.01072187, + "balance_loss_clip": 1.09660053, + "balance_loss_mlp": 1.0528152, + "epoch": 0.027776107737630014, + "flos": 14829922615680.0, + "grad_norm": 3.151749154309875, + "language_loss": 0.91076326, + "learning_rate": 3.942737468567608e-06, + "loss": 0.935045, + "num_input_tokens_seen": 4835475, + "step": 231, + "time_per_iteration": 3.2169108390808105 + }, + { + "auxiliary_loss_clip": 0.01354291, + "auxiliary_loss_mlp": 0.01070793, + "balance_loss_clip": 1.09660125, + "balance_loss_mlp": 1.05187428, + "epoch": 0.027896350628269104, + "flos": 47920347066240.0, + "grad_norm": 1.8889404993089527, + "language_loss": 0.85969847, + "learning_rate": 3.9458668295891026e-06, + "loss": 0.88394928, + "num_input_tokens_seen": 4857760, + "step": 232, + "time_per_iteration": 3.5114328861236572 + }, + { + "auxiliary_loss_clip": 0.01349224, + "auxiliary_loss_mlp": 0.01062025, + "balance_loss_clip": 1.09101987, + "balance_loss_mlp": 1.04173541, + "epoch": 0.028016593518908194, + "flos": 21684550734720.0, + "grad_norm": 2.4314860571174486, + "language_loss": 0.86889899, + "learning_rate": 3.948982730951712e-06, + "loss": 0.89301145, + "num_input_tokens_seen": 4875855, + "step": 233, + "time_per_iteration": 3.2786333560943604 + }, + { + "auxiliary_loss_clip": 0.01353471, + "auxiliary_loss_mlp": 0.01062575, + "balance_loss_clip": 1.09464431, + "balance_loss_mlp": 1.04267931, + "epoch": 0.028136836409547287, + "flos": 18439483305600.0, + "grad_norm": 3.0896151389763618, + "language_loss": 0.81904656, + "learning_rate": 3.9520852879421254e-06, + "loss": 0.843207, + "num_input_tokens_seen": 4893200, + "step": 234, + "time_per_iteration": 2.4680025577545166 + }, + { + "auxiliary_loss_clip": 0.01348266, + "auxiliary_loss_mlp": 0.01066691, + "balance_loss_clip": 1.09376168, + "balance_loss_mlp": 1.04857159, + "epoch": 0.028257079300186377, + "flos": 31576934937600.0, + "grad_norm": 2.3130018881245378, + "language_loss": 0.81674641, + "learning_rate": 3.955174614372137e-06, + "loss": 0.84089601, + "num_input_tokens_seen": 4912965, + "step": 235, + "time_per_iteration": 2.5786325931549072 + }, + { + "auxiliary_loss_clip": 0.01351076, + "auxiliary_loss_mlp": 0.01069959, + "balance_loss_clip": 1.09448063, + "balance_loss_mlp": 1.05052805, + "epoch": 0.028377322190825467, + "flos": 23513337832320.0, + "grad_norm": 7.943824017175866, + "language_loss": 0.84564978, + "learning_rate": 3.9582508226037045e-06, + "loss": 0.86986017, + "num_input_tokens_seen": 4933105, + "step": 236, + "time_per_iteration": 2.5305490493774414 + }, + { + "auxiliary_loss_clip": 0.01357303, + "auxiliary_loss_mlp": 0.01071812, + "balance_loss_clip": 1.09551573, + "balance_loss_mlp": 1.05139136, + "epoch": 0.02849756508146456, + "flos": 20479604071680.0, + "grad_norm": 3.6809381437713626, + "language_loss": 0.9409613, + "learning_rate": 3.9613140235734636e-06, + "loss": 0.96525252, + "num_input_tokens_seen": 4950085, + "step": 237, + "time_per_iteration": 2.4741756916046143 + }, + { + "auxiliary_loss_clip": 0.01349135, + "auxiliary_loss_mlp": 0.01067082, + "balance_loss_clip": 1.09262824, + "balance_loss_mlp": 1.04680419, + "epoch": 0.02861780797210365, + "flos": 14283362292480.0, + "grad_norm": 2.525460954595178, + "language_loss": 0.81021118, + "learning_rate": 3.96436432681674e-06, + "loss": 0.83437335, + "num_input_tokens_seen": 4968075, + "step": 238, + "time_per_iteration": 2.4845798015594482 + }, + { + "auxiliary_loss_clip": 0.01349284, + "auxiliary_loss_mlp": 0.01072533, + "balance_loss_clip": 1.09295821, + "balance_loss_mlp": 1.05298245, + "epoch": 0.02873805086274274, + "flos": 25808532053760.0, + "grad_norm": 2.05708852252817, + "language_loss": 0.89295793, + "learning_rate": 3.967401840491044e-06, + "loss": 0.91717613, + "num_input_tokens_seen": 4987355, + "step": 239, + "time_per_iteration": 2.5290651321411133 + }, + { + "auxiliary_loss_clip": 0.01346245, + "auxiliary_loss_mlp": 0.0106725, + "balance_loss_clip": 1.09430432, + "balance_loss_mlp": 1.05010724, + "epoch": 0.028858293753381833, + "flos": 17304238984320.0, + "grad_norm": 2.3817602527312425, + "language_loss": 0.87667888, + "learning_rate": 3.97042667139909e-06, + "loss": 0.90081382, + "num_input_tokens_seen": 5004680, + "step": 240, + "time_per_iteration": 2.463595390319824 + }, + { + "auxiliary_loss_clip": 0.01348262, + "auxiliary_loss_mlp": 0.01062421, + "balance_loss_clip": 1.09407306, + "balance_loss_mlp": 1.04378891, + "epoch": 0.028978536644020923, + "flos": 23038347358080.0, + "grad_norm": 2.1604341871390926, + "language_loss": 0.87474018, + "learning_rate": 3.973438925011327e-06, + "loss": 0.89884698, + "num_input_tokens_seen": 5022965, + "step": 241, + "time_per_iteration": 2.525973081588745 + }, + { + "auxiliary_loss_clip": 0.01347506, + "auxiliary_loss_mlp": 0.01053552, + "balance_loss_clip": 1.09113097, + "balance_loss_mlp": 1.03425217, + "epoch": 0.029098779534660012, + "flos": 28329712692480.0, + "grad_norm": 2.4578477999843757, + "language_loss": 0.90972769, + "learning_rate": 3.976438705488002e-06, + "loss": 0.93373829, + "num_input_tokens_seen": 5042625, + "step": 242, + "time_per_iteration": 2.545203924179077 + }, + { + "auxiliary_loss_clip": 0.01345601, + "auxiliary_loss_mlp": 0.01064654, + "balance_loss_clip": 1.09373951, + "balance_loss_mlp": 1.04705811, + "epoch": 0.029219022425299106, + "flos": 13881665520000.0, + "grad_norm": 3.5638130045096776, + "language_loss": 0.92791533, + "learning_rate": 3.9794261157007744e-06, + "loss": 0.95201784, + "num_input_tokens_seen": 5060380, + "step": 243, + "time_per_iteration": 2.4736506938934326 + }, + { + "auxiliary_loss_clip": 0.01351521, + "auxiliary_loss_mlp": 0.01060461, + "balance_loss_clip": 1.09558296, + "balance_loss_mlp": 1.04069591, + "epoch": 0.029339265315938196, + "flos": 19422501788160.0, + "grad_norm": 2.295518736749096, + "language_loss": 0.84787893, + "learning_rate": 3.982401257253887e-06, + "loss": 0.87199879, + "num_input_tokens_seen": 5078720, + "step": 244, + "time_per_iteration": 2.4775538444519043 + }, + { + "auxiliary_loss_clip": 0.01347656, + "auxiliary_loss_mlp": 0.01058273, + "balance_loss_clip": 1.0926137, + "balance_loss_mlp": 1.04040313, + "epoch": 0.029459508206577285, + "flos": 15669550005120.0, + "grad_norm": 2.266482574176097, + "language_loss": 0.89800018, + "learning_rate": 3.985364230504893e-06, + "loss": 0.92205942, + "num_input_tokens_seen": 5096605, + "step": 245, + "time_per_iteration": 2.468393564224243 + }, + { + "auxiliary_loss_clip": 0.01354523, + "auxiliary_loss_mlp": 0.01062779, + "balance_loss_clip": 1.09856486, + "balance_loss_mlp": 1.04539847, + "epoch": 0.02957975109721638, + "flos": 28220975245440.0, + "grad_norm": 2.219112503465037, + "language_loss": 0.84483522, + "learning_rate": 3.988315134584976e-06, + "loss": 0.86900824, + "num_input_tokens_seen": 5116285, + "step": 246, + "time_per_iteration": 2.553997278213501 + }, + { + "auxiliary_loss_clip": 0.01350477, + "auxiliary_loss_mlp": 0.01070069, + "balance_loss_clip": 1.09495914, + "balance_loss_mlp": 1.0513773, + "epoch": 0.02969999398785547, + "flos": 24315869450880.0, + "grad_norm": 15.82056443889475, + "language_loss": 0.80367231, + "learning_rate": 3.991254067418851e-06, + "loss": 0.82787776, + "num_input_tokens_seen": 5136825, + "step": 247, + "time_per_iteration": 2.530216932296753 + }, + { + "auxiliary_loss_clip": 0.01341602, + "auxiliary_loss_mlp": 0.01066642, + "balance_loss_clip": 1.09280539, + "balance_loss_mlp": 1.04895186, + "epoch": 0.02982023687849456, + "flos": 35078584193280.0, + "grad_norm": 1.9903386386066304, + "language_loss": 0.82710534, + "learning_rate": 3.994181125744254e-06, + "loss": 0.85118783, + "num_input_tokens_seen": 5158630, + "step": 248, + "time_per_iteration": 2.605059862136841 + }, + { + "auxiliary_loss_clip": 0.01344523, + "auxiliary_loss_mlp": 0.01056321, + "balance_loss_clip": 1.09293008, + "balance_loss_mlp": 1.03878558, + "epoch": 0.02994047976913365, + "flos": 26177155378560.0, + "grad_norm": 2.318865254656544, + "language_loss": 0.73937273, + "learning_rate": 3.99709640513106e-06, + "loss": 0.76338112, + "num_input_tokens_seen": 5179510, + "step": 249, + "time_per_iteration": 2.532275676727295 + }, + { + "auxiliary_loss_clip": 0.01345697, + "auxiliary_loss_mlp": 0.01073416, + "balance_loss_clip": 1.09023428, + "balance_loss_mlp": 1.05388987, + "epoch": 0.03006072265977274, + "flos": 25625028447360.0, + "grad_norm": 2.4674415747337073, + "language_loss": 0.8558358, + "learning_rate": 4e-06, + "loss": 0.88002694, + "num_input_tokens_seen": 5199345, + "step": 250, + "time_per_iteration": 2.510859727859497 + }, + { + "auxiliary_loss_clip": 0.0134679, + "auxiliary_loss_mlp": 0.01057379, + "balance_loss_clip": 1.0946939, + "balance_loss_mlp": 1.03995085, + "epoch": 0.03018096555041183, + "flos": 22127078292480.0, + "grad_norm": 3.9570271132612147, + "language_loss": 0.88469446, + "learning_rate": 3.999999848300794e-06, + "loss": 0.90873611, + "num_input_tokens_seen": 5218330, + "step": 251, + "time_per_iteration": 2.5033347606658936 + }, + { + "auxiliary_loss_clip": 0.01339022, + "auxiliary_loss_mlp": 0.01056661, + "balance_loss_clip": 1.08858752, + "balance_loss_mlp": 1.03882718, + "epoch": 0.030301208441050925, + "flos": 30188197359360.0, + "grad_norm": 1.6759931500953262, + "language_loss": 0.89020991, + "learning_rate": 3.999999393203203e-06, + "loss": 0.91416669, + "num_input_tokens_seen": 5240740, + "step": 252, + "time_per_iteration": 2.569035291671753 + }, + { + "auxiliary_loss_clip": 0.01339096, + "auxiliary_loss_mlp": 0.01056493, + "balance_loss_clip": 1.08732688, + "balance_loss_mlp": 1.03923118, + "epoch": 0.030421451331690014, + "flos": 23621392920960.0, + "grad_norm": 1.8659249127334487, + "language_loss": 0.85175359, + "learning_rate": 3.999998634707293e-06, + "loss": 0.87570947, + "num_input_tokens_seen": 5260290, + "step": 253, + "time_per_iteration": 2.5431325435638428 + }, + { + "auxiliary_loss_clip": 0.01349088, + "auxiliary_loss_mlp": 0.01061977, + "balance_loss_clip": 1.09604633, + "balance_loss_mlp": 1.04398835, + "epoch": 0.030541694222329104, + "flos": 27928446883200.0, + "grad_norm": 3.1131912333374494, + "language_loss": 0.96146917, + "learning_rate": 3.999997572813182e-06, + "loss": 0.98557979, + "num_input_tokens_seen": 5278100, + "step": 254, + "time_per_iteration": 2.5236926078796387 + }, + { + "auxiliary_loss_clip": 0.01341971, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_clip": 1.08959723, + "balance_loss_mlp": 1.04896796, + "epoch": 0.030661937112968194, + "flos": 18588441006720.0, + "grad_norm": 2.010356337150569, + "language_loss": 0.87787712, + "learning_rate": 3.999996207521028e-06, + "loss": 0.90196311, + "num_input_tokens_seen": 5296810, + "step": 255, + "time_per_iteration": 2.468306541442871 + }, + { + "auxiliary_loss_clip": 0.01346157, + "auxiliary_loss_mlp": 0.01056482, + "balance_loss_clip": 1.08989549, + "balance_loss_mlp": 1.03730166, + "epoch": 0.030782180003607287, + "flos": 12969139478400.0, + "grad_norm": 2.538320763769327, + "language_loss": 0.82212889, + "learning_rate": 3.999994538831039e-06, + "loss": 0.84615529, + "num_input_tokens_seen": 5313395, + "step": 256, + "time_per_iteration": 2.508166551589966 + }, + { + "auxiliary_loss_clip": 0.01342017, + "auxiliary_loss_mlp": 0.01058889, + "balance_loss_clip": 1.08980048, + "balance_loss_mlp": 1.0397085, + "epoch": 0.030902422894246377, + "flos": 23335364920320.0, + "grad_norm": 2.347064661723936, + "language_loss": 0.85697758, + "learning_rate": 3.99999256674347e-06, + "loss": 0.88098669, + "num_input_tokens_seen": 5333545, + "step": 257, + "time_per_iteration": 2.523103952407837 + }, + { + "auxiliary_loss_clip": 0.01289325, + "auxiliary_loss_mlp": 0.01012211, + "balance_loss_clip": 1.09698629, + "balance_loss_mlp": 1.00205445, + "epoch": 0.031022665784885467, + "flos": 55094151438720.0, + "grad_norm": 1.0252776540493487, + "language_loss": 0.53536487, + "learning_rate": 3.999990291258618e-06, + "loss": 0.55838025, + "num_input_tokens_seen": 5392235, + "step": 258, + "time_per_iteration": 4.4721808433532715 + }, + { + "auxiliary_loss_clip": 0.01342005, + "auxiliary_loss_mlp": 0.01061279, + "balance_loss_clip": 1.0910008, + "balance_loss_mlp": 1.04319489, + "epoch": 0.03114290867552456, + "flos": 19317786664320.0, + "grad_norm": 2.1569075481388404, + "language_loss": 0.86791718, + "learning_rate": 3.999987712376829e-06, + "loss": 0.89195001, + "num_input_tokens_seen": 5410555, + "step": 259, + "time_per_iteration": 3.2292797565460205 + }, + { + "auxiliary_loss_clip": 0.01340752, + "auxiliary_loss_mlp": 0.01057439, + "balance_loss_clip": 1.09219646, + "balance_loss_mlp": 1.03937888, + "epoch": 0.031263151566163654, + "flos": 20959442881920.0, + "grad_norm": 2.3601388966649854, + "language_loss": 0.81990385, + "learning_rate": 3.999984830098494e-06, + "loss": 0.84388572, + "num_input_tokens_seen": 5430135, + "step": 260, + "time_per_iteration": 3.2680933475494385 + }, + { + "auxiliary_loss_clip": 0.01337756, + "auxiliary_loss_mlp": 0.01064155, + "balance_loss_clip": 1.08853579, + "balance_loss_mlp": 1.04574871, + "epoch": 0.03138339445680274, + "flos": 14793006412800.0, + "grad_norm": 4.398785457470957, + "language_loss": 0.97748947, + "learning_rate": 3.999981644424051e-06, + "loss": 1.00150847, + "num_input_tokens_seen": 5444935, + "step": 261, + "time_per_iteration": 2.4641926288604736 + }, + { + "auxiliary_loss_clip": 0.01340889, + "auxiliary_loss_mlp": 0.01069519, + "balance_loss_clip": 1.09319425, + "balance_loss_mlp": 1.05012393, + "epoch": 0.03150363734744183, + "flos": 11655599022720.0, + "grad_norm": 4.818695758006385, + "language_loss": 0.8592943, + "learning_rate": 3.999978155353982e-06, + "loss": 0.88339829, + "num_input_tokens_seen": 5462080, + "step": 262, + "time_per_iteration": 2.475142240524292 + }, + { + "auxiliary_loss_clip": 0.01337332, + "auxiliary_loss_mlp": 0.01065098, + "balance_loss_clip": 1.0886066, + "balance_loss_mlp": 1.04607177, + "epoch": 0.03162388023808092, + "flos": 33727732485120.0, + "grad_norm": 3.399421424332192, + "language_loss": 0.8060078, + "learning_rate": 3.9999743628888186e-06, + "loss": 0.83003205, + "num_input_tokens_seen": 5483870, + "step": 263, + "time_per_iteration": 2.5936546325683594 + }, + { + "auxiliary_loss_clip": 0.01329948, + "auxiliary_loss_mlp": 0.01058418, + "balance_loss_clip": 1.08459926, + "balance_loss_mlp": 1.03995264, + "epoch": 0.03174412312872001, + "flos": 20810952057600.0, + "grad_norm": 2.2926582509300326, + "language_loss": 0.89487189, + "learning_rate": 3.999970267029133e-06, + "loss": 0.91875553, + "num_input_tokens_seen": 5502830, + "step": 264, + "time_per_iteration": 2.5226500034332275 + }, + { + "auxiliary_loss_clip": 0.01334499, + "auxiliary_loss_mlp": 0.01059069, + "balance_loss_clip": 1.08869398, + "balance_loss_mlp": 1.041188, + "epoch": 0.0318643660193591, + "flos": 23727939638400.0, + "grad_norm": 1.7355492332419065, + "language_loss": 0.80086368, + "learning_rate": 3.999965867775548e-06, + "loss": 0.8247993, + "num_input_tokens_seen": 5523225, + "step": 265, + "time_per_iteration": 2.523355007171631 + }, + { + "auxiliary_loss_clip": 0.01335227, + "auxiliary_loss_mlp": 0.01063364, + "balance_loss_clip": 1.08805633, + "balance_loss_mlp": 1.04576862, + "epoch": 0.0319846089099982, + "flos": 13917863450880.0, + "grad_norm": 2.973373560065952, + "language_loss": 0.8724243, + "learning_rate": 3.9999611651287315e-06, + "loss": 0.89641017, + "num_input_tokens_seen": 5541380, + "step": 266, + "time_per_iteration": 2.4691309928894043 + }, + { + "auxiliary_loss_clip": 0.01339967, + "auxiliary_loss_mlp": 0.0105854, + "balance_loss_clip": 1.09189296, + "balance_loss_mlp": 1.04058754, + "epoch": 0.03210485180063729, + "flos": 14753253035520.0, + "grad_norm": 2.4726796979704533, + "language_loss": 0.78826392, + "learning_rate": 3.999956159089396e-06, + "loss": 0.812249, + "num_input_tokens_seen": 5558830, + "step": 267, + "time_per_iteration": 2.4948835372924805 + }, + { + "auxiliary_loss_clip": 0.01335497, + "auxiliary_loss_mlp": 0.01060933, + "balance_loss_clip": 1.08993149, + "balance_loss_mlp": 1.04268169, + "epoch": 0.03222509469127638, + "flos": 28913153304960.0, + "grad_norm": 2.3269629345562644, + "language_loss": 0.79626644, + "learning_rate": 3.999950849658302e-06, + "loss": 0.82023078, + "num_input_tokens_seen": 5577750, + "step": 268, + "time_per_iteration": 2.521566390991211 + }, + { + "auxiliary_loss_clip": 0.01341878, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_clip": 1.09165204, + "balance_loss_mlp": 1.04862237, + "epoch": 0.03234533758191547, + "flos": 16946389739520.0, + "grad_norm": 2.1842875661679924, + "language_loss": 0.84184265, + "learning_rate": 3.999945236836254e-06, + "loss": 0.8659277, + "num_input_tokens_seen": 5596715, + "step": 269, + "time_per_iteration": 2.4864931106567383 + }, + { + "auxiliary_loss_clip": 0.01341352, + "auxiliary_loss_mlp": 0.01064996, + "balance_loss_clip": 1.09304571, + "balance_loss_mlp": 1.0456121, + "epoch": 0.03246558047255456, + "flos": 18989096284800.0, + "grad_norm": 2.6961443275007917, + "language_loss": 0.94631845, + "learning_rate": 3.999939320624103e-06, + "loss": 0.97038192, + "num_input_tokens_seen": 5611865, + "step": 270, + "time_per_iteration": 2.4692611694335938 + }, + { + "auxiliary_loss_clip": 0.01338087, + "auxiliary_loss_mlp": 0.01060531, + "balance_loss_clip": 1.09167767, + "balance_loss_mlp": 1.04247093, + "epoch": 0.03258582336319365, + "flos": 23728334688000.0, + "grad_norm": 1.880260231793019, + "language_loss": 0.8989259, + "learning_rate": 3.999933101022749e-06, + "loss": 0.92291206, + "num_input_tokens_seen": 5632270, + "step": 271, + "time_per_iteration": 2.4885618686676025 + }, + { + "auxiliary_loss_clip": 0.01335552, + "auxiliary_loss_mlp": 0.01066174, + "balance_loss_clip": 1.09034991, + "balance_loss_mlp": 1.04803038, + "epoch": 0.032706066253832745, + "flos": 27670823562240.0, + "grad_norm": 1.7686440556527296, + "language_loss": 0.86753166, + "learning_rate": 3.999926578033132e-06, + "loss": 0.89154887, + "num_input_tokens_seen": 5652085, + "step": 272, + "time_per_iteration": 2.524630546569824 + }, + { + "auxiliary_loss_clip": 0.0133421, + "auxiliary_loss_mlp": 0.01065738, + "balance_loss_clip": 1.0863694, + "balance_loss_mlp": 1.04753494, + "epoch": 0.032826309144471835, + "flos": 45624685968000.0, + "grad_norm": 2.3549362223479497, + "language_loss": 0.6315338, + "learning_rate": 3.999919751656244e-06, + "loss": 0.65553331, + "num_input_tokens_seen": 5678985, + "step": 273, + "time_per_iteration": 2.6910767555236816 + }, + { + "auxiliary_loss_clip": 0.01330337, + "auxiliary_loss_mlp": 0.01053555, + "balance_loss_clip": 1.08599365, + "balance_loss_mlp": 1.03448176, + "epoch": 0.032946552035110925, + "flos": 25812374808960.0, + "grad_norm": 2.5682281599340655, + "language_loss": 0.75831425, + "learning_rate": 3.9999126218931195e-06, + "loss": 0.78215319, + "num_input_tokens_seen": 5697020, + "step": 274, + "time_per_iteration": 2.5102109909057617 + }, + { + "auxiliary_loss_clip": 0.01337186, + "auxiliary_loss_mlp": 0.010515, + "balance_loss_clip": 1.09148669, + "balance_loss_mlp": 1.03339219, + "epoch": 0.033066794925750015, + "flos": 15121984101120.0, + "grad_norm": 2.878062301508436, + "language_loss": 0.89797938, + "learning_rate": 3.99990518874484e-06, + "loss": 0.9218663, + "num_input_tokens_seen": 5713460, + "step": 275, + "time_per_iteration": 2.4556920528411865 + }, + { + "auxiliary_loss_clip": 0.013354, + "auxiliary_loss_mlp": 0.01065713, + "balance_loss_clip": 1.09040284, + "balance_loss_mlp": 1.04766488, + "epoch": 0.033187037816389105, + "flos": 22776593973120.0, + "grad_norm": 3.2358580974507647, + "language_loss": 0.92774928, + "learning_rate": 3.999897452212534e-06, + "loss": 0.95176041, + "num_input_tokens_seen": 5730790, + "step": 276, + "time_per_iteration": 2.487253427505493 + }, + { + "auxiliary_loss_clip": 0.01330666, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_clip": 1.08761072, + "balance_loss_mlp": 1.0427984, + "epoch": 0.033307280707028195, + "flos": 23331414424320.0, + "grad_norm": 4.262380899923124, + "language_loss": 1.00082946, + "learning_rate": 3.999889412297374e-06, + "loss": 1.02474856, + "num_input_tokens_seen": 5750215, + "step": 277, + "time_per_iteration": 2.4790239334106445 + }, + { + "auxiliary_loss_clip": 0.01329393, + "auxiliary_loss_mlp": 0.01045541, + "balance_loss_clip": 1.08541465, + "balance_loss_mlp": 1.02860165, + "epoch": 0.03342752359766729, + "flos": 28840290566400.0, + "grad_norm": 2.5533003350154266, + "language_loss": 0.79086143, + "learning_rate": 3.999881069000581e-06, + "loss": 0.81461072, + "num_input_tokens_seen": 5769945, + "step": 278, + "time_per_iteration": 2.5298759937286377 + }, + { + "auxiliary_loss_clip": 0.01332404, + "auxiliary_loss_mlp": 0.0105588, + "balance_loss_clip": 1.08695269, + "balance_loss_mlp": 1.03699684, + "epoch": 0.03354776648830638, + "flos": 19384544090880.0, + "grad_norm": 3.133447787759568, + "language_loss": 0.86998105, + "learning_rate": 3.99987242232342e-06, + "loss": 0.89386392, + "num_input_tokens_seen": 5784950, + "step": 279, + "time_per_iteration": 2.4571354389190674 + }, + { + "auxiliary_loss_clip": 0.01334643, + "auxiliary_loss_mlp": 0.01064606, + "balance_loss_clip": 1.09066594, + "balance_loss_mlp": 1.04591453, + "epoch": 0.03366800937894547, + "flos": 17858628472320.0, + "grad_norm": 2.0347294967594722, + "language_loss": 0.79733664, + "learning_rate": 3.9998634722672026e-06, + "loss": 0.82132918, + "num_input_tokens_seen": 5805005, + "step": 280, + "time_per_iteration": 2.4850215911865234 + }, + { + "auxiliary_loss_clip": 0.01332991, + "auxiliary_loss_mlp": 0.01058158, + "balance_loss_clip": 1.08951116, + "balance_loss_mlp": 1.04040742, + "epoch": 0.03378825226958456, + "flos": 35951033635200.0, + "grad_norm": 2.3779417071972047, + "language_loss": 0.78571248, + "learning_rate": 3.999854218833286e-06, + "loss": 0.80962402, + "num_input_tokens_seen": 5825825, + "step": 281, + "time_per_iteration": 2.595372438430786 + }, + { + "auxiliary_loss_clip": 0.01332998, + "auxiliary_loss_mlp": 0.01063474, + "balance_loss_clip": 1.09024668, + "balance_loss_mlp": 1.04498482, + "epoch": 0.03390849516022365, + "flos": 25702488126720.0, + "grad_norm": 1.9231194387308705, + "language_loss": 0.82091057, + "learning_rate": 3.999844662023075e-06, + "loss": 0.84487534, + "num_input_tokens_seen": 5845700, + "step": 282, + "time_per_iteration": 2.5061395168304443 + }, + { + "auxiliary_loss_clip": 0.01325032, + "auxiliary_loss_mlp": 0.01058501, + "balance_loss_clip": 1.08529902, + "balance_loss_mlp": 1.04030931, + "epoch": 0.03402873805086274, + "flos": 21284505987840.0, + "grad_norm": 1.8290216407459101, + "language_loss": 0.92033422, + "learning_rate": 3.999834801838018e-06, + "loss": 0.94416958, + "num_input_tokens_seen": 5864680, + "step": 283, + "time_per_iteration": 2.5019774436950684 + }, + { + "auxiliary_loss_clip": 0.01325713, + "auxiliary_loss_mlp": 0.01055625, + "balance_loss_clip": 1.0858469, + "balance_loss_mlp": 1.03783941, + "epoch": 0.03414898094150183, + "flos": 22710913954560.0, + "grad_norm": 1.8833173396244591, + "language_loss": 0.73907673, + "learning_rate": 3.9998246382796115e-06, + "loss": 0.7628901, + "num_input_tokens_seen": 5884260, + "step": 284, + "time_per_iteration": 3.2848219871520996 + }, + { + "auxiliary_loss_clip": 0.01331297, + "auxiliary_loss_mlp": 0.01052493, + "balance_loss_clip": 1.08548725, + "balance_loss_mlp": 1.03378952, + "epoch": 0.03426922383214093, + "flos": 18879927874560.0, + "grad_norm": 3.070291762045472, + "language_loss": 0.90585881, + "learning_rate": 3.999814171349399e-06, + "loss": 0.92969668, + "num_input_tokens_seen": 5902120, + "step": 285, + "time_per_iteration": 2.4737772941589355 + }, + { + "auxiliary_loss_clip": 0.01327082, + "auxiliary_loss_mlp": 0.0105934, + "balance_loss_clip": 1.08655608, + "balance_loss_mlp": 1.04205418, + "epoch": 0.03438946672278002, + "flos": 34752012716160.0, + "grad_norm": 1.9230458969364137, + "language_loss": 0.7365886, + "learning_rate": 3.9998034010489655e-06, + "loss": 0.76045281, + "num_input_tokens_seen": 5925810, + "step": 286, + "time_per_iteration": 4.0204784870147705 + }, + { + "auxiliary_loss_clip": 0.01327641, + "auxiliary_loss_mlp": 0.01061606, + "balance_loss_clip": 1.08903408, + "balance_loss_mlp": 1.04439211, + "epoch": 0.03450970961341911, + "flos": 22164102236160.0, + "grad_norm": 2.1941772352482554, + "language_loss": 0.75989258, + "learning_rate": 3.999792327379946e-06, + "loss": 0.7837851, + "num_input_tokens_seen": 5945185, + "step": 287, + "time_per_iteration": 2.4804317951202393 + }, + { + "auxiliary_loss_clip": 0.01332465, + "auxiliary_loss_mlp": 0.01062289, + "balance_loss_clip": 1.09228015, + "balance_loss_mlp": 1.04509866, + "epoch": 0.034629952504058197, + "flos": 21725740656000.0, + "grad_norm": 2.2542382146896114, + "language_loss": 0.96522832, + "learning_rate": 3.999780950344021e-06, + "loss": 0.98917586, + "num_input_tokens_seen": 5963375, + "step": 288, + "time_per_iteration": 3.281276226043701 + }, + { + "auxiliary_loss_clip": 0.01333601, + "auxiliary_loss_mlp": 0.01066815, + "balance_loss_clip": 1.089252, + "balance_loss_mlp": 1.04829001, + "epoch": 0.034750195394697286, + "flos": 20047994248320.0, + "grad_norm": 2.415410867275009, + "language_loss": 0.82572448, + "learning_rate": 3.999769269942916e-06, + "loss": 0.8497287, + "num_input_tokens_seen": 5983415, + "step": 289, + "time_per_iteration": 2.5055954456329346 + }, + { + "auxiliary_loss_clip": 0.01328035, + "auxiliary_loss_mlp": 0.01055867, + "balance_loss_clip": 1.08744419, + "balance_loss_mlp": 1.03787804, + "epoch": 0.034870438285336376, + "flos": 27965865876480.0, + "grad_norm": 1.911709338502319, + "language_loss": 0.8086949, + "learning_rate": 3.999757286178402e-06, + "loss": 0.83253396, + "num_input_tokens_seen": 6005850, + "step": 290, + "time_per_iteration": 2.5642905235290527 + }, + { + "auxiliary_loss_clip": 0.01331526, + "auxiliary_loss_mlp": 0.01053261, + "balance_loss_clip": 1.09000993, + "balance_loss_mlp": 1.03530812, + "epoch": 0.03499068117597547, + "flos": 22017514832640.0, + "grad_norm": 2.3183233791046796, + "language_loss": 0.90844923, + "learning_rate": 3.999744999052299e-06, + "loss": 0.93229711, + "num_input_tokens_seen": 6027240, + "step": 291, + "time_per_iteration": 2.5462400913238525 + }, + { + "auxiliary_loss_clip": 0.01275574, + "auxiliary_loss_mlp": 0.01010972, + "balance_loss_clip": 1.0947113, + "balance_loss_mlp": 1.00086331, + "epoch": 0.03511092406661456, + "flos": 57242147725440.0, + "grad_norm": 0.956201032307118, + "language_loss": 0.6115731, + "learning_rate": 3.9997324085664675e-06, + "loss": 0.63443851, + "num_input_tokens_seen": 6087470, + "step": 292, + "time_per_iteration": 3.0921735763549805 + }, + { + "auxiliary_loss_clip": 0.01325674, + "auxiliary_loss_mlp": 0.01060805, + "balance_loss_clip": 1.08569109, + "balance_loss_mlp": 1.04264927, + "epoch": 0.03523116695725365, + "flos": 22928065626240.0, + "grad_norm": 2.5758150191211935, + "language_loss": 0.92101622, + "learning_rate": 3.999719514722821e-06, + "loss": 0.94488096, + "num_input_tokens_seen": 6107600, + "step": 293, + "time_per_iteration": 2.5170655250549316 + }, + { + "auxiliary_loss_clip": 0.01323159, + "auxiliary_loss_mlp": 0.01057293, + "balance_loss_clip": 1.08535314, + "balance_loss_mlp": 1.04048443, + "epoch": 0.03535140984789274, + "flos": 36903241226880.0, + "grad_norm": 5.697673959254209, + "language_loss": 0.74870491, + "learning_rate": 3.999706317523314e-06, + "loss": 0.77250946, + "num_input_tokens_seen": 6126160, + "step": 294, + "time_per_iteration": 2.6248831748962402 + }, + { + "auxiliary_loss_clip": 0.01324696, + "auxiliary_loss_mlp": 0.01054199, + "balance_loss_clip": 1.08683419, + "balance_loss_mlp": 1.03734291, + "epoch": 0.03547165273853183, + "flos": 20449152316800.0, + "grad_norm": 2.0012875947000888, + "language_loss": 0.86160874, + "learning_rate": 3.999692816969948e-06, + "loss": 0.88539773, + "num_input_tokens_seen": 6145695, + "step": 295, + "time_per_iteration": 2.4810407161712646 + }, + { + "auxiliary_loss_clip": 0.01262675, + "auxiliary_loss_mlp": 0.01008912, + "balance_loss_clip": 1.08641827, + "balance_loss_mlp": 0.99894661, + "epoch": 0.03559189562917092, + "flos": 69850564871040.0, + "grad_norm": 0.9944287142691653, + "language_loss": 0.69425642, + "learning_rate": 3.999679013064772e-06, + "loss": 0.71697229, + "num_input_tokens_seen": 6212440, + "step": 296, + "time_per_iteration": 3.1027281284332275 + }, + { + "auxiliary_loss_clip": 0.01327417, + "auxiliary_loss_mlp": 0.01056539, + "balance_loss_clip": 1.08839178, + "balance_loss_mlp": 1.03896713, + "epoch": 0.03571213851981002, + "flos": 21651944163840.0, + "grad_norm": 2.566228746400097, + "language_loss": 0.85612941, + "learning_rate": 3.99966490580988e-06, + "loss": 0.879969, + "num_input_tokens_seen": 6229800, + "step": 297, + "time_per_iteration": 2.507291793823242 + }, + { + "auxiliary_loss_clip": 0.01328405, + "auxiliary_loss_mlp": 0.01060601, + "balance_loss_clip": 1.08705926, + "balance_loss_mlp": 1.04339886, + "epoch": 0.03583238141044911, + "flos": 43945610757120.0, + "grad_norm": 4.56651286089815, + "language_loss": 0.65820038, + "learning_rate": 3.999650495207411e-06, + "loss": 0.68209052, + "num_input_tokens_seen": 6255825, + "step": 298, + "time_per_iteration": 2.718258857727051 + }, + { + "auxiliary_loss_clip": 0.01321999, + "auxiliary_loss_mlp": 0.01058667, + "balance_loss_clip": 1.08640397, + "balance_loss_mlp": 1.04072642, + "epoch": 0.0359526243010882, + "flos": 18910810592640.0, + "grad_norm": 2.795958941673833, + "language_loss": 0.90516531, + "learning_rate": 3.999635781259553e-06, + "loss": 0.92897195, + "num_input_tokens_seen": 6271090, + "step": 299, + "time_per_iteration": 2.474907636642456 + }, + { + "auxiliary_loss_clip": 0.01247493, + "auxiliary_loss_mlp": 0.01011162, + "balance_loss_clip": 1.07616377, + "balance_loss_mlp": 1.00129175, + "epoch": 0.03607286719172729, + "flos": 61668892782720.0, + "grad_norm": 0.9187193592068028, + "language_loss": 0.52312791, + "learning_rate": 3.999620763968535e-06, + "loss": 0.5457145, + "num_input_tokens_seen": 6329965, + "step": 300, + "time_per_iteration": 2.919100761413574 + }, + { + "auxiliary_loss_clip": 0.01320971, + "auxiliary_loss_mlp": 0.01058401, + "balance_loss_clip": 1.08601809, + "balance_loss_mlp": 1.04056692, + "epoch": 0.03619311008236638, + "flos": 27819062991360.0, + "grad_norm": 1.8779366080323172, + "language_loss": 0.86569273, + "learning_rate": 3.999605443336638e-06, + "loss": 0.88948649, + "num_input_tokens_seen": 6352095, + "step": 301, + "time_per_iteration": 2.558218240737915 + }, + { + "auxiliary_loss_clip": 0.01327276, + "auxiliary_loss_mlp": 0.01061268, + "balance_loss_clip": 1.08868957, + "balance_loss_mlp": 1.04349375, + "epoch": 0.03631335297300547, + "flos": 13621133197440.0, + "grad_norm": 2.534593622403176, + "language_loss": 0.89098251, + "learning_rate": 3.999589819366185e-06, + "loss": 0.91486794, + "num_input_tokens_seen": 6365885, + "step": 302, + "time_per_iteration": 2.4483182430267334 + }, + { + "auxiliary_loss_clip": 0.01326944, + "auxiliary_loss_mlp": 0.01056995, + "balance_loss_clip": 1.0876224, + "balance_loss_mlp": 1.03868484, + "epoch": 0.036433595863644565, + "flos": 27631788456960.0, + "grad_norm": 1.9945494571412443, + "language_loss": 0.84776509, + "learning_rate": 3.999573892059547e-06, + "loss": 0.87160444, + "num_input_tokens_seen": 6385015, + "step": 303, + "time_per_iteration": 2.548220634460449 + }, + { + "auxiliary_loss_clip": 0.01329677, + "auxiliary_loss_mlp": 0.01065341, + "balance_loss_clip": 1.08840084, + "balance_loss_mlp": 1.04624391, + "epoch": 0.036553838754283655, + "flos": 24572020314240.0, + "grad_norm": 2.0467309468304378, + "language_loss": 0.80708086, + "learning_rate": 3.999557661419138e-06, + "loss": 0.83103096, + "num_input_tokens_seen": 6405165, + "step": 304, + "time_per_iteration": 2.514946222305298 + }, + { + "auxiliary_loss_clip": 0.01328591, + "auxiliary_loss_mlp": 0.01055552, + "balance_loss_clip": 1.08992565, + "balance_loss_mlp": 1.03912473, + "epoch": 0.036674081644922744, + "flos": 23404313076480.0, + "grad_norm": 2.458833653911432, + "language_loss": 0.81248188, + "learning_rate": 3.9995411274474225e-06, + "loss": 0.83632332, + "num_input_tokens_seen": 6424445, + "step": 305, + "time_per_iteration": 2.5794801712036133 + }, + { + "auxiliary_loss_clip": 0.01325595, + "auxiliary_loss_mlp": 0.01064579, + "balance_loss_clip": 1.08685207, + "balance_loss_mlp": 1.04616141, + "epoch": 0.036794324535561834, + "flos": 27489690253440.0, + "grad_norm": 1.8315712809491722, + "language_loss": 0.81404781, + "learning_rate": 3.999524290146908e-06, + "loss": 0.83794957, + "num_input_tokens_seen": 6444650, + "step": 306, + "time_per_iteration": 2.530320167541504 + }, + { + "auxiliary_loss_clip": 0.01323805, + "auxiliary_loss_mlp": 0.01065729, + "balance_loss_clip": 1.08837736, + "balance_loss_mlp": 1.04795527, + "epoch": 0.036914567426200924, + "flos": 19463476227840.0, + "grad_norm": 2.351504085893815, + "language_loss": 0.92860562, + "learning_rate": 3.9995071495201485e-06, + "loss": 0.952501, + "num_input_tokens_seen": 6461755, + "step": 307, + "time_per_iteration": 2.480562210083008 + }, + { + "auxiliary_loss_clip": 0.01323101, + "auxiliary_loss_mlp": 0.01057908, + "balance_loss_clip": 1.08711219, + "balance_loss_mlp": 1.03963304, + "epoch": 0.037034810316840014, + "flos": 22309324922880.0, + "grad_norm": 2.5107950336766707, + "language_loss": 0.98026413, + "learning_rate": 3.999489705569744e-06, + "loss": 1.0040741, + "num_input_tokens_seen": 6479455, + "step": 308, + "time_per_iteration": 2.520212411880493 + }, + { + "auxiliary_loss_clip": 0.01319972, + "auxiliary_loss_mlp": 0.01060587, + "balance_loss_clip": 1.0835247, + "balance_loss_mlp": 1.04350424, + "epoch": 0.03715505320747911, + "flos": 18588333265920.0, + "grad_norm": 4.246892554590944, + "language_loss": 0.86535585, + "learning_rate": 3.999471958298341e-06, + "loss": 0.88916135, + "num_input_tokens_seen": 6498365, + "step": 309, + "time_per_iteration": 2.4561514854431152 + }, + { + "auxiliary_loss_clip": 0.01326527, + "auxiliary_loss_mlp": 0.01064597, + "balance_loss_clip": 1.08838558, + "balance_loss_mlp": 1.04628611, + "epoch": 0.0372752960981182, + "flos": 35955343267200.0, + "grad_norm": 2.0089292931076517, + "language_loss": 0.75975102, + "learning_rate": 3.999453907708631e-06, + "loss": 0.78366226, + "num_input_tokens_seen": 6520770, + "step": 310, + "time_per_iteration": 2.589653730392456 + }, + { + "auxiliary_loss_clip": 0.01323638, + "auxiliary_loss_mlp": 0.01051798, + "balance_loss_clip": 1.08718109, + "balance_loss_mlp": 1.0353353, + "epoch": 0.03739553898875729, + "flos": 20814040627200.0, + "grad_norm": 1.8118862464927357, + "language_loss": 0.81237221, + "learning_rate": 3.999435553803353e-06, + "loss": 0.83612657, + "num_input_tokens_seen": 6540170, + "step": 311, + "time_per_iteration": 3.285174608230591 + }, + { + "auxiliary_loss_clip": 0.01321397, + "auxiliary_loss_mlp": 0.01061583, + "balance_loss_clip": 1.08670557, + "balance_loss_mlp": 1.04446435, + "epoch": 0.03751578187939638, + "flos": 20264140339200.0, + "grad_norm": 2.515633357922611, + "language_loss": 0.83384275, + "learning_rate": 3.999416896585292e-06, + "loss": 0.85767251, + "num_input_tokens_seen": 6557200, + "step": 312, + "time_per_iteration": 2.49153733253479 + }, + { + "auxiliary_loss_clip": 0.01320988, + "auxiliary_loss_mlp": 0.01056582, + "balance_loss_clip": 1.08443785, + "balance_loss_mlp": 1.03932035, + "epoch": 0.03763602477003547, + "flos": 20668063754880.0, + "grad_norm": 3.8019515551363283, + "language_loss": 0.86598945, + "learning_rate": 3.9993979360572775e-06, + "loss": 0.88976514, + "num_input_tokens_seen": 6577340, + "step": 313, + "time_per_iteration": 4.006702184677124 + }, + { + "auxiliary_loss_clip": 0.01328346, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_clip": 1.08934045, + "balance_loss_mlp": 1.04233122, + "epoch": 0.03775626766067456, + "flos": 16691352197760.0, + "grad_norm": 3.1595071754582102, + "language_loss": 0.82739764, + "learning_rate": 3.999378672222185e-06, + "loss": 0.85127813, + "num_input_tokens_seen": 6595125, + "step": 314, + "time_per_iteration": 2.46445631980896 + }, + { + "auxiliary_loss_clip": 0.01321963, + "auxiliary_loss_mlp": 0.01055233, + "balance_loss_clip": 1.08732605, + "balance_loss_mlp": 1.03657651, + "epoch": 0.03787651055131366, + "flos": 21141797253120.0, + "grad_norm": 1.9663889476601286, + "language_loss": 0.83048242, + "learning_rate": 3.9993591050829385e-06, + "loss": 0.85425436, + "num_input_tokens_seen": 6612990, + "step": 315, + "time_per_iteration": 3.2187600135803223 + }, + { + "auxiliary_loss_clip": 0.01322307, + "auxiliary_loss_mlp": 0.01064695, + "balance_loss_clip": 1.08702755, + "balance_loss_mlp": 1.04688501, + "epoch": 0.037996753441952746, + "flos": 22018089450240.0, + "grad_norm": 1.9853691005110956, + "language_loss": 0.79126501, + "learning_rate": 3.999339234642506e-06, + "loss": 0.815135, + "num_input_tokens_seen": 6632740, + "step": 316, + "time_per_iteration": 2.474998950958252 + }, + { + "auxiliary_loss_clip": 0.01324625, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_clip": 1.08964682, + "balance_loss_mlp": 1.03023279, + "epoch": 0.038116996332591836, + "flos": 27709391790720.0, + "grad_norm": 2.2753473060619163, + "language_loss": 0.83541602, + "learning_rate": 3.9993190609038994e-06, + "loss": 0.85914683, + "num_input_tokens_seen": 6651505, + "step": 317, + "time_per_iteration": 2.5113537311553955 + }, + { + "auxiliary_loss_clip": 0.01313289, + "auxiliary_loss_mlp": 0.01051586, + "balance_loss_clip": 1.08207631, + "balance_loss_mlp": 1.03424072, + "epoch": 0.038237239223230926, + "flos": 21178067011200.0, + "grad_norm": 1.8474948593105993, + "language_loss": 0.83223224, + "learning_rate": 3.999298583870182e-06, + "loss": 0.85588104, + "num_input_tokens_seen": 6671090, + "step": 318, + "time_per_iteration": 2.5051167011260986 + }, + { + "auxiliary_loss_clip": 0.01315902, + "auxiliary_loss_mlp": 0.01058173, + "balance_loss_clip": 1.08275998, + "balance_loss_mlp": 1.04086375, + "epoch": 0.038357482113870016, + "flos": 25556618995200.0, + "grad_norm": 1.913066097413741, + "language_loss": 0.77464581, + "learning_rate": 3.999277803544458e-06, + "loss": 0.79838651, + "num_input_tokens_seen": 6691245, + "step": 319, + "time_per_iteration": 2.60099720954895 + }, + { + "auxiliary_loss_clip": 0.01219404, + "auxiliary_loss_mlp": 0.01014322, + "balance_loss_clip": 1.0603559, + "balance_loss_mlp": 1.00535738, + "epoch": 0.038477725004509106, + "flos": 59227578034560.0, + "grad_norm": 0.9436031793909948, + "language_loss": 0.62377346, + "learning_rate": 3.999256719929882e-06, + "loss": 0.64611077, + "num_input_tokens_seen": 6752520, + "step": 320, + "time_per_iteration": 3.073620080947876 + }, + { + "auxiliary_loss_clip": 0.01218725, + "auxiliary_loss_mlp": 0.0101119, + "balance_loss_clip": 1.06007957, + "balance_loss_mlp": 1.0024159, + "epoch": 0.0385979678951482, + "flos": 67317676398720.0, + "grad_norm": 1.214502913456273, + "language_loss": 0.67093921, + "learning_rate": 3.999235333029651e-06, + "loss": 0.69323838, + "num_input_tokens_seen": 6806460, + "step": 321, + "time_per_iteration": 3.003993511199951 + }, + { + "auxiliary_loss_clip": 0.0131478, + "auxiliary_loss_mlp": 0.01055672, + "balance_loss_clip": 1.08509707, + "balance_loss_mlp": 1.0393641, + "epoch": 0.03871821078578729, + "flos": 22746752749440.0, + "grad_norm": 1.9513842049528818, + "language_loss": 0.81862938, + "learning_rate": 3.999213642847009e-06, + "loss": 0.84233391, + "num_input_tokens_seen": 6827045, + "step": 322, + "time_per_iteration": 2.609020948410034 + }, + { + "auxiliary_loss_clip": 0.01315036, + "auxiliary_loss_mlp": 0.01050703, + "balance_loss_clip": 1.08233976, + "balance_loss_mlp": 1.03431141, + "epoch": 0.03883845367642638, + "flos": 26280613526400.0, + "grad_norm": 1.8164798398968927, + "language_loss": 0.91327918, + "learning_rate": 3.999191649385247e-06, + "loss": 0.93693656, + "num_input_tokens_seen": 6848220, + "step": 323, + "time_per_iteration": 2.6252121925354004 + }, + { + "auxiliary_loss_clip": 0.01213392, + "auxiliary_loss_mlp": 0.01011024, + "balance_loss_clip": 1.05699348, + "balance_loss_mlp": 1.0025841, + "epoch": 0.03895869656706547, + "flos": 56962835568000.0, + "grad_norm": 0.9040895075930151, + "language_loss": 0.59717798, + "learning_rate": 3.999169352647702e-06, + "loss": 0.61942214, + "num_input_tokens_seen": 6909400, + "step": 324, + "time_per_iteration": 3.016223430633545 + }, + { + "auxiliary_loss_clip": 0.01317804, + "auxiliary_loss_mlp": 0.01075828, + "balance_loss_clip": 1.08438349, + "balance_loss_mlp": 1.05742192, + "epoch": 0.03907893945770456, + "flos": 24863363527680.0, + "grad_norm": 1.8781251848175944, + "language_loss": 0.83045161, + "learning_rate": 3.999146752637755e-06, + "loss": 0.854388, + "num_input_tokens_seen": 6930445, + "step": 325, + "time_per_iteration": 2.6454153060913086 + }, + { + "auxiliary_loss_clip": 0.01314468, + "auxiliary_loss_mlp": 0.01056587, + "balance_loss_clip": 1.08235145, + "balance_loss_mlp": 1.03908658, + "epoch": 0.03919918234834365, + "flos": 18368595815040.0, + "grad_norm": 2.3824528994339427, + "language_loss": 0.89779007, + "learning_rate": 3.999123849358836e-06, + "loss": 0.92150056, + "num_input_tokens_seen": 6948110, + "step": 326, + "time_per_iteration": 2.4771316051483154 + }, + { + "auxiliary_loss_clip": 0.01314286, + "auxiliary_loss_mlp": 0.01059719, + "balance_loss_clip": 1.08249521, + "balance_loss_mlp": 1.04165912, + "epoch": 0.03931942523898275, + "flos": 25225414663680.0, + "grad_norm": 2.1935023633981174, + "language_loss": 0.74551797, + "learning_rate": 3.999100642814418e-06, + "loss": 0.76925802, + "num_input_tokens_seen": 6968550, + "step": 327, + "time_per_iteration": 2.537433385848999 + }, + { + "auxiliary_loss_clip": 0.01314826, + "auxiliary_loss_mlp": 0.01062518, + "balance_loss_clip": 1.08435059, + "balance_loss_mlp": 1.04494631, + "epoch": 0.03943966812962184, + "flos": 23257905240960.0, + "grad_norm": 2.8757409776741625, + "language_loss": 0.88505042, + "learning_rate": 3.999077133008022e-06, + "loss": 0.90882385, + "num_input_tokens_seen": 6987135, + "step": 328, + "time_per_iteration": 2.527554750442505 + }, + { + "auxiliary_loss_clip": 0.01316408, + "auxiliary_loss_mlp": 0.01063347, + "balance_loss_clip": 1.08398223, + "balance_loss_mlp": 1.04379714, + "epoch": 0.03955991102026093, + "flos": 29168837291520.0, + "grad_norm": 1.8404334953321044, + "language_loss": 0.90596014, + "learning_rate": 3.9990533199432145e-06, + "loss": 0.92975771, + "num_input_tokens_seen": 7008630, + "step": 329, + "time_per_iteration": 2.5503015518188477 + }, + { + "auxiliary_loss_clip": 0.0131511, + "auxiliary_loss_mlp": 0.01056945, + "balance_loss_clip": 1.0836792, + "balance_loss_mlp": 1.03919506, + "epoch": 0.03968015391090002, + "flos": 17602441695360.0, + "grad_norm": 2.3730231531057426, + "language_loss": 0.75864828, + "learning_rate": 3.999029203623608e-06, + "loss": 0.78236884, + "num_input_tokens_seen": 7026350, + "step": 330, + "time_per_iteration": 2.4718339443206787 + }, + { + "auxiliary_loss_clip": 0.01310602, + "auxiliary_loss_mlp": 0.01051923, + "balance_loss_clip": 1.08228421, + "balance_loss_mlp": 1.03427982, + "epoch": 0.03980039680153911, + "flos": 21799285752960.0, + "grad_norm": 2.205257816804057, + "language_loss": 0.86954749, + "learning_rate": 3.99900478405286e-06, + "loss": 0.89317274, + "num_input_tokens_seen": 7045660, + "step": 331, + "time_per_iteration": 2.506176471710205 + }, + { + "auxiliary_loss_clip": 0.01312153, + "auxiliary_loss_mlp": 0.01055656, + "balance_loss_clip": 1.08537209, + "balance_loss_mlp": 1.03986049, + "epoch": 0.0399206396921782, + "flos": 15195134148480.0, + "grad_norm": 2.326626942481044, + "language_loss": 0.82367086, + "learning_rate": 3.998980061234676e-06, + "loss": 0.84734893, + "num_input_tokens_seen": 7063575, + "step": 332, + "time_per_iteration": 2.4539794921875 + }, + { + "auxiliary_loss_clip": 0.01316718, + "auxiliary_loss_mlp": 0.01056486, + "balance_loss_clip": 1.08298492, + "balance_loss_mlp": 1.03834236, + "epoch": 0.040040882582817294, + "flos": 14422910630400.0, + "grad_norm": 5.1522867585524486, + "language_loss": 0.7598629, + "learning_rate": 3.9989550351728055e-06, + "loss": 0.78359491, + "num_input_tokens_seen": 7080505, + "step": 333, + "time_per_iteration": 2.464097261428833 + }, + { + "auxiliary_loss_clip": 0.0131382, + "auxiliary_loss_mlp": 0.01056974, + "balance_loss_clip": 1.08498096, + "balance_loss_mlp": 1.04026067, + "epoch": 0.040161125473456384, + "flos": 19280906375040.0, + "grad_norm": 2.6971830514954465, + "language_loss": 0.84647095, + "learning_rate": 3.998929705871046e-06, + "loss": 0.87017888, + "num_input_tokens_seen": 7097860, + "step": 334, + "time_per_iteration": 2.4804749488830566 + }, + { + "auxiliary_loss_clip": 0.01312448, + "auxiliary_loss_mlp": 0.01057339, + "balance_loss_clip": 1.08559787, + "balance_loss_mlp": 1.04069746, + "epoch": 0.040281368364095474, + "flos": 17821101738240.0, + "grad_norm": 2.9147075769844464, + "language_loss": 0.88949561, + "learning_rate": 3.99890407333324e-06, + "loss": 0.91319346, + "num_input_tokens_seen": 7116390, + "step": 335, + "time_per_iteration": 2.4682934284210205 + }, + { + "auxiliary_loss_clip": 0.01309159, + "auxiliary_loss_mlp": 0.01055216, + "balance_loss_clip": 1.07934308, + "balance_loss_mlp": 1.03782332, + "epoch": 0.040401611254734564, + "flos": 19573757959680.0, + "grad_norm": 1.9584373847112018, + "language_loss": 0.86938936, + "learning_rate": 3.998878137563275e-06, + "loss": 0.89303315, + "num_input_tokens_seen": 7135940, + "step": 336, + "time_per_iteration": 2.4650237560272217 + }, + { + "auxiliary_loss_clip": 0.01310372, + "auxiliary_loss_mlp": 0.01052812, + "balance_loss_clip": 1.08129871, + "balance_loss_mlp": 1.03510952, + "epoch": 0.040521854145373654, + "flos": 22054466949120.0, + "grad_norm": 2.0606216837567986, + "language_loss": 0.85248518, + "learning_rate": 3.998851898565085e-06, + "loss": 0.87611711, + "num_input_tokens_seen": 7155745, + "step": 337, + "time_per_iteration": 2.4912075996398926 + }, + { + "auxiliary_loss_clip": 0.01306981, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.07969785, + "balance_loss_mlp": 1.03168809, + "epoch": 0.04064209703601274, + "flos": 22674644196480.0, + "grad_norm": 2.5101495294052074, + "language_loss": 0.83097482, + "learning_rate": 3.998825356342653e-06, + "loss": 0.85452592, + "num_input_tokens_seen": 7175920, + "step": 338, + "time_per_iteration": 3.2604620456695557 + }, + { + "auxiliary_loss_clip": 0.01311223, + "auxiliary_loss_mlp": 0.01060529, + "balance_loss_clip": 1.08102417, + "balance_loss_mlp": 1.04344618, + "epoch": 0.04076233992665183, + "flos": 38582172783360.0, + "grad_norm": 2.1851602322778807, + "language_loss": 0.73371595, + "learning_rate": 3.998798510900003e-06, + "loss": 0.75743341, + "num_input_tokens_seen": 7198720, + "step": 339, + "time_per_iteration": 2.622479200363159 + }, + { + "auxiliary_loss_clip": 0.01309144, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.08058238, + "balance_loss_mlp": 1.0346241, + "epoch": 0.04088258281729093, + "flos": 25885309374720.0, + "grad_norm": 2.1588066935533394, + "language_loss": 0.83771682, + "learning_rate": 3.998771362241207e-06, + "loss": 0.86132431, + "num_input_tokens_seen": 7219125, + "step": 340, + "time_per_iteration": 3.4051342010498047 + }, + { + "auxiliary_loss_clip": 0.01304589, + "auxiliary_loss_mlp": 0.01053544, + "balance_loss_clip": 1.07899404, + "balance_loss_mlp": 1.03698564, + "epoch": 0.04100282570793002, + "flos": 19789832223360.0, + "grad_norm": 1.8102629071898464, + "language_loss": 0.88045514, + "learning_rate": 3.998743910370385e-06, + "loss": 0.90403652, + "num_input_tokens_seen": 7237985, + "step": 341, + "time_per_iteration": 2.4822134971618652 + }, + { + "auxiliary_loss_clip": 0.01314951, + "auxiliary_loss_mlp": 0.01048161, + "balance_loss_clip": 1.08846128, + "balance_loss_mlp": 1.02979136, + "epoch": 0.04112306859856911, + "flos": 22565152563840.0, + "grad_norm": 2.3604211269959023, + "language_loss": 0.73348784, + "learning_rate": 3.998716155291702e-06, + "loss": 0.75711894, + "num_input_tokens_seen": 7255825, + "step": 342, + "time_per_iteration": 3.2382519245147705 + }, + { + "auxiliary_loss_clip": 0.01311463, + "auxiliary_loss_mlp": 0.01057718, + "balance_loss_clip": 1.08532834, + "balance_loss_mlp": 1.04024172, + "epoch": 0.0412433114892082, + "flos": 25040654081280.0, + "grad_norm": 1.9425167877447709, + "language_loss": 0.9048599, + "learning_rate": 3.998688097009366e-06, + "loss": 0.92855167, + "num_input_tokens_seen": 7276590, + "step": 343, + "time_per_iteration": 2.502178430557251 + }, + { + "auxiliary_loss_clip": 0.01311018, + "auxiliary_loss_mlp": 0.01057823, + "balance_loss_clip": 1.08274341, + "balance_loss_mlp": 1.04184866, + "epoch": 0.04136355437984729, + "flos": 25191371548800.0, + "grad_norm": 2.3222421895329677, + "language_loss": 0.80014229, + "learning_rate": 3.998659735527636e-06, + "loss": 0.82383072, + "num_input_tokens_seen": 7295680, + "step": 344, + "time_per_iteration": 2.535226821899414 + }, + { + "auxiliary_loss_clip": 0.01309275, + "auxiliary_loss_mlp": 0.01055313, + "balance_loss_clip": 1.08166039, + "balance_loss_mlp": 1.03838563, + "epoch": 0.04148379727048638, + "flos": 22966777509120.0, + "grad_norm": 1.7302722998068656, + "language_loss": 0.77642286, + "learning_rate": 3.998631070850813e-06, + "loss": 0.8000688, + "num_input_tokens_seen": 7316300, + "step": 345, + "time_per_iteration": 2.4912161827087402 + }, + { + "auxiliary_loss_clip": 0.01307482, + "auxiliary_loss_mlp": 0.01059455, + "balance_loss_clip": 1.08352804, + "balance_loss_mlp": 1.04404104, + "epoch": 0.041604040161125476, + "flos": 14063481187200.0, + "grad_norm": 2.898414377965806, + "language_loss": 0.83564025, + "learning_rate": 3.9986021029832455e-06, + "loss": 0.85930967, + "num_input_tokens_seen": 7333615, + "step": 346, + "time_per_iteration": 2.54872727394104 + }, + { + "auxiliary_loss_clip": 0.01307571, + "auxiliary_loss_mlp": 0.01054875, + "balance_loss_clip": 1.08012533, + "balance_loss_mlp": 1.03644574, + "epoch": 0.041724283051764566, + "flos": 12091877614080.0, + "grad_norm": 5.047615285357826, + "language_loss": 0.91783828, + "learning_rate": 3.9985728319293285e-06, + "loss": 0.9414627, + "num_input_tokens_seen": 7347590, + "step": 347, + "time_per_iteration": 2.4675121307373047 + }, + { + "auxiliary_loss_clip": 0.0131176, + "auxiliary_loss_mlp": 0.01051879, + "balance_loss_clip": 1.08102751, + "balance_loss_mlp": 1.03429568, + "epoch": 0.041844525942403656, + "flos": 12385303816320.0, + "grad_norm": 1.9615180112849884, + "language_loss": 0.85155797, + "learning_rate": 3.998543257693501e-06, + "loss": 0.87519437, + "num_input_tokens_seen": 7364345, + "step": 348, + "time_per_iteration": 2.502147674560547 + }, + { + "auxiliary_loss_clip": 0.01308139, + "auxiliary_loss_mlp": 0.01064306, + "balance_loss_clip": 1.0822885, + "balance_loss_mlp": 1.04797411, + "epoch": 0.041964768833042745, + "flos": 23769345041280.0, + "grad_norm": 2.3442214399782837, + "language_loss": 0.87964201, + "learning_rate": 3.998513380280251e-06, + "loss": 0.90336645, + "num_input_tokens_seen": 7384625, + "step": 349, + "time_per_iteration": 2.5076870918273926 + }, + { + "auxiliary_loss_clip": 0.01311887, + "auxiliary_loss_mlp": 0.01070963, + "balance_loss_clip": 1.08300877, + "balance_loss_mlp": 1.05223489, + "epoch": 0.042085011723681835, + "flos": 11875336473600.0, + "grad_norm": 10.723986479896304, + "language_loss": 0.95015419, + "learning_rate": 3.99848319969411e-06, + "loss": 0.97398269, + "num_input_tokens_seen": 7402225, + "step": 350, + "time_per_iteration": 2.4794509410858154 + }, + { + "auxiliary_loss_clip": 0.01313776, + "auxiliary_loss_mlp": 0.01060482, + "balance_loss_clip": 1.08469117, + "balance_loss_mlp": 1.04293489, + "epoch": 0.042205254614320925, + "flos": 16873957964160.0, + "grad_norm": 2.2580122542223857, + "language_loss": 0.79506171, + "learning_rate": 3.9984527159396564e-06, + "loss": 0.81880426, + "num_input_tokens_seen": 7420865, + "step": 351, + "time_per_iteration": 2.4528613090515137 + }, + { + "auxiliary_loss_clip": 0.01307944, + "auxiliary_loss_mlp": 0.01055303, + "balance_loss_clip": 1.07946157, + "balance_loss_mlp": 1.03905523, + "epoch": 0.04232549750496002, + "flos": 25118508810240.0, + "grad_norm": 2.098863148784774, + "language_loss": 0.84698367, + "learning_rate": 3.9984219290215154e-06, + "loss": 0.8706162, + "num_input_tokens_seen": 7441040, + "step": 352, + "time_per_iteration": 2.520439863204956 + }, + { + "auxiliary_loss_clip": 0.01305016, + "auxiliary_loss_mlp": 0.01051342, + "balance_loss_clip": 1.08195615, + "balance_loss_mlp": 1.03576112, + "epoch": 0.04244574039559911, + "flos": 26724541714560.0, + "grad_norm": 1.6343933985270127, + "language_loss": 0.89132303, + "learning_rate": 3.998390838944356e-06, + "loss": 0.91488659, + "num_input_tokens_seen": 7462545, + "step": 353, + "time_per_iteration": 2.515289306640625 + }, + { + "auxiliary_loss_clip": 0.01307415, + "auxiliary_loss_mlp": 0.01060487, + "balance_loss_clip": 1.08205366, + "balance_loss_mlp": 1.04442942, + "epoch": 0.0425659832862382, + "flos": 20923244951040.0, + "grad_norm": 2.294501721306949, + "language_loss": 0.90254581, + "learning_rate": 3.998359445712895e-06, + "loss": 0.92622483, + "num_input_tokens_seen": 7481650, + "step": 354, + "time_per_iteration": 2.5284454822540283 + }, + { + "auxiliary_loss_clip": 0.01305737, + "auxiliary_loss_mlp": 0.01051958, + "balance_loss_clip": 1.07894349, + "balance_loss_mlp": 1.03611457, + "epoch": 0.04268622617687729, + "flos": 23331127115520.0, + "grad_norm": 2.215625902453295, + "language_loss": 0.81241643, + "learning_rate": 3.9983277493318955e-06, + "loss": 0.83599341, + "num_input_tokens_seen": 7500945, + "step": 355, + "time_per_iteration": 2.4882991313934326 + }, + { + "auxiliary_loss_clip": 0.01307679, + "auxiliary_loss_mlp": 0.01051758, + "balance_loss_clip": 1.07841706, + "balance_loss_mlp": 1.03554535, + "epoch": 0.04280646906751638, + "flos": 25994010908160.0, + "grad_norm": 1.863076985476901, + "language_loss": 0.81133151, + "learning_rate": 3.998295749806165e-06, + "loss": 0.83492583, + "num_input_tokens_seen": 7522170, + "step": 356, + "time_per_iteration": 2.550994634628296 + }, + { + "auxiliary_loss_clip": 0.01309361, + "auxiliary_loss_mlp": 0.01063046, + "balance_loss_clip": 1.08438325, + "balance_loss_mlp": 1.0459516, + "epoch": 0.04292671195815547, + "flos": 26906824258560.0, + "grad_norm": 4.679856537214271, + "language_loss": 0.83299279, + "learning_rate": 3.998263447140558e-06, + "loss": 0.85671687, + "num_input_tokens_seen": 7542370, + "step": 357, + "time_per_iteration": 2.5409672260284424 + }, + { + "auxiliary_loss_clip": 0.01304578, + "auxiliary_loss_mlp": 0.01049067, + "balance_loss_clip": 1.07845116, + "balance_loss_mlp": 1.03292572, + "epoch": 0.04304695484879457, + "flos": 39457315745280.0, + "grad_norm": 1.8423860540541745, + "language_loss": 0.81813687, + "learning_rate": 3.998230841339976e-06, + "loss": 0.84167331, + "num_input_tokens_seen": 7564380, + "step": 358, + "time_per_iteration": 2.6984286308288574 + }, + { + "auxiliary_loss_clip": 0.01304734, + "auxiliary_loss_mlp": 0.01052306, + "balance_loss_clip": 1.08210874, + "balance_loss_mlp": 1.03616524, + "epoch": 0.04316719773943366, + "flos": 19646297475840.0, + "grad_norm": 2.366074288231458, + "language_loss": 0.85104167, + "learning_rate": 3.998197932409363e-06, + "loss": 0.87461203, + "num_input_tokens_seen": 7582390, + "step": 359, + "time_per_iteration": 2.4699223041534424 + }, + { + "auxiliary_loss_clip": 0.01299114, + "auxiliary_loss_mlp": 0.01057669, + "balance_loss_clip": 1.07778478, + "balance_loss_mlp": 1.04179025, + "epoch": 0.04328744063007275, + "flos": 22452320966400.0, + "grad_norm": 2.1657395882885906, + "language_loss": 0.86110902, + "learning_rate": 3.9981647203537125e-06, + "loss": 0.88467687, + "num_input_tokens_seen": 7599890, + "step": 360, + "time_per_iteration": 2.494597911834717 + }, + { + "auxiliary_loss_clip": 0.01303162, + "auxiliary_loss_mlp": 0.01058231, + "balance_loss_clip": 1.07801938, + "balance_loss_mlp": 1.04242396, + "epoch": 0.04340768352071184, + "flos": 21283033530240.0, + "grad_norm": 2.3881690539015836, + "language_loss": 0.96250886, + "learning_rate": 3.998131205178063e-06, + "loss": 0.98612273, + "num_input_tokens_seen": 7618360, + "step": 361, + "time_per_iteration": 2.485642910003662 + }, + { + "auxiliary_loss_clip": 0.01302328, + "auxiliary_loss_mlp": 0.01059563, + "balance_loss_clip": 1.07777047, + "balance_loss_mlp": 1.04354119, + "epoch": 0.04352792641135093, + "flos": 11583705951360.0, + "grad_norm": 3.0595114970972483, + "language_loss": 0.77262086, + "learning_rate": 3.998097386887498e-06, + "loss": 0.79623973, + "num_input_tokens_seen": 7635435, + "step": 362, + "time_per_iteration": 2.474665403366089 + }, + { + "auxiliary_loss_clip": 0.01301405, + "auxiliary_loss_mlp": 0.01064721, + "balance_loss_clip": 1.07893157, + "balance_loss_mlp": 1.04837751, + "epoch": 0.04364816930199002, + "flos": 23623547736960.0, + "grad_norm": 1.8654457424148851, + "language_loss": 0.84828568, + "learning_rate": 3.998063265487148e-06, + "loss": 0.87194693, + "num_input_tokens_seen": 7656485, + "step": 363, + "time_per_iteration": 2.5073888301849365 + }, + { + "auxiliary_loss_clip": 0.01302772, + "auxiliary_loss_mlp": 0.01054374, + "balance_loss_clip": 1.07938504, + "balance_loss_mlp": 1.03836393, + "epoch": 0.043768412192629114, + "flos": 14429734214400.0, + "grad_norm": 2.331290033199297, + "language_loss": 0.81133044, + "learning_rate": 3.99802884098219e-06, + "loss": 0.83490187, + "num_input_tokens_seen": 7674595, + "step": 364, + "time_per_iteration": 3.2440385818481445 + }, + { + "auxiliary_loss_clip": 0.01302743, + "auxiliary_loss_mlp": 0.01046705, + "balance_loss_clip": 1.0778718, + "balance_loss_mlp": 1.02994466, + "epoch": 0.043888655083268203, + "flos": 26468893641600.0, + "grad_norm": 2.116146975681874, + "language_loss": 0.82780963, + "learning_rate": 3.997994113377845e-06, + "loss": 0.85130411, + "num_input_tokens_seen": 7693495, + "step": 365, + "time_per_iteration": 2.513350248336792 + }, + { + "auxiliary_loss_clip": 0.01302062, + "auxiliary_loss_mlp": 0.0104986, + "balance_loss_clip": 1.07772911, + "balance_loss_mlp": 1.03306317, + "epoch": 0.04400889797390729, + "flos": 27235263242880.0, + "grad_norm": 9.162401098293659, + "language_loss": 0.83283782, + "learning_rate": 3.9979590826793815e-06, + "loss": 0.85635698, + "num_input_tokens_seen": 7714685, + "step": 366, + "time_per_iteration": 2.5301401615142822 + }, + { + "auxiliary_loss_clip": 0.01306974, + "auxiliary_loss_mlp": 0.01052149, + "balance_loss_clip": 1.081864, + "balance_loss_mlp": 1.03542352, + "epoch": 0.04412914086454638, + "flos": 20119528183680.0, + "grad_norm": 2.147561542261021, + "language_loss": 0.80649555, + "learning_rate": 3.997923748892113e-06, + "loss": 0.83008683, + "num_input_tokens_seen": 7734005, + "step": 367, + "time_per_iteration": 4.0559914112091064 + }, + { + "auxiliary_loss_clip": 0.01301458, + "auxiliary_loss_mlp": 0.01050189, + "balance_loss_clip": 1.08045244, + "balance_loss_mlp": 1.03464389, + "epoch": 0.04424938375518547, + "flos": 22604618632320.0, + "grad_norm": 1.6837432941258395, + "language_loss": 0.88461745, + "learning_rate": 3.9978881120214015e-06, + "loss": 0.90813386, + "num_input_tokens_seen": 7755525, + "step": 368, + "time_per_iteration": 2.5241057872772217 + }, + { + "auxiliary_loss_clip": 0.01301943, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_clip": 1.07803154, + "balance_loss_mlp": 1.03436017, + "epoch": 0.04436962664582456, + "flos": 24132365844480.0, + "grad_norm": 1.9224432864758418, + "language_loss": 0.79304159, + "learning_rate": 3.997852172072652e-06, + "loss": 0.81657255, + "num_input_tokens_seen": 7776740, + "step": 369, + "time_per_iteration": 3.371454954147339 + }, + { + "auxiliary_loss_clip": 0.013024, + "auxiliary_loss_mlp": 0.01060026, + "balance_loss_clip": 1.07819998, + "balance_loss_mlp": 1.04352772, + "epoch": 0.04448986953646366, + "flos": 18222906251520.0, + "grad_norm": 4.138445382292018, + "language_loss": 0.89236021, + "learning_rate": 3.9978159290513155e-06, + "loss": 0.91598445, + "num_input_tokens_seen": 7794820, + "step": 370, + "time_per_iteration": 2.5034801959991455 + }, + { + "auxiliary_loss_clip": 0.01303442, + "auxiliary_loss_mlp": 0.0107006, + "balance_loss_clip": 1.07908964, + "balance_loss_mlp": 1.05321562, + "epoch": 0.04461011242710275, + "flos": 30117920400000.0, + "grad_norm": 1.6766991911810982, + "language_loss": 0.80102444, + "learning_rate": 3.997779382962892e-06, + "loss": 0.82475948, + "num_input_tokens_seen": 7817705, + "step": 371, + "time_per_iteration": 2.545022964477539 + }, + { + "auxiliary_loss_clip": 0.01297162, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_clip": 1.07647133, + "balance_loss_mlp": 1.03755069, + "epoch": 0.04473035531774184, + "flos": 29752529299200.0, + "grad_norm": 2.0852117709178395, + "language_loss": 0.73806906, + "learning_rate": 3.997742533812924e-06, + "loss": 0.76158452, + "num_input_tokens_seen": 7840970, + "step": 372, + "time_per_iteration": 2.537662982940674 + }, + { + "auxiliary_loss_clip": 0.01302797, + "auxiliary_loss_mlp": 0.0106599, + "balance_loss_clip": 1.08055937, + "balance_loss_mlp": 1.04956281, + "epoch": 0.04485059820838093, + "flos": 13151565676800.0, + "grad_norm": 2.587605812393181, + "language_loss": 0.92896003, + "learning_rate": 3.997705381607001e-06, + "loss": 0.95264798, + "num_input_tokens_seen": 7857785, + "step": 373, + "time_per_iteration": 2.434431552886963 + }, + { + "auxiliary_loss_clip": 0.01219387, + "auxiliary_loss_mlp": 0.01018079, + "balance_loss_clip": 1.06355357, + "balance_loss_mlp": 1.00854242, + "epoch": 0.04497084109902002, + "flos": 68094209548800.0, + "grad_norm": 3.441598278684891, + "language_loss": 0.60269362, + "learning_rate": 3.997667926350761e-06, + "loss": 0.62506825, + "num_input_tokens_seen": 7916115, + "step": 374, + "time_per_iteration": 2.9761295318603516 + }, + { + "auxiliary_loss_clip": 0.01220843, + "auxiliary_loss_mlp": 0.01019761, + "balance_loss_clip": 1.06425989, + "balance_loss_mlp": 1.0096041, + "epoch": 0.04509108398965911, + "flos": 64342263346560.0, + "grad_norm": 0.9009473513683679, + "language_loss": 0.57787335, + "learning_rate": 3.997630168049886e-06, + "loss": 0.60027933, + "num_input_tokens_seen": 7974480, + "step": 375, + "time_per_iteration": 3.053908109664917 + }, + { + "auxiliary_loss_clip": 0.01304488, + "auxiliary_loss_mlp": 0.01062974, + "balance_loss_clip": 1.07926226, + "balance_loss_mlp": 1.04614162, + "epoch": 0.045211326880298205, + "flos": 22271115830400.0, + "grad_norm": 2.0417330353075824, + "language_loss": 0.77269495, + "learning_rate": 3.997592106710101e-06, + "loss": 0.79636955, + "num_input_tokens_seen": 7993940, + "step": 376, + "time_per_iteration": 2.495969295501709 + }, + { + "auxiliary_loss_clip": 0.01295972, + "auxiliary_loss_mlp": 0.01049262, + "balance_loss_clip": 1.07530046, + "balance_loss_mlp": 1.03306115, + "epoch": 0.045331569770937295, + "flos": 32159441796480.0, + "grad_norm": 2.8234760522792337, + "language_loss": 0.65477991, + "learning_rate": 3.997553742337182e-06, + "loss": 0.67823225, + "num_input_tokens_seen": 8013365, + "step": 377, + "time_per_iteration": 2.5487477779388428 + }, + { + "auxiliary_loss_clip": 0.0129829, + "auxiliary_loss_mlp": 0.01053707, + "balance_loss_clip": 1.07643366, + "balance_loss_mlp": 1.03705335, + "epoch": 0.045451812661576385, + "flos": 22163455791360.0, + "grad_norm": 1.9602896687942037, + "language_loss": 0.91442895, + "learning_rate": 3.997515074936949e-06, + "loss": 0.93794894, + "num_input_tokens_seen": 8034240, + "step": 378, + "time_per_iteration": 2.483916759490967 + }, + { + "auxiliary_loss_clip": 0.01299621, + "auxiliary_loss_mlp": 0.01053588, + "balance_loss_clip": 1.07701004, + "balance_loss_mlp": 1.03759062, + "epoch": 0.045572055552215475, + "flos": 16581968305920.0, + "grad_norm": 12.199424999439428, + "language_loss": 0.86682439, + "learning_rate": 3.997476104515268e-06, + "loss": 0.89035648, + "num_input_tokens_seen": 8052430, + "step": 379, + "time_per_iteration": 2.456869125366211 + }, + { + "auxiliary_loss_clip": 0.01297881, + "auxiliary_loss_mlp": 0.01055273, + "balance_loss_clip": 1.07868552, + "balance_loss_mlp": 1.0394783, + "epoch": 0.045692298442854565, + "flos": 17603375448960.0, + "grad_norm": 1.960611743212873, + "language_loss": 0.77373731, + "learning_rate": 3.9974368310780485e-06, + "loss": 0.79726887, + "num_input_tokens_seen": 8069605, + "step": 380, + "time_per_iteration": 2.4475960731506348 + }, + { + "auxiliary_loss_clip": 0.01307146, + "auxiliary_loss_mlp": 0.01059515, + "balance_loss_clip": 1.08052945, + "balance_loss_mlp": 1.04135954, + "epoch": 0.045812541333493655, + "flos": 26761098781440.0, + "grad_norm": 2.223727236335624, + "language_loss": 0.74581373, + "learning_rate": 3.997397254631251e-06, + "loss": 0.76948029, + "num_input_tokens_seen": 8090225, + "step": 381, + "time_per_iteration": 2.509265422821045 + }, + { + "auxiliary_loss_clip": 0.01224099, + "auxiliary_loss_mlp": 0.01020289, + "balance_loss_clip": 1.06746006, + "balance_loss_mlp": 1.00913131, + "epoch": 0.04593278422413275, + "flos": 60250349894400.0, + "grad_norm": 0.8016322621070437, + "language_loss": 0.60039806, + "learning_rate": 3.997357375180878e-06, + "loss": 0.62284195, + "num_input_tokens_seen": 8154505, + "step": 382, + "time_per_iteration": 3.144186019897461 + }, + { + "auxiliary_loss_clip": 0.0129991, + "auxiliary_loss_mlp": 0.01047744, + "balance_loss_clip": 1.07691479, + "balance_loss_mlp": 1.03031588, + "epoch": 0.04605302711477184, + "flos": 21799249839360.0, + "grad_norm": 2.0787196201957427, + "language_loss": 0.75277728, + "learning_rate": 3.997317192732979e-06, + "loss": 0.77625382, + "num_input_tokens_seen": 8173285, + "step": 383, + "time_per_iteration": 2.4802451133728027 + }, + { + "auxiliary_loss_clip": 0.01299805, + "auxiliary_loss_mlp": 0.0106165, + "balance_loss_clip": 1.07713771, + "balance_loss_mlp": 1.04447186, + "epoch": 0.04617327000541093, + "flos": 19459705299840.0, + "grad_norm": 2.4980559487851535, + "language_loss": 0.8258487, + "learning_rate": 3.99727670729365e-06, + "loss": 0.84946322, + "num_input_tokens_seen": 8191845, + "step": 384, + "time_per_iteration": 2.4822232723236084 + }, + { + "auxiliary_loss_clip": 0.01298531, + "auxiliary_loss_mlp": 0.01057304, + "balance_loss_clip": 1.08002567, + "balance_loss_mlp": 1.04154491, + "epoch": 0.04629351289605002, + "flos": 25411468135680.0, + "grad_norm": 2.1646099551080518, + "language_loss": 0.77973628, + "learning_rate": 3.997235918869033e-06, + "loss": 0.80329466, + "num_input_tokens_seen": 8212880, + "step": 385, + "time_per_iteration": 2.5095272064208984 + }, + { + "auxiliary_loss_clip": 0.01298461, + "auxiliary_loss_mlp": 0.01048629, + "balance_loss_clip": 1.0793941, + "balance_loss_mlp": 1.03297639, + "epoch": 0.04641375578668911, + "flos": 20558284813440.0, + "grad_norm": 2.7548002195478625, + "language_loss": 0.82812452, + "learning_rate": 3.997194827465315e-06, + "loss": 0.8515954, + "num_input_tokens_seen": 8231475, + "step": 386, + "time_per_iteration": 2.476651906967163 + }, + { + "auxiliary_loss_clip": 0.01296566, + "auxiliary_loss_mlp": 0.01046726, + "balance_loss_clip": 1.07561266, + "balance_loss_mlp": 1.03095484, + "epoch": 0.0465339986773282, + "flos": 13188661447680.0, + "grad_norm": 2.6327701306469833, + "language_loss": 0.91049075, + "learning_rate": 3.997153433088728e-06, + "loss": 0.9339236, + "num_input_tokens_seen": 8248600, + "step": 387, + "time_per_iteration": 2.4458999633789062 + }, + { + "auxiliary_loss_clip": 0.01298195, + "auxiliary_loss_mlp": 0.01053405, + "balance_loss_clip": 1.07794797, + "balance_loss_mlp": 1.03579772, + "epoch": 0.0466542415679673, + "flos": 25556547168000.0, + "grad_norm": 2.0880852324273165, + "language_loss": 0.81379497, + "learning_rate": 3.997111735745554e-06, + "loss": 0.83731103, + "num_input_tokens_seen": 8271570, + "step": 388, + "time_per_iteration": 2.5137622356414795 + }, + { + "auxiliary_loss_clip": 0.01295055, + "auxiliary_loss_mlp": 0.01062325, + "balance_loss_clip": 1.07646298, + "balance_loss_mlp": 1.04451489, + "epoch": 0.04677448445860639, + "flos": 22236749493120.0, + "grad_norm": 2.007341450632061, + "language_loss": 0.82949543, + "learning_rate": 3.997069735442118e-06, + "loss": 0.85306925, + "num_input_tokens_seen": 8291265, + "step": 389, + "time_per_iteration": 2.47092604637146 + }, + { + "auxiliary_loss_clip": 0.01293797, + "auxiliary_loss_mlp": 0.01056254, + "balance_loss_clip": 1.0750699, + "balance_loss_mlp": 1.04007673, + "epoch": 0.04689472734924548, + "flos": 28147825198080.0, + "grad_norm": 1.4585153067582626, + "language_loss": 0.80143714, + "learning_rate": 3.997027432184792e-06, + "loss": 0.82493764, + "num_input_tokens_seen": 8315925, + "step": 390, + "time_per_iteration": 2.577768087387085 + }, + { + "auxiliary_loss_clip": 0.01296669, + "auxiliary_loss_mlp": 0.01051847, + "balance_loss_clip": 1.07731628, + "balance_loss_mlp": 1.03593278, + "epoch": 0.04701497023988457, + "flos": 23148952312320.0, + "grad_norm": 1.9529788829977832, + "language_loss": 0.89454085, + "learning_rate": 3.99698482597999e-06, + "loss": 0.91802597, + "num_input_tokens_seen": 8333605, + "step": 391, + "time_per_iteration": 3.2454674243927 + }, + { + "auxiliary_loss_clip": 0.01217137, + "auxiliary_loss_mlp": 0.0101766, + "balance_loss_clip": 1.06130981, + "balance_loss_mlp": 1.00635934, + "epoch": 0.04713521313052366, + "flos": 64827668764800.0, + "grad_norm": 1.1598924085451616, + "language_loss": 0.63914841, + "learning_rate": 3.99694191683418e-06, + "loss": 0.6614964, + "num_input_tokens_seen": 8394405, + "step": 392, + "time_per_iteration": 3.0704169273376465 + }, + { + "auxiliary_loss_clip": 0.01299073, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.08020818, + "balance_loss_mlp": 1.03309822, + "epoch": 0.047255456021162746, + "flos": 18771585477120.0, + "grad_norm": 1.9588600783117682, + "language_loss": 0.81657195, + "learning_rate": 3.996898704753867e-06, + "loss": 0.8400619, + "num_input_tokens_seen": 8412355, + "step": 393, + "time_per_iteration": 2.467010974884033 + }, + { + "auxiliary_loss_clip": 0.01292942, + "auxiliary_loss_mlp": 0.01049654, + "balance_loss_clip": 1.07421124, + "balance_loss_mlp": 1.03399634, + "epoch": 0.04737569891180184, + "flos": 22053820504320.0, + "grad_norm": 2.517299241216122, + "language_loss": 0.8783623, + "learning_rate": 3.996855189745609e-06, + "loss": 0.90178823, + "num_input_tokens_seen": 8431620, + "step": 394, + "time_per_iteration": 3.2509262561798096 + }, + { + "auxiliary_loss_clip": 0.0129367, + "auxiliary_loss_mlp": 0.01055033, + "balance_loss_clip": 1.07438624, + "balance_loss_mlp": 1.03870153, + "epoch": 0.04749594180244093, + "flos": 29057370410880.0, + "grad_norm": 1.8735117425544574, + "language_loss": 0.92822695, + "learning_rate": 3.996811371816007e-06, + "loss": 0.95171404, + "num_input_tokens_seen": 8454045, + "step": 395, + "time_per_iteration": 3.2701306343078613 + }, + { + "auxiliary_loss_clip": 0.01295382, + "auxiliary_loss_mlp": 0.01056229, + "balance_loss_clip": 1.07726705, + "balance_loss_mlp": 1.04106581, + "epoch": 0.04761618469308002, + "flos": 35112268172160.0, + "grad_norm": 2.140278422679574, + "language_loss": 0.77956021, + "learning_rate": 3.996767250971707e-06, + "loss": 0.80307627, + "num_input_tokens_seen": 8476785, + "step": 396, + "time_per_iteration": 2.608142375946045 + }, + { + "auxiliary_loss_clip": 0.01298636, + "auxiliary_loss_mlp": 0.01052694, + "balance_loss_clip": 1.07938743, + "balance_loss_mlp": 1.03632629, + "epoch": 0.04773642758371911, + "flos": 25630702796160.0, + "grad_norm": 4.445186830786252, + "language_loss": 0.86783576, + "learning_rate": 3.996722827219403e-06, + "loss": 0.89134908, + "num_input_tokens_seen": 8498400, + "step": 397, + "time_per_iteration": 3.331773042678833 + }, + { + "auxiliary_loss_clip": 0.01300353, + "auxiliary_loss_mlp": 0.01059419, + "balance_loss_clip": 1.08039951, + "balance_loss_mlp": 1.04304004, + "epoch": 0.0478566704743582, + "flos": 20631506688000.0, + "grad_norm": 2.1074433678735276, + "language_loss": 0.82596868, + "learning_rate": 3.996678100565833e-06, + "loss": 0.84956646, + "num_input_tokens_seen": 8517455, + "step": 398, + "time_per_iteration": 2.4582126140594482 + }, + { + "auxiliary_loss_clip": 0.01290483, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_clip": 1.07379007, + "balance_loss_mlp": 1.03754127, + "epoch": 0.04797691336499729, + "flos": 18835721210880.0, + "grad_norm": 2.3562645102206936, + "language_loss": 0.89080942, + "learning_rate": 3.996633071017783e-06, + "loss": 0.91426504, + "num_input_tokens_seen": 8534085, + "step": 399, + "time_per_iteration": 2.453824520111084 + }, + { + "auxiliary_loss_clip": 0.01293052, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.07641125, + "balance_loss_mlp": 1.03542924, + "epoch": 0.04809715625563638, + "flos": 21099673578240.0, + "grad_norm": 2.6939033869883615, + "language_loss": 0.82208145, + "learning_rate": 3.996587738582084e-06, + "loss": 0.84553051, + "num_input_tokens_seen": 8550885, + "step": 400, + "time_per_iteration": 2.4510090351104736 + }, + { + "auxiliary_loss_clip": 0.01290264, + "auxiliary_loss_mlp": 0.01045078, + "balance_loss_clip": 1.07269871, + "balance_loss_mlp": 1.02961659, + "epoch": 0.04821739914627548, + "flos": 23805650712960.0, + "grad_norm": 4.761028630580809, + "language_loss": 0.86329812, + "learning_rate": 3.9965421032656115e-06, + "loss": 0.88665152, + "num_input_tokens_seen": 8570815, + "step": 401, + "time_per_iteration": 2.4903905391693115 + }, + { + "auxiliary_loss_clip": 0.01292934, + "auxiliary_loss_mlp": 0.01047725, + "balance_loss_clip": 1.07453787, + "balance_loss_mlp": 1.03059506, + "epoch": 0.04833764203691457, + "flos": 22200587475840.0, + "grad_norm": 2.361206433778533, + "language_loss": 0.94121164, + "learning_rate": 3.99649616507529e-06, + "loss": 0.96461821, + "num_input_tokens_seen": 8589910, + "step": 402, + "time_per_iteration": 2.474510431289673 + }, + { + "auxiliary_loss_clip": 0.01201787, + "auxiliary_loss_mlp": 0.01007787, + "balance_loss_clip": 1.0528729, + "balance_loss_mlp": 0.99867934, + "epoch": 0.04845788492755366, + "flos": 65904376896000.0, + "grad_norm": 0.9195496378783726, + "language_loss": 0.63210803, + "learning_rate": 3.996449924018088e-06, + "loss": 0.65420377, + "num_input_tokens_seen": 8650370, + "step": 403, + "time_per_iteration": 3.050830841064453 + }, + { + "auxiliary_loss_clip": 0.01288724, + "auxiliary_loss_mlp": 0.01052226, + "balance_loss_clip": 1.07403874, + "balance_loss_mlp": 1.03745639, + "epoch": 0.04857812781819275, + "flos": 19281301424640.0, + "grad_norm": 2.0374144768533355, + "language_loss": 0.79340869, + "learning_rate": 3.99640338010102e-06, + "loss": 0.81681824, + "num_input_tokens_seen": 8669475, + "step": 404, + "time_per_iteration": 2.492931365966797 + }, + { + "auxiliary_loss_clip": 0.01289764, + "auxiliary_loss_mlp": 0.0104482, + "balance_loss_clip": 1.07293582, + "balance_loss_mlp": 1.02832103, + "epoch": 0.04869837070883184, + "flos": 24062376193920.0, + "grad_norm": 1.9254492404895827, + "language_loss": 0.7863344, + "learning_rate": 3.996356533331146e-06, + "loss": 0.80968028, + "num_input_tokens_seen": 8691345, + "step": 405, + "time_per_iteration": 2.5003504753112793 + }, + { + "auxiliary_loss_clip": 0.0130072, + "auxiliary_loss_mlp": 0.01044463, + "balance_loss_clip": 1.07622051, + "balance_loss_mlp": 1.0282867, + "epoch": 0.04881861359947093, + "flos": 25187169657600.0, + "grad_norm": 2.6002248373943284, + "language_loss": 0.62299538, + "learning_rate": 3.996309383715573e-06, + "loss": 0.64644718, + "num_input_tokens_seen": 8710125, + "step": 406, + "time_per_iteration": 2.5089380741119385 + }, + { + "auxiliary_loss_clip": 0.01298114, + "auxiliary_loss_mlp": 0.01043291, + "balance_loss_clip": 1.07806325, + "balance_loss_mlp": 1.02782321, + "epoch": 0.048938856490110025, + "flos": 16362913213440.0, + "grad_norm": 1.9141522714614692, + "language_loss": 0.73608553, + "learning_rate": 3.996261931261454e-06, + "loss": 0.75949961, + "num_input_tokens_seen": 8728705, + "step": 407, + "time_per_iteration": 2.452371835708618 + }, + { + "auxiliary_loss_clip": 0.01294997, + "auxiliary_loss_mlp": 0.01051698, + "balance_loss_clip": 1.07773805, + "balance_loss_mlp": 1.03567636, + "epoch": 0.049059099380749115, + "flos": 29895094379520.0, + "grad_norm": 1.705413977014146, + "language_loss": 0.86411536, + "learning_rate": 3.996214175975987e-06, + "loss": 0.8875823, + "num_input_tokens_seen": 8749225, + "step": 408, + "time_per_iteration": 2.5517704486846924 + }, + { + "auxiliary_loss_clip": 0.01298197, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_clip": 1.07865071, + "balance_loss_mlp": 1.03832567, + "epoch": 0.049179342271388204, + "flos": 35918858027520.0, + "grad_norm": 2.165444811994042, + "language_loss": 0.79383457, + "learning_rate": 3.996166117866417e-06, + "loss": 0.8173629, + "num_input_tokens_seen": 8771160, + "step": 409, + "time_per_iteration": 2.595257520675659 + }, + { + "auxiliary_loss_clip": 0.01290562, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.0742743, + "balance_loss_mlp": 1.0315305, + "epoch": 0.049299585162027294, + "flos": 14611226659200.0, + "grad_norm": 2.4224741871441626, + "language_loss": 0.86661237, + "learning_rate": 3.996117756940035e-06, + "loss": 0.88999039, + "num_input_tokens_seen": 8787845, + "step": 410, + "time_per_iteration": 2.467205286026001 + }, + { + "auxiliary_loss_clip": 0.01294211, + "auxiliary_loss_mlp": 0.01049326, + "balance_loss_clip": 1.07677102, + "balance_loss_mlp": 1.03369737, + "epoch": 0.049419828052666384, + "flos": 19567939956480.0, + "grad_norm": 2.5282208895163145, + "language_loss": 0.97796297, + "learning_rate": 3.996069093204175e-06, + "loss": 1.00139832, + "num_input_tokens_seen": 8803805, + "step": 411, + "time_per_iteration": 2.4948527812957764 + }, + { + "auxiliary_loss_clip": 0.01301345, + "auxiliary_loss_mlp": 0.01054746, + "balance_loss_clip": 1.08108604, + "balance_loss_mlp": 1.03786576, + "epoch": 0.049540070943305474, + "flos": 13659916907520.0, + "grad_norm": 2.239440568552941, + "language_loss": 0.87955725, + "learning_rate": 3.996020126666221e-06, + "loss": 0.90311813, + "num_input_tokens_seen": 8820785, + "step": 412, + "time_per_iteration": 2.4476399421691895 + }, + { + "auxiliary_loss_clip": 0.01293539, + "auxiliary_loss_mlp": 0.01049032, + "balance_loss_clip": 1.07671022, + "balance_loss_mlp": 1.03384519, + "epoch": 0.04966031383394457, + "flos": 21832035978240.0, + "grad_norm": 2.14727685873784, + "language_loss": 0.81994998, + "learning_rate": 3.995970857333601e-06, + "loss": 0.84337568, + "num_input_tokens_seen": 8841195, + "step": 413, + "time_per_iteration": 2.500669479370117 + }, + { + "auxiliary_loss_clip": 0.01295554, + "auxiliary_loss_mlp": 0.01053311, + "balance_loss_clip": 1.07601452, + "balance_loss_mlp": 1.03730118, + "epoch": 0.04978055672458366, + "flos": 28618793349120.0, + "grad_norm": 2.0169638587387184, + "language_loss": 0.79885966, + "learning_rate": 3.995921285213789e-06, + "loss": 0.8223483, + "num_input_tokens_seen": 8861455, + "step": 414, + "time_per_iteration": 2.517228603363037 + }, + { + "auxiliary_loss_clip": 0.01291808, + "auxiliary_loss_mlp": 0.01049962, + "balance_loss_clip": 1.07606435, + "balance_loss_mlp": 1.03519189, + "epoch": 0.04990079961522275, + "flos": 19828220883840.0, + "grad_norm": 2.411123116891815, + "language_loss": 0.8077035, + "learning_rate": 3.995871410314305e-06, + "loss": 0.83112121, + "num_input_tokens_seen": 8880015, + "step": 415, + "time_per_iteration": 2.445866346359253 + }, + { + "auxiliary_loss_clip": 0.01180096, + "auxiliary_loss_mlp": 0.0101149, + "balance_loss_clip": 1.05045271, + "balance_loss_mlp": 1.00457621, + "epoch": 0.05002104250586184, + "flos": 62735045293440.0, + "grad_norm": 0.8979447717597899, + "language_loss": 0.59645355, + "learning_rate": 3.995821232642714e-06, + "loss": 0.61836946, + "num_input_tokens_seen": 8938420, + "step": 416, + "time_per_iteration": 3.135676145553589 + }, + { + "auxiliary_loss_clip": 0.01276153, + "auxiliary_loss_mlp": 0.01052207, + "balance_loss_clip": 1.07766461, + "balance_loss_mlp": 1.03716302, + "epoch": 0.05014128539650093, + "flos": 27928518710400.0, + "grad_norm": 2.3421928816944773, + "language_loss": 0.82214165, + "learning_rate": 3.995770752206629e-06, + "loss": 0.84542531, + "num_input_tokens_seen": 8959495, + "step": 417, + "time_per_iteration": 3.350045919418335 + }, + { + "auxiliary_loss_clip": 0.01293407, + "auxiliary_loss_mlp": 0.01047149, + "balance_loss_clip": 1.07650661, + "balance_loss_mlp": 1.03051925, + "epoch": 0.05026152828714002, + "flos": 17705576620800.0, + "grad_norm": 2.440844099630588, + "language_loss": 0.97264028, + "learning_rate": 3.995719969013709e-06, + "loss": 0.99604583, + "num_input_tokens_seen": 8976675, + "step": 418, + "time_per_iteration": 2.458134889602661 + }, + { + "auxiliary_loss_clip": 0.01258322, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_clip": 1.07421327, + "balance_loss_mlp": 1.03886485, + "epoch": 0.05038177117777912, + "flos": 19133277477120.0, + "grad_norm": 2.7066528603281603, + "language_loss": 0.86007303, + "learning_rate": 3.995668883071655e-06, + "loss": 0.88320696, + "num_input_tokens_seen": 8992900, + "step": 419, + "time_per_iteration": 2.5086967945098877 + }, + { + "auxiliary_loss_clip": 0.01293718, + "auxiliary_loss_mlp": 0.01052127, + "balance_loss_clip": 1.07694817, + "balance_loss_mlp": 1.03618336, + "epoch": 0.050502014068418206, + "flos": 20667704618880.0, + "grad_norm": 2.2605902396875868, + "language_loss": 0.9078812, + "learning_rate": 3.995617494388219e-06, + "loss": 0.93133968, + "num_input_tokens_seen": 9011020, + "step": 420, + "time_per_iteration": 2.466245174407959 + }, + { + "auxiliary_loss_clip": 0.01255076, + "auxiliary_loss_mlp": 0.01043976, + "balance_loss_clip": 1.06931031, + "balance_loss_mlp": 1.02808511, + "epoch": 0.050622256959057296, + "flos": 21361103740800.0, + "grad_norm": 3.0461816785822724, + "language_loss": 0.80424124, + "learning_rate": 3.995565802971196e-06, + "loss": 0.82723176, + "num_input_tokens_seen": 9030995, + "step": 421, + "time_per_iteration": 3.327618360519409 + }, + { + "auxiliary_loss_clip": 0.0125362, + "auxiliary_loss_mlp": 0.01054225, + "balance_loss_clip": 1.07019687, + "balance_loss_mlp": 1.0393002, + "epoch": 0.050742499849696386, + "flos": 27673588909440.0, + "grad_norm": 1.9561043957561537, + "language_loss": 0.67689419, + "learning_rate": 3.995513808828427e-06, + "loss": 0.69997263, + "num_input_tokens_seen": 9053790, + "step": 422, + "time_per_iteration": 3.322999954223633 + }, + { + "auxiliary_loss_clip": 0.01256689, + "auxiliary_loss_mlp": 0.01047502, + "balance_loss_clip": 1.07185602, + "balance_loss_mlp": 1.03171253, + "epoch": 0.050862742740335476, + "flos": 19865999013120.0, + "grad_norm": 1.7922874806106202, + "language_loss": 0.76481724, + "learning_rate": 3.9954615119678e-06, + "loss": 0.7878592, + "num_input_tokens_seen": 9072345, + "step": 423, + "time_per_iteration": 2.564042806625366 + }, + { + "auxiliary_loss_clip": 0.01263806, + "auxiliary_loss_mlp": 0.01053276, + "balance_loss_clip": 1.07078195, + "balance_loss_mlp": 1.03723097, + "epoch": 0.050982985630974566, + "flos": 22085098272000.0, + "grad_norm": 2.151284317167721, + "language_loss": 0.80412853, + "learning_rate": 3.995408912397248e-06, + "loss": 0.82729942, + "num_input_tokens_seen": 9090240, + "step": 424, + "time_per_iteration": 3.323621988296509 + }, + { + "auxiliary_loss_clip": 0.01259924, + "auxiliary_loss_mlp": 0.01053478, + "balance_loss_clip": 1.07290792, + "balance_loss_mlp": 1.03703928, + "epoch": 0.05110322852161366, + "flos": 20740962407040.0, + "grad_norm": 2.697628150253574, + "language_loss": 0.93573213, + "learning_rate": 3.99535601012475e-06, + "loss": 0.95886618, + "num_input_tokens_seen": 9105570, + "step": 425, + "time_per_iteration": 2.5333735942840576 + }, + { + "auxiliary_loss_clip": 0.01238297, + "auxiliary_loss_mlp": 0.00766929, + "balance_loss_clip": 1.07062578, + "balance_loss_mlp": 1.00049376, + "epoch": 0.05122347141225275, + "flos": 28547295327360.0, + "grad_norm": 1.6138183968489654, + "language_loss": 0.75458729, + "learning_rate": 3.995302805158333e-06, + "loss": 0.77463949, + "num_input_tokens_seen": 9128225, + "step": 426, + "time_per_iteration": 2.6382720470428467 + }, + { + "auxiliary_loss_clip": 0.01250492, + "auxiliary_loss_mlp": 0.01055006, + "balance_loss_clip": 1.07045579, + "balance_loss_mlp": 1.03700519, + "epoch": 0.05134371430289184, + "flos": 19722679747200.0, + "grad_norm": 2.315742303002229, + "language_loss": 0.83617198, + "learning_rate": 3.9952492975060665e-06, + "loss": 0.85922694, + "num_input_tokens_seen": 9148295, + "step": 427, + "time_per_iteration": 2.5884625911712646 + }, + { + "auxiliary_loss_clip": 0.01272747, + "auxiliary_loss_mlp": 0.01043815, + "balance_loss_clip": 1.0740428, + "balance_loss_mlp": 1.02832925, + "epoch": 0.05146395719353093, + "flos": 34458945649920.0, + "grad_norm": 3.307715681427479, + "language_loss": 0.8486805, + "learning_rate": 3.995195487176067e-06, + "loss": 0.87184608, + "num_input_tokens_seen": 9168525, + "step": 428, + "time_per_iteration": 2.605844736099243 + }, + { + "auxiliary_loss_clip": 0.01289853, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.07503104, + "balance_loss_mlp": 1.03865123, + "epoch": 0.05158420008417002, + "flos": 21760286561280.0, + "grad_norm": 1.9716862278719876, + "language_loss": 0.85718322, + "learning_rate": 3.995141374176499e-06, + "loss": 0.88062775, + "num_input_tokens_seen": 9186920, + "step": 429, + "time_per_iteration": 2.483052968978882 + }, + { + "auxiliary_loss_clip": 0.01146823, + "auxiliary_loss_mlp": 0.00756689, + "balance_loss_clip": 1.04592776, + "balance_loss_mlp": 1.00012302, + "epoch": 0.05170444297480911, + "flos": 72553956226560.0, + "grad_norm": 0.9511295710392449, + "language_loss": 0.63201451, + "learning_rate": 3.995086958515572e-06, + "loss": 0.65104961, + "num_input_tokens_seen": 9244940, + "step": 430, + "time_per_iteration": 3.1620473861694336 + }, + { + "auxiliary_loss_clip": 0.01185777, + "auxiliary_loss_mlp": 0.00756878, + "balance_loss_clip": 1.04707694, + "balance_loss_mlp": 1.00012004, + "epoch": 0.05182468586544821, + "flos": 62416159326720.0, + "grad_norm": 0.8627251508277496, + "language_loss": 0.60047722, + "learning_rate": 3.995032240201538e-06, + "loss": 0.61990374, + "num_input_tokens_seen": 9307335, + "step": 431, + "time_per_iteration": 3.0191521644592285 + }, + { + "auxiliary_loss_clip": 0.01161219, + "auxiliary_loss_mlp": 0.01005835, + "balance_loss_clip": 1.04257286, + "balance_loss_mlp": 0.99973184, + "epoch": 0.0519449287560873, + "flos": 41225989432320.0, + "grad_norm": 0.9277479363233446, + "language_loss": 0.6312809, + "learning_rate": 3.9949772192427e-06, + "loss": 0.65295148, + "num_input_tokens_seen": 9353960, + "step": 432, + "time_per_iteration": 2.7891745567321777 + }, + { + "auxiliary_loss_clip": 0.01254461, + "auxiliary_loss_mlp": 0.01049231, + "balance_loss_clip": 1.06939209, + "balance_loss_mlp": 1.03293467, + "epoch": 0.05206517164672639, + "flos": 17494530261120.0, + "grad_norm": 1.7958254693214988, + "language_loss": 0.79510421, + "learning_rate": 3.994921895647405e-06, + "loss": 0.8181411, + "num_input_tokens_seen": 9372130, + "step": 433, + "time_per_iteration": 2.525357484817505 + }, + { + "auxiliary_loss_clip": 0.01181732, + "auxiliary_loss_mlp": 0.01006062, + "balance_loss_clip": 1.04485488, + "balance_loss_mlp": 1.00034034, + "epoch": 0.05218541453736548, + "flos": 64002762973440.0, + "grad_norm": 0.8754900396118457, + "language_loss": 0.5529961, + "learning_rate": 3.994866269424043e-06, + "loss": 0.57487404, + "num_input_tokens_seen": 9428500, + "step": 434, + "time_per_iteration": 2.9632058143615723 + }, + { + "auxiliary_loss_clip": 0.01198438, + "auxiliary_loss_mlp": 0.0105439, + "balance_loss_clip": 1.05671322, + "balance_loss_mlp": 1.03756917, + "epoch": 0.05230565742800457, + "flos": 19317319787520.0, + "grad_norm": 2.3277909821195144, + "language_loss": 0.78444457, + "learning_rate": 3.9948103405810545e-06, + "loss": 0.80697286, + "num_input_tokens_seen": 9447450, + "step": 435, + "time_per_iteration": 2.6249141693115234 + }, + { + "auxiliary_loss_clip": 0.01225834, + "auxiliary_loss_mlp": 0.0105434, + "balance_loss_clip": 1.06556201, + "balance_loss_mlp": 1.03973711, + "epoch": 0.05242590031864366, + "flos": 25298636538240.0, + "grad_norm": 2.0054612593061916, + "language_loss": 0.8601926, + "learning_rate": 3.994754109126923e-06, + "loss": 0.88299441, + "num_input_tokens_seen": 9468945, + "step": 436, + "time_per_iteration": 2.624011993408203 + }, + { + "auxiliary_loss_clip": 0.01199648, + "auxiliary_loss_mlp": 0.01046457, + "balance_loss_clip": 1.0651269, + "balance_loss_mlp": 1.03118622, + "epoch": 0.052546143209282754, + "flos": 26211629456640.0, + "grad_norm": 1.8107892198894793, + "language_loss": 0.93542624, + "learning_rate": 3.994697575070181e-06, + "loss": 0.95788729, + "num_input_tokens_seen": 9488405, + "step": 437, + "time_per_iteration": 2.811331272125244 + }, + { + "auxiliary_loss_clip": 0.01255958, + "auxiliary_loss_mlp": 0.01053923, + "balance_loss_clip": 1.07468414, + "balance_loss_mlp": 1.03792489, + "epoch": 0.052666386099921844, + "flos": 22158140578560.0, + "grad_norm": 1.8510564854370932, + "language_loss": 0.91383922, + "learning_rate": 3.994640738419402e-06, + "loss": 0.93693805, + "num_input_tokens_seen": 9507780, + "step": 438, + "time_per_iteration": 2.7243692874908447 + }, + { + "auxiliary_loss_clip": 0.01270993, + "auxiliary_loss_mlp": 0.01040547, + "balance_loss_clip": 1.07420468, + "balance_loss_mlp": 1.02574682, + "epoch": 0.052786628990560934, + "flos": 23881817502720.0, + "grad_norm": 1.788623621688451, + "language_loss": 0.8073777, + "learning_rate": 3.9945835991832075e-06, + "loss": 0.83049309, + "num_input_tokens_seen": 9529665, + "step": 439, + "time_per_iteration": 2.556776285171509 + }, + { + "auxiliary_loss_clip": 0.01289652, + "auxiliary_loss_mlp": 0.01059617, + "balance_loss_clip": 1.07873511, + "balance_loss_mlp": 1.04451275, + "epoch": 0.052906871881200024, + "flos": 24605021934720.0, + "grad_norm": 2.475093035285017, + "language_loss": 0.92728639, + "learning_rate": 3.994526157370268e-06, + "loss": 0.95077908, + "num_input_tokens_seen": 9548280, + "step": 440, + "time_per_iteration": 2.508171796798706 + }, + { + "auxiliary_loss_clip": 0.01154801, + "auxiliary_loss_mlp": 0.01009476, + "balance_loss_clip": 1.03958642, + "balance_loss_mlp": 1.0039444, + "epoch": 0.053027114771839114, + "flos": 56461631143680.0, + "grad_norm": 0.9015588560320608, + "language_loss": 0.59313738, + "learning_rate": 3.994468412989296e-06, + "loss": 0.61478019, + "num_input_tokens_seen": 9609690, + "step": 441, + "time_per_iteration": 3.2145025730133057 + }, + { + "auxiliary_loss_clip": 0.01229983, + "auxiliary_loss_mlp": 0.01051542, + "balance_loss_clip": 1.06538832, + "balance_loss_mlp": 1.03622389, + "epoch": 0.053147357662478203, + "flos": 17311098481920.0, + "grad_norm": 2.602418754104756, + "language_loss": 0.92491996, + "learning_rate": 3.994410366049052e-06, + "loss": 0.94773519, + "num_input_tokens_seen": 9627550, + "step": 442, + "time_per_iteration": 2.5211386680603027 + }, + { + "auxiliary_loss_clip": 0.01269769, + "auxiliary_loss_mlp": 0.01046399, + "balance_loss_clip": 1.07228065, + "balance_loss_mlp": 1.03059196, + "epoch": 0.0532676005531173, + "flos": 17164977955200.0, + "grad_norm": 2.1568829706652983, + "language_loss": 0.8283146, + "learning_rate": 3.994352016558341e-06, + "loss": 0.85147631, + "num_input_tokens_seen": 9644855, + "step": 443, + "time_per_iteration": 2.4885382652282715 + }, + { + "auxiliary_loss_clip": 0.01271454, + "auxiliary_loss_mlp": 0.01052809, + "balance_loss_clip": 1.0745548, + "balance_loss_mlp": 1.03720415, + "epoch": 0.05338784344375639, + "flos": 27819960831360.0, + "grad_norm": 1.9716881605738237, + "language_loss": 0.73758852, + "learning_rate": 3.994293364526014e-06, + "loss": 0.76083112, + "num_input_tokens_seen": 9665740, + "step": 444, + "time_per_iteration": 3.4042580127716064 + }, + { + "auxiliary_loss_clip": 0.01245566, + "auxiliary_loss_mlp": 0.0104517, + "balance_loss_clip": 1.07136273, + "balance_loss_mlp": 1.02802742, + "epoch": 0.05350808633439548, + "flos": 21507691144320.0, + "grad_norm": 2.229630236657604, + "language_loss": 0.84837407, + "learning_rate": 3.99423440996097e-06, + "loss": 0.87128145, + "num_input_tokens_seen": 9685280, + "step": 445, + "time_per_iteration": 2.56551194190979 + }, + { + "auxiliary_loss_clip": 0.0125509, + "auxiliary_loss_mlp": 0.01053344, + "balance_loss_clip": 1.07628942, + "balance_loss_mlp": 1.0373342, + "epoch": 0.05362832922503457, + "flos": 20084299920000.0, + "grad_norm": 2.911397480582296, + "language_loss": 0.81358296, + "learning_rate": 3.994175152872152e-06, + "loss": 0.8366673, + "num_input_tokens_seen": 9704365, + "step": 446, + "time_per_iteration": 2.547851085662842 + }, + { + "auxiliary_loss_clip": 0.01272011, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.07207322, + "balance_loss_mlp": 1.0278933, + "epoch": 0.05374857211567366, + "flos": 26137222433280.0, + "grad_norm": 2.096875843134791, + "language_loss": 0.78736699, + "learning_rate": 3.994115593268548e-06, + "loss": 0.81051034, + "num_input_tokens_seen": 9724145, + "step": 447, + "time_per_iteration": 2.5672664642333984 + }, + { + "auxiliary_loss_clip": 0.0128885, + "auxiliary_loss_mlp": 0.01053371, + "balance_loss_clip": 1.07638144, + "balance_loss_mlp": 1.03839827, + "epoch": 0.05386881500631275, + "flos": 27486817165440.0, + "grad_norm": 2.159325797145741, + "language_loss": 0.82444215, + "learning_rate": 3.994055731159195e-06, + "loss": 0.84786439, + "num_input_tokens_seen": 9741615, + "step": 448, + "time_per_iteration": 3.2991528511047363 + }, + { + "auxiliary_loss_clip": 0.01274637, + "auxiliary_loss_mlp": 0.01061136, + "balance_loss_clip": 1.07774019, + "balance_loss_mlp": 1.04619932, + "epoch": 0.053989057896951846, + "flos": 23585087249280.0, + "grad_norm": 1.8966846018144488, + "language_loss": 0.86752254, + "learning_rate": 3.993995566553172e-06, + "loss": 0.89088029, + "num_input_tokens_seen": 9760580, + "step": 449, + "time_per_iteration": 3.3204095363616943 + }, + { + "auxiliary_loss_clip": 0.01232949, + "auxiliary_loss_mlp": 0.01051591, + "balance_loss_clip": 1.06293118, + "balance_loss_mlp": 1.03659439, + "epoch": 0.054109300787590936, + "flos": 25228862369280.0, + "grad_norm": 1.7383379404753228, + "language_loss": 0.77199244, + "learning_rate": 3.993935099459607e-06, + "loss": 0.79483783, + "num_input_tokens_seen": 9782195, + "step": 450, + "time_per_iteration": 2.5810537338256836 + }, + { + "auxiliary_loss_clip": 0.01281186, + "auxiliary_loss_mlp": 0.01049659, + "balance_loss_clip": 1.07462883, + "balance_loss_mlp": 1.03514528, + "epoch": 0.054229543678230026, + "flos": 23841525421440.0, + "grad_norm": 1.9190300878041289, + "language_loss": 0.74367774, + "learning_rate": 3.993874329887673e-06, + "loss": 0.76698625, + "num_input_tokens_seen": 9800850, + "step": 451, + "time_per_iteration": 3.3274171352386475 + }, + { + "auxiliary_loss_clip": 0.0127145, + "auxiliary_loss_mlp": 0.01059555, + "balance_loss_clip": 1.0747869, + "balance_loss_mlp": 1.04415321, + "epoch": 0.054349786568869116, + "flos": 16320933192960.0, + "grad_norm": 2.3527455501879353, + "language_loss": 0.86231655, + "learning_rate": 3.993813257846589e-06, + "loss": 0.88562655, + "num_input_tokens_seen": 9817605, + "step": 452, + "time_per_iteration": 2.4805796146392822 + }, + { + "auxiliary_loss_clip": 0.01271345, + "auxiliary_loss_mlp": 0.01048368, + "balance_loss_clip": 1.0763458, + "balance_loss_mlp": 1.03300214, + "epoch": 0.054470029459508205, + "flos": 18660729127680.0, + "grad_norm": 3.0400742384719908, + "language_loss": 0.93059194, + "learning_rate": 3.993751883345619e-06, + "loss": 0.95378906, + "num_input_tokens_seen": 9835965, + "step": 453, + "time_per_iteration": 2.496147394180298 + }, + { + "auxiliary_loss_clip": 0.01249537, + "auxiliary_loss_mlp": 0.01050665, + "balance_loss_clip": 1.07325745, + "balance_loss_mlp": 1.03514373, + "epoch": 0.054590272350147295, + "flos": 17785298856960.0, + "grad_norm": 2.64717181379138, + "language_loss": 0.87878704, + "learning_rate": 3.993690206394073e-06, + "loss": 0.90178907, + "num_input_tokens_seen": 9852265, + "step": 454, + "time_per_iteration": 2.510390520095825 + }, + { + "auxiliary_loss_clip": 0.01255899, + "auxiliary_loss_mlp": 0.01051289, + "balance_loss_clip": 1.07210159, + "balance_loss_mlp": 1.03619146, + "epoch": 0.054710515240786385, + "flos": 17785945301760.0, + "grad_norm": 2.403624962143075, + "language_loss": 0.87662947, + "learning_rate": 3.993628227001307e-06, + "loss": 0.89970136, + "num_input_tokens_seen": 9870465, + "step": 455, + "time_per_iteration": 2.5200917720794678 + }, + { + "auxiliary_loss_clip": 0.01251004, + "auxiliary_loss_mlp": 0.01054004, + "balance_loss_clip": 1.07028365, + "balance_loss_mlp": 1.03921628, + "epoch": 0.05483075813142548, + "flos": 48210900180480.0, + "grad_norm": 1.8783877898273367, + "language_loss": 0.71291614, + "learning_rate": 3.993565945176726e-06, + "loss": 0.73596627, + "num_input_tokens_seen": 9891490, + "step": 456, + "time_per_iteration": 2.780634880065918 + }, + { + "auxiliary_loss_clip": 0.01243477, + "auxiliary_loss_mlp": 0.01054331, + "balance_loss_clip": 1.06987107, + "balance_loss_mlp": 1.0393703, + "epoch": 0.05495100102206457, + "flos": 19682244011520.0, + "grad_norm": 2.490379020563612, + "language_loss": 0.84182906, + "learning_rate": 3.993503360929776e-06, + "loss": 0.86480713, + "num_input_tokens_seen": 9910375, + "step": 457, + "time_per_iteration": 2.5302915573120117 + }, + { + "auxiliary_loss_clip": 0.01182349, + "auxiliary_loss_mlp": 0.0104942, + "balance_loss_clip": 1.05957043, + "balance_loss_mlp": 1.03346992, + "epoch": 0.05507124391270366, + "flos": 26360048453760.0, + "grad_norm": 1.8229244167078957, + "language_loss": 0.81065965, + "learning_rate": 3.99344047426995e-06, + "loss": 0.83297729, + "num_input_tokens_seen": 9931635, + "step": 458, + "time_per_iteration": 2.9718635082244873 + }, + { + "auxiliary_loss_clip": 0.01221628, + "auxiliary_loss_mlp": 0.01049946, + "balance_loss_clip": 1.0665884, + "balance_loss_mlp": 1.03385246, + "epoch": 0.05519148680334275, + "flos": 22601314581120.0, + "grad_norm": 2.4404535688264577, + "language_loss": 0.93501127, + "learning_rate": 3.993377285206789e-06, + "loss": 0.95772707, + "num_input_tokens_seen": 9951420, + "step": 459, + "time_per_iteration": 3.0350587368011475 + }, + { + "auxiliary_loss_clip": 0.01213123, + "auxiliary_loss_mlp": 0.01058744, + "balance_loss_clip": 1.06479621, + "balance_loss_mlp": 1.0431273, + "epoch": 0.05531172969398184, + "flos": 40552519380480.0, + "grad_norm": 1.9842620240727717, + "language_loss": 0.86448425, + "learning_rate": 3.99331379374988e-06, + "loss": 0.88720292, + "num_input_tokens_seen": 9975025, + "step": 460, + "time_per_iteration": 2.7634613513946533 + }, + { + "auxiliary_loss_clip": 0.01257361, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_clip": 1.06687808, + "balance_loss_mlp": 1.03157043, + "epoch": 0.05543197258462093, + "flos": 23477894087040.0, + "grad_norm": 1.852751102869974, + "language_loss": 0.79815382, + "learning_rate": 3.993249999908852e-06, + "loss": 0.8211869, + "num_input_tokens_seen": 9995175, + "step": 461, + "time_per_iteration": 2.58669376373291 + }, + { + "auxiliary_loss_clip": 0.01285538, + "auxiliary_loss_mlp": 0.01052906, + "balance_loss_clip": 1.07326889, + "balance_loss_mlp": 1.03838611, + "epoch": 0.05555221547526003, + "flos": 18624603024000.0, + "grad_norm": 1.9383444168804131, + "language_loss": 0.87098205, + "learning_rate": 3.993185903693384e-06, + "loss": 0.8943665, + "num_input_tokens_seen": 10011975, + "step": 462, + "time_per_iteration": 2.464580774307251 + }, + { + "auxiliary_loss_clip": 0.01248876, + "auxiliary_loss_mlp": 0.01042198, + "balance_loss_clip": 1.06899869, + "balance_loss_mlp": 1.02793455, + "epoch": 0.05567245836589912, + "flos": 23587098410880.0, + "grad_norm": 2.089435099161217, + "language_loss": 0.82068944, + "learning_rate": 3.9931215051131995e-06, + "loss": 0.84360015, + "num_input_tokens_seen": 10032620, + "step": 463, + "time_per_iteration": 2.5578839778900146 + }, + { + "auxiliary_loss_clip": 0.01253572, + "auxiliary_loss_mlp": 0.01049649, + "balance_loss_clip": 1.06601918, + "balance_loss_mlp": 1.03491497, + "epoch": 0.05579270125653821, + "flos": 27746667129600.0, + "grad_norm": 1.8730392021005378, + "language_loss": 0.80037493, + "learning_rate": 3.993056804178068e-06, + "loss": 0.82340711, + "num_input_tokens_seen": 10054165, + "step": 464, + "time_per_iteration": 2.5901246070861816 + }, + { + "auxiliary_loss_clip": 0.01213978, + "auxiliary_loss_mlp": 0.01047197, + "balance_loss_clip": 1.06600106, + "balance_loss_mlp": 1.03170013, + "epoch": 0.0559129441471773, + "flos": 27014161075200.0, + "grad_norm": 2.0552290591404305, + "language_loss": 0.84324718, + "learning_rate": 3.992991800897803e-06, + "loss": 0.86585891, + "num_input_tokens_seen": 10073970, + "step": 465, + "time_per_iteration": 2.6458468437194824 + }, + { + "auxiliary_loss_clip": 0.01283208, + "auxiliary_loss_mlp": 0.0105059, + "balance_loss_clip": 1.07314086, + "balance_loss_mlp": 1.03495002, + "epoch": 0.05603318703781639, + "flos": 15229787794560.0, + "grad_norm": 2.8407955117120953, + "language_loss": 0.89903772, + "learning_rate": 3.9929264952822665e-06, + "loss": 0.92237568, + "num_input_tokens_seen": 10091505, + "step": 466, + "time_per_iteration": 2.4553840160369873 + }, + { + "auxiliary_loss_clip": 0.01269327, + "auxiliary_loss_mlp": 0.01051337, + "balance_loss_clip": 1.06954575, + "balance_loss_mlp": 1.03640044, + "epoch": 0.05615342992845548, + "flos": 22266482976000.0, + "grad_norm": 1.878444074350051, + "language_loss": 0.88188112, + "learning_rate": 3.992860887341366e-06, + "loss": 0.90508771, + "num_input_tokens_seen": 10109675, + "step": 467, + "time_per_iteration": 2.5054750442504883 + }, + { + "auxiliary_loss_clip": 0.01221363, + "auxiliary_loss_mlp": 0.01042684, + "balance_loss_clip": 1.06447935, + "balance_loss_mlp": 1.02693677, + "epoch": 0.056273672819094574, + "flos": 23584979508480.0, + "grad_norm": 2.2136001077459038, + "language_loss": 0.81032932, + "learning_rate": 3.992794977085052e-06, + "loss": 0.83296973, + "num_input_tokens_seen": 10127675, + "step": 468, + "time_per_iteration": 2.5779855251312256 + }, + { + "auxiliary_loss_clip": 0.01235079, + "auxiliary_loss_mlp": 0.01056196, + "balance_loss_clip": 1.06839776, + "balance_loss_mlp": 1.04168236, + "epoch": 0.056393915709733664, + "flos": 19858708552320.0, + "grad_norm": 2.1808676934060354, + "language_loss": 0.84717184, + "learning_rate": 3.992728764523326e-06, + "loss": 0.87008458, + "num_input_tokens_seen": 10146620, + "step": 469, + "time_per_iteration": 2.570004463195801 + }, + { + "auxiliary_loss_clip": 0.01250685, + "auxiliary_loss_mlp": 0.01049049, + "balance_loss_clip": 1.06891441, + "balance_loss_mlp": 1.03395736, + "epoch": 0.05651415860037275, + "flos": 22163779013760.0, + "grad_norm": 1.627494321679239, + "language_loss": 0.80810058, + "learning_rate": 3.99266224966623e-06, + "loss": 0.83109796, + "num_input_tokens_seen": 10167535, + "step": 470, + "time_per_iteration": 2.5547146797180176 + }, + { + "auxiliary_loss_clip": 0.01240783, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.0690074, + "balance_loss_mlp": 1.0329144, + "epoch": 0.05663440149101184, + "flos": 19463548055040.0, + "grad_norm": 2.0231514587314, + "language_loss": 0.88070536, + "learning_rate": 3.992595432523855e-06, + "loss": 0.90359962, + "num_input_tokens_seen": 10184825, + "step": 471, + "time_per_iteration": 3.2858633995056152 + }, + { + "auxiliary_loss_clip": 0.01224562, + "auxiliary_loss_mlp": 0.01052562, + "balance_loss_clip": 1.06587934, + "balance_loss_mlp": 1.03794694, + "epoch": 0.05675464438165093, + "flos": 22670226823680.0, + "grad_norm": 2.0777575250887312, + "language_loss": 0.86332542, + "learning_rate": 3.992528313106338e-06, + "loss": 0.88609672, + "num_input_tokens_seen": 10203025, + "step": 472, + "time_per_iteration": 2.6441357135772705 + }, + { + "auxiliary_loss_clip": 0.0128274, + "auxiliary_loss_mlp": 0.007673, + "balance_loss_clip": 1.07579613, + "balance_loss_mlp": 1.00040364, + "epoch": 0.05687488727229002, + "flos": 16901177495040.0, + "grad_norm": 2.6434843796722403, + "language_loss": 0.81980187, + "learning_rate": 3.9924608914238595e-06, + "loss": 0.84030229, + "num_input_tokens_seen": 10218020, + "step": 473, + "time_per_iteration": 2.4354655742645264 + }, + { + "auxiliary_loss_clip": 0.01266408, + "auxiliary_loss_mlp": 0.01052033, + "balance_loss_clip": 1.07252312, + "balance_loss_mlp": 1.03723907, + "epoch": 0.05699513016292912, + "flos": 29168980945920.0, + "grad_norm": 3.563713873460749, + "language_loss": 0.83806086, + "learning_rate": 3.992393167486648e-06, + "loss": 0.86124527, + "num_input_tokens_seen": 10237170, + "step": 474, + "time_per_iteration": 2.558845281600952 + }, + { + "auxiliary_loss_clip": 0.0128474, + "auxiliary_loss_mlp": 0.01054905, + "balance_loss_clip": 1.07468224, + "balance_loss_mlp": 1.0390625, + "epoch": 0.05711537305356821, + "flos": 18916197632640.0, + "grad_norm": 2.834707437666838, + "language_loss": 0.80813545, + "learning_rate": 3.992325141304977e-06, + "loss": 0.83153188, + "num_input_tokens_seen": 10255125, + "step": 475, + "time_per_iteration": 3.20054292678833 + }, + { + "auxiliary_loss_clip": 0.01221069, + "auxiliary_loss_mlp": 0.01050403, + "balance_loss_clip": 1.06606305, + "balance_loss_mlp": 1.03584111, + "epoch": 0.0572356159442073, + "flos": 26758979879040.0, + "grad_norm": 2.3406306495893623, + "language_loss": 0.86462319, + "learning_rate": 3.992256812889166e-06, + "loss": 0.88733792, + "num_input_tokens_seen": 10271230, + "step": 476, + "time_per_iteration": 3.3448898792266846 + }, + { + "auxiliary_loss_clip": 0.0128419, + "auxiliary_loss_mlp": 0.01052444, + "balance_loss_clip": 1.07594681, + "balance_loss_mlp": 1.03758466, + "epoch": 0.05735585883484639, + "flos": 35116146840960.0, + "grad_norm": 2.3366130295281033, + "language_loss": 0.77038562, + "learning_rate": 3.992188182249582e-06, + "loss": 0.79375196, + "num_input_tokens_seen": 10293125, + "step": 477, + "time_per_iteration": 2.58583402633667 + }, + { + "auxiliary_loss_clip": 0.01249411, + "auxiliary_loss_mlp": 0.01055479, + "balance_loss_clip": 1.07264638, + "balance_loss_mlp": 1.03994632, + "epoch": 0.05747610172548548, + "flos": 18734381965440.0, + "grad_norm": 2.1363475238018954, + "language_loss": 0.90684426, + "learning_rate": 3.992119249396633e-06, + "loss": 0.92989314, + "num_input_tokens_seen": 10311810, + "step": 478, + "time_per_iteration": 3.259884834289551 + }, + { + "auxiliary_loss_clip": 0.01241297, + "auxiliary_loss_mlp": 0.00766844, + "balance_loss_clip": 1.06606936, + "balance_loss_mlp": 1.00048566, + "epoch": 0.05759634461612457, + "flos": 27964752554880.0, + "grad_norm": 1.812225969711567, + "language_loss": 0.82088786, + "learning_rate": 3.992050014340778e-06, + "loss": 0.84096932, + "num_input_tokens_seen": 10332165, + "step": 479, + "time_per_iteration": 2.5929741859436035 + }, + { + "auxiliary_loss_clip": 0.01154721, + "auxiliary_loss_mlp": 0.01009407, + "balance_loss_clip": 1.04297638, + "balance_loss_mlp": 1.00339913, + "epoch": 0.057716587506763666, + "flos": 69292009405440.0, + "grad_norm": 0.8313318607115783, + "language_loss": 0.5503701, + "learning_rate": 3.99198047709252e-06, + "loss": 0.57201135, + "num_input_tokens_seen": 10393685, + "step": 480, + "time_per_iteration": 3.1574766635894775 + }, + { + "auxiliary_loss_clip": 0.01229904, + "auxiliary_loss_mlp": 0.01050035, + "balance_loss_clip": 1.06195688, + "balance_loss_mlp": 1.03480017, + "epoch": 0.057836830397402755, + "flos": 25009196745600.0, + "grad_norm": 2.2387818440566596, + "language_loss": 0.78696144, + "learning_rate": 3.991910637662408e-06, + "loss": 0.80976081, + "num_input_tokens_seen": 10413975, + "step": 481, + "time_per_iteration": 2.5991899967193604 + }, + { + "auxiliary_loss_clip": 0.01281796, + "auxiliary_loss_mlp": 0.0104401, + "balance_loss_clip": 1.07526767, + "balance_loss_mlp": 1.02891839, + "epoch": 0.057957073288041845, + "flos": 25593894334080.0, + "grad_norm": 1.8311912291805688, + "language_loss": 0.80635411, + "learning_rate": 3.9918404960610355e-06, + "loss": 0.82961214, + "num_input_tokens_seen": 10433005, + "step": 482, + "time_per_iteration": 2.5026915073394775 + }, + { + "auxiliary_loss_clip": 0.01271805, + "auxiliary_loss_mlp": 0.01051127, + "balance_loss_clip": 1.07295442, + "balance_loss_mlp": 1.03614259, + "epoch": 0.058077316178680935, + "flos": 20777411733120.0, + "grad_norm": 2.117096029015613, + "language_loss": 0.77393216, + "learning_rate": 3.991770052299043e-06, + "loss": 0.79716146, + "num_input_tokens_seen": 10451235, + "step": 483, + "time_per_iteration": 2.550258159637451 + }, + { + "auxiliary_loss_clip": 0.01248801, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_clip": 1.06693375, + "balance_loss_mlp": 1.02950215, + "epoch": 0.058197559069320025, + "flos": 18916484941440.0, + "grad_norm": 2.784172060002887, + "language_loss": 0.87461144, + "learning_rate": 3.991699306387118e-06, + "loss": 0.89752948, + "num_input_tokens_seen": 10469705, + "step": 484, + "time_per_iteration": 2.5137414932250977 + }, + { + "auxiliary_loss_clip": 0.0126649, + "auxiliary_loss_mlp": 0.01052273, + "balance_loss_clip": 1.07179809, + "balance_loss_mlp": 1.03795564, + "epoch": 0.058317801959959115, + "flos": 24863327614080.0, + "grad_norm": 1.9096247907616577, + "language_loss": 0.78005743, + "learning_rate": 3.991628258335991e-06, + "loss": 0.80324501, + "num_input_tokens_seen": 10491910, + "step": 485, + "time_per_iteration": 2.542844533920288 + }, + { + "auxiliary_loss_clip": 0.01226008, + "auxiliary_loss_mlp": 0.01048505, + "balance_loss_clip": 1.06472743, + "balance_loss_mlp": 1.0339551, + "epoch": 0.05843804485059821, + "flos": 23257977068160.0, + "grad_norm": 2.9695555408550542, + "language_loss": 0.87947494, + "learning_rate": 3.991556908156442e-06, + "loss": 0.90222013, + "num_input_tokens_seen": 10508435, + "step": 486, + "time_per_iteration": 2.5722312927246094 + }, + { + "auxiliary_loss_clip": 0.01255132, + "auxiliary_loss_mlp": 0.01055192, + "balance_loss_clip": 1.07019806, + "balance_loss_mlp": 1.04075623, + "epoch": 0.0585582877412373, + "flos": 23150532510720.0, + "grad_norm": 1.956310963756151, + "language_loss": 0.8767817, + "learning_rate": 3.9914852558592914e-06, + "loss": 0.89988494, + "num_input_tokens_seen": 10529485, + "step": 487, + "time_per_iteration": 2.5601906776428223 + }, + { + "auxiliary_loss_clip": 0.01265898, + "auxiliary_loss_mlp": 0.0104586, + "balance_loss_clip": 1.07388735, + "balance_loss_mlp": 1.03039837, + "epoch": 0.05867853063187639, + "flos": 23506406507520.0, + "grad_norm": 2.8918725387208917, + "language_loss": 0.80806756, + "learning_rate": 3.991413301455413e-06, + "loss": 0.8311851, + "num_input_tokens_seen": 10545935, + "step": 488, + "time_per_iteration": 2.519502639770508 + }, + { + "auxiliary_loss_clip": 0.01233829, + "auxiliary_loss_mlp": 0.01046208, + "balance_loss_clip": 1.06589484, + "balance_loss_mlp": 1.03270125, + "epoch": 0.05879877352251548, + "flos": 29495803818240.0, + "grad_norm": 2.330576509333311, + "language_loss": 0.77649379, + "learning_rate": 3.991341044955719e-06, + "loss": 0.79929411, + "num_input_tokens_seen": 10565690, + "step": 489, + "time_per_iteration": 2.5951523780822754 + }, + { + "auxiliary_loss_clip": 0.01262315, + "auxiliary_loss_mlp": 0.00767525, + "balance_loss_clip": 1.06981814, + "balance_loss_mlp": 1.0006597, + "epoch": 0.05891901641315457, + "flos": 20157485880960.0, + "grad_norm": 2.0295788272645297, + "language_loss": 0.81575978, + "learning_rate": 3.991268486371172e-06, + "loss": 0.83605826, + "num_input_tokens_seen": 10584245, + "step": 490, + "time_per_iteration": 2.509990692138672 + }, + { + "auxiliary_loss_clip": 0.0124894, + "auxiliary_loss_mlp": 0.01053305, + "balance_loss_clip": 1.0678674, + "balance_loss_mlp": 1.03627038, + "epoch": 0.05903925930379366, + "flos": 24644200694400.0, + "grad_norm": 2.5470061414754914, + "language_loss": 0.88240743, + "learning_rate": 3.991195625712779e-06, + "loss": 0.9054299, + "num_input_tokens_seen": 10601210, + "step": 491, + "time_per_iteration": 2.561161518096924 + }, + { + "auxiliary_loss_clip": 0.01281219, + "auxiliary_loss_mlp": 0.0104491, + "balance_loss_clip": 1.07717371, + "balance_loss_mlp": 1.03032446, + "epoch": 0.05915950219443276, + "flos": 21250391045760.0, + "grad_norm": 2.0478491294815555, + "language_loss": 0.81616426, + "learning_rate": 3.991122462991592e-06, + "loss": 0.8394255, + "num_input_tokens_seen": 10620730, + "step": 492, + "time_per_iteration": 2.473135232925415 + }, + { + "auxiliary_loss_clip": 0.01284441, + "auxiliary_loss_mlp": 0.01050034, + "balance_loss_clip": 1.07337618, + "balance_loss_mlp": 1.03574109, + "epoch": 0.05927974508507185, + "flos": 9902727319680.0, + "grad_norm": 3.2756310202192593, + "language_loss": 0.80891371, + "learning_rate": 3.991048998218712e-06, + "loss": 0.83225846, + "num_input_tokens_seen": 10634035, + "step": 493, + "time_per_iteration": 2.490185022354126 + }, + { + "auxiliary_loss_clip": 0.01262458, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.06926203, + "balance_loss_mlp": 1.03234243, + "epoch": 0.05939998797571094, + "flos": 18259499232000.0, + "grad_norm": 2.110160134901879, + "language_loss": 0.76527911, + "learning_rate": 3.990975231405281e-06, + "loss": 0.7883687, + "num_input_tokens_seen": 10652485, + "step": 494, + "time_per_iteration": 2.4639039039611816 + }, + { + "auxiliary_loss_clip": 0.01260228, + "auxiliary_loss_mlp": 0.01047569, + "balance_loss_clip": 1.07136154, + "balance_loss_mlp": 1.03326392, + "epoch": 0.05952023086635003, + "flos": 28256598558720.0, + "grad_norm": 1.8006969940536237, + "language_loss": 0.78962845, + "learning_rate": 3.990901162562491e-06, + "loss": 0.81270635, + "num_input_tokens_seen": 10673175, + "step": 495, + "time_per_iteration": 2.5743350982666016 + }, + { + "auxiliary_loss_clip": 0.01223561, + "auxiliary_loss_mlp": 0.00767714, + "balance_loss_clip": 1.06105447, + "balance_loss_mlp": 1.00066733, + "epoch": 0.05964047375698912, + "flos": 14902498045440.0, + "grad_norm": 4.220110886839479, + "language_loss": 0.90613031, + "learning_rate": 3.9908267917015765e-06, + "loss": 0.92604315, + "num_input_tokens_seen": 10691235, + "step": 496, + "time_per_iteration": 2.5648128986358643 + }, + { + "auxiliary_loss_clip": 0.01250792, + "auxiliary_loss_mlp": 0.01057395, + "balance_loss_clip": 1.06675291, + "balance_loss_mlp": 1.0427382, + "epoch": 0.059760716647628206, + "flos": 23185581206400.0, + "grad_norm": 2.2769073623512557, + "language_loss": 0.92735124, + "learning_rate": 3.990752118833821e-06, + "loss": 0.95043308, + "num_input_tokens_seen": 10708675, + "step": 497, + "time_per_iteration": 2.5101516246795654 + }, + { + "auxiliary_loss_clip": 0.01279221, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.07403457, + "balance_loss_mlp": 1.02789509, + "epoch": 0.0598809595382673, + "flos": 22746968231040.0, + "grad_norm": 2.7657732557199877, + "language_loss": 0.78010136, + "learning_rate": 3.990677143970553e-06, + "loss": 0.80331206, + "num_input_tokens_seen": 10729485, + "step": 498, + "time_per_iteration": 3.234812021255493 + }, + { + "auxiliary_loss_clip": 0.01227156, + "auxiliary_loss_mlp": 0.01053104, + "balance_loss_clip": 1.07031405, + "balance_loss_mlp": 1.03804779, + "epoch": 0.06000120242890639, + "flos": 22127221946880.0, + "grad_norm": 2.4786989124716716, + "language_loss": 0.81152248, + "learning_rate": 3.990601867123144e-06, + "loss": 0.83432508, + "num_input_tokens_seen": 10749210, + "step": 499, + "time_per_iteration": 2.5610368251800537 + }, + { + "auxiliary_loss_clip": 0.0121288, + "auxiliary_loss_mlp": 0.0104583, + "balance_loss_clip": 1.06592095, + "balance_loss_mlp": 1.03114355, + "epoch": 0.06012144531954548, + "flos": 19171773878400.0, + "grad_norm": 4.539871358510705, + "language_loss": 0.84848893, + "learning_rate": 3.990526288303014e-06, + "loss": 0.87107599, + "num_input_tokens_seen": 10768000, + "step": 500, + "time_per_iteration": 2.6519477367401123 + }, + { + "auxiliary_loss_clip": 0.01242297, + "auxiliary_loss_mlp": 0.00766192, + "balance_loss_clip": 1.06735861, + "balance_loss_mlp": 1.00062752, + "epoch": 0.06024168821018457, + "flos": 22783345729920.0, + "grad_norm": 1.9648765373591515, + "language_loss": 0.90832067, + "learning_rate": 3.9904504075216295e-06, + "loss": 0.92840564, + "num_input_tokens_seen": 10788760, + "step": 501, + "time_per_iteration": 3.282116651535034 + }, + { + "auxiliary_loss_clip": 0.01227136, + "auxiliary_loss_mlp": 0.01049841, + "balance_loss_clip": 1.06238699, + "balance_loss_mlp": 1.03500533, + "epoch": 0.06036193110082366, + "flos": 18770687637120.0, + "grad_norm": 2.2109384439189017, + "language_loss": 0.93958259, + "learning_rate": 3.990374224790501e-06, + "loss": 0.96235234, + "num_input_tokens_seen": 10806965, + "step": 502, + "time_per_iteration": 2.539088726043701 + }, + { + "auxiliary_loss_clip": 0.01243085, + "auxiliary_loss_mlp": 0.01050286, + "balance_loss_clip": 1.06917167, + "balance_loss_mlp": 1.03573656, + "epoch": 0.06048217399146275, + "flos": 17201570935680.0, + "grad_norm": 1.9692761216687578, + "language_loss": 0.70952672, + "learning_rate": 3.990297740121185e-06, + "loss": 0.73246038, + "num_input_tokens_seen": 10824900, + "step": 503, + "time_per_iteration": 3.2890169620513916 + }, + { + "auxiliary_loss_clip": 0.01259161, + "auxiliary_loss_mlp": 0.00767051, + "balance_loss_clip": 1.07001007, + "balance_loss_mlp": 1.00059819, + "epoch": 0.06060241688210185, + "flos": 24024131187840.0, + "grad_norm": 2.3778004010462475, + "language_loss": 0.77955061, + "learning_rate": 3.990220953525284e-06, + "loss": 0.79981267, + "num_input_tokens_seen": 10842010, + "step": 504, + "time_per_iteration": 3.3623740673065186 + }, + { + "auxiliary_loss_clip": 0.01232617, + "auxiliary_loss_mlp": 0.01047437, + "balance_loss_clip": 1.064358, + "balance_loss_mlp": 1.0339241, + "epoch": 0.06072265977274094, + "flos": 14611190745600.0, + "grad_norm": 2.8286462754032, + "language_loss": 0.74569893, + "learning_rate": 3.9901438650144465e-06, + "loss": 0.76849937, + "num_input_tokens_seen": 10858260, + "step": 505, + "time_per_iteration": 2.511645793914795 + }, + { + "auxiliary_loss_clip": 0.0125155, + "auxiliary_loss_mlp": 0.01045554, + "balance_loss_clip": 1.06807923, + "balance_loss_mlp": 1.03242278, + "epoch": 0.06084290266338003, + "flos": 20558284813440.0, + "grad_norm": 2.5740683690383377, + "language_loss": 0.91967094, + "learning_rate": 3.990066474600367e-06, + "loss": 0.94264197, + "num_input_tokens_seen": 10876230, + "step": 506, + "time_per_iteration": 2.547173261642456 + }, + { + "auxiliary_loss_clip": 0.01244455, + "auxiliary_loss_mlp": 0.01049504, + "balance_loss_clip": 1.06261051, + "balance_loss_mlp": 1.03461504, + "epoch": 0.06096314555401912, + "flos": 22309217182080.0, + "grad_norm": 1.8713234828093008, + "language_loss": 0.67783332, + "learning_rate": 3.989988782294786e-06, + "loss": 0.70077288, + "num_input_tokens_seen": 10896320, + "step": 507, + "time_per_iteration": 2.5135107040405273 + }, + { + "auxiliary_loss_clip": 0.01213302, + "auxiliary_loss_mlp": 0.01051366, + "balance_loss_clip": 1.06371999, + "balance_loss_mlp": 1.03679895, + "epoch": 0.06108338844465821, + "flos": 19131374056320.0, + "grad_norm": 1.683530692019752, + "language_loss": 0.9504801, + "learning_rate": 3.989910788109489e-06, + "loss": 0.97312683, + "num_input_tokens_seen": 10912970, + "step": 508, + "time_per_iteration": 2.5465927124023438 + }, + { + "auxiliary_loss_clip": 0.01222685, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.06231344, + "balance_loss_mlp": 1.02962184, + "epoch": 0.0612036313352973, + "flos": 33584018169600.0, + "grad_norm": 2.0723878737143857, + "language_loss": 0.75014293, + "learning_rate": 3.989832492056307e-06, + "loss": 0.77279902, + "num_input_tokens_seen": 10933995, + "step": 509, + "time_per_iteration": 2.6766958236694336 + }, + { + "auxiliary_loss_clip": 0.01260558, + "auxiliary_loss_mlp": 0.01048391, + "balance_loss_clip": 1.07110012, + "balance_loss_mlp": 1.03355002, + "epoch": 0.06132387422593639, + "flos": 27490552179840.0, + "grad_norm": 2.191224502856369, + "language_loss": 0.80801362, + "learning_rate": 3.989753894147119e-06, + "loss": 0.83110309, + "num_input_tokens_seen": 10954120, + "step": 510, + "time_per_iteration": 2.570888042449951 + }, + { + "auxiliary_loss_clip": 0.01255227, + "auxiliary_loss_mlp": 0.01045296, + "balance_loss_clip": 1.07277131, + "balance_loss_mlp": 1.03187275, + "epoch": 0.061444117116575485, + "flos": 25885057979520.0, + "grad_norm": 1.9672200105037647, + "language_loss": 0.79823416, + "learning_rate": 3.989674994393846e-06, + "loss": 0.82123935, + "num_input_tokens_seen": 10973595, + "step": 511, + "time_per_iteration": 2.5370893478393555 + }, + { + "auxiliary_loss_clip": 0.01256182, + "auxiliary_loss_mlp": 0.01040944, + "balance_loss_clip": 1.07083619, + "balance_loss_mlp": 1.02709246, + "epoch": 0.061564360007214575, + "flos": 28512031150080.0, + "grad_norm": 2.0946444842391694, + "language_loss": 0.9379757, + "learning_rate": 3.98959579280846e-06, + "loss": 0.96094698, + "num_input_tokens_seen": 10991995, + "step": 512, + "time_per_iteration": 2.6123690605163574 + }, + { + "auxiliary_loss_clip": 0.01193131, + "auxiliary_loss_mlp": 0.01045373, + "balance_loss_clip": 1.06524837, + "balance_loss_mlp": 1.0310204, + "epoch": 0.061684602897853665, + "flos": 12094355652480.0, + "grad_norm": 2.244367714283535, + "language_loss": 0.83129632, + "learning_rate": 3.989516289402973e-06, + "loss": 0.85368133, + "num_input_tokens_seen": 11007625, + "step": 513, + "time_per_iteration": 2.5335652828216553 + }, + { + "auxiliary_loss_clip": 0.01174654, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.05567169, + "balance_loss_mlp": 1.02971697, + "epoch": 0.061804845788492754, + "flos": 19532639865600.0, + "grad_norm": 9.657093947859243, + "language_loss": 0.80434185, + "learning_rate": 3.989436484189447e-06, + "loss": 0.82652682, + "num_input_tokens_seen": 11025570, + "step": 514, + "time_per_iteration": 2.6459310054779053 + }, + { + "auxiliary_loss_clip": 0.01260874, + "auxiliary_loss_mlp": 0.01043697, + "balance_loss_clip": 1.06816781, + "balance_loss_mlp": 1.02981532, + "epoch": 0.061925088679131844, + "flos": 15341111020800.0, + "grad_norm": 2.57244475436849, + "language_loss": 0.80603409, + "learning_rate": 3.9893563771799885e-06, + "loss": 0.82907987, + "num_input_tokens_seen": 11042045, + "step": 515, + "time_per_iteration": 2.481623888015747 + }, + { + "auxiliary_loss_clip": 0.01277146, + "auxiliary_loss_mlp": 0.01050533, + "balance_loss_clip": 1.07345319, + "balance_loss_mlp": 1.03600121, + "epoch": 0.062045331569770934, + "flos": 25919927107200.0, + "grad_norm": 2.10114193957477, + "language_loss": 0.86202753, + "learning_rate": 3.989275968386749e-06, + "loss": 0.88530427, + "num_input_tokens_seen": 11059955, + "step": 516, + "time_per_iteration": 2.53367018699646 + }, + { + "auxiliary_loss_clip": 0.01237026, + "auxiliary_loss_mlp": 0.01052049, + "balance_loss_clip": 1.06511402, + "balance_loss_mlp": 1.03686213, + "epoch": 0.06216557446041003, + "flos": 28110621686400.0, + "grad_norm": 2.1894658953857644, + "language_loss": 0.76556492, + "learning_rate": 3.989195257821926e-06, + "loss": 0.78845561, + "num_input_tokens_seen": 11078440, + "step": 517, + "time_per_iteration": 2.5860280990600586 + }, + { + "auxiliary_loss_clip": 0.01240717, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_clip": 1.070117, + "balance_loss_mlp": 1.03560734, + "epoch": 0.06228581735104912, + "flos": 23478181395840.0, + "grad_norm": 2.1407385911884544, + "language_loss": 0.84724641, + "learning_rate": 3.989114245497765e-06, + "loss": 0.87015659, + "num_input_tokens_seen": 11098240, + "step": 518, + "time_per_iteration": 2.57220458984375 + }, + { + "auxiliary_loss_clip": 0.01259554, + "auxiliary_loss_mlp": 0.01043071, + "balance_loss_clip": 1.06509376, + "balance_loss_mlp": 1.02934432, + "epoch": 0.06240606024168821, + "flos": 15195205975680.0, + "grad_norm": 2.065967239105128, + "language_loss": 0.94785655, + "learning_rate": 3.989032931426554e-06, + "loss": 0.97088277, + "num_input_tokens_seen": 11115395, + "step": 519, + "time_per_iteration": 2.4849891662597656 + }, + { + "auxiliary_loss_clip": 0.01235783, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.06678462, + "balance_loss_mlp": 1.03024054, + "epoch": 0.06252630313232731, + "flos": 20631829910400.0, + "grad_norm": 2.2083846959491757, + "language_loss": 0.86480147, + "learning_rate": 3.9889513156206295e-06, + "loss": 0.88760382, + "num_input_tokens_seen": 11134835, + "step": 520, + "time_per_iteration": 2.536604166030884 + }, + { + "auxiliary_loss_clip": 0.01232455, + "auxiliary_loss_mlp": 0.0104991, + "balance_loss_clip": 1.06717825, + "balance_loss_mlp": 1.0349853, + "epoch": 0.06264654602296639, + "flos": 20778058177920.0, + "grad_norm": 2.639694906654075, + "language_loss": 0.74130774, + "learning_rate": 3.988869398092371e-06, + "loss": 0.76413143, + "num_input_tokens_seen": 11154745, + "step": 521, + "time_per_iteration": 2.5667564868927 + }, + { + "auxiliary_loss_clip": 0.0124328, + "auxiliary_loss_mlp": 0.01047355, + "balance_loss_clip": 1.06826401, + "balance_loss_mlp": 1.03280544, + "epoch": 0.06276678891360549, + "flos": 29605798241280.0, + "grad_norm": 2.972721417859887, + "language_loss": 0.790335, + "learning_rate": 3.988787178854206e-06, + "loss": 0.81324136, + "num_input_tokens_seen": 11174280, + "step": 522, + "time_per_iteration": 2.6047964096069336 + }, + { + "auxiliary_loss_clip": 0.01276307, + "auxiliary_loss_mlp": 0.01047834, + "balance_loss_clip": 1.07350826, + "balance_loss_mlp": 1.03360045, + "epoch": 0.06288703180424457, + "flos": 22126288193280.0, + "grad_norm": 2.096112641732193, + "language_loss": 0.87566084, + "learning_rate": 3.988704657918608e-06, + "loss": 0.8989023, + "num_input_tokens_seen": 11193340, + "step": 523, + "time_per_iteration": 2.4753804206848145 + }, + { + "auxiliary_loss_clip": 0.0125745, + "auxiliary_loss_mlp": 0.01051239, + "balance_loss_clip": 1.07204127, + "balance_loss_mlp": 1.0374819, + "epoch": 0.06300727469488367, + "flos": 14976689587200.0, + "grad_norm": 2.486236352800208, + "language_loss": 0.79771876, + "learning_rate": 3.988621835298094e-06, + "loss": 0.82080567, + "num_input_tokens_seen": 11210555, + "step": 524, + "time_per_iteration": 3.245584487915039 + }, + { + "auxiliary_loss_clip": 0.01271753, + "auxiliary_loss_mlp": 0.01047982, + "balance_loss_clip": 1.07245779, + "balance_loss_mlp": 1.0343082, + "epoch": 0.06312751758552275, + "flos": 24535391420160.0, + "grad_norm": 2.0489652943742094, + "language_loss": 0.91931838, + "learning_rate": 3.988538711005229e-06, + "loss": 0.94251573, + "num_input_tokens_seen": 11230010, + "step": 525, + "time_per_iteration": 2.502084493637085 + }, + { + "auxiliary_loss_clip": 0.01251574, + "auxiliary_loss_mlp": 0.01043592, + "balance_loss_clip": 1.06932604, + "balance_loss_mlp": 1.03057468, + "epoch": 0.06324776047616185, + "flos": 21507008785920.0, + "grad_norm": 2.022453533732525, + "language_loss": 0.87912524, + "learning_rate": 3.988455285052622e-06, + "loss": 0.9020769, + "num_input_tokens_seen": 11246190, + "step": 526, + "time_per_iteration": 2.512266159057617 + }, + { + "auxiliary_loss_clip": 0.01252736, + "auxiliary_loss_mlp": 0.01050591, + "balance_loss_clip": 1.06924009, + "balance_loss_mlp": 1.0366019, + "epoch": 0.06336800336680094, + "flos": 21688034353920.0, + "grad_norm": 2.269013680840488, + "language_loss": 0.83960563, + "learning_rate": 3.98837155745293e-06, + "loss": 0.86263895, + "num_input_tokens_seen": 11264230, + "step": 527, + "time_per_iteration": 2.506347894668579 + }, + { + "auxiliary_loss_clip": 0.0125899, + "auxiliary_loss_mlp": 0.01045915, + "balance_loss_clip": 1.07280695, + "balance_loss_mlp": 1.03161621, + "epoch": 0.06348824625744003, + "flos": 19500895221120.0, + "grad_norm": 2.1975804270592616, + "language_loss": 0.76072943, + "learning_rate": 3.988287528218854e-06, + "loss": 0.78377849, + "num_input_tokens_seen": 11283015, + "step": 528, + "time_per_iteration": 3.302335739135742 + }, + { + "auxiliary_loss_clip": 0.0125755, + "auxiliary_loss_mlp": 0.0104593, + "balance_loss_clip": 1.07372069, + "balance_loss_mlp": 1.03293645, + "epoch": 0.06360848914807912, + "flos": 15481233976320.0, + "grad_norm": 1.9638321536073247, + "language_loss": 0.90398848, + "learning_rate": 3.98820319736314e-06, + "loss": 0.92702329, + "num_input_tokens_seen": 11299630, + "step": 529, + "time_per_iteration": 2.466140031814575 + }, + { + "auxiliary_loss_clip": 0.01226153, + "auxiliary_loss_mlp": 0.0104704, + "balance_loss_clip": 1.06253505, + "balance_loss_mlp": 1.03252065, + "epoch": 0.0637287320387182, + "flos": 20593369422720.0, + "grad_norm": 1.7635276865270417, + "language_loss": 0.85382092, + "learning_rate": 3.988118564898582e-06, + "loss": 0.87655294, + "num_input_tokens_seen": 11319170, + "step": 530, + "time_per_iteration": 3.3891782760620117 + }, + { + "auxiliary_loss_clip": 0.01220252, + "auxiliary_loss_mlp": 0.00767532, + "balance_loss_clip": 1.06775022, + "balance_loss_mlp": 1.00056875, + "epoch": 0.0638489749293573, + "flos": 17412222245760.0, + "grad_norm": 2.2953848250889237, + "language_loss": 0.89453185, + "learning_rate": 3.988033630838019e-06, + "loss": 0.91440976, + "num_input_tokens_seen": 11333210, + "step": 531, + "time_per_iteration": 3.326415538787842 + }, + { + "auxiliary_loss_clip": 0.01260466, + "auxiliary_loss_mlp": 0.01052277, + "balance_loss_clip": 1.07220483, + "balance_loss_mlp": 1.03862107, + "epoch": 0.0639692178199964, + "flos": 23807661874560.0, + "grad_norm": 1.9862225487451832, + "language_loss": 0.88442659, + "learning_rate": 3.987948395194334e-06, + "loss": 0.90755403, + "num_input_tokens_seen": 11355590, + "step": 532, + "time_per_iteration": 2.593491315841675 + }, + { + "auxiliary_loss_clip": 0.01249503, + "auxiliary_loss_mlp": 0.01051117, + "balance_loss_clip": 1.06624246, + "balance_loss_mlp": 1.03755641, + "epoch": 0.06408946071063548, + "flos": 18477225521280.0, + "grad_norm": 2.0577616325793318, + "language_loss": 0.7643764, + "learning_rate": 3.987862857980458e-06, + "loss": 0.78738254, + "num_input_tokens_seen": 11371535, + "step": 533, + "time_per_iteration": 2.4695353507995605 + }, + { + "auxiliary_loss_clip": 0.01227385, + "auxiliary_loss_mlp": 0.01048792, + "balance_loss_clip": 1.06588495, + "balance_loss_mlp": 1.03418911, + "epoch": 0.06420970360127458, + "flos": 27162220936320.0, + "grad_norm": 2.340396455489207, + "language_loss": 0.76620436, + "learning_rate": 3.987777019209368e-06, + "loss": 0.78896606, + "num_input_tokens_seen": 11392050, + "step": 534, + "time_per_iteration": 2.621335983276367 + }, + { + "auxiliary_loss_clip": 0.0127768, + "auxiliary_loss_mlp": 0.01040936, + "balance_loss_clip": 1.07451987, + "balance_loss_mlp": 1.02702403, + "epoch": 0.06432994649191366, + "flos": 23659673840640.0, + "grad_norm": 2.7168564057669697, + "language_loss": 0.81212723, + "learning_rate": 3.987690878894084e-06, + "loss": 0.83531332, + "num_input_tokens_seen": 11411765, + "step": 535, + "time_per_iteration": 2.486724376678467 + }, + { + "auxiliary_loss_clip": 0.01246008, + "auxiliary_loss_mlp": 0.01036679, + "balance_loss_clip": 1.06825089, + "balance_loss_mlp": 1.02209997, + "epoch": 0.06445018938255276, + "flos": 23403953940480.0, + "grad_norm": 2.3054085157617825, + "language_loss": 0.85275811, + "learning_rate": 3.987604437047673e-06, + "loss": 0.87558496, + "num_input_tokens_seen": 11431565, + "step": 536, + "time_per_iteration": 2.5564231872558594 + }, + { + "auxiliary_loss_clip": 0.01253966, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.06867075, + "balance_loss_mlp": 1.02914035, + "epoch": 0.06457043227319184, + "flos": 19646692525440.0, + "grad_norm": 2.058267210621199, + "language_loss": 0.7794584, + "learning_rate": 3.987517693683251e-06, + "loss": 0.80243039, + "num_input_tokens_seen": 11450140, + "step": 537, + "time_per_iteration": 2.4787228107452393 + }, + { + "auxiliary_loss_clip": 0.01241568, + "auxiliary_loss_mlp": 0.01055525, + "balance_loss_clip": 1.07142925, + "balance_loss_mlp": 1.0408324, + "epoch": 0.06469067516383094, + "flos": 16978744915200.0, + "grad_norm": 2.7571394287493165, + "language_loss": 0.95941621, + "learning_rate": 3.9874306488139745e-06, + "loss": 0.98238719, + "num_input_tokens_seen": 11465400, + "step": 538, + "time_per_iteration": 2.5266637802124023 + }, + { + "auxiliary_loss_clip": 0.01223355, + "auxiliary_loss_mlp": 0.01048213, + "balance_loss_clip": 1.0674907, + "balance_loss_mlp": 1.0342356, + "epoch": 0.06481091805447003, + "flos": 23296401642240.0, + "grad_norm": 2.0019010325855158, + "language_loss": 0.88190854, + "learning_rate": 3.987343302453049e-06, + "loss": 0.90462422, + "num_input_tokens_seen": 11486675, + "step": 539, + "time_per_iteration": 2.5911343097686768 + }, + { + "auxiliary_loss_clip": 0.0124126, + "auxiliary_loss_mlp": 0.01050127, + "balance_loss_clip": 1.07000661, + "balance_loss_mlp": 1.03626251, + "epoch": 0.06493116094510912, + "flos": 29172356824320.0, + "grad_norm": 1.6680054548758108, + "language_loss": 0.82607216, + "learning_rate": 3.987255654613724e-06, + "loss": 0.84898603, + "num_input_tokens_seen": 11510440, + "step": 540, + "time_per_iteration": 2.6464385986328125 + }, + { + "auxiliary_loss_clip": 0.0121919, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.06317425, + "balance_loss_mlp": 1.03273201, + "epoch": 0.06505140383574821, + "flos": 19865065259520.0, + "grad_norm": 1.985203582113191, + "language_loss": 0.70137751, + "learning_rate": 3.987167705309296e-06, + "loss": 0.72404116, + "num_input_tokens_seen": 11529715, + "step": 541, + "time_per_iteration": 2.605062484741211 + }, + { + "auxiliary_loss_clip": 0.01258085, + "auxiliary_loss_mlp": 0.00766607, + "balance_loss_clip": 1.06927657, + "balance_loss_mlp": 1.00062013, + "epoch": 0.0651716467263873, + "flos": 17924703540480.0, + "grad_norm": 2.7404473888949816, + "language_loss": 0.95514673, + "learning_rate": 3.987079454553108e-06, + "loss": 0.97539365, + "num_input_tokens_seen": 11547665, + "step": 542, + "time_per_iteration": 2.527634382247925 + }, + { + "auxiliary_loss_clip": 0.01224755, + "auxiliary_loss_mlp": 0.01043036, + "balance_loss_clip": 1.06957293, + "balance_loss_mlp": 1.02906418, + "epoch": 0.0652918896170264, + "flos": 20842840356480.0, + "grad_norm": 1.853598789477471, + "language_loss": 0.91463315, + "learning_rate": 3.986990902358546e-06, + "loss": 0.93731111, + "num_input_tokens_seen": 11564605, + "step": 543, + "time_per_iteration": 2.556701898574829 + }, + { + "auxiliary_loss_clip": 0.01258044, + "auxiliary_loss_mlp": 0.01047583, + "balance_loss_clip": 1.06867337, + "balance_loss_mlp": 1.0331645, + "epoch": 0.06541213250766549, + "flos": 21872507627520.0, + "grad_norm": 2.271888911313839, + "language_loss": 0.93343383, + "learning_rate": 3.986902048739045e-06, + "loss": 0.9564901, + "num_input_tokens_seen": 11584550, + "step": 544, + "time_per_iteration": 2.5286076068878174 + }, + { + "auxiliary_loss_clip": 0.0124257, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_clip": 1.0677458, + "balance_loss_mlp": 1.03569257, + "epoch": 0.06553237539830457, + "flos": 23110743219840.0, + "grad_norm": 2.61528430282179, + "language_loss": 0.79914856, + "learning_rate": 3.986812893708082e-06, + "loss": 0.8220849, + "num_input_tokens_seen": 11600740, + "step": 545, + "time_per_iteration": 2.514575958251953 + }, + { + "auxiliary_loss_clip": 0.01242135, + "auxiliary_loss_mlp": 0.01052555, + "balance_loss_clip": 1.06550956, + "balance_loss_mlp": 1.03717732, + "epoch": 0.06565261828894367, + "flos": 17923769786880.0, + "grad_norm": 1.9452114639967422, + "language_loss": 0.81288373, + "learning_rate": 3.9867234372791826e-06, + "loss": 0.83583069, + "num_input_tokens_seen": 11618695, + "step": 546, + "time_per_iteration": 2.524569511413574 + }, + { + "auxiliary_loss_clip": 0.01254581, + "auxiliary_loss_mlp": 0.01046752, + "balance_loss_clip": 1.06904531, + "balance_loss_mlp": 1.03225017, + "epoch": 0.06577286117958275, + "flos": 22783058421120.0, + "grad_norm": 1.6360119288831934, + "language_loss": 0.87290156, + "learning_rate": 3.986633679465918e-06, + "loss": 0.89591491, + "num_input_tokens_seen": 11638850, + "step": 547, + "time_per_iteration": 2.5052170753479004 + }, + { + "auxiliary_loss_clip": 0.01211189, + "auxiliary_loss_mlp": 0.01050567, + "balance_loss_clip": 1.06598878, + "balance_loss_mlp": 1.03635728, + "epoch": 0.06589310407022185, + "flos": 23696194993920.0, + "grad_norm": 2.563159821357288, + "language_loss": 0.80727601, + "learning_rate": 3.986543620281904e-06, + "loss": 0.82989359, + "num_input_tokens_seen": 11658500, + "step": 548, + "time_per_iteration": 2.6448497772216797 + }, + { + "auxiliary_loss_clip": 0.01223539, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.06343102, + "balance_loss_mlp": 1.02205682, + "epoch": 0.06601334696086093, + "flos": 26864772410880.0, + "grad_norm": 2.1008336069379623, + "language_loss": 0.91126341, + "learning_rate": 3.986453259740802e-06, + "loss": 0.93386263, + "num_input_tokens_seen": 11676670, + "step": 549, + "time_per_iteration": 2.6180598735809326 + }, + { + "auxiliary_loss_clip": 0.01239152, + "auxiliary_loss_mlp": 0.01051534, + "balance_loss_clip": 1.07073021, + "balance_loss_mlp": 1.03684187, + "epoch": 0.06613358985150003, + "flos": 12567694101120.0, + "grad_norm": 2.795401782593018, + "language_loss": 0.78914273, + "learning_rate": 3.986362597856319e-06, + "loss": 0.81204957, + "num_input_tokens_seen": 11693170, + "step": 550, + "time_per_iteration": 2.5634279251098633 + }, + { + "auxiliary_loss_clip": 0.01238226, + "auxiliary_loss_mlp": 0.00768344, + "balance_loss_clip": 1.06659424, + "balance_loss_mlp": 1.00066447, + "epoch": 0.06625383274213913, + "flos": 18332505624960.0, + "grad_norm": 3.0795649748169316, + "language_loss": 0.81772655, + "learning_rate": 3.986271634642211e-06, + "loss": 0.83779222, + "num_input_tokens_seen": 11710150, + "step": 551, + "time_per_iteration": 3.3015031814575195 + }, + { + "auxiliary_loss_clip": 0.01272195, + "auxiliary_loss_mlp": 0.01045796, + "balance_loss_clip": 1.0730418, + "balance_loss_mlp": 1.03061438, + "epoch": 0.06637407563277821, + "flos": 15375585098880.0, + "grad_norm": 2.064229026420499, + "language_loss": 0.8187533, + "learning_rate": 3.986180370112274e-06, + "loss": 0.84193325, + "num_input_tokens_seen": 11726670, + "step": 552, + "time_per_iteration": 2.4844541549682617 + }, + { + "auxiliary_loss_clip": 0.0125699, + "auxiliary_loss_mlp": 0.00768304, + "balance_loss_clip": 1.0698514, + "balance_loss_mlp": 1.00060534, + "epoch": 0.0664943185234173, + "flos": 24025244509440.0, + "grad_norm": 2.006674647502356, + "language_loss": 0.74707925, + "learning_rate": 3.986088804280354e-06, + "loss": 0.7673322, + "num_input_tokens_seen": 11746400, + "step": 553, + "time_per_iteration": 2.6165852546691895 + }, + { + "auxiliary_loss_clip": 0.01243593, + "auxiliary_loss_mlp": 0.01047574, + "balance_loss_clip": 1.06985569, + "balance_loss_mlp": 1.03244638, + "epoch": 0.06661456141405639, + "flos": 20957503547520.0, + "grad_norm": 2.8246180517017545, + "language_loss": 0.9409703, + "learning_rate": 3.985996937160342e-06, + "loss": 0.96388197, + "num_input_tokens_seen": 11765590, + "step": 554, + "time_per_iteration": 3.329732894897461 + }, + { + "auxiliary_loss_clip": 0.01252792, + "auxiliary_loss_mlp": 0.01053446, + "balance_loss_clip": 1.06912017, + "balance_loss_mlp": 1.03884244, + "epoch": 0.06673480430469549, + "flos": 52223953322880.0, + "grad_norm": 2.2055390101366017, + "language_loss": 0.68906832, + "learning_rate": 3.985904768766173e-06, + "loss": 0.71213067, + "num_input_tokens_seen": 11788365, + "step": 555, + "time_per_iteration": 2.752044677734375 + }, + { + "auxiliary_loss_clip": 0.01229583, + "auxiliary_loss_mlp": 0.01048515, + "balance_loss_clip": 1.0672605, + "balance_loss_mlp": 1.0332799, + "epoch": 0.06685504719533458, + "flos": 16217079995520.0, + "grad_norm": 2.6221175758611603, + "language_loss": 0.75986302, + "learning_rate": 3.98581229911183e-06, + "loss": 0.78264403, + "num_input_tokens_seen": 11807285, + "step": 556, + "time_per_iteration": 3.2995612621307373 + }, + { + "auxiliary_loss_clip": 0.01256881, + "auxiliary_loss_mlp": 0.01049289, + "balance_loss_clip": 1.06688845, + "balance_loss_mlp": 1.0340538, + "epoch": 0.06697529008597367, + "flos": 22491535639680.0, + "grad_norm": 1.7947973146849212, + "language_loss": 0.92116451, + "learning_rate": 3.985719528211341e-06, + "loss": 0.94422615, + "num_input_tokens_seen": 11826655, + "step": 557, + "time_per_iteration": 3.2985076904296875 + }, + { + "auxiliary_loss_clip": 0.0114781, + "auxiliary_loss_mlp": 0.01013357, + "balance_loss_clip": 1.0406878, + "balance_loss_mlp": 1.00792062, + "epoch": 0.06709553297661276, + "flos": 62688216936960.0, + "grad_norm": 0.8435043126478914, + "language_loss": 0.63040304, + "learning_rate": 3.985626456078777e-06, + "loss": 0.65201473, + "num_input_tokens_seen": 11891310, + "step": 558, + "time_per_iteration": 3.21042537689209 + }, + { + "auxiliary_loss_clip": 0.01230392, + "auxiliary_loss_mlp": 0.01048199, + "balance_loss_clip": 1.06865454, + "balance_loss_mlp": 1.03378665, + "epoch": 0.06721577586725185, + "flos": 11216590997760.0, + "grad_norm": 2.1153278443399293, + "language_loss": 0.86077321, + "learning_rate": 3.985533082728259e-06, + "loss": 0.88355917, + "num_input_tokens_seen": 11906965, + "step": 559, + "time_per_iteration": 2.5211544036865234 + }, + { + "auxiliary_loss_clip": 0.01277783, + "auxiliary_loss_mlp": 0.01039957, + "balance_loss_clip": 1.07251072, + "balance_loss_mlp": 1.02502036, + "epoch": 0.06733601875789094, + "flos": 25922189664000.0, + "grad_norm": 1.8551702649296107, + "language_loss": 0.7479983, + "learning_rate": 3.985439408173951e-06, + "loss": 0.77117574, + "num_input_tokens_seen": 11927190, + "step": 560, + "time_per_iteration": 2.5083250999450684 + }, + { + "auxiliary_loss_clip": 0.01275447, + "auxiliary_loss_mlp": 0.01057057, + "balance_loss_clip": 1.07296538, + "balance_loss_mlp": 1.04212022, + "epoch": 0.06745626164853002, + "flos": 20813645577600.0, + "grad_norm": 1.8145374364877458, + "language_loss": 0.70964801, + "learning_rate": 3.9853454324300634e-06, + "loss": 0.7329731, + "num_input_tokens_seen": 11946400, + "step": 561, + "time_per_iteration": 2.45818829536438 + }, + { + "auxiliary_loss_clip": 0.01201458, + "auxiliary_loss_mlp": 0.01045288, + "balance_loss_clip": 1.06271887, + "balance_loss_mlp": 1.029374, + "epoch": 0.06757650453916912, + "flos": 19829262378240.0, + "grad_norm": 2.199953382781585, + "language_loss": 0.77794826, + "learning_rate": 3.985251155510852e-06, + "loss": 0.80041575, + "num_input_tokens_seen": 11965430, + "step": 562, + "time_per_iteration": 2.630274772644043 + }, + { + "auxiliary_loss_clip": 0.01208754, + "auxiliary_loss_mlp": 0.01044735, + "balance_loss_clip": 1.06740892, + "balance_loss_mlp": 1.02965546, + "epoch": 0.06769674742980822, + "flos": 25739224761600.0, + "grad_norm": 1.8059813711265038, + "language_loss": 0.80422574, + "learning_rate": 3.98515657743062e-06, + "loss": 0.82676065, + "num_input_tokens_seen": 11984895, + "step": 563, + "time_per_iteration": 2.616872549057007 + }, + { + "auxiliary_loss_clip": 0.01234603, + "auxiliary_loss_mlp": 0.01056963, + "balance_loss_clip": 1.06341136, + "balance_loss_mlp": 1.04252684, + "epoch": 0.0678169903204473, + "flos": 13074788355840.0, + "grad_norm": 2.35661166494529, + "language_loss": 0.77772522, + "learning_rate": 3.985061698203711e-06, + "loss": 0.80064088, + "num_input_tokens_seen": 12002010, + "step": 564, + "time_per_iteration": 2.5428597927093506 + }, + { + "auxiliary_loss_clip": 0.01172626, + "auxiliary_loss_mlp": 0.01005691, + "balance_loss_clip": 1.04333103, + "balance_loss_mlp": 1.0006609, + "epoch": 0.0679372332110864, + "flos": 70865830788480.0, + "grad_norm": 0.8860055869026141, + "language_loss": 0.63896382, + "learning_rate": 3.984966517844523e-06, + "loss": 0.66074693, + "num_input_tokens_seen": 12057255, + "step": 565, + "time_per_iteration": 3.0271852016448975 + }, + { + "auxiliary_loss_clip": 0.01274285, + "auxiliary_loss_mlp": 0.01055412, + "balance_loss_clip": 1.07185054, + "balance_loss_mlp": 1.04046309, + "epoch": 0.06805747610172548, + "flos": 28256418990720.0, + "grad_norm": 2.1322148396305614, + "language_loss": 0.80625701, + "learning_rate": 3.984871036367492e-06, + "loss": 0.82955396, + "num_input_tokens_seen": 12077280, + "step": 566, + "time_per_iteration": -0.0996100902557373 + }, + { + "auxiliary_loss_clip": 0.01254252, + "auxiliary_loss_mlp": 0.00767334, + "balance_loss_clip": 1.07010162, + "balance_loss_mlp": 1.00071073, + "epoch": 0.06817771899236458, + "flos": 20120533764480.0, + "grad_norm": 1.7981849832259107, + "language_loss": 0.83176655, + "learning_rate": 3.984775253787102e-06, + "loss": 0.85198241, + "num_input_tokens_seen": 12095570, + "step": 567, + "time_per_iteration": 2.5073535442352295 + }, + { + "auxiliary_loss_clip": 0.0125861, + "auxiliary_loss_mlp": 0.01042873, + "balance_loss_clip": 1.06736815, + "balance_loss_mlp": 1.02871656, + "epoch": 0.06829796188300366, + "flos": 17930629284480.0, + "grad_norm": 3.7409965192458507, + "language_loss": 0.87549865, + "learning_rate": 3.984679170117885e-06, + "loss": 0.89851344, + "num_input_tokens_seen": 12111775, + "step": 568, + "time_per_iteration": 2.491922616958618 + }, + { + "auxiliary_loss_clip": 0.01251909, + "auxiliary_loss_mlp": 0.01045467, + "balance_loss_clip": 1.06656051, + "balance_loss_mlp": 1.0300653, + "epoch": 0.06841820477364276, + "flos": 14501627285760.0, + "grad_norm": 6.223393954593174, + "language_loss": 0.78090847, + "learning_rate": 3.984582785374415e-06, + "loss": 0.80388212, + "num_input_tokens_seen": 12129215, + "step": 569, + "time_per_iteration": 2.479417324066162 + }, + { + "auxiliary_loss_clip": 0.01239543, + "auxiliary_loss_mlp": 0.0076764, + "balance_loss_clip": 1.06864417, + "balance_loss_mlp": 1.00053716, + "epoch": 0.06853844766428185, + "flos": 21938474954880.0, + "grad_norm": 4.814188436663387, + "language_loss": 0.809991, + "learning_rate": 3.9844860995713155e-06, + "loss": 0.83006281, + "num_input_tokens_seen": 12148755, + "step": 570, + "time_per_iteration": 2.582784414291382 + }, + { + "auxiliary_loss_clip": 0.0125671, + "auxiliary_loss_mlp": 0.01041542, + "balance_loss_clip": 1.07528925, + "balance_loss_mlp": 1.0272603, + "epoch": 0.06865869055492094, + "flos": 16800628348800.0, + "grad_norm": 2.215153898739612, + "language_loss": 0.82872438, + "learning_rate": 3.9843891127232524e-06, + "loss": 0.85170686, + "num_input_tokens_seen": 12166290, + "step": 571, + "time_per_iteration": 2.481745719909668 + }, + { + "auxiliary_loss_clip": 0.01194031, + "auxiliary_loss_mlp": 0.01043734, + "balance_loss_clip": 1.05967021, + "balance_loss_mlp": 1.02853441, + "epoch": 0.06877893344556003, + "flos": 19937281553280.0, + "grad_norm": 2.292411694167427, + "language_loss": 0.66955733, + "learning_rate": 3.984291824844938e-06, + "loss": 0.69193494, + "num_input_tokens_seen": 12181385, + "step": 572, + "time_per_iteration": 2.5917859077453613 + }, + { + "auxiliary_loss_clip": 0.0127175, + "auxiliary_loss_mlp": 0.01043498, + "balance_loss_clip": 1.07107544, + "balance_loss_mlp": 1.02894235, + "epoch": 0.06889917633619912, + "flos": 23039388852480.0, + "grad_norm": 2.361216639330607, + "language_loss": 0.84907508, + "learning_rate": 3.984194235951132e-06, + "loss": 0.87222749, + "num_input_tokens_seen": 12197530, + "step": 573, + "time_per_iteration": 2.503679037094116 + }, + { + "auxiliary_loss_clip": 0.01275183, + "auxiliary_loss_mlp": 0.01058248, + "balance_loss_clip": 1.07553101, + "balance_loss_mlp": 1.04456294, + "epoch": 0.06901941922683821, + "flos": 20960556203520.0, + "grad_norm": 3.489538670585963, + "language_loss": 0.84513795, + "learning_rate": 3.9840963460566375e-06, + "loss": 0.86847222, + "num_input_tokens_seen": 12216310, + "step": 574, + "time_per_iteration": 2.4956066608428955 + }, + { + "auxiliary_loss_clip": 0.01177495, + "auxiliary_loss_mlp": 0.01041418, + "balance_loss_clip": 1.05783725, + "balance_loss_mlp": 1.02723837, + "epoch": 0.06913966211747731, + "flos": 24821850384000.0, + "grad_norm": 2.369201798656756, + "language_loss": 0.8944869, + "learning_rate": 3.983998155176305e-06, + "loss": 0.91667604, + "num_input_tokens_seen": 12236670, + "step": 575, + "time_per_iteration": 2.64860463142395 + }, + { + "auxiliary_loss_clip": 0.0116223, + "auxiliary_loss_mlp": 0.01015787, + "balance_loss_clip": 1.03604841, + "balance_loss_mlp": 1.01109052, + "epoch": 0.06925990500811639, + "flos": 58367446957440.0, + "grad_norm": 0.8166091915463882, + "language_loss": 0.57034373, + "learning_rate": 3.9838996633250305e-06, + "loss": 0.59212387, + "num_input_tokens_seen": 12297185, + "step": 576, + "time_per_iteration": 3.021286964416504 + }, + { + "auxiliary_loss_clip": 0.01253268, + "auxiliary_loss_mlp": 0.01045179, + "balance_loss_clip": 1.06705368, + "balance_loss_mlp": 1.03220308, + "epoch": 0.06938014789875549, + "flos": 12749940731520.0, + "grad_norm": 6.761432826544195, + "language_loss": 0.88336289, + "learning_rate": 3.983800870517753e-06, + "loss": 0.90634739, + "num_input_tokens_seen": 12313975, + "step": 577, + "time_per_iteration": 2.493823289871216 + }, + { + "auxiliary_loss_clip": 0.01251581, + "auxiliary_loss_mlp": 0.01048118, + "balance_loss_clip": 1.07296371, + "balance_loss_mlp": 1.03551793, + "epoch": 0.06950039078939457, + "flos": 22820226019200.0, + "grad_norm": 3.2166363284147557, + "language_loss": 0.78451937, + "learning_rate": 3.983701776769463e-06, + "loss": 0.80751634, + "num_input_tokens_seen": 12331385, + "step": 578, + "time_per_iteration": 3.3079283237457275 + }, + { + "auxiliary_loss_clip": 0.01247262, + "auxiliary_loss_mlp": 0.0104595, + "balance_loss_clip": 1.07032847, + "balance_loss_mlp": 1.03132272, + "epoch": 0.06962063368003367, + "flos": 21941348042880.0, + "grad_norm": 3.430125229739833, + "language_loss": 0.8565371, + "learning_rate": 3.9836023820951885e-06, + "loss": 0.87946916, + "num_input_tokens_seen": 12350600, + "step": 579, + "time_per_iteration": 2.5063302516937256 + }, + { + "auxiliary_loss_clip": 0.01214431, + "auxiliary_loss_mlp": 0.01049294, + "balance_loss_clip": 1.05988836, + "balance_loss_mlp": 1.03653216, + "epoch": 0.06974087657067275, + "flos": 20706021452160.0, + "grad_norm": 1.9818834549219564, + "language_loss": 0.68316436, + "learning_rate": 3.983502686510011e-06, + "loss": 0.70580167, + "num_input_tokens_seen": 12371430, + "step": 580, + "time_per_iteration": 2.596201181411743 + }, + { + "auxiliary_loss_clip": 0.01255122, + "auxiliary_loss_mlp": 0.00766692, + "balance_loss_clip": 1.06587946, + "balance_loss_mlp": 1.00071025, + "epoch": 0.06986111946131185, + "flos": 22638230784000.0, + "grad_norm": 1.9736432947173932, + "language_loss": 0.73475456, + "learning_rate": 3.9834026900290525e-06, + "loss": 0.75497276, + "num_input_tokens_seen": 12390825, + "step": 581, + "time_per_iteration": 3.2935590744018555 + }, + { + "auxiliary_loss_clip": 0.01269472, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_clip": 1.07030118, + "balance_loss_mlp": 1.0328958, + "epoch": 0.06998136235195095, + "flos": 26943453152640.0, + "grad_norm": 2.025539430717587, + "language_loss": 1.00608838, + "learning_rate": 3.983302392667482e-06, + "loss": 1.02924669, + "num_input_tokens_seen": 12411670, + "step": 582, + "time_per_iteration": 2.525470733642578 + }, + { + "auxiliary_loss_clip": 0.01253825, + "auxiliary_loss_mlp": 0.01043795, + "balance_loss_clip": 1.07176614, + "balance_loss_mlp": 1.03021717, + "epoch": 0.07010160524259003, + "flos": 22492505306880.0, + "grad_norm": 1.903341177924393, + "language_loss": 0.93781245, + "learning_rate": 3.983201794440517e-06, + "loss": 0.96078867, + "num_input_tokens_seen": 12431245, + "step": 583, + "time_per_iteration": 3.2532238960266113 + }, + { + "auxiliary_loss_clip": 0.01225582, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.06505036, + "balance_loss_mlp": 1.02840543, + "epoch": 0.07022184813322913, + "flos": 18332541538560.0, + "grad_norm": 1.965047914082079, + "language_loss": 0.67302555, + "learning_rate": 3.9831008953634165e-06, + "loss": 0.69570148, + "num_input_tokens_seen": 12450535, + "step": 584, + "time_per_iteration": 3.3353402614593506 + }, + { + "auxiliary_loss_clip": 0.01188087, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_clip": 1.05708861, + "balance_loss_mlp": 1.03287339, + "epoch": 0.07034209102386821, + "flos": 24675550289280.0, + "grad_norm": 1.8965908224355559, + "language_loss": 0.81455553, + "learning_rate": 3.9829996954514864e-06, + "loss": 0.83691311, + "num_input_tokens_seen": 12469675, + "step": 585, + "time_per_iteration": 2.609774589538574 + }, + { + "auxiliary_loss_clip": 0.01242863, + "auxiliary_loss_mlp": 0.01048071, + "balance_loss_clip": 1.0672735, + "balance_loss_mlp": 1.03324175, + "epoch": 0.0704623339145073, + "flos": 25995878415360.0, + "grad_norm": 1.8351875711734102, + "language_loss": 0.84405535, + "learning_rate": 3.982898194720079e-06, + "loss": 0.8669647, + "num_input_tokens_seen": 12490405, + "step": 586, + "time_per_iteration": 2.5587968826293945 + }, + { + "auxiliary_loss_clip": 0.01236066, + "auxiliary_loss_mlp": 0.00767026, + "balance_loss_clip": 1.07100987, + "balance_loss_mlp": 1.00070858, + "epoch": 0.0705825768051464, + "flos": 25338318088320.0, + "grad_norm": 3.1091303002851034, + "language_loss": 0.82351351, + "learning_rate": 3.982796393184592e-06, + "loss": 0.84354442, + "num_input_tokens_seen": 12509485, + "step": 587, + "time_per_iteration": 2.5775508880615234 + }, + { + "auxiliary_loss_clip": 0.01141397, + "auxiliary_loss_mlp": 0.01007917, + "balance_loss_clip": 1.02991807, + "balance_loss_mlp": 1.00171852, + "epoch": 0.07070281969578548, + "flos": 66047552507520.0, + "grad_norm": 0.8739035214160116, + "language_loss": 0.6262874, + "learning_rate": 3.98269429086047e-06, + "loss": 0.64778054, + "num_input_tokens_seen": 12567325, + "step": 588, + "time_per_iteration": 2.9829788208007812 + }, + { + "auxiliary_loss_clip": 0.01226854, + "auxiliary_loss_mlp": 0.01051617, + "balance_loss_clip": 1.06749415, + "balance_loss_mlp": 1.03661454, + "epoch": 0.07082306258642458, + "flos": 23653568528640.0, + "grad_norm": 2.7252312543429555, + "language_loss": 0.86339498, + "learning_rate": 3.982591887763199e-06, + "loss": 0.88617969, + "num_input_tokens_seen": 12584785, + "step": 589, + "time_per_iteration": 2.590914011001587 + }, + { + "auxiliary_loss_clip": 0.01198829, + "auxiliary_loss_mlp": 0.01041922, + "balance_loss_clip": 1.05640483, + "balance_loss_mlp": 1.02701521, + "epoch": 0.07094330547706366, + "flos": 13880049408000.0, + "grad_norm": 2.4912160974803315, + "language_loss": 0.82054871, + "learning_rate": 3.982489183908316e-06, + "loss": 0.84295619, + "num_input_tokens_seen": 12601205, + "step": 590, + "time_per_iteration": 2.5865674018859863 + }, + { + "auxiliary_loss_clip": 0.01161136, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.05060363, + "balance_loss_mlp": 1.02661371, + "epoch": 0.07106354836770276, + "flos": 24645098534400.0, + "grad_norm": 1.7999102299512741, + "language_loss": 0.84534371, + "learning_rate": 3.982386179311399e-06, + "loss": 0.86734718, + "num_input_tokens_seen": 12621725, + "step": 591, + "time_per_iteration": 2.660905599594116 + }, + { + "auxiliary_loss_clip": 0.01257919, + "auxiliary_loss_mlp": 0.01051257, + "balance_loss_clip": 1.07138801, + "balance_loss_mlp": 1.03511596, + "epoch": 0.07118379125834184, + "flos": 16217223649920.0, + "grad_norm": 2.3685187786420947, + "language_loss": 0.87865996, + "learning_rate": 3.982282873988075e-06, + "loss": 0.90175176, + "num_input_tokens_seen": 12639600, + "step": 592, + "time_per_iteration": 2.517122983932495 + }, + { + "auxiliary_loss_clip": 0.01237926, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.06898308, + "balance_loss_mlp": 1.02842689, + "epoch": 0.07130403414898094, + "flos": 19719986227200.0, + "grad_norm": 2.8441111572103104, + "language_loss": 0.86998785, + "learning_rate": 3.982179267954016e-06, + "loss": 0.89277911, + "num_input_tokens_seen": 12660030, + "step": 593, + "time_per_iteration": 2.561278820037842 + }, + { + "auxiliary_loss_clip": 0.01268656, + "auxiliary_loss_mlp": 0.01039717, + "balance_loss_clip": 1.07086408, + "balance_loss_mlp": 1.02579904, + "epoch": 0.07142427703962004, + "flos": 21871933009920.0, + "grad_norm": 2.275703657327822, + "language_loss": 0.95681381, + "learning_rate": 3.982075361224937e-06, + "loss": 0.97989756, + "num_input_tokens_seen": 12678395, + "step": 594, + "time_per_iteration": 2.5042481422424316 + }, + { + "auxiliary_loss_clip": 0.01248521, + "auxiliary_loss_mlp": 0.00766163, + "balance_loss_clip": 1.06968391, + "balance_loss_mlp": 1.00067043, + "epoch": 0.07154451993025912, + "flos": 18296595002880.0, + "grad_norm": 1.924957472511983, + "language_loss": 0.87866783, + "learning_rate": 3.981971153816602e-06, + "loss": 0.89881468, + "num_input_tokens_seen": 12696000, + "step": 595, + "time_per_iteration": 2.4784765243530273 + }, + { + "auxiliary_loss_clip": 0.01269211, + "auxiliary_loss_mlp": 0.01043313, + "balance_loss_clip": 1.07485485, + "balance_loss_mlp": 1.03071213, + "epoch": 0.07166476282089822, + "flos": 22160690444160.0, + "grad_norm": 1.8324061791930113, + "language_loss": 0.96278226, + "learning_rate": 3.981866645744819e-06, + "loss": 0.98590755, + "num_input_tokens_seen": 12716715, + "step": 596, + "time_per_iteration": 2.5105478763580322 + }, + { + "auxiliary_loss_clip": 0.01271542, + "auxiliary_loss_mlp": 0.00767079, + "balance_loss_clip": 1.07350099, + "balance_loss_mlp": 1.0006156, + "epoch": 0.0717850057115373, + "flos": 14136343925760.0, + "grad_norm": 3.1165461196978277, + "language_loss": 0.81559205, + "learning_rate": 3.9817618370254416e-06, + "loss": 0.83597827, + "num_input_tokens_seen": 12733370, + "step": 597, + "time_per_iteration": 2.440387487411499 + }, + { + "auxiliary_loss_clip": 0.0127107, + "auxiliary_loss_mlp": 0.01052548, + "balance_loss_clip": 1.07327056, + "balance_loss_mlp": 1.03871346, + "epoch": 0.0719052486021764, + "flos": 30917794412160.0, + "grad_norm": 3.3152411939154907, + "language_loss": 0.87191731, + "learning_rate": 3.9816567276743684e-06, + "loss": 0.8951534, + "num_input_tokens_seen": 12753235, + "step": 598, + "time_per_iteration": 2.5943191051483154 + }, + { + "auxiliary_loss_clip": 0.0123293, + "auxiliary_loss_mlp": 0.01041794, + "balance_loss_clip": 1.06912899, + "balance_loss_mlp": 1.02810895, + "epoch": 0.0720254914928155, + "flos": 21287019939840.0, + "grad_norm": 1.86353522370305, + "language_loss": 0.77457869, + "learning_rate": 3.9815513177075466e-06, + "loss": 0.79732597, + "num_input_tokens_seen": 12772020, + "step": 599, + "time_per_iteration": 2.536543607711792 + }, + { + "auxiliary_loss_clip": 0.0124198, + "auxiliary_loss_mlp": 0.01043083, + "balance_loss_clip": 1.06771934, + "balance_loss_mlp": 1.03133535, + "epoch": 0.07214573438345458, + "flos": 27819170732160.0, + "grad_norm": 1.7020650396036558, + "language_loss": 0.70228499, + "learning_rate": 3.9814456071409646e-06, + "loss": 0.72513556, + "num_input_tokens_seen": 12792555, + "step": 600, + "time_per_iteration": 2.576333522796631 + }, + { + "auxiliary_loss_clip": 0.01209112, + "auxiliary_loss_mlp": 0.01053501, + "balance_loss_clip": 1.06565893, + "balance_loss_mlp": 1.03916037, + "epoch": 0.07226597727409367, + "flos": 25483576688640.0, + "grad_norm": 3.3486157906793093, + "language_loss": 0.85209084, + "learning_rate": 3.981339595990659e-06, + "loss": 0.874717, + "num_input_tokens_seen": 12811085, + "step": 601, + "time_per_iteration": 2.6185553073883057 + }, + { + "auxiliary_loss_clip": 0.0125316, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_clip": 1.07135034, + "balance_loss_mlp": 1.03764534, + "epoch": 0.07238622016473276, + "flos": 23513840622720.0, + "grad_norm": 2.028034811737701, + "language_loss": 0.81249613, + "learning_rate": 3.981233284272713e-06, + "loss": 0.83555061, + "num_input_tokens_seen": 12830830, + "step": 602, + "time_per_iteration": 2.5428874492645264 + }, + { + "auxiliary_loss_clip": 0.01220404, + "auxiliary_loss_mlp": 0.0104311, + "balance_loss_clip": 1.06490648, + "balance_loss_mlp": 1.03072464, + "epoch": 0.07250646305537185, + "flos": 25453519983360.0, + "grad_norm": 1.5743735442909974, + "language_loss": 0.89888448, + "learning_rate": 3.981126672003253e-06, + "loss": 0.92151964, + "num_input_tokens_seen": 12853505, + "step": 603, + "time_per_iteration": 2.6418471336364746 + }, + { + "auxiliary_loss_clip": 0.01240779, + "auxiliary_loss_mlp": 0.01049254, + "balance_loss_clip": 1.06516051, + "balance_loss_mlp": 1.03604579, + "epoch": 0.07262670594601094, + "flos": 27155038216320.0, + "grad_norm": 2.80538979617731, + "language_loss": 0.77797651, + "learning_rate": 3.981019759198451e-06, + "loss": 0.80087686, + "num_input_tokens_seen": 12872455, + "step": 604, + "time_per_iteration": 2.6096601486206055 + }, + { + "auxiliary_loss_clip": 0.01237259, + "auxiliary_loss_mlp": 0.01046776, + "balance_loss_clip": 1.06872606, + "balance_loss_mlp": 1.03304291, + "epoch": 0.07274694883665003, + "flos": 26651607148800.0, + "grad_norm": 2.083370041292177, + "language_loss": 0.84311461, + "learning_rate": 3.980912545874528e-06, + "loss": 0.86595488, + "num_input_tokens_seen": 12892620, + "step": 605, + "time_per_iteration": 3.324244737625122 + }, + { + "auxiliary_loss_clip": 0.01248193, + "auxiliary_loss_mlp": 0.00766413, + "balance_loss_clip": 1.06985664, + "balance_loss_mlp": 1.00062621, + "epoch": 0.07286719172728913, + "flos": 29862344154240.0, + "grad_norm": 2.2974032620622955, + "language_loss": 0.85555762, + "learning_rate": 3.980805032047746e-06, + "loss": 0.87570369, + "num_input_tokens_seen": 12914090, + "step": 606, + "time_per_iteration": 2.6336658000946045 + }, + { + "auxiliary_loss_clip": 0.01231032, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.06498444, + "balance_loss_mlp": 1.02895033, + "epoch": 0.07298743461792821, + "flos": 17382057799680.0, + "grad_norm": 1.9626911701846654, + "language_loss": 0.80716771, + "learning_rate": 3.980697217734415e-06, + "loss": 0.82991904, + "num_input_tokens_seen": 12931830, + "step": 607, + "time_per_iteration": 2.488373279571533 + }, + { + "auxiliary_loss_clip": 0.01206917, + "auxiliary_loss_mlp": 0.00765959, + "balance_loss_clip": 1.06505966, + "balance_loss_mlp": 1.00058341, + "epoch": 0.07310767750856731, + "flos": 19498201701120.0, + "grad_norm": 3.7311027606183855, + "language_loss": 0.91979587, + "learning_rate": 3.980589102950891e-06, + "loss": 0.93952465, + "num_input_tokens_seen": 12949995, + "step": 608, + "time_per_iteration": 3.3832359313964844 + }, + { + "auxiliary_loss_clip": 0.01237138, + "auxiliary_loss_mlp": 0.01043414, + "balance_loss_clip": 1.07181263, + "balance_loss_mlp": 1.0295198, + "epoch": 0.07322792039920639, + "flos": 29168693637120.0, + "grad_norm": 3.5414903203428802, + "language_loss": 0.76047254, + "learning_rate": 3.9804806877135755e-06, + "loss": 0.78327805, + "num_input_tokens_seen": 12968040, + "step": 609, + "time_per_iteration": 2.579653739929199 + }, + { + "auxiliary_loss_clip": 0.01257712, + "auxiliary_loss_mlp": 0.00766879, + "balance_loss_clip": 1.07002616, + "balance_loss_mlp": 1.00067878, + "epoch": 0.07334816328984549, + "flos": 23477822259840.0, + "grad_norm": 2.4127111431619963, + "language_loss": 0.86277318, + "learning_rate": 3.980371972038915e-06, + "loss": 0.88301915, + "num_input_tokens_seen": 12988530, + "step": 610, + "time_per_iteration": 3.3197388648986816 + }, + { + "auxiliary_loss_clip": 0.01272075, + "auxiliary_loss_mlp": 0.01046278, + "balance_loss_clip": 1.07532477, + "balance_loss_mlp": 1.03253925, + "epoch": 0.07346840618048459, + "flos": 22962467877120.0, + "grad_norm": 1.6893041742634987, + "language_loss": 0.84275824, + "learning_rate": 3.980262955943399e-06, + "loss": 0.86594176, + "num_input_tokens_seen": 13008195, + "step": 611, + "time_per_iteration": 3.2703471183776855 + }, + { + "auxiliary_loss_clip": 0.01230006, + "auxiliary_loss_mlp": 0.01044303, + "balance_loss_clip": 1.07071805, + "balance_loss_mlp": 1.03189933, + "epoch": 0.07358864907112367, + "flos": 17673903803520.0, + "grad_norm": 3.1165440414144663, + "language_loss": 0.86639142, + "learning_rate": 3.980153639443569e-06, + "loss": 0.88913447, + "num_input_tokens_seen": 13024180, + "step": 612, + "time_per_iteration": 2.5184671878814697 + }, + { + "auxiliary_loss_clip": 0.01241966, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_clip": 1.07194614, + "balance_loss_mlp": 1.0316087, + "epoch": 0.07370889196176277, + "flos": 24097029840000.0, + "grad_norm": 2.5367567166260776, + "language_loss": 0.8017149, + "learning_rate": 3.980044022556005e-06, + "loss": 0.82458395, + "num_input_tokens_seen": 13043865, + "step": 613, + "time_per_iteration": 2.5901529788970947 + }, + { + "auxiliary_loss_clip": 0.01253077, + "auxiliary_loss_mlp": 0.0105174, + "balance_loss_clip": 1.07130587, + "balance_loss_mlp": 1.0389961, + "epoch": 0.07382913485240185, + "flos": 25885919905920.0, + "grad_norm": 2.4745886295074797, + "language_loss": 0.72921932, + "learning_rate": 3.9799341052973375e-06, + "loss": 0.75226754, + "num_input_tokens_seen": 13063700, + "step": 614, + "time_per_iteration": 2.565976619720459 + }, + { + "auxiliary_loss_clip": 0.01237533, + "auxiliary_loss_mlp": 0.01042192, + "balance_loss_clip": 1.0728116, + "balance_loss_mlp": 1.02802956, + "epoch": 0.07394937774304094, + "flos": 16873850223360.0, + "grad_norm": 2.2735253609420023, + "language_loss": 0.74838507, + "learning_rate": 3.979823887684241e-06, + "loss": 0.7711823, + "num_input_tokens_seen": 13082640, + "step": 615, + "time_per_iteration": 2.516683578491211 + }, + { + "auxiliary_loss_clip": 0.01268209, + "auxiliary_loss_mlp": 0.0105038, + "balance_loss_clip": 1.07404709, + "balance_loss_mlp": 1.03670061, + "epoch": 0.07406962063368003, + "flos": 20703471586560.0, + "grad_norm": 2.59546602453547, + "language_loss": 0.84511334, + "learning_rate": 3.979713369733434e-06, + "loss": 0.86829925, + "num_input_tokens_seen": 13100505, + "step": 616, + "time_per_iteration": 2.5011987686157227 + }, + { + "auxiliary_loss_clip": 0.01250576, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_clip": 1.07286167, + "balance_loss_mlp": 1.04241204, + "epoch": 0.07418986352431912, + "flos": 21430985650560.0, + "grad_norm": 2.0100048863916617, + "language_loss": 0.84780967, + "learning_rate": 3.979602551461683e-06, + "loss": 0.87087387, + "num_input_tokens_seen": 13121285, + "step": 617, + "time_per_iteration": 2.5173001289367676 + }, + { + "auxiliary_loss_clip": 0.01233044, + "auxiliary_loss_mlp": 0.01042017, + "balance_loss_clip": 1.07031727, + "balance_loss_mlp": 1.02895117, + "epoch": 0.07431010641495822, + "flos": 12021133777920.0, + "grad_norm": 2.311072150115722, + "language_loss": 0.91766834, + "learning_rate": 3.979491432885799e-06, + "loss": 0.94041896, + "num_input_tokens_seen": 13137550, + "step": 618, + "time_per_iteration": 2.509082317352295 + }, + { + "auxiliary_loss_clip": 0.0119907, + "auxiliary_loss_mlp": 0.00765776, + "balance_loss_clip": 1.06079316, + "balance_loss_mlp": 1.00052238, + "epoch": 0.0744303493055973, + "flos": 20957575374720.0, + "grad_norm": 1.9681828294927735, + "language_loss": 0.82969439, + "learning_rate": 3.97938001402264e-06, + "loss": 0.84934282, + "num_input_tokens_seen": 13156675, + "step": 619, + "time_per_iteration": 2.591073513031006 + }, + { + "auxiliary_loss_clip": 0.01213567, + "auxiliary_loss_mlp": 0.01042528, + "balance_loss_clip": 1.06728375, + "balance_loss_mlp": 1.02960551, + "epoch": 0.0745505921962364, + "flos": 16253134272000.0, + "grad_norm": 2.735199400532606, + "language_loss": 0.80032378, + "learning_rate": 3.979268294889105e-06, + "loss": 0.82288474, + "num_input_tokens_seen": 13172225, + "step": 620, + "time_per_iteration": 2.538281202316284 + }, + { + "auxiliary_loss_clip": 0.01269382, + "auxiliary_loss_mlp": 0.01043928, + "balance_loss_clip": 1.07417536, + "balance_loss_mlp": 1.03079152, + "epoch": 0.07467083508687548, + "flos": 50944635550080.0, + "grad_norm": 1.752241703258604, + "language_loss": 0.74229091, + "learning_rate": 3.979156275502143e-06, + "loss": 0.76542401, + "num_input_tokens_seen": 13195885, + "step": 621, + "time_per_iteration": 2.754913568496704 + }, + { + "auxiliary_loss_clip": 0.01221917, + "auxiliary_loss_mlp": 0.01054653, + "balance_loss_clip": 1.06756699, + "balance_loss_mlp": 1.04061604, + "epoch": 0.07479107797751458, + "flos": 17529686697600.0, + "grad_norm": 2.253212484574021, + "language_loss": 0.91155332, + "learning_rate": 3.979043955878749e-06, + "loss": 0.93431902, + "num_input_tokens_seen": 13213730, + "step": 622, + "time_per_iteration": 2.5697338581085205 + }, + { + "auxiliary_loss_clip": 0.01234333, + "auxiliary_loss_mlp": 0.01042898, + "balance_loss_clip": 1.06975865, + "balance_loss_mlp": 1.03004074, + "epoch": 0.07491132086815366, + "flos": 23473943591040.0, + "grad_norm": 2.0408447196345407, + "language_loss": 0.82979482, + "learning_rate": 3.978931336035959e-06, + "loss": 0.85256714, + "num_input_tokens_seen": 13232540, + "step": 623, + "time_per_iteration": 2.541131019592285 + }, + { + "auxiliary_loss_clip": 0.01253329, + "auxiliary_loss_mlp": 0.01052081, + "balance_loss_clip": 1.07354045, + "balance_loss_mlp": 1.03825843, + "epoch": 0.07503156375879276, + "flos": 20157557708160.0, + "grad_norm": 2.344849952803113, + "language_loss": 0.82103324, + "learning_rate": 3.9788184159908595e-06, + "loss": 0.84408724, + "num_input_tokens_seen": 13249670, + "step": 624, + "time_per_iteration": 2.5124425888061523 + }, + { + "auxiliary_loss_clip": 0.0123016, + "auxiliary_loss_mlp": 0.01051643, + "balance_loss_clip": 1.06833708, + "balance_loss_mlp": 1.03858936, + "epoch": 0.07515180664943186, + "flos": 15115519653120.0, + "grad_norm": 4.883393479485179, + "language_loss": 0.8279621, + "learning_rate": 3.97870519576058e-06, + "loss": 0.85078007, + "num_input_tokens_seen": 13266095, + "step": 625, + "time_per_iteration": 2.503430128097534 + }, + { + "auxiliary_loss_clip": 0.01218874, + "auxiliary_loss_mlp": 0.00766872, + "balance_loss_clip": 1.06666958, + "balance_loss_mlp": 1.00045872, + "epoch": 0.07527204954007094, + "flos": 21287702298240.0, + "grad_norm": 3.307542888410078, + "language_loss": 0.8103677, + "learning_rate": 3.978591675362295e-06, + "loss": 0.83022523, + "num_input_tokens_seen": 13284810, + "step": 626, + "time_per_iteration": 2.6071343421936035 + }, + { + "auxiliary_loss_clip": 0.01201897, + "auxiliary_loss_mlp": 0.01041637, + "balance_loss_clip": 1.06914926, + "balance_loss_mlp": 1.02827978, + "epoch": 0.07539229243071004, + "flos": 21324187537920.0, + "grad_norm": 2.024827495073086, + "language_loss": 0.87505549, + "learning_rate": 3.978477854813226e-06, + "loss": 0.89749086, + "num_input_tokens_seen": 13304150, + "step": 627, + "time_per_iteration": 2.587711811065674 + }, + { + "auxiliary_loss_clip": 0.01253361, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.07049382, + "balance_loss_mlp": 1.0338124, + "epoch": 0.07551253532134912, + "flos": 13042540920960.0, + "grad_norm": 1.960794779852757, + "language_loss": 0.82389569, + "learning_rate": 3.97836373413064e-06, + "loss": 0.84689236, + "num_input_tokens_seen": 13322205, + "step": 628, + "time_per_iteration": 2.507089853286743 + }, + { + "auxiliary_loss_clip": 0.01265494, + "auxiliary_loss_mlp": 0.01046427, + "balance_loss_clip": 1.07051754, + "balance_loss_mlp": 1.03272939, + "epoch": 0.07563277821198822, + "flos": 19208761908480.0, + "grad_norm": 2.0661290998230575, + "language_loss": 0.74473584, + "learning_rate": 3.978249313331848e-06, + "loss": 0.76785505, + "num_input_tokens_seen": 13340435, + "step": 629, + "time_per_iteration": 2.4598329067230225 + }, + { + "auxiliary_loss_clip": 0.01257746, + "auxiliary_loss_mlp": 0.00766426, + "balance_loss_clip": 1.06975985, + "balance_loss_mlp": 1.00034499, + "epoch": 0.07575302110262731, + "flos": 19537200892800.0, + "grad_norm": 3.783015735886524, + "language_loss": 0.62065399, + "learning_rate": 3.978134592434208e-06, + "loss": 0.64089572, + "num_input_tokens_seen": 13358185, + "step": 630, + "time_per_iteration": 2.5049517154693604 + }, + { + "auxiliary_loss_clip": 0.01089447, + "auxiliary_loss_mlp": 0.01007085, + "balance_loss_clip": 1.02336013, + "balance_loss_mlp": 1.00205469, + "epoch": 0.0758732639932664, + "flos": 67961808017280.0, + "grad_norm": 1.0236118675322414, + "language_loss": 0.59456491, + "learning_rate": 3.978019571455123e-06, + "loss": 0.61553025, + "num_input_tokens_seen": 13410130, + "step": 631, + "time_per_iteration": 3.8852829933166504 + }, + { + "auxiliary_loss_clip": 0.01266826, + "auxiliary_loss_mlp": 0.01043326, + "balance_loss_clip": 1.07345462, + "balance_loss_mlp": 1.03084493, + "epoch": 0.07599350688390549, + "flos": 18989204025600.0, + "grad_norm": 2.05169153022908, + "language_loss": 0.84199613, + "learning_rate": 3.977904250412042e-06, + "loss": 0.86509764, + "num_input_tokens_seen": 13429085, + "step": 632, + "time_per_iteration": 2.4542248249053955 + }, + { + "auxiliary_loss_clip": 0.01241861, + "auxiliary_loss_mlp": 0.01045002, + "balance_loss_clip": 1.07061589, + "balance_loss_mlp": 1.0320797, + "epoch": 0.07611374977454458, + "flos": 21069006341760.0, + "grad_norm": 2.426035655648976, + "language_loss": 0.85729426, + "learning_rate": 3.97778862932246e-06, + "loss": 0.88016295, + "num_input_tokens_seen": 13446250, + "step": 633, + "time_per_iteration": 2.564500331878662 + }, + { + "auxiliary_loss_clip": 0.01133005, + "auxiliary_loss_mlp": 0.01041848, + "balance_loss_clip": 1.04840839, + "balance_loss_mlp": 1.02885413, + "epoch": 0.07623399266518367, + "flos": 18514536773760.0, + "grad_norm": 2.2257643471042234, + "language_loss": 0.94418752, + "learning_rate": 3.9776727082039144e-06, + "loss": 0.96593601, + "num_input_tokens_seen": 13463220, + "step": 634, + "time_per_iteration": 2.8990678787231445 + }, + { + "auxiliary_loss_clip": 0.0114955, + "auxiliary_loss_mlp": 0.01002838, + "balance_loss_clip": 1.02596307, + "balance_loss_mlp": 0.99792641, + "epoch": 0.07635423555582276, + "flos": 44663036077440.0, + "grad_norm": 0.7980920223679812, + "language_loss": 0.55489314, + "learning_rate": 3.977556487073991e-06, + "loss": 0.57641697, + "num_input_tokens_seen": 13517775, + "step": 635, + "time_per_iteration": 4.1852662563323975 + }, + { + "auxiliary_loss_clip": 0.0122628, + "auxiliary_loss_mlp": 0.01050058, + "balance_loss_clip": 1.06276739, + "balance_loss_mlp": 1.03792262, + "epoch": 0.07647447844646185, + "flos": 21761148487680.0, + "grad_norm": 1.7084143987425502, + "language_loss": 0.81499958, + "learning_rate": 3.97743996595032e-06, + "loss": 0.83776295, + "num_input_tokens_seen": 13537815, + "step": 636, + "time_per_iteration": 3.2983286380767822 + }, + { + "auxiliary_loss_clip": 0.01265889, + "auxiliary_loss_mlp": 0.0104653, + "balance_loss_clip": 1.07175374, + "balance_loss_mlp": 1.03213513, + "epoch": 0.07659472133710095, + "flos": 23806799948160.0, + "grad_norm": 1.6306251367238913, + "language_loss": 0.81470346, + "learning_rate": 3.9773231448505804e-06, + "loss": 0.83782768, + "num_input_tokens_seen": 13559605, + "step": 637, + "time_per_iteration": 3.2728660106658936 + }, + { + "auxiliary_loss_clip": 0.01232953, + "auxiliary_loss_mlp": 0.00767342, + "balance_loss_clip": 1.06926394, + "balance_loss_mlp": 1.00034535, + "epoch": 0.07671496422774003, + "flos": 21469984842240.0, + "grad_norm": 2.051403575368876, + "language_loss": 0.7820642, + "learning_rate": 3.977206023792491e-06, + "loss": 0.80206716, + "num_input_tokens_seen": 13579495, + "step": 638, + "time_per_iteration": 2.535459041595459 + }, + { + "auxiliary_loss_clip": 0.01253053, + "auxiliary_loss_mlp": 0.01058006, + "balance_loss_clip": 1.07437229, + "balance_loss_mlp": 1.04497027, + "epoch": 0.07683520711837913, + "flos": 16980971558400.0, + "grad_norm": 2.2775248940457655, + "language_loss": 0.80865395, + "learning_rate": 3.97708860279382e-06, + "loss": 0.83176452, + "num_input_tokens_seen": 13597605, + "step": 639, + "time_per_iteration": 2.4955267906188965 + }, + { + "auxiliary_loss_clip": 0.0121536, + "auxiliary_loss_mlp": 0.01049262, + "balance_loss_clip": 1.0637759, + "balance_loss_mlp": 1.03542185, + "epoch": 0.07695545000901821, + "flos": 23476744851840.0, + "grad_norm": 2.3296576645198104, + "language_loss": 0.78222764, + "learning_rate": 3.97697088187238e-06, + "loss": 0.80487394, + "num_input_tokens_seen": 13618120, + "step": 640, + "time_per_iteration": 2.5777978897094727 + }, + { + "auxiliary_loss_clip": 0.01231161, + "auxiliary_loss_mlp": 0.01046025, + "balance_loss_clip": 1.06997609, + "balance_loss_mlp": 1.03326964, + "epoch": 0.07707569289965731, + "flos": 17634258167040.0, + "grad_norm": 2.2556149562919448, + "language_loss": 0.91846919, + "learning_rate": 3.976852861046029e-06, + "loss": 0.94124103, + "num_input_tokens_seen": 13634735, + "step": 641, + "time_per_iteration": 2.507089614868164 + }, + { + "auxiliary_loss_clip": 0.01204242, + "auxiliary_loss_mlp": 0.01043932, + "balance_loss_clip": 1.06541526, + "balance_loss_mlp": 1.03051496, + "epoch": 0.0771959357902964, + "flos": 25775674087680.0, + "grad_norm": 1.5421666084530907, + "language_loss": 0.80240917, + "learning_rate": 3.97673454033267e-06, + "loss": 0.82489091, + "num_input_tokens_seen": 13656835, + "step": 642, + "time_per_iteration": 2.6390554904937744 + }, + { + "auxiliary_loss_clip": 0.01231412, + "auxiliary_loss_mlp": 0.01049859, + "balance_loss_clip": 1.06470287, + "balance_loss_mlp": 1.03607857, + "epoch": 0.07731617868093549, + "flos": 19828651847040.0, + "grad_norm": 2.165052851784037, + "language_loss": 0.82754278, + "learning_rate": 3.976615919750254e-06, + "loss": 0.85035551, + "num_input_tokens_seen": 13674535, + "step": 643, + "time_per_iteration": 2.5104119777679443 + }, + { + "auxiliary_loss_clip": 0.01247898, + "auxiliary_loss_mlp": 0.01050748, + "balance_loss_clip": 1.06959534, + "balance_loss_mlp": 1.03574014, + "epoch": 0.07743642157157458, + "flos": 21324654414720.0, + "grad_norm": 2.0086712302597927, + "language_loss": 0.86810112, + "learning_rate": 3.976496999316775e-06, + "loss": 0.89108759, + "num_input_tokens_seen": 13693290, + "step": 644, + "time_per_iteration": 2.492023229598999 + }, + { + "auxiliary_loss_clip": 0.01232366, + "auxiliary_loss_mlp": 0.01049275, + "balance_loss_clip": 1.0721066, + "balance_loss_mlp": 1.03532767, + "epoch": 0.07755666446221367, + "flos": 19969133938560.0, + "grad_norm": 1.9860446768941096, + "language_loss": 0.83979756, + "learning_rate": 3.976377779050271e-06, + "loss": 0.86261398, + "num_input_tokens_seen": 13711420, + "step": 645, + "time_per_iteration": 2.51507306098938 + }, + { + "auxiliary_loss_clip": 0.01241672, + "auxiliary_loss_mlp": 0.01051045, + "balance_loss_clip": 1.06776524, + "balance_loss_mlp": 1.03768742, + "epoch": 0.07767690735285276, + "flos": 23623224514560.0, + "grad_norm": 2.8947555746041225, + "language_loss": 0.84126723, + "learning_rate": 3.976258258968831e-06, + "loss": 0.86419439, + "num_input_tokens_seen": 13729965, + "step": 646, + "time_per_iteration": 2.598907947540283 + }, + { + "auxiliary_loss_clip": 0.01216161, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_clip": 1.06792367, + "balance_loss_mlp": 1.03763378, + "epoch": 0.07779715024349185, + "flos": 22236246702720.0, + "grad_norm": 2.311112799936636, + "language_loss": 0.74249607, + "learning_rate": 3.976138439090583e-06, + "loss": 0.76516056, + "num_input_tokens_seen": 13748045, + "step": 647, + "time_per_iteration": 2.5663063526153564 + }, + { + "auxiliary_loss_clip": 0.01221978, + "auxiliary_loss_mlp": 0.01040635, + "balance_loss_clip": 1.06928957, + "balance_loss_mlp": 1.02684283, + "epoch": 0.07791739313413094, + "flos": 20955097336320.0, + "grad_norm": 2.8317291925798287, + "language_loss": 0.85155499, + "learning_rate": 3.976018319433706e-06, + "loss": 0.87418103, + "num_input_tokens_seen": 13765590, + "step": 648, + "time_per_iteration": 2.5512115955352783 + }, + { + "auxiliary_loss_clip": 0.01246997, + "auxiliary_loss_mlp": 0.01047378, + "balance_loss_clip": 1.06928992, + "balance_loss_mlp": 1.03431821, + "epoch": 0.07803763602477004, + "flos": 19312327797120.0, + "grad_norm": 3.0551522351739577, + "language_loss": 0.91304576, + "learning_rate": 3.9758979000164205e-06, + "loss": 0.93598956, + "num_input_tokens_seen": 13782410, + "step": 649, + "time_per_iteration": 2.486398458480835 + }, + { + "auxiliary_loss_clip": 0.01223484, + "auxiliary_loss_mlp": 0.01038711, + "balance_loss_clip": 1.06832278, + "balance_loss_mlp": 1.02466822, + "epoch": 0.07815787891540912, + "flos": 22710806213760.0, + "grad_norm": 1.6795629248426387, + "language_loss": 0.72092342, + "learning_rate": 3.975777180856995e-06, + "loss": 0.74354535, + "num_input_tokens_seen": 13801530, + "step": 650, + "time_per_iteration": 2.549574613571167 + }, + { + "auxiliary_loss_clip": 0.01270013, + "auxiliary_loss_mlp": 0.01049733, + "balance_loss_clip": 1.07141244, + "balance_loss_mlp": 1.03546953, + "epoch": 0.07827812180604822, + "flos": 22711129436160.0, + "grad_norm": 2.082060718104179, + "language_loss": 0.86035955, + "learning_rate": 3.975656161973742e-06, + "loss": 0.88355702, + "num_input_tokens_seen": 13820615, + "step": 651, + "time_per_iteration": 2.488276958465576 + }, + { + "auxiliary_loss_clip": 0.01266454, + "auxiliary_loss_mlp": 0.01049573, + "balance_loss_clip": 1.06929433, + "balance_loss_mlp": 1.03539252, + "epoch": 0.0783983646966873, + "flos": 21725597001600.0, + "grad_norm": 2.508186060527994, + "language_loss": 0.88660514, + "learning_rate": 3.9755348433850194e-06, + "loss": 0.90976548, + "num_input_tokens_seen": 13835955, + "step": 652, + "time_per_iteration": 2.4550790786743164 + }, + { + "auxiliary_loss_clip": 0.01118069, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.02312422, + "balance_loss_mlp": 1.03942609, + "epoch": 0.0785186075873264, + "flos": 60640877537280.0, + "grad_norm": 0.9859885927264269, + "language_loss": 0.63647604, + "learning_rate": 3.975413225109232e-06, + "loss": 0.65809965, + "num_input_tokens_seen": 13896505, + "step": 653, + "time_per_iteration": 3.1290040016174316 + }, + { + "auxiliary_loss_clip": 0.01250091, + "auxiliary_loss_mlp": 0.0104343, + "balance_loss_clip": 1.07026172, + "balance_loss_mlp": 1.02936327, + "epoch": 0.0786388504779655, + "flos": 23877902920320.0, + "grad_norm": 2.900105980345201, + "language_loss": 0.93404973, + "learning_rate": 3.975291307164829e-06, + "loss": 0.956985, + "num_input_tokens_seen": 13915150, + "step": 654, + "time_per_iteration": 2.525437831878662 + }, + { + "auxiliary_loss_clip": 0.01203872, + "auxiliary_loss_mlp": 0.01044576, + "balance_loss_clip": 1.06196594, + "balance_loss_mlp": 1.03193378, + "epoch": 0.07875909336860458, + "flos": 15158684822400.0, + "grad_norm": 2.3614507124299347, + "language_loss": 0.84880996, + "learning_rate": 3.975169089570306e-06, + "loss": 0.87129438, + "num_input_tokens_seen": 13933525, + "step": 655, + "time_per_iteration": 2.5420384407043457 + }, + { + "auxiliary_loss_clip": 0.01232032, + "auxiliary_loss_mlp": 0.01042267, + "balance_loss_clip": 1.06518388, + "balance_loss_mlp": 1.02874923, + "epoch": 0.07887933625924368, + "flos": 22236857233920.0, + "grad_norm": 2.0453435161275433, + "language_loss": 0.91735131, + "learning_rate": 3.975046572344202e-06, + "loss": 0.94009429, + "num_input_tokens_seen": 13949985, + "step": 656, + "time_per_iteration": 2.5171566009521484 + }, + { + "auxiliary_loss_clip": 0.01211818, + "auxiliary_loss_mlp": 0.01053123, + "balance_loss_clip": 1.06183612, + "balance_loss_mlp": 1.03910959, + "epoch": 0.07899957914988276, + "flos": 20777734955520.0, + "grad_norm": 1.9762573363646514, + "language_loss": 0.71231031, + "learning_rate": 3.974923755505103e-06, + "loss": 0.73495978, + "num_input_tokens_seen": 13969215, + "step": 657, + "time_per_iteration": 2.575199842453003 + }, + { + "auxiliary_loss_clip": 0.0120567, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.06295741, + "balance_loss_mlp": 1.02918613, + "epoch": 0.07911982204052186, + "flos": 23003047267200.0, + "grad_norm": 1.6472857562999195, + "language_loss": 0.91087949, + "learning_rate": 3.974800639071641e-06, + "loss": 0.93335825, + "num_input_tokens_seen": 13989935, + "step": 658, + "time_per_iteration": 3.3598647117614746 + }, + { + "auxiliary_loss_clip": 0.01171875, + "auxiliary_loss_mlp": 0.00766676, + "balance_loss_clip": 1.0583272, + "balance_loss_mlp": 1.00052524, + "epoch": 0.07924006493116094, + "flos": 23111389664640.0, + "grad_norm": 2.1919683120847426, + "language_loss": 1.00416255, + "learning_rate": 3.974677223062492e-06, + "loss": 1.02354801, + "num_input_tokens_seen": 14007150, + "step": 659, + "time_per_iteration": 2.6282784938812256 + }, + { + "auxiliary_loss_clip": 0.01230539, + "auxiliary_loss_mlp": 0.01043082, + "balance_loss_clip": 1.06772208, + "balance_loss_mlp": 1.02986813, + "epoch": 0.07936030782180004, + "flos": 16472153450880.0, + "grad_norm": 2.2659268047522234, + "language_loss": 0.74295253, + "learning_rate": 3.974553507496378e-06, + "loss": 0.76568878, + "num_input_tokens_seen": 14025725, + "step": 660, + "time_per_iteration": 2.5385944843292236 + }, + { + "auxiliary_loss_clip": 0.0122355, + "auxiliary_loss_mlp": 0.01042474, + "balance_loss_clip": 1.06769204, + "balance_loss_mlp": 1.02717948, + "epoch": 0.07948055071243913, + "flos": 23733290764800.0, + "grad_norm": 2.3005483871874333, + "language_loss": 0.88846934, + "learning_rate": 3.974429492392068e-06, + "loss": 0.91112953, + "num_input_tokens_seen": 14045750, + "step": 661, + "time_per_iteration": 3.299978733062744 + }, + { + "auxiliary_loss_clip": 0.01261841, + "auxiliary_loss_mlp": 0.00766198, + "balance_loss_clip": 1.07184005, + "balance_loss_mlp": 1.0004952, + "epoch": 0.07960079360307822, + "flos": 19573326996480.0, + "grad_norm": 2.2129551365991347, + "language_loss": 0.90888309, + "learning_rate": 3.974305177768373e-06, + "loss": 0.92916346, + "num_input_tokens_seen": 14063960, + "step": 662, + "time_per_iteration": 2.482412576675415 + }, + { + "auxiliary_loss_clip": 0.01207328, + "auxiliary_loss_mlp": 0.01047276, + "balance_loss_clip": 1.06536782, + "balance_loss_mlp": 1.03345966, + "epoch": 0.07972103649371731, + "flos": 23513409659520.0, + "grad_norm": 3.7717864489526653, + "language_loss": 0.86377746, + "learning_rate": 3.974180563644152e-06, + "loss": 0.88632351, + "num_input_tokens_seen": 14082525, + "step": 663, + "time_per_iteration": 3.3422181606292725 + }, + { + "auxiliary_loss_clip": 0.01235542, + "auxiliary_loss_mlp": 0.01049512, + "balance_loss_clip": 1.06802857, + "balance_loss_mlp": 1.03621435, + "epoch": 0.0798412793843564, + "flos": 16726867770240.0, + "grad_norm": 2.2227209668992503, + "language_loss": 0.89531088, + "learning_rate": 3.97405565003831e-06, + "loss": 0.91816139, + "num_input_tokens_seen": 14098610, + "step": 664, + "time_per_iteration": 3.3519678115844727 + }, + { + "auxiliary_loss_clip": 0.0121446, + "auxiliary_loss_mlp": 0.01038996, + "balance_loss_clip": 1.06436193, + "balance_loss_mlp": 1.02579403, + "epoch": 0.07996152227499549, + "flos": 18223337214720.0, + "grad_norm": 1.9628717170294927, + "language_loss": 0.78192931, + "learning_rate": 3.973930436969794e-06, + "loss": 0.80446386, + "num_input_tokens_seen": 14117065, + "step": 665, + "time_per_iteration": 2.5433993339538574 + }, + { + "auxiliary_loss_clip": 0.01222319, + "auxiliary_loss_mlp": 0.01049288, + "balance_loss_clip": 1.06421602, + "balance_loss_mlp": 1.03559661, + "epoch": 0.08008176516563459, + "flos": 20594877793920.0, + "grad_norm": 1.8475724183816908, + "language_loss": 0.85847855, + "learning_rate": 3.973804924457602e-06, + "loss": 0.88119459, + "num_input_tokens_seen": 14135145, + "step": 666, + "time_per_iteration": 2.535959243774414 + }, + { + "auxiliary_loss_clip": 0.01224613, + "auxiliary_loss_mlp": 0.01055614, + "balance_loss_clip": 1.06747091, + "balance_loss_mlp": 1.04282904, + "epoch": 0.08020200805627367, + "flos": 31834306863360.0, + "grad_norm": 1.7634392170568443, + "language_loss": 0.85749519, + "learning_rate": 3.973679112520771e-06, + "loss": 0.88029748, + "num_input_tokens_seen": 14156860, + "step": 667, + "time_per_iteration": 2.6847317218780518 + }, + { + "auxiliary_loss_clip": 0.01206162, + "auxiliary_loss_mlp": 0.01040226, + "balance_loss_clip": 1.06202006, + "balance_loss_mlp": 1.02668369, + "epoch": 0.08032225094691277, + "flos": 17783503176960.0, + "grad_norm": 2.3912402871113883, + "language_loss": 0.98903203, + "learning_rate": 3.973553001178389e-06, + "loss": 1.01149607, + "num_input_tokens_seen": 14174365, + "step": 668, + "time_per_iteration": 2.5611748695373535 + }, + { + "auxiliary_loss_clip": 0.01217063, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_clip": 1.06715214, + "balance_loss_mlp": 1.02755511, + "epoch": 0.08044249383755185, + "flos": 24061693835520.0, + "grad_norm": 2.1303113000165386, + "language_loss": 0.75510895, + "learning_rate": 3.973426590449585e-06, + "loss": 0.77768654, + "num_input_tokens_seen": 14192320, + "step": 669, + "time_per_iteration": 2.58566951751709 + }, + { + "auxiliary_loss_clip": 0.01200599, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.06409967, + "balance_loss_mlp": 1.0289979, + "epoch": 0.08056273672819095, + "flos": 18223624523520.0, + "grad_norm": 2.4441255575818097, + "language_loss": 0.75287962, + "learning_rate": 3.9732998803535364e-06, + "loss": 0.77530587, + "num_input_tokens_seen": 14210380, + "step": 670, + "time_per_iteration": 2.58516263961792 + }, + { + "auxiliary_loss_clip": 0.01263961, + "auxiliary_loss_mlp": 0.01047931, + "balance_loss_clip": 1.070207, + "balance_loss_mlp": 1.03451395, + "epoch": 0.08068297961883003, + "flos": 19676856971520.0, + "grad_norm": 2.6470533836320604, + "language_loss": 0.85372972, + "learning_rate": 3.973172870909465e-06, + "loss": 0.87684864, + "num_input_tokens_seen": 14225145, + "step": 671, + "time_per_iteration": 2.4547576904296875 + }, + { + "auxiliary_loss_clip": 0.01236879, + "auxiliary_loss_mlp": 0.01041067, + "balance_loss_clip": 1.0659852, + "balance_loss_mlp": 1.02717912, + "epoch": 0.08080322250946913, + "flos": 23148736830720.0, + "grad_norm": 2.723175475764448, + "language_loss": 0.80596888, + "learning_rate": 3.973045562136638e-06, + "loss": 0.82874829, + "num_input_tokens_seen": 14241960, + "step": 672, + "time_per_iteration": 2.5638630390167236 + }, + { + "auxiliary_loss_clip": 0.01252941, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_clip": 1.07095194, + "balance_loss_mlp": 1.02891254, + "epoch": 0.08092346540010822, + "flos": 21763626526080.0, + "grad_norm": 2.1925504785405097, + "language_loss": 0.91441172, + "learning_rate": 3.972917954054368e-06, + "loss": 0.9373585, + "num_input_tokens_seen": 14260515, + "step": 673, + "time_per_iteration": 2.5423386096954346 + }, + { + "auxiliary_loss_clip": 0.0122938, + "auxiliary_loss_mlp": 0.0104695, + "balance_loss_clip": 1.0695684, + "balance_loss_mlp": 1.03203702, + "epoch": 0.08104370829074731, + "flos": 21032485188480.0, + "grad_norm": 3.1753538229896563, + "language_loss": 0.81602079, + "learning_rate": 3.972790046682013e-06, + "loss": 0.8387841, + "num_input_tokens_seen": 14279190, + "step": 674, + "time_per_iteration": 2.5484273433685303 + }, + { + "auxiliary_loss_clip": 0.01214863, + "auxiliary_loss_mlp": 0.01042928, + "balance_loss_clip": 1.06253695, + "balance_loss_mlp": 1.02993989, + "epoch": 0.0811639511813864, + "flos": 20083186598400.0, + "grad_norm": 1.769195000522939, + "language_loss": 0.79041719, + "learning_rate": 3.972661840038977e-06, + "loss": 0.81299514, + "num_input_tokens_seen": 14299480, + "step": 675, + "time_per_iteration": 2.5912442207336426 + }, + { + "auxiliary_loss_clip": 0.01250945, + "auxiliary_loss_mlp": 0.01041898, + "balance_loss_clip": 1.07238638, + "balance_loss_mlp": 1.02913666, + "epoch": 0.08128419407202549, + "flos": 16836718538880.0, + "grad_norm": 2.2996830757588658, + "language_loss": 0.8355068, + "learning_rate": 3.972533334144707e-06, + "loss": 0.85843521, + "num_input_tokens_seen": 14316405, + "step": 676, + "time_per_iteration": 2.479860305786133 + }, + { + "auxiliary_loss_clip": 0.01251827, + "auxiliary_loss_mlp": 0.01040128, + "balance_loss_clip": 1.06734347, + "balance_loss_mlp": 1.02706885, + "epoch": 0.08140443696266458, + "flos": 23769273214080.0, + "grad_norm": 2.226492101369579, + "language_loss": 0.78579968, + "learning_rate": 3.972404529018699e-06, + "loss": 0.80871922, + "num_input_tokens_seen": 14336265, + "step": 677, + "time_per_iteration": 2.536282539367676 + }, + { + "auxiliary_loss_clip": 0.01225659, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.06259763, + "balance_loss_mlp": 1.02330852, + "epoch": 0.08152467985330367, + "flos": 24390132819840.0, + "grad_norm": 4.247162569281038, + "language_loss": 0.85596025, + "learning_rate": 3.972275424680493e-06, + "loss": 0.87857592, + "num_input_tokens_seen": 14356375, + "step": 678, + "time_per_iteration": 2.564669609069824 + }, + { + "auxiliary_loss_clip": 0.01261189, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.06996584, + "balance_loss_mlp": 1.02446795, + "epoch": 0.08164492274394276, + "flos": 19317750750720.0, + "grad_norm": 2.2327267633712835, + "language_loss": 0.91787302, + "learning_rate": 3.972146021149673e-06, + "loss": 0.94085431, + "num_input_tokens_seen": 14374650, + "step": 679, + "time_per_iteration": 2.517213821411133 + }, + { + "auxiliary_loss_clip": 0.01215331, + "auxiliary_loss_mlp": 0.01042465, + "balance_loss_clip": 1.06607223, + "balance_loss_mlp": 1.03032923, + "epoch": 0.08176516563458186, + "flos": 14830461319680.0, + "grad_norm": 2.18154304826459, + "language_loss": 0.78700405, + "learning_rate": 3.972016318445868e-06, + "loss": 0.809582, + "num_input_tokens_seen": 14392650, + "step": 680, + "time_per_iteration": 2.561509370803833 + }, + { + "auxiliary_loss_clip": 0.01245364, + "auxiliary_loss_mlp": 0.01046365, + "balance_loss_clip": 1.06891441, + "balance_loss_mlp": 1.03362763, + "epoch": 0.08188540852522094, + "flos": 22602320161920.0, + "grad_norm": 1.912255193116824, + "language_loss": 0.92095745, + "learning_rate": 3.971886316588757e-06, + "loss": 0.94387472, + "num_input_tokens_seen": 14413155, + "step": 681, + "time_per_iteration": 2.5509376525878906 + }, + { + "auxiliary_loss_clip": 0.01205746, + "auxiliary_loss_mlp": 0.01048996, + "balance_loss_clip": 1.06581056, + "balance_loss_mlp": 1.03530526, + "epoch": 0.08200565141586004, + "flos": 19463727623040.0, + "grad_norm": 2.6400784355328306, + "language_loss": 0.73191619, + "learning_rate": 3.9717560155980595e-06, + "loss": 0.75446355, + "num_input_tokens_seen": 14428805, + "step": 682, + "time_per_iteration": 2.5590946674346924 + }, + { + "auxiliary_loss_clip": 0.01246225, + "auxiliary_loss_mlp": 0.01043115, + "balance_loss_clip": 1.06927931, + "balance_loss_mlp": 1.02988291, + "epoch": 0.08212589430649912, + "flos": 20594662312320.0, + "grad_norm": 2.0099624795361426, + "language_loss": 0.92096132, + "learning_rate": 3.971625415493542e-06, + "loss": 0.94385469, + "num_input_tokens_seen": 14447125, + "step": 683, + "time_per_iteration": 2.5184104442596436 + }, + { + "auxiliary_loss_clip": 0.01208588, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.06441963, + "balance_loss_mlp": 1.0291357, + "epoch": 0.08224613719713822, + "flos": 25953611086080.0, + "grad_norm": 1.8468610078807024, + "language_loss": 0.87288278, + "learning_rate": 3.971494516295017e-06, + "loss": 0.89539516, + "num_input_tokens_seen": 14466575, + "step": 684, + "time_per_iteration": 2.62156343460083 + }, + { + "auxiliary_loss_clip": 0.0121827, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.06443524, + "balance_loss_mlp": 1.03201389, + "epoch": 0.08236638008777732, + "flos": 23768734510080.0, + "grad_norm": 3.872537556199059, + "language_loss": 0.85369265, + "learning_rate": 3.971363318022341e-06, + "loss": 0.87633175, + "num_input_tokens_seen": 14487915, + "step": 685, + "time_per_iteration": 3.3890886306762695 + }, + { + "auxiliary_loss_clip": 0.01232046, + "auxiliary_loss_mlp": 0.01044796, + "balance_loss_clip": 1.06352282, + "balance_loss_mlp": 1.03123593, + "epoch": 0.0824866229784164, + "flos": 38799144887040.0, + "grad_norm": 2.6785486706786443, + "language_loss": 0.68184698, + "learning_rate": 3.971231820695417e-06, + "loss": 0.70461535, + "num_input_tokens_seen": 14511530, + "step": 686, + "time_per_iteration": 2.6905853748321533 + }, + { + "auxiliary_loss_clip": 0.01238172, + "auxiliary_loss_mlp": 0.0104454, + "balance_loss_clip": 1.06964993, + "balance_loss_mlp": 1.03077102, + "epoch": 0.0826068658690555, + "flos": 23107762391040.0, + "grad_norm": 2.0604644821034115, + "language_loss": 0.81313729, + "learning_rate": 3.971100024334193e-06, + "loss": 0.83596444, + "num_input_tokens_seen": 14529050, + "step": 687, + "time_per_iteration": 2.54071307182312 + }, + { + "auxiliary_loss_clip": 0.01196619, + "auxiliary_loss_mlp": 0.01039131, + "balance_loss_clip": 1.05875683, + "balance_loss_mlp": 1.02686489, + "epoch": 0.08272710875969458, + "flos": 21136374299520.0, + "grad_norm": 2.1657505075876196, + "language_loss": 0.86457825, + "learning_rate": 3.970967928958663e-06, + "loss": 0.88693577, + "num_input_tokens_seen": 14546165, + "step": 688, + "time_per_iteration": 3.3420021533966064 + }, + { + "auxiliary_loss_clip": 0.01201834, + "auxiliary_loss_mlp": 0.01051253, + "balance_loss_clip": 1.06336498, + "balance_loss_mlp": 1.03827691, + "epoch": 0.08284735165033368, + "flos": 19063000517760.0, + "grad_norm": 1.7311731233692753, + "language_loss": 0.83507043, + "learning_rate": 3.970835534588865e-06, + "loss": 0.85760128, + "num_input_tokens_seen": 14563660, + "step": 689, + "time_per_iteration": 3.385972738265991 + }, + { + "auxiliary_loss_clip": 0.01233223, + "auxiliary_loss_mlp": 0.01047568, + "balance_loss_clip": 1.07052946, + "balance_loss_mlp": 1.03508735, + "epoch": 0.08296759454097276, + "flos": 16727442387840.0, + "grad_norm": 1.8746972072443844, + "language_loss": 0.85375035, + "learning_rate": 3.970702841244883e-06, + "loss": 0.87655818, + "num_input_tokens_seen": 14581980, + "step": 690, + "time_per_iteration": 3.435075283050537 + }, + { + "auxiliary_loss_clip": 0.01250545, + "auxiliary_loss_mlp": 0.01045821, + "balance_loss_clip": 1.07133186, + "balance_loss_mlp": 1.03317332, + "epoch": 0.08308783743161186, + "flos": 18004928567040.0, + "grad_norm": 1.8802220235060187, + "language_loss": 0.82438743, + "learning_rate": 3.970569848946847e-06, + "loss": 0.84735107, + "num_input_tokens_seen": 14601795, + "step": 691, + "time_per_iteration": 2.5326900482177734 + }, + { + "auxiliary_loss_clip": 0.01230206, + "auxiliary_loss_mlp": 0.01039846, + "balance_loss_clip": 1.065763, + "balance_loss_mlp": 1.02725708, + "epoch": 0.08320808032225095, + "flos": 15079788599040.0, + "grad_norm": 2.2473436293157936, + "language_loss": 0.82757509, + "learning_rate": 3.970436557714932e-06, + "loss": 0.85027552, + "num_input_tokens_seen": 14618315, + "step": 692, + "time_per_iteration": 2.5363149642944336 + }, + { + "auxiliary_loss_clip": 0.01221626, + "auxiliary_loss_mlp": 0.0103896, + "balance_loss_clip": 1.06314635, + "balance_loss_mlp": 1.02591228, + "epoch": 0.08332832321289003, + "flos": 22383085501440.0, + "grad_norm": 2.3075281130396617, + "language_loss": 0.86338496, + "learning_rate": 3.970302967569358e-06, + "loss": 0.88599086, + "num_input_tokens_seen": 14636905, + "step": 693, + "time_per_iteration": 2.6844899654388428 + }, + { + "auxiliary_loss_clip": 0.01247986, + "auxiliary_loss_mlp": 0.01045819, + "balance_loss_clip": 1.0728929, + "balance_loss_mlp": 1.03280735, + "epoch": 0.08344856610352913, + "flos": 24717386655360.0, + "grad_norm": 1.9791543941197933, + "language_loss": 0.68082899, + "learning_rate": 3.9701690785303896e-06, + "loss": 0.70376706, + "num_input_tokens_seen": 14656100, + "step": 694, + "time_per_iteration": 2.6545259952545166 + }, + { + "auxiliary_loss_clip": 0.01250489, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_clip": 1.06917036, + "balance_loss_mlp": 1.03295994, + "epoch": 0.08356880899416821, + "flos": 25370206387200.0, + "grad_norm": 3.562034273458009, + "language_loss": 0.88314629, + "learning_rate": 3.970034890618339e-06, + "loss": 0.90610862, + "num_input_tokens_seen": 14675790, + "step": 695, + "time_per_iteration": 2.5797197818756104 + }, + { + "auxiliary_loss_clip": 0.01229635, + "auxiliary_loss_mlp": 0.01040999, + "balance_loss_clip": 1.06537187, + "balance_loss_mlp": 1.02872002, + "epoch": 0.08368905188480731, + "flos": 24353072962560.0, + "grad_norm": 2.083127816223687, + "language_loss": 0.88011509, + "learning_rate": 3.969900403853562e-06, + "loss": 0.90282142, + "num_input_tokens_seen": 14694830, + "step": 696, + "time_per_iteration": 2.5494306087493896 + }, + { + "auxiliary_loss_clip": 0.01264894, + "auxiliary_loss_mlp": 0.01055011, + "balance_loss_clip": 1.0731678, + "balance_loss_mlp": 1.04142666, + "epoch": 0.08380929477544641, + "flos": 18037319656320.0, + "grad_norm": 1.8348964329942858, + "language_loss": 0.77663541, + "learning_rate": 3.96976561825646e-06, + "loss": 0.79983443, + "num_input_tokens_seen": 14711920, + "step": 697, + "time_per_iteration": 2.455653667449951 + }, + { + "auxiliary_loss_clip": 0.01199992, + "auxiliary_loss_mlp": 0.0103813, + "balance_loss_clip": 1.06482601, + "balance_loss_mlp": 1.02608979, + "epoch": 0.08392953766608549, + "flos": 26286287875200.0, + "grad_norm": 3.1502364834491123, + "language_loss": 0.87298119, + "learning_rate": 3.969630533847479e-06, + "loss": 0.89536238, + "num_input_tokens_seen": 14730880, + "step": 698, + "time_per_iteration": 2.6675527095794678 + }, + { + "auxiliary_loss_clip": 0.01245406, + "auxiliary_loss_mlp": 0.01040255, + "balance_loss_clip": 1.06869721, + "balance_loss_mlp": 1.02758336, + "epoch": 0.08404978055672459, + "flos": 22492146170880.0, + "grad_norm": 2.5546338417245353, + "language_loss": 0.84473628, + "learning_rate": 3.969495150647113e-06, + "loss": 0.86759281, + "num_input_tokens_seen": 14749050, + "step": 699, + "time_per_iteration": 2.5723142623901367 + }, + { + "auxiliary_loss_clip": 0.01213254, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.06847441, + "balance_loss_mlp": 1.02656698, + "epoch": 0.08417002344736367, + "flos": 24826878288000.0, + "grad_norm": 1.7612828218217713, + "language_loss": 0.76404792, + "learning_rate": 3.969359468675899e-06, + "loss": 0.78657317, + "num_input_tokens_seen": 14769180, + "step": 700, + "time_per_iteration": 2.619915723800659 + }, + { + "auxiliary_loss_clip": 0.01242651, + "auxiliary_loss_mlp": 0.01037117, + "balance_loss_clip": 1.06920552, + "balance_loss_mlp": 1.02422488, + "epoch": 0.08429026633800277, + "flos": 16945922862720.0, + "grad_norm": 3.5946709391174956, + "language_loss": 0.89199495, + "learning_rate": 3.969223487954418e-06, + "loss": 0.91479272, + "num_input_tokens_seen": 14786640, + "step": 701, + "time_per_iteration": 2.4823615550994873 + }, + { + "auxiliary_loss_clip": 0.01199878, + "auxiliary_loss_mlp": 0.01042952, + "balance_loss_clip": 1.06876945, + "balance_loss_mlp": 1.03072095, + "epoch": 0.08441050922864185, + "flos": 23841920471040.0, + "grad_norm": 2.039331164148517, + "language_loss": 0.82615399, + "learning_rate": 3.969087208503301e-06, + "loss": 0.84858227, + "num_input_tokens_seen": 14806720, + "step": 702, + "time_per_iteration": 2.657860517501831 + }, + { + "auxiliary_loss_clip": 0.01200727, + "auxiliary_loss_mlp": 0.01042826, + "balance_loss_clip": 1.06530571, + "balance_loss_mlp": 1.03065503, + "epoch": 0.08453075211928095, + "flos": 25520205582720.0, + "grad_norm": 2.3324780736949764, + "language_loss": 0.84788305, + "learning_rate": 3.968950630343219e-06, + "loss": 0.87031859, + "num_input_tokens_seen": 14823705, + "step": 703, + "time_per_iteration": 2.614928960800171 + }, + { + "auxiliary_loss_clip": 0.01227148, + "auxiliary_loss_mlp": 0.01042946, + "balance_loss_clip": 1.06479073, + "balance_loss_mlp": 1.03045893, + "epoch": 0.08465099500992004, + "flos": 19532496211200.0, + "grad_norm": 2.428297790030425, + "language_loss": 0.93240726, + "learning_rate": 3.968813753494892e-06, + "loss": 0.95510817, + "num_input_tokens_seen": 14841865, + "step": 704, + "time_per_iteration": 2.569700241088867 + }, + { + "auxiliary_loss_clip": 0.01201526, + "auxiliary_loss_mlp": 0.00766966, + "balance_loss_clip": 1.05962002, + "balance_loss_mlp": 1.00073946, + "epoch": 0.08477123790055913, + "flos": 29351299403520.0, + "grad_norm": 2.532936575177491, + "language_loss": 0.7536428, + "learning_rate": 3.968676577979084e-06, + "loss": 0.77332771, + "num_input_tokens_seen": 14861415, + "step": 705, + "time_per_iteration": 2.6429154872894287 + }, + { + "auxiliary_loss_clip": 0.01189899, + "auxiliary_loss_mlp": 0.01047739, + "balance_loss_clip": 1.05933738, + "balance_loss_mlp": 1.03502512, + "epoch": 0.08489148079119822, + "flos": 18624495283200.0, + "grad_norm": 2.296601237578259, + "language_loss": 0.78324032, + "learning_rate": 3.968539103816605e-06, + "loss": 0.80561674, + "num_input_tokens_seen": 14879215, + "step": 706, + "time_per_iteration": 2.604400873184204 + }, + { + "auxiliary_loss_clip": 0.01227464, + "auxiliary_loss_mlp": 0.00766573, + "balance_loss_clip": 1.06906557, + "balance_loss_mlp": 1.00077415, + "epoch": 0.0850117236818373, + "flos": 23471393725440.0, + "grad_norm": 2.733942325574479, + "language_loss": 0.89456379, + "learning_rate": 3.9684013310283085e-06, + "loss": 0.91450417, + "num_input_tokens_seen": 14897900, + "step": 707, + "time_per_iteration": 2.6713945865631104 + }, + { + "auxiliary_loss_clip": 0.012243, + "auxiliary_loss_mlp": 0.01048342, + "balance_loss_clip": 1.06946659, + "balance_loss_mlp": 1.03519297, + "epoch": 0.0851319665724764, + "flos": 40625058896640.0, + "grad_norm": 4.331374943201475, + "language_loss": 0.6431098, + "learning_rate": 3.9682632596350956e-06, + "loss": 0.66583622, + "num_input_tokens_seen": 14919065, + "step": 708, + "time_per_iteration": 2.8674261569976807 + }, + { + "auxiliary_loss_clip": 0.0124169, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.07044983, + "balance_loss_mlp": 1.02359962, + "epoch": 0.0852522094631155, + "flos": 15879554870400.0, + "grad_norm": 2.042104575220502, + "language_loss": 0.7823633, + "learning_rate": 3.968124889657911e-06, + "loss": 0.80514467, + "num_input_tokens_seen": 14934165, + "step": 709, + "time_per_iteration": 2.609698534011841 + }, + { + "auxiliary_loss_clip": 0.01191782, + "auxiliary_loss_mlp": 0.01044849, + "balance_loss_clip": 1.06014132, + "balance_loss_mlp": 1.03307128, + "epoch": 0.08537245235375458, + "flos": 14567091822720.0, + "grad_norm": 2.1224445530114098, + "language_loss": 0.90540457, + "learning_rate": 3.967986221117746e-06, + "loss": 0.92777091, + "num_input_tokens_seen": 14950105, + "step": 710, + "time_per_iteration": 2.6894047260284424 + }, + { + "auxiliary_loss_clip": 0.01168282, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.06007218, + "balance_loss_mlp": 1.02401745, + "epoch": 0.08549269524439368, + "flos": 26468929555200.0, + "grad_norm": 2.065847005769564, + "language_loss": 0.86607647, + "learning_rate": 3.967847254035635e-06, + "loss": 0.88811874, + "num_input_tokens_seen": 14969490, + "step": 711, + "time_per_iteration": 2.8664019107818604 + }, + { + "auxiliary_loss_clip": 0.0120821, + "auxiliary_loss_mlp": 0.01042135, + "balance_loss_clip": 1.06185627, + "balance_loss_mlp": 1.02977872, + "epoch": 0.08561293813503276, + "flos": 13590214565760.0, + "grad_norm": 2.879623829096181, + "language_loss": 0.86524403, + "learning_rate": 3.967707988432661e-06, + "loss": 0.88774741, + "num_input_tokens_seen": 14987195, + "step": 712, + "time_per_iteration": 3.717588186264038 + }, + { + "auxiliary_loss_clip": 0.01259428, + "auxiliary_loss_mlp": 0.01042535, + "balance_loss_clip": 1.06858325, + "balance_loss_mlp": 1.02916598, + "epoch": 0.08573318102567186, + "flos": 26943524979840.0, + "grad_norm": 2.3687990033210546, + "language_loss": 0.8761425, + "learning_rate": 3.967568424329949e-06, + "loss": 0.89916211, + "num_input_tokens_seen": 15007620, + "step": 713, + "time_per_iteration": 2.6508307456970215 + }, + { + "auxiliary_loss_clip": 0.01128328, + "auxiliary_loss_mlp": 0.01013525, + "balance_loss_clip": 1.02787805, + "balance_loss_mlp": 1.00966227, + "epoch": 0.08585342391631094, + "flos": 67302739319040.0, + "grad_norm": 0.9921755847240605, + "language_loss": 0.55542946, + "learning_rate": 3.967428561748671e-06, + "loss": 0.57684803, + "num_input_tokens_seen": 15075590, + "step": 714, + "time_per_iteration": 3.2694153785705566 + }, + { + "auxiliary_loss_clip": 0.0118608, + "auxiliary_loss_mlp": 0.01044636, + "balance_loss_clip": 1.05693889, + "balance_loss_mlp": 1.03154087, + "epoch": 0.08597366680695004, + "flos": 22456594684800.0, + "grad_norm": 1.94880319900289, + "language_loss": 0.87396145, + "learning_rate": 3.967288400710045e-06, + "loss": 0.89626855, + "num_input_tokens_seen": 15095055, + "step": 715, + "time_per_iteration": 3.489793300628662 + }, + { + "auxiliary_loss_clip": 0.01206052, + "auxiliary_loss_mlp": 0.01040669, + "balance_loss_clip": 1.06660569, + "balance_loss_mlp": 1.0275557, + "epoch": 0.08609390969758914, + "flos": 23550505430400.0, + "grad_norm": 2.172420818324419, + "language_loss": 0.88466239, + "learning_rate": 3.9671479412353335e-06, + "loss": 0.90712959, + "num_input_tokens_seen": 15113520, + "step": 716, + "time_per_iteration": 3.3762714862823486 + }, + { + "auxiliary_loss_clip": 0.01244475, + "auxiliary_loss_mlp": 0.01043718, + "balance_loss_clip": 1.07050633, + "balance_loss_mlp": 1.03111768, + "epoch": 0.08621415258822822, + "flos": 25885848078720.0, + "grad_norm": 2.3528163256346932, + "language_loss": 0.74197948, + "learning_rate": 3.967007183345843e-06, + "loss": 0.7648614, + "num_input_tokens_seen": 15133375, + "step": 717, + "time_per_iteration": 3.3575246334075928 + }, + { + "auxiliary_loss_clip": 0.0123628, + "auxiliary_loss_mlp": 0.01041884, + "balance_loss_clip": 1.06687617, + "balance_loss_mlp": 1.0294981, + "epoch": 0.08633439547886732, + "flos": 13589568120960.0, + "grad_norm": 3.651220936590309, + "language_loss": 0.89048576, + "learning_rate": 3.966866127062927e-06, + "loss": 0.91326737, + "num_input_tokens_seen": 15150500, + "step": 718, + "time_per_iteration": 2.498567819595337 + }, + { + "auxiliary_loss_clip": 0.01131953, + "auxiliary_loss_mlp": 0.01009305, + "balance_loss_clip": 1.02837133, + "balance_loss_mlp": 1.00541866, + "epoch": 0.0864546383695064, + "flos": 57767342434560.0, + "grad_norm": 0.8665047066686268, + "language_loss": 0.62693983, + "learning_rate": 3.966724772407982e-06, + "loss": 0.64835238, + "num_input_tokens_seen": 15208015, + "step": 719, + "time_per_iteration": 2.931037187576294 + }, + { + "auxiliary_loss_clip": 0.01203631, + "auxiliary_loss_mlp": 0.01042548, + "balance_loss_clip": 1.063658, + "balance_loss_mlp": 1.03011417, + "epoch": 0.0865748812601455, + "flos": 20046952753920.0, + "grad_norm": 2.0983662698471037, + "language_loss": 0.88502824, + "learning_rate": 3.966583119402454e-06, + "loss": 0.90749002, + "num_input_tokens_seen": 15224780, + "step": 720, + "time_per_iteration": 2.559727668762207 + }, + { + "auxiliary_loss_clip": 0.0123768, + "auxiliary_loss_mlp": 0.00766102, + "balance_loss_clip": 1.06704521, + "balance_loss_mlp": 1.00082779, + "epoch": 0.08669512415078459, + "flos": 35262446935680.0, + "grad_norm": 1.6136433231270717, + "language_loss": 0.81910288, + "learning_rate": 3.9664411680678305e-06, + "loss": 0.83914065, + "num_input_tokens_seen": 15246535, + "step": 721, + "time_per_iteration": 2.6463537216186523 + }, + { + "auxiliary_loss_clip": 0.0110605, + "auxiliary_loss_mlp": 0.01005325, + "balance_loss_clip": 1.02516413, + "balance_loss_mlp": 1.00162983, + "epoch": 0.08681536704142367, + "flos": 65654870048640.0, + "grad_norm": 0.8462889051464064, + "language_loss": 0.61446106, + "learning_rate": 3.966298918425644e-06, + "loss": 0.63557482, + "num_input_tokens_seen": 15304025, + "step": 722, + "time_per_iteration": 3.1289379596710205 + }, + { + "auxiliary_loss_clip": 0.01242979, + "auxiliary_loss_mlp": 0.01045917, + "balance_loss_clip": 1.06669688, + "balance_loss_mlp": 1.03286386, + "epoch": 0.08693560993206277, + "flos": 34529940881280.0, + "grad_norm": 2.0118612598362597, + "language_loss": 0.82731563, + "learning_rate": 3.966156370497476e-06, + "loss": 0.85020459, + "num_input_tokens_seen": 15327635, + "step": 723, + "time_per_iteration": 2.8262815475463867 + }, + { + "auxiliary_loss_clip": 0.01243957, + "auxiliary_loss_mlp": 0.01043423, + "balance_loss_clip": 1.06662953, + "balance_loss_mlp": 1.03108501, + "epoch": 0.08705585282270185, + "flos": 23149419189120.0, + "grad_norm": 1.890453391527389, + "language_loss": 0.88504493, + "learning_rate": 3.96601352430495e-06, + "loss": 0.90791881, + "num_input_tokens_seen": 15347405, + "step": 724, + "time_per_iteration": 2.522639036178589 + }, + { + "auxiliary_loss_clip": 0.01228114, + "auxiliary_loss_mlp": 0.0104879, + "balance_loss_clip": 1.07018852, + "balance_loss_mlp": 1.0359689, + "epoch": 0.08717609571334095, + "flos": 29497599498240.0, + "grad_norm": 1.9385420713367252, + "language_loss": 0.82946992, + "learning_rate": 3.965870379869735e-06, + "loss": 0.85223889, + "num_input_tokens_seen": 15369450, + "step": 725, + "time_per_iteration": 2.607246160507202 + }, + { + "auxiliary_loss_clip": 0.01240354, + "auxiliary_loss_mlp": 0.01044868, + "balance_loss_clip": 1.0648241, + "balance_loss_mlp": 1.03282189, + "epoch": 0.08729633860398003, + "flos": 20667489137280.0, + "grad_norm": 2.0272724856326487, + "language_loss": 0.87052274, + "learning_rate": 3.965726937213547e-06, + "loss": 0.89337492, + "num_input_tokens_seen": 15388085, + "step": 726, + "time_per_iteration": 2.52020263671875 + }, + { + "auxiliary_loss_clip": 0.01237497, + "auxiliary_loss_mlp": 0.01047752, + "balance_loss_clip": 1.06297755, + "balance_loss_mlp": 1.0348475, + "epoch": 0.08741658149461913, + "flos": 18369493655040.0, + "grad_norm": 2.1139169543866894, + "language_loss": 0.81019062, + "learning_rate": 3.965583196358144e-06, + "loss": 0.8330431, + "num_input_tokens_seen": 15407120, + "step": 727, + "time_per_iteration": 2.50283145904541 + }, + { + "auxiliary_loss_clip": 0.01262119, + "auxiliary_loss_mlp": 0.01044878, + "balance_loss_clip": 1.07115102, + "balance_loss_mlp": 1.03082347, + "epoch": 0.08753682438525823, + "flos": 18729677283840.0, + "grad_norm": 2.4650151945743617, + "language_loss": 0.7417419, + "learning_rate": 3.965439157325335e-06, + "loss": 0.76481187, + "num_input_tokens_seen": 15424485, + "step": 728, + "time_per_iteration": 2.454554319381714 + }, + { + "auxiliary_loss_clip": 0.01219737, + "auxiliary_loss_mlp": 0.01038135, + "balance_loss_clip": 1.06065178, + "balance_loss_mlp": 1.02402067, + "epoch": 0.08765706727589731, + "flos": 27776113303680.0, + "grad_norm": 1.9717883938582617, + "language_loss": 0.7619139, + "learning_rate": 3.965294820136968e-06, + "loss": 0.78449267, + "num_input_tokens_seen": 15446285, + "step": 729, + "time_per_iteration": 2.643296003341675 + }, + { + "auxiliary_loss_clip": 0.01227846, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.0667212, + "balance_loss_mlp": 1.02864981, + "epoch": 0.08777731016653641, + "flos": 24389127239040.0, + "grad_norm": 2.162777231888247, + "language_loss": 0.87173331, + "learning_rate": 3.965150184814938e-06, + "loss": 0.89442277, + "num_input_tokens_seen": 15465770, + "step": 730, + "time_per_iteration": 2.697697401046753 + }, + { + "auxiliary_loss_clip": 0.01216787, + "auxiliary_loss_mlp": 0.01043315, + "balance_loss_clip": 1.0652678, + "balance_loss_mlp": 1.03014863, + "epoch": 0.08789755305717549, + "flos": 21981855605760.0, + "grad_norm": 2.753176275862073, + "language_loss": 0.76496947, + "learning_rate": 3.965005251381189e-06, + "loss": 0.78757048, + "num_input_tokens_seen": 15483705, + "step": 731, + "time_per_iteration": 2.6204416751861572 + }, + { + "auxiliary_loss_clip": 0.01130182, + "auxiliary_loss_mlp": 0.01008674, + "balance_loss_clip": 1.02430665, + "balance_loss_mlp": 1.00528872, + "epoch": 0.08801779594781459, + "flos": 58360120583040.0, + "grad_norm": 0.9027155467114226, + "language_loss": 0.64608657, + "learning_rate": 3.964860019857705e-06, + "loss": 0.6674751, + "num_input_tokens_seen": 15548620, + "step": 732, + "time_per_iteration": 3.089905023574829 + }, + { + "auxiliary_loss_clip": 0.01260288, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_clip": 1.07305336, + "balance_loss_mlp": 1.03293693, + "epoch": 0.08813803883845367, + "flos": 23294785530240.0, + "grad_norm": 1.8756176514158247, + "language_loss": 0.84125561, + "learning_rate": 3.964714490266518e-06, + "loss": 0.86430657, + "num_input_tokens_seen": 15569265, + "step": 733, + "time_per_iteration": 2.4999618530273438 + }, + { + "auxiliary_loss_clip": 0.01125125, + "auxiliary_loss_mlp": 0.01008522, + "balance_loss_clip": 1.0230056, + "balance_loss_mlp": 1.0051837, + "epoch": 0.08825828172909277, + "flos": 63424924882560.0, + "grad_norm": 0.8941840857568163, + "language_loss": 0.64535916, + "learning_rate": 3.964568662629706e-06, + "loss": 0.66669559, + "num_input_tokens_seen": 15630570, + "step": 734, + "time_per_iteration": 3.0048909187316895 + }, + { + "auxiliary_loss_clip": 0.01234947, + "auxiliary_loss_mlp": 0.01040886, + "balance_loss_clip": 1.06306958, + "balance_loss_mlp": 1.02864921, + "epoch": 0.08837852461973186, + "flos": 26720986268160.0, + "grad_norm": 2.3565645078991837, + "language_loss": 0.84230733, + "learning_rate": 3.9644225369693895e-06, + "loss": 0.86506569, + "num_input_tokens_seen": 15650870, + "step": 735, + "time_per_iteration": 2.648432970046997 + }, + { + "auxiliary_loss_clip": 0.01255374, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.06956255, + "balance_loss_mlp": 1.0250833, + "epoch": 0.08849876751037095, + "flos": 27265427688960.0, + "grad_norm": 2.0307746346000286, + "language_loss": 0.86867225, + "learning_rate": 3.964276113307735e-06, + "loss": 0.89160132, + "num_input_tokens_seen": 15670835, + "step": 736, + "time_per_iteration": 2.5729129314422607 + }, + { + "auxiliary_loss_clip": 0.0120736, + "auxiliary_loss_mlp": 0.01047304, + "balance_loss_clip": 1.06355774, + "balance_loss_mlp": 1.03431654, + "epoch": 0.08861901040101004, + "flos": 19828759587840.0, + "grad_norm": 1.883207196589286, + "language_loss": 0.80528355, + "learning_rate": 3.9641293916669574e-06, + "loss": 0.8278302, + "num_input_tokens_seen": 15689795, + "step": 737, + "time_per_iteration": 2.566600799560547 + }, + { + "auxiliary_loss_clip": 0.01203895, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.0629704, + "balance_loss_mlp": 1.02183783, + "epoch": 0.08873925329164913, + "flos": 23658704173440.0, + "grad_norm": 1.818489788700582, + "language_loss": 0.82762724, + "learning_rate": 3.9639823720693115e-06, + "loss": 0.85001588, + "num_input_tokens_seen": 15711650, + "step": 738, + "time_per_iteration": 3.472275495529175 + }, + { + "auxiliary_loss_clip": 0.01106465, + "auxiliary_loss_mlp": 0.01009621, + "balance_loss_clip": 1.03513455, + "balance_loss_mlp": 1.00554419, + "epoch": 0.08885949618228822, + "flos": 71831541893760.0, + "grad_norm": 0.8363260015934844, + "language_loss": 0.60024214, + "learning_rate": 3.963835054537102e-06, + "loss": 0.62140298, + "num_input_tokens_seen": 15780615, + "step": 739, + "time_per_iteration": 3.2010130882263184 + }, + { + "auxiliary_loss_clip": 0.01219693, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_clip": 1.06161332, + "balance_loss_mlp": 1.03711653, + "epoch": 0.08897973907292732, + "flos": 22346169298560.0, + "grad_norm": 2.607065714910966, + "language_loss": 0.60420525, + "learning_rate": 3.963687439092676e-06, + "loss": 0.62689364, + "num_input_tokens_seen": 15801300, + "step": 740, + "time_per_iteration": 2.552574634552002 + }, + { + "auxiliary_loss_clip": 0.01241839, + "auxiliary_loss_mlp": 0.01043873, + "balance_loss_clip": 1.06867588, + "balance_loss_mlp": 1.03162408, + "epoch": 0.0890999819635664, + "flos": 21251827589760.0, + "grad_norm": 2.0906363534961643, + "language_loss": 0.80382669, + "learning_rate": 3.963539525758427e-06, + "loss": 0.82668376, + "num_input_tokens_seen": 15820860, + "step": 741, + "time_per_iteration": 3.3114726543426514 + }, + { + "auxiliary_loss_clip": 0.01226644, + "auxiliary_loss_mlp": 0.01040611, + "balance_loss_clip": 1.06765425, + "balance_loss_mlp": 1.02723551, + "epoch": 0.0892202248542055, + "flos": 25370888745600.0, + "grad_norm": 1.8778941075979416, + "language_loss": 0.67674363, + "learning_rate": 3.9633913145567925e-06, + "loss": 0.69941616, + "num_input_tokens_seen": 15841350, + "step": 742, + "time_per_iteration": 2.63049578666687 + }, + { + "auxiliary_loss_clip": 0.01225475, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.06861424, + "balance_loss_mlp": 1.02337229, + "epoch": 0.08934046774484458, + "flos": 24457895827200.0, + "grad_norm": 2.5616089964231974, + "language_loss": 0.81676531, + "learning_rate": 3.9632428055102575e-06, + "loss": 0.83937269, + "num_input_tokens_seen": 15861360, + "step": 743, + "time_per_iteration": 4.124592304229736 + }, + { + "auxiliary_loss_clip": 0.01245209, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_clip": 1.07189703, + "balance_loss_mlp": 1.0303328, + "epoch": 0.08946071063548368, + "flos": 35772773414400.0, + "grad_norm": 2.391097491262561, + "language_loss": 0.67007166, + "learning_rate": 3.9630939986413495e-06, + "loss": 0.69296306, + "num_input_tokens_seen": 15883160, + "step": 744, + "time_per_iteration": 2.626033067703247 + }, + { + "auxiliary_loss_clip": 0.01196901, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_clip": 1.06256461, + "balance_loss_mlp": 1.03486633, + "epoch": 0.08958095352612276, + "flos": 14356584167040.0, + "grad_norm": 1.7741351165147137, + "language_loss": 0.78497273, + "learning_rate": 3.962944893972643e-06, + "loss": 0.80742037, + "num_input_tokens_seen": 15901610, + "step": 745, + "time_per_iteration": 2.5670721530914307 + }, + { + "auxiliary_loss_clip": 0.01222001, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.06450748, + "balance_loss_mlp": 1.0269053, + "epoch": 0.08970119641676186, + "flos": 17853277345920.0, + "grad_norm": 3.6587385095060667, + "language_loss": 0.90791011, + "learning_rate": 3.962795491526756e-06, + "loss": 0.93052477, + "num_input_tokens_seen": 15918770, + "step": 746, + "time_per_iteration": 2.5114543437957764 + }, + { + "auxiliary_loss_clip": 0.01262765, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_clip": 1.07288575, + "balance_loss_mlp": 1.03771782, + "epoch": 0.08982143930740095, + "flos": 20811670329600.0, + "grad_norm": 2.713978549051666, + "language_loss": 0.89198577, + "learning_rate": 3.962645791326354e-06, + "loss": 0.9151299, + "num_input_tokens_seen": 15938025, + "step": 747, + "time_per_iteration": 2.480376958847046 + }, + { + "auxiliary_loss_clip": 0.01239488, + "auxiliary_loss_mlp": 0.01037603, + "balance_loss_clip": 1.06973314, + "balance_loss_mlp": 1.02599835, + "epoch": 0.08994168219804004, + "flos": 24097712198400.0, + "grad_norm": 1.8835333930715121, + "language_loss": 0.82821393, + "learning_rate": 3.962495793394146e-06, + "loss": 0.85098481, + "num_input_tokens_seen": 15957215, + "step": 748, + "time_per_iteration": 2.5224411487579346 + }, + { + "auxiliary_loss_clip": 0.01140998, + "auxiliary_loss_mlp": 0.01009118, + "balance_loss_clip": 1.02582705, + "balance_loss_mlp": 1.0058521, + "epoch": 0.09006192508867913, + "flos": 57188893812480.0, + "grad_norm": 0.7778776343021869, + "language_loss": 0.61211097, + "learning_rate": 3.9623454977528864e-06, + "loss": 0.63361216, + "num_input_tokens_seen": 16015870, + "step": 749, + "time_per_iteration": 2.8883888721466064 + }, + { + "auxiliary_loss_clip": 0.01211896, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_clip": 1.06316137, + "balance_loss_mlp": 1.03534937, + "epoch": 0.09018216797931822, + "flos": 20487505063680.0, + "grad_norm": 1.7801426176161208, + "language_loss": 0.85241604, + "learning_rate": 3.962194904425375e-06, + "loss": 0.87501484, + "num_input_tokens_seen": 16036500, + "step": 750, + "time_per_iteration": 2.5985820293426514 + }, + { + "auxiliary_loss_clip": 0.01236744, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.06733501, + "balance_loss_mlp": 1.03281832, + "epoch": 0.09030241086995731, + "flos": 22638123043200.0, + "grad_norm": 2.0019811884502743, + "language_loss": 0.6813333, + "learning_rate": 3.9620440134344566e-06, + "loss": 0.70415711, + "num_input_tokens_seen": 16054655, + "step": 751, + "time_per_iteration": 2.5019288063049316 + }, + { + "auxiliary_loss_clip": 0.01207597, + "auxiliary_loss_mlp": 0.01049728, + "balance_loss_clip": 1.06480706, + "balance_loss_mlp": 1.03627503, + "epoch": 0.09042265376059641, + "flos": 21871502046720.0, + "grad_norm": 2.594544024180421, + "language_loss": 0.82142103, + "learning_rate": 3.9618928248030215e-06, + "loss": 0.84399432, + "num_input_tokens_seen": 16074165, + "step": 752, + "time_per_iteration": 2.5744919776916504 + }, + { + "auxiliary_loss_clip": 0.01239297, + "auxiliary_loss_mlp": 0.01047033, + "balance_loss_clip": 1.06913328, + "balance_loss_mlp": 1.0345695, + "epoch": 0.0905428966512355, + "flos": 24316192673280.0, + "grad_norm": 2.639391514229017, + "language_loss": 0.83158404, + "learning_rate": 3.961741338554005e-06, + "loss": 0.85444736, + "num_input_tokens_seen": 16092505, + "step": 753, + "time_per_iteration": 2.595125198364258 + }, + { + "auxiliary_loss_clip": 0.01233226, + "auxiliary_loss_mlp": 0.01050657, + "balance_loss_clip": 1.06905007, + "balance_loss_mlp": 1.03710866, + "epoch": 0.09066313954187459, + "flos": 35845061535360.0, + "grad_norm": 1.9406113111246892, + "language_loss": 0.76051354, + "learning_rate": 3.9615895547103865e-06, + "loss": 0.78335238, + "num_input_tokens_seen": 16116150, + "step": 754, + "time_per_iteration": 2.700745105743408 + }, + { + "auxiliary_loss_clip": 0.01224973, + "auxiliary_loss_mlp": 0.01051739, + "balance_loss_clip": 1.06565785, + "balance_loss_mlp": 1.03878665, + "epoch": 0.09078338243251367, + "flos": 29168729550720.0, + "grad_norm": 1.9284196329696175, + "language_loss": 0.77856338, + "learning_rate": 3.961437473295193e-06, + "loss": 0.80133057, + "num_input_tokens_seen": 16136295, + "step": 755, + "time_per_iteration": 2.632720947265625 + }, + { + "auxiliary_loss_clip": 0.01180216, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.05577898, + "balance_loss_mlp": 1.03227353, + "epoch": 0.09090362532315277, + "flos": 21907699977600.0, + "grad_norm": 4.094268027490676, + "language_loss": 0.72634542, + "learning_rate": 3.961285094331495e-06, + "loss": 0.74859726, + "num_input_tokens_seen": 16154210, + "step": 756, + "time_per_iteration": 2.596719980239868 + }, + { + "auxiliary_loss_clip": 0.01252763, + "auxiliary_loss_mlp": 0.010401, + "balance_loss_clip": 1.06764722, + "balance_loss_mlp": 1.02785742, + "epoch": 0.09102386821379185, + "flos": 27344503480320.0, + "grad_norm": 2.234486825642296, + "language_loss": 0.85952151, + "learning_rate": 3.961132417842406e-06, + "loss": 0.8824501, + "num_input_tokens_seen": 16173995, + "step": 757, + "time_per_iteration": 2.5575456619262695 + }, + { + "auxiliary_loss_clip": 0.01234358, + "auxiliary_loss_mlp": 0.01053942, + "balance_loss_clip": 1.06726992, + "balance_loss_mlp": 1.04181838, + "epoch": 0.09114411110443095, + "flos": 20813501923200.0, + "grad_norm": 2.5591352612194274, + "language_loss": 0.7525472, + "learning_rate": 3.960979443851089e-06, + "loss": 0.7754302, + "num_input_tokens_seen": 16191020, + "step": 758, + "time_per_iteration": 2.5053820610046387 + }, + { + "auxiliary_loss_clip": 0.01222845, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_clip": 1.06491888, + "balance_loss_mlp": 1.02827334, + "epoch": 0.09126435399507005, + "flos": 26145949438080.0, + "grad_norm": 4.544587056250705, + "language_loss": 0.78872705, + "learning_rate": 3.96082617238075e-06, + "loss": 0.81137574, + "num_input_tokens_seen": 16213645, + "step": 759, + "time_per_iteration": 2.6376826763153076 + }, + { + "auxiliary_loss_clip": 0.01223267, + "auxiliary_loss_mlp": 0.01041606, + "balance_loss_clip": 1.06497228, + "balance_loss_mlp": 1.02954173, + "epoch": 0.09138459688570913, + "flos": 24388911757440.0, + "grad_norm": 4.084158682931718, + "language_loss": 0.79707861, + "learning_rate": 3.960672603454639e-06, + "loss": 0.8197273, + "num_input_tokens_seen": 16233625, + "step": 760, + "time_per_iteration": 2.587028980255127 + }, + { + "auxiliary_loss_clip": 0.01234855, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.06706166, + "balance_loss_mlp": 1.03117061, + "epoch": 0.09150483977634823, + "flos": 21032664756480.0, + "grad_norm": 3.102138246243872, + "language_loss": 0.77461505, + "learning_rate": 3.960518737096054e-06, + "loss": 0.79740632, + "num_input_tokens_seen": 16253255, + "step": 761, + "time_per_iteration": 2.5373940467834473 + }, + { + "auxiliary_loss_clip": 0.01240211, + "auxiliary_loss_mlp": 0.01037754, + "balance_loss_clip": 1.0680207, + "balance_loss_mlp": 1.0254159, + "epoch": 0.09162508266698731, + "flos": 22856998567680.0, + "grad_norm": 3.1325324154612653, + "language_loss": 0.72093749, + "learning_rate": 3.960364573328334e-06, + "loss": 0.74371707, + "num_input_tokens_seen": 16272580, + "step": 762, + "time_per_iteration": 2.547060251235962 + }, + { + "auxiliary_loss_clip": 0.01210152, + "auxiliary_loss_mlp": 0.01039517, + "balance_loss_clip": 1.06250286, + "balance_loss_mlp": 1.0258323, + "epoch": 0.0917453255576264, + "flos": 21724411852800.0, + "grad_norm": 2.215420355339875, + "language_loss": 0.88497639, + "learning_rate": 3.9602101121748675e-06, + "loss": 0.90747309, + "num_input_tokens_seen": 16293075, + "step": 763, + "time_per_iteration": 2.5847504138946533 + }, + { + "auxiliary_loss_clip": 0.01226205, + "auxiliary_loss_mlp": 0.01042275, + "balance_loss_clip": 1.06973827, + "balance_loss_mlp": 1.03036547, + "epoch": 0.0918655684482655, + "flos": 14609215497600.0, + "grad_norm": 2.2553337841091032, + "language_loss": 0.72377741, + "learning_rate": 3.960055353659085e-06, + "loss": 0.74646223, + "num_input_tokens_seen": 16310185, + "step": 764, + "time_per_iteration": 2.529010534286499 + }, + { + "auxiliary_loss_clip": 0.01212431, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.06593943, + "balance_loss_mlp": 1.02158916, + "epoch": 0.09198581133890459, + "flos": 23435016226560.0, + "grad_norm": 2.2349425100954585, + "language_loss": 0.83497071, + "learning_rate": 3.959900297804465e-06, + "loss": 0.85744023, + "num_input_tokens_seen": 16330355, + "step": 765, + "time_per_iteration": 3.4237773418426514 + }, + { + "auxiliary_loss_clip": 0.01211905, + "auxiliary_loss_mlp": 0.01039224, + "balance_loss_clip": 1.06353045, + "balance_loss_mlp": 1.02689242, + "epoch": 0.09210605422954368, + "flos": 16795887753600.0, + "grad_norm": 1.7699660380095645, + "language_loss": 0.77539599, + "learning_rate": 3.9597449446345276e-06, + "loss": 0.79790723, + "num_input_tokens_seen": 16347600, + "step": 766, + "time_per_iteration": 2.6059064865112305 + }, + { + "auxiliary_loss_clip": 0.01210595, + "auxiliary_loss_mlp": 0.0103899, + "balance_loss_clip": 1.0598886, + "balance_loss_mlp": 1.02691388, + "epoch": 0.09222629712018277, + "flos": 22674249146880.0, + "grad_norm": 2.43765015937421, + "language_loss": 0.8330735, + "learning_rate": 3.95958929417284e-06, + "loss": 0.85556936, + "num_input_tokens_seen": 16365755, + "step": 767, + "time_per_iteration": 2.6747500896453857 + }, + { + "auxiliary_loss_clip": 0.0113141, + "auxiliary_loss_mlp": 0.01002406, + "balance_loss_clip": 1.02738619, + "balance_loss_mlp": 0.99916345, + "epoch": 0.09234654001082186, + "flos": 69976756327680.0, + "grad_norm": 0.7297055466650461, + "language_loss": 0.58788812, + "learning_rate": 3.9594333464430145e-06, + "loss": 0.60922635, + "num_input_tokens_seen": 16435245, + "step": 768, + "time_per_iteration": 4.001144170761108 + }, + { + "auxiliary_loss_clip": 0.0114926, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_clip": 1.0521245, + "balance_loss_mlp": 1.03235221, + "epoch": 0.09246678290146094, + "flos": 20011437181440.0, + "grad_norm": 1.8280600310415465, + "language_loss": 0.87908024, + "learning_rate": 3.959277101468709e-06, + "loss": 0.90101385, + "num_input_tokens_seen": 16454795, + "step": 769, + "time_per_iteration": 3.693542242050171 + }, + { + "auxiliary_loss_clip": 0.01207296, + "auxiliary_loss_mlp": 0.0104837, + "balance_loss_clip": 1.06152546, + "balance_loss_mlp": 1.03547132, + "epoch": 0.09258702579210004, + "flos": 17747448900480.0, + "grad_norm": 2.5733425651110133, + "language_loss": 0.78457868, + "learning_rate": 3.959120559273624e-06, + "loss": 0.80713522, + "num_input_tokens_seen": 16472580, + "step": 770, + "time_per_iteration": 3.376337766647339 + }, + { + "auxiliary_loss_clip": 0.01208504, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.06392288, + "balance_loss_mlp": 1.03207564, + "epoch": 0.09270726868273914, + "flos": 20886544229760.0, + "grad_norm": 2.113346237101469, + "language_loss": 0.83664393, + "learning_rate": 3.958963719881509e-06, + "loss": 0.85917509, + "num_input_tokens_seen": 16490670, + "step": 771, + "time_per_iteration": 2.5298922061920166 + }, + { + "auxiliary_loss_clip": 0.01240459, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.07046056, + "balance_loss_mlp": 1.02713227, + "epoch": 0.09282751157337822, + "flos": 17015697031680.0, + "grad_norm": 1.9761737823279046, + "language_loss": 0.93985045, + "learning_rate": 3.958806583316154e-06, + "loss": 0.96266198, + "num_input_tokens_seen": 16508640, + "step": 772, + "time_per_iteration": 2.5126922130584717 + }, + { + "auxiliary_loss_clip": 0.01254161, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.07021487, + "balance_loss_mlp": 1.02302539, + "epoch": 0.09294775446401732, + "flos": 32523647748480.0, + "grad_norm": 1.7614205491994583, + "language_loss": 0.78790301, + "learning_rate": 3.9586491496013985e-06, + "loss": 0.81079364, + "num_input_tokens_seen": 16531035, + "step": 773, + "time_per_iteration": 2.628710985183716 + }, + { + "auxiliary_loss_clip": 0.01243299, + "auxiliary_loss_mlp": 0.01049903, + "balance_loss_clip": 1.06782699, + "balance_loss_mlp": 1.03725529, + "epoch": 0.0930679973546564, + "flos": 18259750627200.0, + "grad_norm": 2.073406754988059, + "language_loss": 0.8314268, + "learning_rate": 3.958491418761124e-06, + "loss": 0.85435879, + "num_input_tokens_seen": 16548605, + "step": 774, + "time_per_iteration": 2.5062620639801025 + }, + { + "auxiliary_loss_clip": 0.01221743, + "auxiliary_loss_mlp": 0.01041634, + "balance_loss_clip": 1.06095064, + "balance_loss_mlp": 1.02933168, + "epoch": 0.0931882402452955, + "flos": 21099745405440.0, + "grad_norm": 2.31282168995737, + "language_loss": 0.72261965, + "learning_rate": 3.958333390819258e-06, + "loss": 0.74525344, + "num_input_tokens_seen": 16565535, + "step": 775, + "time_per_iteration": 2.538045883178711 + }, + { + "auxiliary_loss_clip": 0.01254876, + "auxiliary_loss_mlp": 0.01042595, + "balance_loss_clip": 1.07040215, + "balance_loss_mlp": 1.03048944, + "epoch": 0.0933084831359346, + "flos": 24207275658240.0, + "grad_norm": 3.071658970648125, + "language_loss": 0.79991275, + "learning_rate": 3.9581750657997754e-06, + "loss": 0.82288754, + "num_input_tokens_seen": 16584900, + "step": 776, + "time_per_iteration": 2.5025649070739746 + }, + { + "auxiliary_loss_clip": 0.01219598, + "auxiliary_loss_mlp": 0.01041254, + "balance_loss_clip": 1.06089115, + "balance_loss_mlp": 1.02928531, + "epoch": 0.09342872602657368, + "flos": 25480272637440.0, + "grad_norm": 1.794459342957532, + "language_loss": 0.89512026, + "learning_rate": 3.95801644372669e-06, + "loss": 0.91772878, + "num_input_tokens_seen": 16604805, + "step": 777, + "time_per_iteration": 2.6097683906555176 + }, + { + "auxiliary_loss_clip": 0.01228991, + "auxiliary_loss_mlp": 0.01042257, + "balance_loss_clip": 1.06354618, + "balance_loss_mlp": 1.03002644, + "epoch": 0.09354896891721277, + "flos": 23149060053120.0, + "grad_norm": 5.673419719986686, + "language_loss": 0.84493625, + "learning_rate": 3.957857524624068e-06, + "loss": 0.86764872, + "num_input_tokens_seen": 16623685, + "step": 778, + "time_per_iteration": 2.5777955055236816 + }, + { + "auxiliary_loss_clip": 0.01222397, + "auxiliary_loss_mlp": 0.01040155, + "balance_loss_clip": 1.06543183, + "balance_loss_mlp": 1.02789474, + "epoch": 0.09366921180785186, + "flos": 24279563779200.0, + "grad_norm": 1.6986200296245706, + "language_loss": 0.89562863, + "learning_rate": 3.957698308516016e-06, + "loss": 0.91825414, + "num_input_tokens_seen": 16644985, + "step": 779, + "time_per_iteration": 2.5732223987579346 + }, + { + "auxiliary_loss_clip": 0.01237425, + "auxiliary_loss_mlp": 0.00766177, + "balance_loss_clip": 1.07013726, + "balance_loss_mlp": 1.00077105, + "epoch": 0.09378945469849095, + "flos": 18730036419840.0, + "grad_norm": 2.2625779936216275, + "language_loss": 0.824893, + "learning_rate": 3.957538795426688e-06, + "loss": 0.8449291, + "num_input_tokens_seen": 16662410, + "step": 780, + "time_per_iteration": 2.506226062774658 + }, + { + "auxiliary_loss_clip": 0.0122607, + "auxiliary_loss_mlp": 0.01045934, + "balance_loss_clip": 1.06595945, + "balance_loss_mlp": 1.03221917, + "epoch": 0.09390969758913004, + "flos": 23218834222080.0, + "grad_norm": 2.2477864029833183, + "language_loss": 0.77126551, + "learning_rate": 3.9573789853802804e-06, + "loss": 0.79398549, + "num_input_tokens_seen": 16680885, + "step": 781, + "time_per_iteration": 2.559307098388672 + }, + { + "auxiliary_loss_clip": 0.01225284, + "auxiliary_loss_mlp": 0.00765814, + "balance_loss_clip": 1.06880665, + "balance_loss_mlp": 1.00070679, + "epoch": 0.09402994047976913, + "flos": 19646728439040.0, + "grad_norm": 1.9767771341668208, + "language_loss": 0.74613297, + "learning_rate": 3.957218878401037e-06, + "loss": 0.7660439, + "num_input_tokens_seen": 16699375, + "step": 782, + "time_per_iteration": 2.5517327785491943 + }, + { + "auxiliary_loss_clip": 0.01257567, + "auxiliary_loss_mlp": 0.01047261, + "balance_loss_clip": 1.07200956, + "balance_loss_mlp": 1.03408229, + "epoch": 0.09415018337040823, + "flos": 29420463041280.0, + "grad_norm": 2.1437620966296707, + "language_loss": 0.89368534, + "learning_rate": 3.957058474513246e-06, + "loss": 0.91673362, + "num_input_tokens_seen": 16719230, + "step": 783, + "time_per_iteration": 2.5868775844573975 + }, + { + "auxiliary_loss_clip": 0.01234803, + "auxiliary_loss_mlp": 0.0104875, + "balance_loss_clip": 1.06807137, + "balance_loss_mlp": 1.03733563, + "epoch": 0.09427042626104731, + "flos": 24572092141440.0, + "grad_norm": 3.5834837309023344, + "language_loss": 0.78268135, + "learning_rate": 3.956897773741241e-06, + "loss": 0.8055169, + "num_input_tokens_seen": 16738220, + "step": 784, + "time_per_iteration": 2.5433714389801025 + }, + { + "auxiliary_loss_clip": 0.01212772, + "auxiliary_loss_mlp": 0.01047675, + "balance_loss_clip": 1.06348455, + "balance_loss_mlp": 1.03481197, + "epoch": 0.09439066915168641, + "flos": 26359581576960.0, + "grad_norm": 1.6634837131914642, + "language_loss": 0.71240383, + "learning_rate": 3.956736776109398e-06, + "loss": 0.73500836, + "num_input_tokens_seen": 16759395, + "step": 785, + "time_per_iteration": 2.5866026878356934 + }, + { + "auxiliary_loss_clip": 0.01228823, + "auxiliary_loss_mlp": 0.00766852, + "balance_loss_clip": 1.06277013, + "balance_loss_mlp": 1.00076914, + "epoch": 0.09451091204232549, + "flos": 19427278296960.0, + "grad_norm": 2.0351728452895785, + "language_loss": 0.83388215, + "learning_rate": 3.956575481642143e-06, + "loss": 0.85383892, + "num_input_tokens_seen": 16778285, + "step": 786, + "time_per_iteration": 2.525662660598755 + }, + { + "auxiliary_loss_clip": 0.01181237, + "auxiliary_loss_mlp": 0.01040343, + "balance_loss_clip": 1.05475426, + "balance_loss_mlp": 1.02783823, + "epoch": 0.09463115493296459, + "flos": 25368051571200.0, + "grad_norm": 2.7416563876490803, + "language_loss": 0.75036019, + "learning_rate": 3.956413890363943e-06, + "loss": 0.77257597, + "num_input_tokens_seen": 16795265, + "step": 787, + "time_per_iteration": 2.652031898498535 + }, + { + "auxiliary_loss_clip": 0.01236324, + "auxiliary_loss_mlp": 0.01041664, + "balance_loss_clip": 1.06740999, + "balance_loss_mlp": 1.02990389, + "epoch": 0.09475139782360369, + "flos": 10123254869760.0, + "grad_norm": 3.821818422601921, + "language_loss": 0.81739682, + "learning_rate": 3.956252002299312e-06, + "loss": 0.8401767, + "num_input_tokens_seen": 16811165, + "step": 788, + "time_per_iteration": 2.4628429412841797 + }, + { + "auxiliary_loss_clip": 0.01251632, + "auxiliary_loss_mlp": 0.01035573, + "balance_loss_clip": 1.06730688, + "balance_loss_mlp": 1.02336049, + "epoch": 0.09487164071424277, + "flos": 17231088936960.0, + "grad_norm": 3.9522762294152503, + "language_loss": 0.90402168, + "learning_rate": 3.956089817472807e-06, + "loss": 0.92689371, + "num_input_tokens_seen": 16828470, + "step": 789, + "time_per_iteration": 2.4374899864196777 + }, + { + "auxiliary_loss_clip": 0.01219635, + "auxiliary_loss_mlp": 0.01043595, + "balance_loss_clip": 1.06605983, + "balance_loss_mlp": 1.03218639, + "epoch": 0.09499188360488187, + "flos": 30849564528000.0, + "grad_norm": 2.345088621387017, + "language_loss": 0.85210741, + "learning_rate": 3.955927335909032e-06, + "loss": 0.87473971, + "num_input_tokens_seen": 16851680, + "step": 790, + "time_per_iteration": 2.735522508621216 + }, + { + "auxiliary_loss_clip": 0.01190145, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_clip": 1.06565571, + "balance_loss_mlp": 1.03503418, + "epoch": 0.09511212649552095, + "flos": 29351694453120.0, + "grad_norm": 2.6504001422862524, + "language_loss": 0.75852346, + "learning_rate": 3.955764557632634e-06, + "loss": 0.78089631, + "num_input_tokens_seen": 16871490, + "step": 791, + "time_per_iteration": 2.7065045833587646 + }, + { + "auxiliary_loss_clip": 0.01216542, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.06363356, + "balance_loss_mlp": 1.02718127, + "epoch": 0.09523236938616005, + "flos": 10378687461120.0, + "grad_norm": 3.278758335772562, + "language_loss": 0.94479316, + "learning_rate": 3.955601482668309e-06, + "loss": 0.96735156, + "num_input_tokens_seen": 16889350, + "step": 792, + "time_per_iteration": 3.3049941062927246 + }, + { + "auxiliary_loss_clip": 0.01187258, + "auxiliary_loss_mlp": 0.01046052, + "balance_loss_clip": 1.05763102, + "balance_loss_mlp": 1.03204489, + "epoch": 0.09535261227679913, + "flos": 19061815368960.0, + "grad_norm": 1.821926605507522, + "language_loss": 0.88490039, + "learning_rate": 3.955438111040794e-06, + "loss": 0.90723348, + "num_input_tokens_seen": 16907625, + "step": 793, + "time_per_iteration": 2.6063244342803955 + }, + { + "auxiliary_loss_clip": 0.01184835, + "auxiliary_loss_mlp": 0.01048702, + "balance_loss_clip": 1.05820382, + "balance_loss_mlp": 1.03694224, + "epoch": 0.09547285516743823, + "flos": 20922993555840.0, + "grad_norm": 2.2040469574264354, + "language_loss": 0.80448693, + "learning_rate": 3.955274442774873e-06, + "loss": 0.82682228, + "num_input_tokens_seen": 16926205, + "step": 794, + "time_per_iteration": 3.3855788707733154 + }, + { + "auxiliary_loss_clip": 0.01237275, + "auxiliary_loss_mlp": 0.01047146, + "balance_loss_clip": 1.06663167, + "balance_loss_mlp": 1.03401494, + "epoch": 0.09559309805807732, + "flos": 30154405639680.0, + "grad_norm": 2.2695723798782295, + "language_loss": 0.70146376, + "learning_rate": 3.9551104778953725e-06, + "loss": 0.72430795, + "num_input_tokens_seen": 16946500, + "step": 795, + "time_per_iteration": 2.6293785572052 + }, + { + "auxiliary_loss_clip": 0.01204365, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.05990207, + "balance_loss_mlp": 1.02117991, + "epoch": 0.0957133409487164, + "flos": 21066743784960.0, + "grad_norm": 1.9941457084103587, + "language_loss": 0.85289621, + "learning_rate": 3.954946216427167e-06, + "loss": 0.87527061, + "num_input_tokens_seen": 16966960, + "step": 796, + "time_per_iteration": 3.2828824520111084 + }, + { + "auxiliary_loss_clip": 0.01099254, + "auxiliary_loss_mlp": 0.01004542, + "balance_loss_clip": 1.02635622, + "balance_loss_mlp": 1.00146604, + "epoch": 0.0958335838393555, + "flos": 71297979315840.0, + "grad_norm": 0.8805545806861318, + "language_loss": 0.61583591, + "learning_rate": 3.954781658395176e-06, + "loss": 0.63687384, + "num_input_tokens_seen": 17023215, + "step": 797, + "time_per_iteration": 3.8980631828308105 + }, + { + "auxiliary_loss_clip": 0.01225323, + "auxiliary_loss_mlp": 0.01040145, + "balance_loss_clip": 1.06423926, + "balance_loss_mlp": 1.02766967, + "epoch": 0.09595382672999458, + "flos": 21872974504320.0, + "grad_norm": 1.910101585281029, + "language_loss": 0.9193927, + "learning_rate": 3.95461680382436e-06, + "loss": 0.94204736, + "num_input_tokens_seen": 17042140, + "step": 798, + "time_per_iteration": 2.5563690662384033 + }, + { + "auxiliary_loss_clip": 0.01239905, + "auxiliary_loss_mlp": 0.01043697, + "balance_loss_clip": 1.06849098, + "balance_loss_mlp": 1.03105474, + "epoch": 0.09607406962063368, + "flos": 18695562341760.0, + "grad_norm": 4.197895724760292, + "language_loss": 0.86319005, + "learning_rate": 3.9544516527397295e-06, + "loss": 0.88602608, + "num_input_tokens_seen": 17058490, + "step": 799, + "time_per_iteration": 2.4984312057495117 + }, + { + "auxiliary_loss_clip": 0.01207275, + "auxiliary_loss_mlp": 0.01037552, + "balance_loss_clip": 1.06363952, + "balance_loss_mlp": 1.02547038, + "epoch": 0.09619431251127276, + "flos": 22568456615040.0, + "grad_norm": 1.7152145166529265, + "language_loss": 0.80599344, + "learning_rate": 3.954286205166338e-06, + "loss": 0.82844168, + "num_input_tokens_seen": 17079655, + "step": 800, + "time_per_iteration": 2.580705165863037 + }, + { + "auxiliary_loss_clip": 0.01243863, + "auxiliary_loss_mlp": 0.01043047, + "balance_loss_clip": 1.0732553, + "balance_loss_mlp": 1.03022575, + "epoch": 0.09631455540191186, + "flos": 14246230608000.0, + "grad_norm": 2.671843406898212, + "language_loss": 0.8359642, + "learning_rate": 3.954120461129282e-06, + "loss": 0.85883325, + "num_input_tokens_seen": 17097065, + "step": 801, + "time_per_iteration": 2.466104507446289 + }, + { + "auxiliary_loss_clip": 0.01256136, + "auxiliary_loss_mlp": 0.0104693, + "balance_loss_clip": 1.07195854, + "balance_loss_mlp": 1.03516459, + "epoch": 0.09643479829255096, + "flos": 20740387789440.0, + "grad_norm": 1.8668756835953102, + "language_loss": 0.83946806, + "learning_rate": 3.953954420653706e-06, + "loss": 0.86249876, + "num_input_tokens_seen": 17114090, + "step": 802, + "time_per_iteration": 2.4702770709991455 + }, + { + "auxiliary_loss_clip": 0.01234986, + "auxiliary_loss_mlp": 0.01041049, + "balance_loss_clip": 1.06834829, + "balance_loss_mlp": 1.02944434, + "epoch": 0.09655504118319004, + "flos": 24420476833920.0, + "grad_norm": 2.102826947959934, + "language_loss": 0.88187945, + "learning_rate": 3.953788083764798e-06, + "loss": 0.90463984, + "num_input_tokens_seen": 17133325, + "step": 803, + "time_per_iteration": 2.5209147930145264 + }, + { + "auxiliary_loss_clip": 0.01189015, + "auxiliary_loss_mlp": 0.01045762, + "balance_loss_clip": 1.06236613, + "balance_loss_mlp": 1.03398395, + "epoch": 0.09667528407382914, + "flos": 18441961344000.0, + "grad_norm": 2.2260721412451323, + "language_loss": 0.9200899, + "learning_rate": 3.953621450487792e-06, + "loss": 0.94243765, + "num_input_tokens_seen": 17151945, + "step": 804, + "time_per_iteration": 2.5669755935668945 + }, + { + "auxiliary_loss_clip": 0.0113834, + "auxiliary_loss_mlp": 0.01007151, + "balance_loss_clip": 1.02769732, + "balance_loss_mlp": 1.00413549, + "epoch": 0.09679552696446822, + "flos": 70816455544320.0, + "grad_norm": 0.8396165910466719, + "language_loss": 0.61205292, + "learning_rate": 3.953454520847964e-06, + "loss": 0.63350785, + "num_input_tokens_seen": 17216790, + "step": 805, + "time_per_iteration": 3.1807608604431152 + }, + { + "auxiliary_loss_clip": 0.01217701, + "auxiliary_loss_mlp": 0.01045562, + "balance_loss_clip": 1.06531501, + "balance_loss_mlp": 1.03110206, + "epoch": 0.09691576985510732, + "flos": 21945514020480.0, + "grad_norm": 2.0425662717116975, + "language_loss": 0.74070203, + "learning_rate": 3.9532872948706395e-06, + "loss": 0.76333463, + "num_input_tokens_seen": 17236285, + "step": 806, + "time_per_iteration": 2.548994779586792 + }, + { + "auxiliary_loss_clip": 0.01222778, + "auxiliary_loss_mlp": 0.01045693, + "balance_loss_clip": 1.06582189, + "balance_loss_mlp": 1.03281283, + "epoch": 0.09703601274574641, + "flos": 17965211103360.0, + "grad_norm": 2.4286220447738653, + "language_loss": 0.82728553, + "learning_rate": 3.9531197725811845e-06, + "loss": 0.84997028, + "num_input_tokens_seen": 17251670, + "step": 807, + "time_per_iteration": 2.507479190826416 + }, + { + "auxiliary_loss_clip": 0.01253427, + "auxiliary_loss_mlp": 0.01050426, + "balance_loss_clip": 1.07279372, + "balance_loss_mlp": 1.03809416, + "epoch": 0.0971562556363855, + "flos": 22162162901760.0, + "grad_norm": 2.6850121768617816, + "language_loss": 0.88039207, + "learning_rate": 3.952951954005013e-06, + "loss": 0.90343058, + "num_input_tokens_seen": 17271355, + "step": 808, + "time_per_iteration": 2.4892542362213135 + }, + { + "auxiliary_loss_clip": 0.01217134, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.06101489, + "balance_loss_mlp": 1.02496886, + "epoch": 0.0972764985270246, + "flos": 25848716394240.0, + "grad_norm": 1.8231963722908722, + "language_loss": 0.84561324, + "learning_rate": 3.952783839167584e-06, + "loss": 0.86814618, + "num_input_tokens_seen": 17291400, + "step": 809, + "time_per_iteration": 2.5775306224823 + }, + { + "auxiliary_loss_clip": 0.01236344, + "auxiliary_loss_mlp": 0.01049684, + "balance_loss_clip": 1.06806099, + "balance_loss_mlp": 1.03743494, + "epoch": 0.09739674141766368, + "flos": 20339373375360.0, + "grad_norm": 2.7183259182901116, + "language_loss": 0.74613631, + "learning_rate": 3.952615428094398e-06, + "loss": 0.7689966, + "num_input_tokens_seen": 17310920, + "step": 810, + "time_per_iteration": 2.5210890769958496 + }, + { + "auxiliary_loss_clip": 0.01181999, + "auxiliary_loss_mlp": 0.01045819, + "balance_loss_clip": 1.05705833, + "balance_loss_mlp": 1.03359997, + "epoch": 0.09751698430830277, + "flos": 15743059188480.0, + "grad_norm": 1.7593387970309045, + "language_loss": 0.73458415, + "learning_rate": 3.952446720811004e-06, + "loss": 0.75686234, + "num_input_tokens_seen": 17329245, + "step": 811, + "time_per_iteration": 2.570770740509033 + }, + { + "auxiliary_loss_clip": 0.01092132, + "auxiliary_loss_mlp": 0.01002644, + "balance_loss_clip": 1.02120626, + "balance_loss_mlp": 0.99973536, + "epoch": 0.09763722719894186, + "flos": 63716806800000.0, + "grad_norm": 0.869263627588703, + "language_loss": 0.63636553, + "learning_rate": 3.952277717342995e-06, + "loss": 0.65731323, + "num_input_tokens_seen": 17395680, + "step": 812, + "time_per_iteration": 3.226670265197754 + }, + { + "auxiliary_loss_clip": 0.01226064, + "auxiliary_loss_mlp": 0.01044316, + "balance_loss_clip": 1.06756055, + "balance_loss_mlp": 1.03185868, + "epoch": 0.09775747008958095, + "flos": 22090916275200.0, + "grad_norm": 2.520811823871733, + "language_loss": 0.85380816, + "learning_rate": 3.952108417716009e-06, + "loss": 0.87651193, + "num_input_tokens_seen": 17415135, + "step": 813, + "time_per_iteration": 2.5579311847686768 + }, + { + "auxiliary_loss_clip": 0.01238989, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.07047713, + "balance_loss_mlp": 1.0280019, + "epoch": 0.09787771298022005, + "flos": 21286050272640.0, + "grad_norm": 1.8465100699645431, + "language_loss": 0.84934223, + "learning_rate": 3.951938821955727e-06, + "loss": 0.87213624, + "num_input_tokens_seen": 17434535, + "step": 814, + "time_per_iteration": 2.528773069381714 + }, + { + "auxiliary_loss_clip": 0.01221234, + "auxiliary_loss_mlp": 0.01049105, + "balance_loss_clip": 1.06754315, + "balance_loss_mlp": 1.03543782, + "epoch": 0.09799795587085913, + "flos": 22054574689920.0, + "grad_norm": 1.5561905640613862, + "language_loss": 0.76486707, + "learning_rate": 3.9517689300878786e-06, + "loss": 0.78757048, + "num_input_tokens_seen": 17454270, + "step": 815, + "time_per_iteration": 2.542106866836548 + }, + { + "auxiliary_loss_clip": 0.01249141, + "auxiliary_loss_mlp": 0.01045917, + "balance_loss_clip": 1.06687069, + "balance_loss_mlp": 1.03345346, + "epoch": 0.09811819876149823, + "flos": 22163743100160.0, + "grad_norm": 1.9605332881327175, + "language_loss": 0.78879702, + "learning_rate": 3.951598742138236e-06, + "loss": 0.81174755, + "num_input_tokens_seen": 17472995, + "step": 816, + "time_per_iteration": 2.473940372467041 + }, + { + "auxiliary_loss_clip": 0.01222896, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_clip": 1.06064057, + "balance_loss_mlp": 1.0332675, + "epoch": 0.09823844165213731, + "flos": 22231111057920.0, + "grad_norm": 2.821344959223831, + "language_loss": 0.79805344, + "learning_rate": 3.951428258132615e-06, + "loss": 0.82073289, + "num_input_tokens_seen": 17491115, + "step": 817, + "time_per_iteration": 2.5387370586395264 + }, + { + "auxiliary_loss_clip": 0.0122159, + "auxiliary_loss_mlp": 0.01043504, + "balance_loss_clip": 1.06785917, + "balance_loss_mlp": 1.03149962, + "epoch": 0.09835868454277641, + "flos": 22487728798080.0, + "grad_norm": 1.9577481338577645, + "language_loss": 0.84298956, + "learning_rate": 3.951257478096879e-06, + "loss": 0.86564052, + "num_input_tokens_seen": 17509480, + "step": 818, + "time_per_iteration": 2.520198345184326 + }, + { + "auxiliary_loss_clip": 0.01224379, + "auxiliary_loss_mlp": 0.00766859, + "balance_loss_clip": 1.06874049, + "balance_loss_mlp": 1.00054383, + "epoch": 0.0984789274334155, + "flos": 16362554077440.0, + "grad_norm": 3.8288373417849595, + "language_loss": 0.68771029, + "learning_rate": 3.951086402056936e-06, + "loss": 0.70762265, + "num_input_tokens_seen": 17524080, + "step": 819, + "time_per_iteration": 3.2473840713500977 + }, + { + "auxiliary_loss_clip": 0.01152496, + "auxiliary_loss_mlp": 0.00766319, + "balance_loss_clip": 1.0603348, + "balance_loss_mlp": 1.0005064, + "epoch": 0.09859917032405459, + "flos": 24243545416320.0, + "grad_norm": 1.676358966475091, + "language_loss": 0.83690858, + "learning_rate": 3.950915030038735e-06, + "loss": 0.85609674, + "num_input_tokens_seen": 17543875, + "step": 820, + "time_per_iteration": 2.7670979499816895 + }, + { + "auxiliary_loss_clip": 0.01233445, + "auxiliary_loss_mlp": 0.01043418, + "balance_loss_clip": 1.06939077, + "balance_loss_mlp": 1.03159237, + "epoch": 0.09871941321469369, + "flos": 17420195064960.0, + "grad_norm": 2.103521736257372, + "language_loss": 0.83618629, + "learning_rate": 3.9507433620682765e-06, + "loss": 0.85895491, + "num_input_tokens_seen": 17560810, + "step": 821, + "time_per_iteration": 3.399217128753662 + }, + { + "auxiliary_loss_clip": 0.01203531, + "auxiliary_loss_mlp": 0.01038226, + "balance_loss_clip": 1.0627929, + "balance_loss_mlp": 1.02589369, + "epoch": 0.09883965610533277, + "flos": 28477341590400.0, + "grad_norm": 1.9805884995132614, + "language_loss": 0.88129181, + "learning_rate": 3.9505713981716e-06, + "loss": 0.90370941, + "num_input_tokens_seen": 17583640, + "step": 822, + "time_per_iteration": 2.605008840560913 + }, + { + "auxiliary_loss_clip": 0.01217212, + "auxiliary_loss_mlp": 0.01038019, + "balance_loss_clip": 1.06713915, + "balance_loss_mlp": 1.02684927, + "epoch": 0.09895989899597187, + "flos": 23693932437120.0, + "grad_norm": 1.8357779514047379, + "language_loss": 0.80996335, + "learning_rate": 3.950399138374795e-06, + "loss": 0.83251566, + "num_input_tokens_seen": 17602720, + "step": 823, + "time_per_iteration": 4.076177358627319 + }, + { + "auxiliary_loss_clip": 0.01232884, + "auxiliary_loss_mlp": 0.01048483, + "balance_loss_clip": 1.06617749, + "balance_loss_mlp": 1.03576291, + "epoch": 0.09908014188661095, + "flos": 24679608526080.0, + "grad_norm": 3.3281285355771524, + "language_loss": 0.74085295, + "learning_rate": 3.95022658270399e-06, + "loss": 0.76366657, + "num_input_tokens_seen": 17623085, + "step": 824, + "time_per_iteration": 2.5398151874542236 + }, + { + "auxiliary_loss_clip": 0.01215985, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.06668115, + "balance_loss_mlp": 1.02669966, + "epoch": 0.09920038477725004, + "flos": 14064307200000.0, + "grad_norm": 1.8233563132599488, + "language_loss": 0.77905917, + "learning_rate": 3.9500537311853635e-06, + "loss": 0.80159867, + "num_input_tokens_seen": 17641040, + "step": 825, + "time_per_iteration": 2.499595880508423 + }, + { + "auxiliary_loss_clip": 0.01231545, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.06278777, + "balance_loss_mlp": 1.02891064, + "epoch": 0.09932062766788914, + "flos": 13407070095360.0, + "grad_norm": 13.027428302364832, + "language_loss": 0.83361077, + "learning_rate": 3.949880583845136e-06, + "loss": 0.85634267, + "num_input_tokens_seen": 17659115, + "step": 826, + "time_per_iteration": 2.4805374145507812 + }, + { + "auxiliary_loss_clip": 0.01217244, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.06459403, + "balance_loss_mlp": 1.02172256, + "epoch": 0.09944087055852822, + "flos": 19500751566720.0, + "grad_norm": 1.986738412928158, + "language_loss": 0.81363928, + "learning_rate": 3.949707140709575e-06, + "loss": 0.83614516, + "num_input_tokens_seen": 17678845, + "step": 827, + "time_per_iteration": 2.5281829833984375 + }, + { + "auxiliary_loss_clip": 0.01235907, + "auxiliary_loss_mlp": 0.01042023, + "balance_loss_clip": 1.06530321, + "balance_loss_mlp": 1.02948236, + "epoch": 0.09956111344916732, + "flos": 17749100926080.0, + "grad_norm": 2.3391331143034515, + "language_loss": 0.83224058, + "learning_rate": 3.949533401804991e-06, + "loss": 0.85501987, + "num_input_tokens_seen": 17695750, + "step": 828, + "time_per_iteration": 2.4740121364593506 + }, + { + "auxiliary_loss_clip": 0.01232491, + "auxiliary_loss_mlp": 0.00766622, + "balance_loss_clip": 1.06679964, + "balance_loss_mlp": 1.00031209, + "epoch": 0.0996813563398064, + "flos": 17967581400960.0, + "grad_norm": 1.982528679544394, + "language_loss": 0.9047035, + "learning_rate": 3.949359367157739e-06, + "loss": 0.92469466, + "num_input_tokens_seen": 17714445, + "step": 829, + "time_per_iteration": 2.4768590927124023 + }, + { + "auxiliary_loss_clip": 0.01238318, + "auxiliary_loss_mlp": 0.01041941, + "balance_loss_clip": 1.06877434, + "balance_loss_mlp": 1.02950788, + "epoch": 0.0998015992304455, + "flos": 17457039440640.0, + "grad_norm": 2.117809889068935, + "language_loss": 0.75705749, + "learning_rate": 3.949185036794222e-06, + "loss": 0.77986014, + "num_input_tokens_seen": 17732455, + "step": 830, + "time_per_iteration": 2.4751312732696533 + }, + { + "auxiliary_loss_clip": 0.01248348, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.06953096, + "balance_loss_mlp": 1.03255558, + "epoch": 0.0999218421210846, + "flos": 25888757080320.0, + "grad_norm": 1.70849214052196, + "language_loss": 0.78836644, + "learning_rate": 3.949010410740884e-06, + "loss": 0.81128788, + "num_input_tokens_seen": 17755280, + "step": 831, + "time_per_iteration": 2.549265146255493 + }, + { + "auxiliary_loss_clip": 0.01209222, + "auxiliary_loss_mlp": 0.00766058, + "balance_loss_clip": 1.06251395, + "balance_loss_mlp": 1.00030386, + "epoch": 0.10004208501172368, + "flos": 21215916967680.0, + "grad_norm": 1.8259927493770916, + "language_loss": 0.86512977, + "learning_rate": 3.948835489024216e-06, + "loss": 0.88488257, + "num_input_tokens_seen": 17775015, + "step": 832, + "time_per_iteration": 2.5604610443115234 + }, + { + "auxiliary_loss_clip": 0.01235082, + "auxiliary_loss_mlp": 0.01039964, + "balance_loss_clip": 1.06599545, + "balance_loss_mlp": 1.02811503, + "epoch": 0.10016232790236278, + "flos": 17348409734400.0, + "grad_norm": 2.047958114028037, + "language_loss": 0.90334547, + "learning_rate": 3.948660271670755e-06, + "loss": 0.92609596, + "num_input_tokens_seen": 17792165, + "step": 833, + "time_per_iteration": 2.498408079147339 + }, + { + "auxiliary_loss_clip": 0.01213523, + "auxiliary_loss_mlp": 0.01041586, + "balance_loss_clip": 1.06349277, + "balance_loss_mlp": 1.03021336, + "epoch": 0.10028257079300186, + "flos": 25666541591040.0, + "grad_norm": 2.266923399880475, + "language_loss": 0.84455311, + "learning_rate": 3.948484758707079e-06, + "loss": 0.86710417, + "num_input_tokens_seen": 17811765, + "step": 834, + "time_per_iteration": 2.5506155490875244 + }, + { + "auxiliary_loss_clip": 0.0119298, + "auxiliary_loss_mlp": 0.01040153, + "balance_loss_clip": 1.05855405, + "balance_loss_mlp": 1.02745771, + "epoch": 0.10040281368364096, + "flos": 25156035544320.0, + "grad_norm": 2.9518600229530056, + "language_loss": 0.83683074, + "learning_rate": 3.948308950159815e-06, + "loss": 0.85916209, + "num_input_tokens_seen": 17830445, + "step": 835, + "time_per_iteration": 2.582674026489258 + }, + { + "auxiliary_loss_clip": 0.01198353, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.06008875, + "balance_loss_mlp": 1.03329325, + "epoch": 0.10052305657428004, + "flos": 17603303621760.0, + "grad_norm": 2.501693931717635, + "language_loss": 0.76091027, + "learning_rate": 3.9481328460556326e-06, + "loss": 0.78336287, + "num_input_tokens_seen": 17847665, + "step": 836, + "time_per_iteration": 2.542973041534424 + }, + { + "auxiliary_loss_clip": 0.01208769, + "auxiliary_loss_mlp": 0.01035301, + "balance_loss_clip": 1.06150818, + "balance_loss_mlp": 1.02336192, + "epoch": 0.10064329946491914, + "flos": 18660154510080.0, + "grad_norm": 2.1049010832716846, + "language_loss": 0.8991791, + "learning_rate": 3.9479564464212455e-06, + "loss": 0.92161977, + "num_input_tokens_seen": 17866825, + "step": 837, + "time_per_iteration": 2.504906415939331 + }, + { + "auxiliary_loss_clip": 0.01253113, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.06866026, + "balance_loss_mlp": 1.02998614, + "epoch": 0.10076354235555823, + "flos": 17199056983680.0, + "grad_norm": 2.75679295476661, + "language_loss": 0.7619561, + "learning_rate": 3.947779751283414e-06, + "loss": 0.78490418, + "num_input_tokens_seen": 17883995, + "step": 838, + "time_per_iteration": 2.4738874435424805 + }, + { + "auxiliary_loss_clip": 0.01237746, + "auxiliary_loss_mlp": 0.00766752, + "balance_loss_clip": 1.07302344, + "balance_loss_mlp": 1.00020003, + "epoch": 0.10088378524619732, + "flos": 22962252395520.0, + "grad_norm": 1.8520043887728734, + "language_loss": 0.75838196, + "learning_rate": 3.947602760668944e-06, + "loss": 0.77842695, + "num_input_tokens_seen": 17903785, + "step": 839, + "time_per_iteration": 2.547327995300293 + }, + { + "auxiliary_loss_clip": 0.01235129, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_clip": 1.07093203, + "balance_loss_mlp": 1.03028524, + "epoch": 0.10100402813683641, + "flos": 37885828746240.0, + "grad_norm": 2.4264769427943085, + "language_loss": 0.71322119, + "learning_rate": 3.947425474604684e-06, + "loss": 0.73599929, + "num_input_tokens_seen": 17927720, + "step": 840, + "time_per_iteration": 2.6270503997802734 + }, + { + "auxiliary_loss_clip": 0.01217603, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.0648315, + "balance_loss_mlp": 1.03383827, + "epoch": 0.1011242710274755, + "flos": 21543458112000.0, + "grad_norm": 2.0899831638047175, + "language_loss": 0.92296994, + "learning_rate": 3.947247893117528e-06, + "loss": 0.94560218, + "num_input_tokens_seen": 17946225, + "step": 841, + "time_per_iteration": 2.5200583934783936 + }, + { + "auxiliary_loss_clip": 0.01230108, + "auxiliary_loss_mlp": 0.01049382, + "balance_loss_clip": 1.06502783, + "balance_loss_mlp": 1.03687727, + "epoch": 0.10124451391811459, + "flos": 13621456419840.0, + "grad_norm": 3.528152303667486, + "language_loss": 0.6993745, + "learning_rate": 3.947070016234413e-06, + "loss": 0.7221694, + "num_input_tokens_seen": 17962015, + "step": 842, + "time_per_iteration": 2.450657367706299 + }, + { + "auxiliary_loss_clip": 0.01229082, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.06893182, + "balance_loss_mlp": 1.02967286, + "epoch": 0.10136475680875369, + "flos": 16649228522880.0, + "grad_norm": 2.094222864522228, + "language_loss": 0.74779224, + "learning_rate": 3.946891843982326e-06, + "loss": 0.77050751, + "num_input_tokens_seen": 17979680, + "step": 843, + "time_per_iteration": 2.48270583152771 + }, + { + "auxiliary_loss_clip": 0.01236703, + "auxiliary_loss_mlp": 0.01044577, + "balance_loss_clip": 1.06989002, + "balance_loss_mlp": 1.03185105, + "epoch": 0.10148499969939277, + "flos": 19461034103040.0, + "grad_norm": 3.0638419308156073, + "language_loss": 0.74668109, + "learning_rate": 3.9467133763882935e-06, + "loss": 0.76949388, + "num_input_tokens_seen": 17998145, + "step": 844, + "time_per_iteration": 2.481452465057373 + }, + { + "auxiliary_loss_clip": 0.01222899, + "auxiliary_loss_mlp": 0.01047146, + "balance_loss_clip": 1.06495953, + "balance_loss_mlp": 1.03432536, + "epoch": 0.10160524259003187, + "flos": 21104988791040.0, + "grad_norm": 2.260430119844034, + "language_loss": 0.86395693, + "learning_rate": 3.9465346134793905e-06, + "loss": 0.88665736, + "num_input_tokens_seen": 18017955, + "step": 845, + "time_per_iteration": 3.3097543716430664 + }, + { + "auxiliary_loss_clip": 0.01206115, + "auxiliary_loss_mlp": 0.01040574, + "balance_loss_clip": 1.0664047, + "balance_loss_mlp": 1.02873015, + "epoch": 0.10172548548067095, + "flos": 17712687513600.0, + "grad_norm": 2.3329650908996244, + "language_loss": 0.7954011, + "learning_rate": 3.9463555552827335e-06, + "loss": 0.81786799, + "num_input_tokens_seen": 18035125, + "step": 846, + "time_per_iteration": 2.550278902053833 + }, + { + "auxiliary_loss_clip": 0.0122311, + "auxiliary_loss_mlp": 0.0104715, + "balance_loss_clip": 1.06524265, + "balance_loss_mlp": 1.03494895, + "epoch": 0.10184572837131005, + "flos": 21104845136640.0, + "grad_norm": 2.5158626605742116, + "language_loss": 0.86208236, + "learning_rate": 3.946176201825487e-06, + "loss": 0.884785, + "num_input_tokens_seen": 18053160, + "step": 847, + "time_per_iteration": 2.4945614337921143 + }, + { + "auxiliary_loss_clip": 0.01220788, + "auxiliary_loss_mlp": 0.01046906, + "balance_loss_clip": 1.06840253, + "balance_loss_mlp": 1.03463936, + "epoch": 0.10196597126194913, + "flos": 26067591918720.0, + "grad_norm": 1.8885339325241959, + "language_loss": 0.8348856, + "learning_rate": 3.9459965531348575e-06, + "loss": 0.85756254, + "num_input_tokens_seen": 18072815, + "step": 848, + "time_per_iteration": 3.2872347831726074 + }, + { + "auxiliary_loss_clip": 0.01218846, + "auxiliary_loss_mlp": 0.00766734, + "balance_loss_clip": 1.06589198, + "balance_loss_mlp": 1.00030899, + "epoch": 0.10208621415258823, + "flos": 29314634595840.0, + "grad_norm": 4.67342942996785, + "language_loss": 0.85668486, + "learning_rate": 3.945816609238098e-06, + "loss": 0.87654066, + "num_input_tokens_seen": 18092225, + "step": 849, + "time_per_iteration": 2.5906620025634766 + }, + { + "auxiliary_loss_clip": 0.01178674, + "auxiliary_loss_mlp": 0.01045593, + "balance_loss_clip": 1.06167364, + "balance_loss_mlp": 1.03318882, + "epoch": 0.10220645704322733, + "flos": 23805794367360.0, + "grad_norm": 2.0895911474021847, + "language_loss": 0.85123843, + "learning_rate": 3.945636370162507e-06, + "loss": 0.87348109, + "num_input_tokens_seen": 18112335, + "step": 850, + "time_per_iteration": 4.191660404205322 + }, + { + "auxiliary_loss_clip": 0.01232866, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.06856322, + "balance_loss_mlp": 1.03477764, + "epoch": 0.10232669993386641, + "flos": 23218546913280.0, + "grad_norm": 1.9205845741295278, + "language_loss": 0.79014564, + "learning_rate": 3.945455835935425e-06, + "loss": 0.81293976, + "num_input_tokens_seen": 18131520, + "step": 851, + "time_per_iteration": 2.5060811042785645 + }, + { + "auxiliary_loss_clip": 0.01219658, + "auxiliary_loss_mlp": 0.01046121, + "balance_loss_clip": 1.06509399, + "balance_loss_mlp": 1.03437912, + "epoch": 0.1024469428245055, + "flos": 22922929981440.0, + "grad_norm": 2.276959933066672, + "language_loss": 0.7532922, + "learning_rate": 3.94527500658424e-06, + "loss": 0.77594995, + "num_input_tokens_seen": 18149185, + "step": 852, + "time_per_iteration": 2.524010419845581 + }, + { + "auxiliary_loss_clip": 0.01188362, + "auxiliary_loss_mlp": 0.01042034, + "balance_loss_clip": 1.06221771, + "balance_loss_mlp": 1.03033328, + "epoch": 0.10256718571514459, + "flos": 31359495957120.0, + "grad_norm": 1.9643853399802422, + "language_loss": 0.81112975, + "learning_rate": 3.945093882136382e-06, + "loss": 0.83343375, + "num_input_tokens_seen": 18172960, + "step": 853, + "time_per_iteration": 2.685577392578125 + }, + { + "auxiliary_loss_clip": 0.01216966, + "auxiliary_loss_mlp": 0.00765281, + "balance_loss_clip": 1.06781578, + "balance_loss_mlp": 1.0003463, + "epoch": 0.10268742860578368, + "flos": 23474877344640.0, + "grad_norm": 2.1739199952084522, + "language_loss": 0.84654075, + "learning_rate": 3.944912462619329e-06, + "loss": 0.86636317, + "num_input_tokens_seen": 18191925, + "step": 854, + "time_per_iteration": 2.5390515327453613 + }, + { + "auxiliary_loss_clip": 0.01221521, + "auxiliary_loss_mlp": 0.01051359, + "balance_loss_clip": 1.06616735, + "balance_loss_mlp": 1.03851414, + "epoch": 0.10280767149642277, + "flos": 25520313323520.0, + "grad_norm": 6.081964231014027, + "language_loss": 0.80544841, + "learning_rate": 3.9447307480606025e-06, + "loss": 0.82817721, + "num_input_tokens_seen": 18212010, + "step": 855, + "time_per_iteration": 2.571157932281494 + }, + { + "auxiliary_loss_clip": 0.01213772, + "auxiliary_loss_mlp": 0.01043335, + "balance_loss_clip": 1.06505132, + "balance_loss_mlp": 1.03023934, + "epoch": 0.10292791438706186, + "flos": 17347691462400.0, + "grad_norm": 2.305262285744273, + "language_loss": 0.90197009, + "learning_rate": 3.944548738487767e-06, + "loss": 0.92454112, + "num_input_tokens_seen": 18229525, + "step": 856, + "time_per_iteration": 2.532919406890869 + }, + { + "auxiliary_loss_clip": 0.01257345, + "auxiliary_loss_mlp": 0.01042076, + "balance_loss_clip": 1.0741092, + "balance_loss_mlp": 1.03098345, + "epoch": 0.10304815727770096, + "flos": 27052693390080.0, + "grad_norm": 2.0155284436296412, + "language_loss": 0.90769422, + "learning_rate": 3.944366433928434e-06, + "loss": 0.9306885, + "num_input_tokens_seen": 18249505, + "step": 857, + "time_per_iteration": 2.5007433891296387 + }, + { + "auxiliary_loss_clip": 0.01212342, + "auxiliary_loss_mlp": 0.01044746, + "balance_loss_clip": 1.06194758, + "balance_loss_mlp": 1.03280711, + "epoch": 0.10316840016834004, + "flos": 22782591544320.0, + "grad_norm": 1.5723608132834708, + "language_loss": 0.83530712, + "learning_rate": 3.9441838344102594e-06, + "loss": 0.85787797, + "num_input_tokens_seen": 18269230, + "step": 858, + "time_per_iteration": 2.556318521499634 + }, + { + "auxiliary_loss_clip": 0.01225918, + "auxiliary_loss_mlp": 0.01043718, + "balance_loss_clip": 1.06795287, + "balance_loss_mlp": 1.03220236, + "epoch": 0.10328864305897914, + "flos": 20704584908160.0, + "grad_norm": 2.096390194824153, + "language_loss": 0.66827232, + "learning_rate": 3.944000939960943e-06, + "loss": 0.69096863, + "num_input_tokens_seen": 18287955, + "step": 859, + "time_per_iteration": 2.523285388946533 + }, + { + "auxiliary_loss_clip": 0.01238517, + "auxiliary_loss_mlp": 0.01040144, + "balance_loss_clip": 1.06627941, + "balance_loss_mlp": 1.02915907, + "epoch": 0.10340888594961822, + "flos": 28478814048000.0, + "grad_norm": 1.5824757793222501, + "language_loss": 0.80100703, + "learning_rate": 3.943817750608229e-06, + "loss": 0.82379365, + "num_input_tokens_seen": 18310505, + "step": 860, + "time_per_iteration": 2.5982556343078613 + }, + { + "auxiliary_loss_clip": 0.01238712, + "auxiliary_loss_mlp": 0.01038222, + "balance_loss_clip": 1.07022858, + "balance_loss_mlp": 1.02687979, + "epoch": 0.10352912884025732, + "flos": 13370333460480.0, + "grad_norm": 2.4054679788362585, + "language_loss": 0.82108086, + "learning_rate": 3.943634266379908e-06, + "loss": 0.84385026, + "num_input_tokens_seen": 18327400, + "step": 861, + "time_per_iteration": 2.467580795288086 + }, + { + "auxiliary_loss_clip": 0.01236931, + "auxiliary_loss_mlp": 0.01036532, + "balance_loss_clip": 1.06675553, + "balance_loss_mlp": 1.0251596, + "epoch": 0.10364937173089642, + "flos": 25558558329600.0, + "grad_norm": 1.6701505568462782, + "language_loss": 0.84833217, + "learning_rate": 3.943450487303815e-06, + "loss": 0.87106681, + "num_input_tokens_seen": 18347895, + "step": 862, + "time_per_iteration": 2.546888828277588 + }, + { + "auxiliary_loss_clip": 0.01232347, + "auxiliary_loss_mlp": 0.01038291, + "balance_loss_clip": 1.06823516, + "balance_loss_mlp": 1.02666831, + "epoch": 0.1037696146215355, + "flos": 21215486004480.0, + "grad_norm": 1.869525960389969, + "language_loss": 0.85454553, + "learning_rate": 3.943266413407827e-06, + "loss": 0.87725186, + "num_input_tokens_seen": 18367170, + "step": 863, + "time_per_iteration": 2.49277663230896 + }, + { + "auxiliary_loss_clip": 0.01236144, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.06892049, + "balance_loss_mlp": 1.03200555, + "epoch": 0.1038898575121746, + "flos": 25807382818560.0, + "grad_norm": 2.045822115293207, + "language_loss": 0.85146117, + "learning_rate": 3.94308204471987e-06, + "loss": 0.87426102, + "num_input_tokens_seen": 18386185, + "step": 864, + "time_per_iteration": 2.5365495681762695 + }, + { + "auxiliary_loss_clip": 0.01205001, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.06338847, + "balance_loss_mlp": 1.02070975, + "epoch": 0.10401010040281368, + "flos": 19062425900160.0, + "grad_norm": 2.6842382579478725, + "language_loss": 0.745372, + "learning_rate": 3.942897381267912e-06, + "loss": 0.76775169, + "num_input_tokens_seen": 18402550, + "step": 865, + "time_per_iteration": 2.553131580352783 + }, + { + "auxiliary_loss_clip": 0.01240881, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.07054615, + "balance_loss_mlp": 1.02494645, + "epoch": 0.10413034329345278, + "flos": 16355119962240.0, + "grad_norm": 2.3162175265401297, + "language_loss": 0.66140783, + "learning_rate": 3.942712423079965e-06, + "loss": 0.68418097, + "num_input_tokens_seen": 18418940, + "step": 866, + "time_per_iteration": 2.4716644287109375 + }, + { + "auxiliary_loss_clip": 0.01182991, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.05337632, + "balance_loss_mlp": 1.02561069, + "epoch": 0.10425058618409186, + "flos": 17236511890560.0, + "grad_norm": 2.2408347178117145, + "language_loss": 0.90118945, + "learning_rate": 3.942527170184088e-06, + "loss": 0.92337745, + "num_input_tokens_seen": 18435560, + "step": 867, + "time_per_iteration": 2.5301923751831055 + }, + { + "auxiliary_loss_clip": 0.01252096, + "auxiliary_loss_mlp": 0.01042189, + "balance_loss_clip": 1.07120979, + "balance_loss_mlp": 1.03015506, + "epoch": 0.10437082907473096, + "flos": 17967365919360.0, + "grad_norm": 2.972293531213243, + "language_loss": 0.77088869, + "learning_rate": 3.942341622608385e-06, + "loss": 0.79383159, + "num_input_tokens_seen": 18452590, + "step": 868, + "time_per_iteration": 2.451101064682007 + }, + { + "auxiliary_loss_clip": 0.0122329, + "auxiliary_loss_mlp": 0.01043138, + "balance_loss_clip": 1.07100654, + "balance_loss_mlp": 1.03153348, + "epoch": 0.10449107196537005, + "flos": 36283315374720.0, + "grad_norm": 1.5827540375668958, + "language_loss": 0.7782805, + "learning_rate": 3.942155780381001e-06, + "loss": 0.80094481, + "num_input_tokens_seen": 18476325, + "step": 869, + "time_per_iteration": 2.6757583618164062 + }, + { + "auxiliary_loss_clip": 0.01218341, + "auxiliary_loss_mlp": 0.01040181, + "balance_loss_clip": 1.06285739, + "balance_loss_mlp": 1.02814102, + "epoch": 0.10461131485600914, + "flos": 23802095266560.0, + "grad_norm": 1.7915182221294372, + "language_loss": 0.75789928, + "learning_rate": 3.94196964353013e-06, + "loss": 0.78048456, + "num_input_tokens_seen": 18495775, + "step": 870, + "time_per_iteration": 2.552426815032959 + }, + { + "auxiliary_loss_clip": 0.01213871, + "auxiliary_loss_mlp": 0.00765222, + "balance_loss_clip": 1.06179106, + "balance_loss_mlp": 1.00037217, + "epoch": 0.10473155774664823, + "flos": 18405476104320.0, + "grad_norm": 5.85289063651792, + "language_loss": 0.80518639, + "learning_rate": 3.941783212084008e-06, + "loss": 0.8249774, + "num_input_tokens_seen": 18513530, + "step": 871, + "time_per_iteration": 2.577425718307495 + }, + { + "auxiliary_loss_clip": 0.01204464, + "auxiliary_loss_mlp": 0.01043174, + "balance_loss_clip": 1.06471479, + "balance_loss_mlp": 1.03119898, + "epoch": 0.10485180063728732, + "flos": 25592637358080.0, + "grad_norm": 3.0892868135710505, + "language_loss": 0.78967702, + "learning_rate": 3.941596486070916e-06, + "loss": 0.81215346, + "num_input_tokens_seen": 18531575, + "step": 872, + "time_per_iteration": 3.3816959857940674 + }, + { + "auxiliary_loss_clip": 0.01183373, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.0628984, + "balance_loss_mlp": 1.02288437, + "epoch": 0.10497204352792641, + "flos": 27088747666560.0, + "grad_norm": 3.0549925315445288, + "language_loss": 0.58724862, + "learning_rate": 3.941409465519182e-06, + "loss": 0.60943824, + "num_input_tokens_seen": 18552100, + "step": 873, + "time_per_iteration": 2.6261308193206787 + }, + { + "auxiliary_loss_clip": 0.01223713, + "auxiliary_loss_mlp": 0.01042278, + "balance_loss_clip": 1.06333494, + "balance_loss_mlp": 1.02946329, + "epoch": 0.10509228641856551, + "flos": 32858479353600.0, + "grad_norm": 1.6838834251217119, + "language_loss": 0.85139322, + "learning_rate": 3.941222150457176e-06, + "loss": 0.87405318, + "num_input_tokens_seen": 18575355, + "step": 874, + "time_per_iteration": 3.4207234382629395 + }, + { + "auxiliary_loss_clip": 0.01236689, + "auxiliary_loss_mlp": 0.01040105, + "balance_loss_clip": 1.06603503, + "balance_loss_mlp": 1.02812433, + "epoch": 0.10521252930920459, + "flos": 14319165173760.0, + "grad_norm": 2.7578957276863862, + "language_loss": 0.71586943, + "learning_rate": 3.941034540913311e-06, + "loss": 0.73863739, + "num_input_tokens_seen": 18592885, + "step": 875, + "time_per_iteration": 2.4754934310913086 + }, + { + "auxiliary_loss_clip": 0.01233356, + "auxiliary_loss_mlp": 0.00766055, + "balance_loss_clip": 1.06889915, + "balance_loss_mlp": 1.00033796, + "epoch": 0.10533277219984369, + "flos": 21687028773120.0, + "grad_norm": 1.664427919627717, + "language_loss": 0.82600868, + "learning_rate": 3.940846636916051e-06, + "loss": 0.84600282, + "num_input_tokens_seen": 18612920, + "step": 876, + "time_per_iteration": 2.512444257736206 + }, + { + "auxiliary_loss_clip": 0.0121672, + "auxiliary_loss_mlp": 0.01045103, + "balance_loss_clip": 1.06951714, + "balance_loss_mlp": 1.0323596, + "epoch": 0.10545301509048277, + "flos": 22269787027200.0, + "grad_norm": 2.124667979366744, + "language_loss": 0.86512959, + "learning_rate": 3.940658438493899e-06, + "loss": 0.88774788, + "num_input_tokens_seen": 18630765, + "step": 877, + "time_per_iteration": 4.076648235321045 + }, + { + "auxiliary_loss_clip": 0.01250256, + "auxiliary_loss_mlp": 0.01043141, + "balance_loss_clip": 1.06417704, + "balance_loss_mlp": 1.03048134, + "epoch": 0.10557325798112187, + "flos": 22199725549440.0, + "grad_norm": 3.6643130670275483, + "language_loss": 0.75785804, + "learning_rate": 3.940469945675405e-06, + "loss": 0.780792, + "num_input_tokens_seen": 18649150, + "step": 878, + "time_per_iteration": 2.4679622650146484 + }, + { + "auxiliary_loss_clip": 0.01164954, + "auxiliary_loss_mlp": 0.0104512, + "balance_loss_clip": 1.05765224, + "balance_loss_mlp": 1.03384233, + "epoch": 0.10569350087176095, + "flos": 25775889569280.0, + "grad_norm": 1.9709125447804825, + "language_loss": 0.91103446, + "learning_rate": 3.940281158489163e-06, + "loss": 0.93313527, + "num_input_tokens_seen": 18668380, + "step": 879, + "time_per_iteration": 2.618852376937866 + }, + { + "auxiliary_loss_clip": 0.01164953, + "auxiliary_loss_mlp": 0.01042103, + "balance_loss_clip": 1.0535295, + "balance_loss_mlp": 1.03070021, + "epoch": 0.10581374376240005, + "flos": 17311385790720.0, + "grad_norm": 1.7408157265515813, + "language_loss": 0.82873833, + "learning_rate": 3.940092076963812e-06, + "loss": 0.85080886, + "num_input_tokens_seen": 18685875, + "step": 880, + "time_per_iteration": 2.6281678676605225 + }, + { + "auxiliary_loss_clip": 0.01212928, + "auxiliary_loss_mlp": 0.01041354, + "balance_loss_clip": 1.06117392, + "balance_loss_mlp": 1.02931392, + "epoch": 0.10593398665303914, + "flos": 34349454017280.0, + "grad_norm": 2.03989750965166, + "language_loss": 0.78777409, + "learning_rate": 3.9399027011280355e-06, + "loss": 0.81031692, + "num_input_tokens_seen": 18707970, + "step": 881, + "time_per_iteration": 2.6560513973236084 + }, + { + "auxiliary_loss_clip": 0.01216385, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_clip": 1.06715453, + "balance_loss_mlp": 1.02952671, + "epoch": 0.10605422954367823, + "flos": 23257977068160.0, + "grad_norm": 1.9607519280692702, + "language_loss": 0.77187771, + "learning_rate": 3.939713031010561e-06, + "loss": 0.79446018, + "num_input_tokens_seen": 18726335, + "step": 882, + "time_per_iteration": 2.548799514770508 + }, + { + "auxiliary_loss_clip": 0.01198885, + "auxiliary_loss_mlp": 0.01041095, + "balance_loss_clip": 1.06390989, + "balance_loss_mlp": 1.02830958, + "epoch": 0.10617447243431732, + "flos": 22820118278400.0, + "grad_norm": 2.103386125822078, + "language_loss": 0.77600533, + "learning_rate": 3.939523066640163e-06, + "loss": 0.79840511, + "num_input_tokens_seen": 18745230, + "step": 883, + "time_per_iteration": 2.562774181365967 + }, + { + "auxiliary_loss_clip": 0.01234591, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.06694865, + "balance_loss_mlp": 1.02823555, + "epoch": 0.10629471532495641, + "flos": 24386577373440.0, + "grad_norm": 1.91804380928004, + "language_loss": 0.8112098, + "learning_rate": 3.939332808045657e-06, + "loss": 0.83395553, + "num_input_tokens_seen": 18764880, + "step": 884, + "time_per_iteration": 2.534285545349121 + }, + { + "auxiliary_loss_clip": 0.01201987, + "auxiliary_loss_mlp": 0.01041318, + "balance_loss_clip": 1.06333113, + "balance_loss_mlp": 1.02992105, + "epoch": 0.1064149582155955, + "flos": 21105491581440.0, + "grad_norm": 1.6704123622725044, + "language_loss": 0.84529018, + "learning_rate": 3.939142255255906e-06, + "loss": 0.86772323, + "num_input_tokens_seen": 18785765, + "step": 885, + "time_per_iteration": 2.5806057453155518 + }, + { + "auxiliary_loss_clip": 0.01233795, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.06829143, + "balance_loss_mlp": 1.02055132, + "epoch": 0.1065352011062346, + "flos": 20702035042560.0, + "grad_norm": 1.9279211263472542, + "language_loss": 0.87092412, + "learning_rate": 3.938951408299817e-06, + "loss": 0.89359164, + "num_input_tokens_seen": 18804605, + "step": 886, + "time_per_iteration": 2.4967873096466064 + }, + { + "auxiliary_loss_clip": 0.01087883, + "auxiliary_loss_mlp": 0.01013059, + "balance_loss_clip": 1.03823638, + "balance_loss_mlp": 1.0105561, + "epoch": 0.10665544399687368, + "flos": 62659632689280.0, + "grad_norm": 0.8057221194130781, + "language_loss": 0.54502559, + "learning_rate": 3.938760267206342e-06, + "loss": 0.56603503, + "num_input_tokens_seen": 18866425, + "step": 887, + "time_per_iteration": 3.0898914337158203 + }, + { + "auxiliary_loss_clip": 0.0124959, + "auxiliary_loss_mlp": 0.01038674, + "balance_loss_clip": 1.06987429, + "balance_loss_mlp": 1.02625203, + "epoch": 0.10677568688751278, + "flos": 26140382830080.0, + "grad_norm": 2.1173228787834435, + "language_loss": 0.78814077, + "learning_rate": 3.938568832004475e-06, + "loss": 0.81102347, + "num_input_tokens_seen": 18885130, + "step": 888, + "time_per_iteration": 2.536553144454956 + }, + { + "auxiliary_loss_clip": 0.0120631, + "auxiliary_loss_mlp": 0.01051965, + "balance_loss_clip": 1.06083298, + "balance_loss_mlp": 1.03933489, + "epoch": 0.10689592977815186, + "flos": 12786533712000.0, + "grad_norm": 1.9804316650453253, + "language_loss": 0.75158656, + "learning_rate": 3.938377102723257e-06, + "loss": 0.77416927, + "num_input_tokens_seen": 18902265, + "step": 889, + "time_per_iteration": 2.5814859867095947 + }, + { + "auxiliary_loss_clip": 0.01169563, + "auxiliary_loss_mlp": 0.01048546, + "balance_loss_clip": 1.05663574, + "balance_loss_mlp": 1.03551078, + "epoch": 0.10701617266879096, + "flos": 22126683242880.0, + "grad_norm": 2.1676637267406202, + "language_loss": 0.83611071, + "learning_rate": 3.938185079391774e-06, + "loss": 0.8582918, + "num_input_tokens_seen": 18919310, + "step": 890, + "time_per_iteration": 2.605555772781372 + }, + { + "auxiliary_loss_clip": 0.01247917, + "auxiliary_loss_mlp": 0.01033289, + "balance_loss_clip": 1.06719303, + "balance_loss_mlp": 1.02143359, + "epoch": 0.10713641555943004, + "flos": 19745625559680.0, + "grad_norm": 2.5694703350942683, + "language_loss": 1.05580473, + "learning_rate": 3.937992762039157e-06, + "loss": 1.07861686, + "num_input_tokens_seen": 18932635, + "step": 891, + "time_per_iteration": 2.4907374382019043 + }, + { + "auxiliary_loss_clip": 0.01230748, + "auxiliary_loss_mlp": 0.01047326, + "balance_loss_clip": 1.06736135, + "balance_loss_mlp": 1.03590584, + "epoch": 0.10725665845006914, + "flos": 23952992302080.0, + "grad_norm": 1.7821434683741881, + "language_loss": 0.80522215, + "learning_rate": 3.937800150694577e-06, + "loss": 0.82800293, + "num_input_tokens_seen": 18953810, + "step": 892, + "time_per_iteration": 2.5360472202301025 + }, + { + "auxiliary_loss_clip": 0.01186188, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.0629952, + "balance_loss_mlp": 1.02787721, + "epoch": 0.10737690134070824, + "flos": 18551704371840.0, + "grad_norm": 2.3476363993845433, + "language_loss": 0.75957024, + "learning_rate": 3.937607245387255e-06, + "loss": 0.78183603, + "num_input_tokens_seen": 18973175, + "step": 893, + "time_per_iteration": 2.585219621658325 + }, + { + "auxiliary_loss_clip": 0.01223563, + "auxiliary_loss_mlp": 0.01044536, + "balance_loss_clip": 1.06425643, + "balance_loss_mlp": 1.03314543, + "epoch": 0.10749714423134732, + "flos": 22707609903360.0, + "grad_norm": 2.0400904565630005, + "language_loss": 0.72404915, + "learning_rate": 3.937414046146455e-06, + "loss": 0.74673009, + "num_input_tokens_seen": 18991130, + "step": 894, + "time_per_iteration": 2.586886167526245 + }, + { + "auxiliary_loss_clip": 0.01249153, + "auxiliary_loss_mlp": 0.01047539, + "balance_loss_clip": 1.06900907, + "balance_loss_mlp": 1.03520644, + "epoch": 0.10761738712198642, + "flos": 21106066199040.0, + "grad_norm": 2.0796278841984637, + "language_loss": 0.75211823, + "learning_rate": 3.9372205530014845e-06, + "loss": 0.77508521, + "num_input_tokens_seen": 19009610, + "step": 895, + "time_per_iteration": 2.472721815109253 + }, + { + "auxiliary_loss_clip": 0.01247116, + "auxiliary_loss_mlp": 0.0104892, + "balance_loss_clip": 1.0674696, + "balance_loss_mlp": 1.03722572, + "epoch": 0.1077376300126255, + "flos": 23766723348480.0, + "grad_norm": 3.792365611129924, + "language_loss": 0.71530855, + "learning_rate": 3.937026765981696e-06, + "loss": 0.73826885, + "num_input_tokens_seen": 19029680, + "step": 896, + "time_per_iteration": 2.489267349243164 + }, + { + "auxiliary_loss_clip": 0.01206984, + "auxiliary_loss_mlp": 0.0104662, + "balance_loss_clip": 1.06612313, + "balance_loss_mlp": 1.03450203, + "epoch": 0.1078578729032646, + "flos": 20919581763840.0, + "grad_norm": 2.2411604182068547, + "language_loss": 0.79678714, + "learning_rate": 3.936832685116488e-06, + "loss": 0.81932318, + "num_input_tokens_seen": 19047775, + "step": 897, + "time_per_iteration": 2.5541982650756836 + }, + { + "auxiliary_loss_clip": 0.01245707, + "auxiliary_loss_mlp": 0.01041989, + "balance_loss_clip": 1.06655753, + "balance_loss_mlp": 1.03018165, + "epoch": 0.10797811579390369, + "flos": 14829886702080.0, + "grad_norm": 2.164837198920596, + "language_loss": 0.89992344, + "learning_rate": 3.936638310435301e-06, + "loss": 0.92280042, + "num_input_tokens_seen": 19065640, + "step": 898, + "time_per_iteration": 3.225412607192993 + }, + { + "auxiliary_loss_clip": 0.01235366, + "auxiliary_loss_mlp": 0.01039059, + "balance_loss_clip": 1.06786108, + "balance_loss_mlp": 1.02707863, + "epoch": 0.10809835868454278, + "flos": 19536985411200.0, + "grad_norm": 2.8037550235716338, + "language_loss": 0.81516868, + "learning_rate": 3.936443641967623e-06, + "loss": 0.83791292, + "num_input_tokens_seen": 19084470, + "step": 899, + "time_per_iteration": 2.4846417903900146 + }, + { + "auxiliary_loss_clip": 0.01216082, + "auxiliary_loss_mlp": 0.01045266, + "balance_loss_clip": 1.06576872, + "balance_loss_mlp": 1.03296947, + "epoch": 0.10821860157518187, + "flos": 18442320480000.0, + "grad_norm": 2.0768204511270976, + "language_loss": 0.83333123, + "learning_rate": 3.936248679742983e-06, + "loss": 0.85594463, + "num_input_tokens_seen": 19102965, + "step": 900, + "time_per_iteration": 2.516061782836914 + }, + { + "auxiliary_loss_clip": 0.01085894, + "auxiliary_loss_mlp": 0.01020911, + "balance_loss_clip": 1.02217793, + "balance_loss_mlp": 1.01847935, + "epoch": 0.10833884446582095, + "flos": 49359468447360.0, + "grad_norm": 1.0643618466576865, + "language_loss": 0.70158845, + "learning_rate": 3.936053423790959e-06, + "loss": 0.72265649, + "num_input_tokens_seen": 19151285, + "step": 901, + "time_per_iteration": 3.643098831176758 + }, + { + "auxiliary_loss_clip": 0.0124737, + "auxiliary_loss_mlp": 0.010502, + "balance_loss_clip": 1.07039237, + "balance_loss_mlp": 1.03910804, + "epoch": 0.10845908735646005, + "flos": 20411912891520.0, + "grad_norm": 1.9874395568182022, + "language_loss": 0.77368343, + "learning_rate": 3.935857874141168e-06, + "loss": 0.79665917, + "num_input_tokens_seen": 19170120, + "step": 902, + "time_per_iteration": 2.468705892562866 + }, + { + "auxiliary_loss_clip": 0.01207854, + "auxiliary_loss_mlp": 0.01036413, + "balance_loss_clip": 1.06242228, + "balance_loss_mlp": 1.02414083, + "epoch": 0.10857933024709913, + "flos": 14027750133120.0, + "grad_norm": 2.146152616199026, + "language_loss": 0.83684981, + "learning_rate": 3.935662030823279e-06, + "loss": 0.85929245, + "num_input_tokens_seen": 19186305, + "step": 903, + "time_per_iteration": 3.2677700519561768 + }, + { + "auxiliary_loss_clip": 0.01230151, + "auxiliary_loss_mlp": 0.01042103, + "balance_loss_clip": 1.06432056, + "balance_loss_mlp": 1.03071237, + "epoch": 0.10869957313773823, + "flos": 13369004657280.0, + "grad_norm": 2.1143131436131903, + "language_loss": 0.72037292, + "learning_rate": 3.935465893866998e-06, + "loss": 0.7430954, + "num_input_tokens_seen": 19204530, + "step": 904, + "time_per_iteration": 2.4628746509552 + }, + { + "auxiliary_loss_clip": 0.01216119, + "auxiliary_loss_mlp": 0.0104055, + "balance_loss_clip": 1.0652616, + "balance_loss_mlp": 1.02869511, + "epoch": 0.10881981602837733, + "flos": 25807095509760.0, + "grad_norm": 3.3044629448774954, + "language_loss": 0.80422401, + "learning_rate": 3.935269463302079e-06, + "loss": 0.82679075, + "num_input_tokens_seen": 19222735, + "step": 905, + "time_per_iteration": 2.5592074394226074 + }, + { + "auxiliary_loss_clip": 0.01234474, + "auxiliary_loss_mlp": 0.01044695, + "balance_loss_clip": 1.06723213, + "balance_loss_mlp": 1.032125, + "epoch": 0.10894005891901641, + "flos": 20777555387520.0, + "grad_norm": 2.2524563774317223, + "language_loss": 0.76642632, + "learning_rate": 3.935072739158322e-06, + "loss": 0.78921801, + "num_input_tokens_seen": 19242445, + "step": 906, + "time_per_iteration": 2.519449234008789 + }, + { + "auxiliary_loss_clip": 0.01217591, + "auxiliary_loss_mlp": 0.0104423, + "balance_loss_clip": 1.06459105, + "balance_loss_mlp": 1.03213644, + "epoch": 0.10906030180965551, + "flos": 26649883296000.0, + "grad_norm": 1.6289898353694259, + "language_loss": 0.7969411, + "learning_rate": 3.934875721465569e-06, + "loss": 0.81955928, + "num_input_tokens_seen": 19262865, + "step": 907, + "time_per_iteration": 2.596266508102417 + }, + { + "auxiliary_loss_clip": 0.01208177, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.05786133, + "balance_loss_mlp": 1.02496099, + "epoch": 0.10918054470029459, + "flos": 36534402420480.0, + "grad_norm": 2.9471604252667523, + "language_loss": 0.71699566, + "learning_rate": 3.9346784102537076e-06, + "loss": 0.73945045, + "num_input_tokens_seen": 19285000, + "step": 908, + "time_per_iteration": 2.6623263359069824 + }, + { + "auxiliary_loss_clip": 0.01243004, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.06513608, + "balance_loss_mlp": 1.02177811, + "epoch": 0.10930078759093369, + "flos": 21762549118080.0, + "grad_norm": 1.7123411765453698, + "language_loss": 0.78304183, + "learning_rate": 3.934480805552669e-06, + "loss": 0.80580384, + "num_input_tokens_seen": 19306010, + "step": 909, + "time_per_iteration": 2.504971742630005 + }, + { + "auxiliary_loss_clip": 0.0124406, + "auxiliary_loss_mlp": 0.00765778, + "balance_loss_clip": 1.06673217, + "balance_loss_mlp": 1.00027621, + "epoch": 0.10942103048157277, + "flos": 22601781457920.0, + "grad_norm": 2.1429591622483435, + "language_loss": 0.88152695, + "learning_rate": 3.93428290739243e-06, + "loss": 0.90162528, + "num_input_tokens_seen": 19325380, + "step": 910, + "time_per_iteration": 2.4893531799316406 + }, + { + "auxiliary_loss_clip": 0.0121404, + "auxiliary_loss_mlp": 0.01042978, + "balance_loss_clip": 1.06318104, + "balance_loss_mlp": 1.03092027, + "epoch": 0.10954127337221187, + "flos": 15045781397760.0, + "grad_norm": 3.2181363533125906, + "language_loss": 0.8001653, + "learning_rate": 3.9340847158030125e-06, + "loss": 0.82273543, + "num_input_tokens_seen": 19338960, + "step": 911, + "time_per_iteration": 2.5086896419525146 + }, + { + "auxiliary_loss_clip": 0.01230044, + "auxiliary_loss_mlp": 0.0104656, + "balance_loss_clip": 1.06267548, + "balance_loss_mlp": 1.03490114, + "epoch": 0.10966151626285096, + "flos": 21650974496640.0, + "grad_norm": 1.7818302776574981, + "language_loss": 0.75411838, + "learning_rate": 3.9338862308144814e-06, + "loss": 0.77688444, + "num_input_tokens_seen": 19357780, + "step": 912, + "time_per_iteration": 2.499723196029663 + }, + { + "auxiliary_loss_clip": 0.01245031, + "auxiliary_loss_mlp": 0.01038379, + "balance_loss_clip": 1.06613493, + "balance_loss_mlp": 1.02664852, + "epoch": 0.10978175915349005, + "flos": 20121359777280.0, + "grad_norm": 1.6749621259987935, + "language_loss": 0.84432638, + "learning_rate": 3.933687452456946e-06, + "loss": 0.8671605, + "num_input_tokens_seen": 19377680, + "step": 913, + "time_per_iteration": 2.473865509033203 + }, + { + "auxiliary_loss_clip": 0.01196879, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.05820751, + "balance_loss_mlp": 1.02370358, + "epoch": 0.10990200204412914, + "flos": 20412667077120.0, + "grad_norm": 1.9886008212170507, + "language_loss": 0.86188114, + "learning_rate": 3.933488380760562e-06, + "loss": 0.88421178, + "num_input_tokens_seen": 19397040, + "step": 914, + "time_per_iteration": 2.557819128036499 + }, + { + "auxiliary_loss_clip": 0.01243405, + "auxiliary_loss_mlp": 0.00766478, + "balance_loss_clip": 1.06544256, + "balance_loss_mlp": 1.00020504, + "epoch": 0.11002224493476823, + "flos": 17530117660800.0, + "grad_norm": 2.1017662449174703, + "language_loss": 0.87255228, + "learning_rate": 3.9332890157555286e-06, + "loss": 0.89265108, + "num_input_tokens_seen": 19413975, + "step": 915, + "time_per_iteration": 2.467641592025757 + }, + { + "auxiliary_loss_clip": 0.01219824, + "auxiliary_loss_mlp": 0.01042145, + "balance_loss_clip": 1.06578803, + "balance_loss_mlp": 1.03055167, + "epoch": 0.11014248782540732, + "flos": 12203093099520.0, + "grad_norm": 1.7840609685846178, + "language_loss": 0.76378465, + "learning_rate": 3.933089357472088e-06, + "loss": 0.78640431, + "num_input_tokens_seen": 19432005, + "step": 916, + "time_per_iteration": 2.4897453784942627 + }, + { + "auxiliary_loss_clip": 0.01243777, + "auxiliary_loss_mlp": 0.01038499, + "balance_loss_clip": 1.06770289, + "balance_loss_mlp": 1.0267626, + "epoch": 0.11026273071604642, + "flos": 22382977760640.0, + "grad_norm": 1.7940416439524907, + "language_loss": 0.85796416, + "learning_rate": 3.932889405940529e-06, + "loss": 0.8807869, + "num_input_tokens_seen": 19450100, + "step": 917, + "time_per_iteration": 2.475731372833252 + }, + { + "auxiliary_loss_clip": 0.01217192, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.06872642, + "balance_loss_mlp": 1.02659655, + "epoch": 0.1103829736066855, + "flos": 19829046896640.0, + "grad_norm": 2.6254153147281154, + "language_loss": 0.79701447, + "learning_rate": 3.932689161191184e-06, + "loss": 0.81956553, + "num_input_tokens_seen": 19467805, + "step": 918, + "time_per_iteration": 2.5285415649414062 + }, + { + "auxiliary_loss_clip": 0.0122857, + "auxiliary_loss_mlp": 0.01041637, + "balance_loss_clip": 1.06400394, + "balance_loss_mlp": 1.02950728, + "epoch": 0.1105032164973246, + "flos": 22669616292480.0, + "grad_norm": 2.1218864391566292, + "language_loss": 0.88229215, + "learning_rate": 3.93248862325443e-06, + "loss": 0.90499419, + "num_input_tokens_seen": 19486710, + "step": 919, + "time_per_iteration": 2.5163044929504395 + }, + { + "auxiliary_loss_clip": 0.01115657, + "auxiliary_loss_mlp": 0.01003221, + "balance_loss_clip": 1.02110374, + "balance_loss_mlp": 1.00028825, + "epoch": 0.11062345938796368, + "flos": 66483507876480.0, + "grad_norm": 0.9387634501909016, + "language_loss": 0.6444062, + "learning_rate": 3.932287792160688e-06, + "loss": 0.665595, + "num_input_tokens_seen": 19545170, + "step": 920, + "time_per_iteration": 2.9633877277374268 + }, + { + "auxiliary_loss_clip": 0.01232964, + "auxiliary_loss_mlp": 0.01039868, + "balance_loss_clip": 1.06414032, + "balance_loss_mlp": 1.02693403, + "epoch": 0.11074370227860278, + "flos": 21907771804800.0, + "grad_norm": 3.0161811912399243, + "language_loss": 0.80348611, + "learning_rate": 3.932086667940424e-06, + "loss": 0.82621443, + "num_input_tokens_seen": 19561875, + "step": 921, + "time_per_iteration": 2.5001609325408936 + }, + { + "auxiliary_loss_clip": 0.01229302, + "auxiliary_loss_mlp": 0.00765889, + "balance_loss_clip": 1.06600738, + "balance_loss_mlp": 1.00022912, + "epoch": 0.11086394516924186, + "flos": 28658115763200.0, + "grad_norm": 1.7588931899202436, + "language_loss": 0.81650496, + "learning_rate": 3.93188525062415e-06, + "loss": 0.83645689, + "num_input_tokens_seen": 19582340, + "step": 922, + "time_per_iteration": 2.558328866958618 + }, + { + "auxiliary_loss_clip": 0.01229974, + "auxiliary_loss_mlp": 0.01047982, + "balance_loss_clip": 1.06462252, + "balance_loss_mlp": 1.03556597, + "epoch": 0.11098418805988096, + "flos": 24535247765760.0, + "grad_norm": 1.7270003118269799, + "language_loss": 0.86212689, + "learning_rate": 3.931683540242418e-06, + "loss": 0.88490641, + "num_input_tokens_seen": 19603405, + "step": 923, + "time_per_iteration": 2.545809745788574 + }, + { + "auxiliary_loss_clip": 0.01223678, + "auxiliary_loss_mlp": 0.01040667, + "balance_loss_clip": 1.06296957, + "balance_loss_mlp": 1.0281744, + "epoch": 0.11110443095052006, + "flos": 22960384888320.0, + "grad_norm": 4.295504248336972, + "language_loss": 0.90994871, + "learning_rate": 3.9314815368258295e-06, + "loss": 0.93259221, + "num_input_tokens_seen": 19619885, + "step": 924, + "time_per_iteration": 2.5811824798583984 + }, + { + "auxiliary_loss_clip": 0.0123546, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.07114375, + "balance_loss_mlp": 1.02820826, + "epoch": 0.11122467384115914, + "flos": 18950025265920.0, + "grad_norm": 1.8654623820140814, + "language_loss": 0.78543496, + "learning_rate": 3.9312792404050275e-06, + "loss": 0.80818641, + "num_input_tokens_seen": 19637940, + "step": 925, + "time_per_iteration": 3.2651054859161377 + }, + { + "auxiliary_loss_clip": 0.01244193, + "auxiliary_loss_mlp": 0.01041657, + "balance_loss_clip": 1.06838202, + "balance_loss_mlp": 1.03077948, + "epoch": 0.11134491673179824, + "flos": 25082957324160.0, + "grad_norm": 2.0462597798758946, + "language_loss": 0.77219141, + "learning_rate": 3.9310766510107e-06, + "loss": 0.79504991, + "num_input_tokens_seen": 19657115, + "step": 926, + "time_per_iteration": 2.5171711444854736 + }, + { + "auxiliary_loss_clip": 0.01200529, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_clip": 1.06006134, + "balance_loss_mlp": 1.02926993, + "epoch": 0.11146515962243732, + "flos": 24499121662080.0, + "grad_norm": 2.0402992106556668, + "language_loss": 0.92458391, + "learning_rate": 3.9308737686735806e-06, + "loss": 0.94700634, + "num_input_tokens_seen": 19677075, + "step": 927, + "time_per_iteration": 2.5917551517486572 + }, + { + "auxiliary_loss_clip": 0.01250166, + "auxiliary_loss_mlp": 0.01044989, + "balance_loss_clip": 1.06996226, + "balance_loss_mlp": 1.03354502, + "epoch": 0.11158540251307641, + "flos": 22343763087360.0, + "grad_norm": 1.8843157440060656, + "language_loss": 0.82777524, + "learning_rate": 3.9306705934244455e-06, + "loss": 0.85072678, + "num_input_tokens_seen": 19697155, + "step": 928, + "time_per_iteration": 3.2677390575408936 + }, + { + "auxiliary_loss_clip": 0.01206222, + "auxiliary_loss_mlp": 0.01034895, + "balance_loss_clip": 1.06285763, + "balance_loss_mlp": 1.02318883, + "epoch": 0.11170564540371551, + "flos": 19902304684800.0, + "grad_norm": 1.883865013919055, + "language_loss": 0.88477194, + "learning_rate": 3.930467125294116e-06, + "loss": 0.90718305, + "num_input_tokens_seen": 19716705, + "step": 929, + "time_per_iteration": 2.5528645515441895 + }, + { + "auxiliary_loss_clip": 0.01058249, + "auxiliary_loss_mlp": 0.0100915, + "balance_loss_clip": 1.01575744, + "balance_loss_mlp": 1.00676537, + "epoch": 0.1118258882943546, + "flos": 64586239499520.0, + "grad_norm": 0.9287129110191039, + "language_loss": 0.60387862, + "learning_rate": 3.930263364313458e-06, + "loss": 0.62455261, + "num_input_tokens_seen": 19767275, + "step": 930, + "time_per_iteration": 4.521553993225098 + }, + { + "auxiliary_loss_clip": 0.01198488, + "auxiliary_loss_mlp": 0.01049199, + "balance_loss_clip": 1.06149781, + "balance_loss_mlp": 1.03664649, + "epoch": 0.11194613118499369, + "flos": 17201965985280.0, + "grad_norm": 5.854126799027086, + "language_loss": 0.83303976, + "learning_rate": 3.930059310513384e-06, + "loss": 0.85551673, + "num_input_tokens_seen": 19786315, + "step": 931, + "time_per_iteration": 2.570361614227295 + }, + { + "auxiliary_loss_clip": 0.01184929, + "auxiliary_loss_mlp": 0.00765858, + "balance_loss_clip": 1.0603354, + "balance_loss_mlp": 1.00014174, + "epoch": 0.11206637407563277, + "flos": 31863465728640.0, + "grad_norm": 1.802222106697116, + "language_loss": 0.84064674, + "learning_rate": 3.929854963924846e-06, + "loss": 0.86015463, + "num_input_tokens_seen": 19806580, + "step": 932, + "time_per_iteration": 2.6577320098876953 + }, + { + "auxiliary_loss_clip": 0.01201459, + "auxiliary_loss_mlp": 0.01038919, + "balance_loss_clip": 1.06126976, + "balance_loss_mlp": 1.02760613, + "epoch": 0.11218661696627187, + "flos": 21945621761280.0, + "grad_norm": 1.810455870227726, + "language_loss": 0.77391833, + "learning_rate": 3.929650324578845e-06, + "loss": 0.79632211, + "num_input_tokens_seen": 19826045, + "step": 933, + "time_per_iteration": 2.5815162658691406 + }, + { + "auxiliary_loss_clip": 0.01218867, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.06382465, + "balance_loss_mlp": 1.02740192, + "epoch": 0.11230685985691095, + "flos": 25878198481920.0, + "grad_norm": 2.130087770134214, + "language_loss": 0.81892979, + "learning_rate": 3.929445392506423e-06, + "loss": 0.84152085, + "num_input_tokens_seen": 19843985, + "step": 934, + "time_per_iteration": 2.560582399368286 + }, + { + "auxiliary_loss_clip": 0.01231121, + "auxiliary_loss_mlp": 0.01045239, + "balance_loss_clip": 1.06972349, + "balance_loss_mlp": 1.03414059, + "epoch": 0.11242710274755005, + "flos": 22231506107520.0, + "grad_norm": 2.132380376626068, + "language_loss": 0.75725627, + "learning_rate": 3.92924016773867e-06, + "loss": 0.78001988, + "num_input_tokens_seen": 19860480, + "step": 935, + "time_per_iteration": 2.554236650466919 + }, + { + "auxiliary_loss_clip": 0.0121576, + "auxiliary_loss_mlp": 0.00765168, + "balance_loss_clip": 1.06192935, + "balance_loss_mlp": 1.00009644, + "epoch": 0.11254734563818915, + "flos": 17712184723200.0, + "grad_norm": 2.6880738016514405, + "language_loss": 0.73802018, + "learning_rate": 3.9290346503067175e-06, + "loss": 0.75782943, + "num_input_tokens_seen": 19877145, + "step": 936, + "time_per_iteration": 2.567462205886841 + }, + { + "auxiliary_loss_clip": 0.01233346, + "auxiliary_loss_mlp": 0.01042245, + "balance_loss_clip": 1.06460643, + "balance_loss_mlp": 1.03067541, + "epoch": 0.11266758852882823, + "flos": 54930397334400.0, + "grad_norm": 1.65833147812522, + "language_loss": 0.78848195, + "learning_rate": 3.9288288402417415e-06, + "loss": 0.81123781, + "num_input_tokens_seen": 19903405, + "step": 937, + "time_per_iteration": 2.8298888206481934 + }, + { + "auxiliary_loss_clip": 0.01234464, + "auxiliary_loss_mlp": 0.01040605, + "balance_loss_clip": 1.06707668, + "balance_loss_mlp": 1.02829027, + "epoch": 0.11278783141946733, + "flos": 18878132194560.0, + "grad_norm": 2.108763382020276, + "language_loss": 0.70737278, + "learning_rate": 3.928622737574964e-06, + "loss": 0.73012346, + "num_input_tokens_seen": 19918740, + "step": 938, + "time_per_iteration": 2.4833967685699463 + }, + { + "auxiliary_loss_clip": 0.01214113, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.06241214, + "balance_loss_mlp": 1.03087914, + "epoch": 0.11290807431010641, + "flos": 26469252777600.0, + "grad_norm": 1.904839054818962, + "language_loss": 0.91234368, + "learning_rate": 3.928416342337652e-06, + "loss": 0.9349103, + "num_input_tokens_seen": 19938475, + "step": 939, + "time_per_iteration": 2.573127269744873 + }, + { + "auxiliary_loss_clip": 0.01217578, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_clip": 1.06495047, + "balance_loss_mlp": 1.03069472, + "epoch": 0.1130283172007455, + "flos": 22710590732160.0, + "grad_norm": 1.8176852889059, + "language_loss": 0.82812303, + "learning_rate": 3.928209654561113e-06, + "loss": 0.85072505, + "num_input_tokens_seen": 19959310, + "step": 940, + "time_per_iteration": 2.551373243331909 + }, + { + "auxiliary_loss_clip": 0.01205732, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.0625577, + "balance_loss_mlp": 1.02967501, + "epoch": 0.1131485600913846, + "flos": 23219911630080.0, + "grad_norm": 1.9709948165066131, + "language_loss": 0.81797826, + "learning_rate": 3.928002674276703e-06, + "loss": 0.84044617, + "num_input_tokens_seen": 19978700, + "step": 941, + "time_per_iteration": 2.5464863777160645 + }, + { + "auxiliary_loss_clip": 0.01164463, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.05337489, + "balance_loss_mlp": 1.02492249, + "epoch": 0.11326880298202369, + "flos": 14064271286400.0, + "grad_norm": 2.5082050638681217, + "language_loss": 0.74924618, + "learning_rate": 3.92779540151582e-06, + "loss": 0.77125764, + "num_input_tokens_seen": 19995785, + "step": 942, + "time_per_iteration": 2.5501716136932373 + }, + { + "auxiliary_loss_clip": 0.01214479, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.06404471, + "balance_loss_mlp": 1.02099347, + "epoch": 0.11338904587266278, + "flos": 16325386479360.0, + "grad_norm": 1.891809781181744, + "language_loss": 0.85634178, + "learning_rate": 3.927587836309907e-06, + "loss": 0.87880552, + "num_input_tokens_seen": 20013615, + "step": 943, + "time_per_iteration": 2.5003368854522705 + }, + { + "auxiliary_loss_clip": 0.01210523, + "auxiliary_loss_mlp": 0.01042014, + "balance_loss_clip": 1.0616678, + "balance_loss_mlp": 1.03063595, + "epoch": 0.11350928876330187, + "flos": 24426258923520.0, + "grad_norm": 1.8482480029603485, + "language_loss": 0.7851035, + "learning_rate": 3.927379978690452e-06, + "loss": 0.80762887, + "num_input_tokens_seen": 20032880, + "step": 944, + "time_per_iteration": 2.55751633644104 + }, + { + "auxiliary_loss_clip": 0.01185597, + "auxiliary_loss_mlp": 0.01044856, + "balance_loss_clip": 1.05348706, + "balance_loss_mlp": 1.03328681, + "epoch": 0.11362953165394096, + "flos": 24497074586880.0, + "grad_norm": 2.015977168230092, + "language_loss": 0.87289852, + "learning_rate": 3.927171828688987e-06, + "loss": 0.89520299, + "num_input_tokens_seen": 20052405, + "step": 945, + "time_per_iteration": 2.596437931060791 + }, + { + "auxiliary_loss_clip": 0.01244786, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.06856227, + "balance_loss_mlp": 1.02255368, + "epoch": 0.11374977454458005, + "flos": 24060831909120.0, + "grad_norm": 2.3981966385538596, + "language_loss": 0.82049328, + "learning_rate": 3.926963386337088e-06, + "loss": 0.84328085, + "num_input_tokens_seen": 20070635, + "step": 946, + "time_per_iteration": 2.4901607036590576 + }, + { + "auxiliary_loss_clip": 0.01248114, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.06698918, + "balance_loss_mlp": 1.02483177, + "epoch": 0.11387001743521914, + "flos": 39457638967680.0, + "grad_norm": 2.030694373645685, + "language_loss": 0.70031726, + "learning_rate": 3.926754651666375e-06, + "loss": 0.72317708, + "num_input_tokens_seen": 20091195, + "step": 947, + "time_per_iteration": 2.6201679706573486 + }, + { + "auxiliary_loss_clip": 0.01201111, + "auxiliary_loss_mlp": 0.01038696, + "balance_loss_clip": 1.06433392, + "balance_loss_mlp": 1.02723408, + "epoch": 0.11399026032585824, + "flos": 25082454533760.0, + "grad_norm": 2.797461700281995, + "language_loss": 0.78691763, + "learning_rate": 3.926545624708513e-06, + "loss": 0.80931568, + "num_input_tokens_seen": 20110435, + "step": 948, + "time_per_iteration": 2.5864531993865967 + }, + { + "auxiliary_loss_clip": 0.01194985, + "auxiliary_loss_mlp": 0.01047227, + "balance_loss_clip": 1.059623, + "balance_loss_mlp": 1.03540134, + "epoch": 0.11411050321649732, + "flos": 17961835224960.0, + "grad_norm": 1.8305496674180064, + "language_loss": 0.85392094, + "learning_rate": 3.926336305495213e-06, + "loss": 0.87634301, + "num_input_tokens_seen": 20128995, + "step": 949, + "time_per_iteration": 2.577878952026367 + }, + { + "auxiliary_loss_clip": 0.01187801, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_clip": 1.06163001, + "balance_loss_mlp": 1.03140175, + "epoch": 0.11423074610713642, + "flos": 22455409536000.0, + "grad_norm": 2.095621441295055, + "language_loss": 0.88991046, + "learning_rate": 3.926126694058226e-06, + "loss": 0.91223562, + "num_input_tokens_seen": 20148145, + "step": 950, + "time_per_iteration": 2.6199798583984375 + }, + { + "auxiliary_loss_clip": 0.01180599, + "auxiliary_loss_mlp": 0.01041142, + "balance_loss_clip": 1.06272674, + "balance_loss_mlp": 1.03074694, + "epoch": 0.1143509889977755, + "flos": 19717687756800.0, + "grad_norm": 1.6319873064406516, + "language_loss": 0.81946409, + "learning_rate": 3.92591679042935e-06, + "loss": 0.84168154, + "num_input_tokens_seen": 20168035, + "step": 951, + "time_per_iteration": 2.6896579265594482 + }, + { + "auxiliary_loss_clip": 0.01229356, + "auxiliary_loss_mlp": 0.01043605, + "balance_loss_clip": 1.06644893, + "balance_loss_mlp": 1.03095663, + "epoch": 0.1144712318884146, + "flos": 19822869757440.0, + "grad_norm": 1.7863648595152417, + "language_loss": 0.82403058, + "learning_rate": 3.92570659464043e-06, + "loss": 0.84676015, + "num_input_tokens_seen": 20186095, + "step": 952, + "time_per_iteration": 3.26773738861084 + }, + { + "auxiliary_loss_clip": 0.01225062, + "auxiliary_loss_mlp": 0.00765487, + "balance_loss_clip": 1.06571782, + "balance_loss_mlp": 1.00017571, + "epoch": 0.1145914747790537, + "flos": 14939198766720.0, + "grad_norm": 2.0478724326778743, + "language_loss": 0.7972607, + "learning_rate": 3.925496106723349e-06, + "loss": 0.81716615, + "num_input_tokens_seen": 20203535, + "step": 953, + "time_per_iteration": 2.50225567817688 + }, + { + "auxiliary_loss_clip": 0.01231765, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.06584585, + "balance_loss_mlp": 1.02715683, + "epoch": 0.11471171766969278, + "flos": 19865029345920.0, + "grad_norm": 2.247015089336728, + "language_loss": 0.83633733, + "learning_rate": 3.9252853267100405e-06, + "loss": 0.85903406, + "num_input_tokens_seen": 20222780, + "step": 954, + "time_per_iteration": 2.496647596359253 + }, + { + "auxiliary_loss_clip": 0.0118747, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.05874109, + "balance_loss_mlp": 1.02825642, + "epoch": 0.11483196056033187, + "flos": 22526476594560.0, + "grad_norm": 2.2218604099368573, + "language_loss": 0.8388052, + "learning_rate": 3.9250742546324786e-06, + "loss": 0.86108124, + "num_input_tokens_seen": 20243015, + "step": 955, + "time_per_iteration": 3.3627142906188965 + }, + { + "auxiliary_loss_clip": 0.01211943, + "auxiliary_loss_mlp": 0.0104218, + "balance_loss_clip": 1.06178188, + "balance_loss_mlp": 1.031672, + "epoch": 0.11495220345097096, + "flos": 28220292887040.0, + "grad_norm": 1.8312879987267376, + "language_loss": 0.8692376, + "learning_rate": 3.924862890522683e-06, + "loss": 0.89177883, + "num_input_tokens_seen": 20263025, + "step": 956, + "time_per_iteration": 3.3888988494873047 + }, + { + "auxiliary_loss_clip": 0.01228012, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.06355608, + "balance_loss_mlp": 1.02334428, + "epoch": 0.11507244634161005, + "flos": 17492267704320.0, + "grad_norm": 3.810561164943214, + "language_loss": 0.85890234, + "learning_rate": 3.9246512344127174e-06, + "loss": 0.8815316, + "num_input_tokens_seen": 20280685, + "step": 957, + "time_per_iteration": 3.2152273654937744 + }, + { + "auxiliary_loss_clip": 0.01148117, + "auxiliary_loss_mlp": 0.01038739, + "balance_loss_clip": 1.05484056, + "balance_loss_mlp": 1.02752161, + "epoch": 0.11519268923224914, + "flos": 22564937082240.0, + "grad_norm": 2.1685380396668448, + "language_loss": 0.81496328, + "learning_rate": 3.9244392863346895e-06, + "loss": 0.83683181, + "num_input_tokens_seen": 20300090, + "step": 958, + "time_per_iteration": 2.630455255508423 + }, + { + "auxiliary_loss_clip": 0.01216367, + "auxiliary_loss_mlp": 0.01043482, + "balance_loss_clip": 1.0672071, + "balance_loss_mlp": 1.03085792, + "epoch": 0.11531293212288823, + "flos": 16982839065600.0, + "grad_norm": 1.7965715400389528, + "language_loss": 0.92302424, + "learning_rate": 3.9242270463207524e-06, + "loss": 0.94562268, + "num_input_tokens_seen": 20318480, + "step": 959, + "time_per_iteration": 2.5031332969665527 + }, + { + "auxiliary_loss_clip": 0.01167855, + "auxiliary_loss_mlp": 0.01038073, + "balance_loss_clip": 1.05703521, + "balance_loss_mlp": 1.02603865, + "epoch": 0.11543317501352733, + "flos": 12422004537600.0, + "grad_norm": 2.6948902217477717, + "language_loss": 0.84922314, + "learning_rate": 3.924014514403102e-06, + "loss": 0.87128234, + "num_input_tokens_seen": 20334635, + "step": 960, + "time_per_iteration": 2.56894588470459 + }, + { + "auxiliary_loss_clip": 0.01169355, + "auxiliary_loss_mlp": 0.01041781, + "balance_loss_clip": 1.05709553, + "balance_loss_mlp": 1.02946043, + "epoch": 0.11555341790416641, + "flos": 19821648695040.0, + "grad_norm": 2.26902637622173, + "language_loss": 0.91157931, + "learning_rate": 3.92380169061398e-06, + "loss": 0.93369061, + "num_input_tokens_seen": 20352415, + "step": 961, + "time_per_iteration": 2.590451240539551 + }, + { + "auxiliary_loss_clip": 0.01187298, + "auxiliary_loss_mlp": 0.00765554, + "balance_loss_clip": 1.05516541, + "balance_loss_mlp": 1.00021648, + "epoch": 0.11567366079480551, + "flos": 25738865625600.0, + "grad_norm": 1.8850468493994506, + "language_loss": 0.83561283, + "learning_rate": 3.9235885749856705e-06, + "loss": 0.8551414, + "num_input_tokens_seen": 20371095, + "step": 962, + "time_per_iteration": 2.590169906616211 + }, + { + "auxiliary_loss_clip": 0.01214685, + "auxiliary_loss_mlp": 0.01042383, + "balance_loss_clip": 1.06719422, + "balance_loss_mlp": 1.03079033, + "epoch": 0.1157939036854446, + "flos": 18223301301120.0, + "grad_norm": 1.7309649787576362, + "language_loss": 0.82408154, + "learning_rate": 3.9233751675505035e-06, + "loss": 0.84665227, + "num_input_tokens_seen": 20389805, + "step": 963, + "time_per_iteration": 2.5028634071350098 + }, + { + "auxiliary_loss_clip": 0.0120741, + "auxiliary_loss_mlp": 0.01036577, + "balance_loss_clip": 1.06372213, + "balance_loss_mlp": 1.02425122, + "epoch": 0.11591414657608369, + "flos": 23073755189760.0, + "grad_norm": 1.9213161711514417, + "language_loss": 0.84491217, + "learning_rate": 3.923161468340853e-06, + "loss": 0.86735201, + "num_input_tokens_seen": 20409640, + "step": 964, + "time_per_iteration": 2.537877321243286 + }, + { + "auxiliary_loss_clip": 0.01167556, + "auxiliary_loss_mlp": 0.0103726, + "balance_loss_clip": 1.05501771, + "balance_loss_mlp": 1.02598274, + "epoch": 0.11603438946672277, + "flos": 19461716461440.0, + "grad_norm": 2.2108537739764373, + "language_loss": 0.81268519, + "learning_rate": 3.9229474773891374e-06, + "loss": 0.83473337, + "num_input_tokens_seen": 20428180, + "step": 965, + "time_per_iteration": 2.566885232925415 + }, + { + "auxiliary_loss_clip": 0.01200726, + "auxiliary_loss_mlp": 0.0104679, + "balance_loss_clip": 1.05552864, + "balance_loss_mlp": 1.03435028, + "epoch": 0.11615463235736187, + "flos": 26831986272000.0, + "grad_norm": 2.5950524305649187, + "language_loss": 0.83609474, + "learning_rate": 3.922733194727818e-06, + "loss": 0.85856998, + "num_input_tokens_seen": 20447975, + "step": 966, + "time_per_iteration": 2.5872673988342285 + }, + { + "auxiliary_loss_clip": 0.01232226, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.06613946, + "balance_loss_mlp": 1.02712774, + "epoch": 0.11627487524800097, + "flos": 18580324533120.0, + "grad_norm": 2.195412183798511, + "language_loss": 0.87559599, + "learning_rate": 3.922518620389402e-06, + "loss": 0.89830983, + "num_input_tokens_seen": 20464840, + "step": 967, + "time_per_iteration": 2.4864048957824707 + }, + { + "auxiliary_loss_clip": 0.01122904, + "auxiliary_loss_mlp": 0.01039511, + "balance_loss_clip": 1.05243158, + "balance_loss_mlp": 1.02774525, + "epoch": 0.11639511813864005, + "flos": 18150474476160.0, + "grad_norm": 1.8129379110398356, + "language_loss": 0.89285898, + "learning_rate": 3.922303754406439e-06, + "loss": 0.91448319, + "num_input_tokens_seen": 20482680, + "step": 968, + "time_per_iteration": 2.632927417755127 + }, + { + "auxiliary_loss_clip": 0.01178102, + "auxiliary_loss_mlp": 0.01048144, + "balance_loss_clip": 1.05531406, + "balance_loss_mlp": 1.03539503, + "epoch": 0.11651536102927915, + "flos": 20922023888640.0, + "grad_norm": 1.774664277235848, + "language_loss": 0.78873163, + "learning_rate": 3.922088596811526e-06, + "loss": 0.81099409, + "num_input_tokens_seen": 20501810, + "step": 969, + "time_per_iteration": 2.574179172515869 + }, + { + "auxiliary_loss_clip": 0.01214942, + "auxiliary_loss_mlp": 0.01039671, + "balance_loss_clip": 1.06239891, + "balance_loss_mlp": 1.02878129, + "epoch": 0.11663560391991823, + "flos": 16508602776960.0, + "grad_norm": 2.088176378958563, + "language_loss": 0.86821932, + "learning_rate": 3.9218731476373e-06, + "loss": 0.89076543, + "num_input_tokens_seen": 20517995, + "step": 970, + "time_per_iteration": 2.4707088470458984 + }, + { + "auxiliary_loss_clip": 0.01232377, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.06752384, + "balance_loss_mlp": 1.03038561, + "epoch": 0.11675584681055733, + "flos": 19865029345920.0, + "grad_norm": 2.0289596021590954, + "language_loss": 0.84917676, + "learning_rate": 3.9216574069164455e-06, + "loss": 0.87192982, + "num_input_tokens_seen": 20536970, + "step": 971, + "time_per_iteration": 2.4809367656707764 + }, + { + "auxiliary_loss_clip": 0.0124024, + "auxiliary_loss_mlp": 0.0103632, + "balance_loss_clip": 1.06587434, + "balance_loss_mlp": 1.02559125, + "epoch": 0.11687608970119642, + "flos": 21944364785280.0, + "grad_norm": 1.507445810503037, + "language_loss": 0.80209899, + "learning_rate": 3.921441374681691e-06, + "loss": 0.82486451, + "num_input_tokens_seen": 20557030, + "step": 972, + "time_per_iteration": 2.4748337268829346 + }, + { + "auxiliary_loss_clip": 0.01207378, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.06265986, + "balance_loss_mlp": 1.02549529, + "epoch": 0.1169963325918355, + "flos": 24061155131520.0, + "grad_norm": 1.7343948140317902, + "language_loss": 0.64992559, + "learning_rate": 3.921225050965808e-06, + "loss": 0.67236733, + "num_input_tokens_seen": 20576915, + "step": 973, + "time_per_iteration": 2.5454506874084473 + }, + { + "auxiliary_loss_clip": 0.01193986, + "auxiliary_loss_mlp": 0.01039871, + "balance_loss_clip": 1.05988061, + "balance_loss_mlp": 1.02813482, + "epoch": 0.1171165754824746, + "flos": 23368151059200.0, + "grad_norm": 2.9863108316794915, + "language_loss": 0.75187606, + "learning_rate": 3.921008435801612e-06, + "loss": 0.77421463, + "num_input_tokens_seen": 20596000, + "step": 974, + "time_per_iteration": 2.576889753341675 + }, + { + "auxiliary_loss_clip": 0.01216153, + "auxiliary_loss_mlp": 0.01039913, + "balance_loss_clip": 1.06393492, + "balance_loss_mlp": 1.02768183, + "epoch": 0.11723681837311369, + "flos": 18552243075840.0, + "grad_norm": 2.998306888347016, + "language_loss": 0.75803709, + "learning_rate": 3.920791529221963e-06, + "loss": 0.78059769, + "num_input_tokens_seen": 20614675, + "step": 975, + "time_per_iteration": 2.5087087154388428 + }, + { + "auxiliary_loss_clip": 0.01212085, + "auxiliary_loss_mlp": 0.00765631, + "balance_loss_clip": 1.06156683, + "balance_loss_mlp": 1.00047338, + "epoch": 0.11735706126375278, + "flos": 23550541344000.0, + "grad_norm": 3.0561221374508256, + "language_loss": 0.76094311, + "learning_rate": 3.920574331259768e-06, + "loss": 0.78072029, + "num_input_tokens_seen": 20635875, + "step": 976, + "time_per_iteration": 2.575732469558716 + }, + { + "auxiliary_loss_clip": 0.01200721, + "auxiliary_loss_mlp": 0.01038493, + "balance_loss_clip": 1.05958343, + "balance_loss_mlp": 1.02761555, + "epoch": 0.11747730415439187, + "flos": 22381541216640.0, + "grad_norm": 2.261408520507267, + "language_loss": 0.79592514, + "learning_rate": 3.9203568419479716e-06, + "loss": 0.81831729, + "num_input_tokens_seen": 20656430, + "step": 977, + "time_per_iteration": 2.5242087841033936 + }, + { + "auxiliary_loss_clip": 0.01210749, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.0626533, + "balance_loss_mlp": 1.02017522, + "epoch": 0.11759754704503096, + "flos": 22200731130240.0, + "grad_norm": 1.8347922533017418, + "language_loss": 0.75218338, + "learning_rate": 3.92013906131957e-06, + "loss": 0.77459919, + "num_input_tokens_seen": 20675360, + "step": 978, + "time_per_iteration": 3.306124448776245 + }, + { + "auxiliary_loss_clip": 0.01192771, + "auxiliary_loss_mlp": 0.0104972, + "balance_loss_clip": 1.06100798, + "balance_loss_mlp": 1.03914022, + "epoch": 0.11771778993567006, + "flos": 22309755886080.0, + "grad_norm": 1.731545335754581, + "language_loss": 0.82769644, + "learning_rate": 3.9199209894076e-06, + "loss": 0.85012138, + "num_input_tokens_seen": 20695675, + "step": 979, + "time_per_iteration": 2.634705066680908 + }, + { + "auxiliary_loss_clip": 0.01241796, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.06410789, + "balance_loss_mlp": 1.02436852, + "epoch": 0.11783803282630914, + "flos": 21288169175040.0, + "grad_norm": 1.9892014573043397, + "language_loss": 0.89565843, + "learning_rate": 3.919702626245142e-06, + "loss": 0.91844255, + "num_input_tokens_seen": 20715330, + "step": 980, + "time_per_iteration": 2.4749181270599365 + }, + { + "auxiliary_loss_clip": 0.01196062, + "auxiliary_loss_mlp": 0.01038349, + "balance_loss_clip": 1.05921721, + "balance_loss_mlp": 1.0268755, + "epoch": 0.11795827571694824, + "flos": 25371535190400.0, + "grad_norm": 2.502352156104841, + "language_loss": 0.66004789, + "learning_rate": 3.919483971865322e-06, + "loss": 0.682392, + "num_input_tokens_seen": 20735325, + "step": 981, + "time_per_iteration": 2.5485222339630127 + }, + { + "auxiliary_loss_clip": 0.01207878, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.06465697, + "balance_loss_mlp": 1.02694309, + "epoch": 0.11807851860758732, + "flos": 23622218933760.0, + "grad_norm": 1.9869330605738054, + "language_loss": 0.88015175, + "learning_rate": 3.91926502630131e-06, + "loss": 0.90260935, + "num_input_tokens_seen": 20755940, + "step": 982, + "time_per_iteration": 3.292456865310669 + }, + { + "auxiliary_loss_clip": 0.01230836, + "auxiliary_loss_mlp": 0.0103939, + "balance_loss_clip": 1.06921875, + "balance_loss_mlp": 1.02831006, + "epoch": 0.11819876149822642, + "flos": 24972496024320.0, + "grad_norm": 1.9907957085045982, + "language_loss": 0.72103024, + "learning_rate": 3.91904578958632e-06, + "loss": 0.74373245, + "num_input_tokens_seen": 20775355, + "step": 983, + "time_per_iteration": 3.346870183944702 + }, + { + "auxiliary_loss_clip": 0.01244058, + "auxiliary_loss_mlp": 0.01040461, + "balance_loss_clip": 1.06756914, + "balance_loss_mlp": 1.02926111, + "epoch": 0.11831900438886551, + "flos": 23003226835200.0, + "grad_norm": 2.1430857072774065, + "language_loss": 0.84063303, + "learning_rate": 3.918826261753608e-06, + "loss": 0.86347818, + "num_input_tokens_seen": 20794935, + "step": 984, + "time_per_iteration": 3.4190101623535156 + }, + { + "auxiliary_loss_clip": 0.01209609, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.06309915, + "balance_loss_mlp": 1.02459931, + "epoch": 0.1184392472795046, + "flos": 27965147604480.0, + "grad_norm": 3.2547121535372394, + "language_loss": 0.70999396, + "learning_rate": 3.918606442836478e-06, + "loss": 0.73243296, + "num_input_tokens_seen": 20817155, + "step": 985, + "time_per_iteration": 2.659194231033325 + }, + { + "auxiliary_loss_clip": 0.01223663, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.06703138, + "balance_loss_mlp": 1.02761185, + "epoch": 0.1185594901701437, + "flos": 19898497843200.0, + "grad_norm": 1.8835923665142686, + "language_loss": 0.77698404, + "learning_rate": 3.918386332868277e-06, + "loss": 0.79960287, + "num_input_tokens_seen": 20835125, + "step": 986, + "time_per_iteration": 2.531755208969116 + }, + { + "auxiliary_loss_clip": 0.01214708, + "auxiliary_loss_mlp": 0.01046699, + "balance_loss_clip": 1.06207442, + "balance_loss_mlp": 1.03567183, + "epoch": 0.11867973306078278, + "flos": 18912354877440.0, + "grad_norm": 1.987745491114234, + "language_loss": 0.94325513, + "learning_rate": 3.918165931882394e-06, + "loss": 0.96586919, + "num_input_tokens_seen": 20853525, + "step": 987, + "time_per_iteration": 2.5144152641296387 + }, + { + "auxiliary_loss_clip": 0.01152175, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.05158961, + "balance_loss_mlp": 1.02338827, + "epoch": 0.11879997595142187, + "flos": 16982803152000.0, + "grad_norm": 3.860086173502039, + "language_loss": 0.75119507, + "learning_rate": 3.917945239912264e-06, + "loss": 0.77306628, + "num_input_tokens_seen": 20871000, + "step": 988, + "time_per_iteration": 2.61676287651062 + }, + { + "auxiliary_loss_clip": 0.01175609, + "auxiliary_loss_mlp": 0.01040194, + "balance_loss_clip": 1.05690217, + "balance_loss_mlp": 1.03026342, + "epoch": 0.11892021884206096, + "flos": 17530369056000.0, + "grad_norm": 2.019246030487586, + "language_loss": 0.75543213, + "learning_rate": 3.917724256991367e-06, + "loss": 0.77759016, + "num_input_tokens_seen": 20889745, + "step": 989, + "time_per_iteration": 2.5860583782196045 + }, + { + "auxiliary_loss_clip": 0.01198414, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_clip": 1.06000936, + "balance_loss_mlp": 1.03252792, + "epoch": 0.11904046173270005, + "flos": 30955895763840.0, + "grad_norm": 2.2599049787532177, + "language_loss": 0.81590056, + "learning_rate": 3.9175029831532245e-06, + "loss": 0.83832008, + "num_input_tokens_seen": 20909260, + "step": 990, + "time_per_iteration": 2.6035945415496826 + }, + { + "auxiliary_loss_clip": 0.01198369, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.06487334, + "balance_loss_mlp": 1.01970029, + "epoch": 0.11916070462333915, + "flos": 20157234485760.0, + "grad_norm": 2.2220896581073837, + "language_loss": 0.88434994, + "learning_rate": 3.917281418431404e-06, + "loss": 0.90663308, + "num_input_tokens_seen": 20928305, + "step": 991, + "time_per_iteration": 2.5644524097442627 + }, + { + "auxiliary_loss_clip": 0.01207733, + "auxiliary_loss_mlp": 0.01040607, + "balance_loss_clip": 1.06389165, + "balance_loss_mlp": 1.02942562, + "epoch": 0.11928094751397823, + "flos": 23551115961600.0, + "grad_norm": 2.12334113713306, + "language_loss": 0.76963896, + "learning_rate": 3.917059562859516e-06, + "loss": 0.79212236, + "num_input_tokens_seen": 20947630, + "step": 992, + "time_per_iteration": 2.5579957962036133 + }, + { + "auxiliary_loss_clip": 0.01201459, + "auxiliary_loss_mlp": 0.01046014, + "balance_loss_clip": 1.0636301, + "balance_loss_mlp": 1.03369987, + "epoch": 0.11940119040461733, + "flos": 23908426502400.0, + "grad_norm": 2.000616233856319, + "language_loss": 0.88798112, + "learning_rate": 3.916837416471218e-06, + "loss": 0.91045588, + "num_input_tokens_seen": 20964250, + "step": 993, + "time_per_iteration": 2.5860724449157715 + }, + { + "auxiliary_loss_clip": 0.01217463, + "auxiliary_loss_mlp": 0.01039623, + "balance_loss_clip": 1.06013918, + "balance_loss_mlp": 1.0287869, + "epoch": 0.11952143329525641, + "flos": 13844533835520.0, + "grad_norm": 2.4224344105362206, + "language_loss": 0.72160399, + "learning_rate": 3.916614979300207e-06, + "loss": 0.74417478, + "num_input_tokens_seen": 20979095, + "step": 994, + "time_per_iteration": 2.4837634563446045 + }, + { + "auxiliary_loss_clip": 0.01168795, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.05853438, + "balance_loss_mlp": 1.02726245, + "epoch": 0.11964167618589551, + "flos": 27015525792000.0, + "grad_norm": 1.59077822004138, + "language_loss": 0.78389776, + "learning_rate": 3.9163922513802274e-06, + "loss": 0.80596459, + "num_input_tokens_seen": 21001430, + "step": 995, + "time_per_iteration": 2.682748317718506 + }, + { + "auxiliary_loss_clip": 0.01242959, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.06562114, + "balance_loss_mlp": 1.02576244, + "epoch": 0.1197619190765346, + "flos": 12567622273920.0, + "grad_norm": 2.544430831439611, + "language_loss": 0.82472622, + "learning_rate": 3.916169232745067e-06, + "loss": 0.84752238, + "num_input_tokens_seen": 21019105, + "step": 996, + "time_per_iteration": 2.449692726135254 + }, + { + "auxiliary_loss_clip": 0.01197225, + "auxiliary_loss_mlp": 0.01042055, + "balance_loss_clip": 1.05960059, + "balance_loss_mlp": 1.03073013, + "epoch": 0.11988216196717369, + "flos": 16909437623040.0, + "grad_norm": 4.00713386410048, + "language_loss": 0.92010868, + "learning_rate": 3.915945923428559e-06, + "loss": 0.94250154, + "num_input_tokens_seen": 21035630, + "step": 997, + "time_per_iteration": 2.5025994777679443 + }, + { + "auxiliary_loss_clip": 0.01218724, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.06076193, + "balance_loss_mlp": 1.02433729, + "epoch": 0.12000240485781279, + "flos": 16216577205120.0, + "grad_norm": 2.2002820741268483, + "language_loss": 0.83175099, + "learning_rate": 3.915722323464577e-06, + "loss": 0.85429215, + "num_input_tokens_seen": 21054235, + "step": 998, + "time_per_iteration": 2.4662156105041504 + }, + { + "auxiliary_loss_clip": 0.01225108, + "auxiliary_loss_mlp": 0.01038958, + "balance_loss_clip": 1.06418014, + "balance_loss_mlp": 1.02782345, + "epoch": 0.12012264774845187, + "flos": 49344887525760.0, + "grad_norm": 3.118239477616213, + "language_loss": 0.70344037, + "learning_rate": 3.91549843288704e-06, + "loss": 0.72608101, + "num_input_tokens_seen": 21077915, + "step": 999, + "time_per_iteration": 2.7323760986328125 + }, + { + "auxiliary_loss_clip": 0.01190554, + "auxiliary_loss_mlp": 0.00765492, + "balance_loss_clip": 1.05584478, + "balance_loss_mlp": 1.00045073, + "epoch": 0.12024289063909097, + "flos": 26979435601920.0, + "grad_norm": 3.2862755160176986, + "language_loss": 0.78844786, + "learning_rate": 3.915274251729916e-06, + "loss": 0.80800831, + "num_input_tokens_seen": 21099205, + "step": 1000, + "time_per_iteration": 2.616658926010132 + }, + { + "auxiliary_loss_clip": 0.01195309, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.06210959, + "balance_loss_mlp": 1.0232091, + "epoch": 0.12036313352973005, + "flos": 19537308633600.0, + "grad_norm": 2.2424543324362953, + "language_loss": 0.89905041, + "learning_rate": 3.91504978002721e-06, + "loss": 0.92135024, + "num_input_tokens_seen": 21118260, + "step": 1001, + "time_per_iteration": 2.571659803390503 + }, + { + "auxiliary_loss_clip": 0.01212313, + "auxiliary_loss_mlp": 0.00765067, + "balance_loss_clip": 1.06050777, + "balance_loss_mlp": 1.00034499, + "epoch": 0.12048337642036915, + "flos": 17268256535040.0, + "grad_norm": 2.495012207730136, + "language_loss": 0.76347744, + "learning_rate": 3.914825017812974e-06, + "loss": 0.78325123, + "num_input_tokens_seen": 21134910, + "step": 1002, + "time_per_iteration": 2.527287244796753 + }, + { + "auxiliary_loss_clip": 0.0121012, + "auxiliary_loss_mlp": 0.01042553, + "balance_loss_clip": 1.06292295, + "balance_loss_mlp": 1.03119802, + "epoch": 0.12060361931100824, + "flos": 22856962654080.0, + "grad_norm": 2.3731593127385873, + "language_loss": 0.7255789, + "learning_rate": 3.9145999651213065e-06, + "loss": 0.74810565, + "num_input_tokens_seen": 21154150, + "step": 1003, + "time_per_iteration": 2.5315756797790527 + }, + { + "auxiliary_loss_clip": 0.01225386, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.06308436, + "balance_loss_mlp": 1.02978516, + "epoch": 0.12072386220164733, + "flos": 16726795943040.0, + "grad_norm": 2.5450679864483154, + "language_loss": 0.88542783, + "learning_rate": 3.9143746219863465e-06, + "loss": 0.90809989, + "num_input_tokens_seen": 21171255, + "step": 1004, + "time_per_iteration": 2.479185104370117 + }, + { + "auxiliary_loss_clip": 0.01127345, + "auxiliary_loss_mlp": 0.01012278, + "balance_loss_clip": 1.03379703, + "balance_loss_mlp": 1.00798643, + "epoch": 0.12084410509228642, + "flos": 55144176105600.0, + "grad_norm": 0.9506135959761678, + "language_loss": 0.64736652, + "learning_rate": 3.914148988442278e-06, + "loss": 0.6687628, + "num_input_tokens_seen": 21227045, + "step": 1005, + "time_per_iteration": 3.8375391960144043 + }, + { + "auxiliary_loss_clip": 0.01196326, + "auxiliary_loss_mlp": 0.01037221, + "balance_loss_clip": 1.06030977, + "balance_loss_mlp": 1.02602077, + "epoch": 0.1209643479829255, + "flos": 26760236855040.0, + "grad_norm": 2.893596243662484, + "language_loss": 0.94998348, + "learning_rate": 3.91392306452333e-06, + "loss": 0.97231889, + "num_input_tokens_seen": 21244120, + "step": 1006, + "time_per_iteration": 2.564314126968384 + }, + { + "auxiliary_loss_clip": 0.01243884, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.06680822, + "balance_loss_mlp": 1.02466297, + "epoch": 0.1210845908735646, + "flos": 11035026725760.0, + "grad_norm": 2.9051659857921237, + "language_loss": 0.66627771, + "learning_rate": 3.913696850263774e-06, + "loss": 0.68907094, + "num_input_tokens_seen": 21258485, + "step": 1007, + "time_per_iteration": 2.44962477684021 + }, + { + "auxiliary_loss_clip": 0.01222199, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.06305814, + "balance_loss_mlp": 1.02517295, + "epoch": 0.1212048337642037, + "flos": 20484631975680.0, + "grad_norm": 2.160124960372422, + "language_loss": 0.79142213, + "learning_rate": 3.913470345697929e-06, + "loss": 0.81400454, + "num_input_tokens_seen": 21277115, + "step": 1008, + "time_per_iteration": 3.2820069789886475 + }, + { + "auxiliary_loss_clip": 0.01180831, + "auxiliary_loss_mlp": 0.01039169, + "balance_loss_clip": 1.05951071, + "balance_loss_mlp": 1.02840388, + "epoch": 0.12132507665484278, + "flos": 22346061557760.0, + "grad_norm": 1.9923493404607497, + "language_loss": 0.85367453, + "learning_rate": 3.913243550860153e-06, + "loss": 0.87587458, + "num_input_tokens_seen": 21294880, + "step": 1009, + "time_per_iteration": 3.4428417682647705 + }, + { + "auxiliary_loss_clip": 0.01229164, + "auxiliary_loss_mlp": 0.01038916, + "balance_loss_clip": 1.06846964, + "balance_loss_mlp": 1.027174, + "epoch": 0.12144531954548188, + "flos": 29314957818240.0, + "grad_norm": 1.8287365473090584, + "language_loss": 0.75803041, + "learning_rate": 3.913016465784852e-06, + "loss": 0.78071123, + "num_input_tokens_seen": 21315555, + "step": 1010, + "time_per_iteration": 3.3135924339294434 + }, + { + "auxiliary_loss_clip": 0.0117923, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.05669916, + "balance_loss_mlp": 1.02526319, + "epoch": 0.12156556243612096, + "flos": 20485242506880.0, + "grad_norm": 2.689683786058729, + "language_loss": 0.71877766, + "learning_rate": 3.912789090506474e-06, + "loss": 0.74093866, + "num_input_tokens_seen": 21334815, + "step": 1011, + "time_per_iteration": 2.6023337841033936 + }, + { + "auxiliary_loss_clip": 0.01199186, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.05843174, + "balance_loss_mlp": 1.03236699, + "epoch": 0.12168580532676006, + "flos": 16472009796480.0, + "grad_norm": 2.5327850866752173, + "language_loss": 0.71582115, + "learning_rate": 3.9125614250595114e-06, + "loss": 0.73825479, + "num_input_tokens_seen": 21351025, + "step": 1012, + "time_per_iteration": 2.5997812747955322 + }, + { + "auxiliary_loss_clip": 0.01224702, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.06262732, + "balance_loss_mlp": 1.02311993, + "epoch": 0.12180604821739914, + "flos": 15341290588800.0, + "grad_norm": 2.760929419223655, + "language_loss": 0.8936317, + "learning_rate": 3.912333469478502e-06, + "loss": 0.91622609, + "num_input_tokens_seen": 21368990, + "step": 1013, + "time_per_iteration": 2.5036745071411133 + }, + { + "auxiliary_loss_clip": 0.01208019, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.06008339, + "balance_loss_mlp": 1.02122307, + "epoch": 0.12192629110803824, + "flos": 19318038059520.0, + "grad_norm": 2.3125368075001593, + "language_loss": 0.78108817, + "learning_rate": 3.912105223798025e-06, + "loss": 0.80348527, + "num_input_tokens_seen": 21388410, + "step": 1014, + "time_per_iteration": 2.5596213340759277 + }, + { + "auxiliary_loss_clip": 0.01109247, + "auxiliary_loss_mlp": 0.01007281, + "balance_loss_clip": 1.02981305, + "balance_loss_mlp": 1.00365722, + "epoch": 0.12204653399867733, + "flos": 47725354085760.0, + "grad_norm": 0.9948488256804243, + "language_loss": 0.6767996, + "learning_rate": 3.9118766880527065e-06, + "loss": 0.69796491, + "num_input_tokens_seen": 21442845, + "step": 1015, + "time_per_iteration": 3.054555654525757 + }, + { + "auxiliary_loss_clip": 0.01168725, + "auxiliary_loss_mlp": 0.0103196, + "balance_loss_clip": 1.05640578, + "balance_loss_mlp": 1.02146935, + "epoch": 0.12216677688931642, + "flos": 18221936584320.0, + "grad_norm": 1.6382514576502794, + "language_loss": 0.73793364, + "learning_rate": 3.9116478622772145e-06, + "loss": 0.75994051, + "num_input_tokens_seen": 21461420, + "step": 1016, + "time_per_iteration": 2.5884268283843994 + }, + { + "auxiliary_loss_clip": 0.01222025, + "auxiliary_loss_mlp": 0.01042308, + "balance_loss_clip": 1.06401432, + "balance_loss_mlp": 1.03101873, + "epoch": 0.12228701977995551, + "flos": 27525636789120.0, + "grad_norm": 1.886617060034304, + "language_loss": 0.87957817, + "learning_rate": 3.911418746506261e-06, + "loss": 0.90222144, + "num_input_tokens_seen": 21481550, + "step": 1017, + "time_per_iteration": 2.5491297245025635 + }, + { + "auxiliary_loss_clip": 0.01229126, + "auxiliary_loss_mlp": 0.01042367, + "balance_loss_clip": 1.06916904, + "balance_loss_mlp": 1.03141785, + "epoch": 0.1224072626705946, + "flos": 21798136517760.0, + "grad_norm": 1.95193451134702, + "language_loss": 0.78226215, + "learning_rate": 3.911189340774604e-06, + "loss": 0.80497706, + "num_input_tokens_seen": 21501680, + "step": 1018, + "time_per_iteration": 2.515857219696045 + }, + { + "auxiliary_loss_clip": 0.01216767, + "auxiliary_loss_mlp": 0.01038748, + "balance_loss_clip": 1.06260347, + "balance_loss_mlp": 1.02749467, + "epoch": 0.1225275055612337, + "flos": 20703758895360.0, + "grad_norm": 2.041030030369283, + "language_loss": 0.7948429, + "learning_rate": 3.910959645117043e-06, + "loss": 0.81739807, + "num_input_tokens_seen": 21521015, + "step": 1019, + "time_per_iteration": 2.5398073196411133 + }, + { + "auxiliary_loss_clip": 0.0111412, + "auxiliary_loss_mlp": 0.00755979, + "balance_loss_clip": 1.0288105, + "balance_loss_mlp": 1.00022078, + "epoch": 0.12264774845187278, + "flos": 57745294462080.0, + "grad_norm": 0.8193647556270814, + "language_loss": 0.56714702, + "learning_rate": 3.910729659568423e-06, + "loss": 0.58584797, + "num_input_tokens_seen": 21578200, + "step": 1020, + "time_per_iteration": 3.085686683654785 + }, + { + "auxiliary_loss_clip": 0.0120951, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.06449008, + "balance_loss_mlp": 1.0314163, + "epoch": 0.12276799134251187, + "flos": 26396282298240.0, + "grad_norm": 1.7063865795684972, + "language_loss": 0.82347667, + "learning_rate": 3.9104993841636344e-06, + "loss": 0.84598774, + "num_input_tokens_seen": 21598770, + "step": 1021, + "time_per_iteration": 2.6150646209716797 + }, + { + "auxiliary_loss_clip": 0.01210581, + "auxiliary_loss_mlp": 0.00764473, + "balance_loss_clip": 1.06787813, + "balance_loss_mlp": 1.0003643, + "epoch": 0.12288823423315097, + "flos": 21064193919360.0, + "grad_norm": 1.7132813908752555, + "language_loss": 0.80821955, + "learning_rate": 3.910268818937608e-06, + "loss": 0.82797015, + "num_input_tokens_seen": 21616925, + "step": 1022, + "time_per_iteration": 2.5742647647857666 + }, + { + "auxiliary_loss_clip": 0.01179661, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.06122351, + "balance_loss_mlp": 1.02368176, + "epoch": 0.12300847712379005, + "flos": 12312441077760.0, + "grad_norm": 2.5556100071525303, + "language_loss": 0.87332755, + "learning_rate": 3.9100379639253196e-06, + "loss": 0.89547038, + "num_input_tokens_seen": 21633645, + "step": 1023, + "time_per_iteration": 2.5703623294830322 + }, + { + "auxiliary_loss_clip": 0.01208191, + "auxiliary_loss_mlp": 0.01040404, + "balance_loss_clip": 1.0590651, + "balance_loss_mlp": 1.0288769, + "epoch": 0.12312872001442915, + "flos": 16762239688320.0, + "grad_norm": 2.268176392003377, + "language_loss": 0.86018825, + "learning_rate": 3.909806819161791e-06, + "loss": 0.88267416, + "num_input_tokens_seen": 21649120, + "step": 1024, + "time_per_iteration": 2.4996278285980225 + }, + { + "auxiliary_loss_clip": 0.01197656, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.06013823, + "balance_loss_mlp": 1.02078199, + "epoch": 0.12324896290506823, + "flos": 18404937400320.0, + "grad_norm": 2.402161768786494, + "language_loss": 0.86334985, + "learning_rate": 3.909575384682086e-06, + "loss": 0.8856473, + "num_input_tokens_seen": 21668000, + "step": 1025, + "time_per_iteration": 2.5582594871520996 + }, + { + "auxiliary_loss_clip": 0.0122758, + "auxiliary_loss_mlp": 0.0105029, + "balance_loss_clip": 1.06358445, + "balance_loss_mlp": 1.03881061, + "epoch": 0.12336920579570733, + "flos": 18915407533440.0, + "grad_norm": 1.867875334313174, + "language_loss": 0.6914354, + "learning_rate": 3.9093436605213144e-06, + "loss": 0.71421409, + "num_input_tokens_seen": 21688500, + "step": 1026, + "time_per_iteration": 2.51131534576416 + }, + { + "auxiliary_loss_clip": 0.012109, + "auxiliary_loss_mlp": 0.01042902, + "balance_loss_clip": 1.06294799, + "balance_loss_mlp": 1.03216672, + "epoch": 0.12348944868634643, + "flos": 23878369797120.0, + "grad_norm": 1.9226511412832166, + "language_loss": 0.79420924, + "learning_rate": 3.909111646714627e-06, + "loss": 0.81674731, + "num_input_tokens_seen": 21709345, + "step": 1027, + "time_per_iteration": 2.575458526611328 + }, + { + "auxiliary_loss_clip": 0.01235901, + "auxiliary_loss_mlp": 0.01034317, + "balance_loss_clip": 1.06399333, + "balance_loss_mlp": 1.02393436, + "epoch": 0.12360969157698551, + "flos": 19026084314880.0, + "grad_norm": 2.429132795661575, + "language_loss": 0.72738689, + "learning_rate": 3.9088793432972206e-06, + "loss": 0.75008905, + "num_input_tokens_seen": 21728165, + "step": 1028, + "time_per_iteration": 2.4669811725616455 + }, + { + "auxiliary_loss_clip": 0.01179451, + "auxiliary_loss_mlp": 0.01042688, + "balance_loss_clip": 1.06036329, + "balance_loss_mlp": 1.03150034, + "epoch": 0.1237299344676246, + "flos": 13224607983360.0, + "grad_norm": 2.1065718642925475, + "language_loss": 0.81974977, + "learning_rate": 3.908646750304336e-06, + "loss": 0.84197116, + "num_input_tokens_seen": 21745850, + "step": 1029, + "time_per_iteration": 2.5811209678649902 + }, + { + "auxiliary_loss_clip": 0.01215118, + "auxiliary_loss_mlp": 0.01038261, + "balance_loss_clip": 1.0672617, + "balance_loss_mlp": 1.02732921, + "epoch": 0.12385017735826369, + "flos": 20485673470080.0, + "grad_norm": 1.6902715753931756, + "language_loss": 0.87339199, + "learning_rate": 3.908413867771257e-06, + "loss": 0.89592576, + "num_input_tokens_seen": 21764760, + "step": 1030, + "time_per_iteration": 2.538928747177124 + }, + { + "auxiliary_loss_clip": 0.01225078, + "auxiliary_loss_mlp": 0.01042855, + "balance_loss_clip": 1.06630373, + "balance_loss_mlp": 1.03117847, + "epoch": 0.12397042024890279, + "flos": 17347835116800.0, + "grad_norm": 1.7166033318741447, + "language_loss": 0.80663145, + "learning_rate": 3.908180695733311e-06, + "loss": 0.82931083, + "num_input_tokens_seen": 21784250, + "step": 1031, + "time_per_iteration": 3.2833802700042725 + }, + { + "auxiliary_loss_clip": 0.01159593, + "auxiliary_loss_mlp": 0.01044995, + "balance_loss_clip": 1.0551846, + "balance_loss_mlp": 1.03382528, + "epoch": 0.12409066313954187, + "flos": 20412343854720.0, + "grad_norm": 1.9120816224450847, + "language_loss": 0.83078241, + "learning_rate": 3.907947234225871e-06, + "loss": 0.85282826, + "num_input_tokens_seen": 21803260, + "step": 1032, + "time_per_iteration": 2.611321210861206 + }, + { + "auxiliary_loss_clip": 0.01157554, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.05684793, + "balance_loss_mlp": 1.02540469, + "epoch": 0.12421090603018096, + "flos": 20736688688640.0, + "grad_norm": 1.8388782370669954, + "language_loss": 0.8740828, + "learning_rate": 3.907713483284352e-06, + "loss": 0.89601773, + "num_input_tokens_seen": 21822735, + "step": 1033, + "time_per_iteration": 2.647279739379883 + }, + { + "auxiliary_loss_clip": 0.01136222, + "auxiliary_loss_mlp": 0.01044487, + "balance_loss_clip": 1.05101001, + "balance_loss_mlp": 1.03177941, + "epoch": 0.12433114892082006, + "flos": 24498834353280.0, + "grad_norm": 2.106567821146967, + "language_loss": 0.97263962, + "learning_rate": 3.907479442944216e-06, + "loss": 0.99444675, + "num_input_tokens_seen": 21841140, + "step": 1034, + "time_per_iteration": 3.5763518810272217 + }, + { + "auxiliary_loss_clip": 0.01224142, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.06555665, + "balance_loss_mlp": 1.02555907, + "epoch": 0.12445139181145914, + "flos": 19682315838720.0, + "grad_norm": 2.280506538027157, + "language_loss": 0.92429054, + "learning_rate": 3.907245113240963e-06, + "loss": 0.94688737, + "num_input_tokens_seen": 21859260, + "step": 1035, + "time_per_iteration": 2.65336012840271 + }, + { + "auxiliary_loss_clip": 0.01192396, + "auxiliary_loss_mlp": 0.0103665, + "balance_loss_clip": 1.05646026, + "balance_loss_mlp": 1.02526569, + "epoch": 0.12457163470209824, + "flos": 46423087522560.0, + "grad_norm": 1.8627327416979071, + "language_loss": 0.73966986, + "learning_rate": 3.907010494210144e-06, + "loss": 0.76196033, + "num_input_tokens_seen": 21881920, + "step": 1036, + "time_per_iteration": 4.4664623737335205 + }, + { + "auxiliary_loss_clip": 0.01229897, + "auxiliary_loss_mlp": 0.01042241, + "balance_loss_clip": 1.06827021, + "balance_loss_mlp": 1.0302068, + "epoch": 0.12469187759273732, + "flos": 20376289578240.0, + "grad_norm": 2.0946523610655383, + "language_loss": 0.91941917, + "learning_rate": 3.9067755858873495e-06, + "loss": 0.94214058, + "num_input_tokens_seen": 21898720, + "step": 1037, + "time_per_iteration": 2.5376927852630615 + }, + { + "auxiliary_loss_clip": 0.0109134, + "auxiliary_loss_mlp": 0.01009609, + "balance_loss_clip": 1.0210433, + "balance_loss_mlp": 1.00593686, + "epoch": 0.12481212048337642, + "flos": 69224641447680.0, + "grad_norm": 1.0001147507922477, + "language_loss": 0.62824297, + "learning_rate": 3.906540388308214e-06, + "loss": 0.64925241, + "num_input_tokens_seen": 21958305, + "step": 1038, + "time_per_iteration": 3.164268732070923 + }, + { + "auxiliary_loss_clip": 0.01167378, + "auxiliary_loss_mlp": 0.01045937, + "balance_loss_clip": 1.06026554, + "balance_loss_mlp": 1.03440309, + "epoch": 0.12493236337401552, + "flos": 18223696350720.0, + "grad_norm": 1.7051134988560068, + "language_loss": 0.81245005, + "learning_rate": 3.906304901508417e-06, + "loss": 0.83458322, + "num_input_tokens_seen": 21977205, + "step": 1039, + "time_per_iteration": 2.6050333976745605 + }, + { + "auxiliary_loss_clip": 0.01229211, + "auxiliary_loss_mlp": 0.01038798, + "balance_loss_clip": 1.07044268, + "balance_loss_mlp": 1.02856946, + "epoch": 0.12505260626465461, + "flos": 30044375303040.0, + "grad_norm": 2.155077926610924, + "language_loss": 0.75774276, + "learning_rate": 3.9060691255236835e-06, + "loss": 0.78042287, + "num_input_tokens_seen": 21997770, + "step": 1040, + "time_per_iteration": 2.615272045135498 + }, + { + "auxiliary_loss_clip": 0.01219752, + "auxiliary_loss_mlp": 0.01040564, + "balance_loss_clip": 1.06083965, + "balance_loss_mlp": 1.02887535, + "epoch": 0.1251728491552937, + "flos": 24433980347520.0, + "grad_norm": 1.6368553499548537, + "language_loss": 0.80880582, + "learning_rate": 3.905833060389778e-06, + "loss": 0.83140898, + "num_input_tokens_seen": 22021890, + "step": 1041, + "time_per_iteration": 2.695765495300293 + }, + { + "auxiliary_loss_clip": 0.01245234, + "auxiliary_loss_mlp": 0.00765361, + "balance_loss_clip": 1.0701437, + "balance_loss_mlp": 1.00035691, + "epoch": 0.12529309204593278, + "flos": 27119809952640.0, + "grad_norm": 2.1798328646153435, + "language_loss": 0.7858268, + "learning_rate": 3.905596706142513e-06, + "loss": 0.80593276, + "num_input_tokens_seen": 22043300, + "step": 1042, + "time_per_iteration": 2.6093451976776123 + }, + { + "auxiliary_loss_clip": 0.01190812, + "auxiliary_loss_mlp": 0.01040892, + "balance_loss_clip": 1.05983591, + "balance_loss_mlp": 1.02929258, + "epoch": 0.12541333493657186, + "flos": 30774151923840.0, + "grad_norm": 1.9521067804863694, + "language_loss": 0.86141968, + "learning_rate": 3.9053600628177435e-06, + "loss": 0.88373667, + "num_input_tokens_seen": 22062910, + "step": 1043, + "time_per_iteration": 2.675492286682129 + }, + { + "auxiliary_loss_clip": 0.01240633, + "auxiliary_loss_mlp": 0.01037276, + "balance_loss_clip": 1.06728458, + "balance_loss_mlp": 1.02639842, + "epoch": 0.12553357782721097, + "flos": 23659566099840.0, + "grad_norm": 2.62627250865552, + "language_loss": 0.84891093, + "learning_rate": 3.905123130451367e-06, + "loss": 0.87169003, + "num_input_tokens_seen": 22084010, + "step": 1044, + "time_per_iteration": 2.49338698387146 + }, + { + "auxiliary_loss_clip": 0.01245209, + "auxiliary_loss_mlp": 0.0103752, + "balance_loss_clip": 1.07069075, + "balance_loss_mlp": 1.02608204, + "epoch": 0.12565382071785006, + "flos": 24863758577280.0, + "grad_norm": 1.7801995033530973, + "language_loss": 0.79284859, + "learning_rate": 3.904885909079326e-06, + "loss": 0.81567585, + "num_input_tokens_seen": 22102795, + "step": 1045, + "time_per_iteration": 2.5076217651367188 + }, + { + "auxiliary_loss_clip": 0.01227492, + "auxiliary_loss_mlp": 0.01040487, + "balance_loss_clip": 1.06547296, + "balance_loss_mlp": 1.02920365, + "epoch": 0.12577406360848914, + "flos": 21360780518400.0, + "grad_norm": 2.3737440962444496, + "language_loss": 0.77560878, + "learning_rate": 3.904648398737607e-06, + "loss": 0.79828852, + "num_input_tokens_seen": 22121360, + "step": 1046, + "time_per_iteration": 2.490615129470825 + }, + { + "auxiliary_loss_clip": 0.01243166, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.06848121, + "balance_loss_mlp": 1.02785814, + "epoch": 0.12589430649912825, + "flos": 36138056774400.0, + "grad_norm": 1.7820145029142718, + "language_loss": 0.7803123, + "learning_rate": 3.9044105994622406e-06, + "loss": 0.80313218, + "num_input_tokens_seen": 22142505, + "step": 1047, + "time_per_iteration": 2.5920472145080566 + }, + { + "auxiliary_loss_clip": 0.01214221, + "auxiliary_loss_mlp": 0.00765372, + "balance_loss_clip": 1.06341505, + "balance_loss_mlp": 1.00036967, + "epoch": 0.12601454938976733, + "flos": 25337671643520.0, + "grad_norm": 2.106357175233106, + "language_loss": 0.81806362, + "learning_rate": 3.9041725112893005e-06, + "loss": 0.83785951, + "num_input_tokens_seen": 22163730, + "step": 1048, + "time_per_iteration": 2.6157383918762207 + }, + { + "auxiliary_loss_clip": 0.01191944, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_clip": 1.06491637, + "balance_loss_mlp": 1.0299263, + "epoch": 0.12613479228040642, + "flos": 15560094286080.0, + "grad_norm": 1.6632733205518466, + "language_loss": 0.7529816, + "learning_rate": 3.903934134254904e-06, + "loss": 0.77530819, + "num_input_tokens_seen": 22181520, + "step": 1049, + "time_per_iteration": 2.632760763168335 + }, + { + "auxiliary_loss_clip": 0.01231224, + "auxiliary_loss_mlp": 0.01041555, + "balance_loss_clip": 1.06608284, + "balance_loss_mlp": 1.03021264, + "epoch": 0.1262550351710455, + "flos": 21470595373440.0, + "grad_norm": 3.7100774714131655, + "language_loss": 0.85142881, + "learning_rate": 3.903695468395213e-06, + "loss": 0.87415665, + "num_input_tokens_seen": 22199390, + "step": 1050, + "time_per_iteration": 2.552403450012207 + }, + { + "auxiliary_loss_clip": 0.01214054, + "auxiliary_loss_mlp": 0.01038958, + "balance_loss_clip": 1.06070554, + "balance_loss_mlp": 1.02858639, + "epoch": 0.1263752780616846, + "flos": 31576719456000.0, + "grad_norm": 2.0985579580011686, + "language_loss": 0.56048161, + "learning_rate": 3.903456513746434e-06, + "loss": 0.58301175, + "num_input_tokens_seen": 22220365, + "step": 1051, + "time_per_iteration": 2.61303448677063 + }, + { + "auxiliary_loss_clip": 0.01237963, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.06574118, + "balance_loss_mlp": 1.02503014, + "epoch": 0.1264955209523237, + "flos": 28768217927040.0, + "grad_norm": 1.7854380102046692, + "language_loss": 0.8765853, + "learning_rate": 3.903217270344815e-06, + "loss": 0.89931989, + "num_input_tokens_seen": 22240615, + "step": 1052, + "time_per_iteration": 2.5789434909820557 + }, + { + "auxiliary_loss_clip": 0.01186302, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.05858338, + "balance_loss_mlp": 1.02518749, + "epoch": 0.12661576384296278, + "flos": 29241125412480.0, + "grad_norm": 1.72379239168542, + "language_loss": 0.82266384, + "learning_rate": 3.902977738226648e-06, + "loss": 0.84488958, + "num_input_tokens_seen": 22261350, + "step": 1053, + "time_per_iteration": 2.650184392929077 + }, + { + "auxiliary_loss_clip": 0.01229673, + "auxiliary_loss_mlp": 0.0104705, + "balance_loss_clip": 1.06689847, + "balance_loss_mlp": 1.03505719, + "epoch": 0.12673600673360189, + "flos": 20850346298880.0, + "grad_norm": 1.8044427345592073, + "language_loss": 0.91212952, + "learning_rate": 3.902737917428273e-06, + "loss": 0.93489671, + "num_input_tokens_seen": 22279515, + "step": 1054, + "time_per_iteration": 2.508131265640259 + }, + { + "auxiliary_loss_clip": 0.01240674, + "auxiliary_loss_mlp": 0.01036732, + "balance_loss_clip": 1.06721151, + "balance_loss_mlp": 1.02571154, + "epoch": 0.12685624962424097, + "flos": 25263695583360.0, + "grad_norm": 4.01873402954764, + "language_loss": 0.84079027, + "learning_rate": 3.902497807986068e-06, + "loss": 0.86356437, + "num_input_tokens_seen": 22299535, + "step": 1055, + "time_per_iteration": 2.5503883361816406 + }, + { + "auxiliary_loss_clip": 0.01196004, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.05921578, + "balance_loss_mlp": 1.02476454, + "epoch": 0.12697649251488005, + "flos": 27527109246720.0, + "grad_norm": 1.6773364274204108, + "language_loss": 0.83840626, + "learning_rate": 3.902257409936458e-06, + "loss": 0.86072701, + "num_input_tokens_seen": 22320300, + "step": 1056, + "time_per_iteration": 2.6635420322418213 + }, + { + "auxiliary_loss_clip": 0.01209388, + "auxiliary_loss_mlp": 0.01040591, + "balance_loss_clip": 1.06616449, + "balance_loss_mlp": 1.0300889, + "epoch": 0.12709673540551916, + "flos": 21251863503360.0, + "grad_norm": 1.927784915369351, + "language_loss": 0.83696556, + "learning_rate": 3.902016723315912e-06, + "loss": 0.85946536, + "num_input_tokens_seen": 22338240, + "step": 1057, + "time_per_iteration": 2.5404887199401855 + }, + { + "auxiliary_loss_clip": 0.01221095, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.06202698, + "balance_loss_mlp": 1.02409053, + "epoch": 0.12721697829615825, + "flos": 25337707557120.0, + "grad_norm": 2.483943081971737, + "language_loss": 0.69294178, + "learning_rate": 3.901775748160941e-06, + "loss": 0.71549946, + "num_input_tokens_seen": 22357420, + "step": 1058, + "time_per_iteration": 3.3249402046203613 + }, + { + "auxiliary_loss_clip": 0.01102885, + "auxiliary_loss_mlp": 0.01001619, + "balance_loss_clip": 1.026752, + "balance_loss_mlp": 0.99780428, + "epoch": 0.12733722118679733, + "flos": 61943287754880.0, + "grad_norm": 0.7983731988070236, + "language_loss": 0.60895765, + "learning_rate": 3.901534484508101e-06, + "loss": 0.63000274, + "num_input_tokens_seen": 22420095, + "step": 1059, + "time_per_iteration": 3.100820541381836 + }, + { + "auxiliary_loss_clip": 0.01200352, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.05998981, + "balance_loss_mlp": 1.02549493, + "epoch": 0.1274574640774364, + "flos": 26976742081920.0, + "grad_norm": 1.8629448015417085, + "language_loss": 0.74600029, + "learning_rate": 3.901292932393991e-06, + "loss": 0.76837039, + "num_input_tokens_seen": 22438975, + "step": 1060, + "time_per_iteration": 2.5768985748291016 + }, + { + "auxiliary_loss_clip": 0.01242005, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.06916153, + "balance_loss_mlp": 1.02796388, + "epoch": 0.12757770696807552, + "flos": 22236318529920.0, + "grad_norm": 2.2893269020592437, + "language_loss": 0.8508355, + "learning_rate": 3.9010510918552555e-06, + "loss": 0.87364608, + "num_input_tokens_seen": 22458050, + "step": 1061, + "time_per_iteration": 3.2835347652435303 + }, + { + "auxiliary_loss_clip": 0.01207859, + "auxiliary_loss_mlp": 0.0104622, + "balance_loss_clip": 1.06139314, + "balance_loss_mlp": 1.03361976, + "epoch": 0.1276979498587146, + "flos": 28547905858560.0, + "grad_norm": 4.188987291251336, + "language_loss": 0.74577963, + "learning_rate": 3.900808962928581e-06, + "loss": 0.7683205, + "num_input_tokens_seen": 22475665, + "step": 1062, + "time_per_iteration": 3.3650522232055664 + }, + { + "auxiliary_loss_clip": 0.01242491, + "auxiliary_loss_mlp": 0.01037452, + "balance_loss_clip": 1.0699898, + "balance_loss_mlp": 1.02603173, + "epoch": 0.1278181927493537, + "flos": 17420338719360.0, + "grad_norm": 2.1479667986531856, + "language_loss": 0.89134306, + "learning_rate": 3.900566545650698e-06, + "loss": 0.91414249, + "num_input_tokens_seen": 22493335, + "step": 1063, + "time_per_iteration": 3.3058266639709473 + }, + { + "auxiliary_loss_clip": 0.01223868, + "auxiliary_loss_mlp": 0.01036379, + "balance_loss_clip": 1.06512141, + "balance_loss_mlp": 1.02440476, + "epoch": 0.1279384356399928, + "flos": 21138636856320.0, + "grad_norm": 2.3922258063771804, + "language_loss": 0.82154095, + "learning_rate": 3.900323840058381e-06, + "loss": 0.84414339, + "num_input_tokens_seen": 22511045, + "step": 1064, + "time_per_iteration": 2.5138700008392334 + }, + { + "auxiliary_loss_clip": 0.01225495, + "auxiliary_loss_mlp": 0.01039248, + "balance_loss_clip": 1.0636059, + "balance_loss_mlp": 1.02915728, + "epoch": 0.12805867853063188, + "flos": 26576733248640.0, + "grad_norm": 1.9512216990672653, + "language_loss": 0.81598788, + "learning_rate": 3.900080846188449e-06, + "loss": 0.83863533, + "num_input_tokens_seen": 22529635, + "step": 1065, + "time_per_iteration": 2.5332136154174805 + }, + { + "auxiliary_loss_clip": 0.01239505, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.06651747, + "balance_loss_mlp": 1.02227354, + "epoch": 0.12817892142127096, + "flos": 16436206915200.0, + "grad_norm": 1.7980703944891665, + "language_loss": 0.81329292, + "learning_rate": 3.8998375640777625e-06, + "loss": 0.83602387, + "num_input_tokens_seen": 22547505, + "step": 1066, + "time_per_iteration": 2.497640371322632 + }, + { + "auxiliary_loss_clip": 0.01111124, + "auxiliary_loss_mlp": 0.01003771, + "balance_loss_clip": 1.03616667, + "balance_loss_mlp": 1.00154197, + "epoch": 0.12829916431191005, + "flos": 60757049099520.0, + "grad_norm": 0.707113877470407, + "language_loss": 0.52609825, + "learning_rate": 3.899593993763229e-06, + "loss": 0.54724717, + "num_input_tokens_seen": 22608465, + "step": 1067, + "time_per_iteration": 3.0470235347747803 + }, + { + "auxiliary_loss_clip": 0.01188836, + "auxiliary_loss_mlp": 0.01044272, + "balance_loss_clip": 1.06126368, + "balance_loss_mlp": 1.03125417, + "epoch": 0.12841940720254916, + "flos": 29786895636480.0, + "grad_norm": 2.6396501967124175, + "language_loss": 0.80915558, + "learning_rate": 3.899350135281796e-06, + "loss": 0.83148664, + "num_input_tokens_seen": 22629465, + "step": 1068, + "time_per_iteration": 2.640507698059082 + }, + { + "auxiliary_loss_clip": 0.01198568, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.06479394, + "balance_loss_mlp": 1.02546, + "epoch": 0.12853965009318824, + "flos": 25951851319680.0, + "grad_norm": 1.9389338071821374, + "language_loss": 0.79832393, + "learning_rate": 3.8991059886704585e-06, + "loss": 0.82067275, + "num_input_tokens_seen": 22648970, + "step": 1069, + "time_per_iteration": 2.6761837005615234 + }, + { + "auxiliary_loss_clip": 0.01188789, + "auxiliary_loss_mlp": 0.01042628, + "balance_loss_clip": 1.06134629, + "balance_loss_mlp": 1.0314703, + "epoch": 0.12865989298382732, + "flos": 30846871008000.0, + "grad_norm": 2.2602428824125598, + "language_loss": 0.82847404, + "learning_rate": 3.898861553966252e-06, + "loss": 0.85078824, + "num_input_tokens_seen": 22668620, + "step": 1070, + "time_per_iteration": 2.70988130569458 + }, + { + "auxiliary_loss_clip": 0.01151166, + "auxiliary_loss_mlp": 0.01039337, + "balance_loss_clip": 1.05537891, + "balance_loss_mlp": 1.02836394, + "epoch": 0.12878013587446643, + "flos": 25885776251520.0, + "grad_norm": 1.6666163727059524, + "language_loss": 0.88104689, + "learning_rate": 3.898616831206257e-06, + "loss": 0.90295196, + "num_input_tokens_seen": 22689045, + "step": 1071, + "time_per_iteration": 2.8331408500671387 + }, + { + "auxiliary_loss_clip": 0.01191237, + "auxiliary_loss_mlp": 0.0103736, + "balance_loss_clip": 1.05831516, + "balance_loss_mlp": 1.02466404, + "epoch": 0.12890037876510552, + "flos": 23333138277120.0, + "grad_norm": 2.057483069310142, + "language_loss": 0.77174461, + "learning_rate": 3.8983718204276e-06, + "loss": 0.79403061, + "num_input_tokens_seen": 22711265, + "step": 1072, + "time_per_iteration": 3.0242512226104736 + }, + { + "auxiliary_loss_clip": 0.01206664, + "auxiliary_loss_mlp": 0.01041162, + "balance_loss_clip": 1.06268311, + "balance_loss_mlp": 1.03083849, + "epoch": 0.1290206216557446, + "flos": 23587242065280.0, + "grad_norm": 1.6864039345781348, + "language_loss": 0.82635617, + "learning_rate": 3.898126521667446e-06, + "loss": 0.84883446, + "num_input_tokens_seen": 22731420, + "step": 1073, + "time_per_iteration": 2.6497066020965576 + }, + { + "auxiliary_loss_clip": 0.01221806, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.0620898, + "balance_loss_mlp": 1.03396761, + "epoch": 0.12914086454638368, + "flos": 24170610850560.0, + "grad_norm": 2.2821662478811953, + "language_loss": 0.83277601, + "learning_rate": 3.897880934963007e-06, + "loss": 0.85545003, + "num_input_tokens_seen": 22750970, + "step": 1074, + "time_per_iteration": 2.5787861347198486 + }, + { + "auxiliary_loss_clip": 0.01205715, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.06036198, + "balance_loss_mlp": 1.02360654, + "epoch": 0.1292611074370228, + "flos": 20267157081600.0, + "grad_norm": 1.9646564316207322, + "language_loss": 0.78070426, + "learning_rate": 3.89763506035154e-06, + "loss": 0.80311489, + "num_input_tokens_seen": 22768820, + "step": 1075, + "time_per_iteration": 2.59287166595459 + }, + { + "auxiliary_loss_clip": 0.01211128, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.06238461, + "balance_loss_mlp": 1.02270508, + "epoch": 0.12938135032766188, + "flos": 27377684668800.0, + "grad_norm": 1.910192559898083, + "language_loss": 0.81315136, + "learning_rate": 3.897388897870343e-06, + "loss": 0.8355971, + "num_input_tokens_seen": 22789460, + "step": 1076, + "time_per_iteration": 2.549440860748291 + }, + { + "auxiliary_loss_clip": 0.01222513, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.06382847, + "balance_loss_mlp": 1.02205777, + "epoch": 0.12950159321830096, + "flos": 29277107861760.0, + "grad_norm": 2.344849219493505, + "language_loss": 0.74978763, + "learning_rate": 3.89714244755676e-06, + "loss": 0.77235222, + "num_input_tokens_seen": 22810820, + "step": 1077, + "time_per_iteration": 2.6218316555023193 + }, + { + "auxiliary_loss_clip": 0.0116415, + "auxiliary_loss_mlp": 0.01042935, + "balance_loss_clip": 1.05471313, + "balance_loss_mlp": 1.03197408, + "epoch": 0.12962183610894007, + "flos": 24534888629760.0, + "grad_norm": 2.348090219930073, + "language_loss": 0.86420631, + "learning_rate": 3.896895709448175e-06, + "loss": 0.88627708, + "num_input_tokens_seen": 22830570, + "step": 1078, + "time_per_iteration": 2.6169447898864746 + }, + { + "auxiliary_loss_clip": 0.01156329, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.05394197, + "balance_loss_mlp": 1.02596664, + "epoch": 0.12974207899957915, + "flos": 11215944552960.0, + "grad_norm": 2.827150975017507, + "language_loss": 0.76898724, + "learning_rate": 3.896648683582019e-06, + "loss": 0.79091346, + "num_input_tokens_seen": 22845905, + "step": 1079, + "time_per_iteration": 2.5882070064544678 + }, + { + "auxiliary_loss_clip": 0.01176454, + "auxiliary_loss_mlp": 0.01034488, + "balance_loss_clip": 1.06153131, + "balance_loss_mlp": 1.02409339, + "epoch": 0.12986232189021824, + "flos": 24717889445760.0, + "grad_norm": 2.587096699310919, + "language_loss": 0.80798364, + "learning_rate": 3.896401369995766e-06, + "loss": 0.83009303, + "num_input_tokens_seen": 22865710, + "step": 1080, + "time_per_iteration": 2.616283893585205 + }, + { + "auxiliary_loss_clip": 0.01241372, + "auxiliary_loss_mlp": 0.01047147, + "balance_loss_clip": 1.06926274, + "balance_loss_mlp": 1.03635848, + "epoch": 0.12998256478085732, + "flos": 23915357827200.0, + "grad_norm": 2.086462035320557, + "language_loss": 0.79358411, + "learning_rate": 3.896153768726932e-06, + "loss": 0.81646931, + "num_input_tokens_seen": 22886020, + "step": 1081, + "time_per_iteration": 2.4972732067108154 + }, + { + "auxiliary_loss_clip": 0.01224493, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.06698895, + "balance_loss_mlp": 1.02603006, + "epoch": 0.13010280767149643, + "flos": 18624207974400.0, + "grad_norm": 2.515002351985249, + "language_loss": 0.88045359, + "learning_rate": 3.8959058798130806e-06, + "loss": 0.90306944, + "num_input_tokens_seen": 22903995, + "step": 1082, + "time_per_iteration": 2.5307528972625732 + }, + { + "auxiliary_loss_clip": 0.01212465, + "auxiliary_loss_mlp": 0.00765708, + "balance_loss_clip": 1.06380415, + "balance_loss_mlp": 1.00051832, + "epoch": 0.1302230505621355, + "flos": 22783992174720.0, + "grad_norm": 1.6780325545066037, + "language_loss": 0.74916291, + "learning_rate": 3.895657703291814e-06, + "loss": 0.76894462, + "num_input_tokens_seen": 22924100, + "step": 1083, + "time_per_iteration": 2.555720567703247 + }, + { + "auxiliary_loss_clip": 0.01217538, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.06215072, + "balance_loss_mlp": 1.02295876, + "epoch": 0.1303432934527746, + "flos": 21323612920320.0, + "grad_norm": 2.377073189770708, + "language_loss": 0.7935738, + "learning_rate": 3.895409239200781e-06, + "loss": 0.81609106, + "num_input_tokens_seen": 22939985, + "step": 1084, + "time_per_iteration": 3.3124284744262695 + }, + { + "auxiliary_loss_clip": 0.01219507, + "auxiliary_loss_mlp": 0.01036353, + "balance_loss_clip": 1.06297183, + "balance_loss_mlp": 1.02396679, + "epoch": 0.1304635363434137, + "flos": 20922490765440.0, + "grad_norm": 2.5365166062190716, + "language_loss": 0.91362435, + "learning_rate": 3.895160487577673e-06, + "loss": 0.93618298, + "num_input_tokens_seen": 22957555, + "step": 1085, + "time_per_iteration": 2.5716919898986816 + }, + { + "auxiliary_loss_clip": 0.01123794, + "auxiliary_loss_mlp": 0.01020162, + "balance_loss_clip": 1.03356051, + "balance_loss_mlp": 1.01613224, + "epoch": 0.1305837792340528, + "flos": 63245659080960.0, + "grad_norm": 0.7918705452581214, + "language_loss": 0.60951227, + "learning_rate": 3.894911448460226e-06, + "loss": 0.63095188, + "num_input_tokens_seen": 23016870, + "step": 1086, + "time_per_iteration": 2.9612009525299072 + }, + { + "auxiliary_loss_clip": 0.01128055, + "auxiliary_loss_mlp": 0.01042751, + "balance_loss_clip": 1.05195141, + "balance_loss_mlp": 1.03134871, + "epoch": 0.13070402212469187, + "flos": 26428852955520.0, + "grad_norm": 2.1325747954274896, + "language_loss": 0.73073757, + "learning_rate": 3.8946621218862195e-06, + "loss": 0.75244564, + "num_input_tokens_seen": 23037870, + "step": 1087, + "time_per_iteration": 2.754434585571289 + }, + { + "auxiliary_loss_clip": 0.01190876, + "auxiliary_loss_mlp": 0.01042647, + "balance_loss_clip": 1.06162572, + "balance_loss_mlp": 1.03193593, + "epoch": 0.13082426501533098, + "flos": 27673409341440.0, + "grad_norm": 1.8341024497763445, + "language_loss": 0.88823581, + "learning_rate": 3.894412507893475e-06, + "loss": 0.91057104, + "num_input_tokens_seen": 23058150, + "step": 1088, + "time_per_iteration": 3.451272487640381 + }, + { + "auxiliary_loss_clip": 0.01185471, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.05908084, + "balance_loss_mlp": 1.03030729, + "epoch": 0.13094450790597006, + "flos": 24826770547200.0, + "grad_norm": 2.305423270187946, + "language_loss": 0.71718192, + "learning_rate": 3.894162606519859e-06, + "loss": 0.73945582, + "num_input_tokens_seen": 23077100, + "step": 1089, + "time_per_iteration": 3.424536943435669 + }, + { + "auxiliary_loss_clip": 0.01176644, + "auxiliary_loss_mlp": 0.01036515, + "balance_loss_clip": 1.05924201, + "balance_loss_mlp": 1.02611351, + "epoch": 0.13106475079660915, + "flos": 19062605468160.0, + "grad_norm": 2.1704925293889463, + "language_loss": 0.77071321, + "learning_rate": 3.893912417803282e-06, + "loss": 0.79284477, + "num_input_tokens_seen": 23096815, + "step": 1090, + "time_per_iteration": 3.4611706733703613 + }, + { + "auxiliary_loss_clip": 0.01179271, + "auxiliary_loss_mlp": 0.01040492, + "balance_loss_clip": 1.05457497, + "balance_loss_mlp": 1.0285356, + "epoch": 0.13118499368724823, + "flos": 28913189218560.0, + "grad_norm": 1.8436141842004072, + "language_loss": 0.76844847, + "learning_rate": 3.8936619417816975e-06, + "loss": 0.79064614, + "num_input_tokens_seen": 23117145, + "step": 1091, + "time_per_iteration": 2.7274091243743896 + }, + { + "auxiliary_loss_clip": 0.01192311, + "auxiliary_loss_mlp": 0.01033245, + "balance_loss_clip": 1.06194127, + "balance_loss_mlp": 1.02268338, + "epoch": 0.13130523657788734, + "flos": 14283398206080.0, + "grad_norm": 2.9089822881406757, + "language_loss": 0.71836996, + "learning_rate": 3.8934111784931015e-06, + "loss": 0.74062544, + "num_input_tokens_seen": 23134595, + "step": 1092, + "time_per_iteration": 2.54417085647583 + }, + { + "auxiliary_loss_clip": 0.01111666, + "auxiliary_loss_mlp": 0.01003675, + "balance_loss_clip": 1.02979803, + "balance_loss_mlp": 0.99966949, + "epoch": 0.13142547946852642, + "flos": 70174155519360.0, + "grad_norm": 0.9182064505350517, + "language_loss": 0.59089804, + "learning_rate": 3.893160127975535e-06, + "loss": 0.61205149, + "num_input_tokens_seen": 23195285, + "step": 1093, + "time_per_iteration": 3.2065367698669434 + }, + { + "auxiliary_loss_clip": 0.01182517, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.0578984, + "balance_loss_mlp": 1.02574015, + "epoch": 0.1315457223591655, + "flos": 45805998844800.0, + "grad_norm": 2.40171869423442, + "language_loss": 0.8135581, + "learning_rate": 3.8929087902670826e-06, + "loss": 0.83574867, + "num_input_tokens_seen": 23216915, + "step": 1094, + "time_per_iteration": 2.8053314685821533 + }, + { + "auxiliary_loss_clip": 0.01124692, + "auxiliary_loss_mlp": 0.01004349, + "balance_loss_clip": 1.02959347, + "balance_loss_mlp": 1.00039124, + "epoch": 0.13166596524980462, + "flos": 62881165820160.0, + "grad_norm": 0.9259619091286828, + "language_loss": 0.60837954, + "learning_rate": 3.8926571654058715e-06, + "loss": 0.62966996, + "num_input_tokens_seen": 23273560, + "step": 1095, + "time_per_iteration": 2.9919137954711914 + }, + { + "auxiliary_loss_clip": 0.01188678, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_clip": 1.06043243, + "balance_loss_mlp": 1.02469397, + "epoch": 0.1317862081404437, + "flos": 23586523793280.0, + "grad_norm": 4.024755878185446, + "language_loss": 0.77067679, + "learning_rate": 3.892405253430074e-06, + "loss": 0.79291975, + "num_input_tokens_seen": 23291080, + "step": 1096, + "time_per_iteration": 2.6017115116119385 + }, + { + "auxiliary_loss_clip": 0.01211783, + "auxiliary_loss_mlp": 0.00765854, + "balance_loss_clip": 1.06415725, + "balance_loss_mlp": 1.00050616, + "epoch": 0.13190645103108278, + "flos": 20260764460800.0, + "grad_norm": 1.884790467101968, + "language_loss": 0.82538348, + "learning_rate": 3.892153054377904e-06, + "loss": 0.84515983, + "num_input_tokens_seen": 23308485, + "step": 1097, + "time_per_iteration": 2.563589572906494 + }, + { + "auxiliary_loss_clip": 0.01066546, + "auxiliary_loss_mlp": 0.01002683, + "balance_loss_clip": 1.0307411, + "balance_loss_mlp": 1.00040579, + "epoch": 0.13202669392172187, + "flos": 53455440136320.0, + "grad_norm": 0.9326760623171575, + "language_loss": 0.5946089, + "learning_rate": 3.891900568287619e-06, + "loss": 0.61530113, + "num_input_tokens_seen": 23360870, + "step": 1098, + "time_per_iteration": 3.0112295150756836 + }, + { + "auxiliary_loss_clip": 0.01196194, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.06067562, + "balance_loss_mlp": 1.0228529, + "epoch": 0.13214693681236098, + "flos": 15851293845120.0, + "grad_norm": 2.5126301532448934, + "language_loss": 0.72160846, + "learning_rate": 3.891647795197523e-06, + "loss": 0.74391627, + "num_input_tokens_seen": 23376910, + "step": 1099, + "time_per_iteration": 2.585240125656128 + }, + { + "auxiliary_loss_clip": 0.0119825, + "auxiliary_loss_mlp": 0.01040484, + "balance_loss_clip": 1.05765295, + "balance_loss_mlp": 1.02825272, + "epoch": 0.13226717970300006, + "flos": 19353840940800.0, + "grad_norm": 2.1138172419429697, + "language_loss": 0.69070387, + "learning_rate": 3.8913947351459605e-06, + "loss": 0.71309125, + "num_input_tokens_seen": 23394450, + "step": 1100, + "time_per_iteration": 2.564791679382324 + }, + { + "auxiliary_loss_clip": 0.01241522, + "auxiliary_loss_mlp": 0.01036631, + "balance_loss_clip": 1.06765985, + "balance_loss_mlp": 1.02631903, + "epoch": 0.13238742259363914, + "flos": 20698084546560.0, + "grad_norm": 1.846573486931406, + "language_loss": 0.6741792, + "learning_rate": 3.89114138817132e-06, + "loss": 0.69696081, + "num_input_tokens_seen": 23411115, + "step": 1101, + "time_per_iteration": 2.4769484996795654 + }, + { + "auxiliary_loss_clip": 0.01226738, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.06823516, + "balance_loss_mlp": 1.02520406, + "epoch": 0.13250766548427825, + "flos": 21032449274880.0, + "grad_norm": 2.6470265339604286, + "language_loss": 0.84225464, + "learning_rate": 3.890887754312035e-06, + "loss": 0.86488497, + "num_input_tokens_seen": 23429360, + "step": 1102, + "time_per_iteration": 2.541654348373413 + }, + { + "auxiliary_loss_clip": 0.01200322, + "auxiliary_loss_mlp": 0.01045066, + "balance_loss_clip": 1.05701745, + "balance_loss_mlp": 1.033705, + "epoch": 0.13262790837491734, + "flos": 22637871648000.0, + "grad_norm": 1.7418421327585605, + "language_loss": 0.87645948, + "learning_rate": 3.890633833606581e-06, + "loss": 0.89891344, + "num_input_tokens_seen": 23449050, + "step": 1103, + "time_per_iteration": 2.534417152404785 + }, + { + "auxiliary_loss_clip": 0.01225863, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.06945336, + "balance_loss_mlp": 1.02812052, + "epoch": 0.13274815126555642, + "flos": 19683141851520.0, + "grad_norm": 2.1745123844976546, + "language_loss": 0.69707787, + "learning_rate": 3.890379626093477e-06, + "loss": 0.71972418, + "num_input_tokens_seen": 23468800, + "step": 1104, + "time_per_iteration": 2.5068857669830322 + }, + { + "auxiliary_loss_clip": 0.01162224, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.05468297, + "balance_loss_mlp": 1.02495623, + "epoch": 0.1328683941561955, + "flos": 21317687176320.0, + "grad_norm": 2.9169950015818, + "language_loss": 0.92349279, + "learning_rate": 3.890125131811287e-06, + "loss": 0.94548166, + "num_input_tokens_seen": 23486850, + "step": 1105, + "time_per_iteration": 2.569483518600464 + }, + { + "auxiliary_loss_clip": 0.01196696, + "auxiliary_loss_mlp": 0.01036246, + "balance_loss_clip": 1.05753195, + "balance_loss_mlp": 1.02591646, + "epoch": 0.1329886370468346, + "flos": 13699131580800.0, + "grad_norm": 1.9047289971934116, + "language_loss": 0.75431961, + "learning_rate": 3.889870350798618e-06, + "loss": 0.776649, + "num_input_tokens_seen": 23504195, + "step": 1106, + "time_per_iteration": 2.5123963356018066 + }, + { + "auxiliary_loss_clip": 0.01240119, + "auxiliary_loss_mlp": 0.01036762, + "balance_loss_clip": 1.06552362, + "balance_loss_mlp": 1.02633095, + "epoch": 0.1331088799374737, + "flos": 21032413361280.0, + "grad_norm": 1.649439623778697, + "language_loss": 0.78772581, + "learning_rate": 3.889615283094119e-06, + "loss": 0.81049466, + "num_input_tokens_seen": 23523385, + "step": 1107, + "time_per_iteration": 2.463271141052246 + }, + { + "auxiliary_loss_clip": 0.0124413, + "auxiliary_loss_mlp": 0.01038433, + "balance_loss_clip": 1.06622851, + "balance_loss_mlp": 1.02689981, + "epoch": 0.13322912282811278, + "flos": 18260432985600.0, + "grad_norm": 2.0448354768291286, + "language_loss": 0.84314346, + "learning_rate": 3.889359928736485e-06, + "loss": 0.86596912, + "num_input_tokens_seen": 23541330, + "step": 1108, + "time_per_iteration": 2.449154853820801 + }, + { + "auxiliary_loss_clip": 0.01202381, + "auxiliary_loss_mlp": 0.00765783, + "balance_loss_clip": 1.06293809, + "balance_loss_mlp": 1.00054359, + "epoch": 0.1333493657187519, + "flos": 24460876656000.0, + "grad_norm": 2.0009648445900794, + "language_loss": 0.91113901, + "learning_rate": 3.889104287764451e-06, + "loss": 0.93082064, + "num_input_tokens_seen": 23561705, + "step": 1109, + "time_per_iteration": 2.5429112911224365 + }, + { + "auxiliary_loss_clip": 0.01208964, + "auxiliary_loss_mlp": 0.01040772, + "balance_loss_clip": 1.06511521, + "balance_loss_mlp": 1.02978659, + "epoch": 0.13346960860939097, + "flos": 22158930677760.0, + "grad_norm": 1.959870936104569, + "language_loss": 0.90370619, + "learning_rate": 3.888848360216798e-06, + "loss": 0.92620349, + "num_input_tokens_seen": 23579350, + "step": 1110, + "time_per_iteration": 2.5340397357940674 + }, + { + "auxiliary_loss_clip": 0.01120834, + "auxiliary_loss_mlp": 0.01006637, + "balance_loss_clip": 1.03648925, + "balance_loss_mlp": 1.00310802, + "epoch": 0.13358985150003005, + "flos": 67931212608000.0, + "grad_norm": 0.8187175991418526, + "language_loss": 0.56634319, + "learning_rate": 3.888592146132351e-06, + "loss": 0.58761793, + "num_input_tokens_seen": 23640620, + "step": 1111, + "time_per_iteration": 3.993626356124878 + }, + { + "auxiliary_loss_clip": 0.01224771, + "auxiliary_loss_mlp": 0.0104589, + "balance_loss_clip": 1.06687641, + "balance_loss_mlp": 1.03510725, + "epoch": 0.13371009439066917, + "flos": 26834284742400.0, + "grad_norm": 1.7549904588170753, + "language_loss": 0.78138489, + "learning_rate": 3.888335645549978e-06, + "loss": 0.80409151, + "num_input_tokens_seen": 23661040, + "step": 1112, + "time_per_iteration": 2.540123701095581 + }, + { + "auxiliary_loss_clip": 0.01241483, + "auxiliary_loss_mlp": 0.01043448, + "balance_loss_clip": 1.06913567, + "balance_loss_mlp": 1.03260005, + "epoch": 0.13383033728130825, + "flos": 26322844942080.0, + "grad_norm": 2.601664672280248, + "language_loss": 0.81617463, + "learning_rate": 3.888078858508588e-06, + "loss": 0.83902389, + "num_input_tokens_seen": 23680900, + "step": 1113, + "time_per_iteration": 2.5136003494262695 + }, + { + "auxiliary_loss_clip": 0.01205787, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.0644424, + "balance_loss_mlp": 1.0259707, + "epoch": 0.13395058017194733, + "flos": 22563931501440.0, + "grad_norm": 5.273596510630477, + "language_loss": 0.84812093, + "learning_rate": 3.8878217850471365e-06, + "loss": 0.87054831, + "num_input_tokens_seen": 23700815, + "step": 1114, + "time_per_iteration": 2.549492597579956 + }, + { + "auxiliary_loss_clip": 0.01245048, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.07029653, + "balance_loss_mlp": 1.03086424, + "epoch": 0.13407082306258641, + "flos": 25810938264960.0, + "grad_norm": 5.146197483849868, + "language_loss": 0.73882359, + "learning_rate": 3.887564425204621e-06, + "loss": 0.76170325, + "num_input_tokens_seen": 23722500, + "step": 1115, + "time_per_iteration": 3.2846789360046387 + }, + { + "auxiliary_loss_clip": 0.01085048, + "auxiliary_loss_mlp": 0.01001573, + "balance_loss_clip": 1.02471066, + "balance_loss_mlp": 0.99825937, + "epoch": 0.13419106595322552, + "flos": 68338365269760.0, + "grad_norm": 0.8382136449086554, + "language_loss": 0.54683167, + "learning_rate": 3.887306779020083e-06, + "loss": 0.56769788, + "num_input_tokens_seen": 23777155, + "step": 1116, + "time_per_iteration": 4.5488197803497314 + }, + { + "auxiliary_loss_clip": 0.01229818, + "auxiliary_loss_mlp": 0.01041454, + "balance_loss_clip": 1.06861925, + "balance_loss_mlp": 1.02987313, + "epoch": 0.1343113088438646, + "flos": 20449080489600.0, + "grad_norm": 2.3019676854766127, + "language_loss": 0.7045058, + "learning_rate": 3.887048846532608e-06, + "loss": 0.72721851, + "num_input_tokens_seen": 23794130, + "step": 1117, + "time_per_iteration": 2.4984283447265625 + }, + { + "auxiliary_loss_clip": 0.01093102, + "auxiliary_loss_mlp": 0.01003031, + "balance_loss_clip": 1.02478826, + "balance_loss_mlp": 0.99986041, + "epoch": 0.1344315517345037, + "flos": 67389784951680.0, + "grad_norm": 0.7619565428591891, + "language_loss": 0.58122981, + "learning_rate": 3.8867906277813224e-06, + "loss": 0.60219109, + "num_input_tokens_seen": 23852285, + "step": 1118, + "time_per_iteration": 3.0141594409942627 + }, + { + "auxiliary_loss_clip": 0.01227155, + "auxiliary_loss_mlp": 0.00765246, + "balance_loss_clip": 1.06483746, + "balance_loss_mlp": 1.00033855, + "epoch": 0.1345517946251428, + "flos": 40734442788480.0, + "grad_norm": 2.7034268604137153, + "language_loss": 0.74009901, + "learning_rate": 3.886532122805399e-06, + "loss": 0.760023, + "num_input_tokens_seen": 23874765, + "step": 1119, + "time_per_iteration": 2.7011048793792725 + }, + { + "auxiliary_loss_clip": 0.01151072, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.05456972, + "balance_loss_mlp": 1.02597713, + "epoch": 0.13467203751578188, + "flos": 22816850140800.0, + "grad_norm": 1.8460522716056469, + "language_loss": 0.89986748, + "learning_rate": 3.886273331644053e-06, + "loss": 0.92175281, + "num_input_tokens_seen": 23893635, + "step": 1120, + "time_per_iteration": 2.648460865020752 + }, + { + "auxiliary_loss_clip": 0.01174661, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.06011653, + "balance_loss_mlp": 1.02283692, + "epoch": 0.13479228040642097, + "flos": 17091576512640.0, + "grad_norm": 2.0261732295984554, + "language_loss": 0.82816958, + "learning_rate": 3.886014254336542e-06, + "loss": 0.85025096, + "num_input_tokens_seen": 23910110, + "step": 1121, + "time_per_iteration": 2.5893356800079346 + }, + { + "auxiliary_loss_clip": 0.01221447, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.06319499, + "balance_loss_mlp": 1.02274978, + "epoch": 0.13491252329706005, + "flos": 23730525417600.0, + "grad_norm": 1.6272840604555014, + "language_loss": 0.92651892, + "learning_rate": 3.885754890922168e-06, + "loss": 0.94906509, + "num_input_tokens_seen": 23930440, + "step": 1122, + "time_per_iteration": 2.5462985038757324 + }, + { + "auxiliary_loss_clip": 0.01131302, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.05272293, + "balance_loss_mlp": 1.03091049, + "epoch": 0.13503276618769916, + "flos": 34127058960000.0, + "grad_norm": 1.9073501865002436, + "language_loss": 0.78531545, + "learning_rate": 3.885495241440277e-06, + "loss": 0.80705118, + "num_input_tokens_seen": 23954535, + "step": 1123, + "time_per_iteration": 2.7721335887908936 + }, + { + "auxiliary_loss_clip": 0.01241556, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.0670861, + "balance_loss_mlp": 1.02904892, + "epoch": 0.13515300907833824, + "flos": 17712328377600.0, + "grad_norm": 2.058498202099182, + "language_loss": 0.73846775, + "learning_rate": 3.885235305930257e-06, + "loss": 0.76128107, + "num_input_tokens_seen": 23972735, + "step": 1124, + "time_per_iteration": 2.4607648849487305 + }, + { + "auxiliary_loss_clip": 0.01190969, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.064008, + "balance_loss_mlp": 1.03221369, + "epoch": 0.13527325196897733, + "flos": 20260872201600.0, + "grad_norm": 2.1421451944037697, + "language_loss": 0.85660142, + "learning_rate": 3.884975084431539e-06, + "loss": 0.87895495, + "num_input_tokens_seen": 23987685, + "step": 1125, + "time_per_iteration": 2.5714898109436035 + }, + { + "auxiliary_loss_clip": 0.01214772, + "auxiliary_loss_mlp": 0.00765599, + "balance_loss_clip": 1.06252337, + "balance_loss_mlp": 1.00038457, + "epoch": 0.13539349485961644, + "flos": 18186492839040.0, + "grad_norm": 2.5021129969765683, + "language_loss": 0.91564107, + "learning_rate": 3.8847145769836e-06, + "loss": 0.93544477, + "num_input_tokens_seen": 24004105, + "step": 1126, + "time_per_iteration": 2.49163818359375 + }, + { + "auxiliary_loss_clip": 0.01240803, + "auxiliary_loss_mlp": 0.01038145, + "balance_loss_clip": 1.0652889, + "balance_loss_mlp": 1.02679026, + "epoch": 0.13551373775025552, + "flos": 19317463441920.0, + "grad_norm": 2.677665819682341, + "language_loss": 0.66741312, + "learning_rate": 3.884453783625959e-06, + "loss": 0.69020253, + "num_input_tokens_seen": 24021715, + "step": 1127, + "time_per_iteration": 2.5781118869781494 + }, + { + "auxiliary_loss_clip": 0.01201759, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.0614872, + "balance_loss_mlp": 1.01928639, + "epoch": 0.1356339806408946, + "flos": 20850813175680.0, + "grad_norm": 1.981416383329947, + "language_loss": 0.84852916, + "learning_rate": 3.884192704398176e-06, + "loss": 0.87083864, + "num_input_tokens_seen": 24038915, + "step": 1128, + "time_per_iteration": 2.5953991413116455 + }, + { + "auxiliary_loss_clip": 0.01223099, + "auxiliary_loss_mlp": 0.01045926, + "balance_loss_clip": 1.06246591, + "balance_loss_mlp": 1.03547764, + "epoch": 0.13575422353153369, + "flos": 50476037696640.0, + "grad_norm": 1.6110806141275857, + "language_loss": 0.74393928, + "learning_rate": 3.883931339339858e-06, + "loss": 0.76662952, + "num_input_tokens_seen": 24063300, + "step": 1129, + "time_per_iteration": 2.7963764667510986 + }, + { + "auxiliary_loss_clip": 0.01224385, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.06255102, + "balance_loss_mlp": 1.02813554, + "epoch": 0.1358744664221728, + "flos": 18150797698560.0, + "grad_norm": 1.8533099637166812, + "language_loss": 0.78504401, + "learning_rate": 3.883669688490654e-06, + "loss": 0.80768031, + "num_input_tokens_seen": 24081070, + "step": 1130, + "time_per_iteration": 2.4754745960235596 + }, + { + "auxiliary_loss_clip": 0.01193978, + "auxiliary_loss_mlp": 0.00765002, + "balance_loss_clip": 1.0569737, + "balance_loss_mlp": 1.00041771, + "epoch": 0.13599470931281188, + "flos": 18442966924800.0, + "grad_norm": 1.9472325741395984, + "language_loss": 0.85889775, + "learning_rate": 3.883407751890256e-06, + "loss": 0.87848753, + "num_input_tokens_seen": 24099675, + "step": 1131, + "time_per_iteration": 2.540518283843994 + }, + { + "auxiliary_loss_clip": 0.01188197, + "auxiliary_loss_mlp": 0.01041204, + "balance_loss_clip": 1.056306, + "balance_loss_mlp": 1.02952766, + "epoch": 0.13611495220345096, + "flos": 26680766014080.0, + "grad_norm": 1.6679485437712303, + "language_loss": 0.85708404, + "learning_rate": 3.8831455295783994e-06, + "loss": 0.87937796, + "num_input_tokens_seen": 24118925, + "step": 1132, + "time_per_iteration": 2.591216802597046 + }, + { + "auxiliary_loss_clip": 0.01201798, + "auxiliary_loss_mlp": 0.01036791, + "balance_loss_clip": 1.06113195, + "balance_loss_mlp": 1.02573979, + "epoch": 0.13623519509409007, + "flos": 21686238673920.0, + "grad_norm": 1.7178143238703312, + "language_loss": 0.74197817, + "learning_rate": 3.882883021594864e-06, + "loss": 0.764364, + "num_input_tokens_seen": 24137065, + "step": 1133, + "time_per_iteration": 2.6014578342437744 + }, + { + "auxiliary_loss_clip": 0.01180349, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.05976903, + "balance_loss_mlp": 1.02533555, + "epoch": 0.13635543798472916, + "flos": 14830389492480.0, + "grad_norm": 2.384426706946309, + "language_loss": 0.86496615, + "learning_rate": 3.8826202279794705e-06, + "loss": 0.88712823, + "num_input_tokens_seen": 24154125, + "step": 1134, + "time_per_iteration": 2.555396795272827 + }, + { + "auxiliary_loss_clip": 0.0123972, + "auxiliary_loss_mlp": 0.01034842, + "balance_loss_clip": 1.06723428, + "balance_loss_mlp": 1.0246737, + "epoch": 0.13647568087536824, + "flos": 22890323410560.0, + "grad_norm": 2.3111098506183936, + "language_loss": 0.70142764, + "learning_rate": 3.882357148772085e-06, + "loss": 0.72417331, + "num_input_tokens_seen": 24171550, + "step": 1135, + "time_per_iteration": 2.4958295822143555 + }, + { + "auxiliary_loss_clip": 0.01173557, + "auxiliary_loss_mlp": 0.01040921, + "balance_loss_clip": 1.05683875, + "balance_loss_mlp": 1.02975094, + "epoch": 0.13659592376600732, + "flos": 19937927998080.0, + "grad_norm": 3.2306821560589034, + "language_loss": 0.84463304, + "learning_rate": 3.882093784012617e-06, + "loss": 0.8667779, + "num_input_tokens_seen": 24190190, + "step": 1136, + "time_per_iteration": 2.534374237060547 + }, + { + "auxiliary_loss_clip": 0.01205097, + "auxiliary_loss_mlp": 0.01036033, + "balance_loss_clip": 1.06269848, + "balance_loss_mlp": 1.02489901, + "epoch": 0.13671616665664643, + "flos": 21428579439360.0, + "grad_norm": 2.011854935474412, + "language_loss": 0.84202623, + "learning_rate": 3.881830133741019e-06, + "loss": 0.86443752, + "num_input_tokens_seen": 24209055, + "step": 1137, + "time_per_iteration": 2.5415680408477783 + }, + { + "auxiliary_loss_clip": 0.01190423, + "auxiliary_loss_mlp": 0.01042749, + "balance_loss_clip": 1.06386101, + "balance_loss_mlp": 1.03144836, + "epoch": 0.13683640954728551, + "flos": 22778138257920.0, + "grad_norm": 2.1307374250683795, + "language_loss": 0.76381576, + "learning_rate": 3.881566197997285e-06, + "loss": 0.78614753, + "num_input_tokens_seen": 24225490, + "step": 1138, + "time_per_iteration": 3.309091091156006 + }, + { + "auxiliary_loss_clip": 0.01201803, + "auxiliary_loss_mlp": 0.0103977, + "balance_loss_clip": 1.06417179, + "balance_loss_mlp": 1.02939296, + "epoch": 0.1369566524379246, + "flos": 21725884310400.0, + "grad_norm": 1.5066913227741021, + "language_loss": 0.7482627, + "learning_rate": 3.881301976821456e-06, + "loss": 0.77067846, + "num_input_tokens_seen": 24245520, + "step": 1139, + "time_per_iteration": 2.550808906555176 + }, + { + "auxiliary_loss_clip": 0.01218155, + "auxiliary_loss_mlp": 0.01041559, + "balance_loss_clip": 1.06372166, + "balance_loss_mlp": 1.03127754, + "epoch": 0.1370768953285637, + "flos": 18624459369600.0, + "grad_norm": 2.082815708170793, + "language_loss": 0.90536702, + "learning_rate": 3.881037470253612e-06, + "loss": 0.92796409, + "num_input_tokens_seen": 24265035, + "step": 1140, + "time_per_iteration": 2.4847500324249268 + }, + { + "auxiliary_loss_clip": 0.01172951, + "auxiliary_loss_mlp": 0.01037426, + "balance_loss_clip": 1.05928838, + "balance_loss_mlp": 1.02738905, + "epoch": 0.1371971382192028, + "flos": 14939521989120.0, + "grad_norm": 2.5088295381221153, + "language_loss": 0.79064256, + "learning_rate": 3.88077267833388e-06, + "loss": 0.81274629, + "num_input_tokens_seen": 24281550, + "step": 1141, + "time_per_iteration": 2.577676773071289 + }, + { + "auxiliary_loss_clip": 0.0117094, + "auxiliary_loss_mlp": 0.01044252, + "balance_loss_clip": 1.05749214, + "balance_loss_mlp": 1.03354681, + "epoch": 0.13731738110984187, + "flos": 19023785844480.0, + "grad_norm": 2.085484375252827, + "language_loss": 0.84136826, + "learning_rate": 3.880507601102427e-06, + "loss": 0.86352021, + "num_input_tokens_seen": 24299485, + "step": 1142, + "time_per_iteration": 3.3645074367523193 + }, + { + "auxiliary_loss_clip": 0.01237473, + "auxiliary_loss_mlp": 0.01040594, + "balance_loss_clip": 1.06740534, + "balance_loss_mlp": 1.03019905, + "epoch": 0.13743762400048098, + "flos": 18187462506240.0, + "grad_norm": 2.1105354606699795, + "language_loss": 0.82316083, + "learning_rate": 3.880242238599467e-06, + "loss": 0.84594148, + "num_input_tokens_seen": 24316010, + "step": 1143, + "time_per_iteration": 3.999807596206665 + }, + { + "auxiliary_loss_clip": 0.01234583, + "auxiliary_loss_mlp": 0.01040063, + "balance_loss_clip": 1.06531119, + "balance_loss_mlp": 1.02951336, + "epoch": 0.13755786689112007, + "flos": 21031982398080.0, + "grad_norm": 1.6600344936377602, + "language_loss": 0.83149546, + "learning_rate": 3.879976590865254e-06, + "loss": 0.85424185, + "num_input_tokens_seen": 24335465, + "step": 1144, + "time_per_iteration": 2.527592897415161 + }, + { + "auxiliary_loss_clip": 0.01204756, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.0639739, + "balance_loss_mlp": 1.03144872, + "epoch": 0.13767810978175915, + "flos": 21360636864000.0, + "grad_norm": 5.4337402707363704, + "language_loss": 0.87454909, + "learning_rate": 3.879710657940087e-06, + "loss": 0.89701736, + "num_input_tokens_seen": 24354415, + "step": 1145, + "time_per_iteration": 2.5474741458892822 + }, + { + "auxiliary_loss_clip": 0.01225371, + "auxiliary_loss_mlp": 0.01052293, + "balance_loss_clip": 1.06436217, + "balance_loss_mlp": 1.04059255, + "epoch": 0.13779835267239823, + "flos": 30592084861440.0, + "grad_norm": 2.3322244944540267, + "language_loss": 0.70442921, + "learning_rate": 3.879444439864308e-06, + "loss": 0.72720581, + "num_input_tokens_seen": 24373990, + "step": 1146, + "time_per_iteration": 2.5590808391571045 + }, + { + "auxiliary_loss_clip": 0.01221164, + "auxiliary_loss_mlp": 0.00765366, + "balance_loss_clip": 1.06309795, + "balance_loss_mlp": 1.00065362, + "epoch": 0.13791859556303734, + "flos": 22669867687680.0, + "grad_norm": 1.8464652962120083, + "language_loss": 0.86068052, + "learning_rate": 3.879177936678301e-06, + "loss": 0.88054579, + "num_input_tokens_seen": 24392995, + "step": 1147, + "time_per_iteration": 2.5448756217956543 + }, + { + "auxiliary_loss_clip": 0.01209788, + "auxiliary_loss_mlp": 0.01039553, + "balance_loss_clip": 1.06348205, + "balance_loss_mlp": 1.02866888, + "epoch": 0.13803883845367643, + "flos": 35224166016000.0, + "grad_norm": 2.134840070494426, + "language_loss": 0.77335083, + "learning_rate": 3.878911148422496e-06, + "loss": 0.79584426, + "num_input_tokens_seen": 24414470, + "step": 1148, + "time_per_iteration": 2.6417407989501953 + }, + { + "auxiliary_loss_clip": 0.01219679, + "auxiliary_loss_mlp": 0.01035519, + "balance_loss_clip": 1.06255388, + "balance_loss_mlp": 1.02416396, + "epoch": 0.1381590813443155, + "flos": 32014542332160.0, + "grad_norm": 3.1145621046672822, + "language_loss": 0.70440066, + "learning_rate": 3.878644075137364e-06, + "loss": 0.72695261, + "num_input_tokens_seen": 24435120, + "step": 1149, + "time_per_iteration": 2.5786051750183105 + }, + { + "auxiliary_loss_clip": 0.011676, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.05384159, + "balance_loss_mlp": 1.02452123, + "epoch": 0.13827932423495462, + "flos": 17821855923840.0, + "grad_norm": 2.1955728914898396, + "language_loss": 0.78926963, + "learning_rate": 3.878376716863418e-06, + "loss": 0.81129563, + "num_input_tokens_seen": 24451420, + "step": 1150, + "time_per_iteration": 2.524653673171997 + }, + { + "auxiliary_loss_clip": 0.0120306, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_clip": 1.05941141, + "balance_loss_mlp": 1.02941048, + "epoch": 0.1383995671255937, + "flos": 19427098728960.0, + "grad_norm": 2.2208467527907523, + "language_loss": 0.71904528, + "learning_rate": 3.878109073641219e-06, + "loss": 0.74148393, + "num_input_tokens_seen": 24470450, + "step": 1151, + "time_per_iteration": 2.5673470497131348 + }, + { + "auxiliary_loss_clip": 0.01172214, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.05937445, + "balance_loss_mlp": 1.02791274, + "epoch": 0.13851981001623279, + "flos": 28296603331200.0, + "grad_norm": 1.594388011633795, + "language_loss": 0.81361222, + "learning_rate": 3.877841145511366e-06, + "loss": 0.83571696, + "num_input_tokens_seen": 24493190, + "step": 1152, + "time_per_iteration": 2.670708656311035 + }, + { + "auxiliary_loss_clip": 0.01225581, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.06429422, + "balance_loss_mlp": 1.02672815, + "epoch": 0.13864005290687187, + "flos": 21213079793280.0, + "grad_norm": 1.810681993719155, + "language_loss": 0.8260709, + "learning_rate": 3.8775729325145035e-06, + "loss": 0.84870261, + "num_input_tokens_seen": 24512425, + "step": 1153, + "time_per_iteration": 2.5211658477783203 + }, + { + "auxiliary_loss_clip": 0.01067588, + "auxiliary_loss_mlp": 0.01007214, + "balance_loss_clip": 1.01500249, + "balance_loss_mlp": 1.00466299, + "epoch": 0.13876029579751098, + "flos": 71653389413760.0, + "grad_norm": 0.8169070344567309, + "language_loss": 0.64743173, + "learning_rate": 3.877304434691321e-06, + "loss": 0.66817975, + "num_input_tokens_seen": 24579275, + "step": 1154, + "time_per_iteration": 3.268359899520874 + }, + { + "auxiliary_loss_clip": 0.01189284, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.06265283, + "balance_loss_mlp": 1.02173972, + "epoch": 0.13888053868815006, + "flos": 21941348042880.0, + "grad_norm": 1.7965260696419691, + "language_loss": 0.79513156, + "learning_rate": 3.877035652082548e-06, + "loss": 0.81733906, + "num_input_tokens_seen": 24598720, + "step": 1155, + "time_per_iteration": 2.5922493934631348 + }, + { + "auxiliary_loss_clip": 0.01196451, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.06207383, + "balance_loss_mlp": 1.02242899, + "epoch": 0.13900078157878915, + "flos": 19608627087360.0, + "grad_norm": 1.661528558479717, + "language_loss": 0.85479182, + "learning_rate": 3.87676658472896e-06, + "loss": 0.87708801, + "num_input_tokens_seen": 24617530, + "step": 1156, + "time_per_iteration": 2.5499255657196045 + }, + { + "auxiliary_loss_clip": 0.01219157, + "auxiliary_loss_mlp": 0.01045979, + "balance_loss_clip": 1.06044507, + "balance_loss_mlp": 1.0354712, + "epoch": 0.13912102446942826, + "flos": 22638051216000.0, + "grad_norm": 1.7932060977210535, + "language_loss": 0.85271388, + "learning_rate": 3.876497232671372e-06, + "loss": 0.87536526, + "num_input_tokens_seen": 24637485, + "step": 1157, + "time_per_iteration": 2.5319864749908447 + }, + { + "auxiliary_loss_clip": 0.01179725, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.05859292, + "balance_loss_mlp": 1.02450991, + "epoch": 0.13924126736006734, + "flos": 29643324975360.0, + "grad_norm": 2.221053034850036, + "language_loss": 0.83683348, + "learning_rate": 3.876227595950647e-06, + "loss": 0.85897815, + "num_input_tokens_seen": 24656915, + "step": 1158, + "time_per_iteration": 2.6654205322265625 + }, + { + "auxiliary_loss_clip": 0.01236878, + "auxiliary_loss_mlp": 0.01038574, + "balance_loss_clip": 1.06671619, + "balance_loss_mlp": 1.02699888, + "epoch": 0.13936151025070642, + "flos": 27417653527680.0, + "grad_norm": 1.593576283177082, + "language_loss": 0.78836143, + "learning_rate": 3.875957674607686e-06, + "loss": 0.81111586, + "num_input_tokens_seen": 24679190, + "step": 1159, + "time_per_iteration": 2.5553579330444336 + }, + { + "auxiliary_loss_clip": 0.01212885, + "auxiliary_loss_mlp": 0.00766146, + "balance_loss_clip": 1.05932403, + "balance_loss_mlp": 1.00083232, + "epoch": 0.1394817531413455, + "flos": 16399326625920.0, + "grad_norm": 1.8147887066196629, + "language_loss": 0.88076818, + "learning_rate": 3.8756874686834386e-06, + "loss": 0.90055847, + "num_input_tokens_seen": 24697405, + "step": 1160, + "time_per_iteration": 2.490630626678467 + }, + { + "auxiliary_loss_clip": 0.01224913, + "auxiliary_loss_mlp": 0.00765962, + "balance_loss_clip": 1.06240153, + "balance_loss_mlp": 1.00071442, + "epoch": 0.13960199603198462, + "flos": 30922319525760.0, + "grad_norm": 1.9247025491039427, + "language_loss": 0.80491769, + "learning_rate": 3.875416978218893e-06, + "loss": 0.82482648, + "num_input_tokens_seen": 24720600, + "step": 1161, + "time_per_iteration": 2.601969003677368 + }, + { + "auxiliary_loss_clip": 0.01197857, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.0562346, + "balance_loss_mlp": 1.02762735, + "epoch": 0.1397222389226237, + "flos": 18113773754880.0, + "grad_norm": 2.0533248548080625, + "language_loss": 0.83005345, + "learning_rate": 3.8751462032550835e-06, + "loss": 0.85242045, + "num_input_tokens_seen": 24737605, + "step": 1162, + "time_per_iteration": 2.527703285217285 + }, + { + "auxiliary_loss_clip": 0.01202497, + "auxiliary_loss_mlp": 0.01026131, + "balance_loss_clip": 1.06429219, + "balance_loss_mlp": 1.0156945, + "epoch": 0.13984248181326278, + "flos": 16872772815360.0, + "grad_norm": 2.124421289623326, + "language_loss": 0.82761854, + "learning_rate": 3.874875143833085e-06, + "loss": 0.84990484, + "num_input_tokens_seen": 24755845, + "step": 1163, + "time_per_iteration": 2.5155723094940186 + }, + { + "auxiliary_loss_clip": 0.01220923, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_clip": 1.0628835, + "balance_loss_mlp": 1.03328431, + "epoch": 0.1399627247039019, + "flos": 54121401267840.0, + "grad_norm": 2.3222036808411812, + "language_loss": 0.68655175, + "learning_rate": 3.874603799994019e-06, + "loss": 0.70920968, + "num_input_tokens_seen": 24779380, + "step": 1164, + "time_per_iteration": 3.6294338703155518 + }, + { + "auxiliary_loss_clip": 0.01183656, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.05921638, + "balance_loss_mlp": 1.02243567, + "epoch": 0.14008296759454097, + "flos": 11765521618560.0, + "grad_norm": 1.8516818403396467, + "language_loss": 0.86639535, + "learning_rate": 3.874332171779046e-06, + "loss": 0.88855821, + "num_input_tokens_seen": 24794260, + "step": 1165, + "time_per_iteration": 2.5718166828155518 + }, + { + "auxiliary_loss_clip": 0.01182993, + "auxiliary_loss_mlp": 0.0103054, + "balance_loss_clip": 1.0564332, + "balance_loss_mlp": 1.01960301, + "epoch": 0.14020321048518006, + "flos": 22017514832640.0, + "grad_norm": 1.7477498047825006, + "language_loss": 0.75400889, + "learning_rate": 3.874060259229373e-06, + "loss": 0.77614421, + "num_input_tokens_seen": 24815835, + "step": 1166, + "time_per_iteration": 2.5926334857940674 + }, + { + "auxiliary_loss_clip": 0.01224266, + "auxiliary_loss_mlp": 0.01045608, + "balance_loss_clip": 1.0655601, + "balance_loss_mlp": 1.03413987, + "epoch": 0.14032345337581917, + "flos": 23404313076480.0, + "grad_norm": 1.996668701056672, + "language_loss": 0.93550593, + "learning_rate": 3.873788062386249e-06, + "loss": 0.95820469, + "num_input_tokens_seen": 24834095, + "step": 1167, + "time_per_iteration": 2.6722705364227295 + }, + { + "auxiliary_loss_clip": 0.01195328, + "auxiliary_loss_mlp": 0.0104012, + "balance_loss_clip": 1.06403697, + "balance_loss_mlp": 1.02936769, + "epoch": 0.14044369626645825, + "flos": 29645767100160.0, + "grad_norm": 1.7575838240998531, + "language_loss": 0.82136154, + "learning_rate": 3.873515581290965e-06, + "loss": 0.84371603, + "num_input_tokens_seen": 24858900, + "step": 1168, + "time_per_iteration": 2.7244086265563965 + }, + { + "auxiliary_loss_clip": 0.01193701, + "auxiliary_loss_mlp": 0.01037351, + "balance_loss_clip": 1.06442082, + "balance_loss_mlp": 1.02591872, + "epoch": 0.14056393915709733, + "flos": 18332972501760.0, + "grad_norm": 2.175991497979151, + "language_loss": 0.75574231, + "learning_rate": 3.8732428159848575e-06, + "loss": 0.77805281, + "num_input_tokens_seen": 24877875, + "step": 1169, + "time_per_iteration": 4.2049291133880615 + }, + { + "auxiliary_loss_clip": 0.012213, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.0664413, + "balance_loss_mlp": 1.02516198, + "epoch": 0.14068418204773642, + "flos": 26687517770880.0, + "grad_norm": 2.4359588448772445, + "language_loss": 0.7819348, + "learning_rate": 3.872969766509304e-06, + "loss": 0.80451024, + "num_input_tokens_seen": 24898430, + "step": 1170, + "time_per_iteration": 3.490248918533325 + }, + { + "auxiliary_loss_clip": 0.01077206, + "auxiliary_loss_mlp": 0.01004536, + "balance_loss_clip": 1.01994371, + "balance_loss_mlp": 1.00163889, + "epoch": 0.14080442493837553, + "flos": 65259314501760.0, + "grad_norm": 0.7813112293505173, + "language_loss": 0.55731046, + "learning_rate": 3.872696432905726e-06, + "loss": 0.57812786, + "num_input_tokens_seen": 24959250, + "step": 1171, + "time_per_iteration": 3.1286964416503906 + }, + { + "auxiliary_loss_clip": 0.01224644, + "auxiliary_loss_mlp": 0.01043388, + "balance_loss_clip": 1.0616138, + "balance_loss_mlp": 1.03212845, + "epoch": 0.1409246678290146, + "flos": 25776715582080.0, + "grad_norm": 5.058677607780929, + "language_loss": 0.72017097, + "learning_rate": 3.872422815215589e-06, + "loss": 0.74285132, + "num_input_tokens_seen": 24978330, + "step": 1172, + "time_per_iteration": 2.5396029949188232 + }, + { + "auxiliary_loss_clip": 0.01216447, + "auxiliary_loss_mlp": 0.01039221, + "balance_loss_clip": 1.05932951, + "balance_loss_mlp": 1.02676368, + "epoch": 0.1410449107196537, + "flos": 21868521217920.0, + "grad_norm": 1.74665632249751, + "language_loss": 0.74234706, + "learning_rate": 3.8721489134803994e-06, + "loss": 0.76490372, + "num_input_tokens_seen": 24997120, + "step": 1173, + "time_per_iteration": 2.5236856937408447 + }, + { + "auxiliary_loss_clip": 0.01220147, + "auxiliary_loss_mlp": 0.01043815, + "balance_loss_clip": 1.06414938, + "balance_loss_mlp": 1.03232324, + "epoch": 0.1411651536102928, + "flos": 16684133564160.0, + "grad_norm": 2.7661666471300443, + "language_loss": 0.72404158, + "learning_rate": 3.871874727741707e-06, + "loss": 0.74668121, + "num_input_tokens_seen": 25014350, + "step": 1174, + "time_per_iteration": 2.4692013263702393 + }, + { + "auxiliary_loss_clip": 0.01218056, + "auxiliary_loss_mlp": 0.01039467, + "balance_loss_clip": 1.06591082, + "balance_loss_mlp": 1.02894723, + "epoch": 0.1412853965009319, + "flos": 20992264934400.0, + "grad_norm": 1.813176880819663, + "language_loss": 0.96586323, + "learning_rate": 3.871600258041108e-06, + "loss": 0.98843837, + "num_input_tokens_seen": 25033875, + "step": 1175, + "time_per_iteration": 2.5066468715667725 + }, + { + "auxiliary_loss_clip": 0.01201226, + "auxiliary_loss_mlp": 0.01040755, + "balance_loss_clip": 1.05865264, + "balance_loss_mlp": 1.02861917, + "epoch": 0.14140563939157097, + "flos": 20335279224960.0, + "grad_norm": 2.7511825108380004, + "language_loss": 0.85906392, + "learning_rate": 3.871325504420238e-06, + "loss": 0.88148379, + "num_input_tokens_seen": 25052865, + "step": 1176, + "time_per_iteration": 2.579911947250366 + }, + { + "auxiliary_loss_clip": 0.01237058, + "auxiliary_loss_mlp": 0.01030396, + "balance_loss_clip": 1.06630075, + "balance_loss_mlp": 1.01969671, + "epoch": 0.14152588228221005, + "flos": 21068826773760.0, + "grad_norm": 1.8827499686418243, + "language_loss": 0.81957275, + "learning_rate": 3.871050466920776e-06, + "loss": 0.84224725, + "num_input_tokens_seen": 25072770, + "step": 1177, + "time_per_iteration": 2.636533260345459 + }, + { + "auxiliary_loss_clip": 0.01182425, + "auxiliary_loss_mlp": 0.0102855, + "balance_loss_clip": 1.05850267, + "balance_loss_mlp": 1.01811922, + "epoch": 0.14164612517284916, + "flos": 18223157646720.0, + "grad_norm": 1.8436138561528141, + "language_loss": 0.79825938, + "learning_rate": 3.870775145584447e-06, + "loss": 0.82036912, + "num_input_tokens_seen": 25090550, + "step": 1178, + "time_per_iteration": 2.5385563373565674 + }, + { + "auxiliary_loss_clip": 0.01211595, + "auxiliary_loss_mlp": 0.01045365, + "balance_loss_clip": 1.06037378, + "balance_loss_mlp": 1.0336287, + "epoch": 0.14176636806348825, + "flos": 22744454279040.0, + "grad_norm": 2.956010462590504, + "language_loss": 0.64672774, + "learning_rate": 3.8704995404530145e-06, + "loss": 0.66929734, + "num_input_tokens_seen": 25106175, + "step": 1179, + "time_per_iteration": 2.505155563354492 + }, + { + "auxiliary_loss_clip": 0.01235696, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.0668118, + "balance_loss_mlp": 1.02682114, + "epoch": 0.14188661095412733, + "flos": 22091095843200.0, + "grad_norm": 1.781307880622551, + "language_loss": 0.85046625, + "learning_rate": 3.87022365156829e-06, + "loss": 0.87319624, + "num_input_tokens_seen": 25126890, + "step": 1180, + "time_per_iteration": 2.470587968826294 + }, + { + "auxiliary_loss_clip": 0.01143109, + "auxiliary_loss_mlp": 0.01036223, + "balance_loss_clip": 1.05429244, + "balance_loss_mlp": 1.02548277, + "epoch": 0.14200685384476644, + "flos": 24352390604160.0, + "grad_norm": 1.9979246663869574, + "language_loss": 0.80855519, + "learning_rate": 3.869947478972123e-06, + "loss": 0.83034849, + "num_input_tokens_seen": 25147915, + "step": 1181, + "time_per_iteration": 2.6558446884155273 + }, + { + "auxiliary_loss_clip": 0.01215977, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.06193757, + "balance_loss_mlp": 1.02691603, + "epoch": 0.14212709673540552, + "flos": 24022048199040.0, + "grad_norm": 2.054771379546689, + "language_loss": 0.82291663, + "learning_rate": 3.869671022706412e-06, + "loss": 0.84546435, + "num_input_tokens_seen": 25166645, + "step": 1182, + "time_per_iteration": 2.4929404258728027 + }, + { + "auxiliary_loss_clip": 0.01160001, + "auxiliary_loss_mlp": 0.01041855, + "balance_loss_clip": 1.05342746, + "balance_loss_mlp": 1.03083968, + "epoch": 0.1422473396260446, + "flos": 26431797870720.0, + "grad_norm": 1.8787162178728398, + "language_loss": 0.65041476, + "learning_rate": 3.869394282813092e-06, + "loss": 0.67243332, + "num_input_tokens_seen": 25185845, + "step": 1183, + "time_per_iteration": 2.6239099502563477 + }, + { + "auxiliary_loss_clip": 0.01196397, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.05827236, + "balance_loss_mlp": 1.02816272, + "epoch": 0.1423675825166837, + "flos": 17055306754560.0, + "grad_norm": 2.67323059864377, + "language_loss": 0.8914392, + "learning_rate": 3.869117259334147e-06, + "loss": 0.91379261, + "num_input_tokens_seen": 25203770, + "step": 1184, + "time_per_iteration": 2.5297577381134033 + }, + { + "auxiliary_loss_clip": 0.01217076, + "auxiliary_loss_mlp": 0.01040845, + "balance_loss_clip": 1.06297207, + "balance_loss_mlp": 1.03012252, + "epoch": 0.1424878254073228, + "flos": 17929480049280.0, + "grad_norm": 1.692990392749742, + "language_loss": 0.82497001, + "learning_rate": 3.868839952311599e-06, + "loss": 0.84754926, + "num_input_tokens_seen": 25221725, + "step": 1185, + "time_per_iteration": 2.462214946746826 + }, + { + "auxiliary_loss_clip": 0.01201851, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_clip": 1.06299663, + "balance_loss_mlp": 1.02275646, + "epoch": 0.14260806829796188, + "flos": 20303606407680.0, + "grad_norm": 2.1215900489038244, + "language_loss": 0.8033309, + "learning_rate": 3.868562361787516e-06, + "loss": 0.82568896, + "num_input_tokens_seen": 25240855, + "step": 1186, + "time_per_iteration": 2.502519369125366 + }, + { + "auxiliary_loss_clip": 0.01136294, + "auxiliary_loss_mlp": 0.01033313, + "balance_loss_clip": 1.05033183, + "balance_loss_mlp": 1.02235746, + "epoch": 0.14272831118860096, + "flos": 23185724860800.0, + "grad_norm": 2.0732757587542334, + "language_loss": 0.6938417, + "learning_rate": 3.868284487804009e-06, + "loss": 0.71553779, + "num_input_tokens_seen": 25260085, + "step": 1187, + "time_per_iteration": 2.6588172912597656 + }, + { + "auxiliary_loss_clip": 0.01210944, + "auxiliary_loss_mlp": 0.01040125, + "balance_loss_clip": 1.06084108, + "balance_loss_mlp": 1.0290978, + "epoch": 0.14284855407924008, + "flos": 27232210586880.0, + "grad_norm": 2.073893828183644, + "language_loss": 0.78131843, + "learning_rate": 3.86800633040323e-06, + "loss": 0.80382913, + "num_input_tokens_seen": 25280675, + "step": 1188, + "time_per_iteration": 2.580793857574463 + }, + { + "auxiliary_loss_clip": 0.01204935, + "auxiliary_loss_mlp": 0.00765606, + "balance_loss_clip": 1.06528711, + "balance_loss_mlp": 1.0008049, + "epoch": 0.14296879696987916, + "flos": 28184202696960.0, + "grad_norm": 5.914538516883636, + "language_loss": 0.78230035, + "learning_rate": 3.867727889627376e-06, + "loss": 0.80200571, + "num_input_tokens_seen": 25300290, + "step": 1189, + "time_per_iteration": 2.572909116744995 + }, + { + "auxiliary_loss_clip": 0.01180231, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.05891943, + "balance_loss_mlp": 1.02818966, + "epoch": 0.14308903986051824, + "flos": 19390290266880.0, + "grad_norm": 2.121681103699124, + "language_loss": 0.78305447, + "learning_rate": 3.867449165518687e-06, + "loss": 0.80526042, + "num_input_tokens_seen": 25316760, + "step": 1190, + "time_per_iteration": 2.538724184036255 + }, + { + "auxiliary_loss_clip": 0.01239895, + "auxiliary_loss_mlp": 0.00766594, + "balance_loss_clip": 1.06721592, + "balance_loss_mlp": 1.00088954, + "epoch": 0.14320928275115732, + "flos": 17457506317440.0, + "grad_norm": 1.7223467239651202, + "language_loss": 0.7141766, + "learning_rate": 3.867170158119444e-06, + "loss": 0.73424149, + "num_input_tokens_seen": 25335760, + "step": 1191, + "time_per_iteration": 3.25830340385437 + }, + { + "auxiliary_loss_clip": 0.01240399, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.06788397, + "balance_loss_mlp": 1.02781546, + "epoch": 0.14332952564179643, + "flos": 21466070259840.0, + "grad_norm": 2.3564820897069882, + "language_loss": 0.75529563, + "learning_rate": 3.866890867471972e-06, + "loss": 0.77808547, + "num_input_tokens_seen": 25354230, + "step": 1192, + "time_per_iteration": 2.466306447982788 + }, + { + "auxiliary_loss_clip": 0.01201883, + "auxiliary_loss_mlp": 0.01044376, + "balance_loss_clip": 1.05731475, + "balance_loss_mlp": 1.0328362, + "epoch": 0.14344976853243552, + "flos": 16396992241920.0, + "grad_norm": 2.477858249993943, + "language_loss": 0.89940697, + "learning_rate": 3.86661129361864e-06, + "loss": 0.92186952, + "num_input_tokens_seen": 25368720, + "step": 1193, + "time_per_iteration": 2.483060598373413 + }, + { + "auxiliary_loss_clip": 0.012018, + "auxiliary_loss_mlp": 0.01046013, + "balance_loss_clip": 1.06222689, + "balance_loss_mlp": 1.03434801, + "epoch": 0.1435700114230746, + "flos": 18916736336640.0, + "grad_norm": 3.0643482625437555, + "language_loss": 0.8610611, + "learning_rate": 3.866331436601859e-06, + "loss": 0.88353926, + "num_input_tokens_seen": 25386715, + "step": 1194, + "time_per_iteration": 2.5170211791992188 + }, + { + "auxiliary_loss_clip": 0.01237633, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.06681669, + "balance_loss_mlp": 1.02902079, + "epoch": 0.1436902543137137, + "flos": 19755394058880.0, + "grad_norm": 1.8858643616533894, + "language_loss": 0.73996961, + "learning_rate": 3.866051296464083e-06, + "loss": 0.76274765, + "num_input_tokens_seen": 25405550, + "step": 1195, + "time_per_iteration": 3.2092301845550537 + }, + { + "auxiliary_loss_clip": 0.01237638, + "auxiliary_loss_mlp": 0.00765946, + "balance_loss_clip": 1.06473398, + "balance_loss_mlp": 1.00085914, + "epoch": 0.1438104972043528, + "flos": 14684807669760.0, + "grad_norm": 2.5576622466949512, + "language_loss": 0.85335284, + "learning_rate": 3.86577087324781e-06, + "loss": 0.87338871, + "num_input_tokens_seen": 25422040, + "step": 1196, + "time_per_iteration": 3.187466621398926 + }, + { + "auxiliary_loss_clip": 0.01219549, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.06714976, + "balance_loss_mlp": 1.02693927, + "epoch": 0.14393074009499188, + "flos": 17092330698240.0, + "grad_norm": 2.114302731519621, + "language_loss": 0.77534127, + "learning_rate": 3.865490166995578e-06, + "loss": 0.7979129, + "num_input_tokens_seen": 25440270, + "step": 1197, + "time_per_iteration": 3.2333498001098633 + }, + { + "auxiliary_loss_clip": 0.01219549, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.06405532, + "balance_loss_mlp": 1.02623177, + "epoch": 0.144050982985631, + "flos": 30476200608000.0, + "grad_norm": 3.7221121801765684, + "language_loss": 0.84295624, + "learning_rate": 3.86520917774997e-06, + "loss": 0.86552995, + "num_input_tokens_seen": 25459705, + "step": 1198, + "time_per_iteration": 2.5486042499542236 + }, + { + "auxiliary_loss_clip": 0.0121894, + "auxiliary_loss_mlp": 0.01040853, + "balance_loss_clip": 1.06584895, + "balance_loss_mlp": 1.03044021, + "epoch": 0.14417122587627007, + "flos": 17858484817920.0, + "grad_norm": 2.139128358304188, + "language_loss": 0.7511071, + "learning_rate": 3.864927905553614e-06, + "loss": 0.77370501, + "num_input_tokens_seen": 25477615, + "step": 1199, + "time_per_iteration": 2.463768243789673 + }, + { + "auxiliary_loss_clip": 0.01185953, + "auxiliary_loss_mlp": 0.01041753, + "balance_loss_clip": 1.05981171, + "balance_loss_mlp": 1.03100026, + "epoch": 0.14429146876690915, + "flos": 21613914639360.0, + "grad_norm": 2.408733459805326, + "language_loss": 0.88774979, + "learning_rate": 3.8646463504491765e-06, + "loss": 0.91002679, + "num_input_tokens_seen": 25497750, + "step": 1200, + "time_per_iteration": 2.5518507957458496 + }, + { + "auxiliary_loss_clip": 0.01223676, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.06728995, + "balance_loss_mlp": 1.02774835, + "epoch": 0.14441171165754824, + "flos": 23258120722560.0, + "grad_norm": 1.7338978026456802, + "language_loss": 0.83356178, + "learning_rate": 3.8643645124793705e-06, + "loss": 0.85619235, + "num_input_tokens_seen": 25516650, + "step": 1201, + "time_per_iteration": 2.517239570617676 + }, + { + "auxiliary_loss_clip": 0.01218106, + "auxiliary_loss_mlp": 0.01034283, + "balance_loss_clip": 1.06260514, + "balance_loss_mlp": 1.02365565, + "epoch": 0.14453195454818735, + "flos": 42854213963520.0, + "grad_norm": 1.5990861529026517, + "language_loss": 0.75095493, + "learning_rate": 3.8640823916869515e-06, + "loss": 0.77347881, + "num_input_tokens_seen": 25540960, + "step": 1202, + "time_per_iteration": 2.702314615249634 + }, + { + "auxiliary_loss_clip": 0.01236021, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.06602967, + "balance_loss_mlp": 1.02561462, + "epoch": 0.14465219743882643, + "flos": 27235873774080.0, + "grad_norm": 1.579508239692632, + "language_loss": 0.78368461, + "learning_rate": 3.863799988114714e-06, + "loss": 0.80641282, + "num_input_tokens_seen": 25562990, + "step": 1203, + "time_per_iteration": 2.509019374847412 + }, + { + "auxiliary_loss_clip": 0.01240146, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.06663132, + "balance_loss_mlp": 1.02160048, + "epoch": 0.1447724403294655, + "flos": 16690705752960.0, + "grad_norm": 2.740681435405886, + "language_loss": 0.7050162, + "learning_rate": 3.863517301805502e-06, + "loss": 0.72774887, + "num_input_tokens_seen": 25581380, + "step": 1204, + "time_per_iteration": 2.428670883178711 + }, + { + "auxiliary_loss_clip": 0.01192568, + "auxiliary_loss_mlp": 0.01038368, + "balance_loss_clip": 1.06301999, + "balance_loss_mlp": 1.0274725, + "epoch": 0.14489268322010462, + "flos": 20073741321600.0, + "grad_norm": 2.2435390455142628, + "language_loss": 0.96834362, + "learning_rate": 3.863234332802196e-06, + "loss": 0.99065298, + "num_input_tokens_seen": 25593585, + "step": 1205, + "time_per_iteration": 2.5029139518737793 + }, + { + "auxiliary_loss_clip": 0.01200687, + "auxiliary_loss_mlp": 0.01042095, + "balance_loss_clip": 1.05940127, + "balance_loss_mlp": 1.03175402, + "epoch": 0.1450129261107437, + "flos": 27125627955840.0, + "grad_norm": 1.839828477289339, + "language_loss": 0.74155509, + "learning_rate": 3.862951081147723e-06, + "loss": 0.76398295, + "num_input_tokens_seen": 25613750, + "step": 1206, + "time_per_iteration": 2.557887315750122 + }, + { + "auxiliary_loss_clip": 0.01223425, + "auxiliary_loss_mlp": 0.01038364, + "balance_loss_clip": 1.0684371, + "balance_loss_mlp": 1.0281713, + "epoch": 0.1451331690013828, + "flos": 25702344472320.0, + "grad_norm": 2.011814768220127, + "language_loss": 0.77975202, + "learning_rate": 3.862667546885053e-06, + "loss": 0.80236995, + "num_input_tokens_seen": 25632300, + "step": 1207, + "time_per_iteration": 2.5240836143493652 + }, + { + "auxiliary_loss_clip": 0.01208394, + "auxiliary_loss_mlp": 0.01040442, + "balance_loss_clip": 1.06063557, + "balance_loss_mlp": 1.0290513, + "epoch": 0.14525341189202187, + "flos": 25737393168000.0, + "grad_norm": 2.162486689345549, + "language_loss": 0.73456979, + "learning_rate": 3.8623837300571965e-06, + "loss": 0.75705814, + "num_input_tokens_seen": 25651285, + "step": 1208, + "time_per_iteration": 2.554468870162964 + }, + { + "auxiliary_loss_clip": 0.01238795, + "auxiliary_loss_mlp": 0.01037276, + "balance_loss_clip": 1.06659973, + "balance_loss_mlp": 1.02597463, + "epoch": 0.14537365478266098, + "flos": 23073898844160.0, + "grad_norm": 2.5470850438227495, + "language_loss": 0.83686274, + "learning_rate": 3.8620996307072085e-06, + "loss": 0.85962343, + "num_input_tokens_seen": 25671990, + "step": 1209, + "time_per_iteration": 2.4563210010528564 + }, + { + "auxiliary_loss_clip": 0.01193724, + "auxiliary_loss_mlp": 0.01035421, + "balance_loss_clip": 1.05879402, + "balance_loss_mlp": 1.02420378, + "epoch": 0.14549389767330007, + "flos": 20595021448320.0, + "grad_norm": 1.8361069760117643, + "language_loss": 0.64243007, + "learning_rate": 3.861815248878188e-06, + "loss": 0.66472149, + "num_input_tokens_seen": 25689475, + "step": 1210, + "time_per_iteration": 2.5518076419830322 + }, + { + "auxiliary_loss_clip": 0.01201678, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.06416297, + "balance_loss_mlp": 1.02826571, + "epoch": 0.14561414056393915, + "flos": 15121804533120.0, + "grad_norm": 2.1880237608647737, + "language_loss": 0.79121423, + "learning_rate": 3.861530584613274e-06, + "loss": 0.81361932, + "num_input_tokens_seen": 25707475, + "step": 1211, + "time_per_iteration": 2.500760555267334 + }, + { + "auxiliary_loss_clip": 0.01225045, + "auxiliary_loss_mlp": 0.0076597, + "balance_loss_clip": 1.0674572, + "balance_loss_mlp": 1.00098395, + "epoch": 0.14573438345457826, + "flos": 19427493778560.0, + "grad_norm": 2.2720667756931943, + "language_loss": 0.81947297, + "learning_rate": 3.86124563795565e-06, + "loss": 0.83938313, + "num_input_tokens_seen": 25726290, + "step": 1212, + "time_per_iteration": 2.483062267303467 + }, + { + "auxiliary_loss_clip": 0.01236881, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.06809354, + "balance_loss_mlp": 1.02520645, + "epoch": 0.14585462634521734, + "flos": 24828422572800.0, + "grad_norm": 1.9155823746454623, + "language_loss": 0.70323515, + "learning_rate": 3.860960408948543e-06, + "loss": 0.72596234, + "num_input_tokens_seen": 25748040, + "step": 1213, + "time_per_iteration": 2.5139997005462646 + }, + { + "auxiliary_loss_clip": 0.01211509, + "auxiliary_loss_mlp": 0.01040243, + "balance_loss_clip": 1.06484926, + "balance_loss_mlp": 1.03006244, + "epoch": 0.14597486923585642, + "flos": 15448627405440.0, + "grad_norm": 2.3229872805556906, + "language_loss": 0.89444846, + "learning_rate": 3.860674897635222e-06, + "loss": 0.91696602, + "num_input_tokens_seen": 25764525, + "step": 1214, + "time_per_iteration": 2.5042803287506104 + }, + { + "auxiliary_loss_clip": 0.01221619, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_clip": 1.06697941, + "balance_loss_mlp": 1.0330615, + "epoch": 0.1460951121264955, + "flos": 16655154266880.0, + "grad_norm": 2.833771670206897, + "language_loss": 0.83374512, + "learning_rate": 3.860389104058998e-06, + "loss": 0.85640001, + "num_input_tokens_seen": 25782755, + "step": 1215, + "time_per_iteration": 2.4872803688049316 + }, + { + "auxiliary_loss_clip": 0.01204618, + "auxiliary_loss_mlp": 0.0103489, + "balance_loss_clip": 1.06334209, + "balance_loss_mlp": 1.02417302, + "epoch": 0.14621535501713462, + "flos": 24863291700480.0, + "grad_norm": 2.0124959632903967, + "language_loss": 0.72592223, + "learning_rate": 3.860103028263227e-06, + "loss": 0.74831736, + "num_input_tokens_seen": 25805860, + "step": 1216, + "time_per_iteration": 2.5808331966400146 + }, + { + "auxiliary_loss_clip": 0.01167473, + "auxiliary_loss_mlp": 0.01035562, + "balance_loss_clip": 1.05407476, + "balance_loss_mlp": 1.02504742, + "epoch": 0.1463355979077737, + "flos": 25228000442880.0, + "grad_norm": 1.9492755570615294, + "language_loss": 0.70172369, + "learning_rate": 3.859816670291304e-06, + "loss": 0.72375405, + "num_input_tokens_seen": 25824955, + "step": 1217, + "time_per_iteration": 2.6269657611846924 + }, + { + "auxiliary_loss_clip": 0.01153137, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.05702794, + "balance_loss_mlp": 1.02182817, + "epoch": 0.14645584079841278, + "flos": 22054143726720.0, + "grad_norm": 3.613137629365155, + "language_loss": 0.89769572, + "learning_rate": 3.859530030186672e-06, + "loss": 0.91956484, + "num_input_tokens_seen": 25841965, + "step": 1218, + "time_per_iteration": 3.3989977836608887 + }, + { + "auxiliary_loss_clip": 0.01208821, + "auxiliary_loss_mlp": 0.01035798, + "balance_loss_clip": 1.06521487, + "balance_loss_mlp": 1.02477741, + "epoch": 0.1465760836890519, + "flos": 23623870959360.0, + "grad_norm": 3.4871093619392877, + "language_loss": 0.82783937, + "learning_rate": 3.859243107992813e-06, + "loss": 0.85028559, + "num_input_tokens_seen": 25860770, + "step": 1219, + "time_per_iteration": 2.5650410652160645 + }, + { + "auxiliary_loss_clip": 0.01189618, + "auxiliary_loss_mlp": 0.01041157, + "balance_loss_clip": 1.05582786, + "balance_loss_mlp": 1.02950454, + "epoch": 0.14669632657969098, + "flos": 37407893356800.0, + "grad_norm": 2.442176392325774, + "language_loss": 0.77617586, + "learning_rate": 3.858955903753252e-06, + "loss": 0.79848361, + "num_input_tokens_seen": 25879410, + "step": 1220, + "time_per_iteration": 2.6592156887054443 + }, + { + "auxiliary_loss_clip": 0.01220965, + "auxiliary_loss_mlp": 0.01036806, + "balance_loss_clip": 1.06336617, + "balance_loss_mlp": 1.02709651, + "epoch": 0.14681656947033006, + "flos": 28365910623360.0, + "grad_norm": 1.4651204836719385, + "language_loss": 0.83403945, + "learning_rate": 3.858668417511559e-06, + "loss": 0.85661715, + "num_input_tokens_seen": 25902160, + "step": 1221, + "time_per_iteration": 2.5627682209014893 + }, + { + "auxiliary_loss_clip": 0.01210987, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.06635928, + "balance_loss_mlp": 1.02111006, + "epoch": 0.14693681236096917, + "flos": 18479488078080.0, + "grad_norm": 2.409275323565488, + "language_loss": 0.76340622, + "learning_rate": 3.8583806493113445e-06, + "loss": 0.78583848, + "num_input_tokens_seen": 25920505, + "step": 1222, + "time_per_iteration": 3.212742805480957 + }, + { + "auxiliary_loss_clip": 0.01218686, + "auxiliary_loss_mlp": 0.01043644, + "balance_loss_clip": 1.06515849, + "balance_loss_mlp": 1.03285575, + "epoch": 0.14705705525160825, + "flos": 20777806782720.0, + "grad_norm": 2.1954378704329867, + "language_loss": 0.82677865, + "learning_rate": 3.858092599196263e-06, + "loss": 0.84940201, + "num_input_tokens_seen": 25938460, + "step": 1223, + "time_per_iteration": 3.2684061527252197 + }, + { + "auxiliary_loss_clip": 0.01222746, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.0671128, + "balance_loss_mlp": 1.02184272, + "epoch": 0.14717729814224734, + "flos": 29932944336000.0, + "grad_norm": 2.2229989996306108, + "language_loss": 0.82342988, + "learning_rate": 3.857804267210012e-06, + "loss": 0.84598047, + "num_input_tokens_seen": 25957760, + "step": 1224, + "time_per_iteration": 2.54630184173584 + }, + { + "auxiliary_loss_clip": 0.01176206, + "auxiliary_loss_mlp": 0.01045254, + "balance_loss_clip": 1.05565143, + "balance_loss_mlp": 1.03488851, + "epoch": 0.14729754103288642, + "flos": 20047491457920.0, + "grad_norm": 1.9078152060157765, + "language_loss": 0.88225389, + "learning_rate": 3.857515653396331e-06, + "loss": 0.90446848, + "num_input_tokens_seen": 25974970, + "step": 1225, + "time_per_iteration": 2.6093590259552 + }, + { + "auxiliary_loss_clip": 0.01177425, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.0598352, + "balance_loss_mlp": 1.02941597, + "epoch": 0.14741778392352553, + "flos": 19281516906240.0, + "grad_norm": 2.550864003538366, + "language_loss": 0.86786014, + "learning_rate": 3.857226757799002e-06, + "loss": 0.89002812, + "num_input_tokens_seen": 25992525, + "step": 1226, + "time_per_iteration": 2.605285882949829 + }, + { + "auxiliary_loss_clip": 0.01205375, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.06236601, + "balance_loss_mlp": 1.02567363, + "epoch": 0.1475380268141646, + "flos": 25411108999680.0, + "grad_norm": 2.3110465012423154, + "language_loss": 0.74371523, + "learning_rate": 3.85693758046185e-06, + "loss": 0.76613259, + "num_input_tokens_seen": 26010815, + "step": 1227, + "time_per_iteration": 2.6165926456451416 + }, + { + "auxiliary_loss_clip": 0.01235713, + "auxiliary_loss_mlp": 0.01041534, + "balance_loss_clip": 1.0681982, + "balance_loss_mlp": 1.03135943, + "epoch": 0.1476582697048037, + "flos": 20847652778880.0, + "grad_norm": 1.695336046441586, + "language_loss": 0.82868415, + "learning_rate": 3.8566481214287435e-06, + "loss": 0.85145664, + "num_input_tokens_seen": 26028935, + "step": 1228, + "time_per_iteration": 2.501488447189331 + }, + { + "auxiliary_loss_clip": 0.01182928, + "auxiliary_loss_mlp": 0.0104083, + "balance_loss_clip": 1.05831242, + "balance_loss_mlp": 1.03014922, + "epoch": 0.1477785125954428, + "flos": 14028109269120.0, + "grad_norm": 2.0038972184963004, + "language_loss": 0.90537822, + "learning_rate": 3.8563583807435935e-06, + "loss": 0.92761582, + "num_input_tokens_seen": 26045080, + "step": 1229, + "time_per_iteration": 2.5645649433135986 + }, + { + "auxiliary_loss_clip": 0.01224098, + "auxiliary_loss_mlp": 0.00765486, + "balance_loss_clip": 1.06534839, + "balance_loss_mlp": 1.00117838, + "epoch": 0.1478987554860819, + "flos": 20516699842560.0, + "grad_norm": 1.8121961491767322, + "language_loss": 0.7787168, + "learning_rate": 3.856068358450353e-06, + "loss": 0.79861259, + "num_input_tokens_seen": 26065030, + "step": 1230, + "time_per_iteration": 2.5093836784362793 + }, + { + "auxiliary_loss_clip": 0.0120316, + "auxiliary_loss_mlp": 0.01044756, + "balance_loss_clip": 1.06746268, + "balance_loss_mlp": 1.03401542, + "epoch": 0.14801899837672097, + "flos": 17857012360320.0, + "grad_norm": 1.6866421091625126, + "language_loss": 0.85726553, + "learning_rate": 3.8557780545930186e-06, + "loss": 0.87974465, + "num_input_tokens_seen": 26083445, + "step": 1231, + "time_per_iteration": 2.5166187286376953 + }, + { + "auxiliary_loss_clip": 0.01205967, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.0657202, + "balance_loss_mlp": 1.02502823, + "epoch": 0.14813924126736006, + "flos": 20881408584960.0, + "grad_norm": 2.5587599211693566, + "language_loss": 0.78987879, + "learning_rate": 3.855487469215628e-06, + "loss": 0.81228852, + "num_input_tokens_seen": 26102375, + "step": 1232, + "time_per_iteration": 2.510005235671997 + }, + { + "auxiliary_loss_clip": 0.01190724, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.06329536, + "balance_loss_mlp": 1.02463782, + "epoch": 0.14825948415799917, + "flos": 37414070496000.0, + "grad_norm": 2.066699252362731, + "language_loss": 0.72518575, + "learning_rate": 3.855196602362264e-06, + "loss": 0.74744344, + "num_input_tokens_seen": 26125295, + "step": 1233, + "time_per_iteration": 2.697993040084839 + }, + { + "auxiliary_loss_clip": 0.01220718, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.06417799, + "balance_loss_mlp": 1.02417326, + "epoch": 0.14837972704863825, + "flos": 22014641744640.0, + "grad_norm": 2.05519453751274, + "language_loss": 0.94165444, + "learning_rate": 3.854905454077051e-06, + "loss": 0.96420896, + "num_input_tokens_seen": 26142905, + "step": 1234, + "time_per_iteration": 2.4797887802124023 + }, + { + "auxiliary_loss_clip": 0.01139938, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.05337048, + "balance_loss_mlp": 1.0255636, + "epoch": 0.14849996993927733, + "flos": 20996323171200.0, + "grad_norm": 2.0246299936825807, + "language_loss": 0.88220477, + "learning_rate": 3.854614024404155e-06, + "loss": 0.90396315, + "num_input_tokens_seen": 26161215, + "step": 1235, + "time_per_iteration": 2.693939685821533 + }, + { + "auxiliary_loss_clip": 0.01192047, + "auxiliary_loss_mlp": 0.01030359, + "balance_loss_clip": 1.05931544, + "balance_loss_mlp": 1.02016056, + "epoch": 0.14862021282991644, + "flos": 20047994248320.0, + "grad_norm": 1.831820260690422, + "language_loss": 0.89479816, + "learning_rate": 3.8543223133877865e-06, + "loss": 0.91702223, + "num_input_tokens_seen": 26179810, + "step": 1236, + "time_per_iteration": 2.5243821144104004 + }, + { + "auxiliary_loss_clip": 0.01187312, + "auxiliary_loss_mlp": 0.01038687, + "balance_loss_clip": 1.05854797, + "balance_loss_mlp": 1.02687919, + "epoch": 0.14874045572055553, + "flos": 22712027276160.0, + "grad_norm": 1.7682531004164719, + "language_loss": 0.88488275, + "learning_rate": 3.854030321072198e-06, + "loss": 0.90714276, + "num_input_tokens_seen": 26199715, + "step": 1237, + "time_per_iteration": 2.613600015640259 + }, + { + "auxiliary_loss_clip": 0.01196622, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.06185091, + "balance_loss_mlp": 1.020293, + "epoch": 0.1488606986111946, + "flos": 25411288567680.0, + "grad_norm": 1.9252657925988816, + "language_loss": 0.72981954, + "learning_rate": 3.853738047501682e-06, + "loss": 0.75209206, + "num_input_tokens_seen": 26220275, + "step": 1238, + "time_per_iteration": 2.627678871154785 + }, + { + "auxiliary_loss_clip": 0.01222963, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.06796885, + "balance_loss_mlp": 1.02788818, + "epoch": 0.1489809415018337, + "flos": 17018749687680.0, + "grad_norm": 1.691065126123122, + "language_loss": 0.77646339, + "learning_rate": 3.85344549272058e-06, + "loss": 0.7990796, + "num_input_tokens_seen": 26238255, + "step": 1239, + "time_per_iteration": 2.500767469406128 + }, + { + "auxiliary_loss_clip": 0.01215034, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_clip": 1.0620029, + "balance_loss_mlp": 1.02809107, + "epoch": 0.1491011843924728, + "flos": 33659394860160.0, + "grad_norm": 1.9270864060611244, + "language_loss": 0.82670724, + "learning_rate": 3.853152656773269e-06, + "loss": 0.84924722, + "num_input_tokens_seen": 26259690, + "step": 1240, + "time_per_iteration": 2.618277072906494 + }, + { + "auxiliary_loss_clip": 0.01201957, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.06310773, + "balance_loss_mlp": 1.0231998, + "epoch": 0.14922142728311188, + "flos": 21179000764800.0, + "grad_norm": 1.8733174910411319, + "language_loss": 0.84897739, + "learning_rate": 3.852859539704174e-06, + "loss": 0.87133658, + "num_input_tokens_seen": 26278990, + "step": 1241, + "time_per_iteration": 2.5360445976257324 + }, + { + "auxiliary_loss_clip": 0.0117066, + "auxiliary_loss_mlp": 0.01040672, + "balance_loss_clip": 1.05678368, + "balance_loss_mlp": 1.0301156, + "epoch": 0.14934167017375097, + "flos": 29860548474240.0, + "grad_norm": 3.603069184250289, + "language_loss": 0.76203936, + "learning_rate": 3.85256614155776e-06, + "loss": 0.78415269, + "num_input_tokens_seen": 26299120, + "step": 1242, + "time_per_iteration": 2.664612054824829 + }, + { + "auxiliary_loss_clip": 0.0121653, + "auxiliary_loss_mlp": 0.01035495, + "balance_loss_clip": 1.06061578, + "balance_loss_mlp": 1.02522504, + "epoch": 0.14946191306439008, + "flos": 17019216564480.0, + "grad_norm": 2.1972008195379362, + "language_loss": 0.7473526, + "learning_rate": 3.852272462378535e-06, + "loss": 0.7698729, + "num_input_tokens_seen": 26316995, + "step": 1243, + "time_per_iteration": 2.531547784805298 + }, + { + "auxiliary_loss_clip": 0.01203476, + "auxiliary_loss_mlp": 0.01043715, + "balance_loss_clip": 1.06307793, + "balance_loss_mlp": 1.03373122, + "epoch": 0.14958215595502916, + "flos": 15669047214720.0, + "grad_norm": 1.978867358816577, + "language_loss": 0.77659172, + "learning_rate": 3.85197850221105e-06, + "loss": 0.79906362, + "num_input_tokens_seen": 26333295, + "step": 1244, + "time_per_iteration": 3.254821538925171 + }, + { + "auxiliary_loss_clip": 0.01218104, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.06695509, + "balance_loss_mlp": 1.02807677, + "epoch": 0.14970239884566824, + "flos": 33108560818560.0, + "grad_norm": 1.6849309210153454, + "language_loss": 0.75763309, + "learning_rate": 3.851684261099899e-06, + "loss": 0.7801981, + "num_input_tokens_seen": 26355035, + "step": 1245, + "time_per_iteration": 2.5982587337493896 + }, + { + "auxiliary_loss_clip": 0.01199202, + "auxiliary_loss_mlp": 0.01037791, + "balance_loss_clip": 1.06015682, + "balance_loss_mlp": 1.02590013, + "epoch": 0.14982264173630733, + "flos": 17821245392640.0, + "grad_norm": 1.8409670534800906, + "language_loss": 0.86573637, + "learning_rate": 3.851389739089718e-06, + "loss": 0.88810623, + "num_input_tokens_seen": 26371655, + "step": 1246, + "time_per_iteration": 2.5064287185668945 + }, + { + "auxiliary_loss_clip": 0.01221585, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.06757641, + "balance_loss_mlp": 1.02844906, + "epoch": 0.14994288462694644, + "flos": 32409559175040.0, + "grad_norm": 2.25809286970391, + "language_loss": 0.80468398, + "learning_rate": 3.851094936225186e-06, + "loss": 0.82729137, + "num_input_tokens_seen": 26392540, + "step": 1247, + "time_per_iteration": 2.5874621868133545 + }, + { + "auxiliary_loss_clip": 0.01199849, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.06429863, + "balance_loss_mlp": 1.0202384, + "epoch": 0.15006312751758552, + "flos": 31794661226880.0, + "grad_norm": 1.3886529248723387, + "language_loss": 0.76766324, + "learning_rate": 3.850799852551024e-06, + "loss": 0.78996944, + "num_input_tokens_seen": 26414960, + "step": 1248, + "time_per_iteration": 3.369913101196289 + }, + { + "auxiliary_loss_clip": 0.01209565, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.05969787, + "balance_loss_mlp": 1.02955198, + "epoch": 0.1501833704082246, + "flos": 16618022582400.0, + "grad_norm": 2.360059947985278, + "language_loss": 0.85731292, + "learning_rate": 3.850504488111995e-06, + "loss": 0.87981689, + "num_input_tokens_seen": 26431635, + "step": 1249, + "time_per_iteration": 2.4924938678741455 + }, + { + "auxiliary_loss_clip": 0.01194572, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.05813289, + "balance_loss_mlp": 1.0223248, + "epoch": 0.15030361329886371, + "flos": 23471178243840.0, + "grad_norm": 1.7197778986960184, + "language_loss": 0.82604086, + "learning_rate": 3.850208842952907e-06, + "loss": 0.84830868, + "num_input_tokens_seen": 26450440, + "step": 1250, + "time_per_iteration": 3.387383460998535 + }, + { + "auxiliary_loss_clip": 0.011808, + "auxiliary_loss_mlp": 0.01039428, + "balance_loss_clip": 1.05812645, + "balance_loss_mlp": 1.02892542, + "epoch": 0.1504238561895028, + "flos": 25629409906560.0, + "grad_norm": 1.9410564121680909, + "language_loss": 0.79344833, + "learning_rate": 3.849912917118608e-06, + "loss": 0.81565058, + "num_input_tokens_seen": 26471480, + "step": 1251, + "time_per_iteration": 2.6572775840759277 + }, + { + "auxiliary_loss_clip": 0.01113385, + "auxiliary_loss_mlp": 0.01002175, + "balance_loss_clip": 1.02636647, + "balance_loss_mlp": 0.99997008, + "epoch": 0.15054409908014188, + "flos": 52095146129280.0, + "grad_norm": 0.8740185118193246, + "language_loss": 0.59264624, + "learning_rate": 3.849616710653992e-06, + "loss": 0.61380184, + "num_input_tokens_seen": 26532950, + "step": 1252, + "time_per_iteration": 3.0130481719970703 + }, + { + "auxiliary_loss_clip": 0.01215994, + "auxiliary_loss_mlp": 0.01036121, + "balance_loss_clip": 1.06323147, + "balance_loss_mlp": 1.02542782, + "epoch": 0.150664341970781, + "flos": 18880251096960.0, + "grad_norm": 1.9229859661928226, + "language_loss": 0.74900126, + "learning_rate": 3.84932022360399e-06, + "loss": 0.7715224, + "num_input_tokens_seen": 26551615, + "step": 1253, + "time_per_iteration": 2.496476650238037 + }, + { + "auxiliary_loss_clip": 0.0120233, + "auxiliary_loss_mlp": 0.01041563, + "balance_loss_clip": 1.0663178, + "balance_loss_mlp": 1.03037536, + "epoch": 0.15078458486142007, + "flos": 22163240309760.0, + "grad_norm": 2.5694765449995787, + "language_loss": 0.83787704, + "learning_rate": 3.849023456013581e-06, + "loss": 0.86031592, + "num_input_tokens_seen": 26569175, + "step": 1254, + "time_per_iteration": 2.5338311195373535 + }, + { + "auxiliary_loss_clip": 0.01224739, + "auxiliary_loss_mlp": 0.01042491, + "balance_loss_clip": 1.0652802, + "balance_loss_mlp": 1.03140473, + "epoch": 0.15090482775205916, + "flos": 26651894457600.0, + "grad_norm": 2.0192686398079362, + "language_loss": 0.62005895, + "learning_rate": 3.848726407927784e-06, + "loss": 0.64273119, + "num_input_tokens_seen": 26589560, + "step": 1255, + "time_per_iteration": 2.5620899200439453 + }, + { + "auxiliary_loss_clip": 0.01205824, + "auxiliary_loss_mlp": 0.0104001, + "balance_loss_clip": 1.06467891, + "balance_loss_mlp": 1.02972174, + "epoch": 0.15102507064269824, + "flos": 21798998444160.0, + "grad_norm": 2.3198851970343015, + "language_loss": 0.86557257, + "learning_rate": 3.84842907939166e-06, + "loss": 0.88803089, + "num_input_tokens_seen": 26608785, + "step": 1256, + "time_per_iteration": 2.534515142440796 + }, + { + "auxiliary_loss_clip": 0.01179855, + "auxiliary_loss_mlp": 0.01038822, + "balance_loss_clip": 1.05796027, + "balance_loss_mlp": 1.0285759, + "epoch": 0.15114531353333735, + "flos": 22820908377600.0, + "grad_norm": 2.5052719206727483, + "language_loss": 0.70615041, + "learning_rate": 3.8481314704503146e-06, + "loss": 0.72833723, + "num_input_tokens_seen": 26628615, + "step": 1257, + "time_per_iteration": 2.5636789798736572 + }, + { + "auxiliary_loss_clip": 0.01220595, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.06734562, + "balance_loss_mlp": 1.02629948, + "epoch": 0.15126555642397643, + "flos": 19682674974720.0, + "grad_norm": 2.3626086081169064, + "language_loss": 0.88010496, + "learning_rate": 3.847833581148895e-06, + "loss": 0.90267175, + "num_input_tokens_seen": 26647525, + "step": 1258, + "time_per_iteration": 2.461132049560547 + }, + { + "auxiliary_loss_clip": 0.01231227, + "auxiliary_loss_mlp": 0.01031471, + "balance_loss_clip": 1.06227684, + "balance_loss_mlp": 1.02084363, + "epoch": 0.15138579931461552, + "flos": 28726022424960.0, + "grad_norm": 2.238422323395634, + "language_loss": 0.81430858, + "learning_rate": 3.84753541153259e-06, + "loss": 0.83693552, + "num_input_tokens_seen": 26667095, + "step": 1259, + "time_per_iteration": 2.513589859008789 + }, + { + "auxiliary_loss_clip": 0.0121806, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.06561875, + "balance_loss_mlp": 1.0257206, + "epoch": 0.15150604220525463, + "flos": 22127006465280.0, + "grad_norm": 1.6147332439185764, + "language_loss": 0.83239353, + "learning_rate": 3.847236961646633e-06, + "loss": 0.85493135, + "num_input_tokens_seen": 26686075, + "step": 1260, + "time_per_iteration": 2.4986495971679688 + }, + { + "auxiliary_loss_clip": 0.01196829, + "auxiliary_loss_mlp": 0.01038784, + "balance_loss_clip": 1.06037068, + "balance_loss_mlp": 1.02772713, + "epoch": 0.1516262850958937, + "flos": 12968708515200.0, + "grad_norm": 2.322687099041264, + "language_loss": 0.77766705, + "learning_rate": 3.846938231536296e-06, + "loss": 0.8000232, + "num_input_tokens_seen": 26701695, + "step": 1261, + "time_per_iteration": 2.498566150665283 + }, + { + "auxiliary_loss_clip": 0.01222659, + "auxiliary_loss_mlp": 0.01029793, + "balance_loss_clip": 1.06719494, + "balance_loss_mlp": 1.01986301, + "epoch": 0.1517465279865328, + "flos": 21797130936960.0, + "grad_norm": 1.6950299432706266, + "language_loss": 0.8047812, + "learning_rate": 3.8466392212468995e-06, + "loss": 0.82730579, + "num_input_tokens_seen": 26721885, + "step": 1262, + "time_per_iteration": 2.47938871383667 + }, + { + "auxiliary_loss_clip": 0.01087664, + "auxiliary_loss_mlp": 0.01002953, + "balance_loss_clip": 1.01904988, + "balance_loss_mlp": 1.00075936, + "epoch": 0.15186677087717187, + "flos": 58174569901440.0, + "grad_norm": 0.8196369844539576, + "language_loss": 0.61938465, + "learning_rate": 3.8463399308238e-06, + "loss": 0.64029086, + "num_input_tokens_seen": 26780990, + "step": 1263, + "time_per_iteration": 3.073401689529419 + }, + { + "auxiliary_loss_clip": 0.01216582, + "auxiliary_loss_mlp": 0.01042705, + "balance_loss_clip": 1.0653646, + "balance_loss_mlp": 1.03104627, + "epoch": 0.15198701376781099, + "flos": 32669696448000.0, + "grad_norm": 1.8218386263261923, + "language_loss": 0.63792729, + "learning_rate": 3.846040360312402e-06, + "loss": 0.66052014, + "num_input_tokens_seen": 26804250, + "step": 1264, + "time_per_iteration": 2.5780036449432373 + }, + { + "auxiliary_loss_clip": 0.01230064, + "auxiliary_loss_mlp": 0.01038467, + "balance_loss_clip": 1.0624311, + "balance_loss_mlp": 1.02796447, + "epoch": 0.15210725665845007, + "flos": 28402575431040.0, + "grad_norm": 6.470510732913258, + "language_loss": 0.80807734, + "learning_rate": 3.8457405097581485e-06, + "loss": 0.83076262, + "num_input_tokens_seen": 26823240, + "step": 1265, + "time_per_iteration": 2.501965045928955 + }, + { + "auxiliary_loss_clip": 0.01173141, + "auxiliary_loss_mlp": 0.01038663, + "balance_loss_clip": 1.05344939, + "balance_loss_mlp": 1.02790451, + "epoch": 0.15222749954908915, + "flos": 19938179393280.0, + "grad_norm": 2.055504481235331, + "language_loss": 0.77932322, + "learning_rate": 3.8454403792065275e-06, + "loss": 0.80144119, + "num_input_tokens_seen": 26842060, + "step": 1266, + "time_per_iteration": 2.5642545223236084 + }, + { + "auxiliary_loss_clip": 0.01174918, + "auxiliary_loss_mlp": 0.01042975, + "balance_loss_clip": 1.05596805, + "balance_loss_mlp": 1.0323832, + "epoch": 0.15234774243972826, + "flos": 21324223451520.0, + "grad_norm": 2.2398128307458642, + "language_loss": 0.85657573, + "learning_rate": 3.845139968703068e-06, + "loss": 0.87875462, + "num_input_tokens_seen": 26859580, + "step": 1267, + "time_per_iteration": 2.530787706375122 + }, + { + "auxiliary_loss_clip": 0.01169967, + "auxiliary_loss_mlp": 0.01038931, + "balance_loss_clip": 1.05488002, + "balance_loss_mlp": 1.02808917, + "epoch": 0.15246798533036734, + "flos": 25957812977280.0, + "grad_norm": 4.345537091713168, + "language_loss": 0.83069861, + "learning_rate": 3.844839278293342e-06, + "loss": 0.85278761, + "num_input_tokens_seen": 26880430, + "step": 1268, + "time_per_iteration": 2.6272730827331543 + }, + { + "auxiliary_loss_clip": 0.01236452, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.06840181, + "balance_loss_mlp": 1.02734137, + "epoch": 0.15258822822100643, + "flos": 25811907932160.0, + "grad_norm": 2.3240724152930285, + "language_loss": 0.77241635, + "learning_rate": 3.8445383080229654e-06, + "loss": 0.79516232, + "num_input_tokens_seen": 26896445, + "step": 1269, + "time_per_iteration": 2.474691152572632 + }, + { + "auxiliary_loss_clip": 0.01194334, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.05668187, + "balance_loss_mlp": 1.02411652, + "epoch": 0.1527084711116455, + "flos": 25265455349760.0, + "grad_norm": 2.250531681688304, + "language_loss": 0.73790598, + "learning_rate": 3.844237057937593e-06, + "loss": 0.76019752, + "num_input_tokens_seen": 26915450, + "step": 1270, + "time_per_iteration": 2.545231580734253 + }, + { + "auxiliary_loss_clip": 0.01222504, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.06271148, + "balance_loss_mlp": 1.02258492, + "epoch": 0.15282871400228462, + "flos": 29240227572480.0, + "grad_norm": 3.796392198337922, + "language_loss": 0.7769326, + "learning_rate": 3.843935528082926e-06, + "loss": 0.79949248, + "num_input_tokens_seen": 26936475, + "step": 1271, + "time_per_iteration": 3.306154251098633 + }, + { + "auxiliary_loss_clip": 0.01219408, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.06425512, + "balance_loss_mlp": 1.02215564, + "epoch": 0.1529489568929237, + "flos": 20882952869760.0, + "grad_norm": 1.8868605518127923, + "language_loss": 0.85245812, + "learning_rate": 3.843633718504704e-06, + "loss": 0.87497503, + "num_input_tokens_seen": 26954920, + "step": 1272, + "time_per_iteration": 2.5229227542877197 + }, + { + "auxiliary_loss_clip": 0.01185842, + "auxiliary_loss_mlp": 0.01036006, + "balance_loss_clip": 1.05937648, + "balance_loss_mlp": 1.02583194, + "epoch": 0.1530691997835628, + "flos": 20083833043200.0, + "grad_norm": 2.7334912902512576, + "language_loss": 0.89746016, + "learning_rate": 3.843331629248715e-06, + "loss": 0.91967869, + "num_input_tokens_seen": 26972520, + "step": 1273, + "time_per_iteration": 2.5582001209259033 + }, + { + "auxiliary_loss_clip": 0.01234794, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.06672668, + "balance_loss_mlp": 1.02460408, + "epoch": 0.1531894426742019, + "flos": 28759814144640.0, + "grad_norm": 2.1665716805741386, + "language_loss": 0.76794797, + "learning_rate": 3.843029260360782e-06, + "loss": 0.79064107, + "num_input_tokens_seen": 26990890, + "step": 1274, + "time_per_iteration": 2.5182995796203613 + }, + { + "auxiliary_loss_clip": 0.01217551, + "auxiliary_loss_mlp": 0.0104152, + "balance_loss_clip": 1.06380665, + "balance_loss_mlp": 1.03159571, + "epoch": 0.15330968556484098, + "flos": 22236282616320.0, + "grad_norm": 1.873851636037058, + "language_loss": 0.7854712, + "learning_rate": 3.8427266118867755e-06, + "loss": 0.8080619, + "num_input_tokens_seen": 27010640, + "step": 1275, + "time_per_iteration": 3.2705063819885254 + }, + { + "auxiliary_loss_clip": 0.0119898, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.06134892, + "balance_loss_mlp": 1.02544475, + "epoch": 0.15342992845548006, + "flos": 27527504296320.0, + "grad_norm": 2.5640919707402965, + "language_loss": 0.82652605, + "learning_rate": 3.842423683872608e-06, + "loss": 0.84887528, + "num_input_tokens_seen": 27031215, + "step": 1276, + "time_per_iteration": 4.206755876541138 + }, + { + "auxiliary_loss_clip": 0.01216637, + "auxiliary_loss_mlp": 0.01038961, + "balance_loss_clip": 1.06243622, + "balance_loss_mlp": 1.02873838, + "epoch": 0.15355017134611917, + "flos": 19609596754560.0, + "grad_norm": 2.398689523270498, + "language_loss": 0.77966297, + "learning_rate": 3.842120476364232e-06, + "loss": 0.80221891, + "num_input_tokens_seen": 27049665, + "step": 1277, + "time_per_iteration": 2.4873099327087402 + }, + { + "auxiliary_loss_clip": 0.01222078, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.0627048, + "balance_loss_mlp": 1.0222652, + "epoch": 0.15367041423675826, + "flos": 18478590238080.0, + "grad_norm": 2.2648057171015776, + "language_loss": 0.83587515, + "learning_rate": 3.841816989407644e-06, + "loss": 0.85842192, + "num_input_tokens_seen": 27065155, + "step": 1278, + "time_per_iteration": 2.4659810066223145 + }, + { + "auxiliary_loss_clip": 0.01182024, + "auxiliary_loss_mlp": 0.01042841, + "balance_loss_clip": 1.05935264, + "balance_loss_mlp": 1.03277409, + "epoch": 0.15379065712739734, + "flos": 41427662342400.0, + "grad_norm": 2.0585538352754447, + "language_loss": 0.76731616, + "learning_rate": 3.841513223048884e-06, + "loss": 0.78956485, + "num_input_tokens_seen": 27085840, + "step": 1279, + "time_per_iteration": 2.736311912536621 + }, + { + "auxiliary_loss_clip": 0.01181612, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.05672026, + "balance_loss_mlp": 1.02414453, + "epoch": 0.15391090001803642, + "flos": 22054215553920.0, + "grad_norm": 2.465481839466679, + "language_loss": 0.78506124, + "learning_rate": 3.841209177334031e-06, + "loss": 0.80722404, + "num_input_tokens_seen": 27104200, + "step": 1280, + "time_per_iteration": 2.554771661758423 + }, + { + "auxiliary_loss_clip": 0.01211959, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.06221199, + "balance_loss_mlp": 1.02271926, + "epoch": 0.15403114290867553, + "flos": 15450351258240.0, + "grad_norm": 4.317631644854921, + "language_loss": 0.74969918, + "learning_rate": 3.84090485230921e-06, + "loss": 0.77214748, + "num_input_tokens_seen": 27122440, + "step": 1281, + "time_per_iteration": 2.4762978553771973 + }, + { + "auxiliary_loss_clip": 0.01231631, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.06476021, + "balance_loss_mlp": 1.02326918, + "epoch": 0.15415138579931462, + "flos": 17929156826880.0, + "grad_norm": 7.294456208841155, + "language_loss": 0.76413548, + "learning_rate": 3.840600248020588e-06, + "loss": 0.7867887, + "num_input_tokens_seen": 27139380, + "step": 1282, + "time_per_iteration": 2.4242472648620605 + }, + { + "auxiliary_loss_clip": 0.0120542, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.05814862, + "balance_loss_mlp": 1.03532839, + "epoch": 0.1542716286899537, + "flos": 11429325296640.0, + "grad_norm": 1.9902963137863745, + "language_loss": 0.79555601, + "learning_rate": 3.840295364514371e-06, + "loss": 0.81807017, + "num_input_tokens_seen": 27156760, + "step": 1283, + "time_per_iteration": 2.530721664428711 + }, + { + "auxiliary_loss_clip": 0.01202231, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.0614028, + "balance_loss_mlp": 1.02486253, + "epoch": 0.1543918715805928, + "flos": 17420338719360.0, + "grad_norm": 2.57225069233365, + "language_loss": 0.7858175, + "learning_rate": 3.83999020183681e-06, + "loss": 0.80819035, + "num_input_tokens_seen": 27175455, + "step": 1284, + "time_per_iteration": 2.490175724029541 + }, + { + "auxiliary_loss_clip": 0.01147854, + "auxiliary_loss_mlp": 0.01038507, + "balance_loss_clip": 1.05376256, + "balance_loss_mlp": 1.02858865, + "epoch": 0.1545121144712319, + "flos": 17786376264960.0, + "grad_norm": 2.317564940822141, + "language_loss": 0.78505892, + "learning_rate": 3.839684760034199e-06, + "loss": 0.80692255, + "num_input_tokens_seen": 27193660, + "step": 1285, + "time_per_iteration": 2.612457513809204 + }, + { + "auxiliary_loss_clip": 0.01180839, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.05903065, + "balance_loss_mlp": 1.02659917, + "epoch": 0.15463235736187098, + "flos": 28220185146240.0, + "grad_norm": 2.2958846640867336, + "language_loss": 0.6513204, + "learning_rate": 3.8393790391528716e-06, + "loss": 0.67350507, + "num_input_tokens_seen": 27214355, + "step": 1286, + "time_per_iteration": 2.6065094470977783 + }, + { + "auxiliary_loss_clip": 0.01199017, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.05832982, + "balance_loss_mlp": 1.03199673, + "epoch": 0.15475260025251006, + "flos": 22856890826880.0, + "grad_norm": 2.9903748402883608, + "language_loss": 0.89119834, + "learning_rate": 3.8390730392392075e-06, + "loss": 0.91361105, + "num_input_tokens_seen": 27234335, + "step": 1287, + "time_per_iteration": 2.5319173336029053 + }, + { + "auxiliary_loss_clip": 0.01234955, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.06673789, + "balance_loss_mlp": 1.02710652, + "epoch": 0.15487284314314917, + "flos": 17602872658560.0, + "grad_norm": 2.046393814696143, + "language_loss": 0.79300618, + "learning_rate": 3.838766760339626e-06, + "loss": 0.81572783, + "num_input_tokens_seen": 27252860, + "step": 1288, + "time_per_iteration": 2.433563709259033 + }, + { + "auxiliary_loss_clip": 0.01168346, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.05506229, + "balance_loss_mlp": 1.02628946, + "epoch": 0.15499308603378825, + "flos": 20082037363200.0, + "grad_norm": 3.3338827946620135, + "language_loss": 0.7952261, + "learning_rate": 3.838460202500587e-06, + "loss": 0.8172816, + "num_input_tokens_seen": 27268650, + "step": 1289, + "time_per_iteration": 2.5522499084472656 + }, + { + "auxiliary_loss_clip": 0.01180197, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.06151116, + "balance_loss_mlp": 1.0234139, + "epoch": 0.15511332892442733, + "flos": 15918051271680.0, + "grad_norm": 2.426566387946794, + "language_loss": 0.74421763, + "learning_rate": 3.838153365768599e-06, + "loss": 0.76636672, + "num_input_tokens_seen": 27285160, + "step": 1290, + "time_per_iteration": 2.5142533779144287 + }, + { + "auxiliary_loss_clip": 0.01185318, + "auxiliary_loss_mlp": 0.01046256, + "balance_loss_clip": 1.06369376, + "balance_loss_mlp": 1.03544331, + "epoch": 0.15523357181506645, + "flos": 41282475569280.0, + "grad_norm": 2.598774670501787, + "language_loss": 0.75397611, + "learning_rate": 3.837846250190206e-06, + "loss": 0.77629185, + "num_input_tokens_seen": 27308025, + "step": 1291, + "time_per_iteration": 2.7272045612335205 + }, + { + "auxiliary_loss_clip": 0.01164939, + "auxiliary_loss_mlp": 0.00765557, + "balance_loss_clip": 1.05631399, + "balance_loss_mlp": 1.00129461, + "epoch": 0.15535381470570553, + "flos": 18478769806080.0, + "grad_norm": 2.7147250934687537, + "language_loss": 0.77187312, + "learning_rate": 3.837538855811998e-06, + "loss": 0.79117805, + "num_input_tokens_seen": 27326200, + "step": 1292, + "time_per_iteration": 2.5955021381378174 + }, + { + "auxiliary_loss_clip": 0.01209626, + "auxiliary_loss_mlp": 0.01038464, + "balance_loss_clip": 1.06428766, + "balance_loss_mlp": 1.0281589, + "epoch": 0.1554740575963446, + "flos": 13918150759680.0, + "grad_norm": 2.8845180664244987, + "language_loss": 0.70985061, + "learning_rate": 3.837231182680606e-06, + "loss": 0.73233151, + "num_input_tokens_seen": 27344165, + "step": 1293, + "time_per_iteration": 2.5218522548675537 + }, + { + "auxiliary_loss_clip": 0.01223558, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.06477022, + "balance_loss_mlp": 1.02837014, + "epoch": 0.1555943004869837, + "flos": 20847078161280.0, + "grad_norm": 1.6194331132294333, + "language_loss": 0.76027066, + "learning_rate": 3.836923230842706e-06, + "loss": 0.78289759, + "num_input_tokens_seen": 27363280, + "step": 1294, + "time_per_iteration": 2.521150827407837 + }, + { + "auxiliary_loss_clip": 0.01170004, + "auxiliary_loss_mlp": 0.01041788, + "balance_loss_clip": 1.05211127, + "balance_loss_mlp": 1.03057015, + "epoch": 0.1557145433776228, + "flos": 22085888371200.0, + "grad_norm": 1.8697810650809068, + "language_loss": 0.80577159, + "learning_rate": 3.836615000345011e-06, + "loss": 0.82788956, + "num_input_tokens_seen": 27381460, + "step": 1295, + "time_per_iteration": 2.654623508453369 + }, + { + "auxiliary_loss_clip": 0.01229685, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.06363952, + "balance_loss_mlp": 1.02123475, + "epoch": 0.1558347862682619, + "flos": 19791987039360.0, + "grad_norm": 2.411052942526275, + "language_loss": 0.78413373, + "learning_rate": 3.836306491234282e-06, + "loss": 0.80673993, + "num_input_tokens_seen": 27399310, + "step": 1296, + "time_per_iteration": 2.467440605163574 + }, + { + "auxiliary_loss_clip": 0.01196058, + "auxiliary_loss_mlp": 0.01034331, + "balance_loss_clip": 1.06352699, + "balance_loss_mlp": 1.02460337, + "epoch": 0.15595502915890097, + "flos": 17237086508160.0, + "grad_norm": 2.4344004061406896, + "language_loss": 0.75680739, + "learning_rate": 3.835997703557317e-06, + "loss": 0.77911127, + "num_input_tokens_seen": 27416050, + "step": 1297, + "time_per_iteration": 2.5132997035980225 + }, + { + "auxiliary_loss_clip": 0.01170464, + "auxiliary_loss_mlp": 0.01038996, + "balance_loss_clip": 1.0528357, + "balance_loss_mlp": 1.02854097, + "epoch": 0.15607527204954008, + "flos": 19719519350400.0, + "grad_norm": 1.6667622265044246, + "language_loss": 0.80279464, + "learning_rate": 3.83568863736096e-06, + "loss": 0.8248893, + "num_input_tokens_seen": 27434920, + "step": 1298, + "time_per_iteration": 3.401448965072632 + }, + { + "auxiliary_loss_clip": 0.01186742, + "auxiliary_loss_mlp": 0.0103723, + "balance_loss_clip": 1.05607533, + "balance_loss_mlp": 1.02713871, + "epoch": 0.15619551494017916, + "flos": 18515650095360.0, + "grad_norm": 2.1434570931169636, + "language_loss": 0.89244807, + "learning_rate": 3.8353792926920975e-06, + "loss": 0.91468781, + "num_input_tokens_seen": 27453570, + "step": 1299, + "time_per_iteration": 2.540374994277954 + }, + { + "auxiliary_loss_clip": 0.01225458, + "auxiliary_loss_mlp": 0.01043167, + "balance_loss_clip": 1.06707811, + "balance_loss_mlp": 1.03196168, + "epoch": 0.15631575783081825, + "flos": 19902125116800.0, + "grad_norm": 5.145697584500249, + "language_loss": 0.81388086, + "learning_rate": 3.835069669597655e-06, + "loss": 0.83656704, + "num_input_tokens_seen": 27471960, + "step": 1300, + "time_per_iteration": 2.4824445247650146 + }, + { + "auxiliary_loss_clip": 0.01220656, + "auxiliary_loss_mlp": 0.00766102, + "balance_loss_clip": 1.06290781, + "balance_loss_mlp": 1.00126493, + "epoch": 0.15643600072145733, + "flos": 20777663128320.0, + "grad_norm": 2.69642911236602, + "language_loss": 0.79682255, + "learning_rate": 3.834759768124603e-06, + "loss": 0.81669009, + "num_input_tokens_seen": 27490835, + "step": 1301, + "time_per_iteration": 2.4958035945892334 + }, + { + "auxiliary_loss_clip": 0.01189963, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.06176305, + "balance_loss_mlp": 1.02671361, + "epoch": 0.15655624361209644, + "flos": 18546389159040.0, + "grad_norm": 2.813338781297517, + "language_loss": 0.76571327, + "learning_rate": 3.834449588319953e-06, + "loss": 0.78798914, + "num_input_tokens_seen": 27508870, + "step": 1302, + "time_per_iteration": 3.2799770832061768 + }, + { + "auxiliary_loss_clip": 0.01216229, + "auxiliary_loss_mlp": 0.01037877, + "balance_loss_clip": 1.06583273, + "balance_loss_mlp": 1.02757132, + "epoch": 0.15667648650273552, + "flos": 25229544727680.0, + "grad_norm": 1.805815588605594, + "language_loss": 0.85107619, + "learning_rate": 3.834139130230758e-06, + "loss": 0.87361729, + "num_input_tokens_seen": 27528175, + "step": 1303, + "time_per_iteration": 4.0188987255096436 + }, + { + "auxiliary_loss_clip": 0.01203527, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.05759072, + "balance_loss_mlp": 1.02587867, + "epoch": 0.1567967293933746, + "flos": 24827093769600.0, + "grad_norm": 1.6732580998680122, + "language_loss": 0.81197751, + "learning_rate": 3.833828393904117e-06, + "loss": 0.83437794, + "num_input_tokens_seen": 27548455, + "step": 1304, + "time_per_iteration": 2.5577292442321777 + }, + { + "auxiliary_loss_clip": 0.01165988, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.05344558, + "balance_loss_mlp": 1.02086234, + "epoch": 0.15691697228401372, + "flos": 19164555244800.0, + "grad_norm": 2.14821786102956, + "language_loss": 0.77338958, + "learning_rate": 3.833517379387165e-06, + "loss": 0.79536891, + "num_input_tokens_seen": 27564910, + "step": 1305, + "time_per_iteration": 2.537242889404297 + }, + { + "auxiliary_loss_clip": 0.012209, + "auxiliary_loss_mlp": 0.01044088, + "balance_loss_clip": 1.06374943, + "balance_loss_mlp": 1.0332222, + "epoch": 0.1570372151746528, + "flos": 24790931752320.0, + "grad_norm": 3.0230337499216624, + "language_loss": 0.88790011, + "learning_rate": 3.833206086727085e-06, + "loss": 0.91055006, + "num_input_tokens_seen": 27584260, + "step": 1306, + "time_per_iteration": 2.5333144664764404 + }, + { + "auxiliary_loss_clip": 0.01187421, + "auxiliary_loss_mlp": 0.01034253, + "balance_loss_clip": 1.05384469, + "balance_loss_mlp": 1.0242995, + "epoch": 0.15715745806529188, + "flos": 24863650836480.0, + "grad_norm": 6.753830322334322, + "language_loss": 0.70337665, + "learning_rate": 3.8328945159710994e-06, + "loss": 0.72559345, + "num_input_tokens_seen": 27604440, + "step": 1307, + "time_per_iteration": 2.6016299724578857 + }, + { + "auxiliary_loss_clip": 0.01224802, + "auxiliary_loss_mlp": 0.00765003, + "balance_loss_clip": 1.06652427, + "balance_loss_mlp": 1.0012238, + "epoch": 0.157277700955931, + "flos": 21872148491520.0, + "grad_norm": 1.9641914822959965, + "language_loss": 0.88709366, + "learning_rate": 3.832582667166473e-06, + "loss": 0.90699172, + "num_input_tokens_seen": 27624250, + "step": 1308, + "time_per_iteration": 2.4975638389587402 + }, + { + "auxiliary_loss_clip": 0.01203736, + "auxiliary_loss_mlp": 0.01039267, + "balance_loss_clip": 1.06059813, + "balance_loss_mlp": 1.02777505, + "epoch": 0.15739794384657008, + "flos": 24533344344960.0, + "grad_norm": 1.6545399092177522, + "language_loss": 0.81350046, + "learning_rate": 3.8322705403605125e-06, + "loss": 0.83593053, + "num_input_tokens_seen": 27644595, + "step": 1309, + "time_per_iteration": 2.551462173461914 + }, + { + "auxiliary_loss_clip": 0.01193563, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.05915177, + "balance_loss_mlp": 1.02623975, + "epoch": 0.15751818673720916, + "flos": 17745329998080.0, + "grad_norm": 2.1949808012015612, + "language_loss": 0.81358284, + "learning_rate": 3.831958135600568e-06, + "loss": 0.83587813, + "num_input_tokens_seen": 27662145, + "step": 1310, + "time_per_iteration": 2.502915859222412 + }, + { + "auxiliary_loss_clip": 0.01218721, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.06354558, + "balance_loss_mlp": 1.02172327, + "epoch": 0.15763842962784824, + "flos": 17858520731520.0, + "grad_norm": 2.1204328094097153, + "language_loss": 0.79785299, + "learning_rate": 3.831645452934032e-06, + "loss": 0.82035083, + "num_input_tokens_seen": 27680575, + "step": 1311, + "time_per_iteration": 2.507802724838257 + }, + { + "auxiliary_loss_clip": 0.01233342, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.06631517, + "balance_loss_mlp": 1.03495479, + "epoch": 0.15775867251848735, + "flos": 26980908059520.0, + "grad_norm": 1.8003482028944942, + "language_loss": 0.79922634, + "learning_rate": 3.831332492408336e-06, + "loss": 0.82201147, + "num_input_tokens_seen": 27701985, + "step": 1312, + "time_per_iteration": 2.5090577602386475 + }, + { + "auxiliary_loss_clip": 0.01196818, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.05928946, + "balance_loss_mlp": 1.02147913, + "epoch": 0.15787891540912644, + "flos": 19240398812160.0, + "grad_norm": 3.0877676092392616, + "language_loss": 0.69023454, + "learning_rate": 3.831019254070957e-06, + "loss": 0.7125206, + "num_input_tokens_seen": 27719770, + "step": 1313, + "time_per_iteration": 2.605391263961792 + }, + { + "auxiliary_loss_clip": 0.01173334, + "auxiliary_loss_mlp": 0.01032134, + "balance_loss_clip": 1.05493712, + "balance_loss_mlp": 1.02192354, + "epoch": 0.15799915829976552, + "flos": 27271102037760.0, + "grad_norm": 2.5678210833218733, + "language_loss": 0.94643211, + "learning_rate": 3.8307057379694135e-06, + "loss": 0.96848679, + "num_input_tokens_seen": 27739105, + "step": 1314, + "time_per_iteration": 2.644348382949829 + }, + { + "auxiliary_loss_clip": 0.01230277, + "auxiliary_loss_mlp": 0.01042763, + "balance_loss_clip": 1.06172442, + "balance_loss_mlp": 1.0323205, + "epoch": 0.15811940119040463, + "flos": 20405520270720.0, + "grad_norm": 2.3162766648816517, + "language_loss": 0.82163823, + "learning_rate": 3.830391944151264e-06, + "loss": 0.84436864, + "num_input_tokens_seen": 27754985, + "step": 1315, + "time_per_iteration": 2.4623465538024902 + }, + { + "auxiliary_loss_clip": 0.01198939, + "auxiliary_loss_mlp": 0.01041539, + "balance_loss_clip": 1.05671608, + "balance_loss_mlp": 1.03153777, + "epoch": 0.1582396440810437, + "flos": 32599347661440.0, + "grad_norm": 1.8429100506941156, + "language_loss": 0.67344874, + "learning_rate": 3.830077872664114e-06, + "loss": 0.69585353, + "num_input_tokens_seen": 27776110, + "step": 1316, + "time_per_iteration": 2.6301705837249756 + }, + { + "auxiliary_loss_clip": 0.01153023, + "auxiliary_loss_mlp": 0.0103695, + "balance_loss_clip": 1.05090237, + "balance_loss_mlp": 1.02692437, + "epoch": 0.1583598869716828, + "flos": 33800559310080.0, + "grad_norm": 1.7463579268446645, + "language_loss": 0.7284385, + "learning_rate": 3.829763523555604e-06, + "loss": 0.7503382, + "num_input_tokens_seen": 27796510, + "step": 1317, + "time_per_iteration": 2.7081029415130615 + }, + { + "auxiliary_loss_clip": 0.0121151, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.06608284, + "balance_loss_mlp": 1.02511835, + "epoch": 0.15848012986232188, + "flos": 24681332378880.0, + "grad_norm": 2.428752423069119, + "language_loss": 0.78396815, + "learning_rate": 3.829448896873423e-06, + "loss": 0.80642998, + "num_input_tokens_seen": 27815610, + "step": 1318, + "time_per_iteration": 2.533863067626953 + }, + { + "auxiliary_loss_clip": 0.01158085, + "auxiliary_loss_mlp": 0.00764812, + "balance_loss_clip": 1.05605018, + "balance_loss_mlp": 1.00117564, + "epoch": 0.158600372752961, + "flos": 22602068766720.0, + "grad_norm": 4.011362588893551, + "language_loss": 0.7901969, + "learning_rate": 3.829133992665299e-06, + "loss": 0.80942589, + "num_input_tokens_seen": 27834735, + "step": 1319, + "time_per_iteration": 2.5953316688537598 + }, + { + "auxiliary_loss_clip": 0.01202159, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_clip": 1.0590601, + "balance_loss_mlp": 1.02755213, + "epoch": 0.15872061564360007, + "flos": 27927944092800.0, + "grad_norm": 2.2514963805598254, + "language_loss": 0.8891539, + "learning_rate": 3.828818810979002e-06, + "loss": 0.91154969, + "num_input_tokens_seen": 27853065, + "step": 1320, + "time_per_iteration": 2.5516347885131836 + }, + { + "auxiliary_loss_clip": 0.01229374, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.06602585, + "balance_loss_mlp": 1.02817082, + "epoch": 0.15884085853423915, + "flos": 23696805525120.0, + "grad_norm": 1.7521603422244256, + "language_loss": 0.80245996, + "learning_rate": 3.8285033518623454e-06, + "loss": 0.82513249, + "num_input_tokens_seen": 27873315, + "step": 1321, + "time_per_iteration": 2.488969326019287 + }, + { + "auxiliary_loss_clip": 0.01219787, + "auxiliary_loss_mlp": 0.0103778, + "balance_loss_clip": 1.06343985, + "balance_loss_mlp": 1.02626443, + "epoch": 0.15896110142487826, + "flos": 23112359331840.0, + "grad_norm": 2.5336735906818735, + "language_loss": 0.81191754, + "learning_rate": 3.8281876153631845e-06, + "loss": 0.83449328, + "num_input_tokens_seen": 27890070, + "step": 1322, + "time_per_iteration": 2.495168447494507 + }, + { + "auxiliary_loss_clip": 0.01164433, + "auxiliary_loss_mlp": 0.01042032, + "balance_loss_clip": 1.0537703, + "balance_loss_mlp": 1.03059971, + "epoch": 0.15908134431551735, + "flos": 14685238632960.0, + "grad_norm": 1.9753995074713062, + "language_loss": 0.64497119, + "learning_rate": 3.827871601529416e-06, + "loss": 0.66703582, + "num_input_tokens_seen": 27908590, + "step": 1323, + "time_per_iteration": 2.574603796005249 + }, + { + "auxiliary_loss_clip": 0.01178215, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.05614161, + "balance_loss_mlp": 1.0282824, + "epoch": 0.15920158720615643, + "flos": 20193611984640.0, + "grad_norm": 1.6649391788857213, + "language_loss": 0.80528545, + "learning_rate": 3.827555310408979e-06, + "loss": 0.82745206, + "num_input_tokens_seen": 27927985, + "step": 1324, + "time_per_iteration": 2.547454357147217 + }, + { + "auxiliary_loss_clip": 0.01178318, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.05900383, + "balance_loss_mlp": 1.02528095, + "epoch": 0.1593218300967955, + "flos": 24826626892800.0, + "grad_norm": 1.7104298457892781, + "language_loss": 0.82784545, + "learning_rate": 3.827238742049854e-06, + "loss": 0.84998608, + "num_input_tokens_seen": 27948280, + "step": 1325, + "time_per_iteration": 3.410881280899048 + }, + { + "auxiliary_loss_clip": 0.01229611, + "auxiliary_loss_mlp": 0.01036517, + "balance_loss_clip": 1.06336462, + "balance_loss_mlp": 1.02582967, + "epoch": 0.15944207298743462, + "flos": 28328707111680.0, + "grad_norm": 1.8340137306501965, + "language_loss": 0.51664281, + "learning_rate": 3.826921896500066e-06, + "loss": 0.53930414, + "num_input_tokens_seen": 27969565, + "step": 1326, + "time_per_iteration": 2.5373659133911133 + }, + { + "auxiliary_loss_clip": 0.011885, + "auxiliary_loss_mlp": 0.01035291, + "balance_loss_clip": 1.05924237, + "balance_loss_mlp": 1.02443123, + "epoch": 0.1595623158780737, + "flos": 22964838174720.0, + "grad_norm": 1.8178522809156679, + "language_loss": 0.77782857, + "learning_rate": 3.826604773807678e-06, + "loss": 0.80006647, + "num_input_tokens_seen": 27987540, + "step": 1327, + "time_per_iteration": 2.5732202529907227 + }, + { + "auxiliary_loss_clip": 0.01196932, + "auxiliary_loss_mlp": 0.01034478, + "balance_loss_clip": 1.05551779, + "balance_loss_mlp": 1.0233438, + "epoch": 0.1596825587687128, + "flos": 19710540950400.0, + "grad_norm": 2.942889543223668, + "language_loss": 0.73431075, + "learning_rate": 3.826287374020798e-06, + "loss": 0.75662482, + "num_input_tokens_seen": 28002345, + "step": 1328, + "time_per_iteration": 2.4970221519470215 + }, + { + "auxiliary_loss_clip": 0.01231372, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.06529498, + "balance_loss_mlp": 1.02537429, + "epoch": 0.1598028016593519, + "flos": 22637727993600.0, + "grad_norm": 1.9533009357376354, + "language_loss": 0.81432682, + "learning_rate": 3.825969697187575e-06, + "loss": 0.83699352, + "num_input_tokens_seen": 28021675, + "step": 1329, + "time_per_iteration": 4.204589605331421 + }, + { + "auxiliary_loss_clip": 0.01183176, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.05720663, + "balance_loss_mlp": 1.02330804, + "epoch": 0.15992304454999098, + "flos": 20482908122880.0, + "grad_norm": 1.9305530175515417, + "language_loss": 0.69668114, + "learning_rate": 3.8256517433562015e-06, + "loss": 0.71885026, + "num_input_tokens_seen": 28039615, + "step": 1330, + "time_per_iteration": 2.6406164169311523 + }, + { + "auxiliary_loss_clip": 0.01225624, + "auxiliary_loss_mlp": 0.01033919, + "balance_loss_clip": 1.06169021, + "balance_loss_mlp": 1.02497864, + "epoch": 0.16004328744063007, + "flos": 17676094533120.0, + "grad_norm": 2.2440534812168056, + "language_loss": 0.91365379, + "learning_rate": 3.82533351257491e-06, + "loss": 0.93624926, + "num_input_tokens_seen": 28057565, + "step": 1331, + "time_per_iteration": 2.4690964221954346 + }, + { + "auxiliary_loss_clip": 0.01214241, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.06451941, + "balance_loss_mlp": 1.02163363, + "epoch": 0.16016353033126918, + "flos": 24098717779200.0, + "grad_norm": 1.9830441771610918, + "language_loss": 0.88556826, + "learning_rate": 3.825015004891975e-06, + "loss": 0.90802276, + "num_input_tokens_seen": 28076305, + "step": 1332, + "time_per_iteration": 2.539296865463257 + }, + { + "auxiliary_loss_clip": 0.01208344, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.05928636, + "balance_loss_mlp": 1.01913023, + "epoch": 0.16028377322190826, + "flos": 27634841112960.0, + "grad_norm": 1.893796230559264, + "language_loss": 0.75662684, + "learning_rate": 3.824696220355716e-06, + "loss": 0.77899933, + "num_input_tokens_seen": 28097895, + "step": 1333, + "time_per_iteration": 2.5657358169555664 + }, + { + "auxiliary_loss_clip": 0.01194692, + "auxiliary_loss_mlp": 0.01037494, + "balance_loss_clip": 1.05838823, + "balance_loss_mlp": 1.02795744, + "epoch": 0.16040401611254734, + "flos": 20961202648320.0, + "grad_norm": 2.148470430865136, + "language_loss": 0.7897228, + "learning_rate": 3.824377159014491e-06, + "loss": 0.81204468, + "num_input_tokens_seen": 28118790, + "step": 1334, + "time_per_iteration": 2.550382614135742 + }, + { + "auxiliary_loss_clip": 0.01209731, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.06180871, + "balance_loss_mlp": 1.02319598, + "epoch": 0.16052425900318643, + "flos": 21247051080960.0, + "grad_norm": 1.7433909771833396, + "language_loss": 0.85018528, + "learning_rate": 3.824057820916702e-06, + "loss": 0.87261236, + "num_input_tokens_seen": 28135995, + "step": 1335, + "time_per_iteration": 2.6674201488494873 + }, + { + "auxiliary_loss_clip": 0.01198942, + "auxiliary_loss_mlp": 0.01032695, + "balance_loss_clip": 1.05949628, + "balance_loss_mlp": 1.02188849, + "epoch": 0.16064450189382554, + "flos": 15524004096000.0, + "grad_norm": 2.6966114111823223, + "language_loss": 0.71571392, + "learning_rate": 3.8237382061107904e-06, + "loss": 0.73803031, + "num_input_tokens_seen": 28152715, + "step": 1336, + "time_per_iteration": 2.5578997135162354 + }, + { + "auxiliary_loss_clip": 0.01123675, + "auxiliary_loss_mlp": 0.01035398, + "balance_loss_clip": 1.04546189, + "balance_loss_mlp": 1.0258311, + "epoch": 0.16076474478446462, + "flos": 21178497974400.0, + "grad_norm": 1.8082631803462297, + "language_loss": 0.78390509, + "learning_rate": 3.823418314645243e-06, + "loss": 0.80549586, + "num_input_tokens_seen": 28171590, + "step": 1337, + "time_per_iteration": 2.759230613708496 + }, + { + "auxiliary_loss_clip": 0.01152249, + "auxiliary_loss_mlp": 0.01038517, + "balance_loss_clip": 1.05588782, + "balance_loss_mlp": 1.02951646, + "epoch": 0.1608849876751037, + "flos": 18366476912640.0, + "grad_norm": 2.07833899534557, + "language_loss": 0.7540887, + "learning_rate": 3.823098146568588e-06, + "loss": 0.77599639, + "num_input_tokens_seen": 28191295, + "step": 1338, + "time_per_iteration": 2.60707950592041 + }, + { + "auxiliary_loss_clip": 0.01210746, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.06097114, + "balance_loss_mlp": 1.02652574, + "epoch": 0.1610052305657428, + "flos": 29497024880640.0, + "grad_norm": 1.789658889062781, + "language_loss": 0.71696204, + "learning_rate": 3.822777701929394e-06, + "loss": 0.7394228, + "num_input_tokens_seen": 28213120, + "step": 1339, + "time_per_iteration": 2.5680150985717773 + }, + { + "auxiliary_loss_clip": 0.0119987, + "auxiliary_loss_mlp": 0.01038933, + "balance_loss_clip": 1.05643559, + "balance_loss_mlp": 1.02890766, + "epoch": 0.1611254734563819, + "flos": 26797871329920.0, + "grad_norm": 1.8591565731653934, + "language_loss": 0.73437774, + "learning_rate": 3.8224569807762714e-06, + "loss": 0.75676578, + "num_input_tokens_seen": 28232440, + "step": 1340, + "time_per_iteration": 2.527047634124756 + }, + { + "auxiliary_loss_clip": 0.01146718, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.0482533, + "balance_loss_mlp": 1.02861345, + "epoch": 0.16124571634702098, + "flos": 22419570741120.0, + "grad_norm": 1.8725312614850629, + "language_loss": 0.76285219, + "learning_rate": 3.822135983157873e-06, + "loss": 0.78470874, + "num_input_tokens_seen": 28251715, + "step": 1341, + "time_per_iteration": 2.6579976081848145 + }, + { + "auxiliary_loss_clip": 0.01222689, + "auxiliary_loss_mlp": 0.00764184, + "balance_loss_clip": 1.06132579, + "balance_loss_mlp": 1.00108075, + "epoch": 0.16136595923766006, + "flos": 10999116103680.0, + "grad_norm": 2.151870846874914, + "language_loss": 0.84122682, + "learning_rate": 3.821814709122896e-06, + "loss": 0.86109555, + "num_input_tokens_seen": 28269765, + "step": 1342, + "time_per_iteration": 2.458009958267212 + }, + { + "auxiliary_loss_clip": 0.01193283, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_clip": 1.0579263, + "balance_loss_mlp": 1.02508605, + "epoch": 0.16148620212829917, + "flos": 21214983214080.0, + "grad_norm": 2.0733341334940127, + "language_loss": 0.84791028, + "learning_rate": 3.821493158720076e-06, + "loss": 0.87018377, + "num_input_tokens_seen": 28288870, + "step": 1343, + "time_per_iteration": 2.5437610149383545 + }, + { + "auxiliary_loss_clip": 0.01179231, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.05323505, + "balance_loss_mlp": 1.02210402, + "epoch": 0.16160644501893826, + "flos": 16758468760320.0, + "grad_norm": 2.784892966085007, + "language_loss": 0.73080397, + "learning_rate": 3.821171331998191e-06, + "loss": 0.75292134, + "num_input_tokens_seen": 28305400, + "step": 1344, + "time_per_iteration": 2.528989791870117 + }, + { + "auxiliary_loss_clip": 0.01095028, + "auxiliary_loss_mlp": 0.0101739, + "balance_loss_clip": 1.02449751, + "balance_loss_mlp": 1.01402843, + "epoch": 0.16172668790957734, + "flos": 64444967308800.0, + "grad_norm": 0.7252962530506282, + "language_loss": 0.54472971, + "learning_rate": 3.820849229006064e-06, + "loss": 0.56585383, + "num_input_tokens_seen": 28373150, + "step": 1345, + "time_per_iteration": 3.282515525817871 + }, + { + "auxiliary_loss_clip": 0.01228411, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.06349254, + "balance_loss_mlp": 1.02108812, + "epoch": 0.16184693080021645, + "flos": 23257689759360.0, + "grad_norm": 1.998193146638708, + "language_loss": 0.70519841, + "learning_rate": 3.8205268497925564e-06, + "loss": 0.72778749, + "num_input_tokens_seen": 28393620, + "step": 1346, + "time_per_iteration": 2.5129637718200684 + }, + { + "auxiliary_loss_clip": 0.0122754, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.06389284, + "balance_loss_mlp": 1.02515531, + "epoch": 0.16196717369085553, + "flos": 17451113696640.0, + "grad_norm": 20.60916288029357, + "language_loss": 0.7867831, + "learning_rate": 3.8202041944065725e-06, + "loss": 0.80940282, + "num_input_tokens_seen": 28409440, + "step": 1347, + "time_per_iteration": 2.455448627471924 + }, + { + "auxiliary_loss_clip": 0.01225874, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.06398296, + "balance_loss_mlp": 1.02330589, + "epoch": 0.16208741658149461, + "flos": 23873377806720.0, + "grad_norm": 2.2252888857420614, + "language_loss": 0.73902524, + "learning_rate": 3.819881262897061e-06, + "loss": 0.76161397, + "num_input_tokens_seen": 28427575, + "step": 1348, + "time_per_iteration": 2.502995014190674 + }, + { + "auxiliary_loss_clip": 0.01184397, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.06309533, + "balance_loss_mlp": 1.02801609, + "epoch": 0.1622076594721337, + "flos": 25884806584320.0, + "grad_norm": 2.239314925385149, + "language_loss": 0.73366541, + "learning_rate": 3.819558055313008e-06, + "loss": 0.75589341, + "num_input_tokens_seen": 28448260, + "step": 1349, + "time_per_iteration": 2.615514039993286 + }, + { + "auxiliary_loss_clip": 0.01216532, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.06292272, + "balance_loss_mlp": 1.02630067, + "epoch": 0.1623279023627728, + "flos": 21539759011200.0, + "grad_norm": 1.8748523798812304, + "language_loss": 0.773588, + "learning_rate": 3.819234571703444e-06, + "loss": 0.79610991, + "num_input_tokens_seen": 28467085, + "step": 1350, + "time_per_iteration": 2.4961180686950684 + }, + { + "auxiliary_loss_clip": 0.01204085, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.05797291, + "balance_loss_mlp": 1.02727556, + "epoch": 0.1624481452534119, + "flos": 22085421494400.0, + "grad_norm": 2.7610831957870348, + "language_loss": 0.8582809, + "learning_rate": 3.8189108121174435e-06, + "loss": 0.88069534, + "num_input_tokens_seen": 28486850, + "step": 1351, + "time_per_iteration": 2.531632423400879 + }, + { + "auxiliary_loss_clip": 0.01176414, + "auxiliary_loss_mlp": 0.010371, + "balance_loss_clip": 1.05993736, + "balance_loss_mlp": 1.02738428, + "epoch": 0.16256838814405097, + "flos": 27087490690560.0, + "grad_norm": 1.9673476629115594, + "language_loss": 0.83232331, + "learning_rate": 3.818586776604118e-06, + "loss": 0.85445845, + "num_input_tokens_seen": 28507490, + "step": 1352, + "time_per_iteration": 3.3700144290924072 + }, + { + "auxiliary_loss_clip": 0.01192897, + "auxiliary_loss_mlp": 0.01036819, + "balance_loss_clip": 1.05808163, + "balance_loss_mlp": 1.02724648, + "epoch": 0.16268863103469008, + "flos": 20120354196480.0, + "grad_norm": 2.8181559113042085, + "language_loss": 0.61437213, + "learning_rate": 3.818262465212625e-06, + "loss": 0.63666928, + "num_input_tokens_seen": 28527615, + "step": 1353, + "time_per_iteration": 2.547781229019165 + }, + { + "auxiliary_loss_clip": 0.01202445, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.06031585, + "balance_loss_mlp": 1.03530645, + "epoch": 0.16280887392532917, + "flos": 18332792933760.0, + "grad_norm": 1.9561770047747549, + "language_loss": 0.77143109, + "learning_rate": 3.817937877992161e-06, + "loss": 0.79391336, + "num_input_tokens_seen": 28544910, + "step": 1354, + "time_per_iteration": 2.461460828781128 + }, + { + "auxiliary_loss_clip": 0.01182243, + "auxiliary_loss_mlp": 0.00765293, + "balance_loss_clip": 1.0551163, + "balance_loss_mlp": 1.00102496, + "epoch": 0.16292911681596825, + "flos": 11874330892800.0, + "grad_norm": 3.1591208644060265, + "language_loss": 0.85761559, + "learning_rate": 3.817613014991967e-06, + "loss": 0.87709093, + "num_input_tokens_seen": 28561050, + "step": 1355, + "time_per_iteration": 3.329928398132324 + }, + { + "auxiliary_loss_clip": 0.01173289, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.05550766, + "balance_loss_mlp": 1.02405405, + "epoch": 0.16304935970660733, + "flos": 26103466627200.0, + "grad_norm": 2.1403093024904805, + "language_loss": 0.76804113, + "learning_rate": 3.817287876261323e-06, + "loss": 0.79011315, + "num_input_tokens_seen": 28581385, + "step": 1356, + "time_per_iteration": 4.171458721160889 + }, + { + "auxiliary_loss_clip": 0.01192828, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.06083727, + "balance_loss_mlp": 1.02383435, + "epoch": 0.16316960259724644, + "flos": 29351945848320.0, + "grad_norm": 1.7162986843845727, + "language_loss": 0.79783416, + "learning_rate": 3.816962461849553e-06, + "loss": 0.82010555, + "num_input_tokens_seen": 28603255, + "step": 1357, + "time_per_iteration": 2.6117117404937744 + }, + { + "auxiliary_loss_clip": 0.01191324, + "auxiliary_loss_mlp": 0.01034237, + "balance_loss_clip": 1.06113815, + "balance_loss_mlp": 1.0238893, + "epoch": 0.16328984548788553, + "flos": 20886759711360.0, + "grad_norm": 1.940191420997136, + "language_loss": 0.84268713, + "learning_rate": 3.8166367718060235e-06, + "loss": 0.86494273, + "num_input_tokens_seen": 28623145, + "step": 1358, + "time_per_iteration": 2.5400679111480713 + }, + { + "auxiliary_loss_clip": 0.01207457, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.05903888, + "balance_loss_mlp": 1.02283096, + "epoch": 0.1634100883785246, + "flos": 18041090584320.0, + "grad_norm": 2.8115537265020536, + "language_loss": 0.76565862, + "learning_rate": 3.816310806180139e-06, + "loss": 0.78806049, + "num_input_tokens_seen": 28641555, + "step": 1359, + "time_per_iteration": 2.481618642807007 + }, + { + "auxiliary_loss_clip": 0.01190751, + "auxiliary_loss_mlp": 0.01037247, + "balance_loss_clip": 1.05830014, + "balance_loss_mlp": 1.02738261, + "epoch": 0.16353033126916372, + "flos": 24572128055040.0, + "grad_norm": 1.9166995942135943, + "language_loss": 0.80902243, + "learning_rate": 3.81598456502135e-06, + "loss": 0.8313024, + "num_input_tokens_seen": 28661575, + "step": 1360, + "time_per_iteration": 2.541520357131958 + }, + { + "auxiliary_loss_clip": 0.01191018, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.0596844, + "balance_loss_mlp": 1.02779222, + "epoch": 0.1636505741598028, + "flos": 19892895321600.0, + "grad_norm": 2.122659300141807, + "language_loss": 0.86967784, + "learning_rate": 3.8156580483791455e-06, + "loss": 0.89197052, + "num_input_tokens_seen": 28676765, + "step": 1361, + "time_per_iteration": 2.5105185508728027 + }, + { + "auxiliary_loss_clip": 0.01228905, + "auxiliary_loss_mlp": 0.0103092, + "balance_loss_clip": 1.06384158, + "balance_loss_mlp": 1.02135372, + "epoch": 0.16377081705044189, + "flos": 28402611344640.0, + "grad_norm": 2.071371839931392, + "language_loss": 0.76918191, + "learning_rate": 3.815331256303059e-06, + "loss": 0.79178017, + "num_input_tokens_seen": 28696795, + "step": 1362, + "time_per_iteration": 2.496187686920166 + }, + { + "auxiliary_loss_clip": 0.01176307, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.0595814, + "balance_loss_mlp": 1.0266273, + "epoch": 0.163891059941081, + "flos": 21908059113600.0, + "grad_norm": 2.308064654605227, + "language_loss": 0.77328253, + "learning_rate": 3.815004188842665e-06, + "loss": 0.79541087, + "num_input_tokens_seen": 28714835, + "step": 1363, + "time_per_iteration": 2.5642151832580566 + }, + { + "auxiliary_loss_clip": 0.01189559, + "auxiliary_loss_mlp": 0.01035339, + "balance_loss_clip": 1.05369782, + "balance_loss_mlp": 1.02505136, + "epoch": 0.16401130283172008, + "flos": 26797619934720.0, + "grad_norm": 1.5521320732443273, + "language_loss": 0.7972858, + "learning_rate": 3.814676846047578e-06, + "loss": 0.81953484, + "num_input_tokens_seen": 28735710, + "step": 1364, + "time_per_iteration": 2.554287910461426 + }, + { + "auxiliary_loss_clip": 0.01209798, + "auxiliary_loss_mlp": 0.01040237, + "balance_loss_clip": 1.06101298, + "balance_loss_mlp": 1.03003824, + "epoch": 0.16413154572235916, + "flos": 32997417160320.0, + "grad_norm": 1.751030844442076, + "language_loss": 0.70041835, + "learning_rate": 3.8143492279674565e-06, + "loss": 0.72291869, + "num_input_tokens_seen": 28758405, + "step": 1365, + "time_per_iteration": 2.6029021739959717 + }, + { + "auxiliary_loss_clip": 0.01084403, + "auxiliary_loss_mlp": 0.0100126, + "balance_loss_clip": 1.01914167, + "balance_loss_mlp": 0.99794561, + "epoch": 0.16425178861299825, + "flos": 40113622074240.0, + "grad_norm": 0.8431105018000821, + "language_loss": 0.58405602, + "learning_rate": 3.8140213346519997e-06, + "loss": 0.60491264, + "num_input_tokens_seen": 28809000, + "step": 1366, + "time_per_iteration": 2.850400447845459 + }, + { + "auxiliary_loss_clip": 0.0116619, + "auxiliary_loss_mlp": 0.01033497, + "balance_loss_clip": 1.05298138, + "balance_loss_mlp": 1.02366257, + "epoch": 0.16437203150363736, + "flos": 25447486498560.0, + "grad_norm": 1.6252901928884553, + "language_loss": 0.76993698, + "learning_rate": 3.813693166150948e-06, + "loss": 0.79193389, + "num_input_tokens_seen": 28829210, + "step": 1367, + "time_per_iteration": 2.601109743118286 + }, + { + "auxiliary_loss_clip": 0.01173515, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.05617177, + "balance_loss_mlp": 1.02176905, + "epoch": 0.16449227439427644, + "flos": 23476888506240.0, + "grad_norm": 2.8458982021091512, + "language_loss": 0.85549849, + "learning_rate": 3.813364722514086e-06, + "loss": 0.87755799, + "num_input_tokens_seen": 28847545, + "step": 1368, + "time_per_iteration": 2.5538833141326904 + }, + { + "auxiliary_loss_clip": 0.01208762, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.05802643, + "balance_loss_mlp": 1.02371168, + "epoch": 0.16461251728491552, + "flos": 13545217802880.0, + "grad_norm": 3.1621417507334764, + "language_loss": 0.80156922, + "learning_rate": 3.8130360037912368e-06, + "loss": 0.82399201, + "num_input_tokens_seen": 28863990, + "step": 1369, + "time_per_iteration": 2.4758875370025635 + }, + { + "auxiliary_loss_clip": 0.01207335, + "auxiliary_loss_mlp": 0.01035243, + "balance_loss_clip": 1.05741274, + "balance_loss_mlp": 1.02404964, + "epoch": 0.16473276017555463, + "flos": 23003298662400.0, + "grad_norm": 2.262768502283211, + "language_loss": 0.81607652, + "learning_rate": 3.812707010032268e-06, + "loss": 0.83850229, + "num_input_tokens_seen": 28883045, + "step": 1370, + "time_per_iteration": 2.4940176010131836 + }, + { + "auxiliary_loss_clip": 0.01218181, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_clip": 1.06435573, + "balance_loss_mlp": 1.03163075, + "epoch": 0.16485300306619372, + "flos": 24790680357120.0, + "grad_norm": 1.9087823496064242, + "language_loss": 0.79013848, + "learning_rate": 3.8123777412870863e-06, + "loss": 0.81273663, + "num_input_tokens_seen": 28902545, + "step": 1371, + "time_per_iteration": 2.525146007537842 + }, + { + "auxiliary_loss_clip": 0.01200437, + "auxiliary_loss_mlp": 0.01042977, + "balance_loss_clip": 1.05682564, + "balance_loss_mlp": 1.03278422, + "epoch": 0.1649732459568328, + "flos": 21106497162240.0, + "grad_norm": 1.8872252569314298, + "language_loss": 0.78490609, + "learning_rate": 3.812048197605643e-06, + "loss": 0.8073402, + "num_input_tokens_seen": 28921440, + "step": 1372, + "time_per_iteration": 2.51423978805542 + }, + { + "auxiliary_loss_clip": 0.01208839, + "auxiliary_loss_mlp": 0.01026573, + "balance_loss_clip": 1.05842566, + "balance_loss_mlp": 1.01670885, + "epoch": 0.16509348884747188, + "flos": 20266726118400.0, + "grad_norm": 1.9184449476498433, + "language_loss": 0.81251442, + "learning_rate": 3.8117183790379277e-06, + "loss": 0.83486855, + "num_input_tokens_seen": 28939890, + "step": 1373, + "time_per_iteration": 2.497551202774048 + }, + { + "auxiliary_loss_clip": 0.01225948, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.06123495, + "balance_loss_mlp": 1.02407098, + "epoch": 0.165213731738111, + "flos": 11035493602560.0, + "grad_norm": 3.272829632049874, + "language_loss": 0.94000363, + "learning_rate": 3.811388285633976e-06, + "loss": 0.96260679, + "num_input_tokens_seen": 28955875, + "step": 1374, + "time_per_iteration": 2.4995205402374268 + }, + { + "auxiliary_loss_clip": 0.01169022, + "auxiliary_loss_mlp": 0.01039776, + "balance_loss_clip": 1.05553818, + "balance_loss_mlp": 1.02967286, + "epoch": 0.16533397462875007, + "flos": 29972051268480.0, + "grad_norm": 2.0107146797559694, + "language_loss": 0.61983156, + "learning_rate": 3.811057917443861e-06, + "loss": 0.64191955, + "num_input_tokens_seen": 28975140, + "step": 1375, + "time_per_iteration": 2.662984609603882 + }, + { + "auxiliary_loss_clip": 0.01098749, + "auxiliary_loss_mlp": 0.01008916, + "balance_loss_clip": 1.01955914, + "balance_loss_mlp": 1.00574458, + "epoch": 0.16545421751938916, + "flos": 65556763027200.0, + "grad_norm": 0.8517718464327935, + "language_loss": 0.68332338, + "learning_rate": 3.8107272745177e-06, + "loss": 0.70440006, + "num_input_tokens_seen": 29047470, + "step": 1376, + "time_per_iteration": 3.2413716316223145 + }, + { + "auxiliary_loss_clip": 0.01181352, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.05749238, + "balance_loss_mlp": 1.02328992, + "epoch": 0.16557446041002827, + "flos": 22492361652480.0, + "grad_norm": 1.7428782252596624, + "language_loss": 0.78627014, + "learning_rate": 3.8103963569056513e-06, + "loss": 0.80841547, + "num_input_tokens_seen": 29066605, + "step": 1377, + "time_per_iteration": 2.581875801086426 + }, + { + "auxiliary_loss_clip": 0.01189711, + "auxiliary_loss_mlp": 0.01038128, + "balance_loss_clip": 1.05558491, + "balance_loss_mlp": 1.02801931, + "epoch": 0.16569470330066735, + "flos": 24602723464320.0, + "grad_norm": 1.8892636414778432, + "language_loss": 0.88288069, + "learning_rate": 3.8100651646579146e-06, + "loss": 0.90515912, + "num_input_tokens_seen": 29085815, + "step": 1378, + "time_per_iteration": 3.3581268787384033 + }, + { + "auxiliary_loss_clip": 0.01188603, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.05233371, + "balance_loss_mlp": 1.0291822, + "epoch": 0.16581494619130643, + "flos": 15006207588480.0, + "grad_norm": 2.234850169674332, + "language_loss": 0.92310059, + "learning_rate": 3.8097336978247317e-06, + "loss": 0.94538099, + "num_input_tokens_seen": 29102520, + "step": 1379, + "time_per_iteration": 2.499394178390503 + }, + { + "auxiliary_loss_clip": 0.01178673, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.0532366, + "balance_loss_mlp": 1.01906669, + "epoch": 0.16593518908194552, + "flos": 17420338719360.0, + "grad_norm": 2.684813848631838, + "language_loss": 0.88695431, + "learning_rate": 3.8094019564563854e-06, + "loss": 0.90903765, + "num_input_tokens_seen": 29119450, + "step": 1380, + "time_per_iteration": 2.4803667068481445 + }, + { + "auxiliary_loss_clip": 0.01223784, + "auxiliary_loss_mlp": 0.00765279, + "balance_loss_clip": 1.06021237, + "balance_loss_mlp": 1.00118899, + "epoch": 0.16605543197258463, + "flos": 20412631163520.0, + "grad_norm": 2.214113502536482, + "language_loss": 0.75234473, + "learning_rate": 3.809069940603201e-06, + "loss": 0.77223539, + "num_input_tokens_seen": 29137405, + "step": 1381, + "time_per_iteration": 2.4606974124908447 + }, + { + "auxiliary_loss_clip": 0.01184362, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.05548596, + "balance_loss_mlp": 1.02349353, + "epoch": 0.1661756748632237, + "flos": 14209745368320.0, + "grad_norm": 2.0783125911908598, + "language_loss": 0.77907658, + "learning_rate": 3.8087376503155452e-06, + "loss": 0.801256, + "num_input_tokens_seen": 29154890, + "step": 1382, + "time_per_iteration": 3.2861461639404297 + }, + { + "auxiliary_loss_clip": 0.0109632, + "auxiliary_loss_mlp": 0.0100929, + "balance_loss_clip": 1.02140975, + "balance_loss_mlp": 1.00625002, + "epoch": 0.1662959177538628, + "flos": 66080877350400.0, + "grad_norm": 0.9014256969557136, + "language_loss": 0.56315345, + "learning_rate": 3.808405085643826e-06, + "loss": 0.58420956, + "num_input_tokens_seen": 29219770, + "step": 1383, + "time_per_iteration": 3.886673927307129 + }, + { + "auxiliary_loss_clip": 0.01228491, + "auxiliary_loss_mlp": 0.0076451, + "balance_loss_clip": 1.06327248, + "balance_loss_mlp": 1.00105369, + "epoch": 0.1664161606445019, + "flos": 20740567357440.0, + "grad_norm": 2.305170563232609, + "language_loss": 0.88904238, + "learning_rate": 3.8080722466384925e-06, + "loss": 0.90897238, + "num_input_tokens_seen": 29237620, + "step": 1384, + "time_per_iteration": 2.454413890838623 + }, + { + "auxiliary_loss_clip": 0.0122683, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.05888963, + "balance_loss_mlp": 1.025841, + "epoch": 0.166536403535141, + "flos": 25260930236160.0, + "grad_norm": 2.1202714440807586, + "language_loss": 0.70521593, + "learning_rate": 3.8077391333500376e-06, + "loss": 0.72785312, + "num_input_tokens_seen": 29256760, + "step": 1385, + "time_per_iteration": 2.474853277206421 + }, + { + "auxiliary_loss_clip": 0.01198367, + "auxiliary_loss_mlp": 0.01033395, + "balance_loss_clip": 1.06163383, + "balance_loss_mlp": 1.02390623, + "epoch": 0.16665664642578007, + "flos": 25447450584960.0, + "grad_norm": 1.8606523411854223, + "language_loss": 0.76268655, + "learning_rate": 3.8074057458289934e-06, + "loss": 0.7850042, + "num_input_tokens_seen": 29277450, + "step": 1386, + "time_per_iteration": 2.537062168121338 + }, + { + "auxiliary_loss_clip": 0.01196588, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.05642569, + "balance_loss_mlp": 1.02157485, + "epoch": 0.16677688931641918, + "flos": 22200767043840.0, + "grad_norm": 1.879845725470576, + "language_loss": 0.82602817, + "learning_rate": 3.807072084125934e-06, + "loss": 0.84831262, + "num_input_tokens_seen": 29299300, + "step": 1387, + "time_per_iteration": 2.5465996265411377 + }, + { + "auxiliary_loss_clip": 0.01191936, + "auxiliary_loss_mlp": 0.01036335, + "balance_loss_clip": 1.05862331, + "balance_loss_mlp": 1.02554035, + "epoch": 0.16689713220705826, + "flos": 16945958776320.0, + "grad_norm": 3.3106558434379534, + "language_loss": 0.80301118, + "learning_rate": 3.806738148291477e-06, + "loss": 0.8252939, + "num_input_tokens_seen": 29316125, + "step": 1388, + "time_per_iteration": 2.483123779296875 + }, + { + "auxiliary_loss_clip": 0.0115321, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.05269909, + "balance_loss_mlp": 1.02465117, + "epoch": 0.16701737509769735, + "flos": 36244423923840.0, + "grad_norm": 2.056159266092046, + "language_loss": 0.71318328, + "learning_rate": 3.8064039383762793e-06, + "loss": 0.73507136, + "num_input_tokens_seen": 29338490, + "step": 1389, + "time_per_iteration": 2.72969651222229 + }, + { + "auxiliary_loss_clip": 0.01213329, + "auxiliary_loss_mlp": 0.01036405, + "balance_loss_clip": 1.06428409, + "balance_loss_mlp": 1.02609968, + "epoch": 0.16713761798833643, + "flos": 23258659426560.0, + "grad_norm": 2.304981951727747, + "language_loss": 0.76892173, + "learning_rate": 3.8060694544310396e-06, + "loss": 0.79141915, + "num_input_tokens_seen": 29357000, + "step": 1390, + "time_per_iteration": 2.534376382827759 + }, + { + "auxiliary_loss_clip": 0.01229614, + "auxiliary_loss_mlp": 0.01042237, + "balance_loss_clip": 1.06324935, + "balance_loss_mlp": 1.03090048, + "epoch": 0.16725786087897554, + "flos": 25302515207040.0, + "grad_norm": 2.2984849143783577, + "language_loss": 0.7846272, + "learning_rate": 3.8057346965065006e-06, + "loss": 0.80734575, + "num_input_tokens_seen": 29378230, + "step": 1391, + "time_per_iteration": 2.500256061553955 + }, + { + "auxiliary_loss_clip": 0.01194963, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.06090724, + "balance_loss_mlp": 1.02744532, + "epoch": 0.16737810376961462, + "flos": 31831541516160.0, + "grad_norm": 1.8809656233079959, + "language_loss": 0.84310651, + "learning_rate": 3.805399664653443e-06, + "loss": 0.86543256, + "num_input_tokens_seen": 29400370, + "step": 1392, + "time_per_iteration": 2.605283737182617 + }, + { + "auxiliary_loss_clip": 0.01229841, + "auxiliary_loss_mlp": 0.0102943, + "balance_loss_clip": 1.0628922, + "balance_loss_mlp": 1.01914835, + "epoch": 0.1674983466602537, + "flos": 27961843553280.0, + "grad_norm": 2.2414342106356018, + "language_loss": 0.74428833, + "learning_rate": 3.805064358922692e-06, + "loss": 0.76688105, + "num_input_tokens_seen": 29418660, + "step": 1393, + "time_per_iteration": 2.4919519424438477 + }, + { + "auxiliary_loss_clip": 0.01217078, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.06099093, + "balance_loss_mlp": 1.02233112, + "epoch": 0.16761858955089282, + "flos": 21762656858880.0, + "grad_norm": 1.7217256195171802, + "language_loss": 0.80926573, + "learning_rate": 3.8047287793651136e-06, + "loss": 0.83176863, + "num_input_tokens_seen": 29440105, + "step": 1394, + "time_per_iteration": 2.499969959259033 + }, + { + "auxiliary_loss_clip": 0.01183402, + "auxiliary_loss_mlp": 0.01042119, + "balance_loss_clip": 1.05734324, + "balance_loss_mlp": 1.03193855, + "epoch": 0.1677388324415319, + "flos": 23805507058560.0, + "grad_norm": 2.0208557786781998, + "language_loss": 0.88748318, + "learning_rate": 3.8043929260316137e-06, + "loss": 0.90973842, + "num_input_tokens_seen": 29458260, + "step": 1395, + "time_per_iteration": 2.5618607997894287 + }, + { + "auxiliary_loss_clip": 0.01201438, + "auxiliary_loss_mlp": 0.01037296, + "balance_loss_clip": 1.0663377, + "balance_loss_mlp": 1.02665043, + "epoch": 0.16785907533217098, + "flos": 20558859431040.0, + "grad_norm": 1.976170035660376, + "language_loss": 0.83333337, + "learning_rate": 3.8040567989731417e-06, + "loss": 0.8557207, + "num_input_tokens_seen": 29476205, + "step": 1396, + "time_per_iteration": 2.519643545150757 + }, + { + "auxiliary_loss_clip": 0.01207297, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.06144726, + "balance_loss_mlp": 1.02117205, + "epoch": 0.16797931822281006, + "flos": 15669657745920.0, + "grad_norm": 2.1041604953380344, + "language_loss": 0.79336333, + "learning_rate": 3.8037203982406876e-06, + "loss": 0.81574726, + "num_input_tokens_seen": 29494370, + "step": 1397, + "time_per_iteration": 2.460031509399414 + }, + { + "auxiliary_loss_clip": 0.01228908, + "auxiliary_loss_mlp": 0.01037229, + "balance_loss_clip": 1.06463897, + "balance_loss_mlp": 1.02633297, + "epoch": 0.16809956111344918, + "flos": 16541101607040.0, + "grad_norm": 1.9252838250126378, + "language_loss": 0.72935712, + "learning_rate": 3.8033837238852835e-06, + "loss": 0.75201845, + "num_input_tokens_seen": 29511070, + "step": 1398, + "time_per_iteration": 2.463020086288452 + }, + { + "auxiliary_loss_clip": 0.0118718, + "auxiliary_loss_mlp": 0.01036294, + "balance_loss_clip": 1.05595434, + "balance_loss_mlp": 1.02651286, + "epoch": 0.16821980400408826, + "flos": 23258084808960.0, + "grad_norm": 1.9442599796712918, + "language_loss": 0.69483781, + "learning_rate": 3.8030467759580017e-06, + "loss": 0.71707249, + "num_input_tokens_seen": 29531990, + "step": 1399, + "time_per_iteration": 2.5398035049438477 + }, + { + "auxiliary_loss_clip": 0.0121501, + "auxiliary_loss_mlp": 0.01033649, + "balance_loss_clip": 1.0600518, + "balance_loss_mlp": 1.02272356, + "epoch": 0.16834004689472734, + "flos": 20774754126720.0, + "grad_norm": 1.9276147460915132, + "language_loss": 0.86993229, + "learning_rate": 3.802709554509958e-06, + "loss": 0.89241892, + "num_input_tokens_seen": 29549790, + "step": 1400, + "time_per_iteration": 2.471849203109741 + }, + { + "auxiliary_loss_clip": 0.01195163, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.0566504, + "balance_loss_mlp": 1.02261698, + "epoch": 0.16846028978536645, + "flos": 26687302289280.0, + "grad_norm": 1.7513453588970855, + "language_loss": 0.79295868, + "learning_rate": 3.8023720595923083e-06, + "loss": 0.81523174, + "num_input_tokens_seen": 29569045, + "step": 1401, + "time_per_iteration": 2.554755449295044 + }, + { + "auxiliary_loss_clip": 0.01162605, + "auxiliary_loss_mlp": 0.01035198, + "balance_loss_clip": 1.05336201, + "balance_loss_mlp": 1.02479696, + "epoch": 0.16858053267600553, + "flos": 18843298980480.0, + "grad_norm": 2.0110455765729696, + "language_loss": 0.87213755, + "learning_rate": 3.80203429125625e-06, + "loss": 0.89411557, + "num_input_tokens_seen": 29587220, + "step": 1402, + "time_per_iteration": 2.5577595233917236 + }, + { + "auxiliary_loss_clip": 0.01141023, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.05109572, + "balance_loss_mlp": 1.02345133, + "epoch": 0.16870077556664462, + "flos": 27744548227200.0, + "grad_norm": 1.7764171735767604, + "language_loss": 0.69979835, + "learning_rate": 3.8016962495530225e-06, + "loss": 0.72154534, + "num_input_tokens_seen": 29606410, + "step": 1403, + "time_per_iteration": 2.6550164222717285 + }, + { + "auxiliary_loss_clip": 0.01229262, + "auxiliary_loss_mlp": 0.01040182, + "balance_loss_clip": 1.06272984, + "balance_loss_mlp": 1.03041244, + "epoch": 0.1688210184572837, + "flos": 13730768484480.0, + "grad_norm": 2.373469812583808, + "language_loss": 0.77290785, + "learning_rate": 3.8013579345339063e-06, + "loss": 0.79560226, + "num_input_tokens_seen": 29621275, + "step": 1404, + "time_per_iteration": 2.408055067062378 + }, + { + "auxiliary_loss_clip": 0.01187382, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.05746388, + "balance_loss_mlp": 1.02273345, + "epoch": 0.1689412613479228, + "flos": 26468785900800.0, + "grad_norm": 2.6228533614847422, + "language_loss": 0.69455636, + "learning_rate": 3.801019346250224e-06, + "loss": 0.71676469, + "num_input_tokens_seen": 29641420, + "step": 1405, + "time_per_iteration": 3.297363758087158 + }, + { + "auxiliary_loss_clip": 0.0120986, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.06108999, + "balance_loss_mlp": 1.02369547, + "epoch": 0.1690615042385619, + "flos": 21138852337920.0, + "grad_norm": 2.2662503727057177, + "language_loss": 0.83462501, + "learning_rate": 3.8006804847533395e-06, + "loss": 0.85706401, + "num_input_tokens_seen": 29660935, + "step": 1406, + "time_per_iteration": 2.4971489906311035 + }, + { + "auxiliary_loss_clip": 0.01230357, + "auxiliary_loss_mlp": 0.0104136, + "balance_loss_clip": 1.06267035, + "balance_loss_mlp": 1.03190672, + "epoch": 0.16918174712920098, + "flos": 20849340718080.0, + "grad_norm": 3.3082241507269083, + "language_loss": 0.85670513, + "learning_rate": 3.8003413500946556e-06, + "loss": 0.87942231, + "num_input_tokens_seen": 29681045, + "step": 1407, + "time_per_iteration": 2.4682304859161377 + }, + { + "auxiliary_loss_clip": 0.01199131, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.06043482, + "balance_loss_mlp": 1.0259167, + "epoch": 0.1693019900198401, + "flos": 16983270028800.0, + "grad_norm": 3.075384346080213, + "language_loss": 0.82684493, + "learning_rate": 3.8000019423256216e-06, + "loss": 0.84920198, + "num_input_tokens_seen": 29698810, + "step": 1408, + "time_per_iteration": 3.2123184204101562 + }, + { + "auxiliary_loss_clip": 0.01188603, + "auxiliary_loss_mlp": 0.01046088, + "balance_loss_clip": 1.05931282, + "balance_loss_mlp": 1.03587782, + "epoch": 0.16942223291047917, + "flos": 26796901662720.0, + "grad_norm": 40.526622413496796, + "language_loss": 0.88179904, + "learning_rate": 3.7996622614977234e-06, + "loss": 0.90414596, + "num_input_tokens_seen": 29720000, + "step": 1409, + "time_per_iteration": 3.3433187007904053 + }, + { + "auxiliary_loss_clip": 0.01197719, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.06182671, + "balance_loss_mlp": 1.02554476, + "epoch": 0.16954247580111825, + "flos": 18583700411520.0, + "grad_norm": 1.899948596307772, + "language_loss": 0.79205096, + "learning_rate": 3.799322307662492e-06, + "loss": 0.81438178, + "num_input_tokens_seen": 29737820, + "step": 1410, + "time_per_iteration": 3.338941812515259 + }, + { + "auxiliary_loss_clip": 0.01169645, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.05447221, + "balance_loss_mlp": 1.01872635, + "epoch": 0.16966271869175734, + "flos": 13983651210240.0, + "grad_norm": 2.5979905882623826, + "language_loss": 0.83707553, + "learning_rate": 3.798982080871496e-06, + "loss": 0.85906732, + "num_input_tokens_seen": 29752960, + "step": 1411, + "time_per_iteration": 2.544449806213379 + }, + { + "auxiliary_loss_clip": 0.01231347, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.06460643, + "balance_loss_mlp": 1.02671039, + "epoch": 0.16978296158239645, + "flos": 37487328284160.0, + "grad_norm": 2.1069238383350104, + "language_loss": 0.67939728, + "learning_rate": 3.798641581176349e-06, + "loss": 0.7020874, + "num_input_tokens_seen": 29775240, + "step": 1412, + "time_per_iteration": 2.593801736831665 + }, + { + "auxiliary_loss_clip": 0.01199328, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.05829883, + "balance_loss_mlp": 1.02801943, + "epoch": 0.16990320447303553, + "flos": 28328958506880.0, + "grad_norm": 1.956164345153337, + "language_loss": 0.74738121, + "learning_rate": 3.7983008086287044e-06, + "loss": 0.7697655, + "num_input_tokens_seen": 29796560, + "step": 1413, + "time_per_iteration": 2.5569052696228027 + }, + { + "auxiliary_loss_clip": 0.0119254, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.05527091, + "balance_loss_mlp": 1.02941751, + "epoch": 0.1700234473636746, + "flos": 20188189031040.0, + "grad_norm": 2.0369325617706275, + "language_loss": 0.79392397, + "learning_rate": 3.797959763280257e-06, + "loss": 0.81625485, + "num_input_tokens_seen": 29815245, + "step": 1414, + "time_per_iteration": 2.506690740585327 + }, + { + "auxiliary_loss_clip": 0.01218445, + "auxiliary_loss_mlp": 0.01044888, + "balance_loss_clip": 1.06301737, + "balance_loss_mlp": 1.03508306, + "epoch": 0.17014369025431372, + "flos": 24858658846080.0, + "grad_norm": 2.1132531328990054, + "language_loss": 0.79518855, + "learning_rate": 3.797618445182743e-06, + "loss": 0.81782186, + "num_input_tokens_seen": 29836640, + "step": 1415, + "time_per_iteration": 2.5186071395874023 + }, + { + "auxiliary_loss_clip": 0.01165759, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.05507517, + "balance_loss_mlp": 1.02231503, + "epoch": 0.1702639331449528, + "flos": 16467233287680.0, + "grad_norm": 2.094675100520782, + "language_loss": 0.8486228, + "learning_rate": 3.79727685438794e-06, + "loss": 0.87061048, + "num_input_tokens_seen": 29850830, + "step": 1416, + "time_per_iteration": 2.54203462600708 + }, + { + "auxiliary_loss_clip": 0.01115767, + "auxiliary_loss_mlp": 0.01006088, + "balance_loss_clip": 1.02578211, + "balance_loss_mlp": 1.00308406, + "epoch": 0.1703841760355919, + "flos": 52508870979840.0, + "grad_norm": 0.8476870057322425, + "language_loss": 0.61648983, + "learning_rate": 3.796934990947667e-06, + "loss": 0.63770843, + "num_input_tokens_seen": 29912515, + "step": 1417, + "time_per_iteration": 3.1060950756073 + }, + { + "auxiliary_loss_clip": 0.01113239, + "auxiliary_loss_mlp": 0.0100519, + "balance_loss_clip": 1.02469087, + "balance_loss_mlp": 1.00231671, + "epoch": 0.170504418926231, + "flos": 49370637576960.0, + "grad_norm": 0.8765044104997305, + "language_loss": 0.62529755, + "learning_rate": 3.7965928549137854e-06, + "loss": 0.64648187, + "num_input_tokens_seen": 29969330, + "step": 1418, + "time_per_iteration": 2.981626510620117 + }, + { + "auxiliary_loss_clip": 0.01185142, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.05267906, + "balance_loss_mlp": 1.02701449, + "epoch": 0.17062466181687008, + "flos": 25849219184640.0, + "grad_norm": 2.3818202086542586, + "language_loss": 0.76975733, + "learning_rate": 3.7962504463381953e-06, + "loss": 0.79198766, + "num_input_tokens_seen": 29990820, + "step": 1419, + "time_per_iteration": 2.5914244651794434 + }, + { + "auxiliary_loss_clip": 0.01194665, + "auxiliary_loss_mlp": 0.00766414, + "balance_loss_clip": 1.06262887, + "balance_loss_mlp": 1.00121856, + "epoch": 0.17074490470750917, + "flos": 20960412549120.0, + "grad_norm": 1.7700325433459179, + "language_loss": 0.78936243, + "learning_rate": 3.7959077652728412e-06, + "loss": 0.80897331, + "num_input_tokens_seen": 30009275, + "step": 1420, + "time_per_iteration": 2.5226595401763916 + }, + { + "auxiliary_loss_clip": 0.01196842, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_clip": 1.05771589, + "balance_loss_mlp": 1.03084624, + "epoch": 0.17086514759814825, + "flos": 20959766104320.0, + "grad_norm": 2.1704789295844127, + "language_loss": 0.77376711, + "learning_rate": 3.795564811769707e-06, + "loss": 0.79615027, + "num_input_tokens_seen": 30027630, + "step": 1421, + "time_per_iteration": 2.531301736831665 + }, + { + "auxiliary_loss_clip": 0.0119857, + "auxiliary_loss_mlp": 0.01036154, + "balance_loss_clip": 1.06361115, + "balance_loss_mlp": 1.02492428, + "epoch": 0.17098539048878736, + "flos": 28474073452800.0, + "grad_norm": 1.8906503562883983, + "language_loss": 0.7779932, + "learning_rate": 3.795221585880818e-06, + "loss": 0.80034047, + "num_input_tokens_seen": 30048310, + "step": 1422, + "time_per_iteration": 2.5752382278442383 + }, + { + "auxiliary_loss_clip": 0.01185124, + "auxiliary_loss_mlp": 0.0103905, + "balance_loss_clip": 1.0626297, + "balance_loss_mlp": 1.02974033, + "epoch": 0.17110563337942644, + "flos": 16290014561280.0, + "grad_norm": 1.9104797072303763, + "language_loss": 0.91254377, + "learning_rate": 3.794878087658242e-06, + "loss": 0.93478549, + "num_input_tokens_seen": 30066080, + "step": 1423, + "time_per_iteration": 2.625685691833496 + }, + { + "auxiliary_loss_clip": 0.01215584, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.06160808, + "balance_loss_mlp": 1.02393782, + "epoch": 0.17122587627006552, + "flos": 29674207693440.0, + "grad_norm": 1.7218099008650776, + "language_loss": 0.78784055, + "learning_rate": 3.7945343171540873e-06, + "loss": 0.81033373, + "num_input_tokens_seen": 30086955, + "step": 1424, + "time_per_iteration": 2.578352928161621 + }, + { + "auxiliary_loss_clip": 0.01230627, + "auxiliary_loss_mlp": 0.01040311, + "balance_loss_clip": 1.06358242, + "balance_loss_mlp": 1.02924871, + "epoch": 0.17134611916070464, + "flos": 25338389915520.0, + "grad_norm": 17.123868564556847, + "language_loss": 0.78796583, + "learning_rate": 3.7941902744205033e-06, + "loss": 0.81067526, + "num_input_tokens_seen": 30107990, + "step": 1425, + "time_per_iteration": 2.4777586460113525 + }, + { + "auxiliary_loss_clip": 0.01203943, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.06052256, + "balance_loss_mlp": 1.02021337, + "epoch": 0.17146636205134372, + "flos": 13953845900160.0, + "grad_norm": 2.032145898133943, + "language_loss": 0.83531457, + "learning_rate": 3.7938459595096817e-06, + "loss": 0.85766959, + "num_input_tokens_seen": 30126535, + "step": 1426, + "time_per_iteration": 2.494879961013794 + }, + { + "auxiliary_loss_clip": 0.01220223, + "auxiliary_loss_mlp": 0.01038201, + "balance_loss_clip": 1.06146491, + "balance_loss_mlp": 1.02735317, + "epoch": 0.1715866049419828, + "flos": 23915214172800.0, + "grad_norm": 1.942244623903649, + "language_loss": 0.86069942, + "learning_rate": 3.7935013724738545e-06, + "loss": 0.88328373, + "num_input_tokens_seen": 30147035, + "step": 1427, + "time_per_iteration": 2.4997761249542236 + }, + { + "auxiliary_loss_clip": 0.01208311, + "auxiliary_loss_mlp": 0.01040232, + "balance_loss_clip": 1.06062722, + "balance_loss_mlp": 1.03041482, + "epoch": 0.17170684783262188, + "flos": 22709369669760.0, + "grad_norm": 2.085612864300439, + "language_loss": 0.78063208, + "learning_rate": 3.7931565133652945e-06, + "loss": 0.80311751, + "num_input_tokens_seen": 30167110, + "step": 1428, + "time_per_iteration": 2.5557827949523926 + }, + { + "auxiliary_loss_clip": 0.0122873, + "auxiliary_loss_mlp": 0.01036317, + "balance_loss_clip": 1.06289315, + "balance_loss_mlp": 1.02559459, + "epoch": 0.171827090723261, + "flos": 26613290315520.0, + "grad_norm": 2.159135299795504, + "language_loss": 0.67813182, + "learning_rate": 3.792811382236317e-06, + "loss": 0.7007823, + "num_input_tokens_seen": 30185620, + "step": 1429, + "time_per_iteration": 2.476851224899292 + }, + { + "auxiliary_loss_clip": 0.01217286, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.06093717, + "balance_loss_mlp": 1.02421916, + "epoch": 0.17194733361390008, + "flos": 28148507556480.0, + "grad_norm": 1.8217266295533343, + "language_loss": 0.78318584, + "learning_rate": 3.792465979139279e-06, + "loss": 0.80570757, + "num_input_tokens_seen": 30208225, + "step": 1430, + "time_per_iteration": 2.5533220767974854 + }, + { + "auxiliary_loss_clip": 0.01088268, + "auxiliary_loss_mlp": 0.01001498, + "balance_loss_clip": 1.02499032, + "balance_loss_mlp": 0.99875641, + "epoch": 0.17206757650453916, + "flos": 65530689753600.0, + "grad_norm": 0.9335995374993428, + "language_loss": 0.65702158, + "learning_rate": 3.792120304126576e-06, + "loss": 0.67791927, + "num_input_tokens_seen": 30271600, + "step": 1431, + "time_per_iteration": 3.1519088745117188 + }, + { + "auxiliary_loss_clip": 0.01139881, + "auxiliary_loss_mlp": 0.01026766, + "balance_loss_clip": 1.05311155, + "balance_loss_mlp": 1.01710987, + "epoch": 0.17218781939517827, + "flos": 22273486128000.0, + "grad_norm": 1.8593450107080032, + "language_loss": 0.83722383, + "learning_rate": 3.791774357250649e-06, + "loss": 0.8588903, + "num_input_tokens_seen": 30290430, + "step": 1432, + "time_per_iteration": 3.4043781757354736 + }, + { + "auxiliary_loss_clip": 0.01194142, + "auxiliary_loss_mlp": 0.01047989, + "balance_loss_clip": 1.0572958, + "balance_loss_mlp": 1.0365808, + "epoch": 0.17230806228581735, + "flos": 14137313592960.0, + "grad_norm": 4.617775161447866, + "language_loss": 0.7929405, + "learning_rate": 3.7914281385639757e-06, + "loss": 0.8153618, + "num_input_tokens_seen": 30308305, + "step": 1433, + "time_per_iteration": 2.5610153675079346 + }, + { + "auxiliary_loss_clip": 0.01211774, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.05750346, + "balance_loss_mlp": 1.0213294, + "epoch": 0.17242830517645644, + "flos": 20704836303360.0, + "grad_norm": 1.827000107992892, + "language_loss": 0.79663563, + "learning_rate": 3.7910816481190784e-06, + "loss": 0.8190735, + "num_input_tokens_seen": 30328120, + "step": 1434, + "time_per_iteration": 2.5127718448638916 + }, + { + "auxiliary_loss_clip": 0.01184745, + "auxiliary_loss_mlp": 0.01035868, + "balance_loss_clip": 1.05493259, + "balance_loss_mlp": 1.02510309, + "epoch": 0.17254854806709552, + "flos": 30774582887040.0, + "grad_norm": 1.8474801919906163, + "language_loss": 0.74619633, + "learning_rate": 3.7907348859685193e-06, + "loss": 0.76840246, + "num_input_tokens_seen": 30349825, + "step": 1435, + "time_per_iteration": 3.346137523651123 + }, + { + "auxiliary_loss_clip": 0.01206522, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.06130481, + "balance_loss_mlp": 1.0219034, + "epoch": 0.17266879095773463, + "flos": 26614726859520.0, + "grad_norm": 1.9294039224398274, + "language_loss": 0.80319411, + "learning_rate": 3.790387852164902e-06, + "loss": 0.82558799, + "num_input_tokens_seen": 30370555, + "step": 1436, + "time_per_iteration": 3.3498544692993164 + }, + { + "auxiliary_loss_clip": 0.01212151, + "auxiliary_loss_mlp": 0.01037983, + "balance_loss_clip": 1.05954683, + "balance_loss_mlp": 1.0276413, + "epoch": 0.1727890338483737, + "flos": 20266295155200.0, + "grad_norm": 1.8149498189135815, + "language_loss": 0.76414484, + "learning_rate": 3.7900405467608707e-06, + "loss": 0.78664619, + "num_input_tokens_seen": 30390100, + "step": 1437, + "time_per_iteration": 3.2390265464782715 + }, + { + "auxiliary_loss_clip": 0.01151838, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.05021501, + "balance_loss_mlp": 1.02249217, + "epoch": 0.1729092767390128, + "flos": 18179812909440.0, + "grad_norm": 3.039020772655978, + "language_loss": 0.78950447, + "learning_rate": 3.7896929698091114e-06, + "loss": 0.8113569, + "num_input_tokens_seen": 30402915, + "step": 1438, + "time_per_iteration": 2.606783151626587 + }, + { + "auxiliary_loss_clip": 0.01232682, + "auxiliary_loss_mlp": 0.01037233, + "balance_loss_clip": 1.06718707, + "balance_loss_mlp": 1.02670717, + "epoch": 0.1730295196296519, + "flos": 26759518583040.0, + "grad_norm": 2.607426514921239, + "language_loss": 0.68094081, + "learning_rate": 3.7893451213623518e-06, + "loss": 0.70363998, + "num_input_tokens_seen": 30420145, + "step": 1439, + "time_per_iteration": 2.5049901008605957 + }, + { + "auxiliary_loss_clip": 0.01212649, + "auxiliary_loss_mlp": 0.00765477, + "balance_loss_clip": 1.06244707, + "balance_loss_mlp": 1.00117683, + "epoch": 0.173149762520291, + "flos": 23842531002240.0, + "grad_norm": 3.394243157308304, + "language_loss": 0.82348162, + "learning_rate": 3.7889970014733606e-06, + "loss": 0.84326291, + "num_input_tokens_seen": 30439250, + "step": 1440, + "time_per_iteration": 2.513359546661377 + }, + { + "auxiliary_loss_clip": 0.01149886, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.05032659, + "balance_loss_mlp": 1.0224576, + "epoch": 0.17327000541093007, + "flos": 23368186972800.0, + "grad_norm": 1.6639654249004643, + "language_loss": 0.77929735, + "learning_rate": 3.7886486101949463e-06, + "loss": 0.80113202, + "num_input_tokens_seen": 30460430, + "step": 1441, + "time_per_iteration": 2.5950827598571777 + }, + { + "auxiliary_loss_clip": 0.01157507, + "auxiliary_loss_mlp": 0.0104258, + "balance_loss_clip": 1.05316794, + "balance_loss_mlp": 1.03186309, + "epoch": 0.17339024830156918, + "flos": 18221290139520.0, + "grad_norm": 2.289054514356873, + "language_loss": 0.88107783, + "learning_rate": 3.7882999475799594e-06, + "loss": 0.90307868, + "num_input_tokens_seen": 30478465, + "step": 1442, + "time_per_iteration": 2.599398374557495 + }, + { + "auxiliary_loss_clip": 0.01150916, + "auxiliary_loss_mlp": 0.01042176, + "balance_loss_clip": 1.05430889, + "balance_loss_mlp": 1.03178072, + "epoch": 0.17351049119220827, + "flos": 23332024955520.0, + "grad_norm": 1.7834880379982347, + "language_loss": 0.81659609, + "learning_rate": 3.787951013681293e-06, + "loss": 0.83852702, + "num_input_tokens_seen": 30496510, + "step": 1443, + "time_per_iteration": 2.5856363773345947 + }, + { + "auxiliary_loss_clip": 0.01208865, + "auxiliary_loss_mlp": 0.01036473, + "balance_loss_clip": 1.05752826, + "balance_loss_mlp": 1.02563691, + "epoch": 0.17363073408284735, + "flos": 23803495896960.0, + "grad_norm": 1.966776227942772, + "language_loss": 0.77606082, + "learning_rate": 3.787601808551879e-06, + "loss": 0.79851425, + "num_input_tokens_seen": 30516325, + "step": 1444, + "time_per_iteration": 2.537043333053589 + }, + { + "auxiliary_loss_clip": 0.01183626, + "auxiliary_loss_mlp": 0.01043524, + "balance_loss_clip": 1.05813372, + "balance_loss_mlp": 1.03313446, + "epoch": 0.17375097697348643, + "flos": 18515290959360.0, + "grad_norm": 2.300733133698246, + "language_loss": 0.83857614, + "learning_rate": 3.7872523322446926e-06, + "loss": 0.86084765, + "num_input_tokens_seen": 30535210, + "step": 1445, + "time_per_iteration": 2.516009569168091 + }, + { + "auxiliary_loss_clip": 0.01172103, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.05243063, + "balance_loss_mlp": 1.02195668, + "epoch": 0.17387121986412554, + "flos": 38877897456000.0, + "grad_norm": 1.643053083521112, + "language_loss": 0.60125393, + "learning_rate": 3.7869025848127478e-06, + "loss": 0.62329233, + "num_input_tokens_seen": 30559405, + "step": 1446, + "time_per_iteration": 2.78249454498291 + }, + { + "auxiliary_loss_clip": 0.01211793, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.05864477, + "balance_loss_mlp": 1.02653623, + "epoch": 0.17399146275476463, + "flos": 20375714960640.0, + "grad_norm": 3.4791127753316378, + "language_loss": 0.8020916, + "learning_rate": 3.786552566309102e-06, + "loss": 0.8245762, + "num_input_tokens_seen": 30577615, + "step": 1447, + "time_per_iteration": 2.478332757949829 + }, + { + "auxiliary_loss_clip": 0.01195006, + "auxiliary_loss_mlp": 0.0076511, + "balance_loss_clip": 1.06208587, + "balance_loss_mlp": 1.00108337, + "epoch": 0.1741117056454037, + "flos": 19164339763200.0, + "grad_norm": 2.2878918011727247, + "language_loss": 0.85866201, + "learning_rate": 3.7862022767868517e-06, + "loss": 0.87826312, + "num_input_tokens_seen": 30595205, + "step": 1448, + "time_per_iteration": 2.522026538848877 + }, + { + "auxiliary_loss_clip": 0.01179713, + "auxiliary_loss_mlp": 0.01041684, + "balance_loss_clip": 1.06241274, + "balance_loss_mlp": 1.03174806, + "epoch": 0.17423194853604282, + "flos": 25374300537600.0, + "grad_norm": 2.04729151966674, + "language_loss": 0.84503436, + "learning_rate": 3.7858517162991367e-06, + "loss": 0.86724836, + "num_input_tokens_seen": 30615280, + "step": 1449, + "time_per_iteration": 2.5765416622161865 + }, + { + "auxiliary_loss_clip": 0.01180928, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.05569458, + "balance_loss_mlp": 1.0242486, + "epoch": 0.1743521914266819, + "flos": 25191874339200.0, + "grad_norm": 2.7340469965967573, + "language_loss": 0.60872662, + "learning_rate": 3.7855008848991363e-06, + "loss": 0.63088852, + "num_input_tokens_seen": 30633485, + "step": 1450, + "time_per_iteration": 2.583102226257324 + }, + { + "auxiliary_loss_clip": 0.01195128, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.06107593, + "balance_loss_mlp": 1.02348495, + "epoch": 0.17447243431732098, + "flos": 25666577504640.0, + "grad_norm": 2.330147076323331, + "language_loss": 0.77673542, + "learning_rate": 3.7851497826400714e-06, + "loss": 0.7990191, + "num_input_tokens_seen": 30653625, + "step": 1451, + "time_per_iteration": 2.557863712310791 + }, + { + "auxiliary_loss_clip": 0.01229698, + "auxiliary_loss_mlp": 0.01038068, + "balance_loss_clip": 1.06405401, + "balance_loss_mlp": 1.02768457, + "epoch": 0.17459267720796007, + "flos": 36281950657920.0, + "grad_norm": 2.1106406793445585, + "language_loss": 0.76206172, + "learning_rate": 3.7847984095752034e-06, + "loss": 0.78473938, + "num_input_tokens_seen": 30677080, + "step": 1452, + "time_per_iteration": 2.570648193359375 + }, + { + "auxiliary_loss_clip": 0.01226917, + "auxiliary_loss_mlp": 0.01029264, + "balance_loss_clip": 1.06208837, + "balance_loss_mlp": 1.0196383, + "epoch": 0.17471292009859918, + "flos": 20011113959040.0, + "grad_norm": 2.0755331146313085, + "language_loss": 0.80040216, + "learning_rate": 3.784446765757836e-06, + "loss": 0.82296395, + "num_input_tokens_seen": 30695725, + "step": 1453, + "time_per_iteration": 2.45017147064209 + }, + { + "auxiliary_loss_clip": 0.01164724, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.05531669, + "balance_loss_mlp": 1.02124727, + "epoch": 0.17483316298923826, + "flos": 27819242559360.0, + "grad_norm": 3.5534701519518057, + "language_loss": 0.77993363, + "learning_rate": 3.7840948512413133e-06, + "loss": 0.80189455, + "num_input_tokens_seen": 30713310, + "step": 1454, + "time_per_iteration": 2.5944151878356934 + }, + { + "auxiliary_loss_clip": 0.01179159, + "auxiliary_loss_mlp": 0.0103326, + "balance_loss_clip": 1.06072724, + "balance_loss_mlp": 1.02219748, + "epoch": 0.17495340587987734, + "flos": 44017934791680.0, + "grad_norm": 2.5993822791654524, + "language_loss": 0.78934515, + "learning_rate": 3.7837426660790196e-06, + "loss": 0.81146932, + "num_input_tokens_seen": 30734725, + "step": 1455, + "time_per_iteration": 2.767207145690918 + }, + { + "auxiliary_loss_clip": 0.01223998, + "auxiliary_loss_mlp": 0.01038741, + "balance_loss_clip": 1.06160498, + "balance_loss_mlp": 1.02909708, + "epoch": 0.17507364877051645, + "flos": 20885825957760.0, + "grad_norm": 2.0307141113853295, + "language_loss": 0.81879014, + "learning_rate": 3.783390210324382e-06, + "loss": 0.84141755, + "num_input_tokens_seen": 30754450, + "step": 1456, + "time_per_iteration": 2.5257537364959717 + }, + { + "auxiliary_loss_clip": 0.01181314, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.05977201, + "balance_loss_mlp": 1.02556431, + "epoch": 0.17519389166115554, + "flos": 24717602136960.0, + "grad_norm": 2.3525366140017274, + "language_loss": 0.72594523, + "learning_rate": 3.7830374840308676e-06, + "loss": 0.74811155, + "num_input_tokens_seen": 30774605, + "step": 1457, + "time_per_iteration": 2.57861328125 + }, + { + "auxiliary_loss_clip": 0.01214546, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.06337225, + "balance_loss_mlp": 1.02612793, + "epoch": 0.17531413455179462, + "flos": 23798144770560.0, + "grad_norm": 2.6763302289819597, + "language_loss": 0.82531404, + "learning_rate": 3.7826844872519842e-06, + "loss": 0.84782714, + "num_input_tokens_seen": 30792460, + "step": 1458, + "time_per_iteration": 3.3131322860717773 + }, + { + "auxiliary_loss_clip": 0.01194301, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.06146669, + "balance_loss_mlp": 1.02572584, + "epoch": 0.1754343774424337, + "flos": 24572379450240.0, + "grad_norm": 2.0018682171355375, + "language_loss": 0.72579277, + "learning_rate": 3.782331220041282e-06, + "loss": 0.74808669, + "num_input_tokens_seen": 30812525, + "step": 1459, + "time_per_iteration": 2.563878297805786 + }, + { + "auxiliary_loss_clip": 0.01189736, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.05854297, + "balance_loss_mlp": 1.02207279, + "epoch": 0.17555462033307281, + "flos": 18114599767680.0, + "grad_norm": 2.036277257370598, + "language_loss": 0.82869148, + "learning_rate": 3.7819776824523504e-06, + "loss": 0.85091221, + "num_input_tokens_seen": 30830390, + "step": 1460, + "time_per_iteration": 2.5676403045654297 + }, + { + "auxiliary_loss_clip": 0.01204211, + "auxiliary_loss_mlp": 0.01040222, + "balance_loss_clip": 1.05950904, + "balance_loss_mlp": 1.03014886, + "epoch": 0.1756748632237119, + "flos": 28366018364160.0, + "grad_norm": 2.5179061985691455, + "language_loss": 0.8388924, + "learning_rate": 3.7816238745388213e-06, + "loss": 0.86133677, + "num_input_tokens_seen": 30849935, + "step": 1461, + "time_per_iteration": 2.55696439743042 + }, + { + "auxiliary_loss_clip": 0.01201058, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.05662966, + "balance_loss_mlp": 1.02056849, + "epoch": 0.17579510611435098, + "flos": 25732939881600.0, + "grad_norm": 1.8010657306687143, + "language_loss": 0.87134552, + "learning_rate": 3.781269796354367e-06, + "loss": 0.89365786, + "num_input_tokens_seen": 30869555, + "step": 1462, + "time_per_iteration": 3.3783860206604004 + }, + { + "auxiliary_loss_clip": 0.01197465, + "auxiliary_loss_mlp": 0.0103752, + "balance_loss_clip": 1.06063175, + "balance_loss_mlp": 1.02770281, + "epoch": 0.1759153490049901, + "flos": 18588081870720.0, + "grad_norm": 2.5380605402366063, + "language_loss": 0.86005658, + "learning_rate": 3.7809154479527006e-06, + "loss": 0.88240635, + "num_input_tokens_seen": 30888760, + "step": 1463, + "time_per_iteration": 3.233886241912842 + }, + { + "auxiliary_loss_clip": 0.01172224, + "auxiliary_loss_mlp": 0.01025993, + "balance_loss_clip": 1.05661798, + "balance_loss_mlp": 1.01668262, + "epoch": 0.17603559189562917, + "flos": 18619323724800.0, + "grad_norm": 2.5019822979176096, + "language_loss": 0.84563631, + "learning_rate": 3.780560829387577e-06, + "loss": 0.86761856, + "num_input_tokens_seen": 30907260, + "step": 1464, + "time_per_iteration": 2.5528883934020996 + }, + { + "auxiliary_loss_clip": 0.01115545, + "auxiliary_loss_mlp": 0.01004668, + "balance_loss_clip": 1.02983701, + "balance_loss_mlp": 1.00184274, + "epoch": 0.17615583478626826, + "flos": 60530775373440.0, + "grad_norm": 0.8603435452945104, + "language_loss": 0.57904196, + "learning_rate": 3.7802059407127915e-06, + "loss": 0.60024405, + "num_input_tokens_seen": 30965810, + "step": 1465, + "time_per_iteration": 3.0251476764678955 + }, + { + "auxiliary_loss_clip": 0.01186926, + "auxiliary_loss_mlp": 0.01040275, + "balance_loss_clip": 1.05393887, + "balance_loss_mlp": 1.03014827, + "epoch": 0.17627607767690734, + "flos": 23616221362560.0, + "grad_norm": 2.394734667225439, + "language_loss": 0.8587805, + "learning_rate": 3.7798507819821797e-06, + "loss": 0.88105249, + "num_input_tokens_seen": 30982935, + "step": 1466, + "time_per_iteration": 2.5601916313171387 + }, + { + "auxiliary_loss_clip": 0.01174548, + "auxiliary_loss_mlp": 0.01040668, + "balance_loss_clip": 1.05751872, + "balance_loss_mlp": 1.03040969, + "epoch": 0.17639632056754645, + "flos": 17639070589440.0, + "grad_norm": 2.145710504875776, + "language_loss": 0.78880048, + "learning_rate": 3.7794953532496197e-06, + "loss": 0.81095266, + "num_input_tokens_seen": 30998840, + "step": 1467, + "time_per_iteration": 2.5489675998687744 + }, + { + "auxiliary_loss_clip": 0.01078406, + "auxiliary_loss_mlp": 0.00755774, + "balance_loss_clip": 1.03803587, + "balance_loss_mlp": 1.00079119, + "epoch": 0.17651656345818553, + "flos": 57932604910080.0, + "grad_norm": 0.8647788128344132, + "language_loss": 0.57967615, + "learning_rate": 3.7791396545690295e-06, + "loss": 0.59801799, + "num_input_tokens_seen": 31060075, + "step": 1468, + "time_per_iteration": 3.138390302658081 + }, + { + "auxiliary_loss_clip": 0.01214006, + "auxiliary_loss_mlp": 0.01039039, + "balance_loss_clip": 1.0654645, + "balance_loss_mlp": 1.02925789, + "epoch": 0.17663680634882462, + "flos": 22929502170240.0, + "grad_norm": 2.017487542380312, + "language_loss": 0.80824858, + "learning_rate": 3.7787836859943685e-06, + "loss": 0.83077902, + "num_input_tokens_seen": 31078800, + "step": 1469, + "time_per_iteration": 2.5002601146698 + }, + { + "auxiliary_loss_clip": 0.01210859, + "auxiliary_loss_mlp": 0.01037613, + "balance_loss_clip": 1.06141591, + "balance_loss_mlp": 1.02715826, + "epoch": 0.17675704923946373, + "flos": 22637979388800.0, + "grad_norm": 3.110624949679782, + "language_loss": 0.78913355, + "learning_rate": 3.7784274475796363e-06, + "loss": 0.81161833, + "num_input_tokens_seen": 31097430, + "step": 1470, + "time_per_iteration": 2.5137393474578857 + }, + { + "auxiliary_loss_clip": 0.01182359, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.05608439, + "balance_loss_mlp": 1.02660394, + "epoch": 0.1768772921301028, + "flos": 27126525795840.0, + "grad_norm": 1.9844180132911435, + "language_loss": 0.76115608, + "learning_rate": 3.7780709393788745e-06, + "loss": 0.78334659, + "num_input_tokens_seen": 31117905, + "step": 1471, + "time_per_iteration": 2.668341636657715 + }, + { + "auxiliary_loss_clip": 0.01226153, + "auxiliary_loss_mlp": 0.01033887, + "balance_loss_clip": 1.06345177, + "balance_loss_mlp": 1.02328384, + "epoch": 0.1769975350207419, + "flos": 19172133014400.0, + "grad_norm": 1.8593704857992914, + "language_loss": 0.75069666, + "learning_rate": 3.777714161446165e-06, + "loss": 0.77329707, + "num_input_tokens_seen": 31137610, + "step": 1472, + "time_per_iteration": 2.4950382709503174 + }, + { + "auxiliary_loss_clip": 0.01209901, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.06192255, + "balance_loss_mlp": 1.0236361, + "epoch": 0.177117777911381, + "flos": 36134932291200.0, + "grad_norm": 3.2682824829348065, + "language_loss": 0.69470155, + "learning_rate": 3.7773571138356304e-06, + "loss": 0.71713722, + "num_input_tokens_seen": 31157780, + "step": 1473, + "time_per_iteration": 2.606398582458496 + }, + { + "auxiliary_loss_clip": 0.01150699, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.0538528, + "balance_loss_mlp": 1.0220269, + "epoch": 0.17723802080202009, + "flos": 22090593052800.0, + "grad_norm": 2.2199699318756223, + "language_loss": 0.89429742, + "learning_rate": 3.776999796601435e-06, + "loss": 0.91611946, + "num_input_tokens_seen": 31176540, + "step": 1474, + "time_per_iteration": 2.593149423599243 + }, + { + "auxiliary_loss_clip": 0.01216273, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.06233168, + "balance_loss_mlp": 1.02558088, + "epoch": 0.17735826369265917, + "flos": 30222671437440.0, + "grad_norm": 2.120814666190962, + "language_loss": 0.72616422, + "learning_rate": 3.776642209797783e-06, + "loss": 0.74868768, + "num_input_tokens_seen": 31198370, + "step": 1475, + "time_per_iteration": 2.5621840953826904 + }, + { + "auxiliary_loss_clip": 0.01206436, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.05831671, + "balance_loss_mlp": 1.01966238, + "epoch": 0.17747850658329825, + "flos": 21397588980480.0, + "grad_norm": 2.0615471154682345, + "language_loss": 0.77852952, + "learning_rate": 3.7762843534789205e-06, + "loss": 0.80090302, + "num_input_tokens_seen": 31217120, + "step": 1476, + "time_per_iteration": 2.5308239459991455 + }, + { + "auxiliary_loss_clip": 0.01201017, + "auxiliary_loss_mlp": 0.01036476, + "balance_loss_clip": 1.05966902, + "balance_loss_mlp": 1.02599788, + "epoch": 0.17759874947393736, + "flos": 16983341856000.0, + "grad_norm": 2.051757868269177, + "language_loss": 0.88242996, + "learning_rate": 3.7759262276991343e-06, + "loss": 0.90480489, + "num_input_tokens_seen": 31234730, + "step": 1477, + "time_per_iteration": 2.500868797302246 + }, + { + "auxiliary_loss_clip": 0.01202108, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.06123781, + "balance_loss_mlp": 1.02109933, + "epoch": 0.17771899236457644, + "flos": 11546107390080.0, + "grad_norm": 2.1246491324841648, + "language_loss": 0.8045938, + "learning_rate": 3.7755678325127506e-06, + "loss": 0.82693148, + "num_input_tokens_seen": 31252410, + "step": 1478, + "time_per_iteration": 2.5225179195404053 + }, + { + "auxiliary_loss_clip": 0.01160192, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.05593753, + "balance_loss_mlp": 1.02323818, + "epoch": 0.17783923525521553, + "flos": 18807747494400.0, + "grad_norm": 1.7661305070959796, + "language_loss": 0.75574833, + "learning_rate": 3.7752091679741393e-06, + "loss": 0.77768469, + "num_input_tokens_seen": 31270200, + "step": 1479, + "time_per_iteration": 2.564220905303955 + }, + { + "auxiliary_loss_clip": 0.01208374, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.0599606, + "balance_loss_mlp": 1.02148116, + "epoch": 0.17795947814585464, + "flos": 30408365773440.0, + "grad_norm": 5.614044235637648, + "language_loss": 0.77694225, + "learning_rate": 3.774850234137708e-06, + "loss": 0.79934788, + "num_input_tokens_seen": 31287495, + "step": 1480, + "time_per_iteration": 2.6010093688964844 + }, + { + "auxiliary_loss_clip": 0.01207028, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.05884469, + "balance_loss_mlp": 1.02364445, + "epoch": 0.17807972103649372, + "flos": 24389055411840.0, + "grad_norm": 2.1911865193383413, + "language_loss": 0.83007723, + "learning_rate": 3.7744910310579076e-06, + "loss": 0.85248655, + "num_input_tokens_seen": 31306420, + "step": 1481, + "time_per_iteration": 2.5197112560272217 + }, + { + "auxiliary_loss_clip": 0.01223508, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.06268179, + "balance_loss_mlp": 1.02483845, + "epoch": 0.1781999639271328, + "flos": 20301559332480.0, + "grad_norm": 1.8918139023434455, + "language_loss": 0.85475373, + "learning_rate": 3.774131558789229e-06, + "loss": 0.87732971, + "num_input_tokens_seen": 31325750, + "step": 1482, + "time_per_iteration": 2.4431729316711426 + }, + { + "auxiliary_loss_clip": 0.01225644, + "auxiliary_loss_mlp": 0.00765284, + "balance_loss_clip": 1.06307089, + "balance_loss_mlp": 1.00107455, + "epoch": 0.1783202068177719, + "flos": 15924479806080.0, + "grad_norm": 4.135116701475838, + "language_loss": 0.69359291, + "learning_rate": 3.773771817386203e-06, + "loss": 0.71350217, + "num_input_tokens_seen": 31343080, + "step": 1483, + "time_per_iteration": 2.4184963703155518 + }, + { + "auxiliary_loss_clip": 0.01192202, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.05722189, + "balance_loss_mlp": 1.02400577, + "epoch": 0.178440449708411, + "flos": 20631758083200.0, + "grad_norm": 1.5005961101316232, + "language_loss": 0.79096091, + "learning_rate": 3.773411806903403e-06, + "loss": 0.81322062, + "num_input_tokens_seen": 31362160, + "step": 1484, + "time_per_iteration": 2.502218723297119 + }, + { + "auxiliary_loss_clip": 0.01153576, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.05243945, + "balance_loss_mlp": 1.02717185, + "epoch": 0.17856069259905008, + "flos": 21686059105920.0, + "grad_norm": 1.6524963486728546, + "language_loss": 0.94740731, + "learning_rate": 3.7730515273954415e-06, + "loss": 0.96932495, + "num_input_tokens_seen": 31380770, + "step": 1485, + "time_per_iteration": 3.400381326675415 + }, + { + "auxiliary_loss_clip": 0.01224606, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.06358218, + "balance_loss_mlp": 1.0232625, + "epoch": 0.17868093548968916, + "flos": 26572962320640.0, + "grad_norm": 2.14686640034565, + "language_loss": 0.84869182, + "learning_rate": 3.772690978916973e-06, + "loss": 0.87126577, + "num_input_tokens_seen": 31400525, + "step": 1486, + "time_per_iteration": 2.4783265590667725 + }, + { + "auxiliary_loss_clip": 0.01208521, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.06012225, + "balance_loss_mlp": 1.02715933, + "epoch": 0.17880117838032827, + "flos": 18581006891520.0, + "grad_norm": 2.1608809041134225, + "language_loss": 0.86329502, + "learning_rate": 3.772330161522693e-06, + "loss": 0.8857587, + "num_input_tokens_seen": 31418435, + "step": 1487, + "time_per_iteration": 2.4422850608825684 + }, + { + "auxiliary_loss_clip": 0.01196208, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.06355524, + "balance_loss_mlp": 1.02513218, + "epoch": 0.17892142127096736, + "flos": 26541217676160.0, + "grad_norm": 2.5894403059861517, + "language_loss": 0.79911166, + "learning_rate": 3.7719690752673365e-06, + "loss": 0.82142949, + "num_input_tokens_seen": 31439230, + "step": 1488, + "time_per_iteration": 3.3761420249938965 + }, + { + "auxiliary_loss_clip": 0.01183188, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.06011152, + "balance_loss_mlp": 1.01998961, + "epoch": 0.17904166416160644, + "flos": 23872623621120.0, + "grad_norm": 2.8690891417482254, + "language_loss": 0.78141117, + "learning_rate": 3.7716077202056796e-06, + "loss": 0.8035441, + "num_input_tokens_seen": 31457705, + "step": 1489, + "time_per_iteration": 3.3504977226257324 + }, + { + "auxiliary_loss_clip": 0.01179548, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.05524683, + "balance_loss_mlp": 1.02336144, + "epoch": 0.17916190705224552, + "flos": 19134426712320.0, + "grad_norm": 2.30500063364801, + "language_loss": 0.933725, + "learning_rate": 3.7712460963925404e-06, + "loss": 0.95585418, + "num_input_tokens_seen": 31473645, + "step": 1490, + "time_per_iteration": 3.2638635635375977 + }, + { + "auxiliary_loss_clip": 0.0118458, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.05435503, + "balance_loss_mlp": 1.02389932, + "epoch": 0.17928214994288463, + "flos": 25152120961920.0, + "grad_norm": 1.7279530611083593, + "language_loss": 0.75492775, + "learning_rate": 3.7708842038827775e-06, + "loss": 0.77711707, + "num_input_tokens_seen": 31492605, + "step": 1491, + "time_per_iteration": 2.5277762413024902 + }, + { + "auxiliary_loss_clip": 0.01207567, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.05812025, + "balance_loss_mlp": 1.02778161, + "epoch": 0.17940239283352372, + "flos": 22384629786240.0, + "grad_norm": 1.8585485688720762, + "language_loss": 0.85827363, + "learning_rate": 3.770522042731288e-06, + "loss": 0.88072503, + "num_input_tokens_seen": 31514500, + "step": 1492, + "time_per_iteration": 2.521674871444702 + }, + { + "auxiliary_loss_clip": 0.01158255, + "auxiliary_loss_mlp": 0.01041193, + "balance_loss_clip": 1.05639625, + "balance_loss_mlp": 1.0308156, + "epoch": 0.1795226357241628, + "flos": 23178685795200.0, + "grad_norm": 2.076235950623573, + "language_loss": 0.87755251, + "learning_rate": 3.7701596129930122e-06, + "loss": 0.89954704, + "num_input_tokens_seen": 31533225, + "step": 1493, + "time_per_iteration": 2.576296329498291 + }, + { + "auxiliary_loss_clip": 0.01188899, + "auxiliary_loss_mlp": 0.01027823, + "balance_loss_clip": 1.05858219, + "balance_loss_mlp": 1.01645041, + "epoch": 0.1796428786148019, + "flos": 22090413484800.0, + "grad_norm": 2.785458796449936, + "language_loss": 0.73371637, + "learning_rate": 3.7697969147229315e-06, + "loss": 0.75588357, + "num_input_tokens_seen": 31551385, + "step": 1494, + "time_per_iteration": 2.5533249378204346 + }, + { + "auxiliary_loss_clip": 0.01205632, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.05881965, + "balance_loss_mlp": 1.02400994, + "epoch": 0.179763121505441, + "flos": 21324618501120.0, + "grad_norm": 1.8922107388331693, + "language_loss": 0.85288066, + "learning_rate": 3.7694339479760647e-06, + "loss": 0.87527978, + "num_input_tokens_seen": 31570415, + "step": 1495, + "time_per_iteration": 2.4822475910186768 + }, + { + "auxiliary_loss_clip": 0.01091332, + "auxiliary_loss_mlp": 0.01001082, + "balance_loss_clip": 1.01931763, + "balance_loss_mlp": 0.9981972, + "epoch": 0.17988336439608008, + "flos": 68161864815360.0, + "grad_norm": 0.7710145109062748, + "language_loss": 0.5732367, + "learning_rate": 3.769070712807476e-06, + "loss": 0.5941608, + "num_input_tokens_seen": 31632445, + "step": 1496, + "time_per_iteration": 3.1312928199768066 + }, + { + "auxiliary_loss_clip": 0.01137998, + "auxiliary_loss_mlp": 0.01037964, + "balance_loss_clip": 1.05315256, + "balance_loss_mlp": 1.02716374, + "epoch": 0.18000360728671919, + "flos": 21945047143680.0, + "grad_norm": 2.059828841777802, + "language_loss": 0.78899086, + "learning_rate": 3.768707209272266e-06, + "loss": 0.81075048, + "num_input_tokens_seen": 31652575, + "step": 1497, + "time_per_iteration": 2.6141035556793213 + }, + { + "auxiliary_loss_clip": 0.01191001, + "auxiliary_loss_mlp": 0.01036506, + "balance_loss_clip": 1.05778718, + "balance_loss_mlp": 1.02616453, + "epoch": 0.18012385017735827, + "flos": 18986330937600.0, + "grad_norm": 2.6896338818081627, + "language_loss": 0.76710296, + "learning_rate": 3.768343437425579e-06, + "loss": 0.78937799, + "num_input_tokens_seen": 31671145, + "step": 1498, + "time_per_iteration": 2.5079545974731445 + }, + { + "auxiliary_loss_clip": 0.01127163, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.05173874, + "balance_loss_mlp": 1.02086246, + "epoch": 0.18024409306799735, + "flos": 19748103598080.0, + "grad_norm": 2.452200306391173, + "language_loss": 0.86054248, + "learning_rate": 3.7679793973225987e-06, + "loss": 0.88212579, + "num_input_tokens_seen": 31686955, + "step": 1499, + "time_per_iteration": 2.615321397781372 + }, + { + "auxiliary_loss_clip": 0.01058154, + "auxiliary_loss_mlp": 0.01005576, + "balance_loss_clip": 1.01552987, + "balance_loss_mlp": 1.00278652, + "epoch": 0.18036433595863643, + "flos": 67227183060480.0, + "grad_norm": 0.8476909131912282, + "language_loss": 0.61583358, + "learning_rate": 3.767615089018549e-06, + "loss": 0.63647091, + "num_input_tokens_seen": 31749300, + "step": 1500, + "time_per_iteration": 3.1136059761047363 + }, + { + "auxiliary_loss_clip": 0.01190542, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.05844581, + "balance_loss_mlp": 1.02779269, + "epoch": 0.18048457884927555, + "flos": 18181464935040.0, + "grad_norm": 2.1585453393438128, + "language_loss": 0.86048263, + "learning_rate": 3.7672505125686966e-06, + "loss": 0.8827728, + "num_input_tokens_seen": 31765665, + "step": 1501, + "time_per_iteration": 2.488764524459839 + }, + { + "auxiliary_loss_clip": 0.01164793, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.05304003, + "balance_loss_mlp": 1.02628005, + "epoch": 0.18060482173991463, + "flos": 15813767111040.0, + "grad_norm": 4.815674135063707, + "language_loss": 0.84644783, + "learning_rate": 3.7668856680283455e-06, + "loss": 0.86845827, + "num_input_tokens_seen": 31782690, + "step": 1502, + "time_per_iteration": 2.564213752746582 + }, + { + "auxiliary_loss_clip": 0.01200184, + "auxiliary_loss_mlp": 0.0103763, + "balance_loss_clip": 1.05964828, + "balance_loss_mlp": 1.02733052, + "epoch": 0.1807250646305537, + "flos": 18587399512320.0, + "grad_norm": 2.229551989617593, + "language_loss": 0.82122278, + "learning_rate": 3.7665205554528437e-06, + "loss": 0.84360099, + "num_input_tokens_seen": 31802045, + "step": 1503, + "time_per_iteration": 2.500936985015869 + }, + { + "auxiliary_loss_clip": 0.01198514, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.06229758, + "balance_loss_mlp": 1.02053142, + "epoch": 0.18084530752119282, + "flos": 23149131880320.0, + "grad_norm": 1.6404050933274024, + "language_loss": 0.7419793, + "learning_rate": 3.7661551748975782e-06, + "loss": 0.76427031, + "num_input_tokens_seen": 31820220, + "step": 1504, + "time_per_iteration": 2.5301711559295654 + }, + { + "auxiliary_loss_clip": 0.01090165, + "auxiliary_loss_mlp": 0.01005402, + "balance_loss_clip": 1.01758313, + "balance_loss_mlp": 1.002684, + "epoch": 0.1809655504118319, + "flos": 59803153568640.0, + "grad_norm": 0.8174909049567459, + "language_loss": 0.60562027, + "learning_rate": 3.7657895264179772e-06, + "loss": 0.62657583, + "num_input_tokens_seen": 31876195, + "step": 1505, + "time_per_iteration": 3.0768678188323975 + }, + { + "auxiliary_loss_clip": 0.01185587, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.05530119, + "balance_loss_mlp": 1.0249033, + "epoch": 0.181085793302471, + "flos": 44201941188480.0, + "grad_norm": 1.8123746817660942, + "language_loss": 0.74597001, + "learning_rate": 3.765423610069509e-06, + "loss": 0.76817143, + "num_input_tokens_seen": 31901585, + "step": 1506, + "time_per_iteration": 2.7013838291168213 + }, + { + "auxiliary_loss_clip": 0.01195532, + "auxiliary_loss_mlp": 0.01035249, + "balance_loss_clip": 1.06085432, + "balance_loss_mlp": 1.02502656, + "epoch": 0.18120603619311007, + "flos": 34898384638080.0, + "grad_norm": 2.4378971215275014, + "language_loss": 0.72365868, + "learning_rate": 3.765057425907683e-06, + "loss": 0.74596649, + "num_input_tokens_seen": 31923045, + "step": 1507, + "time_per_iteration": 2.635519027709961 + }, + { + "auxiliary_loss_clip": 0.01211203, + "auxiliary_loss_mlp": 0.01037031, + "balance_loss_clip": 1.05874825, + "balance_loss_mlp": 1.02634406, + "epoch": 0.18132627908374918, + "flos": 21506757390720.0, + "grad_norm": 1.8331419775078532, + "language_loss": 0.78635204, + "learning_rate": 3.764690973988048e-06, + "loss": 0.80883437, + "num_input_tokens_seen": 31943385, + "step": 1508, + "time_per_iteration": 2.483566999435425 + }, + { + "auxiliary_loss_clip": 0.01182159, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.05710387, + "balance_loss_mlp": 1.02067876, + "epoch": 0.18144652197438826, + "flos": 29057693633280.0, + "grad_norm": 2.0050978769377745, + "language_loss": 0.73541033, + "learning_rate": 3.7643242543661967e-06, + "loss": 0.7575376, + "num_input_tokens_seen": 31966045, + "step": 1509, + "time_per_iteration": 2.604801654815674 + }, + { + "auxiliary_loss_clip": 0.01079274, + "auxiliary_loss_mlp": 0.01005226, + "balance_loss_clip": 1.01417327, + "balance_loss_mlp": 1.00267494, + "epoch": 0.18156676486502735, + "flos": 68675064382080.0, + "grad_norm": 0.8198320170935484, + "language_loss": 0.6058796, + "learning_rate": 3.7639572670977573e-06, + "loss": 0.6267246, + "num_input_tokens_seen": 32021540, + "step": 1510, + "time_per_iteration": 2.9759364128112793 + }, + { + "auxiliary_loss_clip": 0.01181318, + "auxiliary_loss_mlp": 0.01038909, + "balance_loss_clip": 1.05604398, + "balance_loss_mlp": 1.02858591, + "epoch": 0.18168700775566646, + "flos": 26471515334400.0, + "grad_norm": 1.641609626706433, + "language_loss": 0.76623172, + "learning_rate": 3.7635900122384042e-06, + "loss": 0.78843403, + "num_input_tokens_seen": 32044535, + "step": 1511, + "time_per_iteration": 3.4064042568206787 + }, + { + "auxiliary_loss_clip": 0.01196113, + "auxiliary_loss_mlp": 0.01039114, + "balance_loss_clip": 1.05631971, + "balance_loss_mlp": 1.02799785, + "epoch": 0.18180725064630554, + "flos": 15005668884480.0, + "grad_norm": 2.1291553256178575, + "language_loss": 0.86615384, + "learning_rate": 3.7632224898438477e-06, + "loss": 0.88850605, + "num_input_tokens_seen": 32061010, + "step": 1512, + "time_per_iteration": 2.482506275177002 + }, + { + "auxiliary_loss_clip": 0.0118469, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.055457, + "balance_loss_mlp": 1.02485681, + "epoch": 0.18192749353694462, + "flos": 19682387665920.0, + "grad_norm": 2.886926856274682, + "language_loss": 0.79017317, + "learning_rate": 3.762854699969842e-06, + "loss": 0.8123678, + "num_input_tokens_seen": 32081520, + "step": 1513, + "time_per_iteration": 2.5991318225860596 + }, + { + "auxiliary_loss_clip": 0.01207685, + "auxiliary_loss_mlp": 0.01044291, + "balance_loss_clip": 1.06301165, + "balance_loss_mlp": 1.03283465, + "epoch": 0.1820477364275837, + "flos": 20702717400960.0, + "grad_norm": 2.115267562581626, + "language_loss": 0.73356509, + "learning_rate": 3.762486642672179e-06, + "loss": 0.7560848, + "num_input_tokens_seen": 32098460, + "step": 1514, + "time_per_iteration": 2.4877190589904785 + }, + { + "auxiliary_loss_clip": 0.01192099, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.05647898, + "balance_loss_mlp": 1.0235827, + "epoch": 0.18216797931822282, + "flos": 17128708197120.0, + "grad_norm": 1.8794035107029001, + "language_loss": 0.86551112, + "learning_rate": 3.7621183180066946e-06, + "loss": 0.88777089, + "num_input_tokens_seen": 32116420, + "step": 1515, + "time_per_iteration": 3.2551310062408447 + }, + { + "auxiliary_loss_clip": 0.01192733, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.05649102, + "balance_loss_mlp": 1.02612936, + "epoch": 0.1822882222088619, + "flos": 29242561956480.0, + "grad_norm": 1.5951712589873097, + "language_loss": 0.73647267, + "learning_rate": 3.7617497260292625e-06, + "loss": 0.75876468, + "num_input_tokens_seen": 32138475, + "step": 1516, + "time_per_iteration": 4.141033887863159 + }, + { + "auxiliary_loss_clip": 0.01186952, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.05819154, + "balance_loss_mlp": 1.02592182, + "epoch": 0.18240846509950098, + "flos": 17702739446400.0, + "grad_norm": 3.0684618804181256, + "language_loss": 0.78448355, + "learning_rate": 3.7613808667957967e-06, + "loss": 0.80672026, + "num_input_tokens_seen": 32151165, + "step": 1517, + "time_per_iteration": 2.4984993934631348 + }, + { + "auxiliary_loss_clip": 0.01194356, + "auxiliary_loss_mlp": 0.01037302, + "balance_loss_clip": 1.05674934, + "balance_loss_mlp": 1.02720511, + "epoch": 0.1825287079901401, + "flos": 14790025584000.0, + "grad_norm": 3.027983933143837, + "language_loss": 0.91451651, + "learning_rate": 3.7610117403622547e-06, + "loss": 0.93683314, + "num_input_tokens_seen": 32167725, + "step": 1518, + "time_per_iteration": 2.472708225250244 + }, + { + "auxiliary_loss_clip": 0.0117019, + "auxiliary_loss_mlp": 0.01039865, + "balance_loss_clip": 1.05195713, + "balance_loss_mlp": 1.02885652, + "epoch": 0.18264895088077918, + "flos": 21946232292480.0, + "grad_norm": 1.8492294924193642, + "language_loss": 0.90191853, + "learning_rate": 3.7606423467846313e-06, + "loss": 0.92401904, + "num_input_tokens_seen": 32187330, + "step": 1519, + "time_per_iteration": 2.6485531330108643 + }, + { + "auxiliary_loss_clip": 0.01185295, + "auxiliary_loss_mlp": 0.01041486, + "balance_loss_clip": 1.05972195, + "balance_loss_mlp": 1.03106749, + "epoch": 0.18276919377141826, + "flos": 20886759711360.0, + "grad_norm": 14.04040563725391, + "language_loss": 0.79334271, + "learning_rate": 3.760272686118964e-06, + "loss": 0.81561053, + "num_input_tokens_seen": 32205550, + "step": 1520, + "time_per_iteration": 2.591616153717041 + }, + { + "auxiliary_loss_clip": 0.01194601, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.05793369, + "balance_loss_mlp": 1.02856064, + "epoch": 0.18288943666205737, + "flos": 21469877101440.0, + "grad_norm": 1.973854885036507, + "language_loss": 0.93053085, + "learning_rate": 3.7599027584213297e-06, + "loss": 0.95286262, + "num_input_tokens_seen": 32224430, + "step": 1521, + "time_per_iteration": 2.54314923286438 + }, + { + "auxiliary_loss_clip": 0.01211816, + "auxiliary_loss_mlp": 0.01035667, + "balance_loss_clip": 1.05785823, + "balance_loss_mlp": 1.0248791, + "epoch": 0.18300967955269645, + "flos": 21539363961600.0, + "grad_norm": 1.9424408152258148, + "language_loss": 0.77804428, + "learning_rate": 3.7595325637478465e-06, + "loss": 0.80051911, + "num_input_tokens_seen": 32242455, + "step": 1522, + "time_per_iteration": 2.4820189476013184 + }, + { + "auxiliary_loss_clip": 0.01184092, + "auxiliary_loss_mlp": 0.01041946, + "balance_loss_clip": 1.05768704, + "balance_loss_mlp": 1.03043008, + "epoch": 0.18312992244333554, + "flos": 28876237102080.0, + "grad_norm": 1.7642726891077634, + "language_loss": 0.81808722, + "learning_rate": 3.7591621021546723e-06, + "loss": 0.84034759, + "num_input_tokens_seen": 32264450, + "step": 1523, + "time_per_iteration": 2.5989696979522705 + }, + { + "auxiliary_loss_clip": 0.01200657, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.0563817, + "balance_loss_mlp": 1.02371347, + "epoch": 0.18325016533397462, + "flos": 20120102801280.0, + "grad_norm": 1.7926776247299785, + "language_loss": 0.81478602, + "learning_rate": 3.7587913736980062e-06, + "loss": 0.8371526, + "num_input_tokens_seen": 32284090, + "step": 1524, + "time_per_iteration": 2.4868593215942383 + }, + { + "auxiliary_loss_clip": 0.01131655, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.04698801, + "balance_loss_mlp": 1.02510512, + "epoch": 0.18337040822461373, + "flos": 23329187781120.0, + "grad_norm": 1.7407891259193067, + "language_loss": 0.84390593, + "learning_rate": 3.7584203784340865e-06, + "loss": 0.86557591, + "num_input_tokens_seen": 32303260, + "step": 1525, + "time_per_iteration": 2.6048583984375 + }, + { + "auxiliary_loss_clip": 0.01188586, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.05424452, + "balance_loss_mlp": 1.02752364, + "epoch": 0.1834906511152528, + "flos": 25009555881600.0, + "grad_norm": 2.5529212625787383, + "language_loss": 0.85962021, + "learning_rate": 3.7580491164191938e-06, + "loss": 0.88188553, + "num_input_tokens_seen": 32321570, + "step": 1526, + "time_per_iteration": 2.539632558822632 + }, + { + "auxiliary_loss_clip": 0.01099654, + "auxiliary_loss_mlp": 0.01001831, + "balance_loss_clip": 1.0152297, + "balance_loss_mlp": 0.99922007, + "epoch": 0.1836108940058919, + "flos": 67251493589760.0, + "grad_norm": 0.7499431823730937, + "language_loss": 0.6128704, + "learning_rate": 3.757677587709648e-06, + "loss": 0.6338852, + "num_input_tokens_seen": 32384835, + "step": 1527, + "time_per_iteration": 3.1587576866149902 + }, + { + "auxiliary_loss_clip": 0.01172905, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.05639935, + "balance_loss_mlp": 1.02395582, + "epoch": 0.183731136896531, + "flos": 25738721971200.0, + "grad_norm": 1.8944007541654928, + "language_loss": 0.7549386, + "learning_rate": 3.7573057923618095e-06, + "loss": 0.77701271, + "num_input_tokens_seen": 32404930, + "step": 1528, + "time_per_iteration": 2.5747475624084473 + }, + { + "auxiliary_loss_clip": 0.0116084, + "auxiliary_loss_mlp": 0.01033898, + "balance_loss_clip": 1.05130363, + "balance_loss_mlp": 1.02299035, + "epoch": 0.1838513797871701, + "flos": 20449403712000.0, + "grad_norm": 2.319120587355637, + "language_loss": 0.74532473, + "learning_rate": 3.7569337304320793e-06, + "loss": 0.76727211, + "num_input_tokens_seen": 32424515, + "step": 1529, + "time_per_iteration": 2.5776753425598145 + }, + { + "auxiliary_loss_clip": 0.01086957, + "auxiliary_loss_mlp": 0.01002228, + "balance_loss_clip": 1.01637292, + "balance_loss_mlp": 0.99959368, + "epoch": 0.18397162267780917, + "flos": 68565141786240.0, + "grad_norm": 0.8363631840036638, + "language_loss": 0.6446051, + "learning_rate": 3.756561401976899e-06, + "loss": 0.66549695, + "num_input_tokens_seen": 32484220, + "step": 1530, + "time_per_iteration": 2.956167221069336 + }, + { + "auxiliary_loss_clip": 0.01224812, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.06176674, + "balance_loss_mlp": 1.02512646, + "epoch": 0.18409186556844825, + "flos": 31941104976000.0, + "grad_norm": 2.1904964011398285, + "language_loss": 0.82274997, + "learning_rate": 3.7561888070527514e-06, + "loss": 0.84534919, + "num_input_tokens_seen": 32506260, + "step": 1531, + "time_per_iteration": 2.5439414978027344 + }, + { + "auxiliary_loss_clip": 0.01162075, + "auxiliary_loss_mlp": 0.00764837, + "balance_loss_clip": 1.0536437, + "balance_loss_mlp": 1.0014441, + "epoch": 0.18421210845908736, + "flos": 20120533764480.0, + "grad_norm": 2.428912250226019, + "language_loss": 0.79895461, + "learning_rate": 3.7558159457161577e-06, + "loss": 0.81822371, + "num_input_tokens_seen": 32524225, + "step": 1532, + "time_per_iteration": 2.539628028869629 + }, + { + "auxiliary_loss_clip": 0.011954, + "auxiliary_loss_mlp": 0.00765489, + "balance_loss_clip": 1.06018102, + "balance_loss_mlp": 1.00158858, + "epoch": 0.18433235134972645, + "flos": 23110491824640.0, + "grad_norm": 6.634434469066184, + "language_loss": 0.77942562, + "learning_rate": 3.755442818023681e-06, + "loss": 0.79903448, + "num_input_tokens_seen": 32543850, + "step": 1533, + "time_per_iteration": 2.5281434059143066 + }, + { + "auxiliary_loss_clip": 0.01180476, + "auxiliary_loss_mlp": 0.01035342, + "balance_loss_clip": 1.05730462, + "balance_loss_mlp": 1.02569175, + "epoch": 0.18445259424036553, + "flos": 18291351617280.0, + "grad_norm": 2.8968256035707185, + "language_loss": 0.76268566, + "learning_rate": 3.7550694240319246e-06, + "loss": 0.7848438, + "num_input_tokens_seen": 32561725, + "step": 1534, + "time_per_iteration": 2.5207998752593994 + }, + { + "auxiliary_loss_clip": 0.01209239, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.05771995, + "balance_loss_mlp": 1.02021646, + "epoch": 0.18457283713100464, + "flos": 21324079797120.0, + "grad_norm": 2.239984944499428, + "language_loss": 0.76452267, + "learning_rate": 3.7546957637975326e-06, + "loss": 0.78691697, + "num_input_tokens_seen": 32579135, + "step": 1535, + "time_per_iteration": 2.4840400218963623 + }, + { + "auxiliary_loss_clip": 0.01135292, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.04442894, + "balance_loss_mlp": 1.02522469, + "epoch": 0.18469308002164372, + "flos": 20375679047040.0, + "grad_norm": 1.7036993579069537, + "language_loss": 0.74113375, + "learning_rate": 3.7543218373771873e-06, + "loss": 0.76283795, + "num_input_tokens_seen": 32598460, + "step": 1536, + "time_per_iteration": 2.6975834369659424 + }, + { + "auxiliary_loss_clip": 0.01141217, + "auxiliary_loss_mlp": 0.00764856, + "balance_loss_clip": 1.05184948, + "balance_loss_mlp": 1.00143695, + "epoch": 0.1848133229122828, + "flos": 26435892021120.0, + "grad_norm": 1.3894025450513123, + "language_loss": 0.78139925, + "learning_rate": 3.753947644827615e-06, + "loss": 0.80045998, + "num_input_tokens_seen": 32621920, + "step": 1537, + "time_per_iteration": 3.4919273853302 + }, + { + "auxiliary_loss_clip": 0.01088291, + "auxiliary_loss_mlp": 0.01009532, + "balance_loss_clip": 1.01562738, + "balance_loss_mlp": 1.00688541, + "epoch": 0.1849335658029219, + "flos": 70547447612160.0, + "grad_norm": 1.0739096033911575, + "language_loss": 0.57230234, + "learning_rate": 3.753573186205579e-06, + "loss": 0.59328055, + "num_input_tokens_seen": 32690040, + "step": 1538, + "time_per_iteration": 3.2244858741760254 + }, + { + "auxiliary_loss_clip": 0.01179598, + "auxiliary_loss_mlp": 0.00765086, + "balance_loss_clip": 1.05210328, + "balance_loss_mlp": 1.0014925, + "epoch": 0.185053808693561, + "flos": 17384140788480.0, + "grad_norm": 1.9704461567534102, + "language_loss": 0.78313398, + "learning_rate": 3.753198461567885e-06, + "loss": 0.80258083, + "num_input_tokens_seen": 32707285, + "step": 1539, + "time_per_iteration": 2.5476536750793457 + }, + { + "auxiliary_loss_clip": 0.01170879, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.05728459, + "balance_loss_mlp": 1.02695787, + "epoch": 0.18517405158420008, + "flos": 28986159697920.0, + "grad_norm": 1.7176315458416926, + "language_loss": 0.91951835, + "learning_rate": 3.7528234709713783e-06, + "loss": 0.94159728, + "num_input_tokens_seen": 32730030, + "step": 1540, + "time_per_iteration": 2.602083921432495 + }, + { + "auxiliary_loss_clip": 0.01208462, + "auxiliary_loss_mlp": 0.01032546, + "balance_loss_clip": 1.06012011, + "balance_loss_mlp": 1.02262151, + "epoch": 0.18529429447483917, + "flos": 26794962328320.0, + "grad_norm": 1.8283771395548039, + "language_loss": 0.84491968, + "learning_rate": 3.7524482144729447e-06, + "loss": 0.86732972, + "num_input_tokens_seen": 32749485, + "step": 1541, + "time_per_iteration": 3.2986974716186523 + }, + { + "auxiliary_loss_clip": 0.01171286, + "auxiliary_loss_mlp": 0.01043933, + "balance_loss_clip": 1.05188191, + "balance_loss_mlp": 1.03343678, + "epoch": 0.18541453736547828, + "flos": 13581595301760.0, + "grad_norm": 2.480807528985103, + "language_loss": 0.83378088, + "learning_rate": 3.7520726921295106e-06, + "loss": 0.85593301, + "num_input_tokens_seen": 32766205, + "step": 1542, + "time_per_iteration": 4.065120458602905 + }, + { + "auxiliary_loss_clip": 0.01200754, + "auxiliary_loss_mlp": 0.01039686, + "balance_loss_clip": 1.05371547, + "balance_loss_mlp": 1.02949405, + "epoch": 0.18553478025611736, + "flos": 24025424077440.0, + "grad_norm": 2.22806396432734, + "language_loss": 0.72247803, + "learning_rate": 3.751696903998042e-06, + "loss": 0.74488246, + "num_input_tokens_seen": 32784840, + "step": 1543, + "time_per_iteration": 2.505246639251709 + }, + { + "auxiliary_loss_clip": 0.01204232, + "auxiliary_loss_mlp": 0.01036615, + "balance_loss_clip": 1.05924189, + "balance_loss_mlp": 1.02627397, + "epoch": 0.18565502314675644, + "flos": 25885165720320.0, + "grad_norm": 1.7283249473649136, + "language_loss": 0.70012486, + "learning_rate": 3.7513208501355456e-06, + "loss": 0.72253335, + "num_input_tokens_seen": 32805945, + "step": 1544, + "time_per_iteration": 2.523005485534668 + }, + { + "auxiliary_loss_clip": 0.01185312, + "auxiliary_loss_mlp": 0.01038154, + "balance_loss_clip": 1.05376899, + "balance_loss_mlp": 1.02830148, + "epoch": 0.18577526603739553, + "flos": 19610063631360.0, + "grad_norm": 1.9039222170045924, + "language_loss": 0.83655876, + "learning_rate": 3.750944530599069e-06, + "loss": 0.85879338, + "num_input_tokens_seen": 32825515, + "step": 1545, + "time_per_iteration": 2.499042510986328 + }, + { + "auxiliary_loss_clip": 0.01212124, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.06054342, + "balance_loss_mlp": 1.02138519, + "epoch": 0.18589550892803464, + "flos": 18474891137280.0, + "grad_norm": 2.1735151523607077, + "language_loss": 0.80876112, + "learning_rate": 3.7505679454456992e-06, + "loss": 0.83119869, + "num_input_tokens_seen": 32842125, + "step": 1546, + "time_per_iteration": 2.4463369846343994 + }, + { + "auxiliary_loss_clip": 0.01124309, + "auxiliary_loss_mlp": 0.01033847, + "balance_loss_clip": 1.0475862, + "balance_loss_mlp": 1.02347589, + "epoch": 0.18601575181867372, + "flos": 23549966726400.0, + "grad_norm": 2.6711911108814483, + "language_loss": 0.70050895, + "learning_rate": 3.750191094732564e-06, + "loss": 0.72209048, + "num_input_tokens_seen": 32862990, + "step": 1547, + "time_per_iteration": 2.6764867305755615 + }, + { + "auxiliary_loss_clip": 0.01125746, + "auxiliary_loss_mlp": 0.00765346, + "balance_loss_clip": 1.04800963, + "balance_loss_mlp": 1.00135374, + "epoch": 0.1861359947093128, + "flos": 26360192108160.0, + "grad_norm": 1.8303319388126953, + "language_loss": 0.75306904, + "learning_rate": 3.7498139785168313e-06, + "loss": 0.77197999, + "num_input_tokens_seen": 32883595, + "step": 1548, + "time_per_iteration": 2.695932626724243 + }, + { + "auxiliary_loss_clip": 0.01204195, + "auxiliary_loss_mlp": 0.01041068, + "balance_loss_clip": 1.06015885, + "balance_loss_mlp": 1.03069091, + "epoch": 0.1862562375999519, + "flos": 23331198942720.0, + "grad_norm": 1.7108148190525867, + "language_loss": 0.77229643, + "learning_rate": 3.749436596855709e-06, + "loss": 0.79474908, + "num_input_tokens_seen": 32902895, + "step": 1549, + "time_per_iteration": 2.4954729080200195 + }, + { + "auxiliary_loss_clip": 0.0119877, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.05473328, + "balance_loss_mlp": 1.0232482, + "epoch": 0.186376480490591, + "flos": 16648222942080.0, + "grad_norm": 1.9650986955331127, + "language_loss": 0.90583658, + "learning_rate": 3.749058949806446e-06, + "loss": 0.92816412, + "num_input_tokens_seen": 32919620, + "step": 1550, + "time_per_iteration": 2.4449243545532227 + }, + { + "auxiliary_loss_clip": 0.01205446, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.0560472, + "balance_loss_mlp": 1.02285159, + "epoch": 0.18649672338123008, + "flos": 21468656039040.0, + "grad_norm": 1.7018593210245228, + "language_loss": 0.84361422, + "learning_rate": 3.748681037426331e-06, + "loss": 0.86599714, + "num_input_tokens_seen": 32938830, + "step": 1551, + "time_per_iteration": 2.491765022277832 + }, + { + "auxiliary_loss_clip": 0.01223168, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.06118083, + "balance_loss_mlp": 1.03203535, + "epoch": 0.1866169662718692, + "flos": 12312728386560.0, + "grad_norm": 2.212283527900075, + "language_loss": 0.91793215, + "learning_rate": 3.7483028597726936e-06, + "loss": 0.94057941, + "num_input_tokens_seen": 32955600, + "step": 1552, + "time_per_iteration": 2.4316203594207764 + }, + { + "auxiliary_loss_clip": 0.01174117, + "auxiliary_loss_mlp": 0.01040259, + "balance_loss_clip": 1.05496562, + "balance_loss_mlp": 1.0296253, + "epoch": 0.18673720916250827, + "flos": 23581280407680.0, + "grad_norm": 1.8074641017375466, + "language_loss": 0.62250221, + "learning_rate": 3.7479244169029017e-06, + "loss": 0.64464593, + "num_input_tokens_seen": 32975390, + "step": 1553, + "time_per_iteration": 2.5727899074554443 + }, + { + "auxiliary_loss_clip": 0.01206585, + "auxiliary_loss_mlp": 0.01027515, + "balance_loss_clip": 1.05372, + "balance_loss_mlp": 1.01744759, + "epoch": 0.18685745205314735, + "flos": 19718370115200.0, + "grad_norm": 2.6761238943100425, + "language_loss": 0.73541552, + "learning_rate": 3.7475457088743658e-06, + "loss": 0.75775647, + "num_input_tokens_seen": 32992640, + "step": 1554, + "time_per_iteration": 2.453447103500366 + }, + { + "auxiliary_loss_clip": 0.01181812, + "auxiliary_loss_mlp": 0.01038304, + "balance_loss_clip": 1.05492425, + "balance_loss_mlp": 1.02673423, + "epoch": 0.18697769494378644, + "flos": 34204123589760.0, + "grad_norm": 2.4742331589988753, + "language_loss": 0.74778897, + "learning_rate": 3.7471667357445348e-06, + "loss": 0.76999015, + "num_input_tokens_seen": 33012470, + "step": 1555, + "time_per_iteration": 2.6185944080352783 + }, + { + "auxiliary_loss_clip": 0.01146544, + "auxiliary_loss_mlp": 0.01028179, + "balance_loss_clip": 1.0525583, + "balance_loss_mlp": 1.01854062, + "epoch": 0.18709793783442555, + "flos": 34241327101440.0, + "grad_norm": 1.9042703520351787, + "language_loss": 0.72501665, + "learning_rate": 3.7467874975709e-06, + "loss": 0.74676389, + "num_input_tokens_seen": 33033275, + "step": 1556, + "time_per_iteration": 2.704758644104004 + }, + { + "auxiliary_loss_clip": 0.01211658, + "auxiliary_loss_mlp": 0.01043469, + "balance_loss_clip": 1.05950749, + "balance_loss_mlp": 1.03316927, + "epoch": 0.18721818072506463, + "flos": 40734550529280.0, + "grad_norm": 2.2295040899510847, + "language_loss": 0.78239548, + "learning_rate": 3.7464079944109904e-06, + "loss": 0.80494678, + "num_input_tokens_seen": 33055135, + "step": 1557, + "time_per_iteration": 2.6423914432525635 + }, + { + "auxiliary_loss_clip": 0.01179765, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.05592144, + "balance_loss_mlp": 1.02256656, + "epoch": 0.18733842361570371, + "flos": 22157386392960.0, + "grad_norm": 1.9109744370465356, + "language_loss": 0.77286404, + "learning_rate": 3.746028226322376e-06, + "loss": 0.79498577, + "num_input_tokens_seen": 33071015, + "step": 1558, + "time_per_iteration": 2.5442967414855957 + }, + { + "auxiliary_loss_clip": 0.01187269, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.05594277, + "balance_loss_mlp": 1.02374268, + "epoch": 0.18745866650634282, + "flos": 18914940656640.0, + "grad_norm": 1.7465495090592167, + "language_loss": 0.75518042, + "learning_rate": 3.745648193362669e-06, + "loss": 0.77738893, + "num_input_tokens_seen": 33090370, + "step": 1559, + "time_per_iteration": 2.5319156646728516 + }, + { + "auxiliary_loss_clip": 0.01191571, + "auxiliary_loss_mlp": 0.01036351, + "balance_loss_clip": 1.05575907, + "balance_loss_mlp": 1.02694559, + "epoch": 0.1875789093969819, + "flos": 19314626267520.0, + "grad_norm": 2.079270316177141, + "language_loss": 0.72078872, + "learning_rate": 3.745267895589518e-06, + "loss": 0.74306798, + "num_input_tokens_seen": 33108910, + "step": 1560, + "time_per_iteration": 2.558060884475708 + }, + { + "auxiliary_loss_clip": 0.01191931, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.05816209, + "balance_loss_mlp": 1.02301884, + "epoch": 0.187699152287621, + "flos": 17018965169280.0, + "grad_norm": 2.83178849960239, + "language_loss": 0.82118595, + "learning_rate": 3.7448873330606154e-06, + "loss": 0.84343541, + "num_input_tokens_seen": 33126680, + "step": 1561, + "time_per_iteration": 2.516437530517578 + }, + { + "auxiliary_loss_clip": 0.01169904, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.05656123, + "balance_loss_mlp": 1.02503085, + "epoch": 0.18781939517826007, + "flos": 22346384780160.0, + "grad_norm": 1.9700629096529434, + "language_loss": 0.87488532, + "learning_rate": 3.7445065058336914e-06, + "loss": 0.89694178, + "num_input_tokens_seen": 33145550, + "step": 1562, + "time_per_iteration": 2.5494260787963867 + }, + { + "auxiliary_loss_clip": 0.01145694, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.04683137, + "balance_loss_mlp": 1.02052808, + "epoch": 0.18793963806889918, + "flos": 14611478054400.0, + "grad_norm": 1.8122669203883948, + "language_loss": 0.8651669, + "learning_rate": 3.7441254139665176e-06, + "loss": 0.88692498, + "num_input_tokens_seen": 33161735, + "step": 1563, + "time_per_iteration": 2.56776762008667 + }, + { + "auxiliary_loss_clip": 0.01219795, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.06106699, + "balance_loss_mlp": 1.02726305, + "epoch": 0.18805988095953827, + "flos": 17457075354240.0, + "grad_norm": 1.7021917344412678, + "language_loss": 0.82178879, + "learning_rate": 3.743744057516905e-06, + "loss": 0.84435415, + "num_input_tokens_seen": 33179795, + "step": 1564, + "time_per_iteration": 3.2964794635772705 + }, + { + "auxiliary_loss_clip": 0.01161245, + "auxiliary_loss_mlp": 0.01039463, + "balance_loss_clip": 1.05358851, + "balance_loss_mlp": 1.02879405, + "epoch": 0.18818012385017735, + "flos": 15043877976960.0, + "grad_norm": 2.7352306860071005, + "language_loss": 0.87230444, + "learning_rate": 3.743362436542706e-06, + "loss": 0.89431155, + "num_input_tokens_seen": 33194485, + "step": 1565, + "time_per_iteration": 2.535975217819214 + }, + { + "auxiliary_loss_clip": 0.0121809, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.058779, + "balance_loss_mlp": 1.02423275, + "epoch": 0.18830036674081646, + "flos": 47551975136640.0, + "grad_norm": 1.7251858002540237, + "language_loss": 0.76776516, + "learning_rate": 3.7429805511018115e-06, + "loss": 0.79028511, + "num_input_tokens_seen": 33216145, + "step": 1566, + "time_per_iteration": 2.670905828475952 + }, + { + "auxiliary_loss_clip": 0.01171291, + "auxiliary_loss_mlp": 0.00765223, + "balance_loss_clip": 1.05486369, + "balance_loss_mlp": 1.00120592, + "epoch": 0.18842060963145554, + "flos": 30044626698240.0, + "grad_norm": 1.7597834607356289, + "language_loss": 0.78040373, + "learning_rate": 3.7425984012521524e-06, + "loss": 0.79976887, + "num_input_tokens_seen": 33236345, + "step": 1567, + "time_per_iteration": 2.614617347717285 + }, + { + "auxiliary_loss_clip": 0.01071177, + "auxiliary_loss_mlp": 0.0075558, + "balance_loss_clip": 1.01648927, + "balance_loss_mlp": 1.00086582, + "epoch": 0.18854085252209463, + "flos": 70318372625280.0, + "grad_norm": 0.7466597381056224, + "language_loss": 0.60471791, + "learning_rate": 3.7422159870517025e-06, + "loss": 0.62298554, + "num_input_tokens_seen": 33301600, + "step": 1568, + "time_per_iteration": 3.925588607788086 + }, + { + "auxiliary_loss_clip": 0.01185417, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.05419266, + "balance_loss_mlp": 1.01947105, + "epoch": 0.1886610954127337, + "flos": 21289318410240.0, + "grad_norm": 2.429705935808205, + "language_loss": 0.7889927, + "learning_rate": 3.7418333085584717e-06, + "loss": 0.81114149, + "num_input_tokens_seen": 33322785, + "step": 1569, + "time_per_iteration": 4.167656898498535 + }, + { + "auxiliary_loss_clip": 0.01177691, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.05749857, + "balance_loss_mlp": 1.02425492, + "epoch": 0.18878133830337282, + "flos": 17266819991040.0, + "grad_norm": 3.4565412852828947, + "language_loss": 0.90822613, + "learning_rate": 3.7414503658305128e-06, + "loss": 0.93034613, + "num_input_tokens_seen": 33340020, + "step": 1570, + "time_per_iteration": 2.5186564922332764 + }, + { + "auxiliary_loss_clip": 0.01163718, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.04862928, + "balance_loss_mlp": 1.0239501, + "epoch": 0.1889015811940119, + "flos": 25775207210880.0, + "grad_norm": 2.0166577152545653, + "language_loss": 0.77486259, + "learning_rate": 3.7410671589259185e-06, + "loss": 0.79684156, + "num_input_tokens_seen": 33358620, + "step": 1571, + "time_per_iteration": 2.6210100650787354 + }, + { + "auxiliary_loss_clip": 0.01222254, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.06168783, + "balance_loss_mlp": 1.02935708, + "epoch": 0.18902182408465099, + "flos": 21032197879680.0, + "grad_norm": 1.7598112065815927, + "language_loss": 0.79384351, + "learning_rate": 3.7406836879028205e-06, + "loss": 0.81646699, + "num_input_tokens_seen": 33378845, + "step": 1572, + "time_per_iteration": 2.454841375350952 + }, + { + "auxiliary_loss_clip": 0.01204607, + "auxiliary_loss_mlp": 0.01035601, + "balance_loss_clip": 1.05873835, + "balance_loss_mlp": 1.02559328, + "epoch": 0.1891420669752901, + "flos": 22272121411200.0, + "grad_norm": 2.004269680621973, + "language_loss": 0.76176798, + "learning_rate": 3.7402999528193907e-06, + "loss": 0.78417003, + "num_input_tokens_seen": 33398345, + "step": 1573, + "time_per_iteration": 2.5197460651397705 + }, + { + "auxiliary_loss_clip": 0.01161352, + "auxiliary_loss_mlp": 0.00765102, + "balance_loss_clip": 1.05365729, + "balance_loss_mlp": 1.00121915, + "epoch": 0.18926230986592918, + "flos": 22017802141440.0, + "grad_norm": 2.2070203598409126, + "language_loss": 0.85365045, + "learning_rate": 3.739915953733842e-06, + "loss": 0.87291497, + "num_input_tokens_seen": 33416390, + "step": 1574, + "time_per_iteration": 2.5772743225097656 + }, + { + "auxiliary_loss_clip": 0.01217705, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.05857277, + "balance_loss_mlp": 1.02143359, + "epoch": 0.18938255275656826, + "flos": 24462672336000.0, + "grad_norm": 1.5639265663064112, + "language_loss": 0.81622887, + "learning_rate": 3.7395316907044264e-06, + "loss": 0.83872068, + "num_input_tokens_seen": 33437175, + "step": 1575, + "time_per_iteration": 2.481123924255371 + }, + { + "auxiliary_loss_clip": 0.01205269, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.05761051, + "balance_loss_mlp": 1.02543926, + "epoch": 0.18950279564720737, + "flos": 24427049022720.0, + "grad_norm": 1.6671728509983257, + "language_loss": 0.79252237, + "learning_rate": 3.7391471637894364e-06, + "loss": 0.81493044, + "num_input_tokens_seen": 33459440, + "step": 1576, + "time_per_iteration": 2.5146405696868896 + }, + { + "auxiliary_loss_clip": 0.01175928, + "auxiliary_loss_mlp": 0.01034898, + "balance_loss_clip": 1.05163336, + "balance_loss_mlp": 1.02503967, + "epoch": 0.18962303853784646, + "flos": 19756291898880.0, + "grad_norm": 1.9878035033134998, + "language_loss": 0.84779274, + "learning_rate": 3.738762373047205e-06, + "loss": 0.869901, + "num_input_tokens_seen": 33479360, + "step": 1577, + "time_per_iteration": 2.554781436920166 + }, + { + "auxiliary_loss_clip": 0.01176347, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.05614686, + "balance_loss_mlp": 1.02518141, + "epoch": 0.18974328142848554, + "flos": 21032054225280.0, + "grad_norm": 1.6611333098244523, + "language_loss": 0.83338356, + "learning_rate": 3.738377318536103e-06, + "loss": 0.85549879, + "num_input_tokens_seen": 33499245, + "step": 1578, + "time_per_iteration": 2.5693328380584717 + }, + { + "auxiliary_loss_clip": 0.01214313, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.05931211, + "balance_loss_mlp": 1.02333415, + "epoch": 0.18986352431912462, + "flos": 12966122736000.0, + "grad_norm": 2.158336821588647, + "language_loss": 0.71165651, + "learning_rate": 3.7379920003145447e-06, + "loss": 0.7341249, + "num_input_tokens_seen": 33513520, + "step": 1579, + "time_per_iteration": 2.5071375370025635 + }, + { + "auxiliary_loss_clip": 0.01183175, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.05790281, + "balance_loss_mlp": 1.02401698, + "epoch": 0.18998376720976373, + "flos": 23767908497280.0, + "grad_norm": 1.6969795480213563, + "language_loss": 0.83879662, + "learning_rate": 3.7376064184409817e-06, + "loss": 0.86097455, + "num_input_tokens_seen": 33533100, + "step": 1580, + "time_per_iteration": 2.577732801437378 + }, + { + "auxiliary_loss_clip": 0.01187319, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.05678725, + "balance_loss_mlp": 1.02091682, + "epoch": 0.19010401010040281, + "flos": 22966023323520.0, + "grad_norm": 1.4802705533213585, + "language_loss": 0.86933339, + "learning_rate": 3.7372205729739063e-06, + "loss": 0.89151859, + "num_input_tokens_seen": 33554915, + "step": 1581, + "time_per_iteration": 2.6190943717956543 + }, + { + "auxiliary_loss_clip": 0.01207741, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.05842233, + "balance_loss_mlp": 1.02219391, + "epoch": 0.1902242529910419, + "flos": 19135647774720.0, + "grad_norm": 2.433403078712931, + "language_loss": 0.71263468, + "learning_rate": 3.7368344639718514e-06, + "loss": 0.7350421, + "num_input_tokens_seen": 33572850, + "step": 1582, + "time_per_iteration": 2.5559990406036377 + }, + { + "auxiliary_loss_clip": 0.01205022, + "auxiliary_loss_mlp": 0.01040011, + "balance_loss_clip": 1.05684984, + "balance_loss_mlp": 1.03099227, + "epoch": 0.190344495881681, + "flos": 25483935824640.0, + "grad_norm": 1.6093113268782369, + "language_loss": 0.80289757, + "learning_rate": 3.7364480914933895e-06, + "loss": 0.82534796, + "num_input_tokens_seen": 33593090, + "step": 1583, + "time_per_iteration": 2.593003749847412 + }, + { + "auxiliary_loss_clip": 0.0115595, + "auxiliary_loss_mlp": 0.00764924, + "balance_loss_clip": 1.05262733, + "balance_loss_mlp": 1.00115514, + "epoch": 0.1904647387723201, + "flos": 26792843425920.0, + "grad_norm": 1.787952832208737, + "language_loss": 0.80973697, + "learning_rate": 3.7360614555971325e-06, + "loss": 0.82894564, + "num_input_tokens_seen": 33612745, + "step": 1584, + "time_per_iteration": 2.6806397438049316 + }, + { + "auxiliary_loss_clip": 0.01202599, + "auxiliary_loss_mlp": 0.00764345, + "balance_loss_clip": 1.05753529, + "balance_loss_mlp": 1.00112784, + "epoch": 0.19058498166295917, + "flos": 23987753688960.0, + "grad_norm": 1.8899846032785734, + "language_loss": 0.85120767, + "learning_rate": 3.735674556341733e-06, + "loss": 0.87087715, + "num_input_tokens_seen": 33632360, + "step": 1585, + "time_per_iteration": 2.50742244720459 + }, + { + "auxiliary_loss_clip": 0.01187049, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.05881381, + "balance_loss_mlp": 1.0270437, + "epoch": 0.19070522455359826, + "flos": 28293299280000.0, + "grad_norm": 2.0058787032283223, + "language_loss": 0.82644475, + "learning_rate": 3.7352873937858835e-06, + "loss": 0.8486855, + "num_input_tokens_seen": 33653895, + "step": 1586, + "time_per_iteration": 2.569875717163086 + }, + { + "auxiliary_loss_clip": 0.01168469, + "auxiliary_loss_mlp": 0.00764986, + "balance_loss_clip": 1.05505812, + "balance_loss_mlp": 1.00110936, + "epoch": 0.19082546744423737, + "flos": 25660220797440.0, + "grad_norm": 2.0639514613564627, + "language_loss": 0.72184312, + "learning_rate": 3.734899967988316e-06, + "loss": 0.74117768, + "num_input_tokens_seen": 33672075, + "step": 1587, + "time_per_iteration": 2.5744383335113525 + }, + { + "auxiliary_loss_clip": 0.0116488, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.05114579, + "balance_loss_mlp": 1.02294385, + "epoch": 0.19094571033487645, + "flos": 19719483436800.0, + "grad_norm": 1.9292386780950004, + "language_loss": 0.83774328, + "learning_rate": 3.7345122790078026e-06, + "loss": 0.85971928, + "num_input_tokens_seen": 33689640, + "step": 1588, + "time_per_iteration": 2.529493570327759 + }, + { + "auxiliary_loss_clip": 0.01202829, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.05815697, + "balance_loss_mlp": 1.02326179, + "epoch": 0.19106595322551553, + "flos": 21616320850560.0, + "grad_norm": 2.628570228226583, + "language_loss": 0.92464364, + "learning_rate": 3.7341243269031556e-06, + "loss": 0.94701159, + "num_input_tokens_seen": 33708630, + "step": 1589, + "time_per_iteration": 2.5039501190185547 + }, + { + "auxiliary_loss_clip": 0.01178841, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.05440331, + "balance_loss_mlp": 1.02257121, + "epoch": 0.19118619611615464, + "flos": 29896890059520.0, + "grad_norm": 1.664186593009288, + "language_loss": 0.7744596, + "learning_rate": 3.7337361117332275e-06, + "loss": 0.79656792, + "num_input_tokens_seen": 33730370, + "step": 1590, + "time_per_iteration": 3.3563249111175537 + }, + { + "auxiliary_loss_clip": 0.0117399, + "auxiliary_loss_mlp": 0.01031532, + "balance_loss_clip": 1.05281591, + "balance_loss_mlp": 1.02220416, + "epoch": 0.19130643900679373, + "flos": 17273428093440.0, + "grad_norm": 2.687114996303786, + "language_loss": 0.7695992, + "learning_rate": 3.7333476335569087e-06, + "loss": 0.79165441, + "num_input_tokens_seen": 33748370, + "step": 1591, + "time_per_iteration": 2.537489891052246 + }, + { + "auxiliary_loss_clip": 0.01188936, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.05821419, + "balance_loss_mlp": 1.02511024, + "epoch": 0.1914266818974328, + "flos": 24826339584000.0, + "grad_norm": 2.096149855509539, + "language_loss": 0.66944289, + "learning_rate": 3.7329588924331325e-06, + "loss": 0.69169086, + "num_input_tokens_seen": 33769575, + "step": 1592, + "time_per_iteration": 2.5594723224639893 + }, + { + "auxiliary_loss_clip": 0.01164247, + "auxiliary_loss_mlp": 0.01033614, + "balance_loss_clip": 1.04958296, + "balance_loss_mlp": 1.02365398, + "epoch": 0.1915469247880719, + "flos": 18952467390720.0, + "grad_norm": 1.7873710699849892, + "language_loss": 0.8224054, + "learning_rate": 3.732569888420871e-06, + "loss": 0.84438401, + "num_input_tokens_seen": 33789110, + "step": 1593, + "time_per_iteration": 2.5683860778808594 + }, + { + "auxiliary_loss_clip": 0.01220546, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.05866086, + "balance_loss_mlp": 1.0222708, + "epoch": 0.191667167678711, + "flos": 21032952065280.0, + "grad_norm": 3.769495896228824, + "language_loss": 0.82556152, + "learning_rate": 3.732180621579134e-06, + "loss": 0.84809899, + "num_input_tokens_seen": 33808325, + "step": 1594, + "time_per_iteration": 3.2231686115264893 + }, + { + "auxiliary_loss_clip": 0.0118595, + "auxiliary_loss_mlp": 0.01035436, + "balance_loss_clip": 1.05878782, + "balance_loss_mlp": 1.0250231, + "epoch": 0.1917874105693501, + "flos": 34237663914240.0, + "grad_norm": 2.1433677002418494, + "language_loss": 0.81127471, + "learning_rate": 3.7317910919669745e-06, + "loss": 0.83348858, + "num_input_tokens_seen": 33829520, + "step": 1595, + "time_per_iteration": 3.377772808074951 + }, + { + "auxiliary_loss_clip": 0.01204067, + "auxiliary_loss_mlp": 0.01042736, + "balance_loss_clip": 1.05965769, + "balance_loss_mlp": 1.03231144, + "epoch": 0.19190765345998917, + "flos": 23550613171200.0, + "grad_norm": 2.4385519433563014, + "language_loss": 0.76144147, + "learning_rate": 3.7314012996434826e-06, + "loss": 0.7839095, + "num_input_tokens_seen": 33848250, + "step": 1596, + "time_per_iteration": 3.2836458683013916 + }, + { + "auxiliary_loss_clip": 0.01190951, + "auxiliary_loss_mlp": 0.01033672, + "balance_loss_clip": 1.05840909, + "balance_loss_mlp": 1.02362895, + "epoch": 0.19202789635062828, + "flos": 19861330245120.0, + "grad_norm": 1.8441689368868186, + "language_loss": 0.80743361, + "learning_rate": 3.7310112446677907e-06, + "loss": 0.82967985, + "num_input_tokens_seen": 33866160, + "step": 1597, + "time_per_iteration": 2.5463080406188965 + }, + { + "auxiliary_loss_clip": 0.01224931, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.06393647, + "balance_loss_mlp": 1.02101481, + "epoch": 0.19214813924126736, + "flos": 20922957642240.0, + "grad_norm": 2.126187378673066, + "language_loss": 0.69038272, + "learning_rate": 3.7306209270990695e-06, + "loss": 0.7129389, + "num_input_tokens_seen": 33884165, + "step": 1598, + "time_per_iteration": 2.4546408653259277 + }, + { + "auxiliary_loss_clip": 0.01191726, + "auxiliary_loss_mlp": 0.01040066, + "balance_loss_clip": 1.05854762, + "balance_loss_mlp": 1.03030241, + "epoch": 0.19226838213190645, + "flos": 26359725231360.0, + "grad_norm": 2.2782483446654354, + "language_loss": 0.86570835, + "learning_rate": 3.7302303469965292e-06, + "loss": 0.8880263, + "num_input_tokens_seen": 33903705, + "step": 1599, + "time_per_iteration": 2.557832956314087 + }, + { + "auxiliary_loss_clip": 0.0120545, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.05994034, + "balance_loss_mlp": 1.03178668, + "epoch": 0.19238862502254553, + "flos": 20850525866880.0, + "grad_norm": 4.324619472048507, + "language_loss": 0.70449829, + "learning_rate": 3.7298395044194206e-06, + "loss": 0.7269727, + "num_input_tokens_seen": 33922515, + "step": 1600, + "time_per_iteration": 2.4619052410125732 + }, + { + "auxiliary_loss_clip": 0.01223319, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.06336462, + "balance_loss_mlp": 1.02465105, + "epoch": 0.19250886791318464, + "flos": 21726063878400.0, + "grad_norm": 1.7651546219867658, + "language_loss": 0.94343126, + "learning_rate": 3.7294483994270356e-06, + "loss": 0.96601176, + "num_input_tokens_seen": 33940840, + "step": 1601, + "time_per_iteration": 2.4464497566223145 + }, + { + "auxiliary_loss_clip": 0.01148265, + "auxiliary_loss_mlp": 0.01032473, + "balance_loss_clip": 1.05154622, + "balance_loss_mlp": 1.0236454, + "epoch": 0.19262911080382372, + "flos": 23367827836800.0, + "grad_norm": 2.025338398857537, + "language_loss": 0.77882242, + "learning_rate": 3.7290570320787033e-06, + "loss": 0.80062979, + "num_input_tokens_seen": 33960420, + "step": 1602, + "time_per_iteration": 2.5911426544189453 + }, + { + "auxiliary_loss_clip": 0.012049, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.06132007, + "balance_loss_mlp": 1.02329993, + "epoch": 0.1927493536944628, + "flos": 21943502858880.0, + "grad_norm": 2.001300006078848, + "language_loss": 0.71788657, + "learning_rate": 3.728665402433793e-06, + "loss": 0.74026841, + "num_input_tokens_seen": 33978990, + "step": 1603, + "time_per_iteration": 2.480520486831665 + }, + { + "auxiliary_loss_clip": 0.01192019, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.06003881, + "balance_loss_mlp": 1.02605689, + "epoch": 0.19286959658510192, + "flos": 16545590807040.0, + "grad_norm": 2.318007461691211, + "language_loss": 0.86215878, + "learning_rate": 3.7282735105517164e-06, + "loss": 0.88443542, + "num_input_tokens_seen": 33997115, + "step": 1604, + "time_per_iteration": 2.504701852798462 + }, + { + "auxiliary_loss_clip": 0.01167784, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.05228364, + "balance_loss_mlp": 1.02766323, + "epoch": 0.192989839475741, + "flos": 21616967295360.0, + "grad_norm": 2.3446381940669974, + "language_loss": 0.67511475, + "learning_rate": 3.727881356491922e-06, + "loss": 0.69717181, + "num_input_tokens_seen": 34015525, + "step": 1605, + "time_per_iteration": 2.5904741287231445 + }, + { + "auxiliary_loss_clip": 0.01220634, + "auxiliary_loss_mlp": 0.01038743, + "balance_loss_clip": 1.06255078, + "balance_loss_mlp": 1.02977848, + "epoch": 0.19311008236638008, + "flos": 19281516906240.0, + "grad_norm": 1.802053817192308, + "language_loss": 0.75617665, + "learning_rate": 3.7274889403139002e-06, + "loss": 0.77877045, + "num_input_tokens_seen": 34033150, + "step": 1606, + "time_per_iteration": 2.437211036682129 + }, + { + "auxiliary_loss_clip": 0.01157519, + "auxiliary_loss_mlp": 0.01034737, + "balance_loss_clip": 1.05579495, + "balance_loss_mlp": 1.02509308, + "epoch": 0.1932303252570192, + "flos": 28652369587200.0, + "grad_norm": 1.967668672213356, + "language_loss": 0.78381407, + "learning_rate": 3.727096262077179e-06, + "loss": 0.8057366, + "num_input_tokens_seen": 34052145, + "step": 1607, + "time_per_iteration": 2.6161866188049316 + }, + { + "auxiliary_loss_clip": 0.01204973, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.05934238, + "balance_loss_mlp": 1.02275372, + "epoch": 0.19335056814765827, + "flos": 18368990864640.0, + "grad_norm": 1.6893067600023044, + "language_loss": 0.84924138, + "learning_rate": 3.7267033218413285e-06, + "loss": 0.87161618, + "num_input_tokens_seen": 34069940, + "step": 1608, + "time_per_iteration": 2.4560458660125732 + }, + { + "auxiliary_loss_clip": 0.01146363, + "auxiliary_loss_mlp": 0.01037249, + "balance_loss_clip": 1.0491606, + "balance_loss_mlp": 1.02584028, + "epoch": 0.19347081103829736, + "flos": 13260877741440.0, + "grad_norm": 3.4601978400084867, + "language_loss": 0.81134391, + "learning_rate": 3.726310119665957e-06, + "loss": 0.83318001, + "num_input_tokens_seen": 34086275, + "step": 1609, + "time_per_iteration": 2.574298858642578 + }, + { + "auxiliary_loss_clip": 0.01204724, + "auxiliary_loss_mlp": 0.01033128, + "balance_loss_clip": 1.05856633, + "balance_loss_mlp": 1.0232811, + "epoch": 0.19359105392893644, + "flos": 20300122788480.0, + "grad_norm": 1.8434464508740218, + "language_loss": 0.85365683, + "learning_rate": 3.725916655610713e-06, + "loss": 0.87603533, + "num_input_tokens_seen": 34105605, + "step": 1610, + "time_per_iteration": 2.47398042678833 + }, + { + "auxiliary_loss_clip": 0.01183874, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.05514133, + "balance_loss_mlp": 1.02193832, + "epoch": 0.19371129681957555, + "flos": 20484596062080.0, + "grad_norm": 2.278936497800233, + "language_loss": 0.7491225, + "learning_rate": 3.725522929735284e-06, + "loss": 0.77128983, + "num_input_tokens_seen": 34122540, + "step": 1611, + "time_per_iteration": 2.500734806060791 + }, + { + "auxiliary_loss_clip": 0.01196602, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.05622268, + "balance_loss_mlp": 1.02374887, + "epoch": 0.19383153971021463, + "flos": 30445497457920.0, + "grad_norm": 2.352612975390438, + "language_loss": 0.7385726, + "learning_rate": 3.725128942099399e-06, + "loss": 0.76087892, + "num_input_tokens_seen": 34142940, + "step": 1612, + "time_per_iteration": 2.588106155395508 + }, + { + "auxiliary_loss_clip": 0.01179311, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.05461693, + "balance_loss_mlp": 1.0252856, + "epoch": 0.19395178260085372, + "flos": 24569937325440.0, + "grad_norm": 1.7250849596105902, + "language_loss": 0.79771429, + "learning_rate": 3.7247346927628245e-06, + "loss": 0.81986046, + "num_input_tokens_seen": 34162875, + "step": 1613, + "time_per_iteration": 2.547870397567749 + }, + { + "auxiliary_loss_clip": 0.01186782, + "auxiliary_loss_mlp": 0.00765035, + "balance_loss_clip": 1.05668998, + "balance_loss_mlp": 1.00099707, + "epoch": 0.19407202549149283, + "flos": 28950608211840.0, + "grad_norm": 1.7720859182467148, + "language_loss": 0.78901654, + "learning_rate": 3.7243401817853694e-06, + "loss": 0.80853462, + "num_input_tokens_seen": 34183565, + "step": 1614, + "time_per_iteration": 2.5784852504730225 + }, + { + "auxiliary_loss_clip": 0.01197859, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.05676997, + "balance_loss_mlp": 1.02466846, + "epoch": 0.1941922683821319, + "flos": 18004497603840.0, + "grad_norm": 2.173563087718427, + "language_loss": 0.71601343, + "learning_rate": 3.723945409226879e-06, + "loss": 0.73833585, + "num_input_tokens_seen": 34202055, + "step": 1615, + "time_per_iteration": 2.4714996814727783 + }, + { + "auxiliary_loss_clip": 0.01204744, + "auxiliary_loss_mlp": 0.01036833, + "balance_loss_clip": 1.0596559, + "balance_loss_mlp": 1.02635503, + "epoch": 0.194312511272771, + "flos": 9720337034880.0, + "grad_norm": 2.0778921023230477, + "language_loss": 0.7982955, + "learning_rate": 3.723550375147241e-06, + "loss": 0.82071126, + "num_input_tokens_seen": 34216830, + "step": 1616, + "time_per_iteration": 2.459017515182495 + }, + { + "auxiliary_loss_clip": 0.01161936, + "auxiliary_loss_mlp": 0.01035291, + "balance_loss_clip": 1.05083013, + "balance_loss_mlp": 1.025105, + "epoch": 0.19443275416341008, + "flos": 27016208150400.0, + "grad_norm": 2.25205514671097, + "language_loss": 0.79962617, + "learning_rate": 3.7231550796063816e-06, + "loss": 0.82159841, + "num_input_tokens_seen": 34236840, + "step": 1617, + "time_per_iteration": 3.3820767402648926 + }, + { + "auxiliary_loss_clip": 0.01197225, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_clip": 1.06113696, + "balance_loss_mlp": 1.02931428, + "epoch": 0.1945529970540492, + "flos": 15846625077120.0, + "grad_norm": 1.921999769786466, + "language_loss": 0.65128493, + "learning_rate": 3.722759522664266e-06, + "loss": 0.67365909, + "num_input_tokens_seen": 34254140, + "step": 1618, + "time_per_iteration": 2.5637381076812744 + }, + { + "auxiliary_loss_clip": 0.01161667, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.05433869, + "balance_loss_mlp": 1.02247071, + "epoch": 0.19467323994468827, + "flos": 19314985403520.0, + "grad_norm": 2.103462285480115, + "language_loss": 0.81798804, + "learning_rate": 3.7223637043809016e-06, + "loss": 0.83993185, + "num_input_tokens_seen": 34273120, + "step": 1619, + "time_per_iteration": 2.5773160457611084 + }, + { + "auxiliary_loss_clip": 0.01180549, + "auxiliary_loss_mlp": 0.01040516, + "balance_loss_clip": 1.05918205, + "balance_loss_mlp": 1.03094316, + "epoch": 0.19479348283532735, + "flos": 24133227770880.0, + "grad_norm": 2.193831669024652, + "language_loss": 0.86521614, + "learning_rate": 3.7219676248163322e-06, + "loss": 0.88742673, + "num_input_tokens_seen": 34290285, + "step": 1620, + "time_per_iteration": 2.560588836669922 + }, + { + "auxiliary_loss_clip": 0.01212857, + "auxiliary_loss_mlp": 0.01036449, + "balance_loss_clip": 1.06288946, + "balance_loss_mlp": 1.02608418, + "epoch": 0.19491372572596646, + "flos": 25775638174080.0, + "grad_norm": 2.000595382360526, + "language_loss": 0.9354164, + "learning_rate": 3.721571284030643e-06, + "loss": 0.95790952, + "num_input_tokens_seen": 34310095, + "step": 1621, + "time_per_iteration": 3.2301881313323975 + }, + { + "auxiliary_loss_clip": 0.01208916, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.06043124, + "balance_loss_mlp": 1.02017748, + "epoch": 0.19503396861660555, + "flos": 19645220067840.0, + "grad_norm": 2.0896363869455508, + "language_loss": 0.79467469, + "learning_rate": 3.7211746820839587e-06, + "loss": 0.81706738, + "num_input_tokens_seen": 34327190, + "step": 1622, + "time_per_iteration": 3.9578051567077637 + }, + { + "auxiliary_loss_clip": 0.01111227, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.04662549, + "balance_loss_mlp": 1.02240384, + "epoch": 0.19515421150724463, + "flos": 21033023892480.0, + "grad_norm": 1.696783933265482, + "language_loss": 0.80513835, + "learning_rate": 3.7207778190364437e-06, + "loss": 0.82657707, + "num_input_tokens_seen": 34345615, + "step": 1623, + "time_per_iteration": 2.626288890838623 + }, + { + "auxiliary_loss_clip": 0.01132454, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.04982364, + "balance_loss_mlp": 1.0238719, + "epoch": 0.1952744543978837, + "flos": 32961255143040.0, + "grad_norm": 1.4638847598586826, + "language_loss": 0.73623919, + "learning_rate": 3.720380694948302e-06, + "loss": 0.75790191, + "num_input_tokens_seen": 34368500, + "step": 1624, + "time_per_iteration": 2.7049312591552734 + }, + { + "auxiliary_loss_clip": 0.01076761, + "auxiliary_loss_mlp": 0.0101264, + "balance_loss_clip": 1.02257037, + "balance_loss_mlp": 1.01020861, + "epoch": 0.19539469728852282, + "flos": 64044312030720.0, + "grad_norm": 1.0364101841011795, + "language_loss": 0.71278071, + "learning_rate": 3.719983309879777e-06, + "loss": 0.73367476, + "num_input_tokens_seen": 34428280, + "step": 1625, + "time_per_iteration": 3.134880781173706 + }, + { + "auxiliary_loss_clip": 0.01165309, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.05359721, + "balance_loss_mlp": 1.0302577, + "epoch": 0.1955149401791619, + "flos": 13370908078080.0, + "grad_norm": 1.7719757994961551, + "language_loss": 0.77459759, + "learning_rate": 3.719585663891151e-06, + "loss": 0.79665023, + "num_input_tokens_seen": 34445815, + "step": 1626, + "time_per_iteration": 2.536752223968506 + }, + { + "auxiliary_loss_clip": 0.01153939, + "auxiliary_loss_mlp": 0.01038651, + "balance_loss_clip": 1.0565232, + "balance_loss_mlp": 1.02802348, + "epoch": 0.195635183069801, + "flos": 18728887184640.0, + "grad_norm": 1.9115971427629737, + "language_loss": 0.78491116, + "learning_rate": 3.719187757042747e-06, + "loss": 0.80683708, + "num_input_tokens_seen": 34463635, + "step": 1627, + "time_per_iteration": 2.5708975791931152 + }, + { + "auxiliary_loss_clip": 0.01089492, + "auxiliary_loss_mlp": 0.01003192, + "balance_loss_clip": 1.01952863, + "balance_loss_mlp": 1.00098693, + "epoch": 0.1957554259604401, + "flos": 69313952615040.0, + "grad_norm": 0.7275233901157265, + "language_loss": 0.55004096, + "learning_rate": 3.7187895893949275e-06, + "loss": 0.57096779, + "num_input_tokens_seen": 34530105, + "step": 1628, + "time_per_iteration": 3.1829776763916016 + }, + { + "auxiliary_loss_clip": 0.01146789, + "auxiliary_loss_mlp": 0.01027709, + "balance_loss_clip": 1.05059028, + "balance_loss_mlp": 1.01723635, + "epoch": 0.19587566885107918, + "flos": 21069257736960.0, + "grad_norm": 2.344944906280868, + "language_loss": 0.76290047, + "learning_rate": 3.7183911610080937e-06, + "loss": 0.78464538, + "num_input_tokens_seen": 34546970, + "step": 1629, + "time_per_iteration": 2.5950405597686768 + }, + { + "auxiliary_loss_clip": 0.01177578, + "auxiliary_loss_mlp": 0.01041308, + "balance_loss_clip": 1.05565763, + "balance_loss_mlp": 1.0304482, + "epoch": 0.19599591174171827, + "flos": 22194661731840.0, + "grad_norm": 2.3755218475006163, + "language_loss": 0.74979258, + "learning_rate": 3.7179924719426872e-06, + "loss": 0.77198142, + "num_input_tokens_seen": 34564865, + "step": 1630, + "time_per_iteration": 2.5367929935455322 + }, + { + "auxiliary_loss_clip": 0.01210896, + "auxiliary_loss_mlp": 0.01040067, + "balance_loss_clip": 1.06299531, + "balance_loss_mlp": 1.02975523, + "epoch": 0.19611615463235738, + "flos": 23768375374080.0, + "grad_norm": 2.434653361468261, + "language_loss": 0.75733328, + "learning_rate": 3.7175935222591885e-06, + "loss": 0.77984297, + "num_input_tokens_seen": 34584165, + "step": 1631, + "time_per_iteration": 2.513305902481079 + }, + { + "auxiliary_loss_clip": 0.01194953, + "auxiliary_loss_mlp": 0.01040245, + "balance_loss_clip": 1.06313813, + "balance_loss_mlp": 1.02983177, + "epoch": 0.19623639752299646, + "flos": 28618218731520.0, + "grad_norm": 1.7657114978433064, + "language_loss": 0.74238193, + "learning_rate": 3.717194312018118e-06, + "loss": 0.76473391, + "num_input_tokens_seen": 34603150, + "step": 1632, + "time_per_iteration": 2.569023370742798 + }, + { + "auxiliary_loss_clip": 0.0120285, + "auxiliary_loss_mlp": 0.01036579, + "balance_loss_clip": 1.0572238, + "balance_loss_mlp": 1.02626109, + "epoch": 0.19635664041363554, + "flos": 21032700670080.0, + "grad_norm": 2.1726546015731847, + "language_loss": 0.75836003, + "learning_rate": 3.716794841280036e-06, + "loss": 0.78075427, + "num_input_tokens_seen": 34621855, + "step": 1633, + "time_per_iteration": 2.52150297164917 + }, + { + "auxiliary_loss_clip": 0.01210777, + "auxiliary_loss_mlp": 0.01038089, + "balance_loss_clip": 1.05970049, + "balance_loss_mlp": 1.02799773, + "epoch": 0.19647688330427462, + "flos": 18879748306560.0, + "grad_norm": 1.9199436682184408, + "language_loss": 0.7729094, + "learning_rate": 3.7163951101055407e-06, + "loss": 0.79539806, + "num_input_tokens_seen": 34639915, + "step": 1634, + "time_per_iteration": 2.467607259750366 + }, + { + "auxiliary_loss_clip": 0.01187838, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.05897641, + "balance_loss_mlp": 1.027776, + "epoch": 0.19659712619491373, + "flos": 24242503921920.0, + "grad_norm": 2.028918565927402, + "language_loss": 0.79037619, + "learning_rate": 3.715995118555273e-06, + "loss": 0.81263733, + "num_input_tokens_seen": 34659890, + "step": 1635, + "time_per_iteration": 2.546248197555542 + }, + { + "auxiliary_loss_clip": 0.01155465, + "auxiliary_loss_mlp": 0.01042331, + "balance_loss_clip": 1.05300164, + "balance_loss_mlp": 1.03128028, + "epoch": 0.19671736908555282, + "flos": 24717422568960.0, + "grad_norm": 1.9141345241461576, + "language_loss": 0.85670495, + "learning_rate": 3.71559486668991e-06, + "loss": 0.87868297, + "num_input_tokens_seen": 34678750, + "step": 1636, + "time_per_iteration": 2.5909173488616943 + }, + { + "auxiliary_loss_clip": 0.01211057, + "auxiliary_loss_mlp": 0.0076435, + "balance_loss_clip": 1.06189179, + "balance_loss_mlp": 1.00108433, + "epoch": 0.1968376119761919, + "flos": 23842279607040.0, + "grad_norm": 1.7094305677155328, + "language_loss": 0.77852374, + "learning_rate": 3.715194354570169e-06, + "loss": 0.79827785, + "num_input_tokens_seen": 34698755, + "step": 1637, + "time_per_iteration": 2.5039117336273193 + }, + { + "auxiliary_loss_clip": 0.01205701, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.06230152, + "balance_loss_mlp": 1.02701223, + "epoch": 0.196957854866831, + "flos": 18113917409280.0, + "grad_norm": 1.8904646016055393, + "language_loss": 0.83370692, + "learning_rate": 3.714793582256809e-06, + "loss": 0.85613143, + "num_input_tokens_seen": 34715820, + "step": 1638, + "time_per_iteration": 2.4479472637176514 + }, + { + "auxiliary_loss_clip": 0.01218027, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.05991244, + "balance_loss_mlp": 1.02660036, + "epoch": 0.1970780977574701, + "flos": 21653129312640.0, + "grad_norm": 3.2522607321813592, + "language_loss": 0.8487345, + "learning_rate": 3.7143925498106253e-06, + "loss": 0.87128377, + "num_input_tokens_seen": 34734360, + "step": 1639, + "time_per_iteration": 2.450167417526245 + }, + { + "auxiliary_loss_clip": 0.01186736, + "auxiliary_loss_mlp": 0.01036744, + "balance_loss_clip": 1.05230105, + "balance_loss_mlp": 1.02596116, + "epoch": 0.19719834064810918, + "flos": 20811813984000.0, + "grad_norm": 1.7979614832477042, + "language_loss": 0.78966731, + "learning_rate": 3.7139912572924558e-06, + "loss": 0.81190211, + "num_input_tokens_seen": 34753390, + "step": 1640, + "time_per_iteration": 2.6000232696533203 + }, + { + "auxiliary_loss_clip": 0.01199057, + "auxiliary_loss_mlp": 0.01037053, + "balance_loss_clip": 1.05479252, + "balance_loss_mlp": 1.02749205, + "epoch": 0.19731858353874826, + "flos": 23434800744960.0, + "grad_norm": 2.8546071672365065, + "language_loss": 0.80964756, + "learning_rate": 3.7135897047631744e-06, + "loss": 0.83200866, + "num_input_tokens_seen": 34771275, + "step": 1641, + "time_per_iteration": 2.500296115875244 + }, + { + "auxiliary_loss_clip": 0.01189448, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.05711889, + "balance_loss_mlp": 1.02418172, + "epoch": 0.19743882642938737, + "flos": 23988184652160.0, + "grad_norm": 2.0971792037151897, + "language_loss": 0.75845778, + "learning_rate": 3.713187892283698e-06, + "loss": 0.78069723, + "num_input_tokens_seen": 34790885, + "step": 1642, + "time_per_iteration": 2.52612566947937 + }, + { + "auxiliary_loss_clip": 0.01157, + "auxiliary_loss_mlp": 0.01037852, + "balance_loss_clip": 1.0505929, + "balance_loss_mlp": 1.02755225, + "epoch": 0.19755906932002645, + "flos": 15004340081280.0, + "grad_norm": 2.203529960141442, + "language_loss": 0.87784272, + "learning_rate": 3.71278581991498e-06, + "loss": 0.89979124, + "num_input_tokens_seen": 34806745, + "step": 1643, + "time_per_iteration": 2.5319788455963135 + }, + { + "auxiliary_loss_clip": 0.01178757, + "auxiliary_loss_mlp": 0.00765451, + "balance_loss_clip": 1.06144381, + "balance_loss_mlp": 1.00098372, + "epoch": 0.19767931221066554, + "flos": 19494466686720.0, + "grad_norm": 1.7536029202846934, + "language_loss": 0.78961885, + "learning_rate": 3.712383487718015e-06, + "loss": 0.80906087, + "num_input_tokens_seen": 34824985, + "step": 1644, + "time_per_iteration": 3.291271209716797 + }, + { + "auxiliary_loss_clip": 0.0113802, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.05079341, + "balance_loss_mlp": 1.02372599, + "epoch": 0.19779955510130465, + "flos": 25737895958400.0, + "grad_norm": 1.8852843245359392, + "language_loss": 0.86646473, + "learning_rate": 3.7119808957538365e-06, + "loss": 0.88817573, + "num_input_tokens_seen": 34843980, + "step": 1645, + "time_per_iteration": 2.616654634475708 + }, + { + "auxiliary_loss_clip": 0.01181842, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.05210078, + "balance_loss_mlp": 1.02348852, + "epoch": 0.19791979799194373, + "flos": 20777699041920.0, + "grad_norm": 1.9997839875373977, + "language_loss": 0.80108011, + "learning_rate": 3.711578044083517e-06, + "loss": 0.82323849, + "num_input_tokens_seen": 34860780, + "step": 1646, + "time_per_iteration": 2.52032208442688 + }, + { + "auxiliary_loss_clip": 0.01189972, + "auxiliary_loss_mlp": 0.01038177, + "balance_loss_clip": 1.05615425, + "balance_loss_mlp": 1.02800834, + "epoch": 0.1980400408825828, + "flos": 25589010084480.0, + "grad_norm": 1.7672163085314485, + "language_loss": 0.74709255, + "learning_rate": 3.7111749327681698e-06, + "loss": 0.76937401, + "num_input_tokens_seen": 34880815, + "step": 1647, + "time_per_iteration": 3.2934906482696533 + }, + { + "auxiliary_loss_clip": 0.01209475, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.06252074, + "balance_loss_mlp": 1.02318084, + "epoch": 0.1981602837732219, + "flos": 23513840622720.0, + "grad_norm": 2.2223468412314014, + "language_loss": 0.86583829, + "learning_rate": 3.7107715618689455e-06, + "loss": 0.88825959, + "num_input_tokens_seen": 34899790, + "step": 1648, + "time_per_iteration": 3.26312518119812 + }, + { + "auxiliary_loss_clip": 0.01200824, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.05879068, + "balance_loss_mlp": 1.02245593, + "epoch": 0.198280526663861, + "flos": 23185365724800.0, + "grad_norm": 1.4177813906041719, + "language_loss": 0.83388126, + "learning_rate": 3.710367931447035e-06, + "loss": 0.85621721, + "num_input_tokens_seen": 34921570, + "step": 1649, + "time_per_iteration": 3.314767360687256 + }, + { + "auxiliary_loss_clip": 0.01209817, + "auxiliary_loss_mlp": 0.01037667, + "balance_loss_clip": 1.05886245, + "balance_loss_mlp": 1.02721858, + "epoch": 0.1984007695545001, + "flos": 21689470897920.0, + "grad_norm": 2.083396948441338, + "language_loss": 0.86524737, + "learning_rate": 3.70996404156367e-06, + "loss": 0.88772219, + "num_input_tokens_seen": 34941205, + "step": 1650, + "time_per_iteration": 2.4844276905059814 + }, + { + "auxiliary_loss_clip": 0.01148207, + "auxiliary_loss_mlp": 0.01036834, + "balance_loss_clip": 1.04983079, + "balance_loss_mlp": 1.02751732, + "epoch": 0.19852101244513917, + "flos": 36064008887040.0, + "grad_norm": 1.8097942514534386, + "language_loss": 0.72711939, + "learning_rate": 3.7095598922801187e-06, + "loss": 0.74896973, + "num_input_tokens_seen": 34963280, + "step": 1651, + "time_per_iteration": 2.714073419570923 + }, + { + "auxiliary_loss_clip": 0.01218542, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.06082165, + "balance_loss_mlp": 1.02606356, + "epoch": 0.19864125533577828, + "flos": 23105894883840.0, + "grad_norm": 2.720906502990722, + "language_loss": 0.76149285, + "learning_rate": 3.7091554836576914e-06, + "loss": 0.78404003, + "num_input_tokens_seen": 34979955, + "step": 1652, + "time_per_iteration": 2.458477020263672 + }, + { + "auxiliary_loss_clip": 0.01200833, + "auxiliary_loss_mlp": 0.00764184, + "balance_loss_clip": 1.05937982, + "balance_loss_mlp": 1.00109017, + "epoch": 0.19876149822641737, + "flos": 24608505553920.0, + "grad_norm": 1.7137033518009785, + "language_loss": 0.82971871, + "learning_rate": 3.708750815757736e-06, + "loss": 0.84936887, + "num_input_tokens_seen": 35000725, + "step": 1653, + "time_per_iteration": 2.550105333328247 + }, + { + "auxiliary_loss_clip": 0.01205374, + "auxiliary_loss_mlp": 0.01040227, + "balance_loss_clip": 1.06000972, + "balance_loss_mlp": 1.03005791, + "epoch": 0.19888174111705645, + "flos": 32196645308160.0, + "grad_norm": 2.079088290150475, + "language_loss": 0.72561604, + "learning_rate": 3.7083458886416407e-06, + "loss": 0.74807203, + "num_input_tokens_seen": 35019920, + "step": 1654, + "time_per_iteration": 2.5612294673919678 + }, + { + "auxiliary_loss_clip": 0.0114972, + "auxiliary_loss_mlp": 0.01036029, + "balance_loss_clip": 1.05493355, + "balance_loss_mlp": 1.02627146, + "epoch": 0.19900198400769553, + "flos": 24608469640320.0, + "grad_norm": 2.0799758170907565, + "language_loss": 0.88024938, + "learning_rate": 3.707940702370832e-06, + "loss": 0.90210688, + "num_input_tokens_seen": 35040765, + "step": 1655, + "time_per_iteration": 2.642764091491699 + }, + { + "auxiliary_loss_clip": 0.01092553, + "auxiliary_loss_mlp": 0.01003814, + "balance_loss_clip": 1.01614141, + "balance_loss_mlp": 1.00175214, + "epoch": 0.19912222689833464, + "flos": 67915805673600.0, + "grad_norm": 0.7602396597383855, + "language_loss": 0.58246648, + "learning_rate": 3.707535257006777e-06, + "loss": 0.60343015, + "num_input_tokens_seen": 35106390, + "step": 1656, + "time_per_iteration": 3.130281448364258 + }, + { + "auxiliary_loss_clip": 0.0118961, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.05609274, + "balance_loss_mlp": 1.02186584, + "epoch": 0.19924246978897373, + "flos": 15742340916480.0, + "grad_norm": 2.232409395644005, + "language_loss": 0.8809886, + "learning_rate": 3.707129552610981e-06, + "loss": 0.90320802, + "num_input_tokens_seen": 35125040, + "step": 1657, + "time_per_iteration": 2.4922120571136475 + }, + { + "auxiliary_loss_clip": 0.01182924, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.05760002, + "balance_loss_mlp": 1.02568483, + "epoch": 0.1993627126796128, + "flos": 17566566986880.0, + "grad_norm": 1.759749758260652, + "language_loss": 0.73607385, + "learning_rate": 3.70672358924499e-06, + "loss": 0.75826037, + "num_input_tokens_seen": 35144280, + "step": 1658, + "time_per_iteration": 2.4886486530303955 + }, + { + "auxiliary_loss_clip": 0.01172397, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.0586555, + "balance_loss_mlp": 1.02539384, + "epoch": 0.19948295557025192, + "flos": 40843826680320.0, + "grad_norm": 1.8333911168292059, + "language_loss": 0.78589761, + "learning_rate": 3.706317366970386e-06, + "loss": 0.80797487, + "num_input_tokens_seen": 35165280, + "step": 1659, + "time_per_iteration": 2.710585832595825 + }, + { + "auxiliary_loss_clip": 0.01218612, + "auxiliary_loss_mlp": 0.00765067, + "balance_loss_clip": 1.05762434, + "balance_loss_mlp": 1.00105202, + "epoch": 0.199603198460891, + "flos": 25082418620160.0, + "grad_norm": 2.488692235227868, + "language_loss": 0.83600092, + "learning_rate": 3.705910885848795e-06, + "loss": 0.8558377, + "num_input_tokens_seen": 35183655, + "step": 1660, + "time_per_iteration": 2.5000269412994385 + }, + { + "auxiliary_loss_clip": 0.01201818, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.05801439, + "balance_loss_mlp": 1.02146721, + "epoch": 0.19972344135153008, + "flos": 20084120352000.0, + "grad_norm": 2.250405896791877, + "language_loss": 0.8469975, + "learning_rate": 3.705504145941879e-06, + "loss": 0.86932707, + "num_input_tokens_seen": 35201825, + "step": 1661, + "time_per_iteration": 2.4888007640838623 + }, + { + "auxiliary_loss_clip": 0.01213866, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.05741239, + "balance_loss_mlp": 1.02055192, + "epoch": 0.1998436842421692, + "flos": 23727472761600.0, + "grad_norm": 1.8771092475082727, + "language_loss": 0.78576016, + "learning_rate": 3.7050971473113403e-06, + "loss": 0.80820143, + "num_input_tokens_seen": 35221600, + "step": 1662, + "time_per_iteration": 2.46854829788208 + }, + { + "auxiliary_loss_clip": 0.01197472, + "auxiliary_loss_mlp": 0.00764493, + "balance_loss_clip": 1.05537224, + "balance_loss_mlp": 1.00108755, + "epoch": 0.19996392713280828, + "flos": 36102361633920.0, + "grad_norm": 1.6675011010367748, + "language_loss": 0.80218118, + "learning_rate": 3.7046898900189196e-06, + "loss": 0.82180077, + "num_input_tokens_seen": 35245935, + "step": 1663, + "time_per_iteration": 2.6280698776245117 + }, + { + "auxiliary_loss_clip": 0.0117656, + "auxiliary_loss_mlp": 0.01040464, + "balance_loss_clip": 1.05628514, + "balance_loss_mlp": 1.0303427, + "epoch": 0.20008417002344736, + "flos": 23657662679040.0, + "grad_norm": 1.7391775279154624, + "language_loss": 0.83181179, + "learning_rate": 3.704282374126398e-06, + "loss": 0.85398203, + "num_input_tokens_seen": 35265615, + "step": 1664, + "time_per_iteration": 2.5513432025909424 + }, + { + "auxiliary_loss_clip": 0.01170727, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.05351782, + "balance_loss_mlp": 1.02230692, + "epoch": 0.20020441291408644, + "flos": 21872076664320.0, + "grad_norm": 1.6617546058686343, + "language_loss": 0.8726626, + "learning_rate": 3.7038745996955954e-06, + "loss": 0.8946923, + "num_input_tokens_seen": 35284960, + "step": 1665, + "time_per_iteration": 2.6449663639068604 + }, + { + "auxiliary_loss_clip": 0.01178737, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.05480957, + "balance_loss_mlp": 1.02677798, + "epoch": 0.20032465580472555, + "flos": 23179691376000.0, + "grad_norm": 2.7600952746067784, + "language_loss": 0.72366995, + "learning_rate": 3.703466566788371e-06, + "loss": 0.74582219, + "num_input_tokens_seen": 35304090, + "step": 1666, + "time_per_iteration": 2.606410264968872 + }, + { + "auxiliary_loss_clip": 0.01181743, + "auxiliary_loss_mlp": 0.01034786, + "balance_loss_clip": 1.05755997, + "balance_loss_mlp": 1.02415264, + "epoch": 0.20044489869536464, + "flos": 23873521461120.0, + "grad_norm": 2.0062847993319264, + "language_loss": 0.74162829, + "learning_rate": 3.703058275466622e-06, + "loss": 0.76379359, + "num_input_tokens_seen": 35323325, + "step": 1667, + "time_per_iteration": 2.5616540908813477 + }, + { + "auxiliary_loss_clip": 0.01186226, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.05516648, + "balance_loss_mlp": 1.02740383, + "epoch": 0.20056514158600372, + "flos": 21945226711680.0, + "grad_norm": 1.8249108848212514, + "language_loss": 0.77866697, + "learning_rate": 3.7026497257922877e-06, + "loss": 0.80090028, + "num_input_tokens_seen": 35343635, + "step": 1668, + "time_per_iteration": 2.5382180213928223 + }, + { + "auxiliary_loss_clip": 0.01152716, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_clip": 1.04990566, + "balance_loss_mlp": 1.03301907, + "epoch": 0.20068538447664283, + "flos": 23879159896320.0, + "grad_norm": 1.6374220452727093, + "language_loss": 0.8525337, + "learning_rate": 3.7022409178273436e-06, + "loss": 0.87449205, + "num_input_tokens_seen": 35364615, + "step": 1669, + "time_per_iteration": 2.645381450653076 + }, + { + "auxiliary_loss_clip": 0.01196892, + "auxiliary_loss_mlp": 0.01027836, + "balance_loss_clip": 1.05584598, + "balance_loss_mlp": 1.01817405, + "epoch": 0.2008056273672819, + "flos": 18442823270400.0, + "grad_norm": 1.832103491483546, + "language_loss": 0.7848022, + "learning_rate": 3.7018318516338054e-06, + "loss": 0.80704951, + "num_input_tokens_seen": 35383775, + "step": 1670, + "time_per_iteration": 3.252002477645874 + }, + { + "auxiliary_loss_clip": 0.01204768, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_clip": 1.05783308, + "balance_loss_mlp": 1.02007866, + "epoch": 0.200925870257921, + "flos": 23659530186240.0, + "grad_norm": 2.539844508064278, + "language_loss": 0.81623685, + "learning_rate": 3.7014225272737284e-06, + "loss": 0.83857882, + "num_input_tokens_seen": 35403000, + "step": 1671, + "time_per_iteration": 2.519498109817505 + }, + { + "auxiliary_loss_clip": 0.01195876, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.05619872, + "balance_loss_mlp": 1.02248967, + "epoch": 0.20104611314856008, + "flos": 16217115909120.0, + "grad_norm": 2.411971724313284, + "language_loss": 0.74052346, + "learning_rate": 3.701012944809207e-06, + "loss": 0.76280999, + "num_input_tokens_seen": 35420115, + "step": 1672, + "time_per_iteration": 2.4838919639587402 + }, + { + "auxiliary_loss_clip": 0.01186075, + "auxiliary_loss_mlp": 0.0076433, + "balance_loss_clip": 1.05717111, + "balance_loss_mlp": 1.00110435, + "epoch": 0.2011663560391992, + "flos": 21397373498880.0, + "grad_norm": 1.9561852526032824, + "language_loss": 0.78730863, + "learning_rate": 3.700603104302374e-06, + "loss": 0.80681264, + "num_input_tokens_seen": 35439925, + "step": 1673, + "time_per_iteration": 2.536982297897339 + }, + { + "auxiliary_loss_clip": 0.01055517, + "auxiliary_loss_mlp": 0.01005402, + "balance_loss_clip": 1.01344037, + "balance_loss_mlp": 1.00310159, + "epoch": 0.20128659892983827, + "flos": 62229459409920.0, + "grad_norm": 0.9107295701348281, + "language_loss": 0.55953407, + "learning_rate": 3.7001930058154027e-06, + "loss": 0.58014321, + "num_input_tokens_seen": 35504885, + "step": 1674, + "time_per_iteration": 4.710182428359985 + }, + { + "auxiliary_loss_clip": 0.01171592, + "auxiliary_loss_mlp": 0.01039059, + "balance_loss_clip": 1.05443072, + "balance_loss_mlp": 1.0284797, + "epoch": 0.20140684182047736, + "flos": 28438737448320.0, + "grad_norm": 5.198372201749318, + "language_loss": 0.79663372, + "learning_rate": 3.6997826494105037e-06, + "loss": 0.81874019, + "num_input_tokens_seen": 35525330, + "step": 1675, + "time_per_iteration": 3.3821511268615723 + }, + { + "auxiliary_loss_clip": 0.01187997, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.05710685, + "balance_loss_mlp": 1.0229342, + "epoch": 0.20152708471111647, + "flos": 28074064619520.0, + "grad_norm": 2.009326076317382, + "language_loss": 0.69721359, + "learning_rate": 3.6993720351499286e-06, + "loss": 0.71941674, + "num_input_tokens_seen": 35546455, + "step": 1676, + "time_per_iteration": 2.563492774963379 + }, + { + "auxiliary_loss_clip": 0.01182403, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.05858994, + "balance_loss_mlp": 1.0229044, + "epoch": 0.20164732760175555, + "flos": 23549751244800.0, + "grad_norm": 1.781870782150959, + "language_loss": 0.76199687, + "learning_rate": 3.6989611630959666e-06, + "loss": 0.78414816, + "num_input_tokens_seen": 35565010, + "step": 1677, + "time_per_iteration": 2.5501210689544678 + }, + { + "auxiliary_loss_clip": 0.01098137, + "auxiliary_loss_mlp": 0.01000794, + "balance_loss_clip": 1.01831424, + "balance_loss_mlp": 0.99873161, + "epoch": 0.20176757049239463, + "flos": 71100616037760.0, + "grad_norm": 0.6833296173510243, + "language_loss": 0.58339119, + "learning_rate": 3.6985500333109474e-06, + "loss": 0.60438055, + "num_input_tokens_seen": 35633340, + "step": 1678, + "time_per_iteration": 3.13846755027771 + }, + { + "auxiliary_loss_clip": 0.011641, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.05166793, + "balance_loss_mlp": 1.02489638, + "epoch": 0.20188781338303372, + "flos": 21430159637760.0, + "grad_norm": 2.7295621534229224, + "language_loss": 0.76672274, + "learning_rate": 3.6981386458572385e-06, + "loss": 0.78870511, + "num_input_tokens_seen": 35651315, + "step": 1679, + "time_per_iteration": 2.6262078285217285 + }, + { + "auxiliary_loss_clip": 0.01168772, + "auxiliary_loss_mlp": 0.01037212, + "balance_loss_clip": 1.05401015, + "balance_loss_mlp": 1.026829, + "epoch": 0.20200805627367283, + "flos": 11546215130880.0, + "grad_norm": 2.953878397388236, + "language_loss": 0.76264822, + "learning_rate": 3.6977270007972468e-06, + "loss": 0.78470814, + "num_input_tokens_seen": 35668850, + "step": 1680, + "time_per_iteration": 2.637423515319824 + }, + { + "auxiliary_loss_clip": 0.01190005, + "auxiliary_loss_mlp": 0.01034148, + "balance_loss_clip": 1.05696428, + "balance_loss_mlp": 1.02417016, + "epoch": 0.2021282991643119, + "flos": 28545391906560.0, + "grad_norm": 2.542165282447668, + "language_loss": 0.72758234, + "learning_rate": 3.6973150981934196e-06, + "loss": 0.74982381, + "num_input_tokens_seen": 35690080, + "step": 1681, + "time_per_iteration": 2.6684410572052 + }, + { + "auxiliary_loss_clip": 0.01221177, + "auxiliary_loss_mlp": 0.01040016, + "balance_loss_clip": 1.06109345, + "balance_loss_mlp": 1.02960372, + "epoch": 0.202248542054951, + "flos": 17923446564480.0, + "grad_norm": 2.4038106783506357, + "language_loss": 0.83478969, + "learning_rate": 3.6969029381082415e-06, + "loss": 0.85740161, + "num_input_tokens_seen": 35706075, + "step": 1682, + "time_per_iteration": 2.445946216583252 + }, + { + "auxiliary_loss_clip": 0.01185761, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.05703402, + "balance_loss_mlp": 1.02344918, + "epoch": 0.2023687849455901, + "flos": 19864634296320.0, + "grad_norm": 1.7832035296568476, + "language_loss": 0.7962265, + "learning_rate": 3.696490520604237e-06, + "loss": 0.8184182, + "num_input_tokens_seen": 35724765, + "step": 1683, + "time_per_iteration": 2.556016206741333 + }, + { + "auxiliary_loss_clip": 0.01196538, + "auxiliary_loss_mlp": 0.01027236, + "balance_loss_clip": 1.05771804, + "balance_loss_mlp": 1.01793718, + "epoch": 0.20248902783622919, + "flos": 22564721600640.0, + "grad_norm": 1.6311982192069332, + "language_loss": 0.80746168, + "learning_rate": 3.696077845743968e-06, + "loss": 0.8296994, + "num_input_tokens_seen": 35744355, + "step": 1684, + "time_per_iteration": 2.5150396823883057 + }, + { + "auxiliary_loss_clip": 0.01221644, + "auxiliary_loss_mlp": 0.01037182, + "balance_loss_clip": 1.06109774, + "balance_loss_mlp": 1.02646542, + "epoch": 0.20260927072686827, + "flos": 22709728805760.0, + "grad_norm": 3.0096978607057507, + "language_loss": 0.73074698, + "learning_rate": 3.69566491359004e-06, + "loss": 0.75333524, + "num_input_tokens_seen": 35761000, + "step": 1685, + "time_per_iteration": 2.5037107467651367 + }, + { + "auxiliary_loss_clip": 0.01186118, + "auxiliary_loss_mlp": 0.01033785, + "balance_loss_clip": 1.05625176, + "balance_loss_mlp": 1.02313995, + "epoch": 0.20272951361750738, + "flos": 51023998650240.0, + "grad_norm": 1.723040119623834, + "language_loss": 0.69435787, + "learning_rate": 3.695251724205092e-06, + "loss": 0.71655691, + "num_input_tokens_seen": 35785360, + "step": 1686, + "time_per_iteration": 2.7909364700317383 + }, + { + "auxiliary_loss_clip": 0.01217587, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.0604496, + "balance_loss_mlp": 1.02481103, + "epoch": 0.20284975650814646, + "flos": 26578133879040.0, + "grad_norm": 1.5944320645738073, + "language_loss": 0.86337817, + "learning_rate": 3.6948382776518054e-06, + "loss": 0.8859055, + "num_input_tokens_seen": 35806065, + "step": 1687, + "time_per_iteration": 2.5082156658172607 + }, + { + "auxiliary_loss_clip": 0.01182622, + "auxiliary_loss_mlp": 0.01044915, + "balance_loss_clip": 1.05501676, + "balance_loss_mlp": 1.03475857, + "epoch": 0.20296999939878554, + "flos": 16034222833920.0, + "grad_norm": 2.237741163203962, + "language_loss": 0.79793084, + "learning_rate": 3.6944245739929e-06, + "loss": 0.82020622, + "num_input_tokens_seen": 35822225, + "step": 1688, + "time_per_iteration": 2.5531702041625977 + }, + { + "auxiliary_loss_clip": 0.0120444, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_clip": 1.0592041, + "balance_loss_mlp": 1.03206992, + "epoch": 0.20309024228942463, + "flos": 19203374868480.0, + "grad_norm": 2.466695297757984, + "language_loss": 0.72109705, + "learning_rate": 3.6940106132911332e-06, + "loss": 0.74356806, + "num_input_tokens_seen": 35839410, + "step": 1689, + "time_per_iteration": 2.575395345687866 + }, + { + "auxiliary_loss_clip": 0.01205377, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.05915689, + "balance_loss_mlp": 1.02291954, + "epoch": 0.20321048518006374, + "flos": 22821087945600.0, + "grad_norm": 1.8086022020501362, + "language_loss": 0.88975108, + "learning_rate": 3.6935963956093037e-06, + "loss": 0.91212922, + "num_input_tokens_seen": 35859495, + "step": 1690, + "time_per_iteration": 2.5458178520202637 + }, + { + "auxiliary_loss_clip": 0.01194388, + "auxiliary_loss_mlp": 0.01034093, + "balance_loss_clip": 1.05589962, + "balance_loss_mlp": 1.02449036, + "epoch": 0.20333072807070282, + "flos": 19096397187840.0, + "grad_norm": 1.7384448339389797, + "language_loss": 0.69008064, + "learning_rate": 3.6931819210102474e-06, + "loss": 0.71236545, + "num_input_tokens_seen": 35878890, + "step": 1691, + "time_per_iteration": 2.4805192947387695 + }, + { + "auxiliary_loss_clip": 0.01221821, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.06139576, + "balance_loss_mlp": 1.02383673, + "epoch": 0.2034509709613419, + "flos": 18180962144640.0, + "grad_norm": 1.7607714682386926, + "language_loss": 0.84233224, + "learning_rate": 3.6927671895568402e-06, + "loss": 0.86489564, + "num_input_tokens_seen": 35897950, + "step": 1692, + "time_per_iteration": 2.4542996883392334 + }, + { + "auxiliary_loss_clip": 0.01220644, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.06315088, + "balance_loss_mlp": 1.02538252, + "epoch": 0.20357121385198101, + "flos": 22923899648640.0, + "grad_norm": 1.9228970591890346, + "language_loss": 0.86682487, + "learning_rate": 3.692352201311996e-06, + "loss": 0.88938701, + "num_input_tokens_seen": 35916800, + "step": 1693, + "time_per_iteration": 2.441546678543091 + }, + { + "auxiliary_loss_clip": 0.01169847, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.05487382, + "balance_loss_mlp": 1.02031589, + "epoch": 0.2036914567426201, + "flos": 20922131629440.0, + "grad_norm": 1.8337974063159488, + "language_loss": 0.76770854, + "learning_rate": 3.6919369563386687e-06, + "loss": 0.7897101, + "num_input_tokens_seen": 35936600, + "step": 1694, + "time_per_iteration": 2.561253070831299 + }, + { + "auxiliary_loss_clip": 0.01186778, + "auxiliary_loss_mlp": 0.01032657, + "balance_loss_clip": 1.05872846, + "balance_loss_mlp": 1.02314389, + "epoch": 0.20381169963325918, + "flos": 15519155760000.0, + "grad_norm": 2.0172257156732334, + "language_loss": 0.78715062, + "learning_rate": 3.69152145469985e-06, + "loss": 0.80934501, + "num_input_tokens_seen": 35953645, + "step": 1695, + "time_per_iteration": 2.4953250885009766 + }, + { + "auxiliary_loss_clip": 0.01163673, + "auxiliary_loss_mlp": 0.01047411, + "balance_loss_clip": 1.05437469, + "balance_loss_mlp": 1.03564477, + "epoch": 0.20393194252389826, + "flos": 28833143760000.0, + "grad_norm": 1.9197934423681768, + "language_loss": 0.82247829, + "learning_rate": 3.691105696458572e-06, + "loss": 0.84458917, + "num_input_tokens_seen": 35970940, + "step": 1696, + "time_per_iteration": 2.6696581840515137 + }, + { + "auxiliary_loss_clip": 0.01220422, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.062994, + "balance_loss_mlp": 1.01984966, + "epoch": 0.20405218541453737, + "flos": 22488554810880.0, + "grad_norm": 3.6112542634842035, + "language_loss": 0.6835922, + "learning_rate": 3.690689681677904e-06, + "loss": 0.70609498, + "num_input_tokens_seen": 35989410, + "step": 1697, + "time_per_iteration": 3.2390434741973877 + }, + { + "auxiliary_loss_clip": 0.0119025, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.05720317, + "balance_loss_mlp": 1.02214408, + "epoch": 0.20417242830517646, + "flos": 25374408278400.0, + "grad_norm": 1.7237530590599155, + "language_loss": 0.88570535, + "learning_rate": 3.690273410420956e-06, + "loss": 0.90792394, + "num_input_tokens_seen": 36009175, + "step": 1698, + "time_per_iteration": 2.5606610774993896 + }, + { + "auxiliary_loss_clip": 0.01203356, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.05842781, + "balance_loss_mlp": 1.02417159, + "epoch": 0.20429267119581554, + "flos": 14793078240000.0, + "grad_norm": 2.2285271879809545, + "language_loss": 0.76621985, + "learning_rate": 3.689856882750875e-06, + "loss": 0.78859204, + "num_input_tokens_seen": 36024375, + "step": 1699, + "time_per_iteration": 2.4792251586914062 + }, + { + "auxiliary_loss_clip": 0.0120215, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.05933189, + "balance_loss_mlp": 1.02540588, + "epoch": 0.20441291408645465, + "flos": 17781851151360.0, + "grad_norm": 1.828151323991799, + "language_loss": 0.79068208, + "learning_rate": 3.6894400987308486e-06, + "loss": 0.8130523, + "num_input_tokens_seen": 36041895, + "step": 1700, + "time_per_iteration": 2.486024856567383 + }, + { + "auxiliary_loss_clip": 0.01209511, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.06178021, + "balance_loss_mlp": 1.02455342, + "epoch": 0.20453315697709373, + "flos": 16435668211200.0, + "grad_norm": 3.5757465991175073, + "language_loss": 0.85230279, + "learning_rate": 3.6890230584241024e-06, + "loss": 0.87475109, + "num_input_tokens_seen": 36058825, + "step": 1701, + "time_per_iteration": 4.0455498695373535 + }, + { + "auxiliary_loss_clip": 0.01112846, + "auxiliary_loss_mlp": 0.01004111, + "balance_loss_clip": 1.02100849, + "balance_loss_mlp": 1.00197744, + "epoch": 0.20465339986773282, + "flos": 66713085653760.0, + "grad_norm": 1.0719731407861899, + "language_loss": 0.66419625, + "learning_rate": 3.6886057618939016e-06, + "loss": 0.6853658, + "num_input_tokens_seen": 36121645, + "step": 1702, + "time_per_iteration": 3.8683626651763916 + }, + { + "auxiliary_loss_clip": 0.01169713, + "auxiliary_loss_mlp": 0.01041533, + "balance_loss_clip": 1.05503821, + "balance_loss_mlp": 1.03051209, + "epoch": 0.2047736427583719, + "flos": 41974114924800.0, + "grad_norm": 2.3158667787626457, + "language_loss": 0.69203949, + "learning_rate": 3.6881882092035492e-06, + "loss": 0.71415192, + "num_input_tokens_seen": 36143030, + "step": 1703, + "time_per_iteration": 2.734644889831543 + }, + { + "auxiliary_loss_clip": 0.01083386, + "auxiliary_loss_mlp": 0.00755565, + "balance_loss_clip": 1.01988912, + "balance_loss_mlp": 1.00055969, + "epoch": 0.204893885649011, + "flos": 69940878641280.0, + "grad_norm": 0.9324236849058122, + "language_loss": 0.61235088, + "learning_rate": 3.6877704004163873e-06, + "loss": 0.63074046, + "num_input_tokens_seen": 36203435, + "step": 1704, + "time_per_iteration": 3.2693991661071777 + }, + { + "auxiliary_loss_clip": 0.0122273, + "auxiliary_loss_mlp": 0.01036027, + "balance_loss_clip": 1.06252098, + "balance_loss_mlp": 1.02524495, + "epoch": 0.2050141285396501, + "flos": 22200012858240.0, + "grad_norm": 1.6698707095611067, + "language_loss": 0.77673626, + "learning_rate": 3.6873523355957984e-06, + "loss": 0.7993238, + "num_input_tokens_seen": 36222435, + "step": 1705, + "time_per_iteration": 2.47055721282959 + }, + { + "auxiliary_loss_clip": 0.01111419, + "auxiliary_loss_mlp": 0.00999443, + "balance_loss_clip": 1.01974154, + "balance_loss_mlp": 0.99738085, + "epoch": 0.20513437143028918, + "flos": 46283721730560.0, + "grad_norm": 1.1205815225854363, + "language_loss": 0.64079398, + "learning_rate": 3.686934014805201e-06, + "loss": 0.66190261, + "num_input_tokens_seen": 36273065, + "step": 1706, + "time_per_iteration": 2.871128559112549 + }, + { + "auxiliary_loss_clip": 0.01204809, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.062078, + "balance_loss_mlp": 1.03142786, + "epoch": 0.20525461432092829, + "flos": 21904324099200.0, + "grad_norm": 1.795873376640081, + "language_loss": 0.80734956, + "learning_rate": 3.6865154381080552e-06, + "loss": 0.82981586, + "num_input_tokens_seen": 36293750, + "step": 1707, + "time_per_iteration": 2.5786502361297607 + }, + { + "auxiliary_loss_clip": 0.01129005, + "auxiliary_loss_mlp": 0.01029924, + "balance_loss_clip": 1.05120397, + "balance_loss_mlp": 1.02022624, + "epoch": 0.20537485721156737, + "flos": 21214264942080.0, + "grad_norm": 1.9600475368951487, + "language_loss": 0.82489508, + "learning_rate": 3.6860966055678585e-06, + "loss": 0.84648442, + "num_input_tokens_seen": 36310105, + "step": 1708, + "time_per_iteration": 2.6414997577667236 + }, + { + "auxiliary_loss_clip": 0.01209047, + "auxiliary_loss_mlp": 0.01039522, + "balance_loss_clip": 1.06442308, + "balance_loss_mlp": 1.02847731, + "epoch": 0.20549510010220645, + "flos": 20191205773440.0, + "grad_norm": 2.2034082575863834, + "language_loss": 0.86350727, + "learning_rate": 3.685677517248147e-06, + "loss": 0.88599288, + "num_input_tokens_seen": 36328995, + "step": 1709, + "time_per_iteration": 2.5042967796325684 + }, + { + "auxiliary_loss_clip": 0.01192444, + "auxiliary_loss_mlp": 0.00764633, + "balance_loss_clip": 1.06431222, + "balance_loss_mlp": 1.00099659, + "epoch": 0.20561534299284553, + "flos": 17016702612480.0, + "grad_norm": 3.785651887043093, + "language_loss": 0.80551982, + "learning_rate": 3.6852581732124967e-06, + "loss": 0.82509065, + "num_input_tokens_seen": 36346340, + "step": 1710, + "time_per_iteration": 2.507089138031006 + }, + { + "auxiliary_loss_clip": 0.01209695, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.06323481, + "balance_loss_mlp": 1.02603686, + "epoch": 0.20573558588348465, + "flos": 22890467064960.0, + "grad_norm": 1.8223009128082983, + "language_loss": 0.76065892, + "learning_rate": 3.6848385735245213e-06, + "loss": 0.78312355, + "num_input_tokens_seen": 36365430, + "step": 1711, + "time_per_iteration": 2.503385543823242 + }, + { + "auxiliary_loss_clip": 0.01188438, + "auxiliary_loss_mlp": 0.01031756, + "balance_loss_clip": 1.05326176, + "balance_loss_mlp": 1.02157557, + "epoch": 0.20585582877412373, + "flos": 24643123286400.0, + "grad_norm": 1.7384755900747544, + "language_loss": 0.8598696, + "learning_rate": 3.6844187182478734e-06, + "loss": 0.88207155, + "num_input_tokens_seen": 36386285, + "step": 1712, + "time_per_iteration": 2.521012783050537 + }, + { + "auxiliary_loss_clip": 0.01181126, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.05523884, + "balance_loss_mlp": 1.02143645, + "epoch": 0.2059760716647628, + "flos": 24206952435840.0, + "grad_norm": 1.691978998180068, + "language_loss": 0.74836409, + "learning_rate": 3.683998607446246e-06, + "loss": 0.77049065, + "num_input_tokens_seen": 36404935, + "step": 1713, + "time_per_iteration": 2.5693726539611816 + }, + { + "auxiliary_loss_clip": 0.01207266, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.06324625, + "balance_loss_mlp": 1.0346477, + "epoch": 0.20609631455540192, + "flos": 20229522606720.0, + "grad_norm": 1.8429271713491155, + "language_loss": 0.74775994, + "learning_rate": 3.6835782411833686e-06, + "loss": 0.77027369, + "num_input_tokens_seen": 36424455, + "step": 1714, + "time_per_iteration": 2.487229824066162 + }, + { + "auxiliary_loss_clip": 0.01166494, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.05518532, + "balance_loss_mlp": 1.02321541, + "epoch": 0.206216557446041, + "flos": 19864957518720.0, + "grad_norm": 1.7190765975867446, + "language_loss": 0.74300838, + "learning_rate": 3.68315761952301e-06, + "loss": 0.76501203, + "num_input_tokens_seen": 36441685, + "step": 1715, + "time_per_iteration": 2.535928249359131 + }, + { + "auxiliary_loss_clip": 0.01223011, + "auxiliary_loss_mlp": 0.01036953, + "balance_loss_clip": 1.06341386, + "balance_loss_mlp": 1.0267489, + "epoch": 0.2063368003366801, + "flos": 24096311568000.0, + "grad_norm": 1.7915560356376032, + "language_loss": 0.82808685, + "learning_rate": 3.6827367425289797e-06, + "loss": 0.85068643, + "num_input_tokens_seen": 36461460, + "step": 1716, + "time_per_iteration": 2.4918673038482666 + }, + { + "auxiliary_loss_clip": 0.01192669, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.05889821, + "balance_loss_mlp": 1.02372646, + "epoch": 0.2064570432273192, + "flos": 20340163474560.0, + "grad_norm": 2.103300796368785, + "language_loss": 0.71920276, + "learning_rate": 3.6823156102651225e-06, + "loss": 0.74147922, + "num_input_tokens_seen": 36479615, + "step": 1717, + "time_per_iteration": 2.520785093307495 + }, + { + "auxiliary_loss_clip": 0.01133846, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.05523443, + "balance_loss_mlp": 1.02098167, + "epoch": 0.20657728611795828, + "flos": 20520363029760.0, + "grad_norm": 1.834107188035738, + "language_loss": 0.70608163, + "learning_rate": 3.6818942227953257e-06, + "loss": 0.72773427, + "num_input_tokens_seen": 36500160, + "step": 1718, + "time_per_iteration": 2.611187696456909 + }, + { + "auxiliary_loss_clip": 0.01177567, + "auxiliary_loss_mlp": 0.01030954, + "balance_loss_clip": 1.06028497, + "balance_loss_mlp": 1.02053511, + "epoch": 0.20669752900859736, + "flos": 21799285752960.0, + "grad_norm": 2.392881829218023, + "language_loss": 0.68568128, + "learning_rate": 3.681472580183512e-06, + "loss": 0.70776647, + "num_input_tokens_seen": 36518810, + "step": 1719, + "time_per_iteration": 2.5752499103546143 + }, + { + "auxiliary_loss_clip": 0.01203903, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.06199515, + "balance_loss_mlp": 1.02438831, + "epoch": 0.20681777189923645, + "flos": 15122020014720.0, + "grad_norm": 2.309358121644905, + "language_loss": 0.86481899, + "learning_rate": 3.6810506824936455e-06, + "loss": 0.88719749, + "num_input_tokens_seen": 36536890, + "step": 1720, + "time_per_iteration": 2.520603895187378 + }, + { + "auxiliary_loss_clip": 0.01089199, + "auxiliary_loss_mlp": 0.01008125, + "balance_loss_clip": 1.0203501, + "balance_loss_mlp": 1.00603855, + "epoch": 0.20693801478987556, + "flos": 56481021509760.0, + "grad_norm": 1.0776509664160638, + "language_loss": 0.62520474, + "learning_rate": 3.680628529789726e-06, + "loss": 0.64617795, + "num_input_tokens_seen": 36589300, + "step": 1721, + "time_per_iteration": 2.9353764057159424 + }, + { + "auxiliary_loss_clip": 0.0122905, + "auxiliary_loss_mlp": 0.01039051, + "balance_loss_clip": 1.06557178, + "balance_loss_mlp": 1.02834034, + "epoch": 0.20705825768051464, + "flos": 21614201948160.0, + "grad_norm": 1.8525857562783208, + "language_loss": 0.86149108, + "learning_rate": 3.680206122135796e-06, + "loss": 0.88417202, + "num_input_tokens_seen": 36609905, + "step": 1722, + "time_per_iteration": 2.496194362640381 + }, + { + "auxiliary_loss_clip": 0.01171676, + "auxiliary_loss_mlp": 0.01040995, + "balance_loss_clip": 1.0646323, + "balance_loss_mlp": 1.03131533, + "epoch": 0.20717850057115372, + "flos": 25848895962240.0, + "grad_norm": 1.9382955080805875, + "language_loss": 0.78472233, + "learning_rate": 3.6797834595959323e-06, + "loss": 0.806849, + "num_input_tokens_seen": 36629805, + "step": 1723, + "time_per_iteration": 3.427685022354126 + }, + { + "auxiliary_loss_clip": 0.01150431, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_clip": 1.0517199, + "balance_loss_mlp": 1.03330684, + "epoch": 0.20729874346179283, + "flos": 29130807767040.0, + "grad_norm": 2.5801131144618594, + "language_loss": 0.78055739, + "learning_rate": 3.679360542234254e-06, + "loss": 0.80250525, + "num_input_tokens_seen": 36649150, + "step": 1724, + "time_per_iteration": 2.663532257080078 + }, + { + "auxiliary_loss_clip": 0.01183232, + "auxiliary_loss_mlp": 0.00765302, + "balance_loss_clip": 1.05384195, + "balance_loss_mlp": 1.00114787, + "epoch": 0.20741898635243192, + "flos": 29023363209600.0, + "grad_norm": 1.6014247436864042, + "language_loss": 0.72176039, + "learning_rate": 3.678937370114916e-06, + "loss": 0.74124569, + "num_input_tokens_seen": 36668955, + "step": 1725, + "time_per_iteration": 2.601590633392334 + }, + { + "auxiliary_loss_clip": 0.01183905, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.05930698, + "balance_loss_mlp": 1.02156115, + "epoch": 0.207539229243071, + "flos": 15559447841280.0, + "grad_norm": 2.183957041646784, + "language_loss": 0.7882874, + "learning_rate": 3.678513943302114e-06, + "loss": 0.81043375, + "num_input_tokens_seen": 36685730, + "step": 1726, + "time_per_iteration": 2.5043563842773438 + }, + { + "auxiliary_loss_clip": 0.01219822, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.06239057, + "balance_loss_mlp": 1.02680588, + "epoch": 0.20765947213371008, + "flos": 20521081301760.0, + "grad_norm": 1.6932208378009808, + "language_loss": 0.8495571, + "learning_rate": 3.678090261860082e-06, + "loss": 0.8721205, + "num_input_tokens_seen": 36705460, + "step": 1727, + "time_per_iteration": 4.081583261489868 + }, + { + "auxiliary_loss_clip": 0.0117357, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.05225372, + "balance_loss_mlp": 1.02663541, + "epoch": 0.2077797150243492, + "flos": 19354415558400.0, + "grad_norm": 1.984075806545991, + "language_loss": 0.77799201, + "learning_rate": 3.6776663258530906e-06, + "loss": 0.80009168, + "num_input_tokens_seen": 36724110, + "step": 1728, + "time_per_iteration": 2.607701063156128 + }, + { + "auxiliary_loss_clip": 0.01207598, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.06011534, + "balance_loss_mlp": 1.02529979, + "epoch": 0.20789995791498828, + "flos": 21829952989440.0, + "grad_norm": 1.8519485664591513, + "language_loss": 0.71196914, + "learning_rate": 3.6772421353454516e-06, + "loss": 0.73439085, + "num_input_tokens_seen": 36742705, + "step": 1729, + "time_per_iteration": 3.2758066654205322 + }, + { + "auxiliary_loss_clip": 0.0120397, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.06211197, + "balance_loss_mlp": 1.02401853, + "epoch": 0.20802020080562736, + "flos": 23148844571520.0, + "grad_norm": 1.7812683787548897, + "language_loss": 0.88521457, + "learning_rate": 3.6768176904015153e-06, + "loss": 0.90759289, + "num_input_tokens_seen": 36762510, + "step": 1730, + "time_per_iteration": 2.5024852752685547 + }, + { + "auxiliary_loss_clip": 0.01205411, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.05969548, + "balance_loss_mlp": 1.02620566, + "epoch": 0.20814044369626647, + "flos": 23072677781760.0, + "grad_norm": 2.213567118165345, + "language_loss": 0.60290831, + "learning_rate": 3.6763929910856674e-06, + "loss": 0.62532151, + "num_input_tokens_seen": 36780960, + "step": 1731, + "time_per_iteration": 2.534743309020996 + }, + { + "auxiliary_loss_clip": 0.01206469, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.06370175, + "balance_loss_mlp": 1.03710508, + "epoch": 0.20826068658690555, + "flos": 19608016556160.0, + "grad_norm": 2.078745035374867, + "language_loss": 0.77653372, + "learning_rate": 3.6759680374623365e-06, + "loss": 0.79907048, + "num_input_tokens_seen": 36798875, + "step": 1732, + "time_per_iteration": 2.4726674556732178 + }, + { + "auxiliary_loss_clip": 0.01219617, + "auxiliary_loss_mlp": 0.01033647, + "balance_loss_clip": 1.06351006, + "balance_loss_mlp": 1.02390122, + "epoch": 0.20838092947754464, + "flos": 25374049142400.0, + "grad_norm": 2.263062264022414, + "language_loss": 0.75634426, + "learning_rate": 3.675542829595986e-06, + "loss": 0.7788769, + "num_input_tokens_seen": 36818540, + "step": 1733, + "time_per_iteration": 2.502472400665283 + }, + { + "auxiliary_loss_clip": 0.01188989, + "auxiliary_loss_mlp": 0.01038817, + "balance_loss_clip": 1.05798209, + "balance_loss_mlp": 1.02882719, + "epoch": 0.20850117236818372, + "flos": 24061729749120.0, + "grad_norm": 1.4996788444949536, + "language_loss": 0.79202712, + "learning_rate": 3.6751173675511213e-06, + "loss": 0.81430525, + "num_input_tokens_seen": 36840585, + "step": 1734, + "time_per_iteration": 2.5345499515533447 + }, + { + "auxiliary_loss_clip": 0.01184988, + "auxiliary_loss_mlp": 0.01041874, + "balance_loss_clip": 1.05274761, + "balance_loss_mlp": 1.03237927, + "epoch": 0.20862141525882283, + "flos": 20077799558400.0, + "grad_norm": 2.4943101911584256, + "language_loss": 0.87609804, + "learning_rate": 3.674691651392283e-06, + "loss": 0.89836669, + "num_input_tokens_seen": 36858255, + "step": 1735, + "time_per_iteration": 2.508552312850952 + }, + { + "auxiliary_loss_clip": 0.01196207, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.06237578, + "balance_loss_mlp": 1.03188324, + "epoch": 0.2087416581494619, + "flos": 39015183237120.0, + "grad_norm": 1.9954158730399787, + "language_loss": 0.75231832, + "learning_rate": 3.674265681184053e-06, + "loss": 0.77469951, + "num_input_tokens_seen": 36881515, + "step": 1736, + "time_per_iteration": 2.678502082824707 + }, + { + "auxiliary_loss_clip": 0.01190416, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.05690384, + "balance_loss_mlp": 1.02391219, + "epoch": 0.208861901040101, + "flos": 26101994169600.0, + "grad_norm": 2.071340483796828, + "language_loss": 0.86221373, + "learning_rate": 3.6738394569910504e-06, + "loss": 0.88445246, + "num_input_tokens_seen": 36902055, + "step": 1737, + "time_per_iteration": 2.644317626953125 + }, + { + "auxiliary_loss_clip": 0.01205706, + "auxiliary_loss_mlp": 0.01033216, + "balance_loss_clip": 1.06283307, + "balance_loss_mlp": 1.02399516, + "epoch": 0.2089821439307401, + "flos": 28398732675840.0, + "grad_norm": 2.189001585193644, + "language_loss": 0.82425976, + "learning_rate": 3.6734129788779333e-06, + "loss": 0.84664893, + "num_input_tokens_seen": 36921230, + "step": 1738, + "time_per_iteration": 2.615220546722412 + }, + { + "auxiliary_loss_clip": 0.01173981, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.0598104, + "balance_loss_mlp": 1.02371073, + "epoch": 0.2091023868213792, + "flos": 21069616872960.0, + "grad_norm": 1.7539428932507333, + "language_loss": 0.9030112, + "learning_rate": 3.6729862469093976e-06, + "loss": 0.92508852, + "num_input_tokens_seen": 36940325, + "step": 1739, + "time_per_iteration": 2.6180498600006104 + }, + { + "auxiliary_loss_clip": 0.0117612, + "auxiliary_loss_mlp": 0.0103972, + "balance_loss_clip": 1.05541706, + "balance_loss_mlp": 1.02966428, + "epoch": 0.20922262971201827, + "flos": 22455481363200.0, + "grad_norm": 2.4155502016147303, + "language_loss": 0.82412404, + "learning_rate": 3.6725592611501782e-06, + "loss": 0.84628242, + "num_input_tokens_seen": 36959000, + "step": 1740, + "time_per_iteration": 2.5325069427490234 + }, + { + "auxiliary_loss_clip": 0.0120283, + "auxiliary_loss_mlp": 0.0103729, + "balance_loss_clip": 1.0587616, + "balance_loss_mlp": 1.02738929, + "epoch": 0.20934287260265738, + "flos": 27852244179840.0, + "grad_norm": 1.8316701638286523, + "language_loss": 0.76185668, + "learning_rate": 3.6721320216650496e-06, + "loss": 0.78425789, + "num_input_tokens_seen": 36979615, + "step": 1741, + "time_per_iteration": 2.5563549995422363 + }, + { + "auxiliary_loss_clip": 0.01188272, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.05753088, + "balance_loss_mlp": 1.02843392, + "epoch": 0.20946311549329646, + "flos": 16435309075200.0, + "grad_norm": 1.9231041705983172, + "language_loss": 0.83529288, + "learning_rate": 3.6717045285188215e-06, + "loss": 0.85756421, + "num_input_tokens_seen": 36997310, + "step": 1742, + "time_per_iteration": 2.502814769744873 + }, + { + "auxiliary_loss_clip": 0.01143168, + "auxiliary_loss_mlp": 0.01036184, + "balance_loss_clip": 1.0495007, + "balance_loss_mlp": 1.02634335, + "epoch": 0.20958335838393555, + "flos": 22492720788480.0, + "grad_norm": 1.9983205811816591, + "language_loss": 0.86964536, + "learning_rate": 3.671276781776346e-06, + "loss": 0.89143896, + "num_input_tokens_seen": 37015965, + "step": 1743, + "time_per_iteration": 2.6087863445281982 + }, + { + "auxiliary_loss_clip": 0.01179946, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.05289435, + "balance_loss_mlp": 1.02470255, + "epoch": 0.20970360127457463, + "flos": 25224768218880.0, + "grad_norm": 4.636318189766017, + "language_loss": 0.67245233, + "learning_rate": 3.6708487815025128e-06, + "loss": 0.69459546, + "num_input_tokens_seen": 37036545, + "step": 1744, + "time_per_iteration": 2.606788158416748 + }, + { + "auxiliary_loss_clip": 0.01173951, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.05473471, + "balance_loss_mlp": 1.022174, + "epoch": 0.20982384416521374, + "flos": 18479164855680.0, + "grad_norm": 8.104418439448871, + "language_loss": 0.74489242, + "learning_rate": 3.6704205277622463e-06, + "loss": 0.76695383, + "num_input_tokens_seen": 37054985, + "step": 1745, + "time_per_iteration": 2.558940887451172 + }, + { + "auxiliary_loss_clip": 0.01191666, + "auxiliary_loss_mlp": 0.01035717, + "balance_loss_clip": 1.05628467, + "balance_loss_mlp": 1.02590573, + "epoch": 0.20994408705585282, + "flos": 25373546352000.0, + "grad_norm": 1.671071233469433, + "language_loss": 0.80357838, + "learning_rate": 3.6699920206205146e-06, + "loss": 0.82585222, + "num_input_tokens_seen": 37075725, + "step": 1746, + "time_per_iteration": 2.588385581970215 + }, + { + "auxiliary_loss_clip": 0.01205018, + "auxiliary_loss_mlp": 0.01037125, + "balance_loss_clip": 1.05877066, + "balance_loss_mlp": 1.02742124, + "epoch": 0.2100643299464919, + "flos": 21320955313920.0, + "grad_norm": 1.6485288496874673, + "language_loss": 0.81882024, + "learning_rate": 3.669563260142321e-06, + "loss": 0.84124166, + "num_input_tokens_seen": 37094615, + "step": 1747, + "time_per_iteration": 2.5582385063171387 + }, + { + "auxiliary_loss_clip": 0.01186037, + "auxiliary_loss_mlp": 0.01037207, + "balance_loss_clip": 1.06020606, + "balance_loss_mlp": 1.02752161, + "epoch": 0.21018457283713102, + "flos": 19354379644800.0, + "grad_norm": 2.0054074375586812, + "language_loss": 0.84237683, + "learning_rate": 3.6691342463927083e-06, + "loss": 0.8646093, + "num_input_tokens_seen": 37113610, + "step": 1748, + "time_per_iteration": 2.528087854385376 + }, + { + "auxiliary_loss_clip": 0.01175444, + "auxiliary_loss_mlp": 0.01040289, + "balance_loss_clip": 1.05503464, + "balance_loss_mlp": 1.03029907, + "epoch": 0.2103048157277701, + "flos": 28330035914880.0, + "grad_norm": 1.6814907661290803, + "language_loss": 0.82041872, + "learning_rate": 3.668704979436758e-06, + "loss": 0.84257603, + "num_input_tokens_seen": 37133705, + "step": 1749, + "time_per_iteration": 2.6245553493499756 + }, + { + "auxiliary_loss_clip": 0.01180327, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.05369627, + "balance_loss_mlp": 1.02499485, + "epoch": 0.21042505861840918, + "flos": 17457290835840.0, + "grad_norm": 2.068124186346052, + "language_loss": 0.7867893, + "learning_rate": 3.668275459339588e-06, + "loss": 0.80893862, + "num_input_tokens_seen": 37152185, + "step": 1750, + "time_per_iteration": 3.34818172454834 + }, + { + "auxiliary_loss_clip": 0.01219372, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.06229424, + "balance_loss_mlp": 1.02400994, + "epoch": 0.21054530150904827, + "flos": 14209817195520.0, + "grad_norm": 1.8787274333089217, + "language_loss": 0.80324841, + "learning_rate": 3.667845686166358e-06, + "loss": 0.82578981, + "num_input_tokens_seen": 37169110, + "step": 1751, + "time_per_iteration": 2.430054187774658 + }, + { + "auxiliary_loss_clip": 0.01155963, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.05082965, + "balance_loss_mlp": 1.02837718, + "epoch": 0.21066554439968738, + "flos": 18618210403200.0, + "grad_norm": 1.6648805629172367, + "language_loss": 0.85941154, + "learning_rate": 3.6674156599822634e-06, + "loss": 0.88135684, + "num_input_tokens_seen": 37184905, + "step": 1752, + "time_per_iteration": 2.5319015979766846 + }, + { + "auxiliary_loss_clip": 0.01159072, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.04906416, + "balance_loss_mlp": 1.0273869, + "epoch": 0.21078578729032646, + "flos": 23658883741440.0, + "grad_norm": 1.8991536126968007, + "language_loss": 0.81465602, + "learning_rate": 3.666985380852539e-06, + "loss": 0.83662796, + "num_input_tokens_seen": 37203910, + "step": 1753, + "time_per_iteration": 2.6158387660980225 + }, + { + "auxiliary_loss_clip": 0.01187303, + "auxiliary_loss_mlp": 0.01031308, + "balance_loss_clip": 1.05815184, + "balance_loss_mlp": 1.02097869, + "epoch": 0.21090603018096554, + "flos": 29346379240320.0, + "grad_norm": 2.139212954929709, + "language_loss": 0.75058281, + "learning_rate": 3.6665548488424576e-06, + "loss": 0.77276897, + "num_input_tokens_seen": 37222670, + "step": 1754, + "time_per_iteration": 4.196468114852905 + }, + { + "auxiliary_loss_clip": 0.01219157, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.06037116, + "balance_loss_mlp": 1.02882075, + "epoch": 0.21102627307160465, + "flos": 23261245205760.0, + "grad_norm": 1.7396422762218215, + "language_loss": 0.88026595, + "learning_rate": 3.6661240640173307e-06, + "loss": 0.90285403, + "num_input_tokens_seen": 37244140, + "step": 1755, + "time_per_iteration": 3.238462209701538 + }, + { + "auxiliary_loss_clip": 0.01076608, + "auxiliary_loss_mlp": 0.01007191, + "balance_loss_clip": 1.0192678, + "balance_loss_mlp": 1.00499749, + "epoch": 0.21114651596224374, + "flos": 54633454577280.0, + "grad_norm": 0.8546901244353934, + "language_loss": 0.57901704, + "learning_rate": 3.6656930264425085e-06, + "loss": 0.59985501, + "num_input_tokens_seen": 37308185, + "step": 1756, + "time_per_iteration": 3.2091543674468994 + }, + { + "auxiliary_loss_clip": 0.01215166, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.05863786, + "balance_loss_mlp": 1.02581525, + "epoch": 0.21126675885288282, + "flos": 21543314457600.0, + "grad_norm": 2.4544581398802245, + "language_loss": 0.75469553, + "learning_rate": 3.665261736183378e-06, + "loss": 0.77720797, + "num_input_tokens_seen": 37328220, + "step": 1757, + "time_per_iteration": 2.4976959228515625 + }, + { + "auxiliary_loss_clip": 0.01175456, + "auxiliary_loss_mlp": 0.01030972, + "balance_loss_clip": 1.05787849, + "balance_loss_mlp": 1.02055907, + "epoch": 0.2113870017435219, + "flos": 10961876678400.0, + "grad_norm": 2.281768584694597, + "language_loss": 0.88217473, + "learning_rate": 3.664830193305366e-06, + "loss": 0.90423906, + "num_input_tokens_seen": 37345995, + "step": 1758, + "time_per_iteration": 2.5891008377075195 + }, + { + "auxiliary_loss_clip": 0.01166758, + "auxiliary_loss_mlp": 0.01036923, + "balance_loss_clip": 1.05004537, + "balance_loss_mlp": 1.02642059, + "epoch": 0.211507244634161, + "flos": 16653825463680.0, + "grad_norm": 2.774646593724795, + "language_loss": 0.77170503, + "learning_rate": 3.6643983978739373e-06, + "loss": 0.79374188, + "num_input_tokens_seen": 37362610, + "step": 1759, + "time_per_iteration": 2.570767879486084 + }, + { + "auxiliary_loss_clip": 0.01183075, + "auxiliary_loss_mlp": 0.01035468, + "balance_loss_clip": 1.05819106, + "balance_loss_mlp": 1.02491748, + "epoch": 0.2116274875248001, + "flos": 20954091755520.0, + "grad_norm": 2.1476045392700773, + "language_loss": 0.82120943, + "learning_rate": 3.663966349954596e-06, + "loss": 0.84339488, + "num_input_tokens_seen": 37382790, + "step": 1760, + "time_per_iteration": 2.5672285556793213 + }, + { + "auxiliary_loss_clip": 0.01100446, + "auxiliary_loss_mlp": 0.01005771, + "balance_loss_clip": 1.01895201, + "balance_loss_mlp": 1.00342262, + "epoch": 0.21174773041543918, + "flos": 68196949424640.0, + "grad_norm": 0.7811040743363402, + "language_loss": 0.59691012, + "learning_rate": 3.6635340496128816e-06, + "loss": 0.61797225, + "num_input_tokens_seen": 37439720, + "step": 1761, + "time_per_iteration": 2.988642692565918 + }, + { + "auxiliary_loss_clip": 0.0115538, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.05208027, + "balance_loss_mlp": 1.02583075, + "epoch": 0.2118679733060783, + "flos": 20668315150080.0, + "grad_norm": 1.6579955395392258, + "language_loss": 0.92607635, + "learning_rate": 3.6631014969143747e-06, + "loss": 0.94798601, + "num_input_tokens_seen": 37459410, + "step": 1762, + "time_per_iteration": 2.5748016834259033 + }, + { + "auxiliary_loss_clip": 0.0120588, + "auxiliary_loss_mlp": 0.01041956, + "balance_loss_clip": 1.06279731, + "balance_loss_mlp": 1.03213322, + "epoch": 0.21198821619671737, + "flos": 23223431162880.0, + "grad_norm": 1.7794120678152738, + "language_loss": 0.89092028, + "learning_rate": 3.662668691924693e-06, + "loss": 0.91339868, + "num_input_tokens_seen": 37480460, + "step": 1763, + "time_per_iteration": 2.5157113075256348 + }, + { + "auxiliary_loss_clip": 0.01172877, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.05452085, + "balance_loss_mlp": 1.03013563, + "epoch": 0.21210845908735645, + "flos": 24498547044480.0, + "grad_norm": 2.187414431867107, + "language_loss": 0.71556115, + "learning_rate": 3.6622356347094927e-06, + "loss": 0.73769832, + "num_input_tokens_seen": 37502025, + "step": 1764, + "time_per_iteration": 2.5959346294403076 + }, + { + "auxiliary_loss_clip": 0.01174387, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.05212331, + "balance_loss_mlp": 1.0274204, + "epoch": 0.21222870197799554, + "flos": 27089789160960.0, + "grad_norm": 2.063906721339266, + "language_loss": 0.78484344, + "learning_rate": 3.6618023253344684e-06, + "loss": 0.80697614, + "num_input_tokens_seen": 37520885, + "step": 1765, + "time_per_iteration": 2.5991370677948 + }, + { + "auxiliary_loss_clip": 0.01202301, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.05738282, + "balance_loss_mlp": 1.03215885, + "epoch": 0.21234894486863465, + "flos": 16873850223360.0, + "grad_norm": 1.664994720787353, + "language_loss": 0.83214998, + "learning_rate": 3.6613687638653527e-06, + "loss": 0.85460174, + "num_input_tokens_seen": 37539055, + "step": 1766, + "time_per_iteration": 2.4921741485595703 + }, + { + "auxiliary_loss_clip": 0.01183622, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.05584919, + "balance_loss_mlp": 1.02506125, + "epoch": 0.21246918775927373, + "flos": 23474949171840.0, + "grad_norm": 2.8974155351539257, + "language_loss": 0.77964783, + "learning_rate": 3.660934950367916e-06, + "loss": 0.80184174, + "num_input_tokens_seen": 37558300, + "step": 1767, + "time_per_iteration": 2.562304973602295 + }, + { + "auxiliary_loss_clip": 0.01207431, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.06167364, + "balance_loss_mlp": 1.02584016, + "epoch": 0.21258943064991281, + "flos": 22382295402240.0, + "grad_norm": 1.5736729550180315, + "language_loss": 0.83550304, + "learning_rate": 3.660500884907968e-06, + "loss": 0.85793978, + "num_input_tokens_seen": 37579040, + "step": 1768, + "time_per_iteration": 2.5063905715942383 + }, + { + "auxiliary_loss_clip": 0.01058987, + "auxiliary_loss_mlp": 0.01001718, + "balance_loss_clip": 1.01370049, + "balance_loss_mlp": 0.99941701, + "epoch": 0.21270967354055192, + "flos": 59440168679040.0, + "grad_norm": 0.8438993545463649, + "language_loss": 0.60075164, + "learning_rate": 3.660066567551356e-06, + "loss": 0.62135875, + "num_input_tokens_seen": 37639185, + "step": 1769, + "time_per_iteration": 3.0519490242004395 + }, + { + "auxiliary_loss_clip": 0.01201459, + "auxiliary_loss_mlp": 0.00765302, + "balance_loss_clip": 1.05752373, + "balance_loss_mlp": 1.0011133, + "epoch": 0.212829916431191, + "flos": 21544032729600.0, + "grad_norm": 2.7423184607926983, + "language_loss": 0.83769804, + "learning_rate": 3.6596319983639657e-06, + "loss": 0.85736561, + "num_input_tokens_seen": 37657765, + "step": 1770, + "time_per_iteration": 2.5383799076080322 + }, + { + "auxiliary_loss_clip": 0.01174421, + "auxiliary_loss_mlp": 0.00765856, + "balance_loss_clip": 1.05698252, + "balance_loss_mlp": 1.00108314, + "epoch": 0.2129501593218301, + "flos": 28987739896320.0, + "grad_norm": 1.5848317482489112, + "language_loss": 0.85993075, + "learning_rate": 3.6591971774117214e-06, + "loss": 0.87933362, + "num_input_tokens_seen": 37680740, + "step": 1771, + "time_per_iteration": 2.6238181591033936 + }, + { + "auxiliary_loss_clip": 0.01209753, + "auxiliary_loss_mlp": 0.01041246, + "balance_loss_clip": 1.06193614, + "balance_loss_mlp": 1.0310061, + "epoch": 0.2130704022124692, + "flos": 18806993308800.0, + "grad_norm": 2.074165693386947, + "language_loss": 0.80624902, + "learning_rate": 3.6587621047605833e-06, + "loss": 0.82875907, + "num_input_tokens_seen": 37697910, + "step": 1772, + "time_per_iteration": 2.4761242866516113 + }, + { + "auxiliary_loss_clip": 0.01204999, + "auxiliary_loss_mlp": 0.01038022, + "balance_loss_clip": 1.06071007, + "balance_loss_mlp": 1.0282234, + "epoch": 0.21319064510310828, + "flos": 13918150759680.0, + "grad_norm": 1.9720010397412113, + "language_loss": 0.86325836, + "learning_rate": 3.6583267804765542e-06, + "loss": 0.8856886, + "num_input_tokens_seen": 37712245, + "step": 1773, + "time_per_iteration": 2.461419105529785 + }, + { + "auxiliary_loss_clip": 0.01201666, + "auxiliary_loss_mlp": 0.01040321, + "balance_loss_clip": 1.05790257, + "balance_loss_mlp": 1.02893627, + "epoch": 0.21331088799374737, + "flos": 20959694277120.0, + "grad_norm": 1.710898333457911, + "language_loss": 0.85504293, + "learning_rate": 3.6578912046256702e-06, + "loss": 0.8774628, + "num_input_tokens_seen": 37730765, + "step": 1774, + "time_per_iteration": 2.508920907974243 + }, + { + "auxiliary_loss_clip": 0.01168227, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.05054498, + "balance_loss_mlp": 1.02338767, + "epoch": 0.21343113088438645, + "flos": 18624638937600.0, + "grad_norm": 2.0638691057770844, + "language_loss": 0.76206177, + "learning_rate": 3.6574553772740083e-06, + "loss": 0.78409165, + "num_input_tokens_seen": 37748695, + "step": 1775, + "time_per_iteration": 2.516122817993164 + }, + { + "auxiliary_loss_clip": 0.01107763, + "auxiliary_loss_mlp": 0.01001154, + "balance_loss_clip": 1.03862703, + "balance_loss_mlp": 0.99916327, + "epoch": 0.21355137377502556, + "flos": 67413128791680.0, + "grad_norm": 0.8722536024325206, + "language_loss": 0.61889637, + "learning_rate": 3.657019298487684e-06, + "loss": 0.63998556, + "num_input_tokens_seen": 37813705, + "step": 1776, + "time_per_iteration": 3.8236348628997803 + }, + { + "auxiliary_loss_clip": 0.01212964, + "auxiliary_loss_mlp": 0.0076569, + "balance_loss_clip": 1.06066751, + "balance_loss_mlp": 1.00110543, + "epoch": 0.21367161666566464, + "flos": 34532095697280.0, + "grad_norm": 1.7077912160169804, + "language_loss": 0.83573717, + "learning_rate": 3.6565829683328495e-06, + "loss": 0.85552382, + "num_input_tokens_seen": 37836330, + "step": 1777, + "time_per_iteration": 2.6448287963867188 + }, + { + "auxiliary_loss_clip": 0.01198717, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.05855, + "balance_loss_mlp": 1.02369869, + "epoch": 0.21379185955630373, + "flos": 18989347680000.0, + "grad_norm": 1.852296688577234, + "language_loss": 0.86193299, + "learning_rate": 3.6561463868756965e-06, + "loss": 0.88426232, + "num_input_tokens_seen": 37855030, + "step": 1778, + "time_per_iteration": 2.46518611907959 + }, + { + "auxiliary_loss_clip": 0.01203951, + "auxiliary_loss_mlp": 0.01038064, + "balance_loss_clip": 1.06184244, + "balance_loss_mlp": 1.02742434, + "epoch": 0.21391210244694284, + "flos": 28218497207040.0, + "grad_norm": 1.483417258018165, + "language_loss": 0.77868044, + "learning_rate": 3.655709554182452e-06, + "loss": 0.80110061, + "num_input_tokens_seen": 37875370, + "step": 1779, + "time_per_iteration": 2.537999391555786 + }, + { + "auxiliary_loss_clip": 0.01207098, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.05820894, + "balance_loss_mlp": 1.02364731, + "epoch": 0.21403234533758192, + "flos": 17455064192640.0, + "grad_norm": 1.8500755527066437, + "language_loss": 0.84480441, + "learning_rate": 3.6552724703193855e-06, + "loss": 0.86721003, + "num_input_tokens_seen": 37892560, + "step": 1780, + "time_per_iteration": 2.461216688156128 + }, + { + "auxiliary_loss_clip": 0.01059056, + "auxiliary_loss_mlp": 0.01003368, + "balance_loss_clip": 1.01516616, + "balance_loss_mlp": 1.00094855, + "epoch": 0.214152588228221, + "flos": 51637606686720.0, + "grad_norm": 0.7908191173182713, + "language_loss": 0.55936277, + "learning_rate": 3.654835135352801e-06, + "loss": 0.57998693, + "num_input_tokens_seen": 37947370, + "step": 1781, + "time_per_iteration": 3.8263978958129883 + }, + { + "auxiliary_loss_clip": 0.01157157, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.04984236, + "balance_loss_mlp": 1.02193689, + "epoch": 0.21427283111886009, + "flos": 19496154625920.0, + "grad_norm": 2.0655693221995866, + "language_loss": 0.87716067, + "learning_rate": 3.654397549349043e-06, + "loss": 0.89905286, + "num_input_tokens_seen": 37964745, + "step": 1782, + "time_per_iteration": 3.3496203422546387 + }, + { + "auxiliary_loss_clip": 0.01187061, + "auxiliary_loss_mlp": 0.01036537, + "balance_loss_clip": 1.05955005, + "balance_loss_mlp": 1.02590919, + "epoch": 0.2143930740094992, + "flos": 20084802710400.0, + "grad_norm": 2.3798017994759006, + "language_loss": 0.75284624, + "learning_rate": 3.653959712374491e-06, + "loss": 0.77508223, + "num_input_tokens_seen": 37982850, + "step": 1783, + "time_per_iteration": 2.5424094200134277 + }, + { + "auxiliary_loss_clip": 0.01167947, + "auxiliary_loss_mlp": 0.0102798, + "balance_loss_clip": 1.05866194, + "balance_loss_mlp": 1.01832449, + "epoch": 0.21451331690013828, + "flos": 21798603394560.0, + "grad_norm": 1.7839342605112742, + "language_loss": 0.82702243, + "learning_rate": 3.6535216244955663e-06, + "loss": 0.84898174, + "num_input_tokens_seen": 38002745, + "step": 1784, + "time_per_iteration": 2.616001844406128 + }, + { + "auxiliary_loss_clip": 0.01185823, + "auxiliary_loss_mlp": 0.01040063, + "balance_loss_clip": 1.05834746, + "balance_loss_mlp": 1.02985275, + "epoch": 0.21463355979077736, + "flos": 32853882412800.0, + "grad_norm": 1.648655177620762, + "language_loss": 0.70993805, + "learning_rate": 3.653083285778726e-06, + "loss": 0.73219693, + "num_input_tokens_seen": 38024115, + "step": 1785, + "time_per_iteration": 2.723299503326416 + }, + { + "auxiliary_loss_clip": 0.01207049, + "auxiliary_loss_mlp": 0.01034721, + "balance_loss_clip": 1.05960059, + "balance_loss_mlp": 1.02338409, + "epoch": 0.21475380268141647, + "flos": 21543817248000.0, + "grad_norm": 2.1444343027378094, + "language_loss": 0.81313586, + "learning_rate": 3.6526446962904653e-06, + "loss": 0.83555359, + "num_input_tokens_seen": 38042830, + "step": 1786, + "time_per_iteration": 2.5556912422180176 + }, + { + "auxiliary_loss_clip": 0.01200092, + "auxiliary_loss_mlp": 0.01043859, + "balance_loss_clip": 1.06097913, + "balance_loss_mlp": 1.03391087, + "epoch": 0.21487404557205556, + "flos": 32159082660480.0, + "grad_norm": 1.5433742486065325, + "language_loss": 0.74362183, + "learning_rate": 3.652205856097318e-06, + "loss": 0.76606131, + "num_input_tokens_seen": 38066015, + "step": 1787, + "time_per_iteration": 2.586217164993286 + }, + { + "auxiliary_loss_clip": 0.01181993, + "auxiliary_loss_mlp": 0.00764718, + "balance_loss_clip": 1.05684829, + "balance_loss_mlp": 1.00089228, + "epoch": 0.21499428846269464, + "flos": 12673091583360.0, + "grad_norm": 3.8995659881645057, + "language_loss": 0.79247397, + "learning_rate": 3.651766765265856e-06, + "loss": 0.81194109, + "num_input_tokens_seen": 38083025, + "step": 1788, + "time_per_iteration": 2.590222120285034 + }, + { + "auxiliary_loss_clip": 0.01182073, + "auxiliary_loss_mlp": 0.01027047, + "balance_loss_clip": 1.05474353, + "balance_loss_mlp": 1.01723599, + "epoch": 0.21511453135333372, + "flos": 23471573293440.0, + "grad_norm": 2.642081424656014, + "language_loss": 0.80542582, + "learning_rate": 3.65132742386269e-06, + "loss": 0.82751703, + "num_input_tokens_seen": 38098245, + "step": 1789, + "time_per_iteration": 2.531926393508911 + }, + { + "auxiliary_loss_clip": 0.01216042, + "auxiliary_loss_mlp": 0.01033584, + "balance_loss_clip": 1.05847883, + "balance_loss_mlp": 1.02267623, + "epoch": 0.21523477424397283, + "flos": 26943560893440.0, + "grad_norm": 1.9741940388429897, + "language_loss": 0.84934831, + "learning_rate": 3.6508878319544656e-06, + "loss": 0.87184459, + "num_input_tokens_seen": 38118460, + "step": 1790, + "time_per_iteration": 2.5033371448516846 + }, + { + "auxiliary_loss_clip": 0.01178635, + "auxiliary_loss_mlp": 0.01045226, + "balance_loss_clip": 1.05864036, + "balance_loss_mlp": 1.03501594, + "epoch": 0.21535501713461191, + "flos": 18916161719040.0, + "grad_norm": 2.846708173622423, + "language_loss": 0.82010996, + "learning_rate": 3.65044798960787e-06, + "loss": 0.84234858, + "num_input_tokens_seen": 38136800, + "step": 1791, + "time_per_iteration": 2.5569820404052734 + }, + { + "auxiliary_loss_clip": 0.01165481, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.05272448, + "balance_loss_mlp": 1.02221406, + "epoch": 0.215475260025251, + "flos": 17895113712000.0, + "grad_norm": 1.8329032986734437, + "language_loss": 0.78097236, + "learning_rate": 3.650007896889627e-06, + "loss": 0.80294871, + "num_input_tokens_seen": 38155380, + "step": 1792, + "time_per_iteration": 2.5419857501983643 + }, + { + "auxiliary_loss_clip": 0.01216315, + "auxiliary_loss_mlp": 0.01038183, + "balance_loss_clip": 1.06194603, + "balance_loss_mlp": 1.02844405, + "epoch": 0.2155955029158901, + "flos": 16654292340480.0, + "grad_norm": 1.7474537795833884, + "language_loss": 0.80567425, + "learning_rate": 3.6495675538664974e-06, + "loss": 0.82821923, + "num_input_tokens_seen": 38174395, + "step": 1793, + "time_per_iteration": 2.487579584121704 + }, + { + "auxiliary_loss_clip": 0.01187813, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.05529475, + "balance_loss_mlp": 1.02104998, + "epoch": 0.2157157458065292, + "flos": 23621213352960.0, + "grad_norm": 1.7180689312043975, + "language_loss": 0.82143152, + "learning_rate": 3.649126960605282e-06, + "loss": 0.84361732, + "num_input_tokens_seen": 38195380, + "step": 1794, + "time_per_iteration": 2.565218448638916 + }, + { + "auxiliary_loss_clip": 0.01185441, + "auxiliary_loss_mlp": 0.0103371, + "balance_loss_clip": 1.05882859, + "balance_loss_mlp": 1.02380395, + "epoch": 0.21583598869716827, + "flos": 22127078292480.0, + "grad_norm": 2.3562701721431702, + "language_loss": 0.83375001, + "learning_rate": 3.6486861171728174e-06, + "loss": 0.85594147, + "num_input_tokens_seen": 38213775, + "step": 1795, + "time_per_iteration": 2.5263679027557373 + }, + { + "auxiliary_loss_clip": 0.01171223, + "auxiliary_loss_mlp": 0.01034737, + "balance_loss_clip": 1.05192327, + "balance_loss_mlp": 1.0243423, + "epoch": 0.21595623158780738, + "flos": 23441229279360.0, + "grad_norm": 1.691206418349394, + "language_loss": 0.78355324, + "learning_rate": 3.6482450236359803e-06, + "loss": 0.80561286, + "num_input_tokens_seen": 38235630, + "step": 1796, + "time_per_iteration": 2.5839931964874268 + }, + { + "auxiliary_loss_clip": 0.01202697, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.06217933, + "balance_loss_mlp": 1.03127062, + "epoch": 0.21607647447844647, + "flos": 26906501036160.0, + "grad_norm": 2.1416445418610848, + "language_loss": 0.77802575, + "learning_rate": 3.647803680061683e-06, + "loss": 0.80046058, + "num_input_tokens_seen": 38256045, + "step": 1797, + "time_per_iteration": 2.5180416107177734 + }, + { + "auxiliary_loss_clip": 0.01190031, + "auxiliary_loss_mlp": 0.01040488, + "balance_loss_clip": 1.06007659, + "balance_loss_mlp": 1.02930057, + "epoch": 0.21619671736908555, + "flos": 14495378319360.0, + "grad_norm": 2.4146091759028705, + "language_loss": 0.74407518, + "learning_rate": 3.6473620865168776e-06, + "loss": 0.76638031, + "num_input_tokens_seen": 38272915, + "step": 1798, + "time_per_iteration": 2.492906332015991 + }, + { + "auxiliary_loss_clip": 0.01187323, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.06073415, + "balance_loss_mlp": 1.0231595, + "epoch": 0.21631696025972463, + "flos": 17931096161280.0, + "grad_norm": 1.9738781910032082, + "language_loss": 0.81652725, + "learning_rate": 3.646920243068554e-06, + "loss": 0.83872402, + "num_input_tokens_seen": 38290810, + "step": 1799, + "time_per_iteration": 2.498215436935425 + }, + { + "auxiliary_loss_clip": 0.01172019, + "auxiliary_loss_mlp": 0.01034574, + "balance_loss_clip": 1.05429852, + "balance_loss_mlp": 1.02504957, + "epoch": 0.21643720315036374, + "flos": 24462385027200.0, + "grad_norm": 1.6790863393494222, + "language_loss": 0.74248195, + "learning_rate": 3.6464781497837384e-06, + "loss": 0.76454794, + "num_input_tokens_seen": 38312785, + "step": 1800, + "time_per_iteration": 2.5372402667999268 + }, + { + "auxiliary_loss_clip": 0.01187722, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_clip": 1.05460739, + "balance_loss_mlp": 1.03403044, + "epoch": 0.21655744604100283, + "flos": 28474432588800.0, + "grad_norm": 1.7037833354723768, + "language_loss": 0.7259903, + "learning_rate": 3.6460358067294965e-06, + "loss": 0.74830478, + "num_input_tokens_seen": 38334015, + "step": 1801, + "time_per_iteration": 2.618553876876831 + }, + { + "auxiliary_loss_clip": 0.01220525, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.06080914, + "balance_loss_mlp": 1.02336812, + "epoch": 0.2166776889316419, + "flos": 20152960767360.0, + "grad_norm": 2.012485173843271, + "language_loss": 0.77699286, + "learning_rate": 3.645593213972932e-06, + "loss": 0.79953694, + "num_input_tokens_seen": 38352920, + "step": 1802, + "time_per_iteration": 2.467527389526367 + }, + { + "auxiliary_loss_clip": 0.0119873, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.05909491, + "balance_loss_mlp": 1.0225122, + "epoch": 0.21679793182228102, + "flos": 15193482122880.0, + "grad_norm": 1.9256072941466997, + "language_loss": 0.7948668, + "learning_rate": 3.6451503715811852e-06, + "loss": 0.81718779, + "num_input_tokens_seen": 38371230, + "step": 1803, + "time_per_iteration": 3.222223997116089 + }, + { + "auxiliary_loss_clip": 0.0118494, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.05994058, + "balance_loss_mlp": 1.02440584, + "epoch": 0.2169181747129201, + "flos": 17384464010880.0, + "grad_norm": 2.395264264968195, + "language_loss": 0.80002308, + "learning_rate": 3.6447072796214345e-06, + "loss": 0.82220227, + "num_input_tokens_seen": 38389795, + "step": 1804, + "time_per_iteration": 2.5083682537078857 + }, + { + "auxiliary_loss_clip": 0.01064523, + "auxiliary_loss_mlp": 0.01009565, + "balance_loss_clip": 1.02101469, + "balance_loss_mlp": 1.00678766, + "epoch": 0.21703841760355919, + "flos": 58760955429120.0, + "grad_norm": 1.117159575907689, + "language_loss": 0.63243127, + "learning_rate": 3.644263938160898e-06, + "loss": 0.65317214, + "num_input_tokens_seen": 38445760, + "step": 1805, + "time_per_iteration": 3.0620200634002686 + }, + { + "auxiliary_loss_clip": 0.01170399, + "auxiliary_loss_mlp": 0.01034184, + "balance_loss_clip": 1.05648947, + "balance_loss_mlp": 1.02348495, + "epoch": 0.21715866049419827, + "flos": 22418457419520.0, + "grad_norm": 1.946465305778629, + "language_loss": 0.71877748, + "learning_rate": 3.6438203472668293e-06, + "loss": 0.74082327, + "num_input_tokens_seen": 38465405, + "step": 1806, + "time_per_iteration": 2.580799102783203 + }, + { + "auxiliary_loss_clip": 0.01188831, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.05754566, + "balance_loss_mlp": 1.02395022, + "epoch": 0.21727890338483738, + "flos": 17237732952960.0, + "grad_norm": 1.8843780817744364, + "language_loss": 0.81982458, + "learning_rate": 3.6433765070065206e-06, + "loss": 0.84204853, + "num_input_tokens_seen": 38483195, + "step": 1807, + "time_per_iteration": 3.292038679122925 + }, + { + "auxiliary_loss_clip": 0.01217335, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.06033802, + "balance_loss_mlp": 1.02228546, + "epoch": 0.21739914627547646, + "flos": 13434792416640.0, + "grad_norm": 2.700600177239066, + "language_loss": 0.87568307, + "learning_rate": 3.6429324174473025e-06, + "loss": 0.89818227, + "num_input_tokens_seen": 38496735, + "step": 1808, + "time_per_iteration": 3.360882043838501 + }, + { + "auxiliary_loss_clip": 0.01201588, + "auxiliary_loss_mlp": 0.01035564, + "balance_loss_clip": 1.05637419, + "balance_loss_mlp": 1.02618814, + "epoch": 0.21751938916611555, + "flos": 20959514709120.0, + "grad_norm": 2.969346629168455, + "language_loss": 0.84814823, + "learning_rate": 3.6424880786565425e-06, + "loss": 0.87051976, + "num_input_tokens_seen": 38512880, + "step": 1809, + "time_per_iteration": 3.2917778491973877 + }, + { + "auxiliary_loss_clip": 0.01154425, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.05607057, + "balance_loss_mlp": 1.02394521, + "epoch": 0.21763963205675466, + "flos": 27599936071680.0, + "grad_norm": 2.235760317383145, + "language_loss": 0.79500246, + "learning_rate": 3.6420434907016482e-06, + "loss": 0.81689233, + "num_input_tokens_seen": 38532570, + "step": 1810, + "time_per_iteration": 2.634539842605591 + }, + { + "auxiliary_loss_clip": 0.01201977, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.0624423, + "balance_loss_mlp": 1.02386618, + "epoch": 0.21775987494739374, + "flos": 21430411032960.0, + "grad_norm": 1.6247681423022957, + "language_loss": 0.81171954, + "learning_rate": 3.6415986536500606e-06, + "loss": 0.83407336, + "num_input_tokens_seen": 38550900, + "step": 1811, + "time_per_iteration": 2.505587339401245 + }, + { + "auxiliary_loss_clip": 0.01152233, + "auxiliary_loss_mlp": 0.01038075, + "balance_loss_clip": 1.06065869, + "balance_loss_mlp": 1.02833605, + "epoch": 0.21788011783803282, + "flos": 18332972501760.0, + "grad_norm": 1.6438257371700515, + "language_loss": 0.80394453, + "learning_rate": 3.641153567569263e-06, + "loss": 0.82584763, + "num_input_tokens_seen": 38569215, + "step": 1812, + "time_per_iteration": 2.573927640914917 + }, + { + "auxiliary_loss_clip": 0.01195879, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.05825067, + "balance_loss_mlp": 1.01923418, + "epoch": 0.2180003607286719, + "flos": 30262748037120.0, + "grad_norm": 2.370773636203844, + "language_loss": 0.95491767, + "learning_rate": 3.640708232526774e-06, + "loss": 0.97716242, + "num_input_tokens_seen": 38587870, + "step": 1813, + "time_per_iteration": 2.5530569553375244 + }, + { + "auxiliary_loss_clip": 0.01132744, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.04423559, + "balance_loss_mlp": 1.02383649, + "epoch": 0.21812060361931102, + "flos": 25480272637440.0, + "grad_norm": 2.5162719061164167, + "language_loss": 0.7876687, + "learning_rate": 3.6402626485901504e-06, + "loss": 0.80933273, + "num_input_tokens_seen": 38606965, + "step": 1814, + "time_per_iteration": 2.645617961883545 + }, + { + "auxiliary_loss_clip": 0.01195403, + "auxiliary_loss_mlp": 0.01036473, + "balance_loss_clip": 1.05990887, + "balance_loss_mlp": 1.02728152, + "epoch": 0.2182408465099501, + "flos": 21908166854400.0, + "grad_norm": 1.9251416503944658, + "language_loss": 0.77744764, + "learning_rate": 3.639816815826988e-06, + "loss": 0.79976642, + "num_input_tokens_seen": 38626290, + "step": 1815, + "time_per_iteration": 2.4972410202026367 + }, + { + "auxiliary_loss_clip": 0.0118128, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.05679321, + "balance_loss_mlp": 1.02308869, + "epoch": 0.21836108940058918, + "flos": 23657339456640.0, + "grad_norm": 1.7517568957339267, + "language_loss": 0.78120685, + "learning_rate": 3.6393707343049176e-06, + "loss": 0.80334222, + "num_input_tokens_seen": 38646620, + "step": 1816, + "time_per_iteration": 2.583723306655884 + }, + { + "auxiliary_loss_clip": 0.01202195, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.05818737, + "balance_loss_mlp": 1.02083707, + "epoch": 0.2184813322912283, + "flos": 24681009156480.0, + "grad_norm": 2.5317956102595716, + "language_loss": 0.73357201, + "learning_rate": 3.6389244040916104e-06, + "loss": 0.7558983, + "num_input_tokens_seen": 38665695, + "step": 1817, + "time_per_iteration": 2.5284616947174072 + }, + { + "auxiliary_loss_clip": 0.01175107, + "auxiliary_loss_mlp": 0.0076494, + "balance_loss_clip": 1.05399323, + "balance_loss_mlp": 1.00076604, + "epoch": 0.21860157518186737, + "flos": 26574650259840.0, + "grad_norm": 2.239052692504332, + "language_loss": 0.79427141, + "learning_rate": 3.6384778252547747e-06, + "loss": 0.81367183, + "num_input_tokens_seen": 38681575, + "step": 1818, + "time_per_iteration": 2.5431759357452393 + }, + { + "auxiliary_loss_clip": 0.01179846, + "auxiliary_loss_mlp": 0.00764192, + "balance_loss_clip": 1.05840933, + "balance_loss_mlp": 1.00085306, + "epoch": 0.21872181807250646, + "flos": 20886292834560.0, + "grad_norm": 3.7505582803925552, + "language_loss": 0.78490138, + "learning_rate": 3.638030997862155e-06, + "loss": 0.80434173, + "num_input_tokens_seen": 38700510, + "step": 1819, + "time_per_iteration": 2.565150737762451 + }, + { + "auxiliary_loss_clip": 0.01081901, + "auxiliary_loss_mlp": 0.01005257, + "balance_loss_clip": 1.02092111, + "balance_loss_mlp": 1.00323033, + "epoch": 0.21884206096314554, + "flos": 61209452897280.0, + "grad_norm": 0.7626682045220754, + "language_loss": 0.59460229, + "learning_rate": 3.6375839219815356e-06, + "loss": 0.61547387, + "num_input_tokens_seen": 38758310, + "step": 1820, + "time_per_iteration": 3.0047004222869873 + }, + { + "auxiliary_loss_clip": 0.0121285, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.05938339, + "balance_loss_mlp": 1.02667391, + "epoch": 0.21896230385378465, + "flos": 23473835850240.0, + "grad_norm": 2.00494781610737, + "language_loss": 0.82982862, + "learning_rate": 3.6371365976807375e-06, + "loss": 0.85232234, + "num_input_tokens_seen": 38778705, + "step": 1821, + "time_per_iteration": 2.4754910469055176 + }, + { + "auxiliary_loss_clip": 0.0114731, + "auxiliary_loss_mlp": 0.01030397, + "balance_loss_clip": 1.05445838, + "balance_loss_mlp": 1.02099752, + "epoch": 0.21908254674442373, + "flos": 25081915829760.0, + "grad_norm": 2.381752265504817, + "language_loss": 0.83563364, + "learning_rate": 3.6366890250276185e-06, + "loss": 0.85741067, + "num_input_tokens_seen": 38799660, + "step": 1822, + "time_per_iteration": 2.61612606048584 + }, + { + "auxiliary_loss_clip": 0.01214042, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.06079197, + "balance_loss_mlp": 1.0215373, + "epoch": 0.21920278963506282, + "flos": 23513768795520.0, + "grad_norm": 2.0035772963165592, + "language_loss": 0.90199423, + "learning_rate": 3.6362412040900764e-06, + "loss": 0.92444587, + "num_input_tokens_seen": 38819450, + "step": 1823, + "time_per_iteration": 2.476703643798828 + }, + { + "auxiliary_loss_clip": 0.01199273, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.05635548, + "balance_loss_mlp": 1.02415848, + "epoch": 0.21932303252570193, + "flos": 29242238734080.0, + "grad_norm": 1.9609050521986686, + "language_loss": 0.81072438, + "learning_rate": 3.635793134936044e-06, + "loss": 0.83305407, + "num_input_tokens_seen": 38840460, + "step": 1824, + "time_per_iteration": 2.566561698913574 + }, + { + "auxiliary_loss_clip": 0.01195356, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.05831695, + "balance_loss_mlp": 1.0261538, + "epoch": 0.219443275416341, + "flos": 20806857907200.0, + "grad_norm": 1.6182071903449606, + "language_loss": 0.72935617, + "learning_rate": 3.635344817633494e-06, + "loss": 0.75166512, + "num_input_tokens_seen": 38859775, + "step": 1825, + "time_per_iteration": 2.53867244720459 + }, + { + "auxiliary_loss_clip": 0.01192413, + "auxiliary_loss_mlp": 0.0103212, + "balance_loss_clip": 1.05574012, + "balance_loss_mlp": 1.02262545, + "epoch": 0.2195635183069801, + "flos": 14501555458560.0, + "grad_norm": 2.976662189671193, + "language_loss": 0.75858116, + "learning_rate": 3.634896252250436e-06, + "loss": 0.78082657, + "num_input_tokens_seen": 38876540, + "step": 1826, + "time_per_iteration": 2.560014247894287 + }, + { + "auxiliary_loss_clip": 0.01214276, + "auxiliary_loss_mlp": 0.01040797, + "balance_loss_clip": 1.06018138, + "balance_loss_mlp": 1.03176117, + "epoch": 0.2196837611976192, + "flos": 24243473589120.0, + "grad_norm": 1.7433056330637973, + "language_loss": 0.82359648, + "learning_rate": 3.6344474388549157e-06, + "loss": 0.84614724, + "num_input_tokens_seen": 38896195, + "step": 1827, + "time_per_iteration": 2.5642004013061523 + }, + { + "auxiliary_loss_clip": 0.01202159, + "auxiliary_loss_mlp": 0.0103973, + "balance_loss_clip": 1.0615443, + "balance_loss_mlp": 1.02918053, + "epoch": 0.2198040040882583, + "flos": 18074523168000.0, + "grad_norm": 2.087856989119459, + "language_loss": 0.80338991, + "learning_rate": 3.6339983775150183e-06, + "loss": 0.82580876, + "num_input_tokens_seen": 38912755, + "step": 1828, + "time_per_iteration": 2.4862422943115234 + }, + { + "auxiliary_loss_clip": 0.01194625, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.05833197, + "balance_loss_mlp": 1.01859164, + "epoch": 0.21992424697889737, + "flos": 17784185535360.0, + "grad_norm": 2.730518147215089, + "language_loss": 0.84639311, + "learning_rate": 3.6335490682988664e-06, + "loss": 0.868626, + "num_input_tokens_seen": 38928365, + "step": 1829, + "time_per_iteration": 3.16715669631958 + }, + { + "auxiliary_loss_clip": 0.01128786, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.04760718, + "balance_loss_mlp": 1.02090514, + "epoch": 0.22004448986953645, + "flos": 17638495971840.0, + "grad_norm": 2.022877083723112, + "language_loss": 0.82804012, + "learning_rate": 3.63309951127462e-06, + "loss": 0.84962922, + "num_input_tokens_seen": 38945275, + "step": 1830, + "time_per_iteration": 2.583641529083252 + }, + { + "auxiliary_loss_clip": 0.01168499, + "auxiliary_loss_mlp": 0.01036991, + "balance_loss_clip": 1.05688262, + "balance_loss_mlp": 1.02685273, + "epoch": 0.22016473276017556, + "flos": 22275533203200.0, + "grad_norm": 2.0046618014262867, + "language_loss": 0.75382316, + "learning_rate": 3.6326497065104757e-06, + "loss": 0.77587813, + "num_input_tokens_seen": 38965740, + "step": 1831, + "time_per_iteration": 2.584730863571167 + }, + { + "auxiliary_loss_clip": 0.01203722, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.05944359, + "balance_loss_mlp": 1.02389312, + "epoch": 0.22028497565081465, + "flos": 25556259859200.0, + "grad_norm": 1.925091141759111, + "language_loss": 0.78081775, + "learning_rate": 3.6321996540746697e-06, + "loss": 0.80318558, + "num_input_tokens_seen": 38984815, + "step": 1832, + "time_per_iteration": 2.5513203144073486 + }, + { + "auxiliary_loss_clip": 0.01166381, + "auxiliary_loss_mlp": 0.01029338, + "balance_loss_clip": 1.0551405, + "balance_loss_mlp": 1.01982474, + "epoch": 0.22040521854145373, + "flos": 36247332925440.0, + "grad_norm": 1.9083345527971114, + "language_loss": 0.80450833, + "learning_rate": 3.6317493540354733e-06, + "loss": 0.82646549, + "num_input_tokens_seen": 39008230, + "step": 1833, + "time_per_iteration": 2.6791467666625977 + }, + { + "auxiliary_loss_clip": 0.01194903, + "auxiliary_loss_mlp": 0.01035399, + "balance_loss_clip": 1.05683494, + "balance_loss_mlp": 1.02612472, + "epoch": 0.22052546143209284, + "flos": 11838420270720.0, + "grad_norm": 2.059046035206854, + "language_loss": 0.76853907, + "learning_rate": 3.6312988064611976e-06, + "loss": 0.79084206, + "num_input_tokens_seen": 39026540, + "step": 1834, + "time_per_iteration": 3.275179386138916 + }, + { + "auxiliary_loss_clip": 0.0116843, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.05117965, + "balance_loss_mlp": 1.02353954, + "epoch": 0.22064570432273192, + "flos": 24209250906240.0, + "grad_norm": 1.6572727489599852, + "language_loss": 0.8115716, + "learning_rate": 3.6308480114201896e-06, + "loss": 0.83358693, + "num_input_tokens_seen": 39048460, + "step": 1835, + "time_per_iteration": 3.381585121154785 + }, + { + "auxiliary_loss_clip": 0.01214661, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.06240094, + "balance_loss_mlp": 1.02597463, + "epoch": 0.220765947213371, + "flos": 17931347556480.0, + "grad_norm": 1.6303187945765194, + "language_loss": 0.76449674, + "learning_rate": 3.630396968980835e-06, + "loss": 0.7869997, + "num_input_tokens_seen": 39066335, + "step": 1836, + "time_per_iteration": 3.226306915283203 + }, + { + "auxiliary_loss_clip": 0.0118379, + "auxiliary_loss_mlp": 0.01036839, + "balance_loss_clip": 1.05556369, + "balance_loss_mlp": 1.02720666, + "epoch": 0.2208861901040101, + "flos": 26757040544640.0, + "grad_norm": 2.34188841610962, + "language_loss": 0.83305025, + "learning_rate": 3.6299456792115575e-06, + "loss": 0.85525656, + "num_input_tokens_seen": 39087590, + "step": 1837, + "time_per_iteration": 2.579756021499634 + }, + { + "auxiliary_loss_clip": 0.01109395, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.04467535, + "balance_loss_mlp": 1.022681, + "epoch": 0.2210064329946492, + "flos": 17817977255040.0, + "grad_norm": 1.8787733227885366, + "language_loss": 0.81104648, + "learning_rate": 3.629494142180815e-06, + "loss": 0.832461, + "num_input_tokens_seen": 39106335, + "step": 1838, + "time_per_iteration": 2.6428868770599365 + }, + { + "auxiliary_loss_clip": 0.01211607, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.05945361, + "balance_loss_mlp": 1.02279878, + "epoch": 0.22112667588528828, + "flos": 17967401832960.0, + "grad_norm": 2.192141184488297, + "language_loss": 0.84665704, + "learning_rate": 3.6290423579571075e-06, + "loss": 0.86909837, + "num_input_tokens_seen": 39122875, + "step": 1839, + "time_per_iteration": 2.4318180084228516 + }, + { + "auxiliary_loss_clip": 0.01193168, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.05728269, + "balance_loss_mlp": 1.02330792, + "epoch": 0.22124691877592736, + "flos": 18369206346240.0, + "grad_norm": 1.5885215616178183, + "language_loss": 0.80233574, + "learning_rate": 3.6285903266089694e-06, + "loss": 0.82460046, + "num_input_tokens_seen": 39142150, + "step": 1840, + "time_per_iteration": 2.489739418029785 + }, + { + "auxiliary_loss_clip": 0.0118646, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.05894637, + "balance_loss_mlp": 1.01866376, + "epoch": 0.22136716166656648, + "flos": 20813286441600.0, + "grad_norm": 1.8301591364069565, + "language_loss": 0.77234381, + "learning_rate": 3.628138048204974e-06, + "loss": 0.79449379, + "num_input_tokens_seen": 39162835, + "step": 1841, + "time_per_iteration": 2.5371081829071045 + }, + { + "auxiliary_loss_clip": 0.01145703, + "auxiliary_loss_mlp": 0.0103393, + "balance_loss_clip": 1.05325007, + "balance_loss_mlp": 1.02289701, + "epoch": 0.22148740455720556, + "flos": 17675699483520.0, + "grad_norm": 1.8010919136893315, + "language_loss": 0.75790048, + "learning_rate": 3.6276855228137304e-06, + "loss": 0.77969682, + "num_input_tokens_seen": 39181040, + "step": 1842, + "time_per_iteration": 2.565692663192749 + }, + { + "auxiliary_loss_clip": 0.01213577, + "auxiliary_loss_mlp": 0.00764442, + "balance_loss_clip": 1.06047845, + "balance_loss_mlp": 1.00096917, + "epoch": 0.22160764744784464, + "flos": 21726710323200.0, + "grad_norm": 2.0494830063426823, + "language_loss": 0.81767738, + "learning_rate": 3.6272327505038874e-06, + "loss": 0.83745754, + "num_input_tokens_seen": 39197505, + "step": 1843, + "time_per_iteration": 2.4674606323242188 + }, + { + "auxiliary_loss_clip": 0.01156463, + "auxiliary_loss_mlp": 0.01035185, + "balance_loss_clip": 1.0518589, + "balance_loss_mlp": 1.0261606, + "epoch": 0.22172789033848372, + "flos": 23764712186880.0, + "grad_norm": 1.8202353510729208, + "language_loss": 0.78500032, + "learning_rate": 3.626779731344131e-06, + "loss": 0.80691683, + "num_input_tokens_seen": 39217295, + "step": 1844, + "time_per_iteration": 2.614363431930542 + }, + { + "auxiliary_loss_clip": 0.01206011, + "auxiliary_loss_mlp": 0.01035428, + "balance_loss_clip": 1.05712724, + "balance_loss_mlp": 1.02621329, + "epoch": 0.22184813322912283, + "flos": 16982300361600.0, + "grad_norm": 1.925291695313394, + "language_loss": 0.84797025, + "learning_rate": 3.6263264654031814e-06, + "loss": 0.87038469, + "num_input_tokens_seen": 39234195, + "step": 1845, + "time_per_iteration": 2.439263105392456 + }, + { + "auxiliary_loss_clip": 0.01068229, + "auxiliary_loss_mlp": 0.01004223, + "balance_loss_clip": 1.01810765, + "balance_loss_mlp": 1.00216031, + "epoch": 0.22196837611976192, + "flos": 61823740314240.0, + "grad_norm": 0.7002515154104747, + "language_loss": 0.5920229, + "learning_rate": 3.6258729527498008e-06, + "loss": 0.61274743, + "num_input_tokens_seen": 39295040, + "step": 1846, + "time_per_iteration": 3.1123878955841064 + }, + { + "auxiliary_loss_clip": 0.01188137, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.0601387, + "balance_loss_mlp": 1.02300072, + "epoch": 0.222088619010401, + "flos": 25558019625600.0, + "grad_norm": 2.584833786171382, + "language_loss": 0.64709997, + "learning_rate": 3.6254191934527854e-06, + "loss": 0.66930115, + "num_input_tokens_seen": 39314395, + "step": 1847, + "time_per_iteration": 2.562049150466919 + }, + { + "auxiliary_loss_clip": 0.01166238, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.0582577, + "balance_loss_mlp": 1.02153349, + "epoch": 0.2222088619010401, + "flos": 19318612677120.0, + "grad_norm": 1.9226166159272982, + "language_loss": 0.64894891, + "learning_rate": 3.6249651875809715e-06, + "loss": 0.67092842, + "num_input_tokens_seen": 39334275, + "step": 1848, + "time_per_iteration": 2.5935308933258057 + }, + { + "auxiliary_loss_clip": 0.01175522, + "auxiliary_loss_mlp": 0.01029503, + "balance_loss_clip": 1.05706406, + "balance_loss_mlp": 1.02031803, + "epoch": 0.2223291047916792, + "flos": 19099342103040.0, + "grad_norm": 7.227544714356135, + "language_loss": 0.89247525, + "learning_rate": 3.62451093520323e-06, + "loss": 0.91452557, + "num_input_tokens_seen": 39352180, + "step": 1849, + "time_per_iteration": 2.5309360027313232 + }, + { + "auxiliary_loss_clip": 0.01148092, + "auxiliary_loss_mlp": 0.01044556, + "balance_loss_clip": 1.05008125, + "balance_loss_mlp": 1.03546, + "epoch": 0.22244934768231828, + "flos": 20850418126080.0, + "grad_norm": 2.1579872598994565, + "language_loss": 0.90356302, + "learning_rate": 3.6240564363884714e-06, + "loss": 0.92548943, + "num_input_tokens_seen": 39372125, + "step": 1850, + "time_per_iteration": 2.63720440864563 + }, + { + "auxiliary_loss_clip": 0.01200005, + "auxiliary_loss_mlp": 0.01036502, + "balance_loss_clip": 1.05682766, + "balance_loss_mlp": 1.02674496, + "epoch": 0.2225695905729574, + "flos": 15632921111040.0, + "grad_norm": 1.75157871921144, + "language_loss": 0.70813847, + "learning_rate": 3.623601691205643e-06, + "loss": 0.73050356, + "num_input_tokens_seen": 39391200, + "step": 1851, + "time_per_iteration": 2.4775195121765137 + }, + { + "auxiliary_loss_clip": 0.01194789, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.05567741, + "balance_loss_mlp": 1.02156365, + "epoch": 0.22268983346359647, + "flos": 25373582265600.0, + "grad_norm": 1.8621046458517891, + "language_loss": 0.81557226, + "learning_rate": 3.623146699723729e-06, + "loss": 0.83782822, + "num_input_tokens_seen": 39410660, + "step": 1852, + "time_per_iteration": 2.531960964202881 + }, + { + "auxiliary_loss_clip": 0.01186888, + "auxiliary_loss_mlp": 0.01037261, + "balance_loss_clip": 1.06328213, + "balance_loss_mlp": 1.02761054, + "epoch": 0.22281007635423555, + "flos": 13261452359040.0, + "grad_norm": 1.6332051369645153, + "language_loss": 0.77762592, + "learning_rate": 3.6226914620117507e-06, + "loss": 0.79986745, + "num_input_tokens_seen": 39429280, + "step": 1853, + "time_per_iteration": 2.5118017196655273 + }, + { + "auxiliary_loss_clip": 0.01169863, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.05193532, + "balance_loss_mlp": 1.02275991, + "epoch": 0.22293031924487464, + "flos": 15340536403200.0, + "grad_norm": 2.0006313314340067, + "language_loss": 0.80482614, + "learning_rate": 3.622235978138768e-06, + "loss": 0.82684064, + "num_input_tokens_seen": 39446905, + "step": 1854, + "time_per_iteration": 2.6237547397613525 + }, + { + "auxiliary_loss_clip": 0.01197278, + "auxiliary_loss_mlp": 0.01032242, + "balance_loss_clip": 1.06022823, + "balance_loss_mlp": 1.02272892, + "epoch": 0.22305056213551375, + "flos": 22564649773440.0, + "grad_norm": 2.1398346854486725, + "language_loss": 0.81190479, + "learning_rate": 3.621780248173877e-06, + "loss": 0.83420002, + "num_input_tokens_seen": 39465105, + "step": 1855, + "time_per_iteration": 2.4982926845550537 + }, + { + "auxiliary_loss_clip": 0.01097183, + "auxiliary_loss_mlp": 0.01002805, + "balance_loss_clip": 1.02057099, + "balance_loss_mlp": 1.00080252, + "epoch": 0.22317080502615283, + "flos": 64880419887360.0, + "grad_norm": 0.8410559876680398, + "language_loss": 0.61048156, + "learning_rate": 3.6213242721862125e-06, + "loss": 0.63148147, + "num_input_tokens_seen": 39523560, + "step": 1856, + "time_per_iteration": 3.818101406097412 + }, + { + "auxiliary_loss_clip": 0.01173337, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.05621672, + "balance_loss_mlp": 1.02550411, + "epoch": 0.2232910479167919, + "flos": 25775997310080.0, + "grad_norm": 1.4987662474977888, + "language_loss": 0.75227791, + "learning_rate": 3.620868050244945e-06, + "loss": 0.77435923, + "num_input_tokens_seen": 39544040, + "step": 1857, + "time_per_iteration": 2.577911376953125 + }, + { + "auxiliary_loss_clip": 0.01178091, + "auxiliary_loss_mlp": 0.01029488, + "balance_loss_clip": 1.05635858, + "balance_loss_mlp": 1.01986146, + "epoch": 0.22341129080743102, + "flos": 23251799928960.0, + "grad_norm": 1.9680800985008076, + "language_loss": 0.7753967, + "learning_rate": 3.6204115824192817e-06, + "loss": 0.79747254, + "num_input_tokens_seen": 39561515, + "step": 1858, + "time_per_iteration": 2.5499627590179443 + }, + { + "auxiliary_loss_clip": 0.01173993, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.05305171, + "balance_loss_mlp": 1.02219737, + "epoch": 0.2235315336980701, + "flos": 21214552250880.0, + "grad_norm": 2.808298294099733, + "language_loss": 0.76733863, + "learning_rate": 3.619954868778471e-06, + "loss": 0.78940272, + "num_input_tokens_seen": 39578210, + "step": 1859, + "time_per_iteration": 2.5119450092315674 + }, + { + "auxiliary_loss_clip": 0.01183139, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.05604696, + "balance_loss_mlp": 1.02290463, + "epoch": 0.2236517765887092, + "flos": 19901945548800.0, + "grad_norm": 1.7440016445338404, + "language_loss": 0.82612014, + "learning_rate": 3.6194979093917944e-06, + "loss": 0.84826785, + "num_input_tokens_seen": 39597625, + "step": 1860, + "time_per_iteration": 2.5276501178741455 + }, + { + "auxiliary_loss_clip": 0.01177121, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.05537796, + "balance_loss_mlp": 1.02556002, + "epoch": 0.22377201947934827, + "flos": 23214847812480.0, + "grad_norm": 1.7725746526255604, + "language_loss": 0.87095654, + "learning_rate": 3.6190407043285724e-06, + "loss": 0.89307678, + "num_input_tokens_seen": 39615360, + "step": 1861, + "time_per_iteration": 4.09570837020874 + }, + { + "auxiliary_loss_clip": 0.01215014, + "auxiliary_loss_mlp": 0.01037519, + "balance_loss_clip": 1.06042933, + "balance_loss_mlp": 1.02786326, + "epoch": 0.22389226236998738, + "flos": 26794244056320.0, + "grad_norm": 1.8921639252287807, + "language_loss": 0.75986218, + "learning_rate": 3.618583253658163e-06, + "loss": 0.7823875, + "num_input_tokens_seen": 39635460, + "step": 1862, + "time_per_iteration": 2.5491108894348145 + }, + { + "auxiliary_loss_clip": 0.01152737, + "auxiliary_loss_mlp": 0.00764669, + "balance_loss_clip": 1.05302012, + "balance_loss_mlp": 1.0010612, + "epoch": 0.22401250526062647, + "flos": 24170359455360.0, + "grad_norm": 2.952019235629655, + "language_loss": 0.86152518, + "learning_rate": 3.618125557449961e-06, + "loss": 0.88069916, + "num_input_tokens_seen": 39653515, + "step": 1863, + "time_per_iteration": 3.5280940532684326 + }, + { + "auxiliary_loss_clip": 0.01191843, + "auxiliary_loss_mlp": 0.010288, + "balance_loss_clip": 1.0563668, + "balance_loss_mlp": 1.01929879, + "epoch": 0.22413274815126555, + "flos": 16759761649920.0, + "grad_norm": 2.311019623290249, + "language_loss": 0.83227563, + "learning_rate": 3.6176676157733983e-06, + "loss": 0.85448205, + "num_input_tokens_seen": 39668525, + "step": 1864, + "time_per_iteration": 2.462193489074707 + }, + { + "auxiliary_loss_clip": 0.01161394, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.05289829, + "balance_loss_mlp": 1.02735877, + "epoch": 0.22425299104190466, + "flos": 21360205900800.0, + "grad_norm": 2.8505556215349483, + "language_loss": 0.75787318, + "learning_rate": 3.6172094286979443e-06, + "loss": 0.77985966, + "num_input_tokens_seen": 39685895, + "step": 1865, + "time_per_iteration": 2.609046697616577 + }, + { + "auxiliary_loss_clip": 0.01181035, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.05404341, + "balance_loss_mlp": 1.02346706, + "epoch": 0.22437323393254374, + "flos": 32165547108480.0, + "grad_norm": 1.4156214573285086, + "language_loss": 0.81338, + "learning_rate": 3.6167509962931064e-06, + "loss": 0.83551788, + "num_input_tokens_seen": 39711595, + "step": 1866, + "time_per_iteration": 2.6897945404052734 + }, + { + "auxiliary_loss_clip": 0.01161041, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.05710804, + "balance_loss_mlp": 1.01896751, + "epoch": 0.22449347682318282, + "flos": 18002809664640.0, + "grad_norm": 2.2370069235604504, + "language_loss": 0.76549971, + "learning_rate": 3.6162923186284276e-06, + "loss": 0.78739536, + "num_input_tokens_seen": 39727555, + "step": 1867, + "time_per_iteration": 2.6231961250305176 + }, + { + "auxiliary_loss_clip": 0.01182318, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.05549192, + "balance_loss_mlp": 1.02699065, + "epoch": 0.2246137197138219, + "flos": 18697286194560.0, + "grad_norm": 2.0421828904511967, + "language_loss": 0.85978276, + "learning_rate": 3.6158333957734888e-06, + "loss": 0.88196903, + "num_input_tokens_seen": 39746145, + "step": 1868, + "time_per_iteration": 2.6381263732910156 + }, + { + "auxiliary_loss_clip": 0.01167205, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.05100977, + "balance_loss_mlp": 1.02525008, + "epoch": 0.22473396260446102, + "flos": 15590653781760.0, + "grad_norm": 1.9013850075845973, + "language_loss": 0.82532275, + "learning_rate": 3.6153742277979088e-06, + "loss": 0.84734106, + "num_input_tokens_seen": 39763575, + "step": 1869, + "time_per_iteration": 2.545250654220581 + }, + { + "auxiliary_loss_clip": 0.01183896, + "auxiliary_loss_mlp": 0.01039762, + "balance_loss_clip": 1.055112, + "balance_loss_mlp": 1.03070188, + "epoch": 0.2248542054951001, + "flos": 14465501182080.0, + "grad_norm": 2.057626890370743, + "language_loss": 0.77764982, + "learning_rate": 3.6149148147713434e-06, + "loss": 0.79988641, + "num_input_tokens_seen": 39781810, + "step": 1870, + "time_per_iteration": 2.544001579284668 + }, + { + "auxiliary_loss_clip": 0.01201153, + "auxiliary_loss_mlp": 0.01038132, + "balance_loss_clip": 1.06147194, + "balance_loss_mlp": 1.02956057, + "epoch": 0.22497444838573918, + "flos": 19243882431360.0, + "grad_norm": 1.766461432513536, + "language_loss": 0.8652041, + "learning_rate": 3.614455156763484e-06, + "loss": 0.88759691, + "num_input_tokens_seen": 39800115, + "step": 1871, + "time_per_iteration": 2.4834346771240234 + }, + { + "auxiliary_loss_clip": 0.01146557, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.04785085, + "balance_loss_mlp": 1.02363062, + "epoch": 0.2250946912763783, + "flos": 16910299549440.0, + "grad_norm": 2.6370225819270434, + "language_loss": 0.70896232, + "learning_rate": 3.613995253844061e-06, + "loss": 0.7307539, + "num_input_tokens_seen": 39817795, + "step": 1872, + "time_per_iteration": 2.5602660179138184 + }, + { + "auxiliary_loss_clip": 0.01196406, + "auxiliary_loss_mlp": 0.0103535, + "balance_loss_clip": 1.06020665, + "balance_loss_mlp": 1.02601624, + "epoch": 0.22521493416701738, + "flos": 24681368292480.0, + "grad_norm": 1.798372688699403, + "language_loss": 0.80867589, + "learning_rate": 3.6135351060828414e-06, + "loss": 0.83099341, + "num_input_tokens_seen": 39838270, + "step": 1873, + "time_per_iteration": 2.5409224033355713 + }, + { + "auxiliary_loss_clip": 0.01220434, + "auxiliary_loss_mlp": 0.0103906, + "balance_loss_clip": 1.06436157, + "balance_loss_mlp": 1.02898717, + "epoch": 0.22533517705765646, + "flos": 17821963664640.0, + "grad_norm": 2.0492729597735324, + "language_loss": 0.69010735, + "learning_rate": 3.6130747135496285e-06, + "loss": 0.71270227, + "num_input_tokens_seen": 39857270, + "step": 1874, + "time_per_iteration": 2.4644970893859863 + }, + { + "auxiliary_loss_clip": 0.01210026, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.05867648, + "balance_loss_mlp": 1.02365756, + "epoch": 0.22545541994829554, + "flos": 33691390899840.0, + "grad_norm": 1.7827911741383184, + "language_loss": 0.66058755, + "learning_rate": 3.6126140763142646e-06, + "loss": 0.68302369, + "num_input_tokens_seen": 39882300, + "step": 1875, + "time_per_iteration": 2.559697151184082 + }, + { + "auxiliary_loss_clip": 0.01212399, + "auxiliary_loss_mlp": 0.01037022, + "balance_loss_clip": 1.06109285, + "balance_loss_mlp": 1.02701998, + "epoch": 0.22557566283893465, + "flos": 19171594310400.0, + "grad_norm": 2.4274741006433356, + "language_loss": 0.85937142, + "learning_rate": 3.6121531944466275e-06, + "loss": 0.88186562, + "num_input_tokens_seen": 39899625, + "step": 1876, + "time_per_iteration": 2.440394878387451 + }, + { + "auxiliary_loss_clip": 0.01194085, + "auxiliary_loss_mlp": 0.01034832, + "balance_loss_clip": 1.05843496, + "balance_loss_mlp": 1.02622509, + "epoch": 0.22569590572957374, + "flos": 20773281669120.0, + "grad_norm": 2.1539290620252047, + "language_loss": 0.77862257, + "learning_rate": 3.611692068016633e-06, + "loss": 0.80091172, + "num_input_tokens_seen": 39915955, + "step": 1877, + "time_per_iteration": 2.4572715759277344 + }, + { + "auxiliary_loss_clip": 0.01161585, + "auxiliary_loss_mlp": 0.01040391, + "balance_loss_clip": 1.05093312, + "balance_loss_mlp": 1.02957284, + "epoch": 0.22581614862021282, + "flos": 18442715529600.0, + "grad_norm": 2.5197761837077675, + "language_loss": 0.75196058, + "learning_rate": 3.611230697094233e-06, + "loss": 0.77398038, + "num_input_tokens_seen": 39932655, + "step": 1878, + "time_per_iteration": 2.526951789855957 + }, + { + "auxiliary_loss_clip": 0.01185388, + "auxiliary_loss_mlp": 0.01034721, + "balance_loss_clip": 1.056512, + "balance_loss_mlp": 1.02591705, + "epoch": 0.22593639151085193, + "flos": 20048389297920.0, + "grad_norm": 1.678854837733348, + "language_loss": 0.87091386, + "learning_rate": 3.6107690817494173e-06, + "loss": 0.89311492, + "num_input_tokens_seen": 39952875, + "step": 1879, + "time_per_iteration": 2.522254467010498 + }, + { + "auxiliary_loss_clip": 0.01148417, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.05149508, + "balance_loss_mlp": 1.02117801, + "epoch": 0.226056634401491, + "flos": 13115116350720.0, + "grad_norm": 2.1488764209036777, + "language_loss": 0.70709503, + "learning_rate": 3.6103072220522117e-06, + "loss": 0.7288819, + "num_input_tokens_seen": 39968405, + "step": 1880, + "time_per_iteration": 2.560596466064453 + }, + { + "auxiliary_loss_clip": 0.01169747, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.05342841, + "balance_loss_mlp": 1.02445626, + "epoch": 0.2261768772921301, + "flos": 18988378012800.0, + "grad_norm": 1.7765401844506985, + "language_loss": 0.91854233, + "learning_rate": 3.609845118072682e-06, + "loss": 0.94057626, + "num_input_tokens_seen": 39987075, + "step": 1881, + "time_per_iteration": 2.553225040435791 + }, + { + "auxiliary_loss_clip": 0.01202534, + "auxiliary_loss_mlp": 0.00764683, + "balance_loss_clip": 1.05849695, + "balance_loss_mlp": 1.00138748, + "epoch": 0.2262971201827692, + "flos": 19974054101760.0, + "grad_norm": 1.694630358953258, + "language_loss": 0.79980481, + "learning_rate": 3.6093827698809276e-06, + "loss": 0.81947696, + "num_input_tokens_seen": 40006175, + "step": 1882, + "time_per_iteration": 2.507412910461426 + }, + { + "auxiliary_loss_clip": 0.01194086, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.05495417, + "balance_loss_mlp": 1.02384615, + "epoch": 0.2264173630734083, + "flos": 16654543735680.0, + "grad_norm": 2.4870651622619486, + "language_loss": 0.84627283, + "learning_rate": 3.6089201775470864e-06, + "loss": 0.86854541, + "num_input_tokens_seen": 40021630, + "step": 1883, + "time_per_iteration": 3.356411933898926 + }, + { + "auxiliary_loss_clip": 0.01156713, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.05406153, + "balance_loss_mlp": 1.02385366, + "epoch": 0.22653760596404737, + "flos": 24389809597440.0, + "grad_norm": 1.5980422163105645, + "language_loss": 0.7733652, + "learning_rate": 3.6084573411413334e-06, + "loss": 0.79526281, + "num_input_tokens_seen": 40041025, + "step": 1884, + "time_per_iteration": 2.639164447784424 + }, + { + "auxiliary_loss_clip": 0.01167947, + "auxiliary_loss_mlp": 0.01034053, + "balance_loss_clip": 1.05470824, + "balance_loss_mlp": 1.02347338, + "epoch": 0.22665784885468646, + "flos": 18332541538560.0, + "grad_norm": 2.002538122309892, + "language_loss": 0.80942845, + "learning_rate": 3.607994260733881e-06, + "loss": 0.8314485, + "num_input_tokens_seen": 40060265, + "step": 1885, + "time_per_iteration": 2.5440845489501953 + }, + { + "auxiliary_loss_clip": 0.01185279, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.05463064, + "balance_loss_mlp": 1.02295518, + "epoch": 0.22677809174532557, + "flos": 24058102475520.0, + "grad_norm": 1.5554029587567604, + "language_loss": 0.74481738, + "learning_rate": 3.6075309363949776e-06, + "loss": 0.76698655, + "num_input_tokens_seen": 40079435, + "step": 1886, + "time_per_iteration": 2.522874355316162 + }, + { + "auxiliary_loss_clip": 0.01212279, + "auxiliary_loss_mlp": 0.01032151, + "balance_loss_clip": 1.05919671, + "balance_loss_mlp": 1.02242374, + "epoch": 0.22689833463596465, + "flos": 20374242503040.0, + "grad_norm": 1.85262244672997, + "language_loss": 0.81397605, + "learning_rate": 3.6070673681949094e-06, + "loss": 0.8364203, + "num_input_tokens_seen": 40097800, + "step": 1887, + "time_per_iteration": 2.456326723098755 + }, + { + "auxiliary_loss_clip": 0.01185055, + "auxiliary_loss_mlp": 0.00764559, + "balance_loss_clip": 1.05831265, + "balance_loss_mlp": 1.00133884, + "epoch": 0.22701857752660373, + "flos": 30120398438400.0, + "grad_norm": 1.6268443727496442, + "language_loss": 0.81358945, + "learning_rate": 3.606603556203999e-06, + "loss": 0.8330856, + "num_input_tokens_seen": 40122745, + "step": 1888, + "time_per_iteration": 4.228064298629761 + }, + { + "auxiliary_loss_clip": 0.01198232, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.05625868, + "balance_loss_mlp": 1.02239454, + "epoch": 0.22713882041724284, + "flos": 22492182084480.0, + "grad_norm": 1.7632587759513543, + "language_loss": 0.83352435, + "learning_rate": 3.6061395004926066e-06, + "loss": 0.85582519, + "num_input_tokens_seen": 40141680, + "step": 1889, + "time_per_iteration": 3.262193441390991 + }, + { + "auxiliary_loss_clip": 0.01211146, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.05805635, + "balance_loss_mlp": 1.02418613, + "epoch": 0.22725906330788193, + "flos": 20521548178560.0, + "grad_norm": 2.2855377048546073, + "language_loss": 0.85256624, + "learning_rate": 3.605675201131129e-06, + "loss": 0.87500888, + "num_input_tokens_seen": 40160140, + "step": 1890, + "time_per_iteration": 2.4436450004577637 + }, + { + "auxiliary_loss_clip": 0.0120498, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.06190777, + "balance_loss_mlp": 1.02477169, + "epoch": 0.227379306198521, + "flos": 18989922297600.0, + "grad_norm": 3.085510768938293, + "language_loss": 0.79544914, + "learning_rate": 3.60521065819e-06, + "loss": 0.81783712, + "num_input_tokens_seen": 40177450, + "step": 1891, + "time_per_iteration": 2.471529245376587 + }, + { + "auxiliary_loss_clip": 0.01186725, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.05681765, + "balance_loss_mlp": 1.02118468, + "epoch": 0.2274995490891601, + "flos": 21798351999360.0, + "grad_norm": 1.750440729399665, + "language_loss": 0.87294722, + "learning_rate": 3.60474587173969e-06, + "loss": 0.8951152, + "num_input_tokens_seen": 40195935, + "step": 1892, + "time_per_iteration": 2.5093612670898438 + }, + { + "auxiliary_loss_clip": 0.01194574, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.05955219, + "balance_loss_mlp": 1.02458954, + "epoch": 0.2276197919797992, + "flos": 19058654972160.0, + "grad_norm": 2.1311786087909104, + "language_loss": 0.84084237, + "learning_rate": 3.6042808418507084e-06, + "loss": 0.86312717, + "num_input_tokens_seen": 40213620, + "step": 1893, + "time_per_iteration": 2.4648184776306152 + }, + { + "auxiliary_loss_clip": 0.01200071, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.06109428, + "balance_loss_mlp": 1.02554417, + "epoch": 0.22774003487043828, + "flos": 18806777827200.0, + "grad_norm": 1.955299249982967, + "language_loss": 0.76953387, + "learning_rate": 3.6038155685935976e-06, + "loss": 0.791888, + "num_input_tokens_seen": 40230190, + "step": 1894, + "time_per_iteration": 2.4637415409088135 + }, + { + "auxiliary_loss_clip": 0.01197584, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.05956101, + "balance_loss_mlp": 1.02306914, + "epoch": 0.22786027776107737, + "flos": 23002544476800.0, + "grad_norm": 2.000883448980883, + "language_loss": 0.7022593, + "learning_rate": 3.6033500520389404e-06, + "loss": 0.72455221, + "num_input_tokens_seen": 40246860, + "step": 1895, + "time_per_iteration": 2.491769790649414 + }, + { + "auxiliary_loss_clip": 0.01068616, + "auxiliary_loss_mlp": 0.01009785, + "balance_loss_clip": 1.02241266, + "balance_loss_mlp": 1.00771105, + "epoch": 0.22798052065171648, + "flos": 66706872600960.0, + "grad_norm": 0.7949700442855614, + "language_loss": 0.64839649, + "learning_rate": 3.6028842922573553e-06, + "loss": 0.66918051, + "num_input_tokens_seen": 40311005, + "step": 1896, + "time_per_iteration": 3.22192645072937 + }, + { + "auxiliary_loss_clip": 0.01077348, + "auxiliary_loss_mlp": 0.00755278, + "balance_loss_clip": 1.01899493, + "balance_loss_mlp": 1.00060868, + "epoch": 0.22810076354235556, + "flos": 62080896758400.0, + "grad_norm": 0.8551623343962836, + "language_loss": 0.62954676, + "learning_rate": 3.602418289319497e-06, + "loss": 0.64787304, + "num_input_tokens_seen": 40369560, + "step": 1897, + "time_per_iteration": 3.0851709842681885 + }, + { + "auxiliary_loss_clip": 0.01149455, + "auxiliary_loss_mlp": 0.01040992, + "balance_loss_clip": 1.05142379, + "balance_loss_mlp": 1.03111541, + "epoch": 0.22822100643299464, + "flos": 23876358635520.0, + "grad_norm": 2.228971319604175, + "language_loss": 0.73193431, + "learning_rate": 3.601952043296059e-06, + "loss": 0.75383878, + "num_input_tokens_seen": 40389555, + "step": 1898, + "time_per_iteration": 2.6397178173065186 + }, + { + "auxiliary_loss_clip": 0.0118906, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.0557797, + "balance_loss_mlp": 1.02162933, + "epoch": 0.22834124932363373, + "flos": 20991331180800.0, + "grad_norm": 2.074384505395952, + "language_loss": 0.80487108, + "learning_rate": 3.6014855542577696e-06, + "loss": 0.82707584, + "num_input_tokens_seen": 40406765, + "step": 1899, + "time_per_iteration": 2.5139636993408203 + }, + { + "auxiliary_loss_clip": 0.01184238, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.05886614, + "balance_loss_mlp": 1.01950204, + "epoch": 0.22846149221427284, + "flos": 24901572620160.0, + "grad_norm": 2.543376675448328, + "language_loss": 0.84110832, + "learning_rate": 3.6010188222753943e-06, + "loss": 0.8632459, + "num_input_tokens_seen": 40427535, + "step": 1900, + "time_per_iteration": 2.56365704536438 + }, + { + "auxiliary_loss_clip": 0.01082346, + "auxiliary_loss_mlp": 0.01002792, + "balance_loss_clip": 1.01863575, + "balance_loss_mlp": 1.00069356, + "epoch": 0.22858173510491192, + "flos": 56132294319360.0, + "grad_norm": 0.9022624327597358, + "language_loss": 0.64163357, + "learning_rate": 3.6005518474197372e-06, + "loss": 0.662485, + "num_input_tokens_seen": 40479580, + "step": 1901, + "time_per_iteration": 2.9723317623138428 + }, + { + "auxiliary_loss_clip": 0.01200087, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.0618825, + "balance_loss_mlp": 1.02179599, + "epoch": 0.228701977995551, + "flos": 24170826332160.0, + "grad_norm": 1.917708266939375, + "language_loss": 0.78678107, + "learning_rate": 3.6000846297616373e-06, + "loss": 0.80909979, + "num_input_tokens_seen": 40497880, + "step": 1902, + "time_per_iteration": 2.5205769538879395 + }, + { + "auxiliary_loss_clip": 0.01217655, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.06284475, + "balance_loss_mlp": 1.02511525, + "epoch": 0.22882222088619011, + "flos": 21387892308480.0, + "grad_norm": 2.2367049438208877, + "language_loss": 0.72577667, + "learning_rate": 3.5996171693719717e-06, + "loss": 0.74830973, + "num_input_tokens_seen": 40513975, + "step": 1903, + "time_per_iteration": 2.446446180343628 + }, + { + "auxiliary_loss_clip": 0.01093641, + "auxiliary_loss_mlp": 0.01002634, + "balance_loss_clip": 1.01633501, + "balance_loss_mlp": 1.00058365, + "epoch": 0.2289424637768292, + "flos": 64589615377920.0, + "grad_norm": 0.8524544728841859, + "language_loss": 0.64828813, + "learning_rate": 3.5991494663216528e-06, + "loss": 0.66925085, + "num_input_tokens_seen": 40576960, + "step": 1904, + "time_per_iteration": 3.116755723953247 + }, + { + "auxiliary_loss_clip": 0.01213106, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.06150508, + "balance_loss_mlp": 1.02337575, + "epoch": 0.22906270666746828, + "flos": 22163419877760.0, + "grad_norm": 2.0772396769716686, + "language_loss": 0.8789835, + "learning_rate": 3.5986815206816314e-06, + "loss": 0.90144724, + "num_input_tokens_seen": 40595780, + "step": 1905, + "time_per_iteration": 2.4720869064331055 + }, + { + "auxiliary_loss_clip": 0.01212723, + "auxiliary_loss_mlp": 0.01036613, + "balance_loss_clip": 1.05971622, + "balance_loss_mlp": 1.02758324, + "epoch": 0.2291829495581074, + "flos": 25772334122880.0, + "grad_norm": 1.6721112265936668, + "language_loss": 0.74228716, + "learning_rate": 3.598213332522895e-06, + "loss": 0.76478052, + "num_input_tokens_seen": 40615810, + "step": 1906, + "time_per_iteration": 2.5030322074890137 + }, + { + "auxiliary_loss_clip": 0.0119579, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.05640984, + "balance_loss_mlp": 1.02469051, + "epoch": 0.22930319244874647, + "flos": 31172760126720.0, + "grad_norm": 1.7626005332426953, + "language_loss": 0.77627993, + "learning_rate": 3.597744901916466e-06, + "loss": 0.79858351, + "num_input_tokens_seen": 40637095, + "step": 1907, + "time_per_iteration": 2.5630226135253906 + }, + { + "auxiliary_loss_clip": 0.01217234, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.05994177, + "balance_loss_mlp": 1.02094948, + "epoch": 0.22942343533938556, + "flos": 23254098399360.0, + "grad_norm": 2.0036138019384597, + "language_loss": 0.77062041, + "learning_rate": 3.5972762289334058e-06, + "loss": 0.79310548, + "num_input_tokens_seen": 40656725, + "step": 1908, + "time_per_iteration": 2.4795968532562256 + }, + { + "auxiliary_loss_clip": 0.01137738, + "auxiliary_loss_mlp": 0.01030195, + "balance_loss_clip": 1.05492449, + "balance_loss_mlp": 1.01984787, + "epoch": 0.22954367823002464, + "flos": 14610903436800.0, + "grad_norm": 2.344248666402184, + "language_loss": 0.85875928, + "learning_rate": 3.5968073136448116e-06, + "loss": 0.88043857, + "num_input_tokens_seen": 40674745, + "step": 1909, + "time_per_iteration": 2.656871795654297 + }, + { + "auxiliary_loss_clip": 0.01204396, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.05936444, + "balance_loss_mlp": 1.02869606, + "epoch": 0.22966392112066375, + "flos": 16763604405120.0, + "grad_norm": 1.9527608320634198, + "language_loss": 0.91326749, + "learning_rate": 3.596338156121818e-06, + "loss": 0.93570518, + "num_input_tokens_seen": 40693630, + "step": 1910, + "time_per_iteration": 3.258638620376587 + }, + { + "auxiliary_loss_clip": 0.01079996, + "auxiliary_loss_mlp": 0.01006173, + "balance_loss_clip": 1.01524448, + "balance_loss_mlp": 1.00396776, + "epoch": 0.22978416401130283, + "flos": 67474247783040.0, + "grad_norm": 0.7531319588106659, + "language_loss": 0.59334505, + "learning_rate": 3.595868756435595e-06, + "loss": 0.61420667, + "num_input_tokens_seen": 40761310, + "step": 1911, + "time_per_iteration": 3.21110200881958 + }, + { + "auxiliary_loss_clip": 0.01173709, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.06015825, + "balance_loss_mlp": 1.02354252, + "epoch": 0.22990440690194192, + "flos": 19865137086720.0, + "grad_norm": 3.039849484425994, + "language_loss": 0.80342269, + "learning_rate": 3.5953991146573504e-06, + "loss": 0.82549369, + "num_input_tokens_seen": 40779955, + "step": 1912, + "time_per_iteration": 2.5468242168426514 + }, + { + "auxiliary_loss_clip": 0.01200103, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.05556583, + "balance_loss_mlp": 1.02489865, + "epoch": 0.23002464979258103, + "flos": 13289246507520.0, + "grad_norm": 2.5303212855337005, + "language_loss": 0.83767378, + "learning_rate": 3.5949292308583294e-06, + "loss": 0.86003071, + "num_input_tokens_seen": 40793200, + "step": 1913, + "time_per_iteration": 2.487194538116455 + }, + { + "auxiliary_loss_clip": 0.01216072, + "auxiliary_loss_mlp": 0.01037106, + "balance_loss_clip": 1.06302524, + "balance_loss_mlp": 1.02593029, + "epoch": 0.2301448926832201, + "flos": 22163779013760.0, + "grad_norm": 2.0650369610145654, + "language_loss": 0.81123304, + "learning_rate": 3.594459105109811e-06, + "loss": 0.83376479, + "num_input_tokens_seen": 40812380, + "step": 1914, + "time_per_iteration": 2.508103847503662 + }, + { + "auxiliary_loss_clip": 0.01202038, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.0613215, + "balance_loss_mlp": 1.02423763, + "epoch": 0.2302651355738592, + "flos": 20704477167360.0, + "grad_norm": 1.8871314228454843, + "language_loss": 0.81382042, + "learning_rate": 3.593988737483115e-06, + "loss": 0.83617306, + "num_input_tokens_seen": 40832320, + "step": 1915, + "time_per_iteration": 3.2859911918640137 + }, + { + "auxiliary_loss_clip": 0.01186547, + "auxiliary_loss_mlp": 0.01032597, + "balance_loss_clip": 1.05922151, + "balance_loss_mlp": 1.02239883, + "epoch": 0.23038537846449827, + "flos": 18588943797120.0, + "grad_norm": 1.9987507603901142, + "language_loss": 0.78167987, + "learning_rate": 3.5935181280495947e-06, + "loss": 0.80387127, + "num_input_tokens_seen": 40850900, + "step": 1916, + "time_per_iteration": 3.951631784439087 + }, + { + "auxiliary_loss_clip": 0.01079214, + "auxiliary_loss_mlp": 0.0100613, + "balance_loss_clip": 1.01748371, + "balance_loss_mlp": 1.00391281, + "epoch": 0.23050562135513739, + "flos": 64224260190720.0, + "grad_norm": 1.141582369657413, + "language_loss": 0.54305559, + "learning_rate": 3.5930472768806412e-06, + "loss": 0.56390905, + "num_input_tokens_seen": 40909570, + "step": 1917, + "time_per_iteration": 3.0576629638671875 + }, + { + "auxiliary_loss_clip": 0.01214286, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.06318712, + "balance_loss_mlp": 1.02474284, + "epoch": 0.23062586424577647, + "flos": 17313396952320.0, + "grad_norm": 2.055180467581532, + "language_loss": 0.7726469, + "learning_rate": 3.5925761840476826e-06, + "loss": 0.79514074, + "num_input_tokens_seen": 40928180, + "step": 1918, + "time_per_iteration": 2.5157463550567627 + }, + { + "auxiliary_loss_clip": 0.01179267, + "auxiliary_loss_mlp": 0.01030805, + "balance_loss_clip": 1.05661821, + "balance_loss_mlp": 1.02135789, + "epoch": 0.23074610713641555, + "flos": 27855979194240.0, + "grad_norm": 2.5827155660946066, + "language_loss": 0.81580555, + "learning_rate": 3.592104849622183e-06, + "loss": 0.83790624, + "num_input_tokens_seen": 40950435, + "step": 1919, + "time_per_iteration": 2.6263978481292725 + }, + { + "auxiliary_loss_clip": 0.01145763, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.05307734, + "balance_loss_mlp": 1.02380419, + "epoch": 0.23086635002705466, + "flos": 28841798937600.0, + "grad_norm": 1.4116367489873378, + "language_loss": 0.73175931, + "learning_rate": 3.591633273675644e-06, + "loss": 0.75355732, + "num_input_tokens_seen": 40972670, + "step": 1920, + "time_per_iteration": 2.6646108627319336 + }, + { + "auxiliary_loss_clip": 0.01067167, + "auxiliary_loss_mlp": 0.01003754, + "balance_loss_clip": 1.03127885, + "balance_loss_mlp": 1.00181067, + "epoch": 0.23098659291769374, + "flos": 62923681566720.0, + "grad_norm": 0.9106875721211679, + "language_loss": 0.58206427, + "learning_rate": 3.591161456279602e-06, + "loss": 0.60277343, + "num_input_tokens_seen": 41018215, + "step": 1921, + "time_per_iteration": 2.9403340816497803 + }, + { + "auxiliary_loss_clip": 0.0118989, + "auxiliary_loss_mlp": 0.01032541, + "balance_loss_clip": 1.05684257, + "balance_loss_mlp": 1.02245593, + "epoch": 0.23110683580833283, + "flos": 23476816679040.0, + "grad_norm": 1.4167439815836878, + "language_loss": 0.80415785, + "learning_rate": 3.590689397505633e-06, + "loss": 0.8263821, + "num_input_tokens_seen": 41039125, + "step": 1922, + "time_per_iteration": 2.5673224925994873 + }, + { + "auxiliary_loss_clip": 0.01214186, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.06196582, + "balance_loss_mlp": 1.02591169, + "epoch": 0.2312270786989719, + "flos": 27271066124160.0, + "grad_norm": 1.9743791394477397, + "language_loss": 0.86826575, + "learning_rate": 3.590217097425347e-06, + "loss": 0.89076835, + "num_input_tokens_seen": 41059025, + "step": 1923, + "time_per_iteration": 2.529221296310425 + }, + { + "auxiliary_loss_clip": 0.01219172, + "auxiliary_loss_mlp": 0.01035375, + "balance_loss_clip": 1.06354892, + "balance_loss_mlp": 1.02501535, + "epoch": 0.23134732158961102, + "flos": 13261344618240.0, + "grad_norm": 2.213690190960906, + "language_loss": 0.71270281, + "learning_rate": 3.589744556110391e-06, + "loss": 0.73524827, + "num_input_tokens_seen": 41077015, + "step": 1924, + "time_per_iteration": 2.5460097789764404 + }, + { + "auxiliary_loss_clip": 0.0118301, + "auxiliary_loss_mlp": 0.01035367, + "balance_loss_clip": 1.05683398, + "balance_loss_mlp": 1.02630746, + "epoch": 0.2314675644802501, + "flos": 36977648250240.0, + "grad_norm": 1.6650406098685195, + "language_loss": 0.84148622, + "learning_rate": 3.58927177363245e-06, + "loss": 0.86366999, + "num_input_tokens_seen": 41099840, + "step": 1925, + "time_per_iteration": 2.6951029300689697 + }, + { + "auxiliary_loss_clip": 0.01165886, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.052665, + "balance_loss_mlp": 1.02649307, + "epoch": 0.2315878073708892, + "flos": 23842207779840.0, + "grad_norm": 1.9643516594188515, + "language_loss": 0.72241181, + "learning_rate": 3.5887987500632447e-06, + "loss": 0.74444306, + "num_input_tokens_seen": 41117845, + "step": 1926, + "time_per_iteration": 2.6027755737304688 + }, + { + "auxiliary_loss_clip": 0.01172444, + "auxiliary_loss_mlp": 0.0103787, + "balance_loss_clip": 1.05474257, + "balance_loss_mlp": 1.0282557, + "epoch": 0.2317080502615283, + "flos": 23039424766080.0, + "grad_norm": 1.8016250581001383, + "language_loss": 0.84067488, + "learning_rate": 3.5883254854745325e-06, + "loss": 0.86277807, + "num_input_tokens_seen": 41136235, + "step": 1927, + "time_per_iteration": 2.5712711811065674 + }, + { + "auxiliary_loss_clip": 0.01202177, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.05684543, + "balance_loss_mlp": 1.02334356, + "epoch": 0.23182829315216738, + "flos": 11254656435840.0, + "grad_norm": 3.566091232788691, + "language_loss": 0.7510159, + "learning_rate": 3.587851979938107e-06, + "loss": 0.7733739, + "num_input_tokens_seen": 41153125, + "step": 1928, + "time_per_iteration": 2.4758996963500977 + }, + { + "auxiliary_loss_clip": 0.01200271, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.06127357, + "balance_loss_mlp": 1.02282059, + "epoch": 0.23194853604280646, + "flos": 19828939155840.0, + "grad_norm": 1.82296161915464, + "language_loss": 0.7753073, + "learning_rate": 3.5873782335257985e-06, + "loss": 0.7976386, + "num_input_tokens_seen": 41171290, + "step": 1929, + "time_per_iteration": 2.5345215797424316 + }, + { + "auxiliary_loss_clip": 0.0117114, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.0597111, + "balance_loss_mlp": 1.02293468, + "epoch": 0.23206877893344555, + "flos": 15305020830720.0, + "grad_norm": 1.9682904941680974, + "language_loss": 0.78569937, + "learning_rate": 3.5869042463094744e-06, + "loss": 0.80773962, + "num_input_tokens_seen": 41189005, + "step": 1930, + "time_per_iteration": 2.5945239067077637 + }, + { + "auxiliary_loss_clip": 0.01137677, + "auxiliary_loss_mlp": 0.01040518, + "balance_loss_clip": 1.05089545, + "balance_loss_mlp": 1.02966452, + "epoch": 0.23218902182408466, + "flos": 22711488572160.0, + "grad_norm": 2.033754221408769, + "language_loss": 0.77478647, + "learning_rate": 3.586430018361038e-06, + "loss": 0.79656845, + "num_input_tokens_seen": 41208775, + "step": 1931, + "time_per_iteration": 2.6398918628692627 + }, + { + "auxiliary_loss_clip": 0.01169643, + "auxiliary_loss_mlp": 0.01036818, + "balance_loss_clip": 1.0527035, + "balance_loss_mlp": 1.02604699, + "epoch": 0.23230926471472374, + "flos": 22710734386560.0, + "grad_norm": 2.2167267803260713, + "language_loss": 0.76422822, + "learning_rate": 3.5859555497524283e-06, + "loss": 0.78629279, + "num_input_tokens_seen": 41226010, + "step": 1932, + "time_per_iteration": 2.540698528289795 + }, + { + "auxiliary_loss_clip": 0.0120198, + "auxiliary_loss_mlp": 0.01041713, + "balance_loss_clip": 1.06218648, + "balance_loss_mlp": 1.03183079, + "epoch": 0.23242950760536282, + "flos": 20375499479040.0, + "grad_norm": 1.7188709737759602, + "language_loss": 0.91859341, + "learning_rate": 3.5854808405556237e-06, + "loss": 0.94103038, + "num_input_tokens_seen": 41245245, + "step": 1933, + "time_per_iteration": 2.535888195037842 + }, + { + "auxiliary_loss_clip": 0.01171082, + "auxiliary_loss_mlp": 0.01036526, + "balance_loss_clip": 1.056126, + "balance_loss_mlp": 1.02753198, + "epoch": 0.23254975049600193, + "flos": 16908324301440.0, + "grad_norm": 2.4861661130431982, + "language_loss": 0.75450265, + "learning_rate": 3.5850058908426355e-06, + "loss": 0.77657866, + "num_input_tokens_seen": 41263795, + "step": 1934, + "time_per_iteration": 2.51936936378479 + }, + { + "auxiliary_loss_clip": 0.01185775, + "auxiliary_loss_mlp": 0.010382, + "balance_loss_clip": 1.05445838, + "balance_loss_mlp": 1.02863395, + "epoch": 0.23266999338664102, + "flos": 23294821443840.0, + "grad_norm": 1.7381419494916412, + "language_loss": 0.85462075, + "learning_rate": 3.584530700685514e-06, + "loss": 0.8768605, + "num_input_tokens_seen": 41284055, + "step": 1935, + "time_per_iteration": 2.5522818565368652 + }, + { + "auxiliary_loss_clip": 0.01185285, + "auxiliary_loss_mlp": 0.01032078, + "balance_loss_clip": 1.06303811, + "balance_loss_mlp": 1.02230859, + "epoch": 0.2327902362772801, + "flos": 19569987031680.0, + "grad_norm": 2.129023727802878, + "language_loss": 0.88705873, + "learning_rate": 3.5840552701563448e-06, + "loss": 0.90923238, + "num_input_tokens_seen": 41300255, + "step": 1936, + "time_per_iteration": 3.2881481647491455 + }, + { + "auxiliary_loss_clip": 0.01210859, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.05930591, + "balance_loss_mlp": 1.02475381, + "epoch": 0.2329104791679192, + "flos": 16727514215040.0, + "grad_norm": 1.9703345669727705, + "language_loss": 0.82090592, + "learning_rate": 3.5835795993272513e-06, + "loss": 0.84336257, + "num_input_tokens_seen": 41318540, + "step": 1937, + "time_per_iteration": 2.4339427947998047 + }, + { + "auxiliary_loss_clip": 0.01104375, + "auxiliary_loss_mlp": 0.01044577, + "balance_loss_clip": 1.04682565, + "balance_loss_mlp": 1.03427124, + "epoch": 0.2330307220585583, + "flos": 22163743100160.0, + "grad_norm": 1.8755915243729122, + "language_loss": 0.7091279, + "learning_rate": 3.583103688270391e-06, + "loss": 0.7306174, + "num_input_tokens_seen": 41338320, + "step": 1938, + "time_per_iteration": 2.6657626628875732 + }, + { + "auxiliary_loss_clip": 0.01171494, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.05351317, + "balance_loss_mlp": 1.02815664, + "epoch": 0.23315096494919738, + "flos": 19317319787520.0, + "grad_norm": 2.376216213693678, + "language_loss": 0.89369261, + "learning_rate": 3.58262753705796e-06, + "loss": 0.91580224, + "num_input_tokens_seen": 41353210, + "step": 1939, + "time_per_iteration": 2.488239049911499 + }, + { + "auxiliary_loss_clip": 0.0107791, + "auxiliary_loss_mlp": 0.01007056, + "balance_loss_clip": 1.01747227, + "balance_loss_mlp": 1.00486219, + "epoch": 0.23327120783983646, + "flos": 53031048946560.0, + "grad_norm": 0.7631672700986737, + "language_loss": 0.55516255, + "learning_rate": 3.5821511457621902e-06, + "loss": 0.57601219, + "num_input_tokens_seen": 41410510, + "step": 1940, + "time_per_iteration": 3.0605716705322266 + }, + { + "auxiliary_loss_clip": 0.01180585, + "auxiliary_loss_mlp": 0.0103823, + "balance_loss_clip": 1.05777717, + "balance_loss_mlp": 1.02764463, + "epoch": 0.23339145073047557, + "flos": 17126984344320.0, + "grad_norm": 6.2926689811121745, + "language_loss": 0.81019461, + "learning_rate": 3.5816745144553497e-06, + "loss": 0.8323828, + "num_input_tokens_seen": 41425830, + "step": 1941, + "time_per_iteration": 3.4351396560668945 + }, + { + "auxiliary_loss_clip": 0.01147198, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.05294192, + "balance_loss_mlp": 1.02071607, + "epoch": 0.23351169362111465, + "flos": 13078918419840.0, + "grad_norm": 1.8039527568336453, + "language_loss": 0.75882763, + "learning_rate": 3.5811976432097424e-06, + "loss": 0.78060377, + "num_input_tokens_seen": 41443500, + "step": 1942, + "time_per_iteration": 3.503387928009033 + }, + { + "auxiliary_loss_clip": 0.01199217, + "auxiliary_loss_mlp": 0.00763914, + "balance_loss_clip": 1.0627718, + "balance_loss_mlp": 1.00112748, + "epoch": 0.23363193651175373, + "flos": 15851257931520.0, + "grad_norm": 1.9695472104332836, + "language_loss": 0.84652603, + "learning_rate": 3.58072053209771e-06, + "loss": 0.86615741, + "num_input_tokens_seen": 41460055, + "step": 1943, + "time_per_iteration": 3.4055943489074707 + }, + { + "auxiliary_loss_clip": 0.01175842, + "auxiliary_loss_mlp": 0.01034016, + "balance_loss_clip": 1.05372202, + "balance_loss_mlp": 1.02397299, + "epoch": 0.23375217940239285, + "flos": 21025769345280.0, + "grad_norm": 2.174746196584717, + "language_loss": 0.79158592, + "learning_rate": 3.5802431811916296e-06, + "loss": 0.81368446, + "num_input_tokens_seen": 41476665, + "step": 1944, + "time_per_iteration": 2.5280187129974365 + }, + { + "auxiliary_loss_clip": 0.01177993, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.05752134, + "balance_loss_mlp": 1.02278864, + "epoch": 0.23387242229303193, + "flos": 20594698225920.0, + "grad_norm": 1.581849501493909, + "language_loss": 0.80612743, + "learning_rate": 3.579765590563916e-06, + "loss": 0.82822633, + "num_input_tokens_seen": 41496065, + "step": 1945, + "time_per_iteration": 2.517467975616455 + }, + { + "auxiliary_loss_clip": 0.01185959, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.05641294, + "balance_loss_mlp": 1.02445209, + "epoch": 0.233992665183671, + "flos": 24279491952000.0, + "grad_norm": 2.1421609710953633, + "language_loss": 0.81794375, + "learning_rate": 3.579287760287017e-06, + "loss": 0.84014356, + "num_input_tokens_seen": 41516815, + "step": 1946, + "time_per_iteration": 2.5322391986846924 + }, + { + "auxiliary_loss_clip": 0.01195244, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.05951786, + "balance_loss_mlp": 1.02550459, + "epoch": 0.2341129080743101, + "flos": 30154621121280.0, + "grad_norm": 1.6701998067578485, + "language_loss": 0.72982132, + "learning_rate": 3.578809690433421e-06, + "loss": 0.75212198, + "num_input_tokens_seen": 41538525, + "step": 1947, + "time_per_iteration": 2.5901741981506348 + }, + { + "auxiliary_loss_clip": 0.01216502, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.06233728, + "balance_loss_mlp": 1.02536225, + "epoch": 0.2342331509649492, + "flos": 22784135829120.0, + "grad_norm": 3.0899874102552833, + "language_loss": 0.81282669, + "learning_rate": 3.578331381075651e-06, + "loss": 0.83534467, + "num_input_tokens_seen": 41559025, + "step": 1948, + "time_per_iteration": 2.478379249572754 + }, + { + "auxiliary_loss_clip": 0.01195655, + "auxiliary_loss_mlp": 0.01027976, + "balance_loss_clip": 1.0563972, + "balance_loss_mlp": 1.01804018, + "epoch": 0.2343533938555883, + "flos": 23623152687360.0, + "grad_norm": 2.3852847405067155, + "language_loss": 0.7014935, + "learning_rate": 3.5778528322862646e-06, + "loss": 0.72372973, + "num_input_tokens_seen": 41577845, + "step": 1949, + "time_per_iteration": 2.493685722351074 + }, + { + "auxiliary_loss_clip": 0.01197576, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.05652976, + "balance_loss_mlp": 1.02382398, + "epoch": 0.23447363674622737, + "flos": 24570332375040.0, + "grad_norm": 1.7446932756141125, + "language_loss": 0.86427855, + "learning_rate": 3.5773740441378585e-06, + "loss": 0.88658625, + "num_input_tokens_seen": 41598600, + "step": 1950, + "time_per_iteration": 2.503650665283203 + }, + { + "auxiliary_loss_clip": 0.0119322, + "auxiliary_loss_mlp": 0.01035219, + "balance_loss_clip": 1.05789709, + "balance_loss_mlp": 1.02642751, + "epoch": 0.23459387963686648, + "flos": 53140322119680.0, + "grad_norm": 1.9303240901587033, + "language_loss": 0.7384454, + "learning_rate": 3.5768950167030633e-06, + "loss": 0.76072979, + "num_input_tokens_seen": 41623300, + "step": 1951, + "time_per_iteration": 2.76902174949646 + }, + { + "auxiliary_loss_clip": 0.01168085, + "auxiliary_loss_mlp": 0.01039836, + "balance_loss_clip": 1.05140662, + "balance_loss_mlp": 1.02932715, + "epoch": 0.23471412252750556, + "flos": 23951412103680.0, + "grad_norm": 1.7530416842226695, + "language_loss": 0.7859537, + "learning_rate": 3.576415750054548e-06, + "loss": 0.80803293, + "num_input_tokens_seen": 41643420, + "step": 1952, + "time_per_iteration": 2.532989025115967 + }, + { + "auxiliary_loss_clip": 0.01171753, + "auxiliary_loss_mlp": 0.01029506, + "balance_loss_clip": 1.05436397, + "balance_loss_mlp": 1.01979041, + "epoch": 0.23483436541814465, + "flos": 15706573948800.0, + "grad_norm": 2.122591379026021, + "language_loss": 0.85433239, + "learning_rate": 3.5759362442650172e-06, + "loss": 0.87634498, + "num_input_tokens_seen": 41660170, + "step": 1953, + "time_per_iteration": 2.4692180156707764 + }, + { + "auxiliary_loss_clip": 0.01195762, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.06047153, + "balance_loss_mlp": 1.0250051, + "epoch": 0.23495460830878373, + "flos": 24936262179840.0, + "grad_norm": 1.9578781640102352, + "language_loss": 0.85788423, + "learning_rate": 3.5754564994072113e-06, + "loss": 0.88018763, + "num_input_tokens_seen": 41679010, + "step": 1954, + "time_per_iteration": 2.5092616081237793 + }, + { + "auxiliary_loss_clip": 0.01176095, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.05374837, + "balance_loss_mlp": 1.02030933, + "epoch": 0.23507485119942284, + "flos": 30482665056000.0, + "grad_norm": 2.3991324281503976, + "language_loss": 0.59896034, + "learning_rate": 3.5749765155539067e-06, + "loss": 0.62102246, + "num_input_tokens_seen": 41699495, + "step": 1955, + "time_per_iteration": 2.5802903175354004 + }, + { + "auxiliary_loss_clip": 0.011618, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.05204999, + "balance_loss_mlp": 1.02102184, + "epoch": 0.23519509409006192, + "flos": 18329129746560.0, + "grad_norm": 2.069646590780623, + "language_loss": 0.92129302, + "learning_rate": 3.574496292777917e-06, + "loss": 0.94322246, + "num_input_tokens_seen": 41717705, + "step": 1956, + "time_per_iteration": 2.5243914127349854 + }, + { + "auxiliary_loss_clip": 0.01187219, + "auxiliary_loss_mlp": 0.01038092, + "balance_loss_clip": 1.05769801, + "balance_loss_mlp": 1.02772045, + "epoch": 0.235315336980701, + "flos": 29643217234560.0, + "grad_norm": 2.1397815160009013, + "language_loss": 0.71522522, + "learning_rate": 3.574015831152092e-06, + "loss": 0.73747826, + "num_input_tokens_seen": 41738120, + "step": 1957, + "time_per_iteration": 2.577772617340088 + }, + { + "auxiliary_loss_clip": 0.01169303, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.05332494, + "balance_loss_mlp": 1.0184114, + "epoch": 0.23543557987134012, + "flos": 18551704371840.0, + "grad_norm": 2.145312357236173, + "language_loss": 0.82958084, + "learning_rate": 3.573535130749316e-06, + "loss": 0.85154963, + "num_input_tokens_seen": 41756070, + "step": 1958, + "time_per_iteration": 2.4939799308776855 + }, + { + "auxiliary_loss_clip": 0.01171354, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.05547571, + "balance_loss_mlp": 1.02314353, + "epoch": 0.2355558227619792, + "flos": 24679033908480.0, + "grad_norm": 1.780969939571169, + "language_loss": 0.73835349, + "learning_rate": 3.5730541916425127e-06, + "loss": 0.76039213, + "num_input_tokens_seen": 41777550, + "step": 1959, + "time_per_iteration": 2.5523130893707275 + }, + { + "auxiliary_loss_clip": 0.01167102, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.05516648, + "balance_loss_mlp": 1.02154684, + "epoch": 0.23567606565261828, + "flos": 21944795748480.0, + "grad_norm": 2.1798481064057516, + "language_loss": 0.86333805, + "learning_rate": 3.572573013904639e-06, + "loss": 0.88531554, + "num_input_tokens_seen": 41797460, + "step": 1960, + "time_per_iteration": 2.562910556793213 + }, + { + "auxiliary_loss_clip": 0.01207322, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.05781198, + "balance_loss_mlp": 1.01959753, + "epoch": 0.2357963085432574, + "flos": 13589352639360.0, + "grad_norm": 1.8989969939690523, + "language_loss": 0.92437601, + "learning_rate": 3.572091597608689e-06, + "loss": 0.94673622, + "num_input_tokens_seen": 41815585, + "step": 1961, + "time_per_iteration": 2.4310882091522217 + }, + { + "auxiliary_loss_clip": 0.01186678, + "auxiliary_loss_mlp": 0.01029173, + "balance_loss_clip": 1.05842948, + "balance_loss_mlp": 1.0190165, + "epoch": 0.23591655143389648, + "flos": 22088689632000.0, + "grad_norm": 2.030006436309881, + "language_loss": 0.73458111, + "learning_rate": 3.571609942827694e-06, + "loss": 0.75673962, + "num_input_tokens_seen": 41834700, + "step": 1962, + "time_per_iteration": 2.5295491218566895 + }, + { + "auxiliary_loss_clip": 0.01175824, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.05458248, + "balance_loss_mlp": 1.02138197, + "epoch": 0.23603679432453556, + "flos": 17017349057280.0, + "grad_norm": 1.6459726084108075, + "language_loss": 0.8807925, + "learning_rate": 3.57112804963472e-06, + "loss": 0.90285432, + "num_input_tokens_seen": 41852915, + "step": 1963, + "time_per_iteration": 3.2141830921173096 + }, + { + "auxiliary_loss_clip": 0.01161075, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.05763388, + "balance_loss_mlp": 1.02352524, + "epoch": 0.23615703721517464, + "flos": 19171307001600.0, + "grad_norm": 1.712395445324564, + "language_loss": 0.76341313, + "learning_rate": 3.57064591810287e-06, + "loss": 0.78534532, + "num_input_tokens_seen": 41870415, + "step": 1964, + "time_per_iteration": 2.516754150390625 + }, + { + "auxiliary_loss_clip": 0.01207831, + "auxiliary_loss_mlp": 0.00763411, + "balance_loss_clip": 1.06002223, + "balance_loss_mlp": 1.00098753, + "epoch": 0.23627728010581375, + "flos": 19098803399040.0, + "grad_norm": 2.684279824800443, + "language_loss": 0.80677903, + "learning_rate": 3.570163548305284e-06, + "loss": 0.82649142, + "num_input_tokens_seen": 41889345, + "step": 1965, + "time_per_iteration": 2.4404945373535156 + }, + { + "auxiliary_loss_clip": 0.011796, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.0570581, + "balance_loss_mlp": 1.02599633, + "epoch": 0.23639752299645284, + "flos": 14282213057280.0, + "grad_norm": 2.205486786075132, + "language_loss": 0.70317101, + "learning_rate": 3.569680940315135e-06, + "loss": 0.72532892, + "num_input_tokens_seen": 41905745, + "step": 1966, + "time_per_iteration": 2.4783945083618164 + }, + { + "auxiliary_loss_clip": 0.01167947, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.0539552, + "balance_loss_mlp": 1.02604389, + "epoch": 0.23651776588709192, + "flos": 22893411980160.0, + "grad_norm": 1.711111628342339, + "language_loss": 0.82186335, + "learning_rate": 3.5691980942056356e-06, + "loss": 0.84390771, + "num_input_tokens_seen": 41925115, + "step": 1967, + "time_per_iteration": 2.5597589015960693 + }, + { + "auxiliary_loss_clip": 0.01195936, + "auxiliary_loss_mlp": 0.0103177, + "balance_loss_clip": 1.05563736, + "balance_loss_mlp": 1.02214348, + "epoch": 0.23663800877773103, + "flos": 18624531196800.0, + "grad_norm": 1.8968133420157383, + "language_loss": 0.79519069, + "learning_rate": 3.5687150100500332e-06, + "loss": 0.81746775, + "num_input_tokens_seen": 41944815, + "step": 1968, + "time_per_iteration": 4.048121929168701 + }, + { + "auxiliary_loss_clip": 0.01194354, + "auxiliary_loss_mlp": 0.01029718, + "balance_loss_clip": 1.05718446, + "balance_loss_mlp": 1.02028883, + "epoch": 0.2367582516683701, + "flos": 25555828896000.0, + "grad_norm": 1.6384315955063882, + "language_loss": 0.74450326, + "learning_rate": 3.568231687921611e-06, + "loss": 0.76674402, + "num_input_tokens_seen": 41964990, + "step": 1969, + "time_per_iteration": 3.294912815093994 + }, + { + "auxiliary_loss_clip": 0.01206387, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.05937552, + "balance_loss_mlp": 1.02368355, + "epoch": 0.2368784945590092, + "flos": 23295072839040.0, + "grad_norm": 1.5766181893394022, + "language_loss": 0.803388, + "learning_rate": 3.5677481278936883e-06, + "loss": 0.82577455, + "num_input_tokens_seen": 41984570, + "step": 1970, + "time_per_iteration": 2.4610962867736816 + }, + { + "auxiliary_loss_clip": 0.01078091, + "auxiliary_loss_mlp": 0.01006469, + "balance_loss_clip": 1.01890397, + "balance_loss_mlp": 1.00428724, + "epoch": 0.23699873744964828, + "flos": 69859291875840.0, + "grad_norm": 0.8211361237673271, + "language_loss": 0.5783478, + "learning_rate": 3.5672643300396214e-06, + "loss": 0.59919339, + "num_input_tokens_seen": 42053715, + "step": 1971, + "time_per_iteration": 3.1245429515838623 + }, + { + "auxiliary_loss_clip": 0.01163946, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.05486631, + "balance_loss_mlp": 1.0205431, + "epoch": 0.2371189803402874, + "flos": 21835052720640.0, + "grad_norm": 2.3621955412448465, + "language_loss": 0.67235255, + "learning_rate": 3.566780294432802e-06, + "loss": 0.69428289, + "num_input_tokens_seen": 42070890, + "step": 1972, + "time_per_iteration": 2.558406352996826 + }, + { + "auxiliary_loss_clip": 0.01208319, + "auxiliary_loss_mlp": 0.01035361, + "balance_loss_clip": 1.05955148, + "balance_loss_mlp": 1.02683783, + "epoch": 0.23723922323092647, + "flos": 21908490076800.0, + "grad_norm": 2.695657757732184, + "language_loss": 0.74392891, + "learning_rate": 3.566296021146657e-06, + "loss": 0.76636571, + "num_input_tokens_seen": 42090270, + "step": 1973, + "time_per_iteration": 2.456747531890869 + }, + { + "auxiliary_loss_clip": 0.01212888, + "auxiliary_loss_mlp": 0.01029933, + "balance_loss_clip": 1.0618751, + "balance_loss_mlp": 1.01995564, + "epoch": 0.23735946612156555, + "flos": 32708803380480.0, + "grad_norm": 1.7048742604732068, + "language_loss": 0.73261094, + "learning_rate": 3.565811510254652e-06, + "loss": 0.75503916, + "num_input_tokens_seen": 42111150, + "step": 1974, + "time_per_iteration": 2.523646116256714 + }, + { + "auxiliary_loss_clip": 0.01092139, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.02876222, + "balance_loss_mlp": 1.0003947, + "epoch": 0.23747970901220466, + "flos": 70546944821760.0, + "grad_norm": 0.8447967044284997, + "language_loss": 0.58299577, + "learning_rate": 3.5653267618302845e-06, + "loss": 0.60393965, + "num_input_tokens_seen": 42178730, + "step": 1975, + "time_per_iteration": 3.1120734214782715 + }, + { + "auxiliary_loss_clip": 0.01204702, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.05645812, + "balance_loss_mlp": 1.02193141, + "epoch": 0.23759995190284375, + "flos": 20849807594880.0, + "grad_norm": 1.8125924328341991, + "language_loss": 0.85644114, + "learning_rate": 3.564841775947093e-06, + "loss": 0.87880051, + "num_input_tokens_seen": 42199620, + "step": 1976, + "time_per_iteration": 2.4575963020324707 + }, + { + "auxiliary_loss_clip": 0.01162223, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.0518837, + "balance_loss_mlp": 1.02215672, + "epoch": 0.23772019479348283, + "flos": 32921645420160.0, + "grad_norm": 2.133168025616641, + "language_loss": 0.75806093, + "learning_rate": 3.5643565526786475e-06, + "loss": 0.78000134, + "num_input_tokens_seen": 42219560, + "step": 1977, + "time_per_iteration": 2.649470090866089 + }, + { + "auxiliary_loss_clip": 0.01207519, + "auxiliary_loss_mlp": 0.0103177, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.0225966, + "epoch": 0.2378404376841219, + "flos": 32342765834880.0, + "grad_norm": 1.552470044204016, + "language_loss": 0.76914215, + "learning_rate": 3.5638710920985574e-06, + "loss": 0.79153508, + "num_input_tokens_seen": 42241020, + "step": 1978, + "time_per_iteration": 2.5345230102539062 + }, + { + "auxiliary_loss_clip": 0.01198578, + "auxiliary_loss_mlp": 0.00763999, + "balance_loss_clip": 1.05542219, + "balance_loss_mlp": 1.00098896, + "epoch": 0.23796068057476102, + "flos": 22997624313600.0, + "grad_norm": 2.338833278481084, + "language_loss": 0.82406962, + "learning_rate": 3.5633853942804655e-06, + "loss": 0.8436954, + "num_input_tokens_seen": 42259345, + "step": 1979, + "time_per_iteration": 2.4918370246887207 + }, + { + "auxiliary_loss_clip": 0.01164166, + "auxiliary_loss_mlp": 0.0103589, + "balance_loss_clip": 1.05111861, + "balance_loss_mlp": 1.02628231, + "epoch": 0.2380809234654001, + "flos": 13480938414720.0, + "grad_norm": 2.0364476027014216, + "language_loss": 0.76620996, + "learning_rate": 3.5628994592980527e-06, + "loss": 0.78821051, + "num_input_tokens_seen": 42277250, + "step": 1980, + "time_per_iteration": 2.5912833213806152 + }, + { + "auxiliary_loss_clip": 0.01208798, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.05881119, + "balance_loss_mlp": 1.0203079, + "epoch": 0.2382011663560392, + "flos": 16871803148160.0, + "grad_norm": 1.8294721305396655, + "language_loss": 0.70781136, + "learning_rate": 3.562413287225034e-06, + "loss": 0.73018909, + "num_input_tokens_seen": 42295360, + "step": 1981, + "time_per_iteration": 2.4310972690582275 + }, + { + "auxiliary_loss_clip": 0.01191047, + "auxiliary_loss_mlp": 0.01027398, + "balance_loss_clip": 1.0586338, + "balance_loss_mlp": 1.01831985, + "epoch": 0.2383214092466783, + "flos": 18441135331200.0, + "grad_norm": 2.386765287096748, + "language_loss": 0.89198446, + "learning_rate": 3.5619268781351623e-06, + "loss": 0.91416883, + "num_input_tokens_seen": 42313430, + "step": 1982, + "time_per_iteration": 2.4772651195526123 + }, + { + "auxiliary_loss_clip": 0.01173254, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.05694747, + "balance_loss_mlp": 1.02380228, + "epoch": 0.23844165213731738, + "flos": 19755717281280.0, + "grad_norm": 1.8584889681004397, + "language_loss": 0.767946, + "learning_rate": 3.5614402321022256e-06, + "loss": 0.78999835, + "num_input_tokens_seen": 42331260, + "step": 1983, + "time_per_iteration": 2.5112931728363037 + }, + { + "auxiliary_loss_clip": 0.01141802, + "auxiliary_loss_mlp": 0.01030043, + "balance_loss_clip": 1.04987848, + "balance_loss_mlp": 1.0207324, + "epoch": 0.23856189502795647, + "flos": 23367360960000.0, + "grad_norm": 2.2106463231753968, + "language_loss": 0.87250721, + "learning_rate": 3.5609533492000463e-06, + "loss": 0.89422572, + "num_input_tokens_seen": 42350150, + "step": 1984, + "time_per_iteration": 2.586664915084839 + }, + { + "auxiliary_loss_clip": 0.01174324, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.05682325, + "balance_loss_mlp": 1.02032566, + "epoch": 0.23868213791859555, + "flos": 23475056912640.0, + "grad_norm": 2.126939074207812, + "language_loss": 0.78776956, + "learning_rate": 3.560466229502485e-06, + "loss": 0.80981302, + "num_input_tokens_seen": 42369495, + "step": 1985, + "time_per_iteration": 2.52701997756958 + }, + { + "auxiliary_loss_clip": 0.01179355, + "auxiliary_loss_mlp": 0.00763192, + "balance_loss_clip": 1.05996644, + "balance_loss_mlp": 1.00098681, + "epoch": 0.23880238080923466, + "flos": 16617340224000.0, + "grad_norm": 1.9970731593805708, + "language_loss": 0.90112835, + "learning_rate": 3.5599788730834384e-06, + "loss": 0.9205538, + "num_input_tokens_seen": 42387455, + "step": 1986, + "time_per_iteration": 2.508200168609619 + }, + { + "auxiliary_loss_clip": 0.01197186, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.05838466, + "balance_loss_mlp": 1.01791692, + "epoch": 0.23892262369987374, + "flos": 17348409734400.0, + "grad_norm": 2.151512379617055, + "language_loss": 0.78269207, + "learning_rate": 3.559491280016836e-06, + "loss": 0.80493343, + "num_input_tokens_seen": 42405400, + "step": 1987, + "time_per_iteration": 2.4875130653381348 + }, + { + "auxiliary_loss_clip": 0.01180896, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.05833578, + "balance_loss_mlp": 1.02456224, + "epoch": 0.23904286659051283, + "flos": 22309899540480.0, + "grad_norm": 1.8069004526455272, + "language_loss": 0.70953858, + "learning_rate": 3.5590034503766465e-06, + "loss": 0.73169297, + "num_input_tokens_seen": 42425065, + "step": 1988, + "time_per_iteration": 2.5603647232055664 + }, + { + "auxiliary_loss_clip": 0.01208011, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.05911469, + "balance_loss_mlp": 1.02309048, + "epoch": 0.23916310948115194, + "flos": 21178246579200.0, + "grad_norm": 2.0464997506989144, + "language_loss": 0.81018472, + "learning_rate": 3.558515384236874e-06, + "loss": 0.8325814, + "num_input_tokens_seen": 42442495, + "step": 1989, + "time_per_iteration": 2.4839060306549072 + }, + { + "auxiliary_loss_clip": 0.01156913, + "auxiliary_loss_mlp": 0.00763888, + "balance_loss_clip": 1.05504596, + "balance_loss_mlp": 1.00110257, + "epoch": 0.23928335237179102, + "flos": 14137349506560.0, + "grad_norm": 1.7870546748425957, + "language_loss": 0.83781582, + "learning_rate": 3.558027081671556e-06, + "loss": 0.85702384, + "num_input_tokens_seen": 42459480, + "step": 1990, + "time_per_iteration": 3.2810251712799072 + }, + { + "auxiliary_loss_clip": 0.0119544, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.05576158, + "balance_loss_mlp": 1.02329195, + "epoch": 0.2394035952624301, + "flos": 23769596436480.0, + "grad_norm": 2.0929072045241695, + "language_loss": 0.68961573, + "learning_rate": 3.557538542754769e-06, + "loss": 0.71190417, + "num_input_tokens_seen": 42479175, + "step": 1991, + "time_per_iteration": 2.5202901363372803 + }, + { + "auxiliary_loss_clip": 0.01209581, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.06051886, + "balance_loss_mlp": 1.02386534, + "epoch": 0.2395238381530692, + "flos": 24206198250240.0, + "grad_norm": 2.0192141408910795, + "language_loss": 0.67045939, + "learning_rate": 3.557049767560623e-06, + "loss": 0.69289041, + "num_input_tokens_seen": 42498090, + "step": 1992, + "time_per_iteration": 2.4792706966400146 + }, + { + "auxiliary_loss_clip": 0.01153063, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.05626416, + "balance_loss_mlp": 1.02297163, + "epoch": 0.2396440810437083, + "flos": 25295763450240.0, + "grad_norm": 1.9339542582618987, + "language_loss": 0.85597563, + "learning_rate": 3.5565607561632655e-06, + "loss": 0.87782967, + "num_input_tokens_seen": 42516930, + "step": 1993, + "time_per_iteration": 2.616929769515991 + }, + { + "auxiliary_loss_clip": 0.01173317, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.05414391, + "balance_loss_mlp": 1.01984787, + "epoch": 0.23976432393434738, + "flos": 28543093436160.0, + "grad_norm": 2.788119221953979, + "language_loss": 0.79731739, + "learning_rate": 3.5560715086368787e-06, + "loss": 0.81935066, + "num_input_tokens_seen": 42534800, + "step": 1994, + "time_per_iteration": 2.5620577335357666 + }, + { + "auxiliary_loss_clip": 0.01171578, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.05495453, + "balance_loss_mlp": 1.02384305, + "epoch": 0.23988456682498646, + "flos": 19494358945920.0, + "grad_norm": 2.1909002606684798, + "language_loss": 0.8248843, + "learning_rate": 3.5555820250556816e-06, + "loss": 0.84693015, + "num_input_tokens_seen": 42552000, + "step": 1995, + "time_per_iteration": 4.017388820648193 + }, + { + "auxiliary_loss_clip": 0.01184664, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.05884445, + "balance_loss_mlp": 1.02255702, + "epoch": 0.24000480971562557, + "flos": 20266331068800.0, + "grad_norm": 2.248743825185292, + "language_loss": 0.69822192, + "learning_rate": 3.5550923054939278e-06, + "loss": 0.72038805, + "num_input_tokens_seen": 42571455, + "step": 1996, + "time_per_iteration": 3.2839467525482178 + }, + { + "auxiliary_loss_clip": 0.01140029, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.04854512, + "balance_loss_mlp": 1.02217972, + "epoch": 0.24012505260626466, + "flos": 25443176866560.0, + "grad_norm": 2.9639655675760586, + "language_loss": 0.74442768, + "learning_rate": 3.5546023500259083e-06, + "loss": 0.76614344, + "num_input_tokens_seen": 42592550, + "step": 1997, + "time_per_iteration": 2.6011605262756348 + }, + { + "auxiliary_loss_clip": 0.0115564, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.05386567, + "balance_loss_mlp": 1.02052331, + "epoch": 0.24024529549690374, + "flos": 15553342529280.0, + "grad_norm": 1.8846879068862301, + "language_loss": 0.80108762, + "learning_rate": 3.5541121587259477e-06, + "loss": 0.82294393, + "num_input_tokens_seen": 42610385, + "step": 1998, + "time_per_iteration": 2.582749128341675 + }, + { + "auxiliary_loss_clip": 0.01089247, + "auxiliary_loss_mlp": 0.01003942, + "balance_loss_clip": 1.0214572, + "balance_loss_mlp": 1.00141513, + "epoch": 0.24036553838754285, + "flos": 57122351867520.0, + "grad_norm": 0.8340095925557408, + "language_loss": 0.5789839, + "learning_rate": 3.553621731668408e-06, + "loss": 0.5999158, + "num_input_tokens_seen": 42673595, + "step": 1999, + "time_per_iteration": 3.048616886138916 + }, + { + "auxiliary_loss_clip": 0.01184895, + "auxiliary_loss_mlp": 0.01030675, + "balance_loss_clip": 1.05141425, + "balance_loss_mlp": 1.02100682, + "epoch": 0.24048578127818193, + "flos": 24969946158720.0, + "grad_norm": 1.8981560504418509, + "language_loss": 0.83125305, + "learning_rate": 3.553131068927688e-06, + "loss": 0.85340869, + "num_input_tokens_seen": 42692000, + "step": 2000, + "time_per_iteration": 2.530499219894409 + }, + { + "auxiliary_loss_clip": 0.01160222, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.05294812, + "balance_loss_mlp": 1.02071071, + "epoch": 0.24060602416882101, + "flos": 23330947547520.0, + "grad_norm": 1.8327910669173086, + "language_loss": 0.80179268, + "learning_rate": 3.552640170578219e-06, + "loss": 0.82368279, + "num_input_tokens_seen": 42712250, + "step": 2001, + "time_per_iteration": 2.574028491973877 + }, + { + "auxiliary_loss_clip": 0.01178073, + "auxiliary_loss_mlp": 0.01035896, + "balance_loss_clip": 1.05546522, + "balance_loss_mlp": 1.02700877, + "epoch": 0.2407262670594601, + "flos": 14173260128640.0, + "grad_norm": 1.9067136706905778, + "language_loss": 0.77451611, + "learning_rate": 3.5521490366944703e-06, + "loss": 0.79665583, + "num_input_tokens_seen": 42729900, + "step": 2002, + "time_per_iteration": 2.5029213428497314 + }, + { + "auxiliary_loss_clip": 0.01161466, + "auxiliary_loss_mlp": 0.01027709, + "balance_loss_clip": 1.0509522, + "balance_loss_mlp": 1.01848817, + "epoch": 0.2408465099500992, + "flos": 13663113217920.0, + "grad_norm": 2.0582048669559185, + "language_loss": 0.79777724, + "learning_rate": 3.5516576673509474e-06, + "loss": 0.81966895, + "num_input_tokens_seen": 42747900, + "step": 2003, + "time_per_iteration": 2.5298562049865723 + }, + { + "auxiliary_loss_clip": 0.0120791, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.05792522, + "balance_loss_mlp": 1.02302957, + "epoch": 0.2409667528407383, + "flos": 31248029076480.0, + "grad_norm": 1.594426284355851, + "language_loss": 0.86150956, + "learning_rate": 3.5511660626221896e-06, + "loss": 0.88391042, + "num_input_tokens_seen": 42768540, + "step": 2004, + "time_per_iteration": 2.545865774154663 + }, + { + "auxiliary_loss_clip": 0.01175089, + "auxiliary_loss_mlp": 0.00763756, + "balance_loss_clip": 1.05419433, + "balance_loss_mlp": 1.00105739, + "epoch": 0.24108699573137737, + "flos": 22199941031040.0, + "grad_norm": 2.0535502782337094, + "language_loss": 0.8905865, + "learning_rate": 3.5506742225827744e-06, + "loss": 0.90997493, + "num_input_tokens_seen": 42785395, + "step": 2005, + "time_per_iteration": 2.516042470932007 + }, + { + "auxiliary_loss_clip": 0.01162251, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.05317962, + "balance_loss_mlp": 1.02315819, + "epoch": 0.24120723862201648, + "flos": 26103035664000.0, + "grad_norm": 3.5417253311645323, + "language_loss": 0.90063924, + "learning_rate": 3.5501821473073116e-06, + "loss": 0.92258906, + "num_input_tokens_seen": 42801980, + "step": 2006, + "time_per_iteration": 2.5967018604278564 + }, + { + "auxiliary_loss_clip": 0.01156385, + "auxiliary_loss_mlp": 0.01039553, + "balance_loss_clip": 1.05226231, + "balance_loss_mlp": 1.02902639, + "epoch": 0.24132748151265557, + "flos": 18624926246400.0, + "grad_norm": 2.8012448702216832, + "language_loss": 0.86970949, + "learning_rate": 3.54968983687045e-06, + "loss": 0.89166892, + "num_input_tokens_seen": 42818850, + "step": 2007, + "time_per_iteration": 2.5102784633636475 + }, + { + "auxiliary_loss_clip": 0.01181425, + "auxiliary_loss_mlp": 0.01039314, + "balance_loss_clip": 1.05729544, + "balance_loss_mlp": 1.02866304, + "epoch": 0.24144772440329465, + "flos": 15267673664640.0, + "grad_norm": 2.524199223419678, + "language_loss": 0.89721704, + "learning_rate": 3.549197291346872e-06, + "loss": 0.91942441, + "num_input_tokens_seen": 42835375, + "step": 2008, + "time_per_iteration": 2.4992246627807617 + }, + { + "auxiliary_loss_clip": 0.01191928, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.05439138, + "balance_loss_mlp": 1.02400208, + "epoch": 0.24156796729393373, + "flos": 24024274842240.0, + "grad_norm": 1.9092454251733206, + "language_loss": 0.79375637, + "learning_rate": 3.548704510811297e-06, + "loss": 0.81600702, + "num_input_tokens_seen": 42854570, + "step": 2009, + "time_per_iteration": 2.5196666717529297 + }, + { + "auxiliary_loss_clip": 0.01153145, + "auxiliary_loss_mlp": 0.01039455, + "balance_loss_clip": 1.0503161, + "balance_loss_mlp": 1.02925611, + "epoch": 0.24168821018457284, + "flos": 26286790665600.0, + "grad_norm": 2.5175236695059695, + "language_loss": 0.74547952, + "learning_rate": 3.5482114953384787e-06, + "loss": 0.76740551, + "num_input_tokens_seen": 42873800, + "step": 2010, + "time_per_iteration": 2.649287462234497 + }, + { + "auxiliary_loss_clip": 0.0119497, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.05613387, + "balance_loss_mlp": 1.02440715, + "epoch": 0.24180845307521193, + "flos": 18223193560320.0, + "grad_norm": 2.522405235960374, + "language_loss": 0.84245813, + "learning_rate": 3.5477182450032077e-06, + "loss": 0.86475176, + "num_input_tokens_seen": 42892400, + "step": 2011, + "time_per_iteration": 2.4756505489349365 + }, + { + "auxiliary_loss_clip": 0.0118961, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.05544782, + "balance_loss_mlp": 1.02836895, + "epoch": 0.241928695965851, + "flos": 20449260057600.0, + "grad_norm": 2.1536014985136185, + "language_loss": 0.83332896, + "learning_rate": 3.5472247598803097e-06, + "loss": 0.85560435, + "num_input_tokens_seen": 42911745, + "step": 2012, + "time_per_iteration": 2.4903135299682617 + }, + { + "auxiliary_loss_clip": 0.01208134, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.05741262, + "balance_loss_mlp": 1.02555883, + "epoch": 0.24204893885649012, + "flos": 25556475340800.0, + "grad_norm": 4.535859032847777, + "language_loss": 0.85168195, + "learning_rate": 3.546731040044645e-06, + "loss": 0.87412047, + "num_input_tokens_seen": 42926915, + "step": 2013, + "time_per_iteration": 2.4775543212890625 + }, + { + "auxiliary_loss_clip": 0.01207161, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.05738091, + "balance_loss_mlp": 1.01985633, + "epoch": 0.2421691817471292, + "flos": 30660207004800.0, + "grad_norm": 1.786406169339495, + "language_loss": 0.74875504, + "learning_rate": 3.546237085571112e-06, + "loss": 0.77111733, + "num_input_tokens_seen": 42945350, + "step": 2014, + "time_per_iteration": 2.5072035789489746 + }, + { + "auxiliary_loss_clip": 0.01192789, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.05797005, + "balance_loss_mlp": 1.02363336, + "epoch": 0.24228942463776829, + "flos": 21945011230080.0, + "grad_norm": 2.0654063264821145, + "language_loss": 0.72315049, + "learning_rate": 3.5457428965346425e-06, + "loss": 0.74540573, + "num_input_tokens_seen": 42964290, + "step": 2015, + "time_per_iteration": 2.4778337478637695 + }, + { + "auxiliary_loss_clip": 0.01130307, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.04805791, + "balance_loss_mlp": 1.02033162, + "epoch": 0.2424096675284074, + "flos": 33984493879680.0, + "grad_norm": 1.9686037525701883, + "language_loss": 0.7452234, + "learning_rate": 3.545248473010205e-06, + "loss": 0.76682377, + "num_input_tokens_seen": 42987095, + "step": 2016, + "time_per_iteration": 3.4985241889953613 + }, + { + "auxiliary_loss_clip": 0.01211372, + "auxiliary_loss_mlp": 0.00764214, + "balance_loss_clip": 1.05804455, + "balance_loss_mlp": 1.00114107, + "epoch": 0.24252991041904648, + "flos": 21653416621440.0, + "grad_norm": 1.5901044511353306, + "language_loss": 0.87705028, + "learning_rate": 3.544753815072802e-06, + "loss": 0.89680618, + "num_input_tokens_seen": 43005750, + "step": 2017, + "time_per_iteration": 2.4489965438842773 + }, + { + "auxiliary_loss_clip": 0.01111806, + "auxiliary_loss_mlp": 0.01033741, + "balance_loss_clip": 1.04416776, + "balance_loss_mlp": 1.02457356, + "epoch": 0.24265015330968556, + "flos": 21870065502720.0, + "grad_norm": 1.9449020047720769, + "language_loss": 0.88536596, + "learning_rate": 3.544258922797474e-06, + "loss": 0.90682149, + "num_input_tokens_seen": 43023870, + "step": 2018, + "time_per_iteration": 2.6641147136688232 + }, + { + "auxiliary_loss_clip": 0.0120535, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.05744946, + "balance_loss_mlp": 1.02382874, + "epoch": 0.24277039620032465, + "flos": 25628260671360.0, + "grad_norm": 1.6869452745473819, + "language_loss": 0.78082263, + "learning_rate": 3.543763796259295e-06, + "loss": 0.80320394, + "num_input_tokens_seen": 43043825, + "step": 2019, + "time_per_iteration": 2.505347490310669 + }, + { + "auxiliary_loss_clip": 0.01193251, + "auxiliary_loss_mlp": 0.01038297, + "balance_loss_clip": 1.05625486, + "balance_loss_mlp": 1.02836132, + "epoch": 0.24289063909096376, + "flos": 26286575184000.0, + "grad_norm": 1.816647740402828, + "language_loss": 0.90918803, + "learning_rate": 3.5432684355333754e-06, + "loss": 0.93150353, + "num_input_tokens_seen": 43062480, + "step": 2020, + "time_per_iteration": 2.5252609252929688 + }, + { + "auxiliary_loss_clip": 0.01190485, + "auxiliary_loss_mlp": 0.01038686, + "balance_loss_clip": 1.05345178, + "balance_loss_mlp": 1.02914906, + "epoch": 0.24301088198160284, + "flos": 25075056332160.0, + "grad_norm": 2.0211301304436358, + "language_loss": 0.7666508, + "learning_rate": 3.5427728406948613e-06, + "loss": 0.78894252, + "num_input_tokens_seen": 43081595, + "step": 2021, + "time_per_iteration": 3.3319292068481445 + }, + { + "auxiliary_loss_clip": 0.01084479, + "auxiliary_loss_mlp": 0.01007633, + "balance_loss_clip": 1.02001548, + "balance_loss_mlp": 1.00497425, + "epoch": 0.24313112487224192, + "flos": 69900948673920.0, + "grad_norm": 0.7714755475435252, + "language_loss": 0.57934225, + "learning_rate": 3.542277011818934e-06, + "loss": 0.60026336, + "num_input_tokens_seen": 43145430, + "step": 2022, + "time_per_iteration": 4.001522064208984 + }, + { + "auxiliary_loss_clip": 0.01182377, + "auxiliary_loss_mlp": 0.01035217, + "balance_loss_clip": 1.05960751, + "balance_loss_mlp": 1.02600825, + "epoch": 0.24325136776288103, + "flos": 40662334235520.0, + "grad_norm": 4.6038628471318335, + "language_loss": 0.73980236, + "learning_rate": 3.5417809489808104e-06, + "loss": 0.76197833, + "num_input_tokens_seen": 43167040, + "step": 2023, + "time_per_iteration": 3.545851945877075 + }, + { + "auxiliary_loss_clip": 0.01191905, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.05643225, + "balance_loss_mlp": 1.02507246, + "epoch": 0.24337161065352012, + "flos": 25046400257280.0, + "grad_norm": 1.695217527810192, + "language_loss": 0.72326112, + "learning_rate": 3.5412846522557422e-06, + "loss": 0.74551606, + "num_input_tokens_seen": 43187930, + "step": 2024, + "time_per_iteration": 2.554652214050293 + }, + { + "auxiliary_loss_clip": 0.01208416, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.05853128, + "balance_loss_mlp": 1.02726555, + "epoch": 0.2434918535441592, + "flos": 18661160090880.0, + "grad_norm": 1.971468273647503, + "language_loss": 0.7401107, + "learning_rate": 3.540788121719018e-06, + "loss": 0.76256311, + "num_input_tokens_seen": 43206350, + "step": 2025, + "time_per_iteration": 2.4316303730010986 + }, + { + "auxiliary_loss_clip": 0.01156876, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.05481243, + "balance_loss_mlp": 1.02830958, + "epoch": 0.24361209643479828, + "flos": 23915142345600.0, + "grad_norm": 1.8455876095292723, + "language_loss": 0.82026255, + "learning_rate": 3.5402913574459604e-06, + "loss": 0.8422063, + "num_input_tokens_seen": 43226255, + "step": 2026, + "time_per_iteration": 2.578956365585327 + }, + { + "auxiliary_loss_clip": 0.01128861, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.04742587, + "balance_loss_mlp": 1.02215004, + "epoch": 0.2437323393254374, + "flos": 28657505232000.0, + "grad_norm": 1.5389200720197338, + "language_loss": 0.86036754, + "learning_rate": 3.5397943595119297e-06, + "loss": 0.88196516, + "num_input_tokens_seen": 43247675, + "step": 2027, + "time_per_iteration": 2.7475130558013916 + }, + { + "auxiliary_loss_clip": 0.01172468, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.05576074, + "balance_loss_mlp": 1.02461183, + "epoch": 0.24385258221607647, + "flos": 23550325862400.0, + "grad_norm": 2.4068773510623753, + "language_loss": 0.772421, + "learning_rate": 3.5392971279923177e-06, + "loss": 0.79449379, + "num_input_tokens_seen": 43265895, + "step": 2028, + "time_per_iteration": 2.645144462585449 + }, + { + "auxiliary_loss_clip": 0.01156208, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.0498333, + "balance_loss_mlp": 1.02346373, + "epoch": 0.24397282510671556, + "flos": 25336091445120.0, + "grad_norm": 2.0298598940990344, + "language_loss": 0.82716173, + "learning_rate": 3.5387996629625557e-06, + "loss": 0.84906685, + "num_input_tokens_seen": 43283485, + "step": 2029, + "time_per_iteration": 2.652576446533203 + }, + { + "auxiliary_loss_clip": 0.01107679, + "auxiliary_loss_mlp": 0.01004012, + "balance_loss_clip": 1.02228475, + "balance_loss_mlp": 1.00150824, + "epoch": 0.24409306799735467, + "flos": 65187421430400.0, + "grad_norm": 0.8059934306077553, + "language_loss": 0.55080843, + "learning_rate": 3.5383019644981083e-06, + "loss": 0.57192528, + "num_input_tokens_seen": 43347180, + "step": 2030, + "time_per_iteration": 3.0875186920166016 + }, + { + "auxiliary_loss_clip": 0.01177816, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.05608225, + "balance_loss_mlp": 1.01784301, + "epoch": 0.24421331088799375, + "flos": 19537093152000.0, + "grad_norm": 1.9583131282926063, + "language_loss": 0.72477126, + "learning_rate": 3.5378040326744763e-06, + "loss": 0.74682248, + "num_input_tokens_seen": 43366665, + "step": 2031, + "time_per_iteration": 2.5003750324249268 + }, + { + "auxiliary_loss_clip": 0.01165312, + "auxiliary_loss_mlp": 0.0102782, + "balance_loss_clip": 1.05579281, + "balance_loss_mlp": 1.01914799, + "epoch": 0.24433355377863283, + "flos": 21068575378560.0, + "grad_norm": 2.138662027765353, + "language_loss": 0.85489273, + "learning_rate": 3.5373058675671946e-06, + "loss": 0.87682408, + "num_input_tokens_seen": 43384670, + "step": 2032, + "time_per_iteration": 2.5463173389434814 + }, + { + "auxiliary_loss_clip": 0.01140016, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.05042791, + "balance_loss_mlp": 1.02182186, + "epoch": 0.24445379666927192, + "flos": 22637189289600.0, + "grad_norm": 2.070611475697021, + "language_loss": 0.72023153, + "learning_rate": 3.536807469251836e-06, + "loss": 0.74194741, + "num_input_tokens_seen": 43403825, + "step": 2033, + "time_per_iteration": 2.595122814178467 + }, + { + "auxiliary_loss_clip": 0.0116683, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.05102098, + "balance_loss_mlp": 1.02452171, + "epoch": 0.24457403955991103, + "flos": 21251612108160.0, + "grad_norm": 1.9913046301706476, + "language_loss": 0.82866228, + "learning_rate": 3.5363088378040055e-06, + "loss": 0.85066849, + "num_input_tokens_seen": 43422715, + "step": 2034, + "time_per_iteration": 2.562631368637085 + }, + { + "auxiliary_loss_clip": 0.01108323, + "auxiliary_loss_mlp": 0.00754846, + "balance_loss_clip": 1.02275884, + "balance_loss_mlp": 1.00036967, + "epoch": 0.2446942824505501, + "flos": 66997820764800.0, + "grad_norm": 0.7521021190330228, + "language_loss": 0.64390743, + "learning_rate": 3.5358099732993463e-06, + "loss": 0.66253912, + "num_input_tokens_seen": 43481825, + "step": 2035, + "time_per_iteration": 2.9584155082702637 + }, + { + "auxiliary_loss_clip": 0.01184302, + "auxiliary_loss_mlp": 0.01032242, + "balance_loss_clip": 1.05645418, + "balance_loss_mlp": 1.0232898, + "epoch": 0.2448145253411892, + "flos": 20411122792320.0, + "grad_norm": 1.9469029686984902, + "language_loss": 0.89225352, + "learning_rate": 3.535310875813535e-06, + "loss": 0.914419, + "num_input_tokens_seen": 43500220, + "step": 2036, + "time_per_iteration": 2.519622802734375 + }, + { + "auxiliary_loss_clip": 0.01190443, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.05629146, + "balance_loss_mlp": 1.02588606, + "epoch": 0.2449347682318283, + "flos": 28804739080320.0, + "grad_norm": 1.7606451604611946, + "language_loss": 0.8165729, + "learning_rate": 3.5348115454222843e-06, + "loss": 0.8388257, + "num_input_tokens_seen": 43522805, + "step": 2037, + "time_per_iteration": 2.553697347640991 + }, + { + "auxiliary_loss_clip": 0.01172391, + "auxiliary_loss_mlp": 0.0104319, + "balance_loss_clip": 1.05081272, + "balance_loss_mlp": 1.03398085, + "epoch": 0.2450550111224674, + "flos": 22528990546560.0, + "grad_norm": 2.4005357834164043, + "language_loss": 0.85843873, + "learning_rate": 3.5343119822013425e-06, + "loss": 0.88059461, + "num_input_tokens_seen": 43541915, + "step": 2038, + "time_per_iteration": 2.524523973464966 + }, + { + "auxiliary_loss_clip": 0.01198854, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.05761945, + "balance_loss_mlp": 1.02826977, + "epoch": 0.24517525401310647, + "flos": 21759137326080.0, + "grad_norm": 1.8023127772541039, + "language_loss": 0.77346599, + "learning_rate": 3.533812186226493e-06, + "loss": 0.79583544, + "num_input_tokens_seen": 43562625, + "step": 2039, + "time_per_iteration": 2.509932041168213 + }, + { + "auxiliary_loss_clip": 0.012032, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.05670369, + "balance_loss_mlp": 1.02087772, + "epoch": 0.24529549690374555, + "flos": 25043311687680.0, + "grad_norm": 1.7147445115331474, + "language_loss": 0.75700277, + "learning_rate": 3.5333121575735545e-06, + "loss": 0.77933049, + "num_input_tokens_seen": 43582265, + "step": 2040, + "time_per_iteration": 2.4831888675689697 + }, + { + "auxiliary_loss_clip": 0.01177038, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.05640996, + "balance_loss_mlp": 1.024773, + "epoch": 0.24541573979438466, + "flos": 32123638915200.0, + "grad_norm": 5.378410343875924, + "language_loss": 0.75232208, + "learning_rate": 3.532811896318381e-06, + "loss": 0.77443439, + "num_input_tokens_seen": 43604335, + "step": 2041, + "time_per_iteration": 2.6213223934173584 + }, + { + "auxiliary_loss_clip": 0.0116785, + "auxiliary_loss_mlp": 0.01028629, + "balance_loss_clip": 1.05496454, + "balance_loss_mlp": 1.01922345, + "epoch": 0.24553598268502375, + "flos": 31357556622720.0, + "grad_norm": 2.0881136897706276, + "language_loss": 0.82435435, + "learning_rate": 3.5323114025368615e-06, + "loss": 0.84631914, + "num_input_tokens_seen": 43619400, + "step": 2042, + "time_per_iteration": 3.3722012042999268 + }, + { + "auxiliary_loss_clip": 0.01185971, + "auxiliary_loss_mlp": 0.01030881, + "balance_loss_clip": 1.05261087, + "balance_loss_mlp": 1.02202988, + "epoch": 0.24565622557566283, + "flos": 14027462824320.0, + "grad_norm": 1.9233893935583652, + "language_loss": 0.81537867, + "learning_rate": 3.53181067630492e-06, + "loss": 0.83754718, + "num_input_tokens_seen": 43636870, + "step": 2043, + "time_per_iteration": 2.4582197666168213 + }, + { + "auxiliary_loss_clip": 0.01169134, + "auxiliary_loss_mlp": 0.0104291, + "balance_loss_clip": 1.05348372, + "balance_loss_mlp": 1.03380895, + "epoch": 0.24577646846630194, + "flos": 16581465515520.0, + "grad_norm": 2.6997194556826916, + "language_loss": 0.76166594, + "learning_rate": 3.5313097176985175e-06, + "loss": 0.78378642, + "num_input_tokens_seen": 43655180, + "step": 2044, + "time_per_iteration": 2.501551866531372 + }, + { + "auxiliary_loss_clip": 0.01192053, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.05740428, + "balance_loss_mlp": 1.02324104, + "epoch": 0.24589671135694102, + "flos": 18807424272000.0, + "grad_norm": 3.558740411184615, + "language_loss": 0.81367165, + "learning_rate": 3.5308085267936482e-06, + "loss": 0.83591557, + "num_input_tokens_seen": 43672895, + "step": 2045, + "time_per_iteration": 2.4823992252349854 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.00762697, + "balance_loss_clip": 1.05104995, + "balance_loss_mlp": 1.00104249, + "epoch": 0.2460169542475801, + "flos": 19938538529280.0, + "grad_norm": 2.8787222093649536, + "language_loss": 0.89792275, + "learning_rate": 3.530307103666342e-06, + "loss": 0.91687995, + "num_input_tokens_seen": 43691975, + "step": 2046, + "time_per_iteration": 2.6011970043182373 + }, + { + "auxiliary_loss_clip": 0.01170398, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.05599928, + "balance_loss_mlp": 1.0230875, + "epoch": 0.24613719713821922, + "flos": 24171221381760.0, + "grad_norm": 2.1185470433056923, + "language_loss": 0.80085814, + "learning_rate": 3.5298054483926658e-06, + "loss": 0.82288289, + "num_input_tokens_seen": 43712670, + "step": 2047, + "time_per_iteration": 2.5781612396240234 + }, + { + "auxiliary_loss_clip": 0.01200794, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.05848861, + "balance_loss_mlp": 1.02403247, + "epoch": 0.2462574400288583, + "flos": 30221055325440.0, + "grad_norm": 2.0323089571514426, + "language_loss": 0.82832891, + "learning_rate": 3.5293035610487187e-06, + "loss": 0.85066867, + "num_input_tokens_seen": 43732035, + "step": 2048, + "time_per_iteration": 4.152170658111572 + }, + { + "auxiliary_loss_clip": 0.0107921, + "auxiliary_loss_mlp": 0.01003898, + "balance_loss_clip": 1.02055693, + "balance_loss_mlp": 1.00131083, + "epoch": 0.24637768291949738, + "flos": 68943030819840.0, + "grad_norm": 0.7151327386034538, + "language_loss": 0.61955309, + "learning_rate": 3.5288014417106374e-06, + "loss": 0.64038414, + "num_input_tokens_seen": 43798055, + "step": 2049, + "time_per_iteration": 3.126598358154297 + }, + { + "auxiliary_loss_clip": 0.01160515, + "auxiliary_loss_mlp": 0.01032103, + "balance_loss_clip": 1.05361915, + "balance_loss_mlp": 1.02307296, + "epoch": 0.24649792581013646, + "flos": 34383999922560.0, + "grad_norm": 1.7330107754961421, + "language_loss": 0.75442755, + "learning_rate": 3.528299090454593e-06, + "loss": 0.77635372, + "num_input_tokens_seen": 43818590, + "step": 2050, + "time_per_iteration": 3.396742820739746 + }, + { + "auxiliary_loss_clip": 0.01194728, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.05538726, + "balance_loss_mlp": 1.02256894, + "epoch": 0.24661816870077558, + "flos": 19680448331520.0, + "grad_norm": 8.805136804209978, + "language_loss": 0.83018601, + "learning_rate": 3.527796507356792e-06, + "loss": 0.85245359, + "num_input_tokens_seen": 43832480, + "step": 2051, + "time_per_iteration": 2.4652762413024902 + }, + { + "auxiliary_loss_clip": 0.01196097, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.0563848, + "balance_loss_mlp": 1.0233978, + "epoch": 0.24673841159141466, + "flos": 20002279213440.0, + "grad_norm": 2.9959908324371813, + "language_loss": 0.90053374, + "learning_rate": 3.527293692493475e-06, + "loss": 0.92282087, + "num_input_tokens_seen": 43848345, + "step": 2052, + "time_per_iteration": 2.4673361778259277 + }, + { + "auxiliary_loss_clip": 0.0119501, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.05618596, + "balance_loss_mlp": 1.02550387, + "epoch": 0.24685865448205374, + "flos": 21646593037440.0, + "grad_norm": 2.149996649367818, + "language_loss": 0.73284245, + "learning_rate": 3.52679064594092e-06, + "loss": 0.75514495, + "num_input_tokens_seen": 43865685, + "step": 2053, + "time_per_iteration": 2.487342596054077 + }, + { + "auxiliary_loss_clip": 0.01133938, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.04168427, + "balance_loss_mlp": 1.02466202, + "epoch": 0.24697889737269285, + "flos": 17960470508160.0, + "grad_norm": 2.1829005464186153, + "language_loss": 0.75161213, + "learning_rate": 3.5262873677754375e-06, + "loss": 0.77328336, + "num_input_tokens_seen": 43883690, + "step": 2054, + "time_per_iteration": 2.5644149780273438 + }, + { + "auxiliary_loss_clip": 0.01201984, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.05609143, + "balance_loss_mlp": 1.02623689, + "epoch": 0.24709914026333193, + "flos": 27344611221120.0, + "grad_norm": 1.660355249261278, + "language_loss": 0.80397558, + "learning_rate": 3.5257838580733745e-06, + "loss": 0.82634962, + "num_input_tokens_seen": 43903295, + "step": 2055, + "time_per_iteration": 2.5081183910369873 + }, + { + "auxiliary_loss_clip": 0.01195319, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.05706358, + "balance_loss_mlp": 1.01951301, + "epoch": 0.24721938315397102, + "flos": 19275519335040.0, + "grad_norm": 1.8078848599888697, + "language_loss": 0.87491238, + "learning_rate": 3.5252801169111138e-06, + "loss": 0.89715105, + "num_input_tokens_seen": 43920960, + "step": 2056, + "time_per_iteration": 2.460277557373047 + }, + { + "auxiliary_loss_clip": 0.01173205, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.05640888, + "balance_loss_mlp": 1.02465439, + "epoch": 0.2473396260446101, + "flos": 23185796688000.0, + "grad_norm": 1.8449399507553064, + "language_loss": 0.8000375, + "learning_rate": 3.524776144365072e-06, + "loss": 0.82210088, + "num_input_tokens_seen": 43939415, + "step": 2057, + "time_per_iteration": 2.5214335918426514 + }, + { + "auxiliary_loss_clip": 0.01168678, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.05493784, + "balance_loss_mlp": 1.02488399, + "epoch": 0.2474598689352492, + "flos": 21142443697920.0, + "grad_norm": 1.616237507720843, + "language_loss": 0.79173183, + "learning_rate": 3.5242719405117016e-06, + "loss": 0.81375939, + "num_input_tokens_seen": 43959220, + "step": 2058, + "time_per_iteration": 2.5438663959503174 + }, + { + "auxiliary_loss_clip": 0.01182698, + "auxiliary_loss_mlp": 0.00763505, + "balance_loss_clip": 1.05782032, + "balance_loss_mlp": 1.00105941, + "epoch": 0.2475801118258883, + "flos": 21648352803840.0, + "grad_norm": 5.311895719757909, + "language_loss": 0.75013095, + "learning_rate": 3.5237675054274893e-06, + "loss": 0.769593, + "num_input_tokens_seen": 43978420, + "step": 2059, + "time_per_iteration": 2.524357318878174 + }, + { + "auxiliary_loss_clip": 0.01192604, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.05583048, + "balance_loss_mlp": 1.02358377, + "epoch": 0.24770035471652738, + "flos": 22674500542080.0, + "grad_norm": 2.0627728804080903, + "language_loss": 0.80600667, + "learning_rate": 3.5232628391889584e-06, + "loss": 0.82826412, + "num_input_tokens_seen": 43996710, + "step": 2060, + "time_per_iteration": 2.4958560466766357 + }, + { + "auxiliary_loss_clip": 0.01144994, + "auxiliary_loss_mlp": 0.01026753, + "balance_loss_clip": 1.05303407, + "balance_loss_mlp": 1.01805687, + "epoch": 0.2478205976071665, + "flos": 22163814927360.0, + "grad_norm": 2.146722644947726, + "language_loss": 0.64071292, + "learning_rate": 3.522757941872666e-06, + "loss": 0.66243041, + "num_input_tokens_seen": 44014865, + "step": 2061, + "time_per_iteration": 2.5879876613616943 + }, + { + "auxiliary_loss_clip": 0.01209908, + "auxiliary_loss_mlp": 0.00763771, + "balance_loss_clip": 1.06246006, + "balance_loss_mlp": 1.00116074, + "epoch": 0.24794084049780557, + "flos": 24973106555520.0, + "grad_norm": 1.6082350478658027, + "language_loss": 0.82893956, + "learning_rate": 3.5222528135552042e-06, + "loss": 0.84867644, + "num_input_tokens_seen": 44036325, + "step": 2062, + "time_per_iteration": 2.4987692832946777 + }, + { + "auxiliary_loss_clip": 0.01191118, + "auxiliary_loss_mlp": 0.01037645, + "balance_loss_clip": 1.05895662, + "balance_loss_mlp": 1.02840042, + "epoch": 0.24806108338844465, + "flos": 18296379521280.0, + "grad_norm": 1.915266214543935, + "language_loss": 0.8036167, + "learning_rate": 3.521747454313201e-06, + "loss": 0.82590431, + "num_input_tokens_seen": 44055005, + "step": 2063, + "time_per_iteration": 2.4969632625579834 + }, + { + "auxiliary_loss_clip": 0.01153053, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.04800653, + "balance_loss_mlp": 1.01987684, + "epoch": 0.24818132627908374, + "flos": 19282163351040.0, + "grad_norm": 2.2549538163321867, + "language_loss": 0.66801786, + "learning_rate": 3.521241864223319e-06, + "loss": 0.68983835, + "num_input_tokens_seen": 44073965, + "step": 2064, + "time_per_iteration": 2.5580391883850098 + }, + { + "auxiliary_loss_clip": 0.0108701, + "auxiliary_loss_mlp": 0.01004368, + "balance_loss_clip": 1.02164376, + "balance_loss_mlp": 1.00185227, + "epoch": 0.24830156916972285, + "flos": 70285837881600.0, + "grad_norm": 0.7856560022803232, + "language_loss": 0.61991769, + "learning_rate": 3.5207360433622552e-06, + "loss": 0.64083153, + "num_input_tokens_seen": 44135965, + "step": 2065, + "time_per_iteration": 3.0908031463623047 + }, + { + "auxiliary_loss_clip": 0.01174263, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.05703032, + "balance_loss_mlp": 1.02590966, + "epoch": 0.24842181206036193, + "flos": 40409128287360.0, + "grad_norm": 1.8169945278643342, + "language_loss": 0.74467289, + "learning_rate": 3.5202299918067437e-06, + "loss": 0.7667613, + "num_input_tokens_seen": 44159560, + "step": 2066, + "time_per_iteration": 2.7291669845581055 + }, + { + "auxiliary_loss_clip": 0.01190064, + "auxiliary_loss_mlp": 0.0103044, + "balance_loss_clip": 1.05711544, + "balance_loss_mlp": 1.02188683, + "epoch": 0.248542054951001, + "flos": 20082432412800.0, + "grad_norm": 2.236723246678235, + "language_loss": 0.69097441, + "learning_rate": 3.519723709633551e-06, + "loss": 0.71317947, + "num_input_tokens_seen": 44178320, + "step": 2067, + "time_per_iteration": 2.5875680446624756 + }, + { + "auxiliary_loss_clip": 0.0117142, + "auxiliary_loss_mlp": 0.01028515, + "balance_loss_clip": 1.0551393, + "balance_loss_mlp": 1.01873422, + "epoch": 0.24866229784164012, + "flos": 23513948363520.0, + "grad_norm": 1.7741280595413187, + "language_loss": 0.83185148, + "learning_rate": 3.519217196919479e-06, + "loss": 0.85385084, + "num_input_tokens_seen": 44197305, + "step": 2068, + "time_per_iteration": 2.6043484210968018 + }, + { + "auxiliary_loss_clip": 0.01180187, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.05806375, + "balance_loss_mlp": 1.02360594, + "epoch": 0.2487825407322792, + "flos": 19865101173120.0, + "grad_norm": 1.7766396059247274, + "language_loss": 0.7284705, + "learning_rate": 3.518710453741367e-06, + "loss": 0.75059676, + "num_input_tokens_seen": 44216505, + "step": 2069, + "time_per_iteration": 3.298344612121582 + }, + { + "auxiliary_loss_clip": 0.01168407, + "auxiliary_loss_mlp": 0.00763392, + "balance_loss_clip": 1.05256641, + "balance_loss_mlp": 1.00109017, + "epoch": 0.2489027836229183, + "flos": 22017622573440.0, + "grad_norm": 1.998327049418165, + "language_loss": 0.67599571, + "learning_rate": 3.518203480176086e-06, + "loss": 0.69531369, + "num_input_tokens_seen": 44235435, + "step": 2070, + "time_per_iteration": 2.534142017364502 + }, + { + "auxiliary_loss_clip": 0.01113859, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.04355216, + "balance_loss_mlp": 1.02819753, + "epoch": 0.2490230265135574, + "flos": 23294354567040.0, + "grad_norm": 1.7376392831414793, + "language_loss": 0.80537808, + "learning_rate": 3.517696276300545e-06, + "loss": 0.8268832, + "num_input_tokens_seen": 44256975, + "step": 2071, + "time_per_iteration": 2.6662168502807617 + }, + { + "auxiliary_loss_clip": 0.01192918, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.05989075, + "balance_loss_mlp": 1.02466857, + "epoch": 0.24914326940419648, + "flos": 19826784339840.0, + "grad_norm": 2.3700905438595647, + "language_loss": 0.69508755, + "learning_rate": 3.517188842191685e-06, + "loss": 0.71736103, + "num_input_tokens_seen": 44275125, + "step": 2072, + "time_per_iteration": 2.4604008197784424 + }, + { + "auxiliary_loss_clip": 0.01188488, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.05498528, + "balance_loss_mlp": 1.0224427, + "epoch": 0.24926351229483557, + "flos": 20229271211520.0, + "grad_norm": 1.4894033678442384, + "language_loss": 0.73733652, + "learning_rate": 3.5166811779264837e-06, + "loss": 0.75953895, + "num_input_tokens_seen": 44295445, + "step": 2073, + "time_per_iteration": 2.48484206199646 + }, + { + "auxiliary_loss_clip": 0.01204376, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.0565058, + "balance_loss_mlp": 1.02105474, + "epoch": 0.24938375518547465, + "flos": 23294570048640.0, + "grad_norm": 1.8053900329989445, + "language_loss": 0.77771568, + "learning_rate": 3.5161732835819545e-06, + "loss": 0.80006474, + "num_input_tokens_seen": 44314755, + "step": 2074, + "time_per_iteration": 3.270387887954712 + }, + { + "auxiliary_loss_clip": 0.01206729, + "auxiliary_loss_mlp": 0.01029703, + "balance_loss_clip": 1.05930114, + "balance_loss_mlp": 1.02090549, + "epoch": 0.24950399807611376, + "flos": 17311673099520.0, + "grad_norm": 2.474127699271511, + "language_loss": 0.83651549, + "learning_rate": 3.515665159235143e-06, + "loss": 0.8588798, + "num_input_tokens_seen": 44333640, + "step": 2075, + "time_per_iteration": 3.215776205062866 + }, + { + "auxiliary_loss_clip": 0.01170189, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.04938412, + "balance_loss_mlp": 1.01895654, + "epoch": 0.24962424096675284, + "flos": 19024863252480.0, + "grad_norm": 1.5710500551948936, + "language_loss": 0.74759042, + "learning_rate": 3.5151568049631318e-06, + "loss": 0.76956177, + "num_input_tokens_seen": 44352355, + "step": 2076, + "time_per_iteration": 3.2068395614624023 + }, + { + "auxiliary_loss_clip": 0.01205377, + "auxiliary_loss_mlp": 0.01026628, + "balance_loss_clip": 1.05700397, + "balance_loss_mlp": 1.01743734, + "epoch": 0.24974448385739192, + "flos": 33398790710400.0, + "grad_norm": 1.6736870215816135, + "language_loss": 0.79998136, + "learning_rate": 3.5146482208430385e-06, + "loss": 0.82230139, + "num_input_tokens_seen": 44374185, + "step": 2077, + "time_per_iteration": 2.5827832221984863 + }, + { + "auxiliary_loss_clip": 0.01122244, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.04523683, + "balance_loss_mlp": 1.02416503, + "epoch": 0.24986472674803104, + "flos": 30007279532160.0, + "grad_norm": 24.391860434077277, + "language_loss": 0.67555141, + "learning_rate": 3.514139406952014e-06, + "loss": 0.69711632, + "num_input_tokens_seen": 44396210, + "step": 2078, + "time_per_iteration": 2.671522617340088 + }, + { + "auxiliary_loss_clip": 0.01189219, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.05614924, + "balance_loss_mlp": 1.02171302, + "epoch": 0.24998496963867012, + "flos": 26613074833920.0, + "grad_norm": 2.281510070401994, + "language_loss": 0.83211589, + "learning_rate": 3.5136303633672454e-06, + "loss": 0.85431242, + "num_input_tokens_seen": 44416340, + "step": 2079, + "time_per_iteration": 2.5306057929992676 + }, + { + "auxiliary_loss_clip": 0.01169668, + "auxiliary_loss_mlp": 0.00763869, + "balance_loss_clip": 1.05464363, + "balance_loss_mlp": 1.00106883, + "epoch": 0.25010521252930923, + "flos": 23553989049600.0, + "grad_norm": 1.6886074564356475, + "language_loss": 0.74556375, + "learning_rate": 3.5131210901659544e-06, + "loss": 0.76489913, + "num_input_tokens_seen": 44438095, + "step": 2080, + "time_per_iteration": 2.6026670932769775 + }, + { + "auxiliary_loss_clip": 0.01153896, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.04944324, + "balance_loss_mlp": 1.02147913, + "epoch": 0.2502254554199483, + "flos": 23441193365760.0, + "grad_norm": 12.802382808609007, + "language_loss": 0.82329786, + "learning_rate": 3.5126115874253967e-06, + "loss": 0.84514415, + "num_input_tokens_seen": 44457650, + "step": 2081, + "time_per_iteration": 2.5738346576690674 + }, + { + "auxiliary_loss_clip": 0.01161796, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.05499148, + "balance_loss_mlp": 1.02350748, + "epoch": 0.2503456983105874, + "flos": 28761681651840.0, + "grad_norm": 1.894376235690036, + "language_loss": 0.80865395, + "learning_rate": 3.5121018552228644e-06, + "loss": 0.83060217, + "num_input_tokens_seen": 44476155, + "step": 2082, + "time_per_iteration": 2.6289427280426025 + }, + { + "auxiliary_loss_clip": 0.01162078, + "auxiliary_loss_mlp": 0.01027735, + "balance_loss_clip": 1.05227304, + "balance_loss_mlp": 1.01853228, + "epoch": 0.2504659412012265, + "flos": 18770256673920.0, + "grad_norm": 1.9389337571278342, + "language_loss": 0.76103127, + "learning_rate": 3.5115918936356827e-06, + "loss": 0.78292942, + "num_input_tokens_seen": 44492910, + "step": 2083, + "time_per_iteration": 2.582308053970337 + }, + { + "auxiliary_loss_clip": 0.01144346, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.05166674, + "balance_loss_mlp": 1.02355194, + "epoch": 0.25058618409186556, + "flos": 16873383346560.0, + "grad_norm": 1.845085670747394, + "language_loss": 0.79015636, + "learning_rate": 3.5110817027412123e-06, + "loss": 0.81192386, + "num_input_tokens_seen": 44512000, + "step": 2084, + "time_per_iteration": 2.5316543579101562 + }, + { + "auxiliary_loss_clip": 0.0115253, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.04733062, + "balance_loss_mlp": 1.02103329, + "epoch": 0.25070642698250467, + "flos": 24425540651520.0, + "grad_norm": 2.3508980616485218, + "language_loss": 0.6863274, + "learning_rate": 3.5105712826168493e-06, + "loss": 0.70814955, + "num_input_tokens_seen": 44531650, + "step": 2085, + "time_per_iteration": 2.566713571548462 + }, + { + "auxiliary_loss_clip": 0.0118793, + "auxiliary_loss_mlp": 0.00762514, + "balance_loss_clip": 1.05338681, + "balance_loss_mlp": 1.00101376, + "epoch": 0.2508266698731437, + "flos": 20260944028800.0, + "grad_norm": 2.521183852098437, + "language_loss": 0.70570481, + "learning_rate": 3.5100606333400235e-06, + "loss": 0.72520924, + "num_input_tokens_seen": 44548785, + "step": 2086, + "time_per_iteration": 2.490622043609619 + }, + { + "auxiliary_loss_clip": 0.01188485, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.05594409, + "balance_loss_mlp": 1.02366984, + "epoch": 0.25094691276378284, + "flos": 19245318975360.0, + "grad_norm": 2.505118043987518, + "language_loss": 0.76825035, + "learning_rate": 3.5095497549882006e-06, + "loss": 0.7904762, + "num_input_tokens_seen": 44567230, + "step": 2087, + "time_per_iteration": 2.5021004676818848 + }, + { + "auxiliary_loss_clip": 0.01194059, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.05876863, + "balance_loss_mlp": 1.02075553, + "epoch": 0.25106715565442195, + "flos": 26943237671040.0, + "grad_norm": 1.7856875865388606, + "language_loss": 0.72501385, + "learning_rate": 3.50903864763888e-06, + "loss": 0.74725866, + "num_input_tokens_seen": 44588020, + "step": 2088, + "time_per_iteration": 2.5332508087158203 + }, + { + "auxiliary_loss_clip": 0.01195339, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.05631685, + "balance_loss_mlp": 1.02234435, + "epoch": 0.251187398545061, + "flos": 48359570572800.0, + "grad_norm": 2.1494490294723367, + "language_loss": 0.76489413, + "learning_rate": 3.5085273113695965e-06, + "loss": 0.78716433, + "num_input_tokens_seen": 44612590, + "step": 2089, + "time_per_iteration": 2.7272369861602783 + }, + { + "auxiliary_loss_clip": 0.01204864, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.05696774, + "balance_loss_mlp": 1.02444386, + "epoch": 0.2513076414357001, + "flos": 27016100409600.0, + "grad_norm": 2.2022082981706306, + "language_loss": 0.78651774, + "learning_rate": 3.508015746257919e-06, + "loss": 0.80890596, + "num_input_tokens_seen": 44631630, + "step": 2090, + "time_per_iteration": 2.4948346614837646 + }, + { + "auxiliary_loss_clip": 0.01161877, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.05158484, + "balance_loss_mlp": 1.02409422, + "epoch": 0.2514278843263392, + "flos": 19463619882240.0, + "grad_norm": 1.8523799501244922, + "language_loss": 0.83220065, + "learning_rate": 3.5075039523814518e-06, + "loss": 0.85415721, + "num_input_tokens_seen": 44650820, + "step": 2091, + "time_per_iteration": 2.5273654460906982 + }, + { + "auxiliary_loss_clip": 0.01195118, + "auxiliary_loss_mlp": 0.01028543, + "balance_loss_clip": 1.05422246, + "balance_loss_mlp": 1.01882124, + "epoch": 0.2515481272169783, + "flos": 16866092885760.0, + "grad_norm": 2.296499500178802, + "language_loss": 0.81582999, + "learning_rate": 3.506991929817834e-06, + "loss": 0.83806658, + "num_input_tokens_seen": 44667540, + "step": 2092, + "time_per_iteration": 2.450315237045288 + }, + { + "auxiliary_loss_clip": 0.01202231, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.05795836, + "balance_loss_mlp": 1.0233345, + "epoch": 0.2516683701076174, + "flos": 23732464752000.0, + "grad_norm": 1.7012162823085548, + "language_loss": 0.82773089, + "learning_rate": 3.506479678644738e-06, + "loss": 0.85007179, + "num_input_tokens_seen": 44687935, + "step": 2093, + "time_per_iteration": 2.461591958999634 + }, + { + "auxiliary_loss_clip": 0.01137346, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.04909301, + "balance_loss_mlp": 1.01985812, + "epoch": 0.2517886129982565, + "flos": 27635954434560.0, + "grad_norm": 2.4138318168297213, + "language_loss": 0.73893142, + "learning_rate": 3.505967198939873e-06, + "loss": 0.76059222, + "num_input_tokens_seen": 44704975, + "step": 2094, + "time_per_iteration": 2.6150593757629395 + }, + { + "auxiliary_loss_clip": 0.01169933, + "auxiliary_loss_mlp": 0.01026855, + "balance_loss_clip": 1.04951978, + "balance_loss_mlp": 1.01758683, + "epoch": 0.25190885588889556, + "flos": 38104596529920.0, + "grad_norm": 2.0859092826260066, + "language_loss": 0.78316802, + "learning_rate": 3.5054544907809813e-06, + "loss": 0.80513585, + "num_input_tokens_seen": 44725475, + "step": 2095, + "time_per_iteration": 3.389961004257202 + }, + { + "auxiliary_loss_clip": 0.01173337, + "auxiliary_loss_mlp": 0.00763921, + "balance_loss_clip": 1.05584919, + "balance_loss_mlp": 1.00104451, + "epoch": 0.25202909877953467, + "flos": 22269894768000.0, + "grad_norm": 2.5006869602819957, + "language_loss": 0.80470091, + "learning_rate": 3.50494155424584e-06, + "loss": 0.82407349, + "num_input_tokens_seen": 44744380, + "step": 2096, + "time_per_iteration": 2.5241944789886475 + }, + { + "auxiliary_loss_clip": 0.01192477, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.05513811, + "balance_loss_mlp": 1.02291703, + "epoch": 0.2521493416701738, + "flos": 21761759018880.0, + "grad_norm": 1.5468160053170006, + "language_loss": 0.83209056, + "learning_rate": 3.504428389412262e-06, + "loss": 0.85433912, + "num_input_tokens_seen": 44765190, + "step": 2097, + "time_per_iteration": 2.4868903160095215 + }, + { + "auxiliary_loss_clip": 0.01185417, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.05216622, + "balance_loss_mlp": 1.02203846, + "epoch": 0.25226958456081283, + "flos": 27746738956800.0, + "grad_norm": 2.368737394508918, + "language_loss": 0.72954226, + "learning_rate": 3.5039149963580927e-06, + "loss": 0.75170553, + "num_input_tokens_seen": 44785210, + "step": 2098, + "time_per_iteration": 2.5266342163085938 + }, + { + "auxiliary_loss_clip": 0.01170141, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.05530322, + "balance_loss_mlp": 1.02353656, + "epoch": 0.25238982745145194, + "flos": 30732171903360.0, + "grad_norm": 2.2903867713968027, + "language_loss": 0.702914, + "learning_rate": 3.503401375161215e-06, + "loss": 0.72493935, + "num_input_tokens_seen": 44804955, + "step": 2099, + "time_per_iteration": 2.5666234493255615 + }, + { + "auxiliary_loss_clip": 0.01198194, + "auxiliary_loss_mlp": 0.01026853, + "balance_loss_clip": 1.05336785, + "balance_loss_mlp": 1.01854396, + "epoch": 0.252510070342091, + "flos": 20266331068800.0, + "grad_norm": 1.887993159046584, + "language_loss": 0.83356613, + "learning_rate": 3.502887525899544e-06, + "loss": 0.8558166, + "num_input_tokens_seen": 44823935, + "step": 2100, + "time_per_iteration": 3.2118451595306396 + }, + { + "auxiliary_loss_clip": 0.01173081, + "auxiliary_loss_mlp": 0.01024495, + "balance_loss_clip": 1.05175114, + "balance_loss_mlp": 1.01516736, + "epoch": 0.2526303132327301, + "flos": 22747399194240.0, + "grad_norm": 1.9457786861494408, + "language_loss": 0.82854009, + "learning_rate": 3.50237344865103e-06, + "loss": 0.85051584, + "num_input_tokens_seen": 44844935, + "step": 2101, + "time_per_iteration": 2.570845127105713 + }, + { + "auxiliary_loss_clip": 0.01203381, + "auxiliary_loss_mlp": 0.01036664, + "balance_loss_clip": 1.05619383, + "balance_loss_mlp": 1.02799177, + "epoch": 0.2527505561233692, + "flos": 30263466309120.0, + "grad_norm": 2.3177390050641224, + "language_loss": 0.75918275, + "learning_rate": 3.501859143493658e-06, + "loss": 0.78158319, + "num_input_tokens_seen": 44865565, + "step": 2102, + "time_per_iteration": 3.3352866172790527 + }, + { + "auxiliary_loss_clip": 0.0110148, + "auxiliary_loss_mlp": 0.01007108, + "balance_loss_clip": 1.01767004, + "balance_loss_mlp": 1.0046525, + "epoch": 0.2528707990140083, + "flos": 58492917164160.0, + "grad_norm": 0.921659912550604, + "language_loss": 0.60550296, + "learning_rate": 3.5013446105054488e-06, + "loss": 0.62658882, + "num_input_tokens_seen": 44918485, + "step": 2103, + "time_per_iteration": 3.530062437057495 + }, + { + "auxiliary_loss_clip": 0.01143109, + "auxiliary_loss_mlp": 0.01032228, + "balance_loss_clip": 1.04810238, + "balance_loss_mlp": 1.02281046, + "epoch": 0.2529910419046474, + "flos": 24645134448000.0, + "grad_norm": 2.0905933656515687, + "language_loss": 0.74533308, + "learning_rate": 3.5008298497644555e-06, + "loss": 0.76708651, + "num_input_tokens_seen": 44937530, + "step": 2104, + "time_per_iteration": 2.5955331325531006 + }, + { + "auxiliary_loss_clip": 0.01161526, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.05237865, + "balance_loss_mlp": 1.02185822, + "epoch": 0.2531112847952865, + "flos": 23842135952640.0, + "grad_norm": 1.625787365777546, + "language_loss": 0.87803912, + "learning_rate": 3.500314861348767e-06, + "loss": 0.89996946, + "num_input_tokens_seen": 44958165, + "step": 2105, + "time_per_iteration": 2.6102147102355957 + }, + { + "auxiliary_loss_clip": 0.01150702, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.05122256, + "balance_loss_mlp": 1.022892, + "epoch": 0.25323152768592555, + "flos": 16143822207360.0, + "grad_norm": 1.917309762653713, + "language_loss": 0.7711193, + "learning_rate": 3.499799645336507e-06, + "loss": 0.79294378, + "num_input_tokens_seen": 44975060, + "step": 2106, + "time_per_iteration": 2.522082567214966 + }, + { + "auxiliary_loss_clip": 0.01191094, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.05758011, + "balance_loss_mlp": 1.01914012, + "epoch": 0.25335177057656466, + "flos": 28405161210240.0, + "grad_norm": 1.3905052866961007, + "language_loss": 0.86792123, + "learning_rate": 3.4992842018058336e-06, + "loss": 0.89010561, + "num_input_tokens_seen": 44997960, + "step": 2107, + "time_per_iteration": 2.561736822128296 + }, + { + "auxiliary_loss_clip": 0.011631, + "auxiliary_loss_mlp": 0.01029042, + "balance_loss_clip": 1.05104733, + "balance_loss_mlp": 1.02026212, + "epoch": 0.25347201346720377, + "flos": 18799666934400.0, + "grad_norm": 2.0756051777631086, + "language_loss": 0.88237548, + "learning_rate": 3.4987685308349384e-06, + "loss": 0.90429688, + "num_input_tokens_seen": 45015690, + "step": 2108, + "time_per_iteration": 2.5476276874542236 + }, + { + "auxiliary_loss_clip": 0.0115529, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.04628479, + "balance_loss_mlp": 1.02460587, + "epoch": 0.2535922563578428, + "flos": 15815490963840.0, + "grad_norm": 2.3927089185915627, + "language_loss": 0.61436939, + "learning_rate": 3.4982526325020497e-06, + "loss": 0.63625616, + "num_input_tokens_seen": 45032660, + "step": 2109, + "time_per_iteration": 2.5201261043548584 + }, + { + "auxiliary_loss_clip": 0.01177109, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.05469656, + "balance_loss_mlp": 1.02593875, + "epoch": 0.25371249924848194, + "flos": 16318922031360.0, + "grad_norm": 2.030259086439113, + "language_loss": 0.8153547, + "learning_rate": 3.4977365068854273e-06, + "loss": 0.83747959, + "num_input_tokens_seen": 45048280, + "step": 2110, + "time_per_iteration": 2.473249912261963 + }, + { + "auxiliary_loss_clip": 0.01166651, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.05163777, + "balance_loss_mlp": 1.0235244, + "epoch": 0.25383274213912105, + "flos": 21761615364480.0, + "grad_norm": 1.7392537012980398, + "language_loss": 0.73333615, + "learning_rate": 3.4972201540633676e-06, + "loss": 0.75533056, + "num_input_tokens_seen": 45067635, + "step": 2111, + "time_per_iteration": 2.5331084728240967 + }, + { + "auxiliary_loss_clip": 0.01164426, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.05154383, + "balance_loss_mlp": 1.02049839, + "epoch": 0.2539529850297601, + "flos": 21396870708480.0, + "grad_norm": 1.92788972733034, + "language_loss": 0.85111225, + "learning_rate": 3.4967035741142008e-06, + "loss": 0.87306023, + "num_input_tokens_seen": 45086455, + "step": 2112, + "time_per_iteration": 2.510530948638916 + }, + { + "auxiliary_loss_clip": 0.0116522, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.05757272, + "balance_loss_mlp": 1.02461171, + "epoch": 0.2540732279203992, + "flos": 25228467319680.0, + "grad_norm": 1.9004855977842594, + "language_loss": 0.8192845, + "learning_rate": 3.4961867671162917e-06, + "loss": 0.84126812, + "num_input_tokens_seen": 45106385, + "step": 2113, + "time_per_iteration": 2.5713083744049072 + }, + { + "auxiliary_loss_clip": 0.01206565, + "auxiliary_loss_mlp": 0.01030303, + "balance_loss_clip": 1.05733156, + "balance_loss_mlp": 1.0207963, + "epoch": 0.2541934708110383, + "flos": 19427386037760.0, + "grad_norm": 2.5241859440578125, + "language_loss": 0.7741611, + "learning_rate": 3.4956697331480402e-06, + "loss": 0.79652977, + "num_input_tokens_seen": 45124955, + "step": 2114, + "time_per_iteration": 2.4544777870178223 + }, + { + "auxiliary_loss_clip": 0.01166404, + "auxiliary_loss_mlp": 0.01032257, + "balance_loss_clip": 1.05211401, + "balance_loss_mlp": 1.02299428, + "epoch": 0.2543137137016774, + "flos": 23949436855680.0, + "grad_norm": 2.5630143768817146, + "language_loss": 0.79988539, + "learning_rate": 3.495152472287879e-06, + "loss": 0.821872, + "num_input_tokens_seen": 45145665, + "step": 2115, + "time_per_iteration": 2.5874271392822266 + }, + { + "auxiliary_loss_clip": 0.01158812, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.05328512, + "balance_loss_mlp": 1.0199964, + "epoch": 0.2544339565923165, + "flos": 25593283802880.0, + "grad_norm": 1.7692237210423165, + "language_loss": 0.73716426, + "learning_rate": 3.4946349846142766e-06, + "loss": 0.75903702, + "num_input_tokens_seen": 45164805, + "step": 2116, + "time_per_iteration": 2.58834171295166 + }, + { + "auxiliary_loss_clip": 0.01202686, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.05644846, + "balance_loss_mlp": 1.027668, + "epoch": 0.25455419948295555, + "flos": 21689470897920.0, + "grad_norm": 2.159284813082856, + "language_loss": 0.7558589, + "learning_rate": 3.4941172702057353e-06, + "loss": 0.77824843, + "num_input_tokens_seen": 45184865, + "step": 2117, + "time_per_iteration": 2.48641300201416 + }, + { + "auxiliary_loss_clip": 0.01171982, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.05370712, + "balance_loss_mlp": 1.02001441, + "epoch": 0.25467444237359466, + "flos": 26250341339520.0, + "grad_norm": 1.7701298709994755, + "language_loss": 0.80762386, + "learning_rate": 3.4935993291407924e-06, + "loss": 0.82963514, + "num_input_tokens_seen": 45203690, + "step": 2118, + "time_per_iteration": 2.5413289070129395 + }, + { + "auxiliary_loss_clip": 0.01170229, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.05197501, + "balance_loss_mlp": 1.02149272, + "epoch": 0.25479468526423377, + "flos": 26979686997120.0, + "grad_norm": 2.3802236179075242, + "language_loss": 0.71328574, + "learning_rate": 3.4930811614980183e-06, + "loss": 0.73529768, + "num_input_tokens_seen": 45225385, + "step": 2119, + "time_per_iteration": 2.5493664741516113 + }, + { + "auxiliary_loss_clip": 0.01182388, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.05390084, + "balance_loss_mlp": 1.02403474, + "epoch": 0.2549149281548728, + "flos": 23475811098240.0, + "grad_norm": 2.0199367282842604, + "language_loss": 0.7922172, + "learning_rate": 3.4925627673560198e-06, + "loss": 0.81436622, + "num_input_tokens_seen": 45246045, + "step": 2120, + "time_per_iteration": 2.5128061771392822 + }, + { + "auxiliary_loss_clip": 0.01158989, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.05246353, + "balance_loss_mlp": 1.02473652, + "epoch": 0.25503517104551193, + "flos": 25812302981760.0, + "grad_norm": 1.9230073912946524, + "language_loss": 0.88416111, + "learning_rate": 3.4920441467934357e-06, + "loss": 0.90607774, + "num_input_tokens_seen": 45266560, + "step": 2121, + "time_per_iteration": 2.5851340293884277 + }, + { + "auxiliary_loss_clip": 0.01151488, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.05129361, + "balance_loss_mlp": 1.02486539, + "epoch": 0.25515541393615104, + "flos": 26645106787200.0, + "grad_norm": 2.195490402675023, + "language_loss": 0.83231664, + "learning_rate": 3.491525299888941e-06, + "loss": 0.85416889, + "num_input_tokens_seen": 45285405, + "step": 2122, + "time_per_iteration": 2.5899102687835693 + }, + { + "auxiliary_loss_clip": 0.01072679, + "auxiliary_loss_mlp": 0.0075546, + "balance_loss_clip": 1.01948035, + "balance_loss_mlp": 1.000265, + "epoch": 0.2552756568267901, + "flos": 65955945847680.0, + "grad_norm": 0.8770652259580193, + "language_loss": 0.62685966, + "learning_rate": 3.491006226721244e-06, + "loss": 0.64514112, + "num_input_tokens_seen": 45349615, + "step": 2123, + "time_per_iteration": 3.859295129776001 + }, + { + "auxiliary_loss_clip": 0.01178364, + "auxiliary_loss_mlp": 0.00763318, + "balance_loss_clip": 1.05693626, + "balance_loss_mlp": 1.00105333, + "epoch": 0.2553958997174292, + "flos": 17931096161280.0, + "grad_norm": 2.0628697575423196, + "language_loss": 0.77470613, + "learning_rate": 3.4904869273690882e-06, + "loss": 0.79412293, + "num_input_tokens_seen": 45367505, + "step": 2124, + "time_per_iteration": 2.495497226715088 + }, + { + "auxiliary_loss_clip": 0.01191121, + "auxiliary_loss_mlp": 0.01026454, + "balance_loss_clip": 1.05583692, + "balance_loss_mlp": 1.01768041, + "epoch": 0.2555161426080683, + "flos": 23367791923200.0, + "grad_norm": 1.7247341695777525, + "language_loss": 0.88699746, + "learning_rate": 3.489967401911251e-06, + "loss": 0.90917325, + "num_input_tokens_seen": 45386805, + "step": 2125, + "time_per_iteration": 2.5121734142303467 + }, + { + "auxiliary_loss_clip": 0.01210419, + "auxiliary_loss_mlp": 0.0103368, + "balance_loss_clip": 1.06005621, + "balance_loss_mlp": 1.02292109, + "epoch": 0.2556363854987074, + "flos": 40625130723840.0, + "grad_norm": 1.9436663693449536, + "language_loss": 0.69397521, + "learning_rate": 3.4894476504265428e-06, + "loss": 0.71641624, + "num_input_tokens_seen": 45411045, + "step": 2126, + "time_per_iteration": 2.72096848487854 + }, + { + "auxiliary_loss_clip": 0.010864, + "auxiliary_loss_mlp": 0.01003403, + "balance_loss_clip": 1.01794052, + "balance_loss_mlp": 1.0007925, + "epoch": 0.2557566283893465, + "flos": 68019443389440.0, + "grad_norm": 0.7623882409950254, + "language_loss": 0.5443691, + "learning_rate": 3.4889276729938104e-06, + "loss": 0.56526709, + "num_input_tokens_seen": 45469575, + "step": 2127, + "time_per_iteration": 3.7172720432281494 + }, + { + "auxiliary_loss_clip": 0.01169167, + "auxiliary_loss_mlp": 0.01027402, + "balance_loss_clip": 1.05271685, + "balance_loss_mlp": 1.01758492, + "epoch": 0.2558768712799856, + "flos": 22635645004800.0, + "grad_norm": 2.1553164962821385, + "language_loss": 0.8039782, + "learning_rate": 3.488407469691934e-06, + "loss": 0.82594395, + "num_input_tokens_seen": 45490270, + "step": 2128, + "time_per_iteration": 2.576204299926758 + }, + { + "auxiliary_loss_clip": 0.01174485, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.05356598, + "balance_loss_mlp": 1.02067304, + "epoch": 0.25599711417062465, + "flos": 26396354125440.0, + "grad_norm": 2.0318334419234345, + "language_loss": 0.80968535, + "learning_rate": 3.487887040599828e-06, + "loss": 0.83173108, + "num_input_tokens_seen": 45510070, + "step": 2129, + "time_per_iteration": 3.3072869777679443 + }, + { + "auxiliary_loss_clip": 0.01208361, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.06080854, + "balance_loss_mlp": 1.02462125, + "epoch": 0.25611735706126376, + "flos": 22852042490880.0, + "grad_norm": 2.4573005951446105, + "language_loss": 0.7607857, + "learning_rate": 3.4873663857964407e-06, + "loss": 0.78321397, + "num_input_tokens_seen": 45527285, + "step": 2130, + "time_per_iteration": 3.2227623462677 + }, + { + "auxiliary_loss_clip": 0.0114358, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.05134583, + "balance_loss_mlp": 1.02217555, + "epoch": 0.2562375999519028, + "flos": 23367863750400.0, + "grad_norm": 1.794674663180846, + "language_loss": 0.6612438, + "learning_rate": 3.4868455053607556e-06, + "loss": 0.68299663, + "num_input_tokens_seen": 45546900, + "step": 2131, + "time_per_iteration": 2.5921573638916016 + }, + { + "auxiliary_loss_clip": 0.01193877, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.05515027, + "balance_loss_mlp": 1.02475262, + "epoch": 0.2563578428425419, + "flos": 22856962654080.0, + "grad_norm": 1.944124922297124, + "language_loss": 0.71568626, + "learning_rate": 3.486324399371789e-06, + "loss": 0.73796964, + "num_input_tokens_seen": 45566200, + "step": 2132, + "time_per_iteration": 2.4922196865081787 + }, + { + "auxiliary_loss_clip": 0.0115738, + "auxiliary_loss_mlp": 0.01033829, + "balance_loss_clip": 1.05402684, + "balance_loss_mlp": 1.02504873, + "epoch": 0.25647808573318104, + "flos": 21653883498240.0, + "grad_norm": 1.9819089517378328, + "language_loss": 0.78778768, + "learning_rate": 3.485803067908593e-06, + "loss": 0.80969977, + "num_input_tokens_seen": 45585710, + "step": 2133, + "time_per_iteration": 2.551889657974243 + }, + { + "auxiliary_loss_clip": 0.01106926, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.04171038, + "balance_loss_mlp": 1.01760411, + "epoch": 0.2565983286238201, + "flos": 33730569659520.0, + "grad_norm": 2.634108145692624, + "language_loss": 0.79778075, + "learning_rate": 3.485281511050253e-06, + "loss": 0.8191154, + "num_input_tokens_seen": 45607845, + "step": 2134, + "time_per_iteration": 2.723620653152466 + }, + { + "auxiliary_loss_clip": 0.01193201, + "auxiliary_loss_mlp": 0.01033918, + "balance_loss_clip": 1.05552959, + "balance_loss_mlp": 1.02433932, + "epoch": 0.2567185715144592, + "flos": 16216002587520.0, + "grad_norm": 2.2570794717233706, + "language_loss": 0.89735299, + "learning_rate": 3.484759728875889e-06, + "loss": 0.91962421, + "num_input_tokens_seen": 45623210, + "step": 2135, + "time_per_iteration": 2.45483660697937 + }, + { + "auxiliary_loss_clip": 0.01131446, + "auxiliary_loss_mlp": 0.01038552, + "balance_loss_clip": 1.0490222, + "balance_loss_mlp": 1.02992082, + "epoch": 0.2568388144050983, + "flos": 17458475984640.0, + "grad_norm": 3.8938070243407465, + "language_loss": 0.81019562, + "learning_rate": 3.4842377214646543e-06, + "loss": 0.83189565, + "num_input_tokens_seen": 45641505, + "step": 2136, + "time_per_iteration": 2.5678458213806152 + }, + { + "auxiliary_loss_clip": 0.01202163, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.05674243, + "balance_loss_mlp": 1.02314651, + "epoch": 0.25695905729573737, + "flos": 20887442069760.0, + "grad_norm": 1.6604658790826252, + "language_loss": 0.66533345, + "learning_rate": 3.483715488895737e-06, + "loss": 0.68767583, + "num_input_tokens_seen": 45661835, + "step": 2137, + "time_per_iteration": 2.4626834392547607 + }, + { + "auxiliary_loss_clip": 0.01142181, + "auxiliary_loss_mlp": 0.01026987, + "balance_loss_clip": 1.04717219, + "balance_loss_mlp": 1.01777244, + "epoch": 0.2570793001863765, + "flos": 24717278914560.0, + "grad_norm": 1.8344274365224138, + "language_loss": 0.78482902, + "learning_rate": 3.48319303124836e-06, + "loss": 0.8065207, + "num_input_tokens_seen": 45682215, + "step": 2138, + "time_per_iteration": 2.5964195728302 + }, + { + "auxiliary_loss_clip": 0.01174013, + "auxiliary_loss_mlp": 0.01028037, + "balance_loss_clip": 1.05889273, + "balance_loss_mlp": 1.01898885, + "epoch": 0.2571995430770156, + "flos": 26906896085760.0, + "grad_norm": 2.3729620435613246, + "language_loss": 0.66944039, + "learning_rate": 3.4826703486017798e-06, + "loss": 0.69146091, + "num_input_tokens_seen": 45701840, + "step": 2139, + "time_per_iteration": 2.5900418758392334 + }, + { + "auxiliary_loss_clip": 0.01191061, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.05879545, + "balance_loss_mlp": 1.02052879, + "epoch": 0.25731978596765465, + "flos": 19792561656960.0, + "grad_norm": 2.4247302024072517, + "language_loss": 0.76890409, + "learning_rate": 3.4821474410352867e-06, + "loss": 0.79110944, + "num_input_tokens_seen": 45720500, + "step": 2140, + "time_per_iteration": 2.488004207611084 + }, + { + "auxiliary_loss_clip": 0.01082749, + "auxiliary_loss_mlp": 0.01001075, + "balance_loss_clip": 1.04038715, + "balance_loss_mlp": 0.99894094, + "epoch": 0.25744002885829376, + "flos": 70564970471040.0, + "grad_norm": 0.9191272701673664, + "language_loss": 0.62697124, + "learning_rate": 3.481624308628205e-06, + "loss": 0.64780945, + "num_input_tokens_seen": 45781870, + "step": 2141, + "time_per_iteration": 3.2358360290527344 + }, + { + "auxiliary_loss_clip": 0.01174224, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.05396783, + "balance_loss_mlp": 1.02337289, + "epoch": 0.25756027174893287, + "flos": 18038181582720.0, + "grad_norm": 2.970278769193481, + "language_loss": 1.00205362, + "learning_rate": 3.481100951459893e-06, + "loss": 1.02412367, + "num_input_tokens_seen": 45794890, + "step": 2142, + "time_per_iteration": 2.5204684734344482 + }, + { + "auxiliary_loss_clip": 0.01185209, + "auxiliary_loss_mlp": 0.01027551, + "balance_loss_clip": 1.05430675, + "balance_loss_mlp": 1.01856256, + "epoch": 0.2576805146395719, + "flos": 22674069578880.0, + "grad_norm": 1.6469891855405603, + "language_loss": 0.78745556, + "learning_rate": 3.4805773696097453e-06, + "loss": 0.80958319, + "num_input_tokens_seen": 45815780, + "step": 2143, + "time_per_iteration": 2.520623207092285 + }, + { + "auxiliary_loss_clip": 0.01172738, + "auxiliary_loss_mlp": 0.0102886, + "balance_loss_clip": 1.05776381, + "balance_loss_mlp": 1.01972246, + "epoch": 0.25780075753021103, + "flos": 16472225278080.0, + "grad_norm": 1.9411438451505176, + "language_loss": 0.87831646, + "learning_rate": 3.4800535631571874e-06, + "loss": 0.90033245, + "num_input_tokens_seen": 45831310, + "step": 2144, + "time_per_iteration": 2.4799108505249023 + }, + { + "auxiliary_loss_clip": 0.01180654, + "auxiliary_loss_mlp": 0.01038249, + "balance_loss_clip": 1.05542707, + "balance_loss_mlp": 1.02825308, + "epoch": 0.25792100042085014, + "flos": 22820297846400.0, + "grad_norm": 2.046896774358001, + "language_loss": 0.76298684, + "learning_rate": 3.4795295321816804e-06, + "loss": 0.78517586, + "num_input_tokens_seen": 45850135, + "step": 2145, + "time_per_iteration": 2.538241147994995 + }, + { + "auxiliary_loss_clip": 0.01165637, + "auxiliary_loss_mlp": 0.01036551, + "balance_loss_clip": 1.05468774, + "balance_loss_mlp": 1.02724075, + "epoch": 0.2580412433114892, + "flos": 18697286194560.0, + "grad_norm": 1.9800848795841062, + "language_loss": 0.90962821, + "learning_rate": 3.47900527676272e-06, + "loss": 0.93165016, + "num_input_tokens_seen": 45868470, + "step": 2146, + "time_per_iteration": 2.496004104614258 + }, + { + "auxiliary_loss_clip": 0.01208492, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.06136215, + "balance_loss_mlp": 1.02269852, + "epoch": 0.2581614862021283, + "flos": 14283146810880.0, + "grad_norm": 2.086702487798552, + "language_loss": 0.88554198, + "learning_rate": 3.478480796979835e-06, + "loss": 0.90794718, + "num_input_tokens_seen": 45886355, + "step": 2147, + "time_per_iteration": 2.434720277786255 + }, + { + "auxiliary_loss_clip": 0.01173988, + "auxiliary_loss_mlp": 0.01027981, + "balance_loss_clip": 1.05591857, + "balance_loss_mlp": 1.01867032, + "epoch": 0.25828172909276736, + "flos": 29498281856640.0, + "grad_norm": 1.552640635970955, + "language_loss": 0.78249371, + "learning_rate": 3.4779560929125894e-06, + "loss": 0.80451345, + "num_input_tokens_seen": 45907900, + "step": 2148, + "time_per_iteration": 2.597754716873169 + }, + { + "auxiliary_loss_clip": 0.01065175, + "auxiliary_loss_mlp": 0.01009215, + "balance_loss_clip": 1.01912022, + "balance_loss_mlp": 1.00667632, + "epoch": 0.2584019719834065, + "flos": 67114387376640.0, + "grad_norm": 0.6815891593892291, + "language_loss": 0.56860423, + "learning_rate": 3.4774311646405783e-06, + "loss": 0.58934808, + "num_input_tokens_seen": 45977805, + "step": 2149, + "time_per_iteration": 3.9923486709594727 + }, + { + "auxiliary_loss_clip": 0.01153783, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.05303276, + "balance_loss_mlp": 1.01968575, + "epoch": 0.2585222148740456, + "flos": 22893555634560.0, + "grad_norm": 1.8853672249802291, + "language_loss": 0.83582449, + "learning_rate": 3.476906012243435e-06, + "loss": 0.85765111, + "num_input_tokens_seen": 45996715, + "step": 2150, + "time_per_iteration": 2.5583300590515137 + }, + { + "auxiliary_loss_clip": 0.01180545, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.05625606, + "balance_loss_mlp": 1.01795554, + "epoch": 0.25864245776468464, + "flos": 28909202808960.0, + "grad_norm": 1.511076469453165, + "language_loss": 0.8103593, + "learning_rate": 3.476380635800824e-06, + "loss": 0.83244002, + "num_input_tokens_seen": 46017915, + "step": 2151, + "time_per_iteration": 2.5390567779541016 + }, + { + "auxiliary_loss_clip": 0.01174346, + "auxiliary_loss_mlp": 0.01027347, + "balance_loss_clip": 1.05616903, + "balance_loss_mlp": 1.018448, + "epoch": 0.25876270065532375, + "flos": 14793185980800.0, + "grad_norm": 2.600215039857511, + "language_loss": 0.8640222, + "learning_rate": 3.475855035392444e-06, + "loss": 0.88603908, + "num_input_tokens_seen": 46033235, + "step": 2152, + "time_per_iteration": 2.4948666095733643 + }, + { + "auxiliary_loss_clip": 0.01127073, + "auxiliary_loss_mlp": 0.01028314, + "balance_loss_clip": 1.04968023, + "balance_loss_mlp": 1.01910496, + "epoch": 0.25888294354596286, + "flos": 60467821810560.0, + "grad_norm": 1.6634025288863168, + "language_loss": 0.71352565, + "learning_rate": 3.475329211098029e-06, + "loss": 0.73507953, + "num_input_tokens_seen": 46056390, + "step": 2153, + "time_per_iteration": 2.9425621032714844 + }, + { + "auxiliary_loss_clip": 0.01149225, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.05364001, + "balance_loss_mlp": 1.02187717, + "epoch": 0.2590031864366019, + "flos": 27851166771840.0, + "grad_norm": 1.6357315921339164, + "language_loss": 0.82333028, + "learning_rate": 3.4748031629973453e-06, + "loss": 0.84513342, + "num_input_tokens_seen": 46077120, + "step": 2154, + "time_per_iteration": 3.4575836658477783 + }, + { + "auxiliary_loss_clip": 0.0105047, + "auxiliary_loss_mlp": 0.01005885, + "balance_loss_clip": 1.01598382, + "balance_loss_mlp": 1.00310707, + "epoch": 0.25912342932724103, + "flos": 62422444206720.0, + "grad_norm": 0.9155316019613133, + "language_loss": 0.56618762, + "learning_rate": 3.4742768911701944e-06, + "loss": 0.58675122, + "num_input_tokens_seen": 46139815, + "step": 2155, + "time_per_iteration": 3.219165802001953 + }, + { + "auxiliary_loss_clip": 0.01196505, + "auxiliary_loss_mlp": 0.01038668, + "balance_loss_clip": 1.05883133, + "balance_loss_mlp": 1.02793944, + "epoch": 0.25924367221788014, + "flos": 12378839368320.0, + "grad_norm": 2.761802512396883, + "language_loss": 0.69847536, + "learning_rate": 3.4737503956964113e-06, + "loss": 0.7208271, + "num_input_tokens_seen": 46152120, + "step": 2156, + "time_per_iteration": 3.961956024169922 + }, + { + "auxiliary_loss_clip": 0.01169004, + "auxiliary_loss_mlp": 0.01040266, + "balance_loss_clip": 1.05336964, + "balance_loss_mlp": 1.0294416, + "epoch": 0.2593639151085192, + "flos": 14575208296320.0, + "grad_norm": 3.99632305862847, + "language_loss": 0.66945368, + "learning_rate": 3.473223676655865e-06, + "loss": 0.69154638, + "num_input_tokens_seen": 46170120, + "step": 2157, + "time_per_iteration": 2.564803123474121 + }, + { + "auxiliary_loss_clip": 0.01166583, + "auxiliary_loss_mlp": 0.01035743, + "balance_loss_clip": 1.04949403, + "balance_loss_mlp": 1.0252409, + "epoch": 0.2594841579991583, + "flos": 15230937029760.0, + "grad_norm": 1.7268037665950935, + "language_loss": 0.79892325, + "learning_rate": 3.472696734128459e-06, + "loss": 0.82094657, + "num_input_tokens_seen": 46187985, + "step": 2158, + "time_per_iteration": 2.5056703090667725 + }, + { + "auxiliary_loss_clip": 0.01191107, + "auxiliary_loss_mlp": 0.01030637, + "balance_loss_clip": 1.05745745, + "balance_loss_mlp": 1.02120125, + "epoch": 0.2596044008897974, + "flos": 23623583650560.0, + "grad_norm": 1.7326816651202874, + "language_loss": 0.75859487, + "learning_rate": 3.4721695681941286e-06, + "loss": 0.78081238, + "num_input_tokens_seen": 46207025, + "step": 2159, + "time_per_iteration": 2.511003255844116 + }, + { + "auxiliary_loss_clip": 0.01173823, + "auxiliary_loss_mlp": 0.0076421, + "balance_loss_clip": 1.05419731, + "balance_loss_mlp": 1.00102949, + "epoch": 0.25972464378043647, + "flos": 13772281628160.0, + "grad_norm": 1.895377383427217, + "language_loss": 0.82572401, + "learning_rate": 3.471642178932845e-06, + "loss": 0.84510434, + "num_input_tokens_seen": 46225670, + "step": 2160, + "time_per_iteration": 2.500204563140869 + }, + { + "auxiliary_loss_clip": 0.01177199, + "auxiliary_loss_mlp": 0.01027939, + "balance_loss_clip": 1.05440021, + "balance_loss_mlp": 1.01855111, + "epoch": 0.2598448866710756, + "flos": 19573578391680.0, + "grad_norm": 2.0725371981061635, + "language_loss": 0.89428699, + "learning_rate": 3.471114566424613e-06, + "loss": 0.91633838, + "num_input_tokens_seen": 46244130, + "step": 2161, + "time_per_iteration": 2.504873037338257 + }, + { + "auxiliary_loss_clip": 0.01174941, + "auxiliary_loss_mlp": 0.01031028, + "balance_loss_clip": 1.05605197, + "balance_loss_mlp": 1.02069259, + "epoch": 0.25996512956171464, + "flos": 21653237053440.0, + "grad_norm": 2.0793829364174092, + "language_loss": 0.75630188, + "learning_rate": 3.4705867307494715e-06, + "loss": 0.7783615, + "num_input_tokens_seen": 46263200, + "step": 2162, + "time_per_iteration": 2.5136795043945312 + }, + { + "auxiliary_loss_clip": 0.01193925, + "auxiliary_loss_mlp": 0.01031052, + "balance_loss_clip": 1.05673909, + "balance_loss_mlp": 1.02176583, + "epoch": 0.26008537245235375, + "flos": 18223480869120.0, + "grad_norm": 2.0488846872950495, + "language_loss": 0.84577638, + "learning_rate": 3.470058671987492e-06, + "loss": 0.86802614, + "num_input_tokens_seen": 46281465, + "step": 2163, + "time_per_iteration": 2.5224063396453857 + }, + { + "auxiliary_loss_clip": 0.01193518, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.05534148, + "balance_loss_mlp": 1.02899122, + "epoch": 0.26020561534299286, + "flos": 24645385843200.0, + "grad_norm": 2.2072860031485626, + "language_loss": 0.84033632, + "learning_rate": 3.4695303902187805e-06, + "loss": 0.86266363, + "num_input_tokens_seen": 46301020, + "step": 2164, + "time_per_iteration": 2.5219271183013916 + }, + { + "auxiliary_loss_clip": 0.01156089, + "auxiliary_loss_mlp": 0.01038292, + "balance_loss_clip": 1.04995465, + "balance_loss_mlp": 1.02805805, + "epoch": 0.2603258582336319, + "flos": 25773662926080.0, + "grad_norm": 2.087429528600077, + "language_loss": 0.7827996, + "learning_rate": 3.469001885523478e-06, + "loss": 0.80474341, + "num_input_tokens_seen": 46321740, + "step": 2165, + "time_per_iteration": 2.5905425548553467 + }, + { + "auxiliary_loss_clip": 0.01201688, + "auxiliary_loss_mlp": 0.01039362, + "balance_loss_clip": 1.05543303, + "balance_loss_mlp": 1.02952147, + "epoch": 0.260446101124271, + "flos": 28766314506240.0, + "grad_norm": 2.3091625099173303, + "language_loss": 0.80929351, + "learning_rate": 3.4684731579817568e-06, + "loss": 0.83170396, + "num_input_tokens_seen": 46342730, + "step": 2166, + "time_per_iteration": 2.50532865524292 + }, + { + "auxiliary_loss_clip": 0.01127048, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.04990387, + "balance_loss_mlp": 1.02618456, + "epoch": 0.26056634401491013, + "flos": 25666757072640.0, + "grad_norm": 1.6475196472935836, + "language_loss": 0.76369023, + "learning_rate": 3.4679442076738247e-06, + "loss": 0.78531599, + "num_input_tokens_seen": 46362445, + "step": 2167, + "time_per_iteration": 2.6562869548797607 + }, + { + "auxiliary_loss_clip": 0.01207455, + "auxiliary_loss_mlp": 0.01032657, + "balance_loss_clip": 1.05886889, + "balance_loss_mlp": 1.02220261, + "epoch": 0.2606865869055492, + "flos": 27052765217280.0, + "grad_norm": 2.021215652572569, + "language_loss": 0.83549005, + "learning_rate": 3.4674150346799245e-06, + "loss": 0.8578912, + "num_input_tokens_seen": 46382145, + "step": 2168, + "time_per_iteration": 2.5033414363861084 + }, + { + "auxiliary_loss_clip": 0.01172219, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.05346251, + "balance_loss_mlp": 1.02427304, + "epoch": 0.2608068297961883, + "flos": 17712615686400.0, + "grad_norm": 2.156546396890759, + "language_loss": 0.79640681, + "learning_rate": 3.4668856390803295e-06, + "loss": 0.8184675, + "num_input_tokens_seen": 46400025, + "step": 2169, + "time_per_iteration": 2.4903669357299805 + }, + { + "auxiliary_loss_clip": 0.01178426, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.05446303, + "balance_loss_mlp": 1.02337074, + "epoch": 0.2609270726868274, + "flos": 18551632544640.0, + "grad_norm": 3.5386152308414105, + "language_loss": 0.89807487, + "learning_rate": 3.4663560209553495e-06, + "loss": 0.92018867, + "num_input_tokens_seen": 46418090, + "step": 2170, + "time_per_iteration": 2.4745211601257324 + }, + { + "auxiliary_loss_clip": 0.01164864, + "auxiliary_loss_mlp": 0.01034136, + "balance_loss_clip": 1.0516758, + "balance_loss_mlp": 1.02445638, + "epoch": 0.26104731557746647, + "flos": 21835699165440.0, + "grad_norm": 1.9023450193216993, + "language_loss": 0.78966999, + "learning_rate": 3.4658261803853267e-06, + "loss": 0.81165999, + "num_input_tokens_seen": 46436015, + "step": 2171, + "time_per_iteration": 2.5177197456359863 + }, + { + "auxiliary_loss_clip": 0.01169403, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.0544219, + "balance_loss_mlp": 1.02049088, + "epoch": 0.2611675584681056, + "flos": 21689650465920.0, + "grad_norm": 2.6510269437858316, + "language_loss": 0.80750978, + "learning_rate": 3.4652961174506383e-06, + "loss": 0.82950693, + "num_input_tokens_seen": 46455885, + "step": 2172, + "time_per_iteration": 2.5097367763519287 + }, + { + "auxiliary_loss_clip": 0.01084714, + "auxiliary_loss_mlp": 0.01011892, + "balance_loss_clip": 1.01756299, + "balance_loss_mlp": 1.00910234, + "epoch": 0.2612878013587447, + "flos": 71862101389440.0, + "grad_norm": 1.0193643733502542, + "language_loss": 0.58153141, + "learning_rate": 3.464765832231694e-06, + "loss": 0.60249746, + "num_input_tokens_seen": 46510050, + "step": 2173, + "time_per_iteration": 3.0923948287963867 + }, + { + "auxiliary_loss_clip": 0.01192084, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.05852592, + "balance_loss_mlp": 1.02082407, + "epoch": 0.26140804424938374, + "flos": 20227511445120.0, + "grad_norm": 2.3149253550655096, + "language_loss": 0.70618618, + "learning_rate": 3.4642353248089373e-06, + "loss": 0.7284112, + "num_input_tokens_seen": 46528810, + "step": 2174, + "time_per_iteration": 2.48984956741333 + }, + { + "auxiliary_loss_clip": 0.01169046, + "auxiliary_loss_mlp": 0.01030071, + "balance_loss_clip": 1.05276036, + "balance_loss_mlp": 1.01990223, + "epoch": 0.26152828714002285, + "flos": 25557085872000.0, + "grad_norm": 2.000723183186798, + "language_loss": 0.80713701, + "learning_rate": 3.463704595262846e-06, + "loss": 0.82912821, + "num_input_tokens_seen": 46549690, + "step": 2175, + "time_per_iteration": 2.5934367179870605 + }, + { + "auxiliary_loss_clip": 0.01156081, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.0525136, + "balance_loss_mlp": 1.02777624, + "epoch": 0.26164853003066196, + "flos": 25446516831360.0, + "grad_norm": 1.957270693339975, + "language_loss": 0.70680332, + "learning_rate": 3.463173643673931e-06, + "loss": 0.72873759, + "num_input_tokens_seen": 46572215, + "step": 2176, + "time_per_iteration": 3.34092116355896 + }, + { + "auxiliary_loss_clip": 0.01089714, + "auxiliary_loss_mlp": 0.01004585, + "balance_loss_clip": 1.01649427, + "balance_loss_mlp": 1.00189066, + "epoch": 0.261768772921301, + "flos": 53944580568960.0, + "grad_norm": 0.9009487366286343, + "language_loss": 0.63512474, + "learning_rate": 3.4626424701227387e-06, + "loss": 0.65606773, + "num_input_tokens_seen": 46627275, + "step": 2177, + "time_per_iteration": 2.9852020740509033 + }, + { + "auxiliary_loss_clip": 0.01099125, + "auxiliary_loss_mlp": 0.01002059, + "balance_loss_clip": 1.01644278, + "balance_loss_mlp": 0.99941272, + "epoch": 0.26188901581194013, + "flos": 70687606481280.0, + "grad_norm": 0.8507838338671989, + "language_loss": 0.55781054, + "learning_rate": 3.4621110746898452e-06, + "loss": 0.57882243, + "num_input_tokens_seen": 46695135, + "step": 2178, + "time_per_iteration": 3.1157422065734863 + }, + { + "auxiliary_loss_clip": 0.01193719, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.05774999, + "balance_loss_mlp": 1.02217567, + "epoch": 0.2620092587025792, + "flos": 21069580959360.0, + "grad_norm": 1.4873439361725416, + "language_loss": 0.74587929, + "learning_rate": 3.4615794574558654e-06, + "loss": 0.76813078, + "num_input_tokens_seen": 46714145, + "step": 2179, + "time_per_iteration": 2.5133144855499268 + }, + { + "auxiliary_loss_clip": 0.01174367, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.05410087, + "balance_loss_mlp": 1.01858366, + "epoch": 0.2621295015932183, + "flos": 18369601395840.0, + "grad_norm": 2.651718185785411, + "language_loss": 0.83523309, + "learning_rate": 3.4610476185014436e-06, + "loss": 0.85724866, + "num_input_tokens_seen": 46731405, + "step": 2180, + "time_per_iteration": 3.3816909790039062 + }, + { + "auxiliary_loss_clip": 0.01204113, + "auxiliary_loss_mlp": 0.01034241, + "balance_loss_clip": 1.05562973, + "balance_loss_mlp": 1.02438819, + "epoch": 0.2622497444838574, + "flos": 23659997063040.0, + "grad_norm": 1.509056360992545, + "language_loss": 0.7924732, + "learning_rate": 3.4605155579072597e-06, + "loss": 0.81485671, + "num_input_tokens_seen": 46751260, + "step": 2181, + "time_per_iteration": 2.511284589767456 + }, + { + "auxiliary_loss_clip": 0.0113947, + "auxiliary_loss_mlp": 0.01027914, + "balance_loss_clip": 1.05005026, + "balance_loss_mlp": 1.01863933, + "epoch": 0.26236998737449646, + "flos": 22123810154880.0, + "grad_norm": 1.8796463360328792, + "language_loss": 0.71368301, + "learning_rate": 3.459983275754027e-06, + "loss": 0.73535681, + "num_input_tokens_seen": 46770155, + "step": 2182, + "time_per_iteration": 2.566173553466797 + }, + { + "auxiliary_loss_clip": 0.012013, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.05554163, + "balance_loss_mlp": 1.01951289, + "epoch": 0.26249023026513557, + "flos": 17895185539200.0, + "grad_norm": 2.372062124286327, + "language_loss": 0.795398, + "learning_rate": 3.4594507721224918e-06, + "loss": 0.81770003, + "num_input_tokens_seen": 46788805, + "step": 2183, + "time_per_iteration": 3.967097043991089 + }, + { + "auxiliary_loss_clip": 0.01175374, + "auxiliary_loss_mlp": 0.01041499, + "balance_loss_clip": 1.051723, + "balance_loss_mlp": 1.03203917, + "epoch": 0.2626104731557747, + "flos": 18332936588160.0, + "grad_norm": 1.9438006175057327, + "language_loss": 0.81874543, + "learning_rate": 3.4589180470934353e-06, + "loss": 0.84091413, + "num_input_tokens_seen": 46808670, + "step": 2184, + "time_per_iteration": 2.502563953399658 + }, + { + "auxiliary_loss_clip": 0.01194775, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.05380929, + "balance_loss_mlp": 1.02364182, + "epoch": 0.26273071604641374, + "flos": 19317714837120.0, + "grad_norm": 2.3723940443817413, + "language_loss": 0.76666892, + "learning_rate": 3.4583851007476713e-06, + "loss": 0.7889539, + "num_input_tokens_seen": 46827140, + "step": 2185, + "time_per_iteration": 2.4666144847869873 + }, + { + "auxiliary_loss_clip": 0.01162923, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.05326986, + "balance_loss_mlp": 1.02512026, + "epoch": 0.26285095893705285, + "flos": 18327477720960.0, + "grad_norm": 2.0768769104053653, + "language_loss": 0.68215311, + "learning_rate": 3.4578519331660464e-06, + "loss": 0.70413995, + "num_input_tokens_seen": 46844135, + "step": 2186, + "time_per_iteration": 2.509894609451294 + }, + { + "auxiliary_loss_clip": 0.01188431, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.05918777, + "balance_loss_mlp": 1.02554226, + "epoch": 0.26297120182769196, + "flos": 20193827466240.0, + "grad_norm": 1.9117775937352737, + "language_loss": 0.82438302, + "learning_rate": 3.4573185444294426e-06, + "loss": 0.84661174, + "num_input_tokens_seen": 46862500, + "step": 2187, + "time_per_iteration": 2.474818468093872 + }, + { + "auxiliary_loss_clip": 0.01172769, + "auxiliary_loss_mlp": 0.00764158, + "balance_loss_clip": 1.05395246, + "balance_loss_mlp": 1.00117302, + "epoch": 0.263091444718331, + "flos": 22418421505920.0, + "grad_norm": 1.6784443473579422, + "language_loss": 0.78736544, + "learning_rate": 3.456784934618774e-06, + "loss": 0.80673468, + "num_input_tokens_seen": 46883665, + "step": 2188, + "time_per_iteration": 2.5658860206604004 + }, + { + "auxiliary_loss_clip": 0.0117216, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.0523355, + "balance_loss_mlp": 1.02147555, + "epoch": 0.2632116876089701, + "flos": 19024827338880.0, + "grad_norm": 1.9898179887242398, + "language_loss": 0.7982949, + "learning_rate": 3.4562511038149897e-06, + "loss": 0.8203184, + "num_input_tokens_seen": 46899160, + "step": 2189, + "time_per_iteration": 2.484616279602051 + }, + { + "auxiliary_loss_clip": 0.01043269, + "auxiliary_loss_mlp": 0.01010622, + "balance_loss_clip": 1.01337743, + "balance_loss_mlp": 1.0079751, + "epoch": 0.26333193049960923, + "flos": 67308054531840.0, + "grad_norm": 0.8585495073503778, + "language_loss": 0.57825124, + "learning_rate": 3.4557170520990705e-06, + "loss": 0.59879017, + "num_input_tokens_seen": 46959835, + "step": 2190, + "time_per_iteration": 3.191999912261963 + }, + { + "auxiliary_loss_clip": 0.01183314, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.05419266, + "balance_loss_mlp": 1.02329838, + "epoch": 0.2634521733902483, + "flos": 25048806468480.0, + "grad_norm": 1.6137718148324807, + "language_loss": 0.86583769, + "learning_rate": 3.4551827795520324e-06, + "loss": 0.8879962, + "num_input_tokens_seen": 46982720, + "step": 2191, + "time_per_iteration": 2.5362210273742676 + }, + { + "auxiliary_loss_clip": 0.0118983, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.05418825, + "balance_loss_mlp": 1.02272391, + "epoch": 0.2635724162808874, + "flos": 20594985534720.0, + "grad_norm": 2.4627701690007116, + "language_loss": 0.84891498, + "learning_rate": 3.4546482862549226e-06, + "loss": 0.87112916, + "num_input_tokens_seen": 47003035, + "step": 2192, + "time_per_iteration": 2.5503673553466797 + }, + { + "auxiliary_loss_clip": 0.01153429, + "auxiliary_loss_mlp": 0.010393, + "balance_loss_clip": 1.04966354, + "balance_loss_mlp": 1.02913737, + "epoch": 0.2636926591715265, + "flos": 19244636616960.0, + "grad_norm": 2.113200385810639, + "language_loss": 0.78424251, + "learning_rate": 3.4541135722888253e-06, + "loss": 0.80616981, + "num_input_tokens_seen": 47019625, + "step": 2193, + "time_per_iteration": 2.534825325012207 + }, + { + "auxiliary_loss_clip": 0.01200181, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.05460715, + "balance_loss_mlp": 1.02436638, + "epoch": 0.26381290206216557, + "flos": 28804882734720.0, + "grad_norm": 1.7417622829250472, + "language_loss": 0.80277115, + "learning_rate": 3.453578637734854e-06, + "loss": 0.8251161, + "num_input_tokens_seen": 47040815, + "step": 2194, + "time_per_iteration": 2.506690740585327 + }, + { + "auxiliary_loss_clip": 0.01206586, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.06002569, + "balance_loss_mlp": 1.02282357, + "epoch": 0.2639331449528047, + "flos": 25008909436800.0, + "grad_norm": 2.124110737139238, + "language_loss": 0.78589243, + "learning_rate": 3.4530434826741605e-06, + "loss": 0.80828136, + "num_input_tokens_seen": 47061755, + "step": 2195, + "time_per_iteration": 2.477501153945923 + }, + { + "auxiliary_loss_clip": 0.0116866, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.05284405, + "balance_loss_mlp": 1.0206964, + "epoch": 0.26405338784344373, + "flos": 46535775465600.0, + "grad_norm": 1.7624789130128642, + "language_loss": 0.68885505, + "learning_rate": 3.452508107187926e-06, + "loss": 0.71083724, + "num_input_tokens_seen": 47085130, + "step": 2196, + "time_per_iteration": 2.711853504180908 + }, + { + "auxiliary_loss_clip": 0.01128438, + "auxiliary_loss_mlp": 0.01031513, + "balance_loss_clip": 1.04503417, + "balance_loss_mlp": 1.02129078, + "epoch": 0.26417363073408284, + "flos": 21179467641600.0, + "grad_norm": 2.0704652117169404, + "language_loss": 0.77017713, + "learning_rate": 3.451972511357366e-06, + "loss": 0.79177666, + "num_input_tokens_seen": 47104675, + "step": 2197, + "time_per_iteration": 2.613565683364868 + }, + { + "auxiliary_loss_clip": 0.01184976, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.05547154, + "balance_loss_mlp": 1.02033532, + "epoch": 0.26429387362472195, + "flos": 22674751937280.0, + "grad_norm": 1.8053649881975398, + "language_loss": 0.85383511, + "learning_rate": 3.45143669526373e-06, + "loss": 0.87597251, + "num_input_tokens_seen": 47124435, + "step": 2198, + "time_per_iteration": 2.4960427284240723 + }, + { + "auxiliary_loss_clip": 0.01074997, + "auxiliary_loss_mlp": 0.01000853, + "balance_loss_clip": 1.01265836, + "balance_loss_mlp": 0.99854034, + "epoch": 0.264414116515361, + "flos": 67180534272000.0, + "grad_norm": 0.7760007277976674, + "language_loss": 0.63225877, + "learning_rate": 3.450900658988302e-06, + "loss": 0.65301728, + "num_input_tokens_seen": 47185985, + "step": 2199, + "time_per_iteration": 3.0208792686462402 + }, + { + "auxiliary_loss_clip": 0.01164046, + "auxiliary_loss_mlp": 0.01035002, + "balance_loss_clip": 1.05286777, + "balance_loss_mlp": 1.02541804, + "epoch": 0.2645343594060001, + "flos": 25664709997440.0, + "grad_norm": 2.0911592359675817, + "language_loss": 0.77644074, + "learning_rate": 3.450364402612397e-06, + "loss": 0.79843128, + "num_input_tokens_seen": 47203140, + "step": 2200, + "time_per_iteration": 2.5397510528564453 + }, + { + "auxiliary_loss_clip": 0.0116943, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.05281305, + "balance_loss_mlp": 1.02053034, + "epoch": 0.26465460229663923, + "flos": 22491822948480.0, + "grad_norm": 1.8175951547127978, + "language_loss": 0.83739245, + "learning_rate": 3.449827926217366e-06, + "loss": 0.85939097, + "num_input_tokens_seen": 47222575, + "step": 2201, + "time_per_iteration": 2.5314478874206543 + }, + { + "auxiliary_loss_clip": 0.01176666, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.0501864, + "balance_loss_mlp": 1.02631664, + "epoch": 0.2647748451872783, + "flos": 29388036038400.0, + "grad_norm": 3.022004525286648, + "language_loss": 0.8025347, + "learning_rate": 3.449291229884591e-06, + "loss": 0.82465917, + "num_input_tokens_seen": 47243815, + "step": 2202, + "time_per_iteration": 2.590207099914551 + }, + { + "auxiliary_loss_clip": 0.01162712, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.05078983, + "balance_loss_mlp": 1.02038872, + "epoch": 0.2648950880779174, + "flos": 26797799502720.0, + "grad_norm": 1.876144453309393, + "language_loss": 0.86557508, + "learning_rate": 3.4487543136954887e-06, + "loss": 0.88749981, + "num_input_tokens_seen": 47263435, + "step": 2203, + "time_per_iteration": 3.4295365810394287 + }, + { + "auxiliary_loss_clip": 0.01158906, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.05280054, + "balance_loss_mlp": 1.0273186, + "epoch": 0.2650153309685565, + "flos": 28841008838400.0, + "grad_norm": 1.7569116288390871, + "language_loss": 0.91059256, + "learning_rate": 3.448217177731509e-06, + "loss": 0.93254936, + "num_input_tokens_seen": 47283920, + "step": 2204, + "time_per_iteration": 2.666504144668579 + }, + { + "auxiliary_loss_clip": 0.01167977, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.0534606, + "balance_loss_mlp": 1.02271509, + "epoch": 0.26513557385919556, + "flos": 20303247271680.0, + "grad_norm": 2.076702108588571, + "language_loss": 0.77804524, + "learning_rate": 3.4476798220741348e-06, + "loss": 0.80003941, + "num_input_tokens_seen": 47302800, + "step": 2205, + "time_per_iteration": 2.506582260131836 + }, + { + "auxiliary_loss_clip": 0.01204099, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.05876458, + "balance_loss_mlp": 1.02458942, + "epoch": 0.26525581674983467, + "flos": 17676274101120.0, + "grad_norm": 1.5377220142754608, + "language_loss": 0.78563148, + "learning_rate": 3.4471422468048826e-06, + "loss": 0.80800319, + "num_input_tokens_seen": 47321525, + "step": 2206, + "time_per_iteration": 2.4324004650115967 + }, + { + "auxiliary_loss_clip": 0.01181336, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.05627084, + "balance_loss_mlp": 1.02190924, + "epoch": 0.2653760596404738, + "flos": 26833746038400.0, + "grad_norm": 2.4293612628419563, + "language_loss": 0.72851622, + "learning_rate": 3.4466044520053022e-06, + "loss": 0.7506392, + "num_input_tokens_seen": 47340530, + "step": 2207, + "time_per_iteration": 3.343350410461426 + }, + { + "auxiliary_loss_clip": 0.01158914, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.04943585, + "balance_loss_mlp": 1.02736473, + "epoch": 0.26549630253111284, + "flos": 22782160581120.0, + "grad_norm": 2.6193198541305973, + "language_loss": 0.60256457, + "learning_rate": 3.446066437756977e-06, + "loss": 0.62451863, + "num_input_tokens_seen": 47359735, + "step": 2208, + "time_per_iteration": 2.5609195232391357 + }, + { + "auxiliary_loss_clip": 0.01173404, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.05463648, + "balance_loss_mlp": 1.01878965, + "epoch": 0.26561654542175195, + "flos": 23550002640000.0, + "grad_norm": 2.033555887106039, + "language_loss": 0.7517978, + "learning_rate": 3.4455282041415224e-06, + "loss": 0.77381128, + "num_input_tokens_seen": 47378945, + "step": 2209, + "time_per_iteration": 2.5359060764312744 + }, + { + "auxiliary_loss_clip": 0.01160092, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.05170918, + "balance_loss_mlp": 1.01913643, + "epoch": 0.265736788312391, + "flos": 26906680604160.0, + "grad_norm": 3.415288458782137, + "language_loss": 0.87245369, + "learning_rate": 3.4449897512405894e-06, + "loss": 0.89433515, + "num_input_tokens_seen": 47398095, + "step": 2210, + "time_per_iteration": 4.097186088562012 + }, + { + "auxiliary_loss_clip": 0.01124718, + "auxiliary_loss_mlp": 0.00763835, + "balance_loss_clip": 1.04819703, + "balance_loss_mlp": 1.0010736, + "epoch": 0.2658570312030301, + "flos": 23477139901440.0, + "grad_norm": 2.0643176309109084, + "language_loss": 0.74773562, + "learning_rate": 3.444451079135859e-06, + "loss": 0.76662117, + "num_input_tokens_seen": 47417605, + "step": 2211, + "time_per_iteration": 2.653106451034546 + }, + { + "auxiliary_loss_clip": 0.01134759, + "auxiliary_loss_mlp": 0.00764244, + "balance_loss_clip": 1.0465467, + "balance_loss_mlp": 1.00101697, + "epoch": 0.2659772740936692, + "flos": 21866402315520.0, + "grad_norm": 1.8901220590190684, + "language_loss": 0.74185812, + "learning_rate": 3.4439121879090493e-06, + "loss": 0.76084816, + "num_input_tokens_seen": 47435385, + "step": 2212, + "time_per_iteration": 2.583094596862793 + }, + { + "auxiliary_loss_clip": 0.01179155, + "auxiliary_loss_mlp": 0.0103461, + "balance_loss_clip": 1.05463803, + "balance_loss_mlp": 1.02503157, + "epoch": 0.2660975169843083, + "flos": 19793100360960.0, + "grad_norm": 2.094716576850705, + "language_loss": 0.8358289, + "learning_rate": 3.4433730776419082e-06, + "loss": 0.85796654, + "num_input_tokens_seen": 47454310, + "step": 2213, + "time_per_iteration": 2.5191752910614014 + }, + { + "auxiliary_loss_clip": 0.01192885, + "auxiliary_loss_mlp": 0.00764167, + "balance_loss_clip": 1.05421555, + "balance_loss_mlp": 1.00111341, + "epoch": 0.2662177598749474, + "flos": 29018981750400.0, + "grad_norm": 2.6335578401491517, + "language_loss": 0.80358464, + "learning_rate": 3.4428337484162183e-06, + "loss": 0.82315516, + "num_input_tokens_seen": 47475120, + "step": 2214, + "time_per_iteration": 2.564159870147705 + }, + { + "auxiliary_loss_clip": 0.01170177, + "auxiliary_loss_mlp": 0.01032072, + "balance_loss_clip": 1.05160952, + "balance_loss_mlp": 1.02280319, + "epoch": 0.2663380027655865, + "flos": 21762549118080.0, + "grad_norm": 2.0414288396681783, + "language_loss": 0.84393287, + "learning_rate": 3.442294200313797e-06, + "loss": 0.86595541, + "num_input_tokens_seen": 47493150, + "step": 2215, + "time_per_iteration": 2.528904914855957 + }, + { + "auxiliary_loss_clip": 0.01098147, + "auxiliary_loss_mlp": 0.01001948, + "balance_loss_clip": 1.01628137, + "balance_loss_mlp": 0.99949211, + "epoch": 0.26645824565622556, + "flos": 66980333819520.0, + "grad_norm": 0.764134775831612, + "language_loss": 0.52714628, + "learning_rate": 3.4417544334164916e-06, + "loss": 0.54814726, + "num_input_tokens_seen": 47557295, + "step": 2216, + "time_per_iteration": 3.159410238265991 + }, + { + "auxiliary_loss_clip": 0.01153893, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.051566, + "balance_loss_mlp": 1.02252388, + "epoch": 0.26657848854686467, + "flos": 25264198373760.0, + "grad_norm": 1.614354961373347, + "language_loss": 0.7732051, + "learning_rate": 3.4412144478061854e-06, + "loss": 0.79506397, + "num_input_tokens_seen": 47579705, + "step": 2217, + "time_per_iteration": 2.5838255882263184 + }, + { + "auxiliary_loss_clip": 0.01099447, + "auxiliary_loss_mlp": 0.01032661, + "balance_loss_clip": 1.04400384, + "balance_loss_mlp": 1.02233744, + "epoch": 0.2666987314375038, + "flos": 23696769611520.0, + "grad_norm": 1.9028512135368856, + "language_loss": 0.75595045, + "learning_rate": 3.4406742435647925e-06, + "loss": 0.77727151, + "num_input_tokens_seen": 47599770, + "step": 2218, + "time_per_iteration": 2.685718059539795 + }, + { + "auxiliary_loss_clip": 0.01187427, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.0579381, + "balance_loss_mlp": 1.02580237, + "epoch": 0.26681897432814283, + "flos": 27048958375680.0, + "grad_norm": 1.9107099969155412, + "language_loss": 0.78908145, + "learning_rate": 3.440133820774263e-06, + "loss": 0.81130379, + "num_input_tokens_seen": 47619580, + "step": 2219, + "time_per_iteration": 2.5247557163238525 + }, + { + "auxiliary_loss_clip": 0.0117846, + "auxiliary_loss_mlp": 0.01044208, + "balance_loss_clip": 1.05452466, + "balance_loss_mlp": 1.03407562, + "epoch": 0.26693921721878194, + "flos": 28985944216320.0, + "grad_norm": 2.4920134387046584, + "language_loss": 0.81755316, + "learning_rate": 3.439593179516578e-06, + "loss": 0.83977985, + "num_input_tokens_seen": 47639490, + "step": 2220, + "time_per_iteration": 2.581641674041748 + }, + { + "auxiliary_loss_clip": 0.01181368, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.05764306, + "balance_loss_mlp": 1.02239132, + "epoch": 0.26705946010942105, + "flos": 21507834798720.0, + "grad_norm": 2.0489288468415103, + "language_loss": 0.8075105, + "learning_rate": 3.4390523198737524e-06, + "loss": 0.8296473, + "num_input_tokens_seen": 47658650, + "step": 2221, + "time_per_iteration": 2.524139165878296 + }, + { + "auxiliary_loss_clip": 0.0120737, + "auxiliary_loss_mlp": 0.00763749, + "balance_loss_clip": 1.05965745, + "balance_loss_mlp": 1.00104129, + "epoch": 0.2671797030000601, + "flos": 21471277731840.0, + "grad_norm": 1.6014056937611698, + "language_loss": 0.73457223, + "learning_rate": 3.4385112419278333e-06, + "loss": 0.75428337, + "num_input_tokens_seen": 47679875, + "step": 2222, + "time_per_iteration": 2.5295331478118896 + }, + { + "auxiliary_loss_clip": 0.01091251, + "auxiliary_loss_mlp": 0.01003801, + "balance_loss_clip": 1.01866007, + "balance_loss_mlp": 1.00129783, + "epoch": 0.2672999458906992, + "flos": 64189929767040.0, + "grad_norm": 0.7900199534345798, + "language_loss": 0.64841199, + "learning_rate": 3.4379699457609033e-06, + "loss": 0.66936255, + "num_input_tokens_seen": 47737700, + "step": 2223, + "time_per_iteration": 2.9351255893707275 + }, + { + "auxiliary_loss_clip": 0.0116483, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.0507673, + "balance_loss_mlp": 1.01781273, + "epoch": 0.26742018878133833, + "flos": 16909042573440.0, + "grad_norm": 1.7615984780781517, + "language_loss": 0.90343904, + "learning_rate": 3.4374284314550755e-06, + "loss": 0.92536098, + "num_input_tokens_seen": 47756740, + "step": 2224, + "time_per_iteration": 2.5056350231170654 + }, + { + "auxiliary_loss_clip": 0.01202904, + "auxiliary_loss_mlp": 0.01026196, + "balance_loss_clip": 1.05745387, + "balance_loss_mlp": 1.01711786, + "epoch": 0.2675404316719774, + "flos": 20667560964480.0, + "grad_norm": 2.351744639910629, + "language_loss": 0.80957329, + "learning_rate": 3.436886699092498e-06, + "loss": 0.83186424, + "num_input_tokens_seen": 47775255, + "step": 2225, + "time_per_iteration": 2.4478039741516113 + }, + { + "auxiliary_loss_clip": 0.01207421, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.05852139, + "balance_loss_mlp": 1.02335739, + "epoch": 0.2676606745626165, + "flos": 17485013157120.0, + "grad_norm": 2.5878221634172514, + "language_loss": 0.71396798, + "learning_rate": 3.4363447487553502e-06, + "loss": 0.73637056, + "num_input_tokens_seen": 47788570, + "step": 2226, + "time_per_iteration": 2.39943265914917 + }, + { + "auxiliary_loss_clip": 0.01171403, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.05395949, + "balance_loss_mlp": 1.02295303, + "epoch": 0.26778091745325555, + "flos": 27852675143040.0, + "grad_norm": 2.104374636794237, + "language_loss": 0.77803779, + "learning_rate": 3.4358025805258455e-06, + "loss": 0.80008197, + "num_input_tokens_seen": 47808275, + "step": 2227, + "time_per_iteration": 2.5696909427642822 + }, + { + "auxiliary_loss_clip": 0.01152862, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.05227089, + "balance_loss_mlp": 1.01900136, + "epoch": 0.26790116034389466, + "flos": 20955995176320.0, + "grad_norm": 1.7738211629019744, + "language_loss": 0.83427048, + "learning_rate": 3.435260194486232e-06, + "loss": 0.85608125, + "num_input_tokens_seen": 47826245, + "step": 2228, + "time_per_iteration": 2.589035749435425 + }, + { + "auxiliary_loss_clip": 0.01176924, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.05611813, + "balance_loss_mlp": 1.0211947, + "epoch": 0.2680214032345338, + "flos": 18040659621120.0, + "grad_norm": 2.6312617273620273, + "language_loss": 0.82104003, + "learning_rate": 3.4347175907187875e-06, + "loss": 0.84311795, + "num_input_tokens_seen": 47843235, + "step": 2229, + "time_per_iteration": 3.2971348762512207 + }, + { + "auxiliary_loss_clip": 0.011916, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.0582844, + "balance_loss_mlp": 1.02513516, + "epoch": 0.26814164612517283, + "flos": 22419427086720.0, + "grad_norm": 1.7780046072250697, + "language_loss": 0.88172007, + "learning_rate": 3.4341747693058254e-06, + "loss": 0.90397412, + "num_input_tokens_seen": 47861710, + "step": 2230, + "time_per_iteration": 2.488434314727783 + }, + { + "auxiliary_loss_clip": 0.01095647, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.04775536, + "balance_loss_mlp": 1.02148104, + "epoch": 0.26826188901581194, + "flos": 35627371159680.0, + "grad_norm": 1.694001288651647, + "language_loss": 0.77246088, + "learning_rate": 3.4336317303296916e-06, + "loss": 0.79371989, + "num_input_tokens_seen": 47882685, + "step": 2231, + "time_per_iteration": 2.8612496852874756 + }, + { + "auxiliary_loss_clip": 0.01186304, + "auxiliary_loss_mlp": 0.01031324, + "balance_loss_clip": 1.05541062, + "balance_loss_mlp": 1.02234769, + "epoch": 0.26838213190645105, + "flos": 17639788861440.0, + "grad_norm": 2.1642863689158665, + "language_loss": 0.7533586, + "learning_rate": 3.4330884738727635e-06, + "loss": 0.77553487, + "num_input_tokens_seen": 47900860, + "step": 2232, + "time_per_iteration": 2.723206043243408 + }, + { + "auxiliary_loss_clip": 0.01138939, + "auxiliary_loss_mlp": 0.01027509, + "balance_loss_clip": 1.05131388, + "balance_loss_mlp": 1.01827621, + "epoch": 0.2685023747970901, + "flos": 22674823764480.0, + "grad_norm": 1.7779594628435271, + "language_loss": 0.70812666, + "learning_rate": 3.4325450000174535e-06, + "loss": 0.72979116, + "num_input_tokens_seen": 47917500, + "step": 2233, + "time_per_iteration": 2.572047710418701 + }, + { + "auxiliary_loss_clip": 0.01138278, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.05128264, + "balance_loss_mlp": 1.02361381, + "epoch": 0.2686226176877292, + "flos": 20120533764480.0, + "grad_norm": 1.618402985167039, + "language_loss": 0.74359787, + "learning_rate": 3.4320013088462067e-06, + "loss": 0.76531506, + "num_input_tokens_seen": 47934860, + "step": 2234, + "time_per_iteration": 3.381277561187744 + }, + { + "auxiliary_loss_clip": 0.01164374, + "auxiliary_loss_mlp": 0.01030542, + "balance_loss_clip": 1.05154228, + "balance_loss_mlp": 1.02150619, + "epoch": 0.2687428605783683, + "flos": 21872040750720.0, + "grad_norm": 1.5235009537634756, + "language_loss": 0.8156352, + "learning_rate": 3.431457400441499e-06, + "loss": 0.83758438, + "num_input_tokens_seen": 47955255, + "step": 2235, + "time_per_iteration": 3.347637414932251 + }, + { + "auxiliary_loss_clip": 0.01032622, + "auxiliary_loss_mlp": 0.01008453, + "balance_loss_clip": 1.01459813, + "balance_loss_mlp": 1.00600934, + "epoch": 0.2688631034690074, + "flos": 69943320766080.0, + "grad_norm": 0.9079366678636234, + "language_loss": 0.60892451, + "learning_rate": 3.4309132748858424e-06, + "loss": 0.62933517, + "num_input_tokens_seen": 48016245, + "step": 2236, + "time_per_iteration": 3.226901054382324 + }, + { + "auxiliary_loss_clip": 0.01187918, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.05914545, + "balance_loss_mlp": 1.02591646, + "epoch": 0.2689833463596465, + "flos": 22856639431680.0, + "grad_norm": 1.9115887793382014, + "language_loss": 0.83717406, + "learning_rate": 3.430368932261779e-06, + "loss": 0.85940862, + "num_input_tokens_seen": 48036600, + "step": 2237, + "time_per_iteration": 3.245305061340332 + }, + { + "auxiliary_loss_clip": 0.01174612, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.05683541, + "balance_loss_mlp": 1.0206064, + "epoch": 0.2691035892502856, + "flos": 17200242132480.0, + "grad_norm": 1.9192311229629062, + "language_loss": 0.74674374, + "learning_rate": 3.429824372651886e-06, + "loss": 0.76879233, + "num_input_tokens_seen": 48054750, + "step": 2238, + "time_per_iteration": 2.4908552169799805 + }, + { + "auxiliary_loss_clip": 0.01153453, + "auxiliary_loss_mlp": 0.01037304, + "balance_loss_clip": 1.05393147, + "balance_loss_mlp": 1.02757645, + "epoch": 0.26922383214092466, + "flos": 17747484814080.0, + "grad_norm": 1.9152688897953654, + "language_loss": 0.83339441, + "learning_rate": 3.4292795961387732e-06, + "loss": 0.85530198, + "num_input_tokens_seen": 48072650, + "step": 2239, + "time_per_iteration": 2.5509543418884277 + }, + { + "auxiliary_loss_clip": 0.01206093, + "auxiliary_loss_mlp": 0.01032319, + "balance_loss_clip": 1.05873489, + "balance_loss_mlp": 1.02366459, + "epoch": 0.26934407503156377, + "flos": 16173376122240.0, + "grad_norm": 2.9537618891360564, + "language_loss": 0.87759751, + "learning_rate": 3.4287346028050818e-06, + "loss": 0.8999815, + "num_input_tokens_seen": 48088720, + "step": 2240, + "time_per_iteration": 2.43803071975708 + }, + { + "auxiliary_loss_clip": 0.01172164, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.05370331, + "balance_loss_mlp": 1.01926684, + "epoch": 0.2694643179222028, + "flos": 23732895715200.0, + "grad_norm": 1.5180598978251174, + "language_loss": 0.79426318, + "learning_rate": 3.4281893927334866e-06, + "loss": 0.81626666, + "num_input_tokens_seen": 48108630, + "step": 2241, + "time_per_iteration": 2.5908823013305664 + }, + { + "auxiliary_loss_clip": 0.01190714, + "auxiliary_loss_mlp": 0.01029958, + "balance_loss_clip": 1.0588851, + "balance_loss_mlp": 1.0213455, + "epoch": 0.26958456081284193, + "flos": 24718140840960.0, + "grad_norm": 2.4072975760603303, + "language_loss": 0.74991274, + "learning_rate": 3.4276439660066963e-06, + "loss": 0.77211952, + "num_input_tokens_seen": 48128330, + "step": 2242, + "time_per_iteration": 2.57828426361084 + }, + { + "auxiliary_loss_clip": 0.01201659, + "auxiliary_loss_mlp": 0.01035403, + "balance_loss_clip": 1.0576973, + "balance_loss_mlp": 1.02556777, + "epoch": 0.26970480370348104, + "flos": 18112588606080.0, + "grad_norm": 2.415408427330128, + "language_loss": 0.84210706, + "learning_rate": 3.427098322707452e-06, + "loss": 0.86447763, + "num_input_tokens_seen": 48144295, + "step": 2243, + "time_per_iteration": 2.477755546569824 + }, + { + "auxiliary_loss_clip": 0.01192129, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.06364441, + "balance_loss_mlp": 1.0266571, + "epoch": 0.2698250465941201, + "flos": 10816546250880.0, + "grad_norm": 2.0413057675091517, + "language_loss": 0.89391434, + "learning_rate": 3.426552462918526e-06, + "loss": 0.91620433, + "num_input_tokens_seen": 48162230, + "step": 2244, + "time_per_iteration": 2.557405948638916 + }, + { + "auxiliary_loss_clip": 0.01202954, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.05979586, + "balance_loss_mlp": 1.02462363, + "epoch": 0.2699452894847592, + "flos": 17308117653120.0, + "grad_norm": 2.3563285545082757, + "language_loss": 0.72778511, + "learning_rate": 3.426006386722726e-06, + "loss": 0.75014549, + "num_input_tokens_seen": 48180290, + "step": 2245, + "time_per_iteration": 2.499812126159668 + }, + { + "auxiliary_loss_clip": 0.01160799, + "auxiliary_loss_mlp": 0.0103593, + "balance_loss_clip": 1.0564723, + "balance_loss_mlp": 1.02704871, + "epoch": 0.2700655323753983, + "flos": 18078150441600.0, + "grad_norm": 1.9394817916798117, + "language_loss": 0.924034, + "learning_rate": 3.4254600942028914e-06, + "loss": 0.94600129, + "num_input_tokens_seen": 48198165, + "step": 2246, + "time_per_iteration": 2.5455679893493652 + }, + { + "auxiliary_loss_clip": 0.01173129, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.05868292, + "balance_loss_mlp": 1.02346087, + "epoch": 0.2701857752660374, + "flos": 18186636493440.0, + "grad_norm": 1.9558654313809667, + "language_loss": 0.82698524, + "learning_rate": 3.424913585441893e-06, + "loss": 0.84903765, + "num_input_tokens_seen": 48216000, + "step": 2247, + "time_per_iteration": 2.49153208732605 + }, + { + "auxiliary_loss_clip": 0.01183419, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.05600238, + "balance_loss_mlp": 1.01927936, + "epoch": 0.2703060181566765, + "flos": 16319496648960.0, + "grad_norm": 3.0844617480972163, + "language_loss": 0.8738001, + "learning_rate": 3.4243668605226374e-06, + "loss": 0.89591956, + "num_input_tokens_seen": 48233025, + "step": 2248, + "time_per_iteration": 2.496372938156128 + }, + { + "auxiliary_loss_clip": 0.01157713, + "auxiliary_loss_mlp": 0.01031133, + "balance_loss_clip": 1.05244195, + "balance_loss_mlp": 1.02209091, + "epoch": 0.2704262610473156, + "flos": 19572357329280.0, + "grad_norm": 2.4724546689546907, + "language_loss": 0.82301068, + "learning_rate": 3.423819919528061e-06, + "loss": 0.84489918, + "num_input_tokens_seen": 48251110, + "step": 2249, + "time_per_iteration": 2.5219507217407227 + }, + { + "auxiliary_loss_clip": 0.01148657, + "auxiliary_loss_mlp": 0.01030327, + "balance_loss_clip": 1.04885507, + "balance_loss_mlp": 1.02076626, + "epoch": 0.27054650393795465, + "flos": 20740746925440.0, + "grad_norm": 1.7424485186820948, + "language_loss": 0.77837956, + "learning_rate": 3.4232727625411355e-06, + "loss": 0.80016935, + "num_input_tokens_seen": 48270215, + "step": 2250, + "time_per_iteration": 2.6221792697906494 + }, + { + "auxiliary_loss_clip": 0.01120946, + "auxiliary_loss_mlp": 0.01025229, + "balance_loss_clip": 1.04688549, + "balance_loss_mlp": 1.01664042, + "epoch": 0.27066674682859376, + "flos": 18658322916480.0, + "grad_norm": 1.8363451363014656, + "language_loss": 0.86388594, + "learning_rate": 3.4227253896448626e-06, + "loss": 0.88534766, + "num_input_tokens_seen": 48288075, + "step": 2251, + "time_per_iteration": 2.580113410949707 + }, + { + "auxiliary_loss_clip": 0.01202227, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.05694735, + "balance_loss_mlp": 1.02182984, + "epoch": 0.2707869897192329, + "flos": 23002759958400.0, + "grad_norm": 3.0654507773860775, + "language_loss": 0.8228516, + "learning_rate": 3.42217780092228e-06, + "loss": 0.84518123, + "num_input_tokens_seen": 48306415, + "step": 2252, + "time_per_iteration": 2.514127016067505 + }, + { + "auxiliary_loss_clip": 0.01069877, + "auxiliary_loss_mlp": 0.01003671, + "balance_loss_clip": 1.01821923, + "balance_loss_mlp": 1.00133419, + "epoch": 0.27090723260987193, + "flos": 58323240293760.0, + "grad_norm": 0.7939388415959332, + "language_loss": 0.60349905, + "learning_rate": 3.421629996456456e-06, + "loss": 0.62423456, + "num_input_tokens_seen": 48365035, + "step": 2253, + "time_per_iteration": 3.033374786376953 + }, + { + "auxiliary_loss_clip": 0.01186188, + "auxiliary_loss_mlp": 0.01033637, + "balance_loss_clip": 1.05457258, + "balance_loss_mlp": 1.02388012, + "epoch": 0.27102747550051104, + "flos": 11984540797440.0, + "grad_norm": 1.7923050664996663, + "language_loss": 0.82629859, + "learning_rate": 3.421081976330491e-06, + "loss": 0.84849685, + "num_input_tokens_seen": 48383550, + "step": 2254, + "time_per_iteration": 2.5073399543762207 + }, + { + "auxiliary_loss_clip": 0.01167662, + "auxiliary_loss_mlp": 0.01033449, + "balance_loss_clip": 1.05178583, + "balance_loss_mlp": 1.02436543, + "epoch": 0.27114771839115015, + "flos": 19900401264000.0, + "grad_norm": 1.8449444252528846, + "language_loss": 0.87963831, + "learning_rate": 3.4205337406275207e-06, + "loss": 0.90164942, + "num_input_tokens_seen": 48403670, + "step": 2255, + "time_per_iteration": 3.304549217224121 + }, + { + "auxiliary_loss_clip": 0.01200529, + "auxiliary_loss_mlp": 0.01026523, + "balance_loss_clip": 1.05661619, + "balance_loss_mlp": 1.0179038, + "epoch": 0.2712679612817892, + "flos": 18331966920960.0, + "grad_norm": 2.454956727448668, + "language_loss": 0.75660276, + "learning_rate": 3.4199852894307114e-06, + "loss": 0.77887332, + "num_input_tokens_seen": 48420420, + "step": 2256, + "time_per_iteration": 2.4426522254943848 + }, + { + "auxiliary_loss_clip": 0.01131246, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.04980803, + "balance_loss_mlp": 1.02267694, + "epoch": 0.2713882041724283, + "flos": 24460302038400.0, + "grad_norm": 2.0623465347040284, + "language_loss": 0.78563803, + "learning_rate": 3.419436622823262e-06, + "loss": 0.80726457, + "num_input_tokens_seen": 48441140, + "step": 2257, + "time_per_iteration": 2.6230876445770264 + }, + { + "auxiliary_loss_clip": 0.01171958, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.05612302, + "balance_loss_mlp": 1.02276921, + "epoch": 0.27150844706306737, + "flos": 23039317025280.0, + "grad_norm": 1.750021589003096, + "language_loss": 0.74228567, + "learning_rate": 3.4188877408884063e-06, + "loss": 0.76432103, + "num_input_tokens_seen": 48461845, + "step": 2258, + "time_per_iteration": 2.572782516479492 + }, + { + "auxiliary_loss_clip": 0.01171091, + "auxiliary_loss_mlp": 0.01038582, + "balance_loss_clip": 1.05486631, + "balance_loss_mlp": 1.02887201, + "epoch": 0.2716286899537065, + "flos": 22563644192640.0, + "grad_norm": 2.2965124581100267, + "language_loss": 0.64860356, + "learning_rate": 3.4183386437094088e-06, + "loss": 0.67070031, + "num_input_tokens_seen": 48478510, + "step": 2259, + "time_per_iteration": 2.5134525299072266 + }, + { + "auxiliary_loss_clip": 0.01173097, + "auxiliary_loss_mlp": 0.01026293, + "balance_loss_clip": 1.05262482, + "balance_loss_mlp": 1.01748359, + "epoch": 0.2717489328443456, + "flos": 13115044523520.0, + "grad_norm": 2.089597740704766, + "language_loss": 0.82078403, + "learning_rate": 3.417789331369565e-06, + "loss": 0.84277797, + "num_input_tokens_seen": 48494300, + "step": 2260, + "time_per_iteration": 2.5238311290740967 + }, + { + "auxiliary_loss_clip": 0.01205462, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.05910349, + "balance_loss_mlp": 1.02563906, + "epoch": 0.27186917573498465, + "flos": 29278688060160.0, + "grad_norm": 2.0035661617620146, + "language_loss": 0.91081125, + "learning_rate": 3.4172398039522088e-06, + "loss": 0.93321848, + "num_input_tokens_seen": 48515585, + "step": 2261, + "time_per_iteration": 3.3564252853393555 + }, + { + "auxiliary_loss_clip": 0.0118778, + "auxiliary_loss_mlp": 0.01024489, + "balance_loss_clip": 1.05621779, + "balance_loss_mlp": 1.01544082, + "epoch": 0.27198941862562376, + "flos": 26032220000640.0, + "grad_norm": 7.4990576122739645, + "language_loss": 0.79958421, + "learning_rate": 3.4166900615407e-06, + "loss": 0.82170689, + "num_input_tokens_seen": 48533500, + "step": 2262, + "time_per_iteration": 3.297567367553711 + }, + { + "auxiliary_loss_clip": 0.01187378, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.05705428, + "balance_loss_mlp": 1.02142191, + "epoch": 0.27210966151626287, + "flos": 32780983760640.0, + "grad_norm": 2.0809221044117785, + "language_loss": 0.74877006, + "learning_rate": 3.416140104218436e-06, + "loss": 0.7709502, + "num_input_tokens_seen": 48552865, + "step": 2263, + "time_per_iteration": 2.5746586322784424 + }, + { + "auxiliary_loss_clip": 0.01073062, + "auxiliary_loss_mlp": 0.00754645, + "balance_loss_clip": 1.0178287, + "balance_loss_mlp": 1.00041854, + "epoch": 0.2722299044069019, + "flos": 65471043219840.0, + "grad_norm": 0.8441307811209007, + "language_loss": 0.69659412, + "learning_rate": 3.4155899320688437e-06, + "loss": 0.71487117, + "num_input_tokens_seen": 48618940, + "step": 2264, + "time_per_iteration": 3.8723556995391846 + }, + { + "auxiliary_loss_clip": 0.01129499, + "auxiliary_loss_mlp": 0.01026858, + "balance_loss_clip": 1.05041099, + "balance_loss_mlp": 1.01699901, + "epoch": 0.27235014729754103, + "flos": 15334143782400.0, + "grad_norm": 2.7247681020806813, + "language_loss": 0.73862362, + "learning_rate": 3.415039545175384e-06, + "loss": 0.76018715, + "num_input_tokens_seen": 48634665, + "step": 2265, + "time_per_iteration": 2.577755928039551 + }, + { + "auxiliary_loss_clip": 0.01188957, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.05678117, + "balance_loss_mlp": 1.0244875, + "epoch": 0.27247039018818014, + "flos": 21872363973120.0, + "grad_norm": 4.106364057414082, + "language_loss": 0.64824224, + "learning_rate": 3.414488943621551e-06, + "loss": 0.67046589, + "num_input_tokens_seen": 48653330, + "step": 2266, + "time_per_iteration": 2.5087485313415527 + }, + { + "auxiliary_loss_clip": 0.01184554, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.05613995, + "balance_loss_mlp": 1.02265429, + "epoch": 0.2725906330788192, + "flos": 18695490514560.0, + "grad_norm": 1.7279238970775026, + "language_loss": 0.73628432, + "learning_rate": 3.41393812749087e-06, + "loss": 0.75844634, + "num_input_tokens_seen": 48671375, + "step": 2267, + "time_per_iteration": 2.461599826812744 + }, + { + "auxiliary_loss_clip": 0.01171601, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.05656242, + "balance_loss_mlp": 1.02733684, + "epoch": 0.2727108759694583, + "flos": 17886099398400.0, + "grad_norm": 4.317295058800439, + "language_loss": 0.71910489, + "learning_rate": 3.4133870968668984e-06, + "loss": 0.74118555, + "num_input_tokens_seen": 48686175, + "step": 2268, + "time_per_iteration": 2.48836350440979 + }, + { + "auxiliary_loss_clip": 0.01177322, + "auxiliary_loss_mlp": 0.01029565, + "balance_loss_clip": 1.05702877, + "balance_loss_mlp": 1.02101207, + "epoch": 0.2728311188600974, + "flos": 24461666755200.0, + "grad_norm": 2.5641728966422135, + "language_loss": 0.78570765, + "learning_rate": 3.412835851833229e-06, + "loss": 0.80777657, + "num_input_tokens_seen": 48708370, + "step": 2269, + "time_per_iteration": 2.58099627494812 + }, + { + "auxiliary_loss_clip": 0.01185767, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.0590229, + "balance_loss_mlp": 1.01998627, + "epoch": 0.2729513617507365, + "flos": 30993314757120.0, + "grad_norm": 1.7158798314755568, + "language_loss": 0.77657616, + "learning_rate": 3.4122843924734834e-06, + "loss": 0.79872406, + "num_input_tokens_seen": 48730670, + "step": 2270, + "time_per_iteration": 2.597369432449341 + }, + { + "auxiliary_loss_clip": 0.01168535, + "auxiliary_loss_mlp": 0.01032153, + "balance_loss_clip": 1.05401993, + "balance_loss_mlp": 1.02259791, + "epoch": 0.2730716046413756, + "flos": 19094637421440.0, + "grad_norm": 1.976933745833997, + "language_loss": 0.8811577, + "learning_rate": 3.411732718871319e-06, + "loss": 0.90316451, + "num_input_tokens_seen": 48746510, + "step": 2271, + "time_per_iteration": 2.5825579166412354 + }, + { + "auxiliary_loss_clip": 0.01199637, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.05929899, + "balance_loss_mlp": 1.023507, + "epoch": 0.27319184753201464, + "flos": 26944566474240.0, + "grad_norm": 1.5312387268328578, + "language_loss": 0.78541899, + "learning_rate": 3.4111808311104227e-06, + "loss": 0.80773735, + "num_input_tokens_seen": 48768825, + "step": 2272, + "time_per_iteration": 2.6019158363342285 + }, + { + "auxiliary_loss_clip": 0.01178437, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.05322373, + "balance_loss_mlp": 1.02199638, + "epoch": 0.27331209042265375, + "flos": 31759828012800.0, + "grad_norm": 1.6953429846659578, + "language_loss": 0.69546032, + "learning_rate": 3.410628729274517e-06, + "loss": 0.71756411, + "num_input_tokens_seen": 48790345, + "step": 2273, + "time_per_iteration": 2.647488832473755 + }, + { + "auxiliary_loss_clip": 0.0116744, + "auxiliary_loss_mlp": 0.00763594, + "balance_loss_clip": 1.05319655, + "balance_loss_mlp": 1.00075436, + "epoch": 0.27343233331329286, + "flos": 25739081107200.0, + "grad_norm": 1.8351678930949986, + "language_loss": 0.82305956, + "learning_rate": 3.4100764134473546e-06, + "loss": 0.84236991, + "num_input_tokens_seen": 48809630, + "step": 2274, + "time_per_iteration": 2.6007306575775146 + }, + { + "auxiliary_loss_clip": 0.01202305, + "auxiliary_loss_mlp": 0.0103245, + "balance_loss_clip": 1.05969596, + "balance_loss_mlp": 1.02352095, + "epoch": 0.2735525762039319, + "flos": 24389414547840.0, + "grad_norm": 2.267955903135014, + "language_loss": 0.85345608, + "learning_rate": 3.4095238837127215e-06, + "loss": 0.87580359, + "num_input_tokens_seen": 48828770, + "step": 2275, + "time_per_iteration": 2.541872978210449 + }, + { + "auxiliary_loss_clip": 0.01154724, + "auxiliary_loss_mlp": 0.01025331, + "balance_loss_clip": 1.05276263, + "balance_loss_mlp": 1.01671827, + "epoch": 0.27367281909457103, + "flos": 14465357527680.0, + "grad_norm": 1.847822954034739, + "language_loss": 0.79121614, + "learning_rate": 3.4089711401544355e-06, + "loss": 0.81301665, + "num_input_tokens_seen": 48846365, + "step": 2276, + "time_per_iteration": 2.5663187503814697 + }, + { + "auxiliary_loss_clip": 0.01183497, + "auxiliary_loss_mlp": 0.010289, + "balance_loss_clip": 1.05241752, + "balance_loss_mlp": 1.02004313, + "epoch": 0.27379306198521014, + "flos": 23476996247040.0, + "grad_norm": 2.1089054010346446, + "language_loss": 0.67675102, + "learning_rate": 3.4084181828563486e-06, + "loss": 0.69887507, + "num_input_tokens_seen": 48863085, + "step": 2277, + "time_per_iteration": 2.4974398612976074 + }, + { + "auxiliary_loss_clip": 0.01141783, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.04935431, + "balance_loss_mlp": 1.02056885, + "epoch": 0.2739133048758492, + "flos": 17458152762240.0, + "grad_norm": 2.118327131127108, + "language_loss": 0.70089364, + "learning_rate": 3.4078650119023428e-06, + "loss": 0.72260398, + "num_input_tokens_seen": 48881400, + "step": 2278, + "time_per_iteration": 2.553011417388916 + }, + { + "auxiliary_loss_clip": 0.01131024, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.04689741, + "balance_loss_mlp": 1.02211189, + "epoch": 0.2740335477664883, + "flos": 19273113123840.0, + "grad_norm": 1.9891293582302516, + "language_loss": 0.74029219, + "learning_rate": 3.4073116273763337e-06, + "loss": 0.76192141, + "num_input_tokens_seen": 48895845, + "step": 2279, + "time_per_iteration": 2.5776617527008057 + }, + { + "auxiliary_loss_clip": 0.01177898, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.05381751, + "balance_loss_mlp": 1.02486658, + "epoch": 0.2741537906571274, + "flos": 26104723603200.0, + "grad_norm": 1.8173216063267028, + "language_loss": 0.81338096, + "learning_rate": 3.40675802936227e-06, + "loss": 0.83550245, + "num_input_tokens_seen": 48916630, + "step": 2280, + "time_per_iteration": 2.557279348373413 + }, + { + "auxiliary_loss_clip": 0.01165441, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.05500388, + "balance_loss_mlp": 1.02528501, + "epoch": 0.27427403354776647, + "flos": 34164190644480.0, + "grad_norm": 2.0120829622774288, + "language_loss": 0.71578515, + "learning_rate": 3.4062042179441318e-06, + "loss": 0.7377916, + "num_input_tokens_seen": 48937100, + "step": 2281, + "time_per_iteration": 2.6321053504943848 + }, + { + "auxiliary_loss_clip": 0.01183399, + "auxiliary_loss_mlp": 0.01026043, + "balance_loss_clip": 1.05692315, + "balance_loss_mlp": 1.01804411, + "epoch": 0.2743942764384056, + "flos": 18766988536320.0, + "grad_norm": 1.888945029234877, + "language_loss": 0.8050859, + "learning_rate": 3.4056501932059314e-06, + "loss": 0.82718027, + "num_input_tokens_seen": 48955175, + "step": 2282, + "time_per_iteration": 3.224766731262207 + }, + { + "auxiliary_loss_clip": 0.01098046, + "auxiliary_loss_mlp": 0.01002884, + "balance_loss_clip": 1.02019668, + "balance_loss_mlp": 1.00088143, + "epoch": 0.2745145193290447, + "flos": 64904048058240.0, + "grad_norm": 0.7720408541971505, + "language_loss": 0.58097768, + "learning_rate": 3.405095955231715e-06, + "loss": 0.601987, + "num_input_tokens_seen": 49006830, + "step": 2283, + "time_per_iteration": 2.9714646339416504 + }, + { + "auxiliary_loss_clip": 0.01188518, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.05505872, + "balance_loss_mlp": 1.01865697, + "epoch": 0.27463476221968375, + "flos": 16136926796160.0, + "grad_norm": 2.1741690648242655, + "language_loss": 0.9450897, + "learning_rate": 3.4045415041055585e-06, + "loss": 0.96725035, + "num_input_tokens_seen": 49022470, + "step": 2284, + "time_per_iteration": 2.4591708183288574 + }, + { + "auxiliary_loss_clip": 0.01173987, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.05468285, + "balance_loss_mlp": 1.02241373, + "epoch": 0.27475500511032286, + "flos": 10376712213120.0, + "grad_norm": 2.0922483824219418, + "language_loss": 0.77582753, + "learning_rate": 3.4039868399115728e-06, + "loss": 0.79788935, + "num_input_tokens_seen": 49037110, + "step": 2285, + "time_per_iteration": 2.486236095428467 + }, + { + "auxiliary_loss_clip": 0.01137074, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.05535293, + "balance_loss_mlp": 1.02185488, + "epoch": 0.27487524800096197, + "flos": 17311062568320.0, + "grad_norm": 1.7079963444319017, + "language_loss": 0.80161762, + "learning_rate": 3.4034319627339003e-06, + "loss": 0.82329857, + "num_input_tokens_seen": 49053975, + "step": 2286, + "time_per_iteration": 2.577749490737915 + }, + { + "auxiliary_loss_clip": 0.01173063, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.05578542, + "balance_loss_mlp": 1.02373743, + "epoch": 0.274995490891601, + "flos": 27120205002240.0, + "grad_norm": 2.2733543328001224, + "language_loss": 0.69459945, + "learning_rate": 3.402876872656715e-06, + "loss": 0.7166571, + "num_input_tokens_seen": 49072295, + "step": 2287, + "time_per_iteration": 3.2814736366271973 + }, + { + "auxiliary_loss_clip": 0.01168879, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.05464721, + "balance_loss_mlp": 1.02534163, + "epoch": 0.27511573378224013, + "flos": 23436093634560.0, + "grad_norm": 1.9538448033868343, + "language_loss": 0.89727247, + "learning_rate": 3.402321569764223e-06, + "loss": 0.91929841, + "num_input_tokens_seen": 49091600, + "step": 2288, + "time_per_iteration": 2.529625654220581 + }, + { + "auxiliary_loss_clip": 0.01149446, + "auxiliary_loss_mlp": 0.00764286, + "balance_loss_clip": 1.05186009, + "balance_loss_mlp": 1.00075102, + "epoch": 0.2752359766728792, + "flos": 16722019434240.0, + "grad_norm": 1.7896239462008774, + "language_loss": 0.83615708, + "learning_rate": 3.4017660541406635e-06, + "loss": 0.85529447, + "num_input_tokens_seen": 49107665, + "step": 2289, + "time_per_iteration": 3.339085578918457 + }, + { + "auxiliary_loss_clip": 0.01180377, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.05411553, + "balance_loss_mlp": 1.02121866, + "epoch": 0.2753562195635183, + "flos": 25297738698240.0, + "grad_norm": 1.6235601579192631, + "language_loss": 0.74558789, + "learning_rate": 3.4012103258703092e-06, + "loss": 0.76769388, + "num_input_tokens_seen": 49126420, + "step": 2290, + "time_per_iteration": 3.3334591388702393 + }, + { + "auxiliary_loss_clip": 0.01156835, + "auxiliary_loss_mlp": 0.01027229, + "balance_loss_clip": 1.05222917, + "balance_loss_mlp": 1.01825821, + "epoch": 0.2754764624541574, + "flos": 27338972785920.0, + "grad_norm": 1.9537099951387693, + "language_loss": 0.8310011, + "learning_rate": 3.4006543850374616e-06, + "loss": 0.85284173, + "num_input_tokens_seen": 49141470, + "step": 2291, + "time_per_iteration": 2.5735504627227783 + }, + { + "auxiliary_loss_clip": 0.0118753, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.05375385, + "balance_loss_mlp": 1.02508771, + "epoch": 0.27559670534479647, + "flos": 17238379397760.0, + "grad_norm": 2.4366729795552686, + "language_loss": 0.74829698, + "learning_rate": 3.400098231726458e-06, + "loss": 0.77050674, + "num_input_tokens_seen": 49158570, + "step": 2292, + "time_per_iteration": 2.4738380908966064 + }, + { + "auxiliary_loss_clip": 0.0116283, + "auxiliary_loss_mlp": 0.01033512, + "balance_loss_clip": 1.0519228, + "balance_loss_mlp": 1.0241363, + "epoch": 0.2757169482354356, + "flos": 21939085486080.0, + "grad_norm": 2.269342486923487, + "language_loss": 0.86767769, + "learning_rate": 3.3995418660216657e-06, + "loss": 0.88964105, + "num_input_tokens_seen": 49176025, + "step": 2293, + "time_per_iteration": 2.5456478595733643 + }, + { + "auxiliary_loss_clip": 0.01206292, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.05945075, + "balance_loss_mlp": 1.02541232, + "epoch": 0.2758371911260747, + "flos": 20850669521280.0, + "grad_norm": 8.326101500915094, + "language_loss": 0.8062436, + "learning_rate": 3.3989852880074848e-06, + "loss": 0.82865399, + "num_input_tokens_seen": 49197455, + "step": 2294, + "time_per_iteration": 2.47609806060791 + }, + { + "auxiliary_loss_clip": 0.01080653, + "auxiliary_loss_mlp": 0.01006757, + "balance_loss_clip": 1.0312072, + "balance_loss_mlp": 1.00504005, + "epoch": 0.27595743401671374, + "flos": 69269063592960.0, + "grad_norm": 0.7447433026111202, + "language_loss": 0.60545325, + "learning_rate": 3.398428497768348e-06, + "loss": 0.6263274, + "num_input_tokens_seen": 49262625, + "step": 2295, + "time_per_iteration": 3.1804356575012207 + }, + { + "auxiliary_loss_clip": 0.01164833, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.05078638, + "balance_loss_mlp": 1.01972389, + "epoch": 0.27607767690735285, + "flos": 21215019127680.0, + "grad_norm": 1.7064427161306066, + "language_loss": 0.72132552, + "learning_rate": 3.3978714953887205e-06, + "loss": 0.74326438, + "num_input_tokens_seen": 49282380, + "step": 2296, + "time_per_iteration": 2.556641101837158 + }, + { + "auxiliary_loss_clip": 0.01130376, + "auxiliary_loss_mlp": 0.01026846, + "balance_loss_clip": 1.046983, + "balance_loss_mlp": 1.01791728, + "epoch": 0.27619791979799196, + "flos": 24825334003200.0, + "grad_norm": 1.7580722797118933, + "language_loss": 0.86251491, + "learning_rate": 3.397314280953098e-06, + "loss": 0.88408709, + "num_input_tokens_seen": 49303205, + "step": 2297, + "time_per_iteration": 2.6071643829345703 + }, + { + "auxiliary_loss_clip": 0.0116711, + "auxiliary_loss_mlp": 0.01026992, + "balance_loss_clip": 1.05485415, + "balance_loss_mlp": 1.01848674, + "epoch": 0.276318162688631, + "flos": 24753548672640.0, + "grad_norm": 2.0319146513926256, + "language_loss": 0.79759866, + "learning_rate": 3.3967568545460108e-06, + "loss": 0.81953967, + "num_input_tokens_seen": 49322745, + "step": 2298, + "time_per_iteration": 2.5397722721099854 + }, + { + "auxiliary_loss_clip": 0.01182366, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.0554657, + "balance_loss_mlp": 1.02193809, + "epoch": 0.27643840557927013, + "flos": 18150007599360.0, + "grad_norm": 2.141071414158842, + "language_loss": 0.80571592, + "learning_rate": 3.3961992162520185e-06, + "loss": 0.82785016, + "num_input_tokens_seen": 49341370, + "step": 2299, + "time_per_iteration": 2.462484121322632 + }, + { + "auxiliary_loss_clip": 0.01186763, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.05610192, + "balance_loss_mlp": 1.01980662, + "epoch": 0.27655864846990924, + "flos": 24823933372800.0, + "grad_norm": 2.4374309186250747, + "language_loss": 0.71832407, + "learning_rate": 3.3956413661557156e-06, + "loss": 0.74048525, + "num_input_tokens_seen": 49361545, + "step": 2300, + "time_per_iteration": 2.505204439163208 + }, + { + "auxiliary_loss_clip": 0.011627, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.0504421, + "balance_loss_mlp": 1.02372265, + "epoch": 0.2766788913605483, + "flos": 20266582464000.0, + "grad_norm": 2.332256386458549, + "language_loss": 0.65851909, + "learning_rate": 3.3950833043417273e-06, + "loss": 0.68047249, + "num_input_tokens_seen": 49379690, + "step": 2301, + "time_per_iteration": 2.5431363582611084 + }, + { + "auxiliary_loss_clip": 0.01191912, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.06008828, + "balance_loss_mlp": 1.0197829, + "epoch": 0.2767991342511874, + "flos": 21470272151040.0, + "grad_norm": 2.564510674146828, + "language_loss": 0.73335481, + "learning_rate": 3.3945250308947105e-06, + "loss": 0.75556993, + "num_input_tokens_seen": 49395995, + "step": 2302, + "time_per_iteration": 2.4799954891204834 + }, + { + "auxiliary_loss_clip": 0.01086608, + "auxiliary_loss_mlp": 0.01003241, + "balance_loss_clip": 1.01902223, + "balance_loss_mlp": 1.00132167, + "epoch": 0.2769193771418265, + "flos": 66002627571840.0, + "grad_norm": 1.2368054861640663, + "language_loss": 0.68407762, + "learning_rate": 3.3939665458993556e-06, + "loss": 0.70497608, + "num_input_tokens_seen": 49450415, + "step": 2303, + "time_per_iteration": 3.0190844535827637 + }, + { + "auxiliary_loss_clip": 0.01160472, + "auxiliary_loss_mlp": 0.01031581, + "balance_loss_clip": 1.04988074, + "balance_loss_mlp": 1.02244961, + "epoch": 0.27703962003246557, + "flos": 20704441253760.0, + "grad_norm": 1.9149130915296293, + "language_loss": 0.77224886, + "learning_rate": 3.3934078494403843e-06, + "loss": 0.79416943, + "num_input_tokens_seen": 49469990, + "step": 2304, + "time_per_iteration": 2.560169219970703 + }, + { + "auxiliary_loss_clip": 0.0110887, + "auxiliary_loss_mlp": 0.00764204, + "balance_loss_clip": 1.04689586, + "balance_loss_mlp": 1.00076091, + "epoch": 0.2771598629231047, + "flos": 22929897219840.0, + "grad_norm": 2.0262789314995517, + "language_loss": 0.81280172, + "learning_rate": 3.3928489416025495e-06, + "loss": 0.83153248, + "num_input_tokens_seen": 49490835, + "step": 2305, + "time_per_iteration": 2.6838669776916504 + }, + { + "auxiliary_loss_clip": 0.01170896, + "auxiliary_loss_mlp": 0.0103879, + "balance_loss_clip": 1.05362749, + "balance_loss_mlp": 1.02901459, + "epoch": 0.27728010581374374, + "flos": 18369457741440.0, + "grad_norm": 3.6564461869436005, + "language_loss": 0.78945541, + "learning_rate": 3.392289822470638e-06, + "loss": 0.81155223, + "num_input_tokens_seen": 49508815, + "step": 2306, + "time_per_iteration": 2.505054235458374 + }, + { + "auxiliary_loss_clip": 0.01167313, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.05132985, + "balance_loss_mlp": 1.01915324, + "epoch": 0.27740034870438285, + "flos": 19427637432960.0, + "grad_norm": 2.0278779499264, + "language_loss": 0.75823689, + "learning_rate": 3.3917304921294674e-06, + "loss": 0.78019702, + "num_input_tokens_seen": 49526980, + "step": 2307, + "time_per_iteration": 2.531632423400879 + }, + { + "auxiliary_loss_clip": 0.01185965, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.05378306, + "balance_loss_mlp": 1.02474713, + "epoch": 0.27752059159502196, + "flos": 21614776565760.0, + "grad_norm": 1.6049234830708237, + "language_loss": 0.80435765, + "learning_rate": 3.3911709506638876e-06, + "loss": 0.82656026, + "num_input_tokens_seen": 49546290, + "step": 2308, + "time_per_iteration": 3.2812206745147705 + }, + { + "auxiliary_loss_clip": 0.01145651, + "auxiliary_loss_mlp": 0.00764116, + "balance_loss_clip": 1.04594135, + "balance_loss_mlp": 1.00067067, + "epoch": 0.277640834485661, + "flos": 26608011016320.0, + "grad_norm": 2.195426830187953, + "language_loss": 0.81018525, + "learning_rate": 3.390611198158781e-06, + "loss": 0.82928288, + "num_input_tokens_seen": 49564165, + "step": 2309, + "time_per_iteration": 2.650442123413086 + }, + { + "auxiliary_loss_clip": 0.0120493, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.05827141, + "balance_loss_mlp": 1.02431655, + "epoch": 0.2777610773763001, + "flos": 19492814661120.0, + "grad_norm": 2.0754508241735734, + "language_loss": 0.89975816, + "learning_rate": 3.3900512346990612e-06, + "loss": 0.92214608, + "num_input_tokens_seen": 49580155, + "step": 2310, + "time_per_iteration": 2.4658799171447754 + }, + { + "auxiliary_loss_clip": 0.01142277, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.04538655, + "balance_loss_mlp": 1.02198935, + "epoch": 0.27788132026693924, + "flos": 38290650001920.0, + "grad_norm": 1.761615268802473, + "language_loss": 0.65840805, + "learning_rate": 3.389491060369674e-06, + "loss": 0.68015093, + "num_input_tokens_seen": 49605830, + "step": 2311, + "time_per_iteration": 2.7144243717193604 + }, + { + "auxiliary_loss_clip": 0.01134731, + "auxiliary_loss_mlp": 0.01025046, + "balance_loss_clip": 1.0485121, + "balance_loss_mlp": 1.01608753, + "epoch": 0.2780015631575783, + "flos": 22382546797440.0, + "grad_norm": 1.946746240814894, + "language_loss": 0.89514649, + "learning_rate": 3.388930675255598e-06, + "loss": 0.91674423, + "num_input_tokens_seen": 49625680, + "step": 2312, + "time_per_iteration": 2.573726177215576 + }, + { + "auxiliary_loss_clip": 0.01175037, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.05189776, + "balance_loss_mlp": 1.02125621, + "epoch": 0.2781218060482174, + "flos": 12203200840320.0, + "grad_norm": 2.580904290640667, + "language_loss": 0.79389232, + "learning_rate": 3.388370079441843e-06, + "loss": 0.81596005, + "num_input_tokens_seen": 49641195, + "step": 2313, + "time_per_iteration": 3.2898614406585693 + }, + { + "auxiliary_loss_clip": 0.01158608, + "auxiliary_loss_mlp": 0.01038841, + "balance_loss_clip": 1.05477417, + "balance_loss_mlp": 1.02988243, + "epoch": 0.2782420489388565, + "flos": 18107632529280.0, + "grad_norm": 1.9906415435122813, + "language_loss": 0.92803097, + "learning_rate": 3.3878092730134505e-06, + "loss": 0.95000547, + "num_input_tokens_seen": 49659180, + "step": 2314, + "time_per_iteration": 2.5185654163360596 + }, + { + "auxiliary_loss_clip": 0.01178838, + "auxiliary_loss_mlp": 0.01036701, + "balance_loss_clip": 1.05347562, + "balance_loss_mlp": 1.02714598, + "epoch": 0.27836229182949557, + "flos": 18514752255360.0, + "grad_norm": 1.5579045052359195, + "language_loss": 0.80564702, + "learning_rate": 3.3872482560554947e-06, + "loss": 0.82780242, + "num_input_tokens_seen": 49677955, + "step": 2315, + "time_per_iteration": 3.2326622009277344 + }, + { + "auxiliary_loss_clip": 0.01083084, + "auxiliary_loss_mlp": 0.01006062, + "balance_loss_clip": 1.01481414, + "balance_loss_mlp": 1.0040592, + "epoch": 0.2784825347201347, + "flos": 67079230940160.0, + "grad_norm": 0.8063319163991048, + "language_loss": 0.57058167, + "learning_rate": 3.386687028653082e-06, + "loss": 0.5914731, + "num_input_tokens_seen": 49740800, + "step": 2316, + "time_per_iteration": 3.851491928100586 + }, + { + "auxiliary_loss_clip": 0.01144156, + "auxiliary_loss_mlp": 0.0103644, + "balance_loss_clip": 1.05221343, + "balance_loss_mlp": 1.02625394, + "epoch": 0.2786027776107738, + "flos": 22631119891200.0, + "grad_norm": 1.7954174169580868, + "language_loss": 0.84954596, + "learning_rate": 3.386125590891349e-06, + "loss": 0.87135184, + "num_input_tokens_seen": 49757675, + "step": 2317, + "time_per_iteration": 2.6290371417999268 + }, + { + "auxiliary_loss_clip": 0.01159027, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.04833603, + "balance_loss_mlp": 1.02211642, + "epoch": 0.27872302050141284, + "flos": 15778826156160.0, + "grad_norm": 2.2703377961890716, + "language_loss": 0.82678318, + "learning_rate": 3.3855639428554657e-06, + "loss": 0.84868395, + "num_input_tokens_seen": 49775205, + "step": 2318, + "time_per_iteration": 2.5087952613830566 + }, + { + "auxiliary_loss_clip": 0.01142417, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.047611, + "balance_loss_mlp": 1.02106249, + "epoch": 0.27884326339205195, + "flos": 22126970551680.0, + "grad_norm": 1.7450866435047927, + "language_loss": 0.80245805, + "learning_rate": 3.385002084630635e-06, + "loss": 0.82418108, + "num_input_tokens_seen": 49794175, + "step": 2319, + "time_per_iteration": 2.5397939682006836 + }, + { + "auxiliary_loss_clip": 0.01189765, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.05411983, + "balance_loss_mlp": 1.02305675, + "epoch": 0.278963506282691, + "flos": 20558715776640.0, + "grad_norm": 1.9070598156100238, + "language_loss": 0.84930193, + "learning_rate": 3.384440016302088e-06, + "loss": 0.87153053, + "num_input_tokens_seen": 49812850, + "step": 2320, + "time_per_iteration": 2.4887290000915527 + }, + { + "auxiliary_loss_clip": 0.01181045, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.05216265, + "balance_loss_mlp": 1.02380133, + "epoch": 0.2790837491733301, + "flos": 21942928241280.0, + "grad_norm": 1.9453138591826662, + "language_loss": 0.62253368, + "learning_rate": 3.3838777379550923e-06, + "loss": 0.64467436, + "num_input_tokens_seen": 49832295, + "step": 2321, + "time_per_iteration": 2.491974115371704 + }, + { + "auxiliary_loss_clip": 0.01176229, + "auxiliary_loss_mlp": 0.01036347, + "balance_loss_clip": 1.05478883, + "balance_loss_mlp": 1.02708459, + "epoch": 0.27920399206396923, + "flos": 26286790665600.0, + "grad_norm": 2.052427941123117, + "language_loss": 0.78341538, + "learning_rate": 3.383315249674944e-06, + "loss": 0.80554116, + "num_input_tokens_seen": 49850860, + "step": 2322, + "time_per_iteration": 2.5757744312286377 + }, + { + "auxiliary_loss_clip": 0.01158244, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.05114353, + "balance_loss_mlp": 1.02475452, + "epoch": 0.2793242349546083, + "flos": 25400981364480.0, + "grad_norm": 2.233450997040461, + "language_loss": 0.86211574, + "learning_rate": 3.3827525515469715e-06, + "loss": 0.88403642, + "num_input_tokens_seen": 49865765, + "step": 2323, + "time_per_iteration": 2.5762529373168945 + }, + { + "auxiliary_loss_clip": 0.01147884, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.04694986, + "balance_loss_mlp": 1.02810597, + "epoch": 0.2794444778452474, + "flos": 20850346298880.0, + "grad_norm": 1.9651322610483422, + "language_loss": 0.70649409, + "learning_rate": 3.3821896436565367e-06, + "loss": 0.72835773, + "num_input_tokens_seen": 49885425, + "step": 2324, + "time_per_iteration": 2.581455945968628 + }, + { + "auxiliary_loss_clip": 0.01188039, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.05675387, + "balance_loss_mlp": 1.02768469, + "epoch": 0.2795647207358865, + "flos": 21576244250880.0, + "grad_norm": 1.6135009909079394, + "language_loss": 0.7003535, + "learning_rate": 3.381626526089032e-06, + "loss": 0.72260541, + "num_input_tokens_seen": 49904990, + "step": 2325, + "time_per_iteration": 2.5057051181793213 + }, + { + "auxiliary_loss_clip": 0.01168022, + "auxiliary_loss_mlp": 0.01028706, + "balance_loss_clip": 1.05013525, + "balance_loss_mlp": 1.01927614, + "epoch": 0.27968496362652556, + "flos": 21471744608640.0, + "grad_norm": 2.21070530249561, + "language_loss": 0.7901414, + "learning_rate": 3.3810631989298815e-06, + "loss": 0.8121087, + "num_input_tokens_seen": 49924600, + "step": 2326, + "time_per_iteration": 2.523284912109375 + }, + { + "auxiliary_loss_clip": 0.01148414, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.05246973, + "balance_loss_mlp": 1.02242947, + "epoch": 0.2798052065171647, + "flos": 23258695340160.0, + "grad_norm": 3.159023601797608, + "language_loss": 0.84114981, + "learning_rate": 3.3804996622645423e-06, + "loss": 0.86296487, + "num_input_tokens_seen": 49942600, + "step": 2327, + "time_per_iteration": 2.585935115814209 + }, + { + "auxiliary_loss_clip": 0.01200026, + "auxiliary_loss_mlp": 0.01032764, + "balance_loss_clip": 1.05527616, + "balance_loss_mlp": 1.0234834, + "epoch": 0.2799254494078038, + "flos": 21539328048000.0, + "grad_norm": 1.7533201674168544, + "language_loss": 0.8885119, + "learning_rate": 3.3799359161785015e-06, + "loss": 0.91083986, + "num_input_tokens_seen": 49962250, + "step": 2328, + "time_per_iteration": 2.4738399982452393 + }, + { + "auxiliary_loss_clip": 0.01182196, + "auxiliary_loss_mlp": 0.01034203, + "balance_loss_clip": 1.05325937, + "balance_loss_mlp": 1.02438593, + "epoch": 0.28004569229844284, + "flos": 26393912000640.0, + "grad_norm": 1.5484996730590455, + "language_loss": 0.85729688, + "learning_rate": 3.3793719607572798e-06, + "loss": 0.87946081, + "num_input_tokens_seen": 49983215, + "step": 2329, + "time_per_iteration": 2.5337815284729004 + }, + { + "auxiliary_loss_clip": 0.0115462, + "auxiliary_loss_mlp": 0.01030334, + "balance_loss_clip": 1.0490644, + "balance_loss_mlp": 1.02118468, + "epoch": 0.28016593518908195, + "flos": 33547676584320.0, + "grad_norm": 2.198308710139611, + "language_loss": 0.77017224, + "learning_rate": 3.378807796086428e-06, + "loss": 0.79202175, + "num_input_tokens_seen": 50006075, + "step": 2330, + "time_per_iteration": 2.6813347339630127 + }, + { + "auxiliary_loss_clip": 0.01204614, + "auxiliary_loss_mlp": 0.01026869, + "balance_loss_clip": 1.05922103, + "balance_loss_mlp": 1.01694536, + "epoch": 0.28028617807972106, + "flos": 15340823712000.0, + "grad_norm": 2.086899190801339, + "language_loss": 0.77176982, + "learning_rate": 3.37824342225153e-06, + "loss": 0.79408467, + "num_input_tokens_seen": 50022495, + "step": 2331, + "time_per_iteration": 2.422131061553955 + }, + { + "auxiliary_loss_clip": 0.0114404, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.05248499, + "balance_loss_mlp": 1.02604222, + "epoch": 0.2804064209703601, + "flos": 25520277409920.0, + "grad_norm": 1.8358749039132878, + "language_loss": 0.7809965, + "learning_rate": 3.3776788393382006e-06, + "loss": 0.80279154, + "num_input_tokens_seen": 50041975, + "step": 2332, + "time_per_iteration": 2.606081485748291 + }, + { + "auxiliary_loss_clip": 0.01204102, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.0589236, + "balance_loss_mlp": 1.02316141, + "epoch": 0.2805266638609992, + "flos": 29351766280320.0, + "grad_norm": 3.1417873551953153, + "language_loss": 0.76790988, + "learning_rate": 3.3771140474320872e-06, + "loss": 0.79027808, + "num_input_tokens_seen": 50061925, + "step": 2333, + "time_per_iteration": 2.4954373836517334 + }, + { + "auxiliary_loss_clip": 0.01163795, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.05249536, + "balance_loss_mlp": 1.0266937, + "epoch": 0.28064690675163834, + "flos": 21463735875840.0, + "grad_norm": 1.8018314387155527, + "language_loss": 0.79445678, + "learning_rate": 3.3765490466188664e-06, + "loss": 0.81645113, + "num_input_tokens_seen": 50079325, + "step": 2334, + "time_per_iteration": 2.534405469894409 + }, + { + "auxiliary_loss_clip": 0.01155617, + "auxiliary_loss_mlp": 0.01027621, + "balance_loss_clip": 1.05128098, + "balance_loss_mlp": 1.01750588, + "epoch": 0.2807671496422774, + "flos": 20995640812800.0, + "grad_norm": 2.5607590601014003, + "language_loss": 0.7412467, + "learning_rate": 3.3759838369842508e-06, + "loss": 0.76307911, + "num_input_tokens_seen": 50097400, + "step": 2335, + "time_per_iteration": 3.3088274002075195 + }, + { + "auxiliary_loss_clip": 0.01160129, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.05462992, + "balance_loss_mlp": 1.02077448, + "epoch": 0.2808873925329165, + "flos": 21506577822720.0, + "grad_norm": 1.987384008819735, + "language_loss": 0.73043388, + "learning_rate": 3.375418418613981e-06, + "loss": 0.75233394, + "num_input_tokens_seen": 50116425, + "step": 2336, + "time_per_iteration": 2.5368258953094482 + }, + { + "auxiliary_loss_clip": 0.01174746, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.05543685, + "balance_loss_mlp": 1.02570105, + "epoch": 0.28100763542355556, + "flos": 16070815814400.0, + "grad_norm": 2.4718092887675507, + "language_loss": 0.8383745, + "learning_rate": 3.374852791593831e-06, + "loss": 0.8604846, + "num_input_tokens_seen": 50132625, + "step": 2337, + "time_per_iteration": 2.4755783081054688 + }, + { + "auxiliary_loss_clip": 0.01153983, + "auxiliary_loss_mlp": 0.01036448, + "balance_loss_clip": 1.0497036, + "balance_loss_mlp": 1.02657747, + "epoch": 0.28112787831419467, + "flos": 19062605468160.0, + "grad_norm": 3.1564557560265687, + "language_loss": 0.54312253, + "learning_rate": 3.374286956009605e-06, + "loss": 0.56502688, + "num_input_tokens_seen": 50151190, + "step": 2338, + "time_per_iteration": 2.539555311203003 + }, + { + "auxiliary_loss_clip": 0.01189133, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.05914164, + "balance_loss_mlp": 1.0225625, + "epoch": 0.2812481212048338, + "flos": 12823629482880.0, + "grad_norm": 1.9370127880947288, + "language_loss": 0.75235868, + "learning_rate": 3.3737209119471405e-06, + "loss": 0.77457136, + "num_input_tokens_seen": 50167700, + "step": 2339, + "time_per_iteration": 2.4491734504699707 + }, + { + "auxiliary_loss_clip": 0.01193302, + "auxiliary_loss_mlp": 0.01029395, + "balance_loss_clip": 1.05701113, + "balance_loss_mlp": 1.01945341, + "epoch": 0.28136836409547283, + "flos": 15633064765440.0, + "grad_norm": 6.47923911150797, + "language_loss": 0.63818443, + "learning_rate": 3.373154659492306e-06, + "loss": 0.66041142, + "num_input_tokens_seen": 50185840, + "step": 2340, + "time_per_iteration": 3.2575736045837402 + }, + { + "auxiliary_loss_clip": 0.01176038, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.05444598, + "balance_loss_mlp": 1.02920032, + "epoch": 0.28148860698611194, + "flos": 19933726106880.0, + "grad_norm": 2.2358945468412306, + "language_loss": 0.85057402, + "learning_rate": 3.3725881987310016e-06, + "loss": 0.87272167, + "num_input_tokens_seen": 50203375, + "step": 2341, + "time_per_iteration": 2.497758626937866 + }, + { + "auxiliary_loss_clip": 0.01169327, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.05149806, + "balance_loss_mlp": 1.02431917, + "epoch": 0.28160884987675106, + "flos": 17457219008640.0, + "grad_norm": 1.761771620931767, + "language_loss": 0.87663031, + "learning_rate": 3.372021529749159e-06, + "loss": 0.8986553, + "num_input_tokens_seen": 50222435, + "step": 2342, + "time_per_iteration": 3.258157968521118 + }, + { + "auxiliary_loss_clip": 0.01131204, + "auxiliary_loss_mlp": 0.01031392, + "balance_loss_clip": 1.05166888, + "balance_loss_mlp": 1.02230215, + "epoch": 0.2817290927673901, + "flos": 16834743290880.0, + "grad_norm": 1.814463301251626, + "language_loss": 0.92512649, + "learning_rate": 3.3714546526327405e-06, + "loss": 0.94675243, + "num_input_tokens_seen": 50240435, + "step": 2343, + "time_per_iteration": 3.3690741062164307 + }, + { + "auxiliary_loss_clip": 0.01163567, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.05135274, + "balance_loss_mlp": 1.02002907, + "epoch": 0.2818493356580292, + "flos": 15414081500160.0, + "grad_norm": 2.4669494414551885, + "language_loss": 0.87734061, + "learning_rate": 3.3708875674677423e-06, + "loss": 0.89927369, + "num_input_tokens_seen": 50258410, + "step": 2344, + "time_per_iteration": 2.525028944015503 + }, + { + "auxiliary_loss_clip": 0.01184113, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.05882394, + "balance_loss_mlp": 1.02421331, + "epoch": 0.28196957854866833, + "flos": 20412451595520.0, + "grad_norm": 1.8934678296545489, + "language_loss": 0.83706337, + "learning_rate": 3.37032027434019e-06, + "loss": 0.85925102, + "num_input_tokens_seen": 50277930, + "step": 2345, + "time_per_iteration": 2.5002338886260986 + }, + { + "auxiliary_loss_clip": 0.01197775, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.05686533, + "balance_loss_mlp": 1.02283478, + "epoch": 0.2820898214393074, + "flos": 19973120348160.0, + "grad_norm": 1.687936219898487, + "language_loss": 0.82544774, + "learning_rate": 3.369752773336141e-06, + "loss": 0.84776914, + "num_input_tokens_seen": 50297410, + "step": 2346, + "time_per_iteration": 2.493079423904419 + }, + { + "auxiliary_loss_clip": 0.01173113, + "auxiliary_loss_mlp": 0.01034618, + "balance_loss_clip": 1.05320084, + "balance_loss_mlp": 1.02387762, + "epoch": 0.2822100643299465, + "flos": 22528308188160.0, + "grad_norm": 2.111955578038509, + "language_loss": 0.77881289, + "learning_rate": 3.3691850645416864e-06, + "loss": 0.80089021, + "num_input_tokens_seen": 50317120, + "step": 2347, + "time_per_iteration": 2.5359771251678467 + }, + { + "auxiliary_loss_clip": 0.01194597, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.05712557, + "balance_loss_mlp": 1.02798462, + "epoch": 0.2823303072205856, + "flos": 11546682007680.0, + "grad_norm": 2.0333827759556997, + "language_loss": 0.83057237, + "learning_rate": 3.368617148042945e-06, + "loss": 0.85289639, + "num_input_tokens_seen": 50334790, + "step": 2348, + "time_per_iteration": 2.45088791847229 + }, + { + "auxiliary_loss_clip": 0.01168827, + "auxiliary_loss_mlp": 0.01039709, + "balance_loss_clip": 1.05184078, + "balance_loss_mlp": 1.0291177, + "epoch": 0.28245055011122466, + "flos": 18259894281600.0, + "grad_norm": 1.626063197412419, + "language_loss": 0.844854, + "learning_rate": 3.368049023926071e-06, + "loss": 0.86693937, + "num_input_tokens_seen": 50353785, + "step": 2349, + "time_per_iteration": 2.479128360748291 + }, + { + "auxiliary_loss_clip": 0.01188413, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.05792475, + "balance_loss_mlp": 1.02733123, + "epoch": 0.2825707930018638, + "flos": 24608110504320.0, + "grad_norm": 1.59664924794735, + "language_loss": 0.83632267, + "learning_rate": 3.3674806922772476e-06, + "loss": 0.85856819, + "num_input_tokens_seen": 50374670, + "step": 2350, + "time_per_iteration": 2.5665299892425537 + }, + { + "auxiliary_loss_clip": 0.01166877, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.05348635, + "balance_loss_mlp": 1.02935576, + "epoch": 0.28269103589250283, + "flos": 25226994862080.0, + "grad_norm": 2.662647506288476, + "language_loss": 0.74989188, + "learning_rate": 3.3669121531826904e-06, + "loss": 0.77195144, + "num_input_tokens_seen": 50395650, + "step": 2351, + "time_per_iteration": 2.564743995666504 + }, + { + "auxiliary_loss_clip": 0.01158087, + "auxiliary_loss_mlp": 0.01034961, + "balance_loss_clip": 1.05798984, + "balance_loss_mlp": 1.02582979, + "epoch": 0.28281127878314194, + "flos": 19281552819840.0, + "grad_norm": 2.051276095864392, + "language_loss": 0.83562344, + "learning_rate": 3.366343406728647e-06, + "loss": 0.85755396, + "num_input_tokens_seen": 50415100, + "step": 2352, + "time_per_iteration": 2.530811071395874 + }, + { + "auxiliary_loss_clip": 0.01181252, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.05238128, + "balance_loss_mlp": 1.02364612, + "epoch": 0.28293152167378105, + "flos": 23878405710720.0, + "grad_norm": 1.712551968085562, + "language_loss": 0.68301731, + "learning_rate": 3.3657744530013946e-06, + "loss": 0.70516324, + "num_input_tokens_seen": 50434335, + "step": 2353, + "time_per_iteration": 2.4976794719696045 + }, + { + "auxiliary_loss_clip": 0.01192938, + "auxiliary_loss_mlp": 0.0103614, + "balance_loss_clip": 1.05834866, + "balance_loss_mlp": 1.02640617, + "epoch": 0.2830517645644201, + "flos": 43866965928960.0, + "grad_norm": 2.2259243436136313, + "language_loss": 0.71173358, + "learning_rate": 3.3652052920872437e-06, + "loss": 0.73402429, + "num_input_tokens_seen": 50457200, + "step": 2354, + "time_per_iteration": 2.686675548553467 + }, + { + "auxiliary_loss_clip": 0.01175803, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.05379176, + "balance_loss_mlp": 1.02487826, + "epoch": 0.2831720074550592, + "flos": 26651750803200.0, + "grad_norm": 1.8731973072616739, + "language_loss": 0.85396349, + "learning_rate": 3.3646359240725355e-06, + "loss": 0.87606943, + "num_input_tokens_seen": 50476390, + "step": 2355, + "time_per_iteration": 2.5436437129974365 + }, + { + "auxiliary_loss_clip": 0.01184295, + "auxiliary_loss_mlp": 0.00764471, + "balance_loss_clip": 1.05635715, + "balance_loss_mlp": 1.00076258, + "epoch": 0.2832922503456983, + "flos": 31029979564800.0, + "grad_norm": 1.9863561496636233, + "language_loss": 0.67752451, + "learning_rate": 3.364066349043643e-06, + "loss": 0.69701219, + "num_input_tokens_seen": 50497595, + "step": 2356, + "time_per_iteration": 2.6090378761291504 + }, + { + "auxiliary_loss_clip": 0.01171365, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.05309498, + "balance_loss_mlp": 1.02254117, + "epoch": 0.2834124932363374, + "flos": 20405699838720.0, + "grad_norm": 1.9082457955920311, + "language_loss": 0.82257456, + "learning_rate": 3.363496567086969e-06, + "loss": 0.844603, + "num_input_tokens_seen": 50514690, + "step": 2357, + "time_per_iteration": 2.549438714981079 + }, + { + "auxiliary_loss_clip": 0.01203717, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.05880666, + "balance_loss_mlp": 1.02102554, + "epoch": 0.2835327361269765, + "flos": 39384848056320.0, + "grad_norm": 1.7520129815472119, + "language_loss": 0.75845194, + "learning_rate": 3.3629265782889506e-06, + "loss": 0.78079152, + "num_input_tokens_seen": 50536515, + "step": 2358, + "time_per_iteration": 2.733365774154663 + }, + { + "auxiliary_loss_clip": 0.01154449, + "auxiliary_loss_mlp": 0.01034371, + "balance_loss_clip": 1.04967237, + "balance_loss_mlp": 1.02450085, + "epoch": 0.2836529790176156, + "flos": 30261598801920.0, + "grad_norm": 2.0592614301101393, + "language_loss": 0.71492195, + "learning_rate": 3.362356382736054e-06, + "loss": 0.73681021, + "num_input_tokens_seen": 50557120, + "step": 2359, + "time_per_iteration": 2.716064691543579 + }, + { + "auxiliary_loss_clip": 0.01157256, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.04826736, + "balance_loss_mlp": 1.01802742, + "epoch": 0.28377322190825466, + "flos": 12677796264960.0, + "grad_norm": 1.9449594632425184, + "language_loss": 0.9052217, + "learning_rate": 3.361785980514777e-06, + "loss": 0.92706347, + "num_input_tokens_seen": 50573320, + "step": 2360, + "time_per_iteration": 2.5039758682250977 + }, + { + "auxiliary_loss_clip": 0.01125939, + "auxiliary_loss_mlp": 0.01032348, + "balance_loss_clip": 1.05088794, + "balance_loss_mlp": 1.02340138, + "epoch": 0.28389346479889377, + "flos": 18296666830080.0, + "grad_norm": 1.839037386041354, + "language_loss": 0.76697838, + "learning_rate": 3.361215371711649e-06, + "loss": 0.78856128, + "num_input_tokens_seen": 50592415, + "step": 2361, + "time_per_iteration": 2.563485622406006 + }, + { + "auxiliary_loss_clip": 0.01153794, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.05213022, + "balance_loss_mlp": 1.02236831, + "epoch": 0.2840137076895329, + "flos": 20406992728320.0, + "grad_norm": 1.8870989081083962, + "language_loss": 0.83678472, + "learning_rate": 3.3606445564132326e-06, + "loss": 0.85863733, + "num_input_tokens_seen": 50609710, + "step": 2362, + "time_per_iteration": 3.290483236312866 + }, + { + "auxiliary_loss_clip": 0.01207105, + "auxiliary_loss_mlp": 0.00764491, + "balance_loss_clip": 1.06206274, + "balance_loss_mlp": 1.00075698, + "epoch": 0.28413395058017193, + "flos": 20048030161920.0, + "grad_norm": 2.0573964286343664, + "language_loss": 0.82072949, + "learning_rate": 3.360073534706118e-06, + "loss": 0.84044546, + "num_input_tokens_seen": 50626865, + "step": 2363, + "time_per_iteration": 2.463432550430298 + }, + { + "auxiliary_loss_clip": 0.01176163, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.05543804, + "balance_loss_mlp": 1.02130997, + "epoch": 0.28425419347081105, + "flos": 37663613256960.0, + "grad_norm": 2.4648114391340985, + "language_loss": 0.75899667, + "learning_rate": 3.35950230667693e-06, + "loss": 0.78106612, + "num_input_tokens_seen": 50648560, + "step": 2364, + "time_per_iteration": 2.653156280517578 + }, + { + "auxiliary_loss_clip": 0.0119154, + "auxiliary_loss_mlp": 0.01026469, + "balance_loss_clip": 1.05684626, + "balance_loss_mlp": 1.01753986, + "epoch": 0.28437443636145016, + "flos": 13845072539520.0, + "grad_norm": 2.2838904259630577, + "language_loss": 0.85940874, + "learning_rate": 3.358930872412323e-06, + "loss": 0.88158882, + "num_input_tokens_seen": 50665725, + "step": 2365, + "time_per_iteration": 2.4490907192230225 + }, + { + "auxiliary_loss_clip": 0.01187158, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.05815125, + "balance_loss_mlp": 1.02523088, + "epoch": 0.2844946792520892, + "flos": 22747794243840.0, + "grad_norm": 1.5849280438136113, + "language_loss": 0.80899012, + "learning_rate": 3.3583592319989825e-06, + "loss": 0.831213, + "num_input_tokens_seen": 50685095, + "step": 2366, + "time_per_iteration": 2.4878427982330322 + }, + { + "auxiliary_loss_clip": 0.01198923, + "auxiliary_loss_mlp": 0.01038479, + "balance_loss_clip": 1.05902076, + "balance_loss_mlp": 1.02863836, + "epoch": 0.2846149221427283, + "flos": 32415987709440.0, + "grad_norm": 2.2632231228337276, + "language_loss": 0.68940639, + "learning_rate": 3.357787385523627e-06, + "loss": 0.71178043, + "num_input_tokens_seen": 50706500, + "step": 2367, + "time_per_iteration": 3.3429088592529297 + }, + { + "auxiliary_loss_clip": 0.01140252, + "auxiliary_loss_mlp": 0.01032114, + "balance_loss_clip": 1.05168712, + "balance_loss_mlp": 1.02335203, + "epoch": 0.2847351650333674, + "flos": 28475976873600.0, + "grad_norm": 2.076632434329613, + "language_loss": 0.83044511, + "learning_rate": 3.3572153330730048e-06, + "loss": 0.85216868, + "num_input_tokens_seen": 50727595, + "step": 2368, + "time_per_iteration": 3.384120225906372 + }, + { + "auxiliary_loss_clip": 0.01071966, + "auxiliary_loss_mlp": 0.01006321, + "balance_loss_clip": 1.02223301, + "balance_loss_mlp": 1.00428224, + "epoch": 0.2848554079240065, + "flos": 55753399704960.0, + "grad_norm": 0.8317110730286299, + "language_loss": 0.64706755, + "learning_rate": 3.3566430747338956e-06, + "loss": 0.66785043, + "num_input_tokens_seen": 50782800, + "step": 2369, + "time_per_iteration": 2.964275598526001 + }, + { + "auxiliary_loss_clip": 0.01192055, + "auxiliary_loss_mlp": 0.01033791, + "balance_loss_clip": 1.0563854, + "balance_loss_mlp": 1.02415919, + "epoch": 0.2849756508146456, + "flos": 11836875985920.0, + "grad_norm": 2.0406870253047074, + "language_loss": 0.86709535, + "learning_rate": 3.35607061059311e-06, + "loss": 0.88935381, + "num_input_tokens_seen": 50797730, + "step": 2370, + "time_per_iteration": 3.1984758377075195 + }, + { + "auxiliary_loss_clip": 0.01202086, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.05846632, + "balance_loss_mlp": 1.02469707, + "epoch": 0.28509589370528465, + "flos": 25155209531520.0, + "grad_norm": 1.9125630942944392, + "language_loss": 0.74544221, + "learning_rate": 3.3554979407374917e-06, + "loss": 0.76779807, + "num_input_tokens_seen": 50819840, + "step": 2371, + "time_per_iteration": 2.489224672317505 + }, + { + "auxiliary_loss_clip": 0.01188056, + "auxiliary_loss_mlp": 0.01034657, + "balance_loss_clip": 1.05491257, + "balance_loss_mlp": 1.02541256, + "epoch": 0.28521613659592376, + "flos": 19974808287360.0, + "grad_norm": 1.721734563621852, + "language_loss": 0.73328078, + "learning_rate": 3.3549250652539134e-06, + "loss": 0.75550795, + "num_input_tokens_seen": 50838935, + "step": 2372, + "time_per_iteration": 2.4685988426208496 + }, + { + "auxiliary_loss_clip": 0.01173158, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.05278635, + "balance_loss_mlp": 1.0225333, + "epoch": 0.2853363794865629, + "flos": 23367971491200.0, + "grad_norm": 2.4514854571927676, + "language_loss": 0.81851482, + "learning_rate": 3.3543519842292794e-06, + "loss": 0.84057057, + "num_input_tokens_seen": 50858590, + "step": 2373, + "time_per_iteration": 2.5097877979278564 + }, + { + "auxiliary_loss_clip": 0.01205025, + "auxiliary_loss_mlp": 0.00764181, + "balance_loss_clip": 1.05952501, + "balance_loss_mlp": 1.00074458, + "epoch": 0.28545662237720193, + "flos": 19861940776320.0, + "grad_norm": 1.7079465311882895, + "language_loss": 0.83655947, + "learning_rate": 3.353778697750527e-06, + "loss": 0.85625154, + "num_input_tokens_seen": 50876995, + "step": 2374, + "time_per_iteration": 2.4435722827911377 + }, + { + "auxiliary_loss_clip": 0.01165261, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.05217087, + "balance_loss_mlp": 1.02255905, + "epoch": 0.28557686526784104, + "flos": 23879016241920.0, + "grad_norm": 1.6266394554552952, + "language_loss": 0.89321744, + "learning_rate": 3.353205205904622e-06, + "loss": 0.91518819, + "num_input_tokens_seen": 50896105, + "step": 2375, + "time_per_iteration": 2.5227699279785156 + }, + { + "auxiliary_loss_clip": 0.0117461, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.05547249, + "balance_loss_mlp": 1.02280378, + "epoch": 0.28569710815848015, + "flos": 44890384233600.0, + "grad_norm": 2.0680721363398544, + "language_loss": 0.71972942, + "learning_rate": 3.3526315087785637e-06, + "loss": 0.74179435, + "num_input_tokens_seen": 50917220, + "step": 2376, + "time_per_iteration": 2.718351125717163 + }, + { + "auxiliary_loss_clip": 0.01124767, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.04895198, + "balance_loss_mlp": 1.02339387, + "epoch": 0.2858173510491192, + "flos": 26829759628800.0, + "grad_norm": 1.8267009517060295, + "language_loss": 0.80833274, + "learning_rate": 3.3520576064593805e-06, + "loss": 0.82990664, + "num_input_tokens_seen": 50937175, + "step": 2377, + "time_per_iteration": 2.5934243202209473 + }, + { + "auxiliary_loss_clip": 0.01192052, + "auxiliary_loss_mlp": 0.01027449, + "balance_loss_clip": 1.05739677, + "balance_loss_mlp": 1.01804912, + "epoch": 0.2859375939397583, + "flos": 23148916398720.0, + "grad_norm": 1.4718260084042614, + "language_loss": 0.81620139, + "learning_rate": 3.3514834990341337e-06, + "loss": 0.83839643, + "num_input_tokens_seen": 50957500, + "step": 2378, + "time_per_iteration": 2.6286611557006836 + }, + { + "auxiliary_loss_clip": 0.01180746, + "auxiliary_loss_mlp": 0.01030944, + "balance_loss_clip": 1.0572027, + "balance_loss_mlp": 1.0221225, + "epoch": 0.2860578368303974, + "flos": 12129799397760.0, + "grad_norm": 3.5772276685011826, + "language_loss": 0.92890531, + "learning_rate": 3.3509091865899144e-06, + "loss": 0.95102221, + "num_input_tokens_seen": 50972690, + "step": 2379, + "time_per_iteration": 2.5246775150299072 + }, + { + "auxiliary_loss_clip": 0.01202573, + "auxiliary_loss_mlp": 0.01034176, + "balance_loss_clip": 1.05735791, + "balance_loss_mlp": 1.02453756, + "epoch": 0.2861780797210365, + "flos": 19938035738880.0, + "grad_norm": 1.8994286832307044, + "language_loss": 0.70686579, + "learning_rate": 3.350334669213846e-06, + "loss": 0.72923326, + "num_input_tokens_seen": 50990095, + "step": 2380, + "time_per_iteration": 2.4332733154296875 + }, + { + "auxiliary_loss_clip": 0.01185044, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.05670452, + "balance_loss_mlp": 1.02632594, + "epoch": 0.2862983226116756, + "flos": 27563127609600.0, + "grad_norm": 2.0322494598270175, + "language_loss": 0.7583915, + "learning_rate": 3.3497599469930816e-06, + "loss": 0.7805931, + "num_input_tokens_seen": 51008305, + "step": 2381, + "time_per_iteration": 2.5757036209106445 + }, + { + "auxiliary_loss_clip": 0.01202703, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.05606604, + "balance_loss_mlp": 1.02051616, + "epoch": 0.28641856550231465, + "flos": 22053964158720.0, + "grad_norm": 2.2027389898339664, + "language_loss": 0.83262545, + "learning_rate": 3.349185020014807e-06, + "loss": 0.85494882, + "num_input_tokens_seen": 51025570, + "step": 2382, + "time_per_iteration": 2.435563325881958 + }, + { + "auxiliary_loss_clip": 0.01189833, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.05472064, + "balance_loss_mlp": 1.02350509, + "epoch": 0.28653880839295376, + "flos": 22378775869440.0, + "grad_norm": 1.8373407040862622, + "language_loss": 0.74476016, + "learning_rate": 3.348609888366237e-06, + "loss": 0.76698279, + "num_input_tokens_seen": 51044585, + "step": 2383, + "time_per_iteration": 2.475032329559326 + }, + { + "auxiliary_loss_clip": 0.01124766, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.04806566, + "balance_loss_mlp": 1.01788831, + "epoch": 0.28665905128359287, + "flos": 23367971491200.0, + "grad_norm": 6.2501423357386745, + "language_loss": 0.62811613, + "learning_rate": 3.348034552134619e-06, + "loss": 0.64964169, + "num_input_tokens_seen": 51063990, + "step": 2384, + "time_per_iteration": 2.6175882816314697 + }, + { + "auxiliary_loss_clip": 0.01138402, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.05255008, + "balance_loss_mlp": 1.02627254, + "epoch": 0.2867792941742319, + "flos": 20881695893760.0, + "grad_norm": 2.126472579452702, + "language_loss": 0.84336251, + "learning_rate": 3.3474590114072316e-06, + "loss": 0.86509728, + "num_input_tokens_seen": 51081990, + "step": 2385, + "time_per_iteration": 2.5451855659484863 + }, + { + "auxiliary_loss_clip": 0.01156447, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.05498612, + "balance_loss_mlp": 1.0259198, + "epoch": 0.28689953706487104, + "flos": 20664005518080.0, + "grad_norm": 2.282412868540887, + "language_loss": 0.82925087, + "learning_rate": 3.3468832662713836e-06, + "loss": 0.8511709, + "num_input_tokens_seen": 51100235, + "step": 2386, + "time_per_iteration": 2.5391006469726562 + }, + { + "auxiliary_loss_clip": 0.01154314, + "auxiliary_loss_mlp": 0.01038903, + "balance_loss_clip": 1.05312598, + "balance_loss_mlp": 1.02952719, + "epoch": 0.28701977995551015, + "flos": 12675533708160.0, + "grad_norm": 2.0967116487847615, + "language_loss": 0.83800143, + "learning_rate": 3.346307316814415e-06, + "loss": 0.85993361, + "num_input_tokens_seen": 51115405, + "step": 2387, + "time_per_iteration": 2.484759569168091 + }, + { + "auxiliary_loss_clip": 0.01188587, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.05720258, + "balance_loss_mlp": 1.02059257, + "epoch": 0.2871400228461492, + "flos": 21252366293760.0, + "grad_norm": 6.247691494833906, + "language_loss": 0.75458324, + "learning_rate": 3.3457311631236965e-06, + "loss": 0.77677011, + "num_input_tokens_seen": 51136390, + "step": 2388, + "time_per_iteration": 3.260347604751587 + }, + { + "auxiliary_loss_clip": 0.01158991, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.05095267, + "balance_loss_mlp": 1.02260816, + "epoch": 0.2872602657367883, + "flos": 25119262995840.0, + "grad_norm": 1.6998938477906913, + "language_loss": 0.84057903, + "learning_rate": 3.345154805286631e-06, + "loss": 0.86248535, + "num_input_tokens_seen": 51156650, + "step": 2389, + "time_per_iteration": 2.541138172149658 + }, + { + "auxiliary_loss_clip": 0.01181289, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.053689, + "balance_loss_mlp": 1.02621877, + "epoch": 0.2873805086274274, + "flos": 16646606830080.0, + "grad_norm": 2.5275943057213, + "language_loss": 0.76285869, + "learning_rate": 3.344578243390651e-06, + "loss": 0.78502911, + "num_input_tokens_seen": 51172210, + "step": 2390, + "time_per_iteration": 2.4250078201293945 + }, + { + "auxiliary_loss_clip": 0.01168014, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.05394089, + "balance_loss_mlp": 1.02191937, + "epoch": 0.2875007515180665, + "flos": 17420123237760.0, + "grad_norm": 2.0965725866060714, + "language_loss": 0.78556859, + "learning_rate": 3.3440014775232206e-06, + "loss": 0.8075664, + "num_input_tokens_seen": 51190265, + "step": 2391, + "time_per_iteration": 2.462538480758667 + }, + { + "auxiliary_loss_clip": 0.01159219, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.05179751, + "balance_loss_mlp": 1.02099848, + "epoch": 0.2876209944087056, + "flos": 23434190213760.0, + "grad_norm": 1.8874084205534576, + "language_loss": 0.71218908, + "learning_rate": 3.343424507771834e-06, + "loss": 0.73407376, + "num_input_tokens_seen": 51208475, + "step": 2392, + "time_per_iteration": 2.5912468433380127 + }, + { + "auxiliary_loss_clip": 0.01153126, + "auxiliary_loss_mlp": 0.01029177, + "balance_loss_clip": 1.05061436, + "balance_loss_mlp": 1.02033138, + "epoch": 0.2877412372993447, + "flos": 13735509079680.0, + "grad_norm": 1.7834454548168148, + "language_loss": 0.86143434, + "learning_rate": 3.342847334224018e-06, + "loss": 0.88325739, + "num_input_tokens_seen": 51225875, + "step": 2393, + "time_per_iteration": 3.218336343765259 + }, + { + "auxiliary_loss_clip": 0.01087164, + "auxiliary_loss_mlp": 0.01002086, + "balance_loss_clip": 1.01973295, + "balance_loss_mlp": 1.00007176, + "epoch": 0.28786148018998375, + "flos": 58079695104000.0, + "grad_norm": 0.9478263594478998, + "language_loss": 0.62366164, + "learning_rate": 3.342269956967329e-06, + "loss": 0.64455414, + "num_input_tokens_seen": 51287780, + "step": 2394, + "time_per_iteration": 3.8577001094818115 + }, + { + "auxiliary_loss_clip": 0.01190933, + "auxiliary_loss_mlp": 0.01033346, + "balance_loss_clip": 1.05706334, + "balance_loss_mlp": 1.02281404, + "epoch": 0.28798172308062286, + "flos": 23435052140160.0, + "grad_norm": 3.073057445212874, + "language_loss": 0.71450794, + "learning_rate": 3.341692376089355e-06, + "loss": 0.73675072, + "num_input_tokens_seen": 51303335, + "step": 2395, + "time_per_iteration": 2.4864211082458496 + }, + { + "auxiliary_loss_clip": 0.01184768, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.0566386, + "balance_loss_mlp": 1.02525806, + "epoch": 0.288101965971262, + "flos": 25110033200640.0, + "grad_norm": 3.372009950856185, + "language_loss": 0.83958256, + "learning_rate": 3.3411145916777146e-06, + "loss": 0.86177325, + "num_input_tokens_seen": 51317495, + "step": 2396, + "time_per_iteration": 3.1827924251556396 + }, + { + "auxiliary_loss_clip": 0.01163528, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.050668, + "balance_loss_mlp": 1.02171111, + "epoch": 0.28822220886190103, + "flos": 16252559654400.0, + "grad_norm": 2.160618640793434, + "language_loss": 0.9046061, + "learning_rate": 3.3405366038200566e-06, + "loss": 0.9265554, + "num_input_tokens_seen": 51336430, + "step": 2397, + "time_per_iteration": 2.4871184825897217 + }, + { + "auxiliary_loss_clip": 0.01175065, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.05854273, + "balance_loss_mlp": 1.02802932, + "epoch": 0.28834245175254014, + "flos": 24535642815360.0, + "grad_norm": 2.147584468957998, + "language_loss": 0.84940994, + "learning_rate": 3.3399584126040617e-06, + "loss": 0.87153697, + "num_input_tokens_seen": 51355930, + "step": 2398, + "time_per_iteration": 2.539724588394165 + }, + { + "auxiliary_loss_clip": 0.01199822, + "auxiliary_loss_mlp": 0.00763119, + "balance_loss_clip": 1.05635023, + "balance_loss_mlp": 1.00065255, + "epoch": 0.2884626946431792, + "flos": 24571445696640.0, + "grad_norm": 1.7711798751449048, + "language_loss": 0.90402812, + "learning_rate": 3.339380018117441e-06, + "loss": 0.92365754, + "num_input_tokens_seen": 51376765, + "step": 2399, + "time_per_iteration": 2.4872472286224365 + }, + { + "auxiliary_loss_clip": 0.01182608, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.05494773, + "balance_loss_mlp": 1.02077985, + "epoch": 0.2885829375338183, + "flos": 16544657053440.0, + "grad_norm": 2.820666522739627, + "language_loss": 0.78235555, + "learning_rate": 3.3388014204479366e-06, + "loss": 0.80447847, + "num_input_tokens_seen": 51394570, + "step": 2400, + "time_per_iteration": 2.4397788047790527 + }, + { + "auxiliary_loss_clip": 0.01204086, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.05814958, + "balance_loss_mlp": 1.02138853, + "epoch": 0.2887031804244574, + "flos": 24061226958720.0, + "grad_norm": 2.1151192650512076, + "language_loss": 0.91268396, + "learning_rate": 3.338222619683321e-06, + "loss": 0.93502533, + "num_input_tokens_seen": 51414535, + "step": 2401, + "time_per_iteration": 2.4566564559936523 + }, + { + "auxiliary_loss_clip": 0.01173129, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.05269945, + "balance_loss_mlp": 1.02148986, + "epoch": 0.2888234233150965, + "flos": 23330696152320.0, + "grad_norm": 2.5441735263022762, + "language_loss": 0.73238444, + "learning_rate": 3.337643615911398e-06, + "loss": 0.75442719, + "num_input_tokens_seen": 51434160, + "step": 2402, + "time_per_iteration": 2.5001654624938965 + }, + { + "auxiliary_loss_clip": 0.0118528, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.05256808, + "balance_loss_mlp": 1.01757169, + "epoch": 0.2889436662057356, + "flos": 22272767856000.0, + "grad_norm": 2.080043123679177, + "language_loss": 0.78807032, + "learning_rate": 3.3370644092200026e-06, + "loss": 0.81019437, + "num_input_tokens_seen": 51451435, + "step": 2403, + "time_per_iteration": 2.466914653778076 + }, + { + "auxiliary_loss_clip": 0.01142025, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.04685938, + "balance_loss_mlp": 1.02276111, + "epoch": 0.2890639090963747, + "flos": 21616931381760.0, + "grad_norm": 2.067465751089719, + "language_loss": 0.78036833, + "learning_rate": 3.3364849996969985e-06, + "loss": 0.80210906, + "num_input_tokens_seen": 51471455, + "step": 2404, + "time_per_iteration": 2.534834146499634 + }, + { + "auxiliary_loss_clip": 0.01182525, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.05438113, + "balance_loss_mlp": 1.02097785, + "epoch": 0.28918415198701375, + "flos": 28585540333440.0, + "grad_norm": 2.040191263217132, + "language_loss": 0.85362363, + "learning_rate": 3.335905387430283e-06, + "loss": 0.87574863, + "num_input_tokens_seen": 51492890, + "step": 2405, + "time_per_iteration": 2.516786575317383 + }, + { + "auxiliary_loss_clip": 0.01172375, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.05089033, + "balance_loss_mlp": 1.01836467, + "epoch": 0.28930439487765286, + "flos": 21944688007680.0, + "grad_norm": 1.8842489435204612, + "language_loss": 0.82923317, + "learning_rate": 3.335325572507782e-06, + "loss": 0.8512336, + "num_input_tokens_seen": 51513390, + "step": 2406, + "time_per_iteration": 2.5143544673919678 + }, + { + "auxiliary_loss_clip": 0.01204003, + "auxiliary_loss_mlp": 0.00763511, + "balance_loss_clip": 1.06050014, + "balance_loss_mlp": 1.00065541, + "epoch": 0.28942463776829197, + "flos": 19281911955840.0, + "grad_norm": 1.6079823416928278, + "language_loss": 0.74296784, + "learning_rate": 3.3347455550174537e-06, + "loss": 0.76264292, + "num_input_tokens_seen": 51532730, + "step": 2407, + "time_per_iteration": 2.5010061264038086 + }, + { + "auxiliary_loss_clip": 0.01153183, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.04938626, + "balance_loss_mlp": 1.02143025, + "epoch": 0.289544880658931, + "flos": 14645700737280.0, + "grad_norm": 2.1852375363036782, + "language_loss": 0.68211901, + "learning_rate": 3.3341653350472864e-06, + "loss": 0.70396316, + "num_input_tokens_seen": 51549560, + "step": 2408, + "time_per_iteration": 2.498410701751709 + }, + { + "auxiliary_loss_clip": 0.01208388, + "auxiliary_loss_mlp": 0.01027048, + "balance_loss_clip": 1.05733418, + "balance_loss_mlp": 1.0171833, + "epoch": 0.28966512354957014, + "flos": 28621881918720.0, + "grad_norm": 2.6711812491681237, + "language_loss": 0.69099116, + "learning_rate": 3.333584912685298e-06, + "loss": 0.71334553, + "num_input_tokens_seen": 51568180, + "step": 2409, + "time_per_iteration": 2.4798927307128906 + }, + { + "auxiliary_loss_clip": 0.01067134, + "auxiliary_loss_mlp": 0.0100317, + "balance_loss_clip": 1.02252793, + "balance_loss_mlp": 1.00129879, + "epoch": 0.28978536644020925, + "flos": 64711784511360.0, + "grad_norm": 0.8904835565904227, + "language_loss": 0.55542809, + "learning_rate": 3.3330042880195385e-06, + "loss": 0.57613111, + "num_input_tokens_seen": 51622530, + "step": 2410, + "time_per_iteration": 3.067418336868286 + }, + { + "auxiliary_loss_clip": 0.01169426, + "auxiliary_loss_mlp": 0.01027609, + "balance_loss_clip": 1.0516293, + "balance_loss_mlp": 1.01873362, + "epoch": 0.2899056093308483, + "flos": 18624638937600.0, + "grad_norm": 2.445098259490908, + "language_loss": 0.78262663, + "learning_rate": 3.3324234611380888e-06, + "loss": 0.80459702, + "num_input_tokens_seen": 51641260, + "step": 2411, + "time_per_iteration": 2.5085813999176025 + }, + { + "auxiliary_loss_clip": 0.01150407, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.05189574, + "balance_loss_mlp": 1.02313352, + "epoch": 0.2900258522214874, + "flos": 22893735202560.0, + "grad_norm": 1.7640510579060784, + "language_loss": 0.81574023, + "learning_rate": 3.3318424321290596e-06, + "loss": 0.83756506, + "num_input_tokens_seen": 51660975, + "step": 2412, + "time_per_iteration": 2.5389678478240967 + }, + { + "auxiliary_loss_clip": 0.0106707, + "auxiliary_loss_mlp": 0.0100912, + "balance_loss_clip": 1.02073026, + "balance_loss_mlp": 1.00730848, + "epoch": 0.2901460951121265, + "flos": 71106036013440.0, + "grad_norm": 0.8393681495604112, + "language_loss": 0.60000694, + "learning_rate": 3.3312612010805917e-06, + "loss": 0.62076885, + "num_input_tokens_seen": 51720550, + "step": 2413, + "time_per_iteration": 3.1584129333496094 + }, + { + "auxiliary_loss_clip": 0.01158673, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.05110264, + "balance_loss_mlp": 1.02304339, + "epoch": 0.2902663380027656, + "flos": 32160986081280.0, + "grad_norm": 1.9813355825465908, + "language_loss": 0.69914615, + "learning_rate": 3.330679768080858e-06, + "loss": 0.72105432, + "num_input_tokens_seen": 51744435, + "step": 2414, + "time_per_iteration": 2.568317174911499 + }, + { + "auxiliary_loss_clip": 0.01185813, + "auxiliary_loss_mlp": 0.01034507, + "balance_loss_clip": 1.05795217, + "balance_loss_mlp": 1.02489233, + "epoch": 0.2903865808934047, + "flos": 29351658539520.0, + "grad_norm": 2.099278729551678, + "language_loss": 0.83380932, + "learning_rate": 3.3300981332180627e-06, + "loss": 0.85601258, + "num_input_tokens_seen": 51763640, + "step": 2415, + "time_per_iteration": 3.2428994178771973 + }, + { + "auxiliary_loss_clip": 0.01162258, + "auxiliary_loss_mlp": 0.01029987, + "balance_loss_clip": 1.05289888, + "balance_loss_mlp": 1.02132022, + "epoch": 0.29050682378404374, + "flos": 17089026647040.0, + "grad_norm": 2.1817272513231165, + "language_loss": 0.80006099, + "learning_rate": 3.3295162965804373e-06, + "loss": 0.8219834, + "num_input_tokens_seen": 51782135, + "step": 2416, + "time_per_iteration": 2.509383201599121 + }, + { + "auxiliary_loss_clip": 0.01155466, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.05495715, + "balance_loss_mlp": 1.01898742, + "epoch": 0.29062706667468285, + "flos": 17858233422720.0, + "grad_norm": 2.4608281694997527, + "language_loss": 0.78774983, + "learning_rate": 3.328934258256247e-06, + "loss": 0.80958503, + "num_input_tokens_seen": 51800200, + "step": 2417, + "time_per_iteration": 2.506577968597412 + }, + { + "auxiliary_loss_clip": 0.01185753, + "auxiliary_loss_mlp": 0.01027793, + "balance_loss_clip": 1.05353189, + "balance_loss_mlp": 1.01773751, + "epoch": 0.29074730956532197, + "flos": 24279815174400.0, + "grad_norm": 2.573442890855167, + "language_loss": 0.67055053, + "learning_rate": 3.3283520183337856e-06, + "loss": 0.69268596, + "num_input_tokens_seen": 51819905, + "step": 2418, + "time_per_iteration": 2.481616973876953 + }, + { + "auxiliary_loss_clip": 0.01167264, + "auxiliary_loss_mlp": 0.01029814, + "balance_loss_clip": 1.05187654, + "balance_loss_mlp": 1.02064085, + "epoch": 0.290867552455961, + "flos": 22340961826560.0, + "grad_norm": 1.6560616401151063, + "language_loss": 0.6911037, + "learning_rate": 3.3277695769013797e-06, + "loss": 0.71307456, + "num_input_tokens_seen": 51839350, + "step": 2419, + "time_per_iteration": 2.4936470985412598 + }, + { + "auxiliary_loss_clip": 0.01188156, + "auxiliary_loss_mlp": 0.01027219, + "balance_loss_clip": 1.0560329, + "balance_loss_mlp": 1.01793885, + "epoch": 0.29098779534660013, + "flos": 23186155824000.0, + "grad_norm": 2.025361220275039, + "language_loss": 0.77087486, + "learning_rate": 3.327186934047385e-06, + "loss": 0.79302859, + "num_input_tokens_seen": 51858045, + "step": 2420, + "time_per_iteration": 2.4886300563812256 + }, + { + "auxiliary_loss_clip": 0.01159085, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.04741573, + "balance_loss_mlp": 1.0253669, + "epoch": 0.29110803823723924, + "flos": 15304194817920.0, + "grad_norm": 2.0242228397918463, + "language_loss": 0.65506786, + "learning_rate": 3.3266040898601877e-06, + "loss": 0.6770044, + "num_input_tokens_seen": 51875880, + "step": 2421, + "time_per_iteration": 4.001987934112549 + }, + { + "auxiliary_loss_clip": 0.01136219, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.04722714, + "balance_loss_mlp": 1.02362967, + "epoch": 0.2912282811278783, + "flos": 22595352923520.0, + "grad_norm": 1.9998348394439445, + "language_loss": 0.77803397, + "learning_rate": 3.3260210444282045e-06, + "loss": 0.79972601, + "num_input_tokens_seen": 51893835, + "step": 2422, + "time_per_iteration": 2.578239917755127 + }, + { + "auxiliary_loss_clip": 0.0118118, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.05574489, + "balance_loss_mlp": 1.02476788, + "epoch": 0.2913485240185174, + "flos": 24497900599680.0, + "grad_norm": 3.9419091331223415, + "language_loss": 0.73035151, + "learning_rate": 3.325437797839883e-06, + "loss": 0.7525, + "num_input_tokens_seen": 51912205, + "step": 2423, + "time_per_iteration": 3.264910936355591 + }, + { + "auxiliary_loss_clip": 0.01199425, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.05534387, + "balance_loss_mlp": 1.02215767, + "epoch": 0.2914687669091565, + "flos": 17931024334080.0, + "grad_norm": 2.516497581512564, + "language_loss": 0.74583304, + "learning_rate": 3.3248543501837015e-06, + "loss": 0.76814127, + "num_input_tokens_seen": 51929410, + "step": 2424, + "time_per_iteration": 2.400017499923706 + }, + { + "auxiliary_loss_clip": 0.01147579, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.05279946, + "balance_loss_mlp": 1.0221076, + "epoch": 0.2915890097997956, + "flos": 22529313768960.0, + "grad_norm": 1.8444977398646434, + "language_loss": 0.77217901, + "learning_rate": 3.3242707015481684e-06, + "loss": 0.79396695, + "num_input_tokens_seen": 51949345, + "step": 2425, + "time_per_iteration": 2.575032949447632 + }, + { + "auxiliary_loss_clip": 0.01167607, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.04899824, + "balance_loss_mlp": 1.02629495, + "epoch": 0.2917092526904347, + "flos": 13845216193920.0, + "grad_norm": 2.03106254367417, + "language_loss": 0.8060813, + "learning_rate": 3.323686852021823e-06, + "loss": 0.8281073, + "num_input_tokens_seen": 51966855, + "step": 2426, + "time_per_iteration": 2.4820852279663086 + }, + { + "auxiliary_loss_clip": 0.01158322, + "auxiliary_loss_mlp": 0.01029025, + "balance_loss_clip": 1.04836667, + "balance_loss_mlp": 1.01972079, + "epoch": 0.2918294955810738, + "flos": 22674859678080.0, + "grad_norm": 1.9102581675735015, + "language_loss": 0.79625994, + "learning_rate": 3.323102801693235e-06, + "loss": 0.81813347, + "num_input_tokens_seen": 51985620, + "step": 2427, + "time_per_iteration": 2.542752504348755 + }, + { + "auxiliary_loss_clip": 0.01178878, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.05184543, + "balance_loss_mlp": 1.02142119, + "epoch": 0.29194973847171285, + "flos": 23438284364160.0, + "grad_norm": 3.796897234784888, + "language_loss": 0.80358291, + "learning_rate": 3.322518550651003e-06, + "loss": 0.82567942, + "num_input_tokens_seen": 52004930, + "step": 2428, + "time_per_iteration": 2.4942822456359863 + }, + { + "auxiliary_loss_clip": 0.01177106, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.05161238, + "balance_loss_mlp": 1.02413464, + "epoch": 0.29206998136235196, + "flos": 21909064694400.0, + "grad_norm": 2.062836124093299, + "language_loss": 0.81219769, + "learning_rate": 3.3219340989837586e-06, + "loss": 0.8343029, + "num_input_tokens_seen": 52024920, + "step": 2429, + "time_per_iteration": 2.5155718326568604 + }, + { + "auxiliary_loss_clip": 0.01170017, + "auxiliary_loss_mlp": 0.01031133, + "balance_loss_clip": 1.05379033, + "balance_loss_mlp": 1.02236485, + "epoch": 0.292190224252991, + "flos": 23215925220480.0, + "grad_norm": 1.8657792607064143, + "language_loss": 0.80631399, + "learning_rate": 3.3213494467801625e-06, + "loss": 0.82832551, + "num_input_tokens_seen": 52044095, + "step": 2430, + "time_per_iteration": 2.5029332637786865 + }, + { + "auxiliary_loss_clip": 0.01113482, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.04419184, + "balance_loss_mlp": 1.01952481, + "epoch": 0.2923104671436301, + "flos": 20740818752640.0, + "grad_norm": 2.553169053963252, + "language_loss": 0.71240121, + "learning_rate": 3.3207645941289063e-06, + "loss": 0.73382366, + "num_input_tokens_seen": 52062440, + "step": 2431, + "time_per_iteration": 2.6259143352508545 + }, + { + "auxiliary_loss_clip": 0.01187137, + "auxiliary_loss_mlp": 0.00763855, + "balance_loss_clip": 1.05796981, + "balance_loss_mlp": 1.00060785, + "epoch": 0.29243071003426924, + "flos": 35809114999680.0, + "grad_norm": 1.8435931835345114, + "language_loss": 0.79985166, + "learning_rate": 3.320179541118711e-06, + "loss": 0.81936157, + "num_input_tokens_seen": 52084940, + "step": 2432, + "time_per_iteration": 2.5962798595428467 + }, + { + "auxiliary_loss_clip": 0.01087847, + "auxiliary_loss_mlp": 0.01004804, + "balance_loss_clip": 1.02000499, + "balance_loss_mlp": 1.00290823, + "epoch": 0.2925509529249083, + "flos": 58081598524800.0, + "grad_norm": 1.0183191325945622, + "language_loss": 0.60319144, + "learning_rate": 3.3195942878383293e-06, + "loss": 0.62411791, + "num_input_tokens_seen": 52141040, + "step": 2433, + "time_per_iteration": 3.0756001472473145 + }, + { + "auxiliary_loss_clip": 0.01188656, + "auxiliary_loss_mlp": 0.01034915, + "balance_loss_clip": 1.05596089, + "balance_loss_mlp": 1.02484787, + "epoch": 0.2926711958155474, + "flos": 21397122103680.0, + "grad_norm": 2.610221029038429, + "language_loss": 0.78121299, + "learning_rate": 3.319008834376543e-06, + "loss": 0.80344868, + "num_input_tokens_seen": 52160730, + "step": 2434, + "time_per_iteration": 2.4809672832489014 + }, + { + "auxiliary_loss_clip": 0.01159963, + "auxiliary_loss_mlp": 0.01027331, + "balance_loss_clip": 1.0470376, + "balance_loss_mlp": 1.01846218, + "epoch": 0.2927914387061865, + "flos": 23185796688000.0, + "grad_norm": 4.66601792487937, + "language_loss": 0.88483757, + "learning_rate": 3.3184231808221654e-06, + "loss": 0.90671051, + "num_input_tokens_seen": 52175055, + "step": 2435, + "time_per_iteration": 2.548916816711426 + }, + { + "auxiliary_loss_clip": 0.01157416, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.05254483, + "balance_loss_mlp": 1.02547455, + "epoch": 0.29291168159682557, + "flos": 22455553190400.0, + "grad_norm": 2.6976111841813077, + "language_loss": 0.62746024, + "learning_rate": 3.3178373272640394e-06, + "loss": 0.64938474, + "num_input_tokens_seen": 52194150, + "step": 2436, + "time_per_iteration": 2.523909330368042 + }, + { + "auxiliary_loss_clip": 0.01197028, + "auxiliary_loss_mlp": 0.01033638, + "balance_loss_clip": 1.05518866, + "balance_loss_mlp": 1.02475691, + "epoch": 0.2930319244874647, + "flos": 21170632896000.0, + "grad_norm": 2.7410208047822238, + "language_loss": 0.85211843, + "learning_rate": 3.3172512737910387e-06, + "loss": 0.87442517, + "num_input_tokens_seen": 52211660, + "step": 2437, + "time_per_iteration": 2.43072772026062 + }, + { + "auxiliary_loss_clip": 0.01185492, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.0524925, + "balance_loss_mlp": 1.0226891, + "epoch": 0.2931521673781038, + "flos": 31357843931520.0, + "grad_norm": 2.2633635206784173, + "language_loss": 0.88324749, + "learning_rate": 3.3166650204920674e-06, + "loss": 0.90542078, + "num_input_tokens_seen": 52232830, + "step": 2438, + "time_per_iteration": 2.5341997146606445 + }, + { + "auxiliary_loss_clip": 0.01185836, + "auxiliary_loss_mlp": 0.01033199, + "balance_loss_clip": 1.05614364, + "balance_loss_mlp": 1.0235312, + "epoch": 0.29327241026874284, + "flos": 24200990778240.0, + "grad_norm": 1.6396838622430097, + "language_loss": 0.81626642, + "learning_rate": 3.316078567456059e-06, + "loss": 0.83845669, + "num_input_tokens_seen": 52250670, + "step": 2439, + "time_per_iteration": 2.482994794845581 + }, + { + "auxiliary_loss_clip": 0.01132853, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.04879665, + "balance_loss_mlp": 1.017887, + "epoch": 0.29339265315938196, + "flos": 24242611662720.0, + "grad_norm": 2.9745805649260624, + "language_loss": 0.75868368, + "learning_rate": 3.3154919147719786e-06, + "loss": 0.78027582, + "num_input_tokens_seen": 52271685, + "step": 2440, + "time_per_iteration": 2.6047627925872803 + }, + { + "auxiliary_loss_clip": 0.01185665, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.05426598, + "balance_loss_mlp": 1.02130032, + "epoch": 0.29351289605002107, + "flos": 16946641134720.0, + "grad_norm": 2.0207683883778484, + "language_loss": 0.862607, + "learning_rate": 3.31490506252882e-06, + "loss": 0.88476968, + "num_input_tokens_seen": 52291065, + "step": 2441, + "time_per_iteration": 2.541123628616333 + }, + { + "auxiliary_loss_clip": 0.01145731, + "auxiliary_loss_mlp": 0.01030766, + "balance_loss_clip": 1.04526687, + "balance_loss_mlp": 1.02228427, + "epoch": 0.2936331389406601, + "flos": 19829082810240.0, + "grad_norm": 2.0940847438779895, + "language_loss": 0.84270442, + "learning_rate": 3.31431801081561e-06, + "loss": 0.86446941, + "num_input_tokens_seen": 52310000, + "step": 2442, + "time_per_iteration": 3.254779577255249 + }, + { + "auxiliary_loss_clip": 0.01068838, + "auxiliary_loss_mlp": 0.01005439, + "balance_loss_clip": 1.01782298, + "balance_loss_mlp": 1.00350785, + "epoch": 0.29375338183129923, + "flos": 71416844398080.0, + "grad_norm": 0.9140513116593686, + "language_loss": 0.67910665, + "learning_rate": 3.313730759721402e-06, + "loss": 0.69984943, + "num_input_tokens_seen": 52372930, + "step": 2443, + "time_per_iteration": 3.166480302810669 + }, + { + "auxiliary_loss_clip": 0.01169043, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.05354297, + "balance_loss_mlp": 1.02890682, + "epoch": 0.29387362472193834, + "flos": 22054502862720.0, + "grad_norm": 4.239531646022478, + "language_loss": 0.86321497, + "learning_rate": 3.313143309335282e-06, + "loss": 0.88529378, + "num_input_tokens_seen": 52391420, + "step": 2444, + "time_per_iteration": 2.508636236190796 + }, + { + "auxiliary_loss_clip": 0.01158592, + "auxiliary_loss_mlp": 0.01031725, + "balance_loss_clip": 1.05544329, + "balance_loss_mlp": 1.02227807, + "epoch": 0.2939938676125774, + "flos": 22966418373120.0, + "grad_norm": 1.7214011247876329, + "language_loss": 0.84558386, + "learning_rate": 3.3125556597463665e-06, + "loss": 0.86748701, + "num_input_tokens_seen": 52410725, + "step": 2445, + "time_per_iteration": 2.5342917442321777 + }, + { + "auxiliary_loss_clip": 0.01185201, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.05806446, + "balance_loss_mlp": 1.0255096, + "epoch": 0.2941141105032165, + "flos": 31358705857920.0, + "grad_norm": 1.807370251725048, + "language_loss": 0.66127098, + "learning_rate": 3.311967811043801e-06, + "loss": 0.68346858, + "num_input_tokens_seen": 52432645, + "step": 2446, + "time_per_iteration": 2.554311990737915 + }, + { + "auxiliary_loss_clip": 0.01185955, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.05592465, + "balance_loss_mlp": 1.02110493, + "epoch": 0.29423435339385556, + "flos": 23222138273280.0, + "grad_norm": 2.7310824779178327, + "language_loss": 0.82088995, + "learning_rate": 3.3113797633167617e-06, + "loss": 0.84305567, + "num_input_tokens_seen": 52450940, + "step": 2447, + "time_per_iteration": 3.278484344482422 + }, + { + "auxiliary_loss_clip": 0.01200003, + "auxiliary_loss_mlp": 0.01031764, + "balance_loss_clip": 1.05587387, + "balance_loss_mlp": 1.02188778, + "epoch": 0.2943545962844947, + "flos": 26864054138880.0, + "grad_norm": 2.6456389881024185, + "language_loss": 0.68730146, + "learning_rate": 3.310791516654455e-06, + "loss": 0.7096191, + "num_input_tokens_seen": 52468000, + "step": 2448, + "time_per_iteration": 3.2276830673217773 + }, + { + "auxiliary_loss_clip": 0.01166658, + "auxiliary_loss_mlp": 0.01041568, + "balance_loss_clip": 1.05293345, + "balance_loss_mlp": 1.03104186, + "epoch": 0.2944748391751338, + "flos": 20231677422720.0, + "grad_norm": 1.9565562944355324, + "language_loss": 0.79427361, + "learning_rate": 3.3102030711461177e-06, + "loss": 0.81635588, + "num_input_tokens_seen": 52487575, + "step": 2449, + "time_per_iteration": 2.5254619121551514 + }, + { + "auxiliary_loss_clip": 0.01159164, + "auxiliary_loss_mlp": 0.01028173, + "balance_loss_clip": 1.05274415, + "balance_loss_mlp": 1.0187968, + "epoch": 0.29459508206577284, + "flos": 15960965045760.0, + "grad_norm": 1.7516200206097263, + "language_loss": 0.67712176, + "learning_rate": 3.3096144268810156e-06, + "loss": 0.69899517, + "num_input_tokens_seen": 52506335, + "step": 2450, + "time_per_iteration": 3.251459836959839 + }, + { + "auxiliary_loss_clip": 0.01175112, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.05180323, + "balance_loss_mlp": 1.02302575, + "epoch": 0.29471532495641195, + "flos": 20412882558720.0, + "grad_norm": 1.9151731270349088, + "language_loss": 0.73001391, + "learning_rate": 3.3090255839484462e-06, + "loss": 0.75209057, + "num_input_tokens_seen": 52524330, + "step": 2451, + "time_per_iteration": 2.4693732261657715 + }, + { + "auxiliary_loss_clip": 0.01172766, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.05162716, + "balance_loss_mlp": 1.01860952, + "epoch": 0.29483556784705106, + "flos": 20376576887040.0, + "grad_norm": 2.1219099742923655, + "language_loss": 0.85244715, + "learning_rate": 3.3084365424377366e-06, + "loss": 0.87445867, + "num_input_tokens_seen": 52543095, + "step": 2452, + "time_per_iteration": 2.482821226119995 + }, + { + "auxiliary_loss_clip": 0.01053928, + "auxiliary_loss_mlp": 0.01005472, + "balance_loss_clip": 1.02595353, + "balance_loss_mlp": 1.00346947, + "epoch": 0.2949558107376901, + "flos": 68555660595840.0, + "grad_norm": 0.722286984838734, + "language_loss": 0.55924046, + "learning_rate": 3.307847302438245e-06, + "loss": 0.57983446, + "num_input_tokens_seen": 52597075, + "step": 2453, + "time_per_iteration": 2.9926278591156006 + }, + { + "auxiliary_loss_clip": 0.01126102, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.0444386, + "balance_loss_mlp": 1.01880598, + "epoch": 0.2950760536283292, + "flos": 16107085572480.0, + "grad_norm": 2.121494147862325, + "language_loss": 0.77777827, + "learning_rate": 3.3072578640393562e-06, + "loss": 0.7993297, + "num_input_tokens_seen": 52614410, + "step": 2454, + "time_per_iteration": 2.520268201828003 + }, + { + "auxiliary_loss_clip": 0.01171643, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.05373824, + "balance_loss_mlp": 1.02084708, + "epoch": 0.29519629651896834, + "flos": 20483626394880.0, + "grad_norm": 1.866248750379111, + "language_loss": 0.79569191, + "learning_rate": 3.3066682273304886e-06, + "loss": 0.8177132, + "num_input_tokens_seen": 52632055, + "step": 2455, + "time_per_iteration": 2.513395309448242 + }, + { + "auxiliary_loss_clip": 0.01191586, + "auxiliary_loss_mlp": 0.00764167, + "balance_loss_clip": 1.05792308, + "balance_loss_mlp": 1.000808, + "epoch": 0.2953165394096074, + "flos": 18916484941440.0, + "grad_norm": 1.9389719675868118, + "language_loss": 0.7899335, + "learning_rate": 3.3060783924010904e-06, + "loss": 0.80949104, + "num_input_tokens_seen": 52649980, + "step": 2456, + "time_per_iteration": 2.488424062728882 + }, + { + "auxiliary_loss_clip": 0.01157507, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.05261123, + "balance_loss_mlp": 1.02289164, + "epoch": 0.2954367823002465, + "flos": 20624467622400.0, + "grad_norm": 2.1427928755048895, + "language_loss": 0.84938538, + "learning_rate": 3.3054883593406387e-06, + "loss": 0.8712883, + "num_input_tokens_seen": 52664730, + "step": 2457, + "time_per_iteration": 2.5230014324188232 + }, + { + "auxiliary_loss_clip": 0.01171867, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.05271828, + "balance_loss_mlp": 1.02131236, + "epoch": 0.2955570251908856, + "flos": 31175525473920.0, + "grad_norm": 2.1574354384872567, + "language_loss": 0.65141249, + "learning_rate": 3.3048981282386404e-06, + "loss": 0.67343473, + "num_input_tokens_seen": 52686040, + "step": 2458, + "time_per_iteration": 2.5645525455474854 + }, + { + "auxiliary_loss_clip": 0.0114263, + "auxiliary_loss_mlp": 0.01026988, + "balance_loss_clip": 1.04898977, + "balance_loss_mlp": 1.0183754, + "epoch": 0.29567726808152467, + "flos": 21650328051840.0, + "grad_norm": 5.689207043016745, + "language_loss": 0.8235622, + "learning_rate": 3.304307699184634e-06, + "loss": 0.84525836, + "num_input_tokens_seen": 52704630, + "step": 2459, + "time_per_iteration": 2.5216524600982666 + }, + { + "auxiliary_loss_clip": 0.01174822, + "auxiliary_loss_mlp": 0.01035596, + "balance_loss_clip": 1.05777264, + "balance_loss_mlp": 1.02633309, + "epoch": 0.2957975109721638, + "flos": 24243868638720.0, + "grad_norm": 1.6983132567408177, + "language_loss": 0.78875875, + "learning_rate": 3.3037170722681866e-06, + "loss": 0.81086296, + "num_input_tokens_seen": 52725465, + "step": 2460, + "time_per_iteration": 2.5452377796173096 + }, + { + "auxiliary_loss_clip": 0.01149578, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.05133641, + "balance_loss_mlp": 1.02345467, + "epoch": 0.29591775386280283, + "flos": 13479717352320.0, + "grad_norm": 1.999717170917322, + "language_loss": 0.68074548, + "learning_rate": 3.3031262475788956e-06, + "loss": 0.70256996, + "num_input_tokens_seen": 52742405, + "step": 2461, + "time_per_iteration": 2.504343032836914 + }, + { + "auxiliary_loss_clip": 0.01168219, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.05210793, + "balance_loss_mlp": 1.02035189, + "epoch": 0.29603799675344195, + "flos": 17749783284480.0, + "grad_norm": 2.077207248058627, + "language_loss": 0.7267977, + "learning_rate": 3.3025352252063897e-06, + "loss": 0.74877173, + "num_input_tokens_seen": 52761100, + "step": 2462, + "time_per_iteration": 2.486560344696045 + }, + { + "auxiliary_loss_clip": 0.01185737, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.0589968, + "balance_loss_mlp": 1.03075385, + "epoch": 0.29615823964408106, + "flos": 22783920347520.0, + "grad_norm": 1.6182239396642195, + "language_loss": 0.7492286, + "learning_rate": 3.3019440052403252e-06, + "loss": 0.77148974, + "num_input_tokens_seen": 52780965, + "step": 2463, + "time_per_iteration": 2.4917075634002686 + }, + { + "auxiliary_loss_clip": 0.01172625, + "auxiliary_loss_mlp": 0.01029653, + "balance_loss_clip": 1.05284715, + "balance_loss_mlp": 1.02072978, + "epoch": 0.2962784825347201, + "flos": 23514199758720.0, + "grad_norm": 1.8302181899767005, + "language_loss": 0.71171826, + "learning_rate": 3.30135258777039e-06, + "loss": 0.73374104, + "num_input_tokens_seen": 52800335, + "step": 2464, + "time_per_iteration": 2.524017095565796 + }, + { + "auxiliary_loss_clip": 0.01188655, + "auxiliary_loss_mlp": 0.00764079, + "balance_loss_clip": 1.05350351, + "balance_loss_mlp": 1.00079226, + "epoch": 0.2963987254253592, + "flos": 16362769559040.0, + "grad_norm": 3.157012517718674, + "language_loss": 0.70173341, + "learning_rate": 3.3007609728863024e-06, + "loss": 0.72126073, + "num_input_tokens_seen": 52818425, + "step": 2465, + "time_per_iteration": 2.461383581161499 + }, + { + "auxiliary_loss_clip": 0.01123333, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.05408776, + "balance_loss_mlp": 1.0212698, + "epoch": 0.29651896831599833, + "flos": 33472263980160.0, + "grad_norm": 1.7983662064091672, + "language_loss": 0.72843826, + "learning_rate": 3.300169160677809e-06, + "loss": 0.74998021, + "num_input_tokens_seen": 52842340, + "step": 2466, + "time_per_iteration": 2.7185440063476562 + }, + { + "auxiliary_loss_clip": 0.01166489, + "auxiliary_loss_mlp": 0.01026456, + "balance_loss_clip": 1.05588913, + "balance_loss_mlp": 1.01679385, + "epoch": 0.2966392112066374, + "flos": 23805363404160.0, + "grad_norm": 2.5295914131785984, + "language_loss": 0.76980782, + "learning_rate": 3.2995771512346878e-06, + "loss": 0.79173726, + "num_input_tokens_seen": 52860690, + "step": 2467, + "time_per_iteration": 2.5602166652679443 + }, + { + "auxiliary_loss_clip": 0.0120319, + "auxiliary_loss_mlp": 0.00764308, + "balance_loss_clip": 1.05798841, + "balance_loss_mlp": 1.00086892, + "epoch": 0.2967594540972765, + "flos": 19938466702080.0, + "grad_norm": 1.9815558864827936, + "language_loss": 0.72911394, + "learning_rate": 3.298984944646746e-06, + "loss": 0.74878883, + "num_input_tokens_seen": 52879370, + "step": 2468, + "time_per_iteration": 2.483952522277832 + }, + { + "auxiliary_loss_clip": 0.01189037, + "auxiliary_loss_mlp": 0.00763386, + "balance_loss_clip": 1.05691409, + "balance_loss_mlp": 1.00093293, + "epoch": 0.2968796969879156, + "flos": 23732823888000.0, + "grad_norm": 2.019194064477367, + "language_loss": 0.81703359, + "learning_rate": 3.298392541003822e-06, + "loss": 0.83655775, + "num_input_tokens_seen": 52898775, + "step": 2469, + "time_per_iteration": 3.3063807487487793 + }, + { + "auxiliary_loss_clip": 0.01168667, + "auxiliary_loss_mlp": 0.0102893, + "balance_loss_clip": 1.05360901, + "balance_loss_mlp": 1.02035308, + "epoch": 0.29699993987855466, + "flos": 22893699288960.0, + "grad_norm": 1.624829082146354, + "language_loss": 0.89567292, + "learning_rate": 3.2977999403957806e-06, + "loss": 0.91764891, + "num_input_tokens_seen": 52917535, + "step": 2470, + "time_per_iteration": 2.5292575359344482 + }, + { + "auxiliary_loss_clip": 0.01202747, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.06000257, + "balance_loss_mlp": 1.02335072, + "epoch": 0.2971201827691938, + "flos": 33832555349760.0, + "grad_norm": 3.6081758718000723, + "language_loss": 0.6750772, + "learning_rate": 3.2972071429125207e-06, + "loss": 0.69743729, + "num_input_tokens_seen": 52938755, + "step": 2471, + "time_per_iteration": 2.535362482070923 + }, + { + "auxiliary_loss_clip": 0.01151827, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.0513947, + "balance_loss_mlp": 1.02309704, + "epoch": 0.2972404256598329, + "flos": 22054359208320.0, + "grad_norm": 2.0698902325769875, + "language_loss": 0.8819958, + "learning_rate": 3.2966141486439682e-06, + "loss": 0.90383923, + "num_input_tokens_seen": 52957945, + "step": 2472, + "time_per_iteration": 2.5347063541412354 + }, + { + "auxiliary_loss_clip": 0.01128153, + "auxiliary_loss_mlp": 0.01027125, + "balance_loss_clip": 1.04509997, + "balance_loss_mlp": 1.01748729, + "epoch": 0.29736066855047194, + "flos": 31978595796480.0, + "grad_norm": 2.4960292945071556, + "language_loss": 0.64439154, + "learning_rate": 3.29602095768008e-06, + "loss": 0.66594434, + "num_input_tokens_seen": 52978460, + "step": 2473, + "time_per_iteration": 2.687833070755005 + }, + { + "auxiliary_loss_clip": 0.01165307, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.05491471, + "balance_loss_mlp": 1.02274454, + "epoch": 0.29748091144111105, + "flos": 33510401245440.0, + "grad_norm": 2.0574256527760952, + "language_loss": 0.63856012, + "learning_rate": 3.2954275701108437e-06, + "loss": 0.66052771, + "num_input_tokens_seen": 52999640, + "step": 2474, + "time_per_iteration": 4.166942596435547 + }, + { + "auxiliary_loss_clip": 0.0113728, + "auxiliary_loss_mlp": 0.01026169, + "balance_loss_clip": 1.04836631, + "balance_loss_mlp": 1.01726997, + "epoch": 0.29760115433175016, + "flos": 41283373409280.0, + "grad_norm": 2.2870299280426787, + "language_loss": 0.68748081, + "learning_rate": 3.294833986026275e-06, + "loss": 0.70911533, + "num_input_tokens_seen": 53022880, + "step": 2475, + "time_per_iteration": 2.7452657222747803 + }, + { + "auxiliary_loss_clip": 0.01150233, + "auxiliary_loss_mlp": 0.01025493, + "balance_loss_clip": 1.05230975, + "balance_loss_mlp": 1.01667762, + "epoch": 0.2977213972223892, + "flos": 24493339572480.0, + "grad_norm": 2.4850113664849385, + "language_loss": 0.84978664, + "learning_rate": 3.29424020551642e-06, + "loss": 0.87154388, + "num_input_tokens_seen": 53041515, + "step": 2476, + "time_per_iteration": 3.3446261882781982 + }, + { + "auxiliary_loss_clip": 0.01204144, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.05735517, + "balance_loss_mlp": 1.02433074, + "epoch": 0.2978416401130283, + "flos": 21285116519040.0, + "grad_norm": 2.225116230480764, + "language_loss": 0.72278345, + "learning_rate": 3.2936462286713546e-06, + "loss": 0.74517083, + "num_input_tokens_seen": 53059865, + "step": 2477, + "time_per_iteration": 2.449409008026123 + }, + { + "auxiliary_loss_clip": 0.01186119, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.05516696, + "balance_loss_mlp": 1.02329469, + "epoch": 0.2979618830036674, + "flos": 25772154554880.0, + "grad_norm": 1.9956462671490998, + "language_loss": 0.77517784, + "learning_rate": 3.2930520555811846e-06, + "loss": 0.79736769, + "num_input_tokens_seen": 53079490, + "step": 2478, + "time_per_iteration": 2.503162384033203 + }, + { + "auxiliary_loss_clip": 0.01093077, + "auxiliary_loss_mlp": 0.00764623, + "balance_loss_clip": 1.04548526, + "balance_loss_mlp": 1.00094807, + "epoch": 0.2980821258943065, + "flos": 23476996247040.0, + "grad_norm": 1.7596503005787725, + "language_loss": 0.80058646, + "learning_rate": 3.292457686336046e-06, + "loss": 0.81916344, + "num_input_tokens_seen": 53098810, + "step": 2479, + "time_per_iteration": 2.654806613922119 + }, + { + "auxiliary_loss_clip": 0.01077945, + "auxiliary_loss_mlp": 0.01013592, + "balance_loss_clip": 1.01925814, + "balance_loss_mlp": 1.01131558, + "epoch": 0.2982023687849456, + "flos": 69752314195200.0, + "grad_norm": 0.8564972900421306, + "language_loss": 0.61202013, + "learning_rate": 3.291863121026105e-06, + "loss": 0.63293552, + "num_input_tokens_seen": 53162590, + "step": 2480, + "time_per_iteration": 3.164839744567871 + }, + { + "auxiliary_loss_clip": 0.01184878, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.05524588, + "balance_loss_mlp": 1.01987851, + "epoch": 0.29832261167558466, + "flos": 29825930741760.0, + "grad_norm": 2.468363931558221, + "language_loss": 0.76921868, + "learning_rate": 3.2912683597415547e-06, + "loss": 0.79135895, + "num_input_tokens_seen": 53186675, + "step": 2481, + "time_per_iteration": 2.5792932510375977 + }, + { + "auxiliary_loss_clip": 0.01158772, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.05225861, + "balance_loss_mlp": 1.02542627, + "epoch": 0.29844285456622377, + "flos": 33910158683520.0, + "grad_norm": 2.723046737462571, + "language_loss": 0.77832633, + "learning_rate": 3.2906734025726213e-06, + "loss": 0.80025989, + "num_input_tokens_seen": 53205940, + "step": 2482, + "time_per_iteration": 2.641230583190918 + }, + { + "auxiliary_loss_clip": 0.01192615, + "auxiliary_loss_mlp": 0.01034047, + "balance_loss_clip": 1.057513, + "balance_loss_mlp": 1.02470732, + "epoch": 0.2985630974568629, + "flos": 23876933253120.0, + "grad_norm": 2.1864240451823593, + "language_loss": 0.8798371, + "learning_rate": 3.290078249609559e-06, + "loss": 0.90210378, + "num_input_tokens_seen": 53225360, + "step": 2483, + "time_per_iteration": 2.498595714569092 + }, + { + "auxiliary_loss_clip": 0.01183589, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.05779314, + "balance_loss_mlp": 1.02239323, + "epoch": 0.29868334034750194, + "flos": 21799106184960.0, + "grad_norm": 3.5212082650975134, + "language_loss": 0.880328, + "learning_rate": 3.2894829009426514e-06, + "loss": 0.90248227, + "num_input_tokens_seen": 53243195, + "step": 2484, + "time_per_iteration": 2.48541522026062 + }, + { + "auxiliary_loss_clip": 0.01182677, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.05501568, + "balance_loss_mlp": 1.02275419, + "epoch": 0.29880358323814105, + "flos": 25666649331840.0, + "grad_norm": 2.02379478270742, + "language_loss": 0.77917325, + "learning_rate": 3.288887356662213e-06, + "loss": 0.80131924, + "num_input_tokens_seen": 53264530, + "step": 2485, + "time_per_iteration": 2.5035083293914795 + }, + { + "auxiliary_loss_clip": 0.01078202, + "auxiliary_loss_mlp": 0.01004012, + "balance_loss_clip": 1.01684928, + "balance_loss_mlp": 1.00186622, + "epoch": 0.29892382612878016, + "flos": 71005846003200.0, + "grad_norm": 0.7715877074941658, + "language_loss": 0.59693527, + "learning_rate": 3.288291616858588e-06, + "loss": 0.61775732, + "num_input_tokens_seen": 53319920, + "step": 2486, + "time_per_iteration": 2.9387173652648926 + }, + { + "auxiliary_loss_clip": 0.01136061, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.0512675, + "balance_loss_mlp": 1.02278948, + "epoch": 0.2990440690194192, + "flos": 25481134563840.0, + "grad_norm": 1.7621126108276361, + "language_loss": 0.76760113, + "learning_rate": 3.287695681622149e-06, + "loss": 0.78927958, + "num_input_tokens_seen": 53339270, + "step": 2487, + "time_per_iteration": 2.5824060440063477 + }, + { + "auxiliary_loss_clip": 0.01175337, + "auxiliary_loss_mlp": 0.01025804, + "balance_loss_clip": 1.0533129, + "balance_loss_mlp": 1.01727486, + "epoch": 0.2991643119100583, + "flos": 23732357011200.0, + "grad_norm": 1.8514958423313588, + "language_loss": 0.81237018, + "learning_rate": 3.2870995510432982e-06, + "loss": 0.83438158, + "num_input_tokens_seen": 53357750, + "step": 2488, + "time_per_iteration": 2.5265843868255615 + }, + { + "auxiliary_loss_clip": 0.01178229, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.05412138, + "balance_loss_mlp": 1.02119875, + "epoch": 0.29928455480069743, + "flos": 27417545786880.0, + "grad_norm": 1.7626933623400147, + "language_loss": 0.76863003, + "learning_rate": 3.2865032252124697e-06, + "loss": 0.79071039, + "num_input_tokens_seen": 53378265, + "step": 2489, + "time_per_iteration": 2.5650012493133545 + }, + { + "auxiliary_loss_clip": 0.01170224, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.05231869, + "balance_loss_mlp": 1.02300489, + "epoch": 0.2994047976913365, + "flos": 33692935184640.0, + "grad_norm": 1.460107558535683, + "language_loss": 0.77668238, + "learning_rate": 3.2859067042201243e-06, + "loss": 0.79870051, + "num_input_tokens_seen": 53400305, + "step": 2490, + "time_per_iteration": 2.6227469444274902 + }, + { + "auxiliary_loss_clip": 0.01105181, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.04509151, + "balance_loss_mlp": 1.02387309, + "epoch": 0.2995250405819756, + "flos": 16763963541120.0, + "grad_norm": 2.2386469775589184, + "language_loss": 0.77712286, + "learning_rate": 3.2853099881567544e-06, + "loss": 0.79850411, + "num_input_tokens_seen": 53418705, + "step": 2491, + "time_per_iteration": 2.5785326957702637 + }, + { + "auxiliary_loss_clip": 0.01195824, + "auxiliary_loss_mlp": 0.01033691, + "balance_loss_clip": 1.05612326, + "balance_loss_mlp": 1.02565575, + "epoch": 0.29964528347261465, + "flos": 22963976248320.0, + "grad_norm": 1.9211278119417388, + "language_loss": 0.79198897, + "learning_rate": 3.284713077112881e-06, + "loss": 0.81428415, + "num_input_tokens_seen": 53438135, + "step": 2492, + "time_per_iteration": 2.4606575965881348 + }, + { + "auxiliary_loss_clip": 0.01164724, + "auxiliary_loss_mlp": 0.01032522, + "balance_loss_clip": 1.05659127, + "balance_loss_mlp": 1.02294362, + "epoch": 0.29976552636325376, + "flos": 16938021870720.0, + "grad_norm": 2.8497012408684292, + "language_loss": 0.86191618, + "learning_rate": 3.284115971179056e-06, + "loss": 0.88388866, + "num_input_tokens_seen": 53452165, + "step": 2493, + "time_per_iteration": 2.4819416999816895 + }, + { + "auxiliary_loss_clip": 0.01135603, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.05421305, + "balance_loss_mlp": 1.02286518, + "epoch": 0.2998857692538929, + "flos": 17056455989760.0, + "grad_norm": 1.7423784875572517, + "language_loss": 0.78256708, + "learning_rate": 3.283518670445859e-06, + "loss": 0.80424249, + "num_input_tokens_seen": 53470075, + "step": 2494, + "time_per_iteration": 2.5678627490997314 + }, + { + "auxiliary_loss_clip": 0.01068076, + "auxiliary_loss_mlp": 0.00755115, + "balance_loss_clip": 1.01801038, + "balance_loss_mlp": 1.00108683, + "epoch": 0.30000601214453193, + "flos": 68831528025600.0, + "grad_norm": 0.6748333268592323, + "language_loss": 0.54374009, + "learning_rate": 3.2829211750038995e-06, + "loss": 0.5619719, + "num_input_tokens_seen": 53538705, + "step": 2495, + "time_per_iteration": 3.140913486480713 + }, + { + "auxiliary_loss_clip": 0.01152592, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.05001545, + "balance_loss_mlp": 1.02169394, + "epoch": 0.30012625503517104, + "flos": 17603267708160.0, + "grad_norm": 1.82735650042927, + "language_loss": 0.89302659, + "learning_rate": 3.2823234849438183e-06, + "loss": 0.91485947, + "num_input_tokens_seen": 53556740, + "step": 2496, + "time_per_iteration": 3.2761878967285156 + }, + { + "auxiliary_loss_clip": 0.01172273, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.05423617, + "balance_loss_mlp": 1.02515233, + "epoch": 0.30024649792581015, + "flos": 21252581775360.0, + "grad_norm": 2.513259873511321, + "language_loss": 0.76102847, + "learning_rate": 3.2817256003562836e-06, + "loss": 0.78309184, + "num_input_tokens_seen": 53577115, + "step": 2497, + "time_per_iteration": 2.5053296089172363 + }, + { + "auxiliary_loss_clip": 0.01129009, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.0506947, + "balance_loss_mlp": 1.02741754, + "epoch": 0.3003667408164492, + "flos": 23003262748800.0, + "grad_norm": 2.0974071800885614, + "language_loss": 0.65806961, + "learning_rate": 3.281127521331995e-06, + "loss": 0.67972738, + "num_input_tokens_seen": 53598295, + "step": 2498, + "time_per_iteration": 2.6352765560150146 + }, + { + "auxiliary_loss_clip": 0.01094444, + "auxiliary_loss_mlp": 0.01012228, + "balance_loss_clip": 1.01849246, + "balance_loss_mlp": 1.01035643, + "epoch": 0.3004869837070883, + "flos": 64232340750720.0, + "grad_norm": 0.8874254356169468, + "language_loss": 0.60689318, + "learning_rate": 3.2805292479616798e-06, + "loss": 0.62795991, + "num_input_tokens_seen": 53657160, + "step": 2499, + "time_per_iteration": 2.9262173175811768 + }, + { + "auxiliary_loss_clip": 0.01175485, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.0546031, + "balance_loss_mlp": 1.02202713, + "epoch": 0.30060722659772743, + "flos": 26248653400320.0, + "grad_norm": 3.6829118725581664, + "language_loss": 0.92026901, + "learning_rate": 3.2799307803360955e-06, + "loss": 0.94233596, + "num_input_tokens_seen": 53673090, + "step": 2500, + "time_per_iteration": 3.315781831741333 + }, + { + "auxiliary_loss_clip": 0.01197139, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.05705976, + "balance_loss_mlp": 1.02263975, + "epoch": 0.3007274694883665, + "flos": 24970879912320.0, + "grad_norm": 25.236992247207322, + "language_loss": 0.81451976, + "learning_rate": 3.27933211854603e-06, + "loss": 0.83680707, + "num_input_tokens_seen": 53692145, + "step": 2501, + "time_per_iteration": 3.2769172191619873 + }, + { + "auxiliary_loss_clip": 0.01171474, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.05593157, + "balance_loss_mlp": 1.01886606, + "epoch": 0.3008477123790056, + "flos": 17055845458560.0, + "grad_norm": 1.525072486581109, + "language_loss": 0.86899537, + "learning_rate": 3.278733262682299e-06, + "loss": 0.8909896, + "num_input_tokens_seen": 53710000, + "step": 2502, + "time_per_iteration": 2.5035667419433594 + }, + { + "auxiliary_loss_clip": 0.01201692, + "auxiliary_loss_mlp": 0.01027399, + "balance_loss_clip": 1.05749476, + "balance_loss_mlp": 1.01867938, + "epoch": 0.3009679552696447, + "flos": 21506398254720.0, + "grad_norm": 3.290124167051677, + "language_loss": 0.82766056, + "learning_rate": 3.2781342128357484e-06, + "loss": 0.84995151, + "num_input_tokens_seen": 53729355, + "step": 2503, + "time_per_iteration": 3.208062171936035 + }, + { + "auxiliary_loss_clip": 0.0115717, + "auxiliary_loss_mlp": 0.01030252, + "balance_loss_clip": 1.05039263, + "balance_loss_mlp": 1.02084625, + "epoch": 0.30108819816028376, + "flos": 21134004001920.0, + "grad_norm": 2.505299706010715, + "language_loss": 0.80336165, + "learning_rate": 3.2775349690972547e-06, + "loss": 0.82523596, + "num_input_tokens_seen": 53743505, + "step": 2504, + "time_per_iteration": 2.5413811206817627 + }, + { + "auxiliary_loss_clip": 0.01076648, + "auxiliary_loss_mlp": 0.01002954, + "balance_loss_clip": 1.01711345, + "balance_loss_mlp": 1.00108266, + "epoch": 0.30120844105092287, + "flos": 71126434938240.0, + "grad_norm": 0.7615450673016307, + "language_loss": 0.5179621, + "learning_rate": 3.276935531557722e-06, + "loss": 0.53875816, + "num_input_tokens_seen": 53808725, + "step": 2505, + "time_per_iteration": 3.1433565616607666 + }, + { + "auxiliary_loss_clip": 0.01148611, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.05224943, + "balance_loss_mlp": 1.02622604, + "epoch": 0.301328683941562, + "flos": 20264571302400.0, + "grad_norm": 2.087993586799016, + "language_loss": 0.79522461, + "learning_rate": 3.2763359003080837e-06, + "loss": 0.81706321, + "num_input_tokens_seen": 53825680, + "step": 2506, + "time_per_iteration": 2.559074878692627 + }, + { + "auxiliary_loss_clip": 0.01072362, + "auxiliary_loss_mlp": 0.01001926, + "balance_loss_clip": 1.01772642, + "balance_loss_mlp": 1.00012612, + "epoch": 0.30144892683220104, + "flos": 70648212240000.0, + "grad_norm": 0.8240641730860566, + "language_loss": 0.62502193, + "learning_rate": 3.2757360754393047e-06, + "loss": 0.64576483, + "num_input_tokens_seen": 53889750, + "step": 2507, + "time_per_iteration": 3.1738297939300537 + }, + { + "auxiliary_loss_clip": 0.01188201, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.05687964, + "balance_loss_mlp": 1.02150512, + "epoch": 0.30156916972284015, + "flos": 22820549241600.0, + "grad_norm": 2.418075678500557, + "language_loss": 0.63774389, + "learning_rate": 3.2751360570423767e-06, + "loss": 0.65993649, + "num_input_tokens_seen": 53908135, + "step": 2508, + "time_per_iteration": 2.478472948074341 + }, + { + "auxiliary_loss_clip": 0.01171636, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.05470896, + "balance_loss_mlp": 1.02419686, + "epoch": 0.3016894126134792, + "flos": 29899188529920.0, + "grad_norm": 2.0243185112602204, + "language_loss": 0.75777733, + "learning_rate": 3.2745358452083236e-06, + "loss": 0.77982581, + "num_input_tokens_seen": 53931035, + "step": 2509, + "time_per_iteration": 2.5564770698547363 + }, + { + "auxiliary_loss_clip": 0.01188288, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_clip": 1.05897141, + "balance_loss_mlp": 1.01930356, + "epoch": 0.3018096555041183, + "flos": 21546331200000.0, + "grad_norm": 1.4031798404313849, + "language_loss": 0.82120281, + "learning_rate": 3.2739354400281955e-06, + "loss": 0.84336507, + "num_input_tokens_seen": 53952255, + "step": 2510, + "time_per_iteration": 2.5157511234283447 + }, + { + "auxiliary_loss_clip": 0.01058749, + "auxiliary_loss_mlp": 0.00755343, + "balance_loss_clip": 1.01367581, + "balance_loss_mlp": 1.00117922, + "epoch": 0.3019298983947574, + "flos": 59136294597120.0, + "grad_norm": 0.8644638003068023, + "language_loss": 0.63699847, + "learning_rate": 3.2733348415930744e-06, + "loss": 0.65513939, + "num_input_tokens_seen": 54014125, + "step": 2511, + "time_per_iteration": 3.1334095001220703 + }, + { + "auxiliary_loss_clip": 0.01156073, + "auxiliary_loss_mlp": 0.0102945, + "balance_loss_clip": 1.05587804, + "balance_loss_mlp": 1.02053356, + "epoch": 0.3020501412853965, + "flos": 34423070941440.0, + "grad_norm": 1.8510799936246471, + "language_loss": 0.80480975, + "learning_rate": 3.27273404999407e-06, + "loss": 0.82666498, + "num_input_tokens_seen": 54036345, + "step": 2512, + "time_per_iteration": 2.6459858417510986 + }, + { + "auxiliary_loss_clip": 0.01073561, + "auxiliary_loss_mlp": 0.01006911, + "balance_loss_clip": 1.0180428, + "balance_loss_mlp": 1.00515831, + "epoch": 0.3021703841760356, + "flos": 71008288128000.0, + "grad_norm": 0.8055678978541427, + "language_loss": 0.60440016, + "learning_rate": 3.272133065322322e-06, + "loss": 0.62520486, + "num_input_tokens_seen": 54094615, + "step": 2513, + "time_per_iteration": 3.0579633712768555 + }, + { + "auxiliary_loss_clip": 0.01198631, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.05646288, + "balance_loss_mlp": 1.02396321, + "epoch": 0.3022906270666747, + "flos": 21510528318720.0, + "grad_norm": 1.765801752687423, + "language_loss": 0.79717511, + "learning_rate": 3.271531887669e-06, + "loss": 0.81948996, + "num_input_tokens_seen": 54114675, + "step": 2514, + "time_per_iteration": 2.4647700786590576 + }, + { + "auxiliary_loss_clip": 0.01146533, + "auxiliary_loss_mlp": 0.0103232, + "balance_loss_clip": 1.04934919, + "balance_loss_mlp": 1.02243757, + "epoch": 0.30241086995731375, + "flos": 31132001168640.0, + "grad_norm": 2.0064954711626446, + "language_loss": 0.63161111, + "learning_rate": 3.2709305171253015e-06, + "loss": 0.65339965, + "num_input_tokens_seen": 54134795, + "step": 2515, + "time_per_iteration": 2.618454694747925 + }, + { + "auxiliary_loss_clip": 0.01187532, + "auxiliary_loss_mlp": 0.01034447, + "balance_loss_clip": 1.05828452, + "balance_loss_mlp": 1.02543509, + "epoch": 0.30253111284795287, + "flos": 23511542152320.0, + "grad_norm": 1.9854173316296873, + "language_loss": 0.77630484, + "learning_rate": 3.2703289537824536e-06, + "loss": 0.79852462, + "num_input_tokens_seen": 54154595, + "step": 2516, + "time_per_iteration": 2.487072706222534 + }, + { + "auxiliary_loss_clip": 0.01146192, + "auxiliary_loss_mlp": 0.01035655, + "balance_loss_clip": 1.05187845, + "balance_loss_mlp": 1.02617764, + "epoch": 0.302651355738592, + "flos": 18725367651840.0, + "grad_norm": 8.765984341453152, + "language_loss": 0.78323245, + "learning_rate": 3.269727197731714e-06, + "loss": 0.80505097, + "num_input_tokens_seen": 54167360, + "step": 2517, + "time_per_iteration": 2.522299289703369 + }, + { + "auxiliary_loss_clip": 0.01139102, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.05192518, + "balance_loss_mlp": 1.02331448, + "epoch": 0.30277159862923103, + "flos": 22418888382720.0, + "grad_norm": 1.731107744353414, + "language_loss": 0.77939731, + "learning_rate": 3.269125249064367e-06, + "loss": 0.80111223, + "num_input_tokens_seen": 54187055, + "step": 2518, + "time_per_iteration": 2.5696253776550293 + }, + { + "auxiliary_loss_clip": 0.01201608, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.05681038, + "balance_loss_mlp": 1.02377367, + "epoch": 0.30289184151987014, + "flos": 22273126992000.0, + "grad_norm": 1.7359065923946193, + "language_loss": 0.83360636, + "learning_rate": 3.2685231078717297e-06, + "loss": 0.85594934, + "num_input_tokens_seen": 54207245, + "step": 2519, + "time_per_iteration": 2.4540438652038574 + }, + { + "auxiliary_loss_clip": 0.01147852, + "auxiliary_loss_mlp": 0.00763938, + "balance_loss_clip": 1.05207491, + "balance_loss_mlp": 1.00074685, + "epoch": 0.30301208441050925, + "flos": 25225594231680.0, + "grad_norm": 1.9226719142019164, + "language_loss": 0.75365186, + "learning_rate": 3.267920774245145e-06, + "loss": 0.77276969, + "num_input_tokens_seen": 54226650, + "step": 2520, + "time_per_iteration": 2.561476230621338 + }, + { + "auxiliary_loss_clip": 0.01189791, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.05862212, + "balance_loss_mlp": 1.02519083, + "epoch": 0.3031323273011483, + "flos": 23039245198080.0, + "grad_norm": 1.8175976557392557, + "language_loss": 0.84835804, + "learning_rate": 3.2673182482759876e-06, + "loss": 0.87061048, + "num_input_tokens_seen": 54245765, + "step": 2521, + "time_per_iteration": 2.483659029006958 + }, + { + "auxiliary_loss_clip": 0.01187312, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.05817103, + "balance_loss_mlp": 1.01840854, + "epoch": 0.3032525701917874, + "flos": 18876695650560.0, + "grad_norm": 2.412776159410696, + "language_loss": 0.65920305, + "learning_rate": 3.266715530055659e-06, + "loss": 0.68135357, + "num_input_tokens_seen": 54263915, + "step": 2522, + "time_per_iteration": 2.4782700538635254 + }, + { + "auxiliary_loss_clip": 0.01177197, + "auxiliary_loss_mlp": 0.0103206, + "balance_loss_clip": 1.05355191, + "balance_loss_mlp": 1.02277923, + "epoch": 0.30337281308242653, + "flos": 17782641250560.0, + "grad_norm": 2.145194287233782, + "language_loss": 0.8016035, + "learning_rate": 3.2661126196755927e-06, + "loss": 0.82369602, + "num_input_tokens_seen": 54283025, + "step": 2523, + "time_per_iteration": 3.197591543197632 + }, + { + "auxiliary_loss_clip": 0.01095045, + "auxiliary_loss_mlp": 0.01003638, + "balance_loss_clip": 1.02028394, + "balance_loss_mlp": 1.00187361, + "epoch": 0.3034930559730656, + "flos": 57824298426240.0, + "grad_norm": 0.7865592190449694, + "language_loss": 0.5598352, + "learning_rate": 3.265509517227248e-06, + "loss": 0.58082199, + "num_input_tokens_seen": 54339840, + "step": 2524, + "time_per_iteration": 3.0162839889526367 + }, + { + "auxiliary_loss_clip": 0.0117312, + "auxiliary_loss_mlp": 0.01029033, + "balance_loss_clip": 1.0530293, + "balance_loss_mlp": 1.02011657, + "epoch": 0.3036132988637047, + "flos": 14755587419520.0, + "grad_norm": 2.5838360711240136, + "language_loss": 0.80833572, + "learning_rate": 3.264906222802115e-06, + "loss": 0.83035731, + "num_input_tokens_seen": 54357690, + "step": 2525, + "time_per_iteration": 2.489438056945801 + }, + { + "auxiliary_loss_clip": 0.01205359, + "auxiliary_loss_mlp": 0.01030971, + "balance_loss_clip": 1.05842972, + "balance_loss_mlp": 1.02068877, + "epoch": 0.30373354175434375, + "flos": 21033203460480.0, + "grad_norm": 1.9804279888786198, + "language_loss": 0.77706057, + "learning_rate": 3.264302736491715e-06, + "loss": 0.79942381, + "num_input_tokens_seen": 54377810, + "step": 2526, + "time_per_iteration": 3.23544979095459 + }, + { + "auxiliary_loss_clip": 0.01188045, + "auxiliary_loss_mlp": 0.01026371, + "balance_loss_clip": 1.06247687, + "balance_loss_mlp": 1.01732314, + "epoch": 0.30385378464498286, + "flos": 21143233797120.0, + "grad_norm": 2.003850914865006, + "language_loss": 0.86954695, + "learning_rate": 3.263699058387594e-06, + "loss": 0.89169109, + "num_input_tokens_seen": 54395245, + "step": 2527, + "time_per_iteration": 2.502690315246582 + }, + { + "auxiliary_loss_clip": 0.01155179, + "auxiliary_loss_mlp": 0.01034901, + "balance_loss_clip": 1.05094206, + "balance_loss_mlp": 1.0251199, + "epoch": 0.30397402753562197, + "flos": 20629244131200.0, + "grad_norm": 2.5329528042288594, + "language_loss": 0.90252823, + "learning_rate": 3.2630951885813315e-06, + "loss": 0.92442906, + "num_input_tokens_seen": 54412640, + "step": 2528, + "time_per_iteration": 3.285228729248047 + }, + { + "auxiliary_loss_clip": 0.01172679, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.05343485, + "balance_loss_mlp": 1.02343953, + "epoch": 0.304094270426261, + "flos": 15085678429440.0, + "grad_norm": 2.0059486702397096, + "language_loss": 0.7762779, + "learning_rate": 3.262491127164533e-06, + "loss": 0.79833198, + "num_input_tokens_seen": 54431455, + "step": 2529, + "time_per_iteration": 3.2982656955718994 + }, + { + "auxiliary_loss_clip": 0.01181116, + "auxiliary_loss_mlp": 0.00764021, + "balance_loss_clip": 1.05698943, + "balance_loss_mlp": 1.00066924, + "epoch": 0.30421451331690014, + "flos": 13845216193920.0, + "grad_norm": 2.4460807295985796, + "language_loss": 0.80305123, + "learning_rate": 3.2618868742288337e-06, + "loss": 0.82250261, + "num_input_tokens_seen": 54448380, + "step": 2530, + "time_per_iteration": 2.505429267883301 + }, + { + "auxiliary_loss_clip": 0.01187139, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.05802727, + "balance_loss_mlp": 1.02582812, + "epoch": 0.30433475620753925, + "flos": 17384212615680.0, + "grad_norm": 1.7841812618885684, + "language_loss": 0.71865618, + "learning_rate": 3.261282429865899e-06, + "loss": 0.74087739, + "num_input_tokens_seen": 54466385, + "step": 2531, + "time_per_iteration": 2.4859228134155273 + }, + { + "auxiliary_loss_clip": 0.01180157, + "auxiliary_loss_mlp": 0.00763031, + "balance_loss_clip": 1.05840683, + "balance_loss_mlp": 1.00070214, + "epoch": 0.3044549990981783, + "flos": 18916951818240.0, + "grad_norm": 1.6549076443397663, + "language_loss": 0.72392803, + "learning_rate": 3.2606777941674225e-06, + "loss": 0.74335992, + "num_input_tokens_seen": 54485040, + "step": 2532, + "time_per_iteration": 2.5331203937530518 + }, + { + "auxiliary_loss_clip": 0.01136987, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.05217671, + "balance_loss_mlp": 1.02509391, + "epoch": 0.3045752419888174, + "flos": 21068431724160.0, + "grad_norm": 3.291682003980467, + "language_loss": 0.84338677, + "learning_rate": 3.2600729672251276e-06, + "loss": 0.86510468, + "num_input_tokens_seen": 54502755, + "step": 2533, + "time_per_iteration": 2.5850987434387207 + }, + { + "auxiliary_loss_clip": 0.01203537, + "auxiliary_loss_mlp": 0.00764238, + "balance_loss_clip": 1.06078911, + "balance_loss_mlp": 1.00072002, + "epoch": 0.3046954848794565, + "flos": 29096405516160.0, + "grad_norm": 2.304002945129114, + "language_loss": 0.65246743, + "learning_rate": 3.259467949130765e-06, + "loss": 0.67214525, + "num_input_tokens_seen": 54524165, + "step": 2534, + "time_per_iteration": 2.527325391769409 + }, + { + "auxiliary_loss_clip": 0.01177588, + "auxiliary_loss_mlp": 0.01028405, + "balance_loss_clip": 1.05957103, + "balance_loss_mlp": 1.0194701, + "epoch": 0.3048157277700956, + "flos": 20295346279680.0, + "grad_norm": 2.426150618857323, + "language_loss": 0.82710397, + "learning_rate": 3.2588627399761164e-06, + "loss": 0.84916389, + "num_input_tokens_seen": 54540160, + "step": 2535, + "time_per_iteration": 2.5196805000305176 + }, + { + "auxiliary_loss_clip": 0.01172393, + "auxiliary_loss_mlp": 0.01028173, + "balance_loss_clip": 1.05574954, + "balance_loss_mlp": 1.01970303, + "epoch": 0.3049359706607347, + "flos": 22739929165440.0, + "grad_norm": 1.7973879770199437, + "language_loss": 0.70521808, + "learning_rate": 3.2582573398529903e-06, + "loss": 0.72722369, + "num_input_tokens_seen": 54557515, + "step": 2536, + "time_per_iteration": 2.530954360961914 + }, + { + "auxiliary_loss_clip": 0.01159671, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.05309546, + "balance_loss_mlp": 1.02111077, + "epoch": 0.3050562135513738, + "flos": 18434634969600.0, + "grad_norm": 3.3070487616014064, + "language_loss": 0.74034721, + "learning_rate": 3.2576517488532265e-06, + "loss": 0.76225352, + "num_input_tokens_seen": 54573865, + "step": 2537, + "time_per_iteration": 2.5862491130828857 + }, + { + "auxiliary_loss_clip": 0.01185308, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.05427647, + "balance_loss_mlp": 1.02523613, + "epoch": 0.30517645644201286, + "flos": 20370327920640.0, + "grad_norm": 2.0015367207433084, + "language_loss": 0.87042534, + "learning_rate": 3.257045967068692e-06, + "loss": 0.89261407, + "num_input_tokens_seen": 54593120, + "step": 2538, + "time_per_iteration": 2.527109384536743 + }, + { + "auxiliary_loss_clip": 0.01205321, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.06001019, + "balance_loss_mlp": 1.02688181, + "epoch": 0.30529669933265197, + "flos": 21945118970880.0, + "grad_norm": 1.5467589263869173, + "language_loss": 0.81804323, + "learning_rate": 3.2564399945912848e-06, + "loss": 0.84046423, + "num_input_tokens_seen": 54612910, + "step": 2539, + "time_per_iteration": 2.525859832763672 + }, + { + "auxiliary_loss_clip": 0.01149227, + "auxiliary_loss_mlp": 0.01030078, + "balance_loss_clip": 1.05330062, + "balance_loss_mlp": 1.02138793, + "epoch": 0.305416942223291, + "flos": 21835411856640.0, + "grad_norm": 2.111954132320059, + "language_loss": 0.82669222, + "learning_rate": 3.2558338315129287e-06, + "loss": 0.84848523, + "num_input_tokens_seen": 54631055, + "step": 2540, + "time_per_iteration": 2.6807541847229004 + }, + { + "auxiliary_loss_clip": 0.01181895, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.05581355, + "balance_loss_mlp": 1.0193485, + "epoch": 0.30553718511393013, + "flos": 33911810709120.0, + "grad_norm": 2.2385220702306765, + "language_loss": 0.75821769, + "learning_rate": 3.2552274779255785e-06, + "loss": 0.78032374, + "num_input_tokens_seen": 54651985, + "step": 2541, + "time_per_iteration": 2.6057441234588623 + }, + { + "auxiliary_loss_clip": 0.01185685, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.05594146, + "balance_loss_mlp": 1.02352202, + "epoch": 0.30565742800456924, + "flos": 22268530051200.0, + "grad_norm": 3.800077469579308, + "language_loss": 0.76972282, + "learning_rate": 3.2546209339212184e-06, + "loss": 0.79190731, + "num_input_tokens_seen": 54671005, + "step": 2542, + "time_per_iteration": 2.4897189140319824 + }, + { + "auxiliary_loss_clip": 0.01174011, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.05387998, + "balance_loss_mlp": 1.02402186, + "epoch": 0.3057776708952083, + "flos": 22565044823040.0, + "grad_norm": 1.5194516369487892, + "language_loss": 0.77485311, + "learning_rate": 3.25401419959186e-06, + "loss": 0.79693043, + "num_input_tokens_seen": 54691615, + "step": 2543, + "time_per_iteration": 2.524869203567505 + }, + { + "auxiliary_loss_clip": 0.01185903, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.06147981, + "balance_loss_mlp": 1.02832139, + "epoch": 0.3058979137858474, + "flos": 21799213925760.0, + "grad_norm": 3.6244941066014116, + "language_loss": 0.76060641, + "learning_rate": 3.253407275029545e-06, + "loss": 0.78284752, + "num_input_tokens_seen": 54710520, + "step": 2544, + "time_per_iteration": 2.5260775089263916 + }, + { + "auxiliary_loss_clip": 0.01161512, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.05639136, + "balance_loss_mlp": 1.01913261, + "epoch": 0.3060181566764865, + "flos": 26979435601920.0, + "grad_norm": 2.0974124427739027, + "language_loss": 0.8015433, + "learning_rate": 3.2528001603263425e-06, + "loss": 0.82344985, + "num_input_tokens_seen": 54732590, + "step": 2545, + "time_per_iteration": 2.589590072631836 + }, + { + "auxiliary_loss_clip": 0.01190331, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.06138647, + "balance_loss_mlp": 1.01986217, + "epoch": 0.3061383995671256, + "flos": 19865101173120.0, + "grad_norm": 1.6946238714032222, + "language_loss": 0.81379014, + "learning_rate": 3.2521928555743514e-06, + "loss": 0.83598197, + "num_input_tokens_seen": 54749935, + "step": 2546, + "time_per_iteration": 2.505105972290039 + }, + { + "auxiliary_loss_clip": 0.0116726, + "auxiliary_loss_mlp": 0.00763801, + "balance_loss_clip": 1.05303741, + "balance_loss_mlp": 1.00071049, + "epoch": 0.3062586424577647, + "flos": 22127509255680.0, + "grad_norm": 3.202359788436518, + "language_loss": 0.6777339, + "learning_rate": 3.2515853608657e-06, + "loss": 0.69704449, + "num_input_tokens_seen": 54767935, + "step": 2547, + "time_per_iteration": 2.5154521465301514 + }, + { + "auxiliary_loss_clip": 0.01182518, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.05502975, + "balance_loss_mlp": 1.02293229, + "epoch": 0.3063788853484038, + "flos": 20845497962880.0, + "grad_norm": 2.8116694274902216, + "language_loss": 0.74778497, + "learning_rate": 3.250977676292545e-06, + "loss": 0.76993185, + "num_input_tokens_seen": 54786175, + "step": 2548, + "time_per_iteration": 2.4643547534942627 + }, + { + "auxiliary_loss_clip": 0.01176531, + "auxiliary_loss_mlp": 0.01028878, + "balance_loss_clip": 1.05489039, + "balance_loss_mlp": 1.01963937, + "epoch": 0.30649912823904285, + "flos": 16209717707520.0, + "grad_norm": 2.3184535834639783, + "language_loss": 0.79520291, + "learning_rate": 3.2503698019470712e-06, + "loss": 0.81725699, + "num_input_tokens_seen": 54801945, + "step": 2549, + "time_per_iteration": 3.23138689994812 + }, + { + "auxiliary_loss_clip": 0.01186336, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.05514467, + "balance_loss_mlp": 1.02567863, + "epoch": 0.30661937112968196, + "flos": 18617815353600.0, + "grad_norm": 4.73258453166715, + "language_loss": 0.78154463, + "learning_rate": 3.249761737921492e-06, + "loss": 0.8037613, + "num_input_tokens_seen": 54818475, + "step": 2550, + "time_per_iteration": 2.4473471641540527 + }, + { + "auxiliary_loss_clip": 0.01170807, + "auxiliary_loss_mlp": 0.01034588, + "balance_loss_clip": 1.0566361, + "balance_loss_mlp": 1.02575445, + "epoch": 0.30673961402032107, + "flos": 31390809638400.0, + "grad_norm": 2.124669572902979, + "language_loss": 0.74401361, + "learning_rate": 3.249153484308051e-06, + "loss": 0.76606756, + "num_input_tokens_seen": 54837090, + "step": 2551, + "time_per_iteration": 2.574333667755127 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.04906511, + "balance_loss_mlp": 1.01943421, + "epoch": 0.3068598569109601, + "flos": 20229809915520.0, + "grad_norm": 2.3332073095632473, + "language_loss": 0.77330548, + "learning_rate": 3.2485450411990194e-06, + "loss": 0.79492259, + "num_input_tokens_seen": 54856445, + "step": 2552, + "time_per_iteration": 2.5554463863372803 + }, + { + "auxiliary_loss_clip": 0.01202115, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.05641019, + "balance_loss_mlp": 1.02263844, + "epoch": 0.30698009980159924, + "flos": 29601991399680.0, + "grad_norm": 2.3569081423364886, + "language_loss": 0.82334292, + "learning_rate": 3.2479364086866983e-06, + "loss": 0.84568447, + "num_input_tokens_seen": 54876700, + "step": 2553, + "time_per_iteration": 3.244306802749634 + }, + { + "auxiliary_loss_clip": 0.01176121, + "auxiliary_loss_mlp": 0.00764544, + "balance_loss_clip": 1.05931354, + "balance_loss_mlp": 1.0008533, + "epoch": 0.30710034269223835, + "flos": 23842423261440.0, + "grad_norm": 1.7386914480023101, + "language_loss": 0.8133384, + "learning_rate": 3.247327586863416e-06, + "loss": 0.83274502, + "num_input_tokens_seen": 54897580, + "step": 2554, + "time_per_iteration": 2.526728630065918 + }, + { + "auxiliary_loss_clip": 0.01164105, + "auxiliary_loss_mlp": 0.0103037, + "balance_loss_clip": 1.05524743, + "balance_loss_mlp": 1.0205828, + "epoch": 0.3072205855828774, + "flos": 25884986152320.0, + "grad_norm": 2.0911902679382415, + "language_loss": 0.76988637, + "learning_rate": 3.2467185758215304e-06, + "loss": 0.7918312, + "num_input_tokens_seen": 54917320, + "step": 2555, + "time_per_iteration": 3.2796480655670166 + }, + { + "auxiliary_loss_clip": 0.01164608, + "auxiliary_loss_mlp": 0.00764015, + "balance_loss_clip": 1.0579468, + "balance_loss_mlp": 1.00101805, + "epoch": 0.3073408284735165, + "flos": 22236390357120.0, + "grad_norm": 2.4699894447050585, + "language_loss": 0.85803807, + "learning_rate": 3.246109375653428e-06, + "loss": 0.87732422, + "num_input_tokens_seen": 54934085, + "step": 2556, + "time_per_iteration": 3.2339515686035156 + }, + { + "auxiliary_loss_clip": 0.0120133, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.05763865, + "balance_loss_mlp": 1.02063799, + "epoch": 0.30746107136415557, + "flos": 19500284689920.0, + "grad_norm": 2.7186356704422905, + "language_loss": 0.78322113, + "learning_rate": 3.2454999864515243e-06, + "loss": 0.80553389, + "num_input_tokens_seen": 54953460, + "step": 2557, + "time_per_iteration": 2.4408984184265137 + }, + { + "auxiliary_loss_clip": 0.01169391, + "auxiliary_loss_mlp": 0.00764411, + "balance_loss_clip": 1.05517209, + "balance_loss_mlp": 1.00079215, + "epoch": 0.3075813142547947, + "flos": 21724806902400.0, + "grad_norm": 1.7331233103793475, + "language_loss": 0.69445086, + "learning_rate": 3.244890408308263e-06, + "loss": 0.71378887, + "num_input_tokens_seen": 54974165, + "step": 2558, + "time_per_iteration": 2.5187361240386963 + }, + { + "auxiliary_loss_clip": 0.01141689, + "auxiliary_loss_mlp": 0.0102496, + "balance_loss_clip": 1.04759371, + "balance_loss_mlp": 1.01584613, + "epoch": 0.3077015571454338, + "flos": 24097963593600.0, + "grad_norm": 1.9534027367190079, + "language_loss": 0.60958248, + "learning_rate": 3.2442806413161165e-06, + "loss": 0.63124895, + "num_input_tokens_seen": 54993810, + "step": 2559, + "time_per_iteration": 2.597407341003418 + }, + { + "auxiliary_loss_clip": 0.01144354, + "auxiliary_loss_mlp": 0.01035546, + "balance_loss_clip": 1.04970706, + "balance_loss_mlp": 1.02578259, + "epoch": 0.30782180003607285, + "flos": 18405476104320.0, + "grad_norm": 1.9618799272418443, + "language_loss": 0.75697857, + "learning_rate": 3.243670685567586e-06, + "loss": 0.7787776, + "num_input_tokens_seen": 55011210, + "step": 2560, + "time_per_iteration": 2.5404207706451416 + }, + { + "auxiliary_loss_clip": 0.01168382, + "auxiliary_loss_mlp": 0.00763356, + "balance_loss_clip": 1.05300069, + "balance_loss_mlp": 1.00093436, + "epoch": 0.30794204292671196, + "flos": 23878549365120.0, + "grad_norm": 2.6857151164185513, + "language_loss": 0.80765629, + "learning_rate": 3.2430605411552012e-06, + "loss": 0.82697362, + "num_input_tokens_seen": 55031325, + "step": 2561, + "time_per_iteration": 2.5303900241851807 + }, + { + "auxiliary_loss_clip": 0.01068775, + "auxiliary_loss_mlp": 0.01004038, + "balance_loss_clip": 1.02218151, + "balance_loss_mlp": 1.00228524, + "epoch": 0.30806228581735107, + "flos": 67927800816000.0, + "grad_norm": 0.8965795880954966, + "language_loss": 0.70538545, + "learning_rate": 3.2424502081715205e-06, + "loss": 0.72611362, + "num_input_tokens_seen": 55094440, + "step": 2562, + "time_per_iteration": 3.142545223236084 + }, + { + "auxiliary_loss_clip": 0.01171869, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.05361295, + "balance_loss_mlp": 1.02256048, + "epoch": 0.3081825287079901, + "flos": 23843213360640.0, + "grad_norm": 1.7320978409565082, + "language_loss": 0.78238487, + "learning_rate": 3.241839686709132e-06, + "loss": 0.80442488, + "num_input_tokens_seen": 55115375, + "step": 2563, + "time_per_iteration": 2.5377750396728516 + }, + { + "auxiliary_loss_clip": 0.01182539, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.05164051, + "balance_loss_mlp": 1.02406001, + "epoch": 0.30830277159862923, + "flos": 16209969102720.0, + "grad_norm": 2.326660641756058, + "language_loss": 0.82333434, + "learning_rate": 3.2412289768606495e-06, + "loss": 0.84549189, + "num_input_tokens_seen": 55131945, + "step": 2564, + "time_per_iteration": 2.441175937652588 + }, + { + "auxiliary_loss_clip": 0.01188051, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.05649352, + "balance_loss_mlp": 1.02598763, + "epoch": 0.30842301448926834, + "flos": 29349503723520.0, + "grad_norm": 1.7398229986105391, + "language_loss": 0.82579434, + "learning_rate": 3.240618078718718e-06, + "loss": 0.84802401, + "num_input_tokens_seen": 55153405, + "step": 2565, + "time_per_iteration": 2.5484530925750732 + }, + { + "auxiliary_loss_clip": 0.01155098, + "auxiliary_loss_mlp": 0.01034919, + "balance_loss_clip": 1.05151784, + "balance_loss_mlp": 1.0254662, + "epoch": 0.3085432573799074, + "flos": 21945190798080.0, + "grad_norm": 1.891449132954379, + "language_loss": 0.74362397, + "learning_rate": 3.240006992376011e-06, + "loss": 0.76552415, + "num_input_tokens_seen": 55173030, + "step": 2566, + "time_per_iteration": 2.5448029041290283 + }, + { + "auxiliary_loss_clip": 0.01176965, + "auxiliary_loss_mlp": 0.01038406, + "balance_loss_clip": 1.05636835, + "balance_loss_mlp": 1.02907825, + "epoch": 0.3086635002705465, + "flos": 22054718344320.0, + "grad_norm": 2.1485648877897243, + "language_loss": 0.76202482, + "learning_rate": 3.2393957179252284e-06, + "loss": 0.7841785, + "num_input_tokens_seen": 55189565, + "step": 2567, + "time_per_iteration": 2.50209379196167 + }, + { + "auxiliary_loss_clip": 0.01201735, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.05872965, + "balance_loss_mlp": 1.0256145, + "epoch": 0.3087837431611856, + "flos": 32665925520000.0, + "grad_norm": 1.9750883130911696, + "language_loss": 0.806054, + "learning_rate": 3.2387842554591016e-06, + "loss": 0.82841694, + "num_input_tokens_seen": 55210380, + "step": 2568, + "time_per_iteration": 2.5329971313476562 + }, + { + "auxiliary_loss_clip": 0.01199924, + "auxiliary_loss_mlp": 0.01035601, + "balance_loss_clip": 1.05753613, + "balance_loss_mlp": 1.02643967, + "epoch": 0.3089039860518247, + "flos": 17599245384960.0, + "grad_norm": 3.2675444431606566, + "language_loss": 0.87637973, + "learning_rate": 3.238172605070388e-06, + "loss": 0.89873505, + "num_input_tokens_seen": 55225795, + "step": 2569, + "time_per_iteration": 2.398620128631592 + }, + { + "auxiliary_loss_clip": 0.0118356, + "auxiliary_loss_mlp": 0.00764234, + "balance_loss_clip": 1.05408311, + "balance_loss_mlp": 1.00073957, + "epoch": 0.3090242289424638, + "flos": 14383839611520.0, + "grad_norm": 2.752897411143526, + "language_loss": 0.78433633, + "learning_rate": 3.2375607668518745e-06, + "loss": 0.80381429, + "num_input_tokens_seen": 55238830, + "step": 2570, + "time_per_iteration": 2.4081361293792725 + }, + { + "auxiliary_loss_clip": 0.01162338, + "auxiliary_loss_mlp": 0.01030215, + "balance_loss_clip": 1.05226731, + "balance_loss_mlp": 1.02105355, + "epoch": 0.30914447183310284, + "flos": 16068625084800.0, + "grad_norm": 2.1071443271998715, + "language_loss": 0.90099549, + "learning_rate": 3.236948740896377e-06, + "loss": 0.92292106, + "num_input_tokens_seen": 55253630, + "step": 2571, + "time_per_iteration": 2.468604326248169 + }, + { + "auxiliary_loss_clip": 0.01185108, + "auxiliary_loss_mlp": 0.01031677, + "balance_loss_clip": 1.05697584, + "balance_loss_mlp": 1.02282596, + "epoch": 0.30926471472374195, + "flos": 32230221546240.0, + "grad_norm": 1.5770248505030828, + "language_loss": 0.84101689, + "learning_rate": 3.2363365272967384e-06, + "loss": 0.86318469, + "num_input_tokens_seen": 55276200, + "step": 2572, + "time_per_iteration": 2.550359010696411 + }, + { + "auxiliary_loss_clip": 0.01184313, + "auxiliary_loss_mlp": 0.01035711, + "balance_loss_clip": 1.05869985, + "balance_loss_mlp": 1.02523232, + "epoch": 0.30938495761438106, + "flos": 20370722970240.0, + "grad_norm": 1.9290802082717589, + "language_loss": 0.81552643, + "learning_rate": 3.235724126145832e-06, + "loss": 0.83772665, + "num_input_tokens_seen": 55292235, + "step": 2573, + "time_per_iteration": 2.4409923553466797 + }, + { + "auxiliary_loss_clip": 0.01174884, + "auxiliary_loss_mlp": 0.01035565, + "balance_loss_clip": 1.05230546, + "balance_loss_mlp": 1.02581918, + "epoch": 0.3095052005050201, + "flos": 24061155131520.0, + "grad_norm": 1.4392135167105027, + "language_loss": 0.77719545, + "learning_rate": 3.235111537536558e-06, + "loss": 0.79929996, + "num_input_tokens_seen": 55313050, + "step": 2574, + "time_per_iteration": 2.4907896518707275 + }, + { + "auxiliary_loss_clip": 0.01185841, + "auxiliary_loss_mlp": 0.01025476, + "balance_loss_clip": 1.05592513, + "balance_loss_mlp": 1.01664889, + "epoch": 0.30962544339565923, + "flos": 23401547729280.0, + "grad_norm": 1.7451999928140285, + "language_loss": 0.82785141, + "learning_rate": 3.2344987615618456e-06, + "loss": 0.84996456, + "num_input_tokens_seen": 55332885, + "step": 2575, + "time_per_iteration": 2.484001874923706 + }, + { + "auxiliary_loss_clip": 0.01157122, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.05674314, + "balance_loss_mlp": 1.02382159, + "epoch": 0.30974568628629834, + "flos": 33799984692480.0, + "grad_norm": 1.6821408967736775, + "language_loss": 0.78300238, + "learning_rate": 3.2338857983146533e-06, + "loss": 0.80490136, + "num_input_tokens_seen": 55354385, + "step": 2576, + "time_per_iteration": 3.358790397644043 + }, + { + "auxiliary_loss_clip": 0.01160885, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.05339277, + "balance_loss_mlp": 1.01829529, + "epoch": 0.3098659291769374, + "flos": 20229594433920.0, + "grad_norm": 2.8256684533385936, + "language_loss": 0.76021326, + "learning_rate": 3.233272647887966e-06, + "loss": 0.78210318, + "num_input_tokens_seen": 55373275, + "step": 2577, + "time_per_iteration": 2.4911482334136963 + }, + { + "auxiliary_loss_clip": 0.01201286, + "auxiliary_loss_mlp": 0.01035179, + "balance_loss_clip": 1.0581224, + "balance_loss_mlp": 1.02634585, + "epoch": 0.3099861720675765, + "flos": 24748556682240.0, + "grad_norm": 1.5803702312044177, + "language_loss": 0.90052879, + "learning_rate": 3.2326593103747985e-06, + "loss": 0.9228934, + "num_input_tokens_seen": 55392290, + "step": 2578, + "time_per_iteration": 2.470684766769409 + }, + { + "auxiliary_loss_clip": 0.0118436, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.05745554, + "balance_loss_mlp": 1.02148128, + "epoch": 0.3101064149582156, + "flos": 11765485704960.0, + "grad_norm": 2.187603865727778, + "language_loss": 0.84692764, + "learning_rate": 3.2320457858681936e-06, + "loss": 0.86907959, + "num_input_tokens_seen": 55410680, + "step": 2579, + "time_per_iteration": 2.4308996200561523 + }, + { + "auxiliary_loss_clip": 0.0117009, + "auxiliary_loss_mlp": 0.01024248, + "balance_loss_clip": 1.05086231, + "balance_loss_mlp": 1.01525998, + "epoch": 0.31022665784885467, + "flos": 23033247626880.0, + "grad_norm": 2.436028435065168, + "language_loss": 0.85723537, + "learning_rate": 3.2314320744612228e-06, + "loss": 0.87917876, + "num_input_tokens_seen": 55425980, + "step": 2580, + "time_per_iteration": 3.282902956008911 + }, + { + "auxiliary_loss_clip": 0.01182362, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.0555625, + "balance_loss_mlp": 1.02243972, + "epoch": 0.3103469007394938, + "flos": 16289188548480.0, + "grad_norm": 1.5149229972133844, + "language_loss": 0.76251328, + "learning_rate": 3.2308181762469854e-06, + "loss": 0.78464699, + "num_input_tokens_seen": 55443925, + "step": 2581, + "time_per_iteration": 3.2462916374206543 + }, + { + "auxiliary_loss_clip": 0.01201783, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.05692792, + "balance_loss_mlp": 1.02652383, + "epoch": 0.3104671436301329, + "flos": 30515271626880.0, + "grad_norm": 2.6458011703575055, + "language_loss": 0.78393877, + "learning_rate": 3.230204091318609e-06, + "loss": 0.8063159, + "num_input_tokens_seen": 55464465, + "step": 2582, + "time_per_iteration": 3.2229530811309814 + }, + { + "auxiliary_loss_clip": 0.01197435, + "auxiliary_loss_mlp": 0.00762855, + "balance_loss_clip": 1.05553293, + "balance_loss_mlp": 1.00081205, + "epoch": 0.31058738652077195, + "flos": 20047240062720.0, + "grad_norm": 1.9173459027775241, + "language_loss": 0.84756494, + "learning_rate": 3.2295898197692503e-06, + "loss": 0.86716783, + "num_input_tokens_seen": 55483425, + "step": 2583, + "time_per_iteration": 2.440253973007202 + }, + { + "auxiliary_loss_clip": 0.01199621, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.05685723, + "balance_loss_mlp": 1.02584767, + "epoch": 0.31070762941141106, + "flos": 28074639237120.0, + "grad_norm": 1.7899250672940263, + "language_loss": 0.79149044, + "learning_rate": 3.228975361692094e-06, + "loss": 0.81383139, + "num_input_tokens_seen": 55504445, + "step": 2584, + "time_per_iteration": 2.490823745727539 + }, + { + "auxiliary_loss_clip": 0.01191406, + "auxiliary_loss_mlp": 0.00764139, + "balance_loss_clip": 1.05578446, + "balance_loss_mlp": 1.00087905, + "epoch": 0.31082787230205017, + "flos": 20521907314560.0, + "grad_norm": 2.181523258170889, + "language_loss": 0.80188715, + "learning_rate": 3.228360717180352e-06, + "loss": 0.8214426, + "num_input_tokens_seen": 55521970, + "step": 2585, + "time_per_iteration": 2.477066993713379 + }, + { + "auxiliary_loss_clip": 0.01099452, + "auxiliary_loss_mlp": 0.00754563, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.00086939, + "epoch": 0.3109481151926892, + "flos": 62445928723200.0, + "grad_norm": 0.8116670593120461, + "language_loss": 0.59393859, + "learning_rate": 3.227745886327266e-06, + "loss": 0.61247873, + "num_input_tokens_seen": 55580665, + "step": 2586, + "time_per_iteration": 2.968832015991211 + }, + { + "auxiliary_loss_clip": 0.01098078, + "auxiliary_loss_mlp": 0.01003987, + "balance_loss_clip": 1.02295804, + "balance_loss_mlp": 1.00232971, + "epoch": 0.31106835808332833, + "flos": 44746744723200.0, + "grad_norm": 0.8064047377985103, + "language_loss": 0.5584833, + "learning_rate": 3.227130869226105e-06, + "loss": 0.57950395, + "num_input_tokens_seen": 55637825, + "step": 2587, + "time_per_iteration": 3.0334458351135254 + }, + { + "auxiliary_loss_clip": 0.01183064, + "auxiliary_loss_mlp": 0.01023286, + "balance_loss_clip": 1.05347919, + "balance_loss_mlp": 1.01493573, + "epoch": 0.3111886009739674, + "flos": 23403056100480.0, + "grad_norm": 2.6605026699328254, + "language_loss": 0.82296556, + "learning_rate": 3.226515665970167e-06, + "loss": 0.84502906, + "num_input_tokens_seen": 55655365, + "step": 2588, + "time_per_iteration": 2.474820375442505 + }, + { + "auxiliary_loss_clip": 0.01184169, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.05488729, + "balance_loss_mlp": 1.01846862, + "epoch": 0.3113088438646065, + "flos": 17530728192000.0, + "grad_norm": 8.236541698590202, + "language_loss": 0.8616966, + "learning_rate": 3.225900276652777e-06, + "loss": 0.88382059, + "num_input_tokens_seen": 55672140, + "step": 2589, + "time_per_iteration": 2.4443235397338867 + }, + { + "auxiliary_loss_clip": 0.01174457, + "auxiliary_loss_mlp": 0.0103426, + "balance_loss_clip": 1.05271256, + "balance_loss_mlp": 1.02514029, + "epoch": 0.3114290867552456, + "flos": 28365802882560.0, + "grad_norm": 1.981061449276491, + "language_loss": 0.7554971, + "learning_rate": 3.2252847013672906e-06, + "loss": 0.77758431, + "num_input_tokens_seen": 55694800, + "step": 2590, + "time_per_iteration": 2.5533342361450195 + }, + { + "auxiliary_loss_clip": 0.01146194, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.04749298, + "balance_loss_mlp": 1.01906478, + "epoch": 0.31154932964588467, + "flos": 27379157126400.0, + "grad_norm": 2.0386035993882268, + "language_loss": 0.76459521, + "learning_rate": 3.224668940207089e-06, + "loss": 0.78633952, + "num_input_tokens_seen": 55713785, + "step": 2591, + "time_per_iteration": 2.5842764377593994 + }, + { + "auxiliary_loss_clip": 0.01132678, + "auxiliary_loss_mlp": 0.01036601, + "balance_loss_clip": 1.04711628, + "balance_loss_mlp": 1.02746964, + "epoch": 0.3116695725365238, + "flos": 26541864120960.0, + "grad_norm": 1.8718584674024563, + "language_loss": 0.86576921, + "learning_rate": 3.2240529932655828e-06, + "loss": 0.88746202, + "num_input_tokens_seen": 55733050, + "step": 2592, + "time_per_iteration": 2.6130611896514893 + }, + { + "auxiliary_loss_clip": 0.01168659, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.05554962, + "balance_loss_mlp": 1.02701366, + "epoch": 0.3117898154271629, + "flos": 21177600134400.0, + "grad_norm": 3.192636509926996, + "language_loss": 0.88244963, + "learning_rate": 3.223436860636211e-06, + "loss": 0.90450096, + "num_input_tokens_seen": 55748685, + "step": 2593, + "time_per_iteration": 2.491716146469116 + }, + { + "auxiliary_loss_clip": 0.01200362, + "auxiliary_loss_mlp": 0.0103643, + "balance_loss_clip": 1.05738378, + "balance_loss_mlp": 1.02735269, + "epoch": 0.31191005831780194, + "flos": 27272430840960.0, + "grad_norm": 1.744085072795191, + "language_loss": 0.74220413, + "learning_rate": 3.2228205424124403e-06, + "loss": 0.76457208, + "num_input_tokens_seen": 55771840, + "step": 2594, + "time_per_iteration": 2.5219876766204834 + }, + { + "auxiliary_loss_clip": 0.0115742, + "auxiliary_loss_mlp": 0.01026809, + "balance_loss_clip": 1.05089748, + "balance_loss_mlp": 1.01748049, + "epoch": 0.31203030120844105, + "flos": 12963501043200.0, + "grad_norm": 2.157426421246413, + "language_loss": 0.74724472, + "learning_rate": 3.222204038687765e-06, + "loss": 0.76908696, + "num_input_tokens_seen": 55784975, + "step": 2595, + "time_per_iteration": 2.4434499740600586 + }, + { + "auxiliary_loss_clip": 0.01183191, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.05606425, + "balance_loss_mlp": 1.02125025, + "epoch": 0.31215054409908016, + "flos": 27562014288000.0, + "grad_norm": 1.6411761227567865, + "language_loss": 0.879242, + "learning_rate": 3.221587349555709e-06, + "loss": 0.90137219, + "num_input_tokens_seen": 55805235, + "step": 2596, + "time_per_iteration": 2.5279417037963867 + }, + { + "auxiliary_loss_clip": 0.01172655, + "auxiliary_loss_mlp": 0.01024933, + "balance_loss_clip": 1.05310476, + "balance_loss_mlp": 1.01596224, + "epoch": 0.3122707869897192, + "flos": 21506326427520.0, + "grad_norm": 1.724655709147222, + "language_loss": 0.69735658, + "learning_rate": 3.2209704751098236e-06, + "loss": 0.71933246, + "num_input_tokens_seen": 55824265, + "step": 2597, + "time_per_iteration": 2.501622438430786 + }, + { + "auxiliary_loss_clip": 0.01171958, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.05440986, + "balance_loss_mlp": 1.02474213, + "epoch": 0.31239102988035833, + "flos": 15187017674880.0, + "grad_norm": 2.0325261551593643, + "language_loss": 0.82428563, + "learning_rate": 3.2203534154436875e-06, + "loss": 0.84634548, + "num_input_tokens_seen": 55838620, + "step": 2598, + "time_per_iteration": 2.46083402633667 + }, + { + "auxiliary_loss_clip": 0.01122639, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.04739702, + "balance_loss_mlp": 1.03063726, + "epoch": 0.31251127277099744, + "flos": 22053712763520.0, + "grad_norm": 2.186492914617151, + "language_loss": 0.75376076, + "learning_rate": 3.2197361706509084e-06, + "loss": 0.7753861, + "num_input_tokens_seen": 55859375, + "step": 2599, + "time_per_iteration": 2.593080997467041 + }, + { + "auxiliary_loss_clip": 0.01202051, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.05585003, + "balance_loss_mlp": 1.02359128, + "epoch": 0.3126315156616365, + "flos": 15193984913280.0, + "grad_norm": 2.8121364719983157, + "language_loss": 0.83569753, + "learning_rate": 3.2191187408251228e-06, + "loss": 0.85805774, + "num_input_tokens_seen": 55876535, + "step": 2600, + "time_per_iteration": 2.4241302013397217 + }, + { + "auxiliary_loss_clip": 0.01191144, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.05395138, + "balance_loss_mlp": 1.02534151, + "epoch": 0.3127517585522756, + "flos": 18145338831360.0, + "grad_norm": 2.335980401032884, + "language_loss": 0.78270864, + "learning_rate": 3.218501126059993e-06, + "loss": 0.80497658, + "num_input_tokens_seen": 55891930, + "step": 2601, + "time_per_iteration": 2.431917667388916 + }, + { + "auxiliary_loss_clip": 0.01187844, + "auxiliary_loss_mlp": 0.01026847, + "balance_loss_clip": 1.05359221, + "balance_loss_mlp": 1.0172509, + "epoch": 0.31287200144291466, + "flos": 21908633731200.0, + "grad_norm": 2.0345523467244218, + "language_loss": 0.81268549, + "learning_rate": 3.2178833264492116e-06, + "loss": 0.83483243, + "num_input_tokens_seen": 55910635, + "step": 2602, + "time_per_iteration": 2.4773690700531006 + }, + { + "auxiliary_loss_clip": 0.01193072, + "auxiliary_loss_mlp": 0.01028438, + "balance_loss_clip": 1.05636668, + "balance_loss_mlp": 1.01947331, + "epoch": 0.31299224433355377, + "flos": 29896997800320.0, + "grad_norm": 1.9548580941937144, + "language_loss": 0.76021892, + "learning_rate": 3.217265342086498e-06, + "loss": 0.78243399, + "num_input_tokens_seen": 55931125, + "step": 2603, + "time_per_iteration": 3.285104990005493 + }, + { + "auxiliary_loss_clip": 0.01161923, + "auxiliary_loss_mlp": 0.00765119, + "balance_loss_clip": 1.05645227, + "balance_loss_mlp": 1.00071001, + "epoch": 0.3131124872241929, + "flos": 11655886331520.0, + "grad_norm": 2.6380927766306934, + "language_loss": 0.72934413, + "learning_rate": 3.216647173065599e-06, + "loss": 0.74861461, + "num_input_tokens_seen": 55946590, + "step": 2604, + "time_per_iteration": 2.5114340782165527 + }, + { + "auxiliary_loss_clip": 0.01169651, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.05608237, + "balance_loss_mlp": 1.02461624, + "epoch": 0.31323273011483194, + "flos": 49848785470080.0, + "grad_norm": 1.6641296540075465, + "language_loss": 0.73552442, + "learning_rate": 3.216028819480292e-06, + "loss": 0.7575655, + "num_input_tokens_seen": 55967930, + "step": 2605, + "time_per_iteration": 2.753655433654785 + }, + { + "auxiliary_loss_clip": 0.01157946, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.05197382, + "balance_loss_mlp": 1.02366877, + "epoch": 0.31335297300547105, + "flos": 22601278667520.0, + "grad_norm": 2.469518517334508, + "language_loss": 0.75079054, + "learning_rate": 3.2154102814243793e-06, + "loss": 0.77270353, + "num_input_tokens_seen": 55987070, + "step": 2606, + "time_per_iteration": 2.531189441680908 + }, + { + "auxiliary_loss_clip": 0.01161129, + "auxiliary_loss_mlp": 0.01035403, + "balance_loss_clip": 1.05415034, + "balance_loss_mlp": 1.02639067, + "epoch": 0.31347321589611016, + "flos": 34710858708480.0, + "grad_norm": 2.3675450574986163, + "language_loss": 0.6641553, + "learning_rate": 3.2147915589916937e-06, + "loss": 0.68612063, + "num_input_tokens_seen": 56008630, + "step": 2607, + "time_per_iteration": 3.4016571044921875 + }, + { + "auxiliary_loss_clip": 0.01163757, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.04987872, + "balance_loss_mlp": 1.02211881, + "epoch": 0.3135934587867492, + "flos": 19755789108480.0, + "grad_norm": 2.0001008708934163, + "language_loss": 0.8244381, + "learning_rate": 3.2141726522760938e-06, + "loss": 0.84638935, + "num_input_tokens_seen": 56026690, + "step": 2608, + "time_per_iteration": 3.251903533935547 + }, + { + "auxiliary_loss_clip": 0.01086162, + "auxiliary_loss_mlp": 0.01002721, + "balance_loss_clip": 1.02424312, + "balance_loss_mlp": 1.00092065, + "epoch": 0.3137137016773883, + "flos": 65815535583360.0, + "grad_norm": 0.7743452806189648, + "language_loss": 0.52574605, + "learning_rate": 3.213553561371469e-06, + "loss": 0.54663485, + "num_input_tokens_seen": 56090425, + "step": 2609, + "time_per_iteration": 3.8581387996673584 + }, + { + "auxiliary_loss_clip": 0.01141027, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.05288935, + "balance_loss_mlp": 1.02722597, + "epoch": 0.31383394456802743, + "flos": 16252739222400.0, + "grad_norm": 2.285127030804708, + "language_loss": 0.95446002, + "learning_rate": 3.212934286371733e-06, + "loss": 0.9762302, + "num_input_tokens_seen": 56107135, + "step": 2610, + "time_per_iteration": 2.5417184829711914 + }, + { + "auxiliary_loss_clip": 0.01187149, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.05882668, + "balance_loss_mlp": 1.02152395, + "epoch": 0.3139541874586665, + "flos": 38795517613440.0, + "grad_norm": 3.143727892318906, + "language_loss": 0.83107209, + "learning_rate": 3.2123148273708304e-06, + "loss": 0.85325539, + "num_input_tokens_seen": 56127325, + "step": 2611, + "time_per_iteration": 2.607090711593628 + }, + { + "auxiliary_loss_clip": 0.01199453, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.05707073, + "balance_loss_mlp": 1.01974452, + "epoch": 0.3140744303493056, + "flos": 25046328430080.0, + "grad_norm": 2.086627232189235, + "language_loss": 0.77019238, + "learning_rate": 3.211695184462733e-06, + "loss": 0.79247928, + "num_input_tokens_seen": 56148500, + "step": 2612, + "time_per_iteration": 2.4744715690612793 + }, + { + "auxiliary_loss_clip": 0.01062896, + "auxiliary_loss_mlp": 0.01002007, + "balance_loss_clip": 1.02017021, + "balance_loss_mlp": 1.00018358, + "epoch": 0.3141946732399447, + "flos": 72504254782080.0, + "grad_norm": 0.8783736705866158, + "language_loss": 0.6040771, + "learning_rate": 3.2110753577414383e-06, + "loss": 0.62472606, + "num_input_tokens_seen": 56210080, + "step": 2613, + "time_per_iteration": 3.0886476039886475 + }, + { + "auxiliary_loss_clip": 0.01174962, + "auxiliary_loss_mlp": 0.01026264, + "balance_loss_clip": 1.05395889, + "balance_loss_mlp": 1.0170964, + "epoch": 0.31431491613058377, + "flos": 19239788280960.0, + "grad_norm": 1.850668286001808, + "language_loss": 0.78684151, + "learning_rate": 3.2104553473009757e-06, + "loss": 0.80885375, + "num_input_tokens_seen": 56228200, + "step": 2614, + "time_per_iteration": 2.491018772125244 + }, + { + "auxiliary_loss_clip": 0.01137304, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.05114961, + "balance_loss_mlp": 1.01870215, + "epoch": 0.3144351590212229, + "flos": 36210596290560.0, + "grad_norm": 2.0703559687289457, + "language_loss": 0.68146676, + "learning_rate": 3.209835153235399e-06, + "loss": 0.70312023, + "num_input_tokens_seen": 56249755, + "step": 2615, + "time_per_iteration": 2.6715166568756104 + }, + { + "auxiliary_loss_clip": 0.01145858, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.04922652, + "balance_loss_mlp": 1.02114534, + "epoch": 0.314555401911862, + "flos": 18551740285440.0, + "grad_norm": 1.7955090248411647, + "language_loss": 0.67448878, + "learning_rate": 3.2092147756387916e-06, + "loss": 0.69624764, + "num_input_tokens_seen": 56270080, + "step": 2616, + "time_per_iteration": 2.5371596813201904 + }, + { + "auxiliary_loss_clip": 0.01162549, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.04909754, + "balance_loss_mlp": 1.02225399, + "epoch": 0.31467564480250104, + "flos": 16362877299840.0, + "grad_norm": 1.9014289305387788, + "language_loss": 0.83542949, + "learning_rate": 3.208594214605264e-06, + "loss": 0.85737538, + "num_input_tokens_seen": 56288625, + "step": 2617, + "time_per_iteration": 2.4932265281677246 + }, + { + "auxiliary_loss_clip": 0.01158721, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.05079639, + "balance_loss_mlp": 1.0226692, + "epoch": 0.31479588769314015, + "flos": 21652375127040.0, + "grad_norm": 2.055571104792497, + "language_loss": 0.77484751, + "learning_rate": 3.2079734702289553e-06, + "loss": 0.79674804, + "num_input_tokens_seen": 56307520, + "step": 2618, + "time_per_iteration": 2.4984939098358154 + }, + { + "auxiliary_loss_clip": 0.01080784, + "auxiliary_loss_mlp": 0.00754693, + "balance_loss_clip": 1.02057171, + "balance_loss_mlp": 1.00054348, + "epoch": 0.3149161305837792, + "flos": 66051072040320.0, + "grad_norm": 0.8143373375956612, + "language_loss": 0.60464978, + "learning_rate": 3.207352542604031e-06, + "loss": 0.62300456, + "num_input_tokens_seen": 56369855, + "step": 2619, + "time_per_iteration": 3.142498731613159 + }, + { + "auxiliary_loss_clip": 0.0114239, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.04816353, + "balance_loss_mlp": 1.02403998, + "epoch": 0.3150363734744183, + "flos": 28987201192320.0, + "grad_norm": 1.5599764318705276, + "language_loss": 0.78256941, + "learning_rate": 3.2067314318246864e-06, + "loss": 0.80431974, + "num_input_tokens_seen": 56390570, + "step": 2620, + "time_per_iteration": 2.586515188217163 + }, + { + "auxiliary_loss_clip": 0.01161658, + "auxiliary_loss_mlp": 0.01025928, + "balance_loss_clip": 1.05657816, + "balance_loss_mlp": 1.01713061, + "epoch": 0.31515661636505743, + "flos": 27636600879360.0, + "grad_norm": 1.7063650443129241, + "language_loss": 0.77187741, + "learning_rate": 3.206110137985143e-06, + "loss": 0.79375327, + "num_input_tokens_seen": 56410775, + "step": 2621, + "time_per_iteration": 2.5860049724578857 + }, + { + "auxiliary_loss_clip": 0.01144364, + "auxiliary_loss_mlp": 0.01029814, + "balance_loss_clip": 1.04942799, + "balance_loss_mlp": 1.02036071, + "epoch": 0.3152768592556965, + "flos": 24605632465920.0, + "grad_norm": 1.8448223226229101, + "language_loss": 0.91684401, + "learning_rate": 3.2054886611796505e-06, + "loss": 0.93858582, + "num_input_tokens_seen": 56429770, + "step": 2622, + "time_per_iteration": 2.568725824356079 + }, + { + "auxiliary_loss_clip": 0.0109677, + "auxiliary_loss_mlp": 0.01001678, + "balance_loss_clip": 1.02263594, + "balance_loss_mlp": 0.99992579, + "epoch": 0.3153971021463356, + "flos": 68476908026880.0, + "grad_norm": 0.9013492352560792, + "language_loss": 0.63561332, + "learning_rate": 3.204867001502487e-06, + "loss": 0.65659779, + "num_input_tokens_seen": 56488425, + "step": 2623, + "time_per_iteration": 3.00846791267395 + }, + { + "auxiliary_loss_clip": 0.01200559, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.0576117, + "balance_loss_mlp": 1.02347803, + "epoch": 0.3155173450369747, + "flos": 25593714766080.0, + "grad_norm": 1.9913748489953615, + "language_loss": 0.80340046, + "learning_rate": 3.2042451590479567e-06, + "loss": 0.8257429, + "num_input_tokens_seen": 56508940, + "step": 2624, + "time_per_iteration": 2.4830477237701416 + }, + { + "auxiliary_loss_clip": 0.01195754, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.05615854, + "balance_loss_mlp": 1.01997411, + "epoch": 0.31563758792761376, + "flos": 24309333175680.0, + "grad_norm": 1.7065929958971866, + "language_loss": 0.86771673, + "learning_rate": 3.203623133910394e-06, + "loss": 0.88996398, + "num_input_tokens_seen": 56527245, + "step": 2625, + "time_per_iteration": 2.4774670600891113 + }, + { + "auxiliary_loss_clip": 0.01128292, + "auxiliary_loss_mlp": 0.01033585, + "balance_loss_clip": 1.04812956, + "balance_loss_mlp": 1.02481127, + "epoch": 0.31575783081825287, + "flos": 31903865550720.0, + "grad_norm": 2.358627048278041, + "language_loss": 0.77657378, + "learning_rate": 3.203000926184158e-06, + "loss": 0.7981925, + "num_input_tokens_seen": 56546170, + "step": 2626, + "time_per_iteration": 2.6714534759521484 + }, + { + "auxiliary_loss_clip": 0.01198528, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.05716836, + "balance_loss_mlp": 1.02174008, + "epoch": 0.315878073708892, + "flos": 30810960385920.0, + "grad_norm": 1.7862218193846526, + "language_loss": 0.77818298, + "learning_rate": 3.202378535963639e-06, + "loss": 0.80046666, + "num_input_tokens_seen": 56567085, + "step": 2627, + "time_per_iteration": 2.4999771118164062 + }, + { + "auxiliary_loss_clip": 0.01161381, + "auxiliary_loss_mlp": 0.0076444, + "balance_loss_clip": 1.05214381, + "balance_loss_mlp": 1.00072443, + "epoch": 0.31599831659953104, + "flos": 22200264253440.0, + "grad_norm": 2.124887225905449, + "language_loss": 0.83284426, + "learning_rate": 3.2017559633432516e-06, + "loss": 0.85210252, + "num_input_tokens_seen": 56586715, + "step": 2628, + "time_per_iteration": 2.510676622390747 + }, + { + "auxiliary_loss_clip": 0.0117718, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.05303597, + "balance_loss_mlp": 1.0225842, + "epoch": 0.31611855949017015, + "flos": 25593463370880.0, + "grad_norm": 1.8811919163351134, + "language_loss": 0.66267812, + "learning_rate": 3.2011332084174398e-06, + "loss": 0.68476582, + "num_input_tokens_seen": 56607585, + "step": 2629, + "time_per_iteration": 2.5298099517822266 + }, + { + "auxiliary_loss_clip": 0.01181745, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.05576825, + "balance_loss_mlp": 1.02184665, + "epoch": 0.31623880238080926, + "flos": 20594087694720.0, + "grad_norm": 8.163114250852294, + "language_loss": 0.88971621, + "learning_rate": 3.2005102712806756e-06, + "loss": 0.91184306, + "num_input_tokens_seen": 56626415, + "step": 2630, + "time_per_iteration": 3.2038793563842773 + }, + { + "auxiliary_loss_clip": 0.01187055, + "auxiliary_loss_mlp": 0.01034133, + "balance_loss_clip": 1.05543363, + "balance_loss_mlp": 1.02499533, + "epoch": 0.3163590452714483, + "flos": 12784917600000.0, + "grad_norm": 2.0110783021194707, + "language_loss": 0.72861832, + "learning_rate": 3.1998871520274575e-06, + "loss": 0.75083017, + "num_input_tokens_seen": 56641750, + "step": 2631, + "time_per_iteration": 2.423333168029785 + }, + { + "auxiliary_loss_clip": 0.01169973, + "auxiliary_loss_mlp": 0.01033785, + "balance_loss_clip": 1.05091405, + "balance_loss_mlp": 1.02486813, + "epoch": 0.3164792881620874, + "flos": 23041292273280.0, + "grad_norm": 1.9667117042964626, + "language_loss": 0.84918535, + "learning_rate": 3.199263850752312e-06, + "loss": 0.87122297, + "num_input_tokens_seen": 56662585, + "step": 2632, + "time_per_iteration": 2.5151607990264893 + }, + { + "auxiliary_loss_clip": 0.01183456, + "auxiliary_loss_mlp": 0.01031778, + "balance_loss_clip": 1.05396509, + "balance_loss_mlp": 1.02214003, + "epoch": 0.31659953105272653, + "flos": 18296271780480.0, + "grad_norm": 2.0939513413928905, + "language_loss": 0.85332835, + "learning_rate": 3.198640367549795e-06, + "loss": 0.87548065, + "num_input_tokens_seen": 56681480, + "step": 2633, + "time_per_iteration": 3.1969892978668213 + }, + { + "auxiliary_loss_clip": 0.01181472, + "auxiliary_loss_mlp": 0.00763436, + "balance_loss_clip": 1.05226755, + "balance_loss_mlp": 1.00063336, + "epoch": 0.3167197739433656, + "flos": 25703421880320.0, + "grad_norm": 1.6355012981556705, + "language_loss": 0.85738385, + "learning_rate": 3.198016702514487e-06, + "loss": 0.87683296, + "num_input_tokens_seen": 56701760, + "step": 2634, + "time_per_iteration": 3.2695462703704834 + }, + { + "auxiliary_loss_clip": 0.01196091, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.05537128, + "balance_loss_mlp": 1.02170038, + "epoch": 0.3168400168340047, + "flos": 23546016230400.0, + "grad_norm": 1.6270182248402385, + "language_loss": 0.8490752, + "learning_rate": 3.1973928557409972e-06, + "loss": 0.87133884, + "num_input_tokens_seen": 56719800, + "step": 2635, + "time_per_iteration": 2.48649525642395 + }, + { + "auxiliary_loss_clip": 0.0119519, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.05547941, + "balance_loss_mlp": 1.02092755, + "epoch": 0.31696025972464376, + "flos": 28366449327360.0, + "grad_norm": 14.260331279797413, + "language_loss": 0.7103647, + "learning_rate": 3.1967688273239636e-06, + "loss": 0.73261523, + "num_input_tokens_seen": 56739605, + "step": 2636, + "time_per_iteration": 3.5414607524871826 + }, + { + "auxiliary_loss_clip": 0.01154153, + "auxiliary_loss_mlp": 0.01031605, + "balance_loss_clip": 1.0520649, + "balance_loss_mlp": 1.02268779, + "epoch": 0.31708050261528287, + "flos": 16399111144320.0, + "grad_norm": 1.7347145802318515, + "language_loss": 0.81985247, + "learning_rate": 3.1961446173580503e-06, + "loss": 0.84171003, + "num_input_tokens_seen": 56756545, + "step": 2637, + "time_per_iteration": 2.5163064002990723 + }, + { + "auxiliary_loss_clip": 0.01165142, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.05303407, + "balance_loss_mlp": 1.02125978, + "epoch": 0.317200745505922, + "flos": 26212347728640.0, + "grad_norm": 1.7263943719929746, + "language_loss": 0.77091151, + "learning_rate": 3.1955202259379502e-06, + "loss": 0.79286188, + "num_input_tokens_seen": 56778275, + "step": 2638, + "time_per_iteration": 2.5472452640533447 + }, + { + "auxiliary_loss_clip": 0.01181769, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.0538404, + "balance_loss_mlp": 1.02162218, + "epoch": 0.31732098839656103, + "flos": 31350876693120.0, + "grad_norm": 1.6281998933789583, + "language_loss": 0.82666028, + "learning_rate": 3.194895653158381e-06, + "loss": 0.84878135, + "num_input_tokens_seen": 56797215, + "step": 2639, + "time_per_iteration": 2.5508198738098145 + }, + { + "auxiliary_loss_clip": 0.01096226, + "auxiliary_loss_mlp": 0.01003902, + "balance_loss_clip": 1.02238989, + "balance_loss_mlp": 1.00228071, + "epoch": 0.31744123128720014, + "flos": 58989024835200.0, + "grad_norm": 0.7700196818730055, + "language_loss": 0.55551016, + "learning_rate": 3.194270899114093e-06, + "loss": 0.57651138, + "num_input_tokens_seen": 56863010, + "step": 2640, + "time_per_iteration": 3.122354745864868 + }, + { + "auxiliary_loss_clip": 0.01190445, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.05737782, + "balance_loss_mlp": 1.02482152, + "epoch": 0.31756147417783925, + "flos": 17417573372160.0, + "grad_norm": 1.8618097155476754, + "language_loss": 0.81921887, + "learning_rate": 3.193645963899858e-06, + "loss": 0.84146899, + "num_input_tokens_seen": 56880625, + "step": 2641, + "time_per_iteration": 2.4650070667266846 + }, + { + "auxiliary_loss_clip": 0.01163625, + "auxiliary_loss_mlp": 0.01026131, + "balance_loss_clip": 1.0531404, + "balance_loss_mlp": 1.01783395, + "epoch": 0.3176817170684783, + "flos": 25481673267840.0, + "grad_norm": 1.8033339524092595, + "language_loss": 0.83931476, + "learning_rate": 3.193020847610479e-06, + "loss": 0.86121231, + "num_input_tokens_seen": 56900945, + "step": 2642, + "time_per_iteration": 2.52522873878479 + }, + { + "auxiliary_loss_clip": 0.01163567, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.05569673, + "balance_loss_mlp": 1.02347064, + "epoch": 0.3178019599591174, + "flos": 24972603765120.0, + "grad_norm": 5.016620518105046, + "language_loss": 0.712484, + "learning_rate": 3.192395550340787e-06, + "loss": 0.73444486, + "num_input_tokens_seen": 56918895, + "step": 2643, + "time_per_iteration": 2.5277838706970215 + }, + { + "auxiliary_loss_clip": 0.01183301, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.0555222, + "balance_loss_mlp": 1.0216161, + "epoch": 0.31792220284975653, + "flos": 12422220019200.0, + "grad_norm": 2.169524321508281, + "language_loss": 0.7699393, + "learning_rate": 3.191770072185638e-06, + "loss": 0.79207242, + "num_input_tokens_seen": 56935890, + "step": 2644, + "time_per_iteration": 2.4622392654418945 + }, + { + "auxiliary_loss_clip": 0.01181257, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.05403161, + "balance_loss_mlp": 1.02333999, + "epoch": 0.3180424457403956, + "flos": 15485759089920.0, + "grad_norm": 2.2796203647570557, + "language_loss": 0.72508717, + "learning_rate": 3.191144413239916e-06, + "loss": 0.74722481, + "num_input_tokens_seen": 56952460, + "step": 2645, + "time_per_iteration": 2.4200439453125 + }, + { + "auxiliary_loss_clip": 0.0117038, + "auxiliary_loss_mlp": 0.0103609, + "balance_loss_clip": 1.053702, + "balance_loss_mlp": 1.02694082, + "epoch": 0.3181626886310347, + "flos": 26174964648960.0, + "grad_norm": 5.116912272222358, + "language_loss": 0.8796736, + "learning_rate": 3.190518573598534e-06, + "loss": 0.90173829, + "num_input_tokens_seen": 56969065, + "step": 2646, + "time_per_iteration": 2.5268490314483643 + }, + { + "auxiliary_loss_clip": 0.01160153, + "auxiliary_loss_mlp": 0.01035495, + "balance_loss_clip": 1.0505439, + "balance_loss_mlp": 1.02609539, + "epoch": 0.3182829315216738, + "flos": 25483109811840.0, + "grad_norm": 1.478628588711772, + "language_loss": 0.77458417, + "learning_rate": 3.1898925533564308e-06, + "loss": 0.79654062, + "num_input_tokens_seen": 56990535, + "step": 2647, + "time_per_iteration": 2.575714588165283 + }, + { + "auxiliary_loss_clip": 0.01144559, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.05087948, + "balance_loss_mlp": 1.02308822, + "epoch": 0.31840317441231286, + "flos": 18113701927680.0, + "grad_norm": 2.5112678346911785, + "language_loss": 0.64085007, + "learning_rate": 3.1892663526085733e-06, + "loss": 0.66261923, + "num_input_tokens_seen": 57008910, + "step": 2648, + "time_per_iteration": 2.505758047103882 + }, + { + "auxiliary_loss_clip": 0.01094315, + "auxiliary_loss_mlp": 0.01001825, + "balance_loss_clip": 1.02105796, + "balance_loss_mlp": 1.00009656, + "epoch": 0.31852341730295197, + "flos": 64741948957440.0, + "grad_norm": 0.7378669773576715, + "language_loss": 0.56882715, + "learning_rate": 3.188639971449956e-06, + "loss": 0.58978856, + "num_input_tokens_seen": 57074960, + "step": 2649, + "time_per_iteration": 2.993439197540283 + }, + { + "auxiliary_loss_clip": 0.01200289, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.05812693, + "balance_loss_mlp": 1.01944852, + "epoch": 0.318643660193591, + "flos": 20668135582080.0, + "grad_norm": 1.9412150794843985, + "language_loss": 0.71740568, + "learning_rate": 3.1880134099756e-06, + "loss": 0.73969215, + "num_input_tokens_seen": 57094595, + "step": 2650, + "time_per_iteration": 2.439453363418579 + }, + { + "auxiliary_loss_clip": 0.01179005, + "auxiliary_loss_mlp": 0.01024103, + "balance_loss_clip": 1.05146909, + "balance_loss_mlp": 1.01544273, + "epoch": 0.31876390308423014, + "flos": 26943345411840.0, + "grad_norm": 2.071014922135872, + "language_loss": 0.69667393, + "learning_rate": 3.1873866682805535e-06, + "loss": 0.71870506, + "num_input_tokens_seen": 57115290, + "step": 2651, + "time_per_iteration": 2.526967763900757 + }, + { + "auxiliary_loss_clip": 0.01173858, + "auxiliary_loss_mlp": 0.01029224, + "balance_loss_clip": 1.05532432, + "balance_loss_mlp": 1.02041447, + "epoch": 0.31888414597486925, + "flos": 18041916597120.0, + "grad_norm": 1.9763546844974356, + "language_loss": 0.88351083, + "learning_rate": 3.186759746459894e-06, + "loss": 0.90554166, + "num_input_tokens_seen": 57134400, + "step": 2652, + "time_per_iteration": 2.517998456954956 + }, + { + "auxiliary_loss_clip": 0.011717, + "auxiliary_loss_mlp": 0.01026544, + "balance_loss_clip": 1.0567311, + "balance_loss_mlp": 1.01747239, + "epoch": 0.3190043888655083, + "flos": 25149319701120.0, + "grad_norm": 1.8838131245056928, + "language_loss": 0.79488409, + "learning_rate": 3.1861326446087246e-06, + "loss": 0.81686658, + "num_input_tokens_seen": 57153140, + "step": 2653, + "time_per_iteration": 2.5384435653686523 + }, + { + "auxiliary_loss_clip": 0.01185326, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.05470014, + "balance_loss_mlp": 1.01944268, + "epoch": 0.3191246317561474, + "flos": 22053892331520.0, + "grad_norm": 2.074932612104354, + "language_loss": 0.7165615, + "learning_rate": 3.1855053628221763e-06, + "loss": 0.73870611, + "num_input_tokens_seen": 57172395, + "step": 2654, + "time_per_iteration": 2.4717390537261963 + }, + { + "auxiliary_loss_clip": 0.01147092, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.04911578, + "balance_loss_mlp": 1.01976037, + "epoch": 0.3192448746467865, + "flos": 14901815687040.0, + "grad_norm": 2.503759009675309, + "language_loss": 0.89959013, + "learning_rate": 3.184877901195407e-06, + "loss": 0.92135382, + "num_input_tokens_seen": 57189090, + "step": 2655, + "time_per_iteration": 2.4931271076202393 + }, + { + "auxiliary_loss_clip": 0.01070384, + "auxiliary_loss_mlp": 0.01011551, + "balance_loss_clip": 1.02411723, + "balance_loss_mlp": 1.00970364, + "epoch": 0.3193651175374256, + "flos": 67234832657280.0, + "grad_norm": 0.8162088215508979, + "language_loss": 0.62847888, + "learning_rate": 3.184250259823602e-06, + "loss": 0.64929819, + "num_input_tokens_seen": 57251620, + "step": 2656, + "time_per_iteration": 3.8533098697662354 + }, + { + "auxiliary_loss_clip": 0.01155689, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.05261636, + "balance_loss_mlp": 1.02412486, + "epoch": 0.3194853604280647, + "flos": 12233077977600.0, + "grad_norm": 2.6299314740257085, + "language_loss": 0.81567627, + "learning_rate": 3.183622438801974e-06, + "loss": 0.83756888, + "num_input_tokens_seen": 57266910, + "step": 2657, + "time_per_iteration": 2.510697603225708 + }, + { + "auxiliary_loss_clip": 0.01203241, + "auxiliary_loss_mlp": 0.01035455, + "balance_loss_clip": 1.06093216, + "balance_loss_mlp": 1.02666914, + "epoch": 0.3196056033187038, + "flos": 14939917038720.0, + "grad_norm": 1.8720572058097522, + "language_loss": 0.75034583, + "learning_rate": 3.1829944382257637e-06, + "loss": 0.77273279, + "num_input_tokens_seen": 57285040, + "step": 2658, + "time_per_iteration": 2.44197678565979 + }, + { + "auxiliary_loss_clip": 0.01182919, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.05681264, + "balance_loss_mlp": 1.02141011, + "epoch": 0.31972584620934286, + "flos": 23768878164480.0, + "grad_norm": 2.772633530992558, + "language_loss": 0.81245041, + "learning_rate": 3.1823662581902373e-06, + "loss": 0.83457935, + "num_input_tokens_seen": 57302725, + "step": 2659, + "time_per_iteration": 2.5027949810028076 + }, + { + "auxiliary_loss_clip": 0.01138991, + "auxiliary_loss_mlp": 0.01032327, + "balance_loss_clip": 1.04393208, + "balance_loss_mlp": 1.02283216, + "epoch": 0.31984608909998197, + "flos": 21251540280960.0, + "grad_norm": 1.953228005026313, + "language_loss": 0.74722189, + "learning_rate": 3.1817378987906896e-06, + "loss": 0.76893508, + "num_input_tokens_seen": 57322230, + "step": 2660, + "time_per_iteration": 3.260767698287964 + }, + { + "auxiliary_loss_clip": 0.01137363, + "auxiliary_loss_mlp": 0.01036617, + "balance_loss_clip": 1.05257249, + "balance_loss_mlp": 1.02715731, + "epoch": 0.3199663319906211, + "flos": 18296235866880.0, + "grad_norm": 1.8968072335925663, + "language_loss": 0.79911685, + "learning_rate": 3.181109360122442e-06, + "loss": 0.82085663, + "num_input_tokens_seen": 57339820, + "step": 2661, + "time_per_iteration": 3.325761556625366 + }, + { + "auxiliary_loss_clip": 0.01152568, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.05190444, + "balance_loss_mlp": 1.02107024, + "epoch": 0.32008657488126013, + "flos": 18733627779840.0, + "grad_norm": 2.125774745304905, + "language_loss": 0.78244263, + "learning_rate": 3.1804806422808445e-06, + "loss": 0.80427176, + "num_input_tokens_seen": 57356955, + "step": 2662, + "time_per_iteration": 3.2926275730133057 + }, + { + "auxiliary_loss_clip": 0.01158914, + "auxiliary_loss_mlp": 0.01034001, + "balance_loss_clip": 1.05006111, + "balance_loss_mlp": 1.02458954, + "epoch": 0.32020681777189924, + "flos": 20595344670720.0, + "grad_norm": 1.7905689567434808, + "language_loss": 0.73344374, + "learning_rate": 3.1798517453612714e-06, + "loss": 0.75537288, + "num_input_tokens_seen": 57376760, + "step": 2663, + "time_per_iteration": 2.514641523361206 + }, + { + "auxiliary_loss_clip": 0.01185759, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.06126308, + "balance_loss_mlp": 1.02701187, + "epoch": 0.32032706066253835, + "flos": 35261692750080.0, + "grad_norm": 1.7699949531014219, + "language_loss": 0.75344533, + "learning_rate": 3.1792226694591265e-06, + "loss": 0.77565867, + "num_input_tokens_seen": 57398145, + "step": 2664, + "time_per_iteration": 2.605893135070801 + }, + { + "auxiliary_loss_clip": 0.01151585, + "auxiliary_loss_mlp": 0.01026244, + "balance_loss_clip": 1.05338585, + "balance_loss_mlp": 1.01758909, + "epoch": 0.3204473035531774, + "flos": 15304230731520.0, + "grad_norm": 1.9760547933982293, + "language_loss": 0.80296624, + "learning_rate": 3.178593414669841e-06, + "loss": 0.82474452, + "num_input_tokens_seen": 57416730, + "step": 2665, + "time_per_iteration": 2.5311057567596436 + }, + { + "auxiliary_loss_clip": 0.01189132, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.05884397, + "balance_loss_mlp": 1.0215838, + "epoch": 0.3205675464438165, + "flos": 24462564595200.0, + "grad_norm": 2.0963162028926927, + "language_loss": 0.70260096, + "learning_rate": 3.1779639810888707e-06, + "loss": 0.72480023, + "num_input_tokens_seen": 57436325, + "step": 2666, + "time_per_iteration": 2.4995834827423096 + }, + { + "auxiliary_loss_clip": 0.01183598, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.05855751, + "balance_loss_mlp": 1.02562547, + "epoch": 0.3206877893344556, + "flos": 22456235548800.0, + "grad_norm": 1.8628636576511324, + "language_loss": 0.7586599, + "learning_rate": 3.1773343688117013e-06, + "loss": 0.78084755, + "num_input_tokens_seen": 57457235, + "step": 2667, + "time_per_iteration": 2.502856969833374 + }, + { + "auxiliary_loss_clip": 0.01174484, + "auxiliary_loss_mlp": 0.00763227, + "balance_loss_clip": 1.05508852, + "balance_loss_mlp": 1.00062799, + "epoch": 0.3208080322250947, + "flos": 20412236113920.0, + "grad_norm": 2.3254188746694915, + "language_loss": 0.84051573, + "learning_rate": 3.1767045779338445e-06, + "loss": 0.85989285, + "num_input_tokens_seen": 57474895, + "step": 2668, + "time_per_iteration": 2.5071640014648438 + }, + { + "auxiliary_loss_clip": 0.01180005, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.05166471, + "balance_loss_mlp": 1.02058411, + "epoch": 0.3209282751157338, + "flos": 21762118154880.0, + "grad_norm": 1.947844402617174, + "language_loss": 0.91487145, + "learning_rate": 3.176074608550839e-06, + "loss": 0.93695688, + "num_input_tokens_seen": 57490715, + "step": 2669, + "time_per_iteration": 2.4701504707336426 + }, + { + "auxiliary_loss_clip": 0.01126832, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.04980314, + "balance_loss_mlp": 1.02528167, + "epoch": 0.32104851800637285, + "flos": 22055041566720.0, + "grad_norm": 2.499392324898509, + "language_loss": 0.82422119, + "learning_rate": 3.17544446075825e-06, + "loss": 0.84583241, + "num_input_tokens_seen": 57509880, + "step": 2670, + "time_per_iteration": 2.614842176437378 + }, + { + "auxiliary_loss_clip": 0.01170675, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.05145931, + "balance_loss_mlp": 1.02382159, + "epoch": 0.32116876089701196, + "flos": 37012301896320.0, + "grad_norm": 1.5296495839859063, + "language_loss": 0.71071321, + "learning_rate": 3.174814134651671e-06, + "loss": 0.73274124, + "num_input_tokens_seen": 57532430, + "step": 2671, + "time_per_iteration": 2.630574941635132 + }, + { + "auxiliary_loss_clip": 0.01193212, + "auxiliary_loss_mlp": 0.01029687, + "balance_loss_clip": 1.05584431, + "balance_loss_mlp": 1.02081239, + "epoch": 0.3212890037876511, + "flos": 21979233912960.0, + "grad_norm": 1.698827045261783, + "language_loss": 0.80024511, + "learning_rate": 3.1741836303267215e-06, + "loss": 0.82247412, + "num_input_tokens_seen": 57551965, + "step": 2672, + "time_per_iteration": 2.4482243061065674 + }, + { + "auxiliary_loss_clip": 0.01197403, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.05785489, + "balance_loss_mlp": 1.02366734, + "epoch": 0.32140924667829013, + "flos": 10342345875840.0, + "grad_norm": 2.4938858532174906, + "language_loss": 0.74717385, + "learning_rate": 3.1735529478790496e-06, + "loss": 0.76947057, + "num_input_tokens_seen": 57569955, + "step": 2673, + "time_per_iteration": 2.4303486347198486 + }, + { + "auxiliary_loss_clip": 0.01184206, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.05565667, + "balance_loss_mlp": 1.02731144, + "epoch": 0.32152948956892924, + "flos": 50798910072960.0, + "grad_norm": 2.4063442551935266, + "language_loss": 0.79601586, + "learning_rate": 3.172922087404328e-06, + "loss": 0.81822538, + "num_input_tokens_seen": 57592215, + "step": 2674, + "time_per_iteration": 2.72676157951355 + }, + { + "auxiliary_loss_clip": 0.01092916, + "auxiliary_loss_mlp": 0.01010854, + "balance_loss_clip": 1.02002215, + "balance_loss_mlp": 1.00948358, + "epoch": 0.32164973245956835, + "flos": 63863250549120.0, + "grad_norm": 0.7707314387373306, + "language_loss": 0.55224103, + "learning_rate": 3.1722910489982586e-06, + "loss": 0.57327873, + "num_input_tokens_seen": 57652575, + "step": 2675, + "time_per_iteration": 3.1237881183624268 + }, + { + "auxiliary_loss_clip": 0.01163886, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.05245841, + "balance_loss_mlp": 1.02481246, + "epoch": 0.3217699753502074, + "flos": 23513948363520.0, + "grad_norm": 1.6502368158376535, + "language_loss": 0.80044043, + "learning_rate": 3.1716598327565694e-06, + "loss": 0.82242602, + "num_input_tokens_seen": 57672215, + "step": 2676, + "time_per_iteration": 2.528956890106201 + }, + { + "auxiliary_loss_clip": 0.01194899, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.05655432, + "balance_loss_mlp": 1.01999962, + "epoch": 0.3218902182408465, + "flos": 19062533640960.0, + "grad_norm": 1.4831869881905126, + "language_loss": 0.84150463, + "learning_rate": 3.171028438775015e-06, + "loss": 0.86374193, + "num_input_tokens_seen": 57691410, + "step": 2677, + "time_per_iteration": 2.451852798461914 + }, + { + "auxiliary_loss_clip": 0.01196067, + "auxiliary_loss_mlp": 0.01026596, + "balance_loss_clip": 1.05606842, + "balance_loss_mlp": 1.01790512, + "epoch": 0.3220104611314856, + "flos": 20375571306240.0, + "grad_norm": 1.8074374959184454, + "language_loss": 0.84374976, + "learning_rate": 3.170396867149377e-06, + "loss": 0.86597633, + "num_input_tokens_seen": 57709415, + "step": 2678, + "time_per_iteration": 2.43979811668396 + }, + { + "auxiliary_loss_clip": 0.01133654, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.05165195, + "balance_loss_mlp": 1.02676725, + "epoch": 0.3221307040221247, + "flos": 20117014231680.0, + "grad_norm": 2.0008282052522466, + "language_loss": 0.86208737, + "learning_rate": 3.1697651179754653e-06, + "loss": 0.88378334, + "num_input_tokens_seen": 57728075, + "step": 2679, + "time_per_iteration": 2.5684173107147217 + }, + { + "auxiliary_loss_clip": 0.01156213, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.05777764, + "balance_loss_mlp": 1.02197468, + "epoch": 0.3222509469127638, + "flos": 23987789602560.0, + "grad_norm": 1.8692215743656206, + "language_loss": 0.72507495, + "learning_rate": 3.1691331913491153e-06, + "loss": 0.74694628, + "num_input_tokens_seen": 57750645, + "step": 2680, + "time_per_iteration": 2.624061107635498 + }, + { + "auxiliary_loss_clip": 0.01196665, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.05548632, + "balance_loss_mlp": 1.02124465, + "epoch": 0.32237118980340285, + "flos": 17675735397120.0, + "grad_norm": 1.939409379701754, + "language_loss": 0.84830856, + "learning_rate": 3.1685010873661898e-06, + "loss": 0.87056983, + "num_input_tokens_seen": 57769820, + "step": 2681, + "time_per_iteration": 2.424513578414917 + }, + { + "auxiliary_loss_clip": 0.01180259, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.05473995, + "balance_loss_mlp": 1.02457142, + "epoch": 0.32249143269404196, + "flos": 23147982645120.0, + "grad_norm": 1.7724503176797812, + "language_loss": 0.79186046, + "learning_rate": 3.167868806122578e-06, + "loss": 0.81400692, + "num_input_tokens_seen": 57788870, + "step": 2682, + "time_per_iteration": 2.5035159587860107 + }, + { + "auxiliary_loss_clip": 0.011715, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.05469, + "balance_loss_mlp": 1.02213037, + "epoch": 0.32261167558468107, + "flos": 24422308427520.0, + "grad_norm": 1.9495859899568004, + "language_loss": 0.66128838, + "learning_rate": 3.1672363477141968e-06, + "loss": 0.68331432, + "num_input_tokens_seen": 57808165, + "step": 2683, + "time_per_iteration": 3.332672119140625 + }, + { + "auxiliary_loss_clip": 0.01172896, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.05403292, + "balance_loss_mlp": 1.0241425, + "epoch": 0.3227319184753201, + "flos": 30367175852160.0, + "grad_norm": 2.2947186139337807, + "language_loss": 0.84680235, + "learning_rate": 3.1666037122369903e-06, + "loss": 0.86886799, + "num_input_tokens_seen": 57828825, + "step": 2684, + "time_per_iteration": 2.608609914779663 + }, + { + "auxiliary_loss_clip": 0.01178337, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.05092168, + "balance_loss_mlp": 1.02071261, + "epoch": 0.32285216136595923, + "flos": 16946174257920.0, + "grad_norm": 1.9578937502283034, + "language_loss": 0.86098379, + "learning_rate": 3.165970899786928e-06, + "loss": 0.88305682, + "num_input_tokens_seen": 57846740, + "step": 2685, + "time_per_iteration": 2.486828565597534 + }, + { + "auxiliary_loss_clip": 0.01156262, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.05097556, + "balance_loss_mlp": 1.02024007, + "epoch": 0.32297240425659834, + "flos": 21981532383360.0, + "grad_norm": 1.7642745578538344, + "language_loss": 0.75392663, + "learning_rate": 3.1653379104600067e-06, + "loss": 0.77577925, + "num_input_tokens_seen": 57866885, + "step": 2686, + "time_per_iteration": 2.5977187156677246 + }, + { + "auxiliary_loss_clip": 0.01180056, + "auxiliary_loss_mlp": 0.0102883, + "balance_loss_clip": 1.05377781, + "balance_loss_mlp": 1.02030635, + "epoch": 0.3230926471472374, + "flos": 22748045639040.0, + "grad_norm": 1.7159944434712597, + "language_loss": 0.69325596, + "learning_rate": 3.164704744352251e-06, + "loss": 0.71534479, + "num_input_tokens_seen": 57887690, + "step": 2687, + "time_per_iteration": 4.040462255477905 + }, + { + "auxiliary_loss_clip": 0.01179025, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.0531683, + "balance_loss_mlp": 1.02308893, + "epoch": 0.3232128900378765, + "flos": 16942977947520.0, + "grad_norm": 1.7777130642975978, + "language_loss": 0.80960113, + "learning_rate": 3.164071401559713e-06, + "loss": 0.83170319, + "num_input_tokens_seen": 57905090, + "step": 2688, + "time_per_iteration": 2.445493459701538 + }, + { + "auxiliary_loss_clip": 0.01168831, + "auxiliary_loss_mlp": 0.01031494, + "balance_loss_clip": 1.05359173, + "balance_loss_mlp": 1.02278543, + "epoch": 0.3233331329285156, + "flos": 24023736138240.0, + "grad_norm": 1.9197625781546181, + "language_loss": 0.71219003, + "learning_rate": 3.1634378821784674e-06, + "loss": 0.73419333, + "num_input_tokens_seen": 57925305, + "step": 2689, + "time_per_iteration": 3.292010545730591 + }, + { + "auxiliary_loss_clip": 0.0115726, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.05533051, + "balance_loss_mlp": 1.02390885, + "epoch": 0.3234533758191547, + "flos": 18113845582080.0, + "grad_norm": 2.470326534727171, + "language_loss": 0.73527181, + "learning_rate": 3.1628041863046208e-06, + "loss": 0.75716847, + "num_input_tokens_seen": 57942720, + "step": 2690, + "time_per_iteration": 2.5170297622680664 + }, + { + "auxiliary_loss_clip": 0.01199781, + "auxiliary_loss_mlp": 0.01032985, + "balance_loss_clip": 1.05511689, + "balance_loss_mlp": 1.02328753, + "epoch": 0.3235736187097938, + "flos": 16946138344320.0, + "grad_norm": 2.0010021595896332, + "language_loss": 0.9110775, + "learning_rate": 3.162170314034304e-06, + "loss": 0.93340522, + "num_input_tokens_seen": 57960135, + "step": 2691, + "time_per_iteration": 2.4119389057159424 + }, + { + "auxiliary_loss_clip": 0.01199474, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.05615497, + "balance_loss_mlp": 1.0233705, + "epoch": 0.3236938616004329, + "flos": 22127150119680.0, + "grad_norm": 1.8760269033822736, + "language_loss": 0.81023288, + "learning_rate": 3.1615362654636738e-06, + "loss": 0.83255661, + "num_input_tokens_seen": 57980875, + "step": 2692, + "time_per_iteration": 2.4539642333984375 + }, + { + "auxiliary_loss_clip": 0.01146675, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.05474424, + "balance_loss_mlp": 1.02604926, + "epoch": 0.32381410449107195, + "flos": 17164618819200.0, + "grad_norm": 1.8382800297038502, + "language_loss": 0.86811411, + "learning_rate": 3.1609020406889163e-06, + "loss": 0.88992739, + "num_input_tokens_seen": 57998310, + "step": 2693, + "time_per_iteration": 2.5080347061157227 + }, + { + "auxiliary_loss_clip": 0.01169948, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.05251408, + "balance_loss_mlp": 1.02341318, + "epoch": 0.32393434738171106, + "flos": 16578125550720.0, + "grad_norm": 1.6440058362721257, + "language_loss": 0.8532235, + "learning_rate": 3.1602676398062416e-06, + "loss": 0.87525845, + "num_input_tokens_seen": 58017220, + "step": 2694, + "time_per_iteration": 2.4832286834716797 + }, + { + "auxiliary_loss_clip": 0.01179039, + "auxiliary_loss_mlp": 0.01028417, + "balance_loss_clip": 1.05406702, + "balance_loss_mlp": 1.01906526, + "epoch": 0.3240545902723502, + "flos": 25483612602240.0, + "grad_norm": 2.1179173702396863, + "language_loss": 0.61864489, + "learning_rate": 3.1596330629118886e-06, + "loss": 0.64071947, + "num_input_tokens_seen": 58037190, + "step": 2695, + "time_per_iteration": 2.4912328720092773 + }, + { + "auxiliary_loss_clip": 0.01132065, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.04854774, + "balance_loss_mlp": 1.0222578, + "epoch": 0.32417483316298923, + "flos": 35845851634560.0, + "grad_norm": 1.893781614079185, + "language_loss": 0.72916615, + "learning_rate": 3.1589983101021223e-06, + "loss": 0.75080466, + "num_input_tokens_seen": 58055820, + "step": 2696, + "time_per_iteration": 2.6818079948425293 + }, + { + "auxiliary_loss_clip": 0.01168199, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.05292678, + "balance_loss_mlp": 1.02068579, + "epoch": 0.32429507605362834, + "flos": 30080501406720.0, + "grad_norm": 1.9577613916441583, + "language_loss": 0.84597111, + "learning_rate": 3.1583633814732337e-06, + "loss": 0.86794734, + "num_input_tokens_seen": 58075340, + "step": 2697, + "time_per_iteration": 2.5700199604034424 + }, + { + "auxiliary_loss_clip": 0.01192433, + "auxiliary_loss_mlp": 0.01033357, + "balance_loss_clip": 1.05284238, + "balance_loss_mlp": 1.0242852, + "epoch": 0.3244153189442674, + "flos": 18223265387520.0, + "grad_norm": 2.3761237306453147, + "language_loss": 0.71976578, + "learning_rate": 3.157728277121541e-06, + "loss": 0.74202365, + "num_input_tokens_seen": 58093515, + "step": 2698, + "time_per_iteration": 2.4299566745758057 + }, + { + "auxiliary_loss_clip": 0.0119341, + "auxiliary_loss_mlp": 0.01027828, + "balance_loss_clip": 1.0510484, + "balance_loss_mlp": 1.01845837, + "epoch": 0.3245355618349065, + "flos": 17710317216000.0, + "grad_norm": 6.400596133314306, + "language_loss": 0.78365457, + "learning_rate": 3.1570929971433897e-06, + "loss": 0.80586696, + "num_input_tokens_seen": 58109300, + "step": 2699, + "time_per_iteration": 2.424875020980835 + }, + { + "auxiliary_loss_clip": 0.01181156, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.05574501, + "balance_loss_mlp": 1.02833796, + "epoch": 0.3246558047255456, + "flos": 23440798316160.0, + "grad_norm": 1.9365195358887277, + "language_loss": 0.83794731, + "learning_rate": 3.1564575416351504e-06, + "loss": 0.8601284, + "num_input_tokens_seen": 58128000, + "step": 2700, + "time_per_iteration": 2.5128276348114014 + }, + { + "auxiliary_loss_clip": 0.01196366, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.05536652, + "balance_loss_mlp": 1.0214653, + "epoch": 0.32477604761618467, + "flos": 21760861178880.0, + "grad_norm": 3.1787355314932295, + "language_loss": 0.74069202, + "learning_rate": 3.155821910693221e-06, + "loss": 0.76296556, + "num_input_tokens_seen": 58147415, + "step": 2701, + "time_per_iteration": 2.4385783672332764 + }, + { + "auxiliary_loss_clip": 0.01167244, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.05251956, + "balance_loss_mlp": 1.02025723, + "epoch": 0.3248962905068238, + "flos": 19828328624640.0, + "grad_norm": 1.5612096909784519, + "language_loss": 0.85856962, + "learning_rate": 3.1551861044140275e-06, + "loss": 0.88053811, + "num_input_tokens_seen": 58167050, + "step": 2702, + "time_per_iteration": 2.4990007877349854 + }, + { + "auxiliary_loss_clip": 0.01132051, + "auxiliary_loss_mlp": 0.01028084, + "balance_loss_clip": 1.0496099, + "balance_loss_mlp": 1.0193758, + "epoch": 0.3250165333974629, + "flos": 23948215793280.0, + "grad_norm": 1.6658724709294723, + "language_loss": 0.77569652, + "learning_rate": 3.15455012289402e-06, + "loss": 0.79729784, + "num_input_tokens_seen": 58186695, + "step": 2703, + "time_per_iteration": 2.5946218967437744 + }, + { + "auxiliary_loss_clip": 0.01182007, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.05555046, + "balance_loss_mlp": 1.02167821, + "epoch": 0.32513677628810195, + "flos": 23989333887360.0, + "grad_norm": 3.9607965909644904, + "language_loss": 0.84283894, + "learning_rate": 3.153913966229677e-06, + "loss": 0.8649677, + "num_input_tokens_seen": 58205815, + "step": 2704, + "time_per_iteration": 2.502291440963745 + }, + { + "auxiliary_loss_clip": 0.01078712, + "auxiliary_loss_mlp": 0.01005777, + "balance_loss_clip": 1.01642036, + "balance_loss_mlp": 1.00431657, + "epoch": 0.32525701917874106, + "flos": 70655790009600.0, + "grad_norm": 0.6387040425963723, + "language_loss": 0.50328881, + "learning_rate": 3.1532776345175027e-06, + "loss": 0.52413368, + "num_input_tokens_seen": 58270960, + "step": 2705, + "time_per_iteration": 3.036266326904297 + }, + { + "auxiliary_loss_clip": 0.01194323, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.05550146, + "balance_loss_mlp": 1.02324569, + "epoch": 0.32537726206938017, + "flos": 19682639061120.0, + "grad_norm": 2.132131082803712, + "language_loss": 0.78440779, + "learning_rate": 3.1526411278540285e-06, + "loss": 0.80667508, + "num_input_tokens_seen": 58289390, + "step": 2706, + "time_per_iteration": 2.42107892036438 + }, + { + "auxiliary_loss_clip": 0.01175325, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.0531894, + "balance_loss_mlp": 1.02490735, + "epoch": 0.3254975049600192, + "flos": 28760999293440.0, + "grad_norm": 2.7585289727314612, + "language_loss": 0.8130399, + "learning_rate": 3.1520044463358116e-06, + "loss": 0.83514094, + "num_input_tokens_seen": 58306120, + "step": 2707, + "time_per_iteration": 2.557586193084717 + }, + { + "auxiliary_loss_clip": 0.0118009, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.05453086, + "balance_loss_mlp": 1.02064061, + "epoch": 0.32561774785065833, + "flos": 18877378008960.0, + "grad_norm": 1.4917316052584815, + "language_loss": 0.80196697, + "learning_rate": 3.151367590059436e-06, + "loss": 0.82406354, + "num_input_tokens_seen": 58324545, + "step": 2708, + "time_per_iteration": 2.468174695968628 + }, + { + "auxiliary_loss_clip": 0.01197417, + "auxiliary_loss_mlp": 0.00763989, + "balance_loss_clip": 1.05582774, + "balance_loss_mlp": 1.00070953, + "epoch": 0.32573799074129745, + "flos": 23112107936640.0, + "grad_norm": 2.134552221016289, + "language_loss": 0.86764133, + "learning_rate": 3.1507305591215117e-06, + "loss": 0.88725543, + "num_input_tokens_seen": 58342455, + "step": 2709, + "time_per_iteration": 2.457151412963867 + }, + { + "auxiliary_loss_clip": 0.0108015, + "auxiliary_loss_mlp": 0.01004537, + "balance_loss_clip": 1.01838183, + "balance_loss_mlp": 1.00319028, + "epoch": 0.3258582336319365, + "flos": 71237650423680.0, + "grad_norm": 0.6997374908262437, + "language_loss": 0.55731535, + "learning_rate": 3.150093353618677e-06, + "loss": 0.57816219, + "num_input_tokens_seen": 58407185, + "step": 2710, + "time_per_iteration": 3.792940139770508 + }, + { + "auxiliary_loss_clip": 0.01184014, + "auxiliary_loss_mlp": 0.01029317, + "balance_loss_clip": 1.05206847, + "balance_loss_mlp": 1.02055478, + "epoch": 0.3259784765225756, + "flos": 22456020067200.0, + "grad_norm": 2.3522942503819775, + "language_loss": 0.88419425, + "learning_rate": 3.149455973647596e-06, + "loss": 0.90632755, + "num_input_tokens_seen": 58425245, + "step": 2711, + "time_per_iteration": 2.4712884426116943 + }, + { + "auxiliary_loss_clip": 0.01142173, + "auxiliary_loss_mlp": 0.01026528, + "balance_loss_clip": 1.0448451, + "balance_loss_mlp": 1.01640081, + "epoch": 0.32609871941321467, + "flos": 20484811543680.0, + "grad_norm": 1.8723609759291062, + "language_loss": 0.76585746, + "learning_rate": 3.1488184193049563e-06, + "loss": 0.78754443, + "num_input_tokens_seen": 58444780, + "step": 2712, + "time_per_iteration": 2.5319721698760986 + }, + { + "auxiliary_loss_clip": 0.01196111, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.05733132, + "balance_loss_mlp": 1.02011943, + "epoch": 0.3262189623038538, + "flos": 22416805393920.0, + "grad_norm": 1.5342698818434903, + "language_loss": 0.71899927, + "learning_rate": 3.1481806906874767e-06, + "loss": 0.74124992, + "num_input_tokens_seen": 58466090, + "step": 2713, + "time_per_iteration": 2.4679436683654785 + }, + { + "auxiliary_loss_clip": 0.01195718, + "auxiliary_loss_mlp": 0.01025873, + "balance_loss_clip": 1.05623674, + "balance_loss_mlp": 1.01780272, + "epoch": 0.3263392051944929, + "flos": 20923496346240.0, + "grad_norm": 1.539643688737575, + "language_loss": 0.87801594, + "learning_rate": 3.147542787891899e-06, + "loss": 0.90023184, + "num_input_tokens_seen": 58485435, + "step": 2714, + "time_per_iteration": 3.2124831676483154 + }, + { + "auxiliary_loss_clip": 0.01170528, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.05606866, + "balance_loss_mlp": 1.02342927, + "epoch": 0.32645944808513194, + "flos": 24025172682240.0, + "grad_norm": 1.928348410934563, + "language_loss": 0.74885017, + "learning_rate": 3.1469047110149926e-06, + "loss": 0.77088434, + "num_input_tokens_seen": 58504175, + "step": 2715, + "time_per_iteration": 3.2598342895507812 + }, + { + "auxiliary_loss_clip": 0.0113328, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.05044687, + "balance_loss_mlp": 1.01994312, + "epoch": 0.32657969097577105, + "flos": 21032413361280.0, + "grad_norm": 1.8091559194013147, + "language_loss": 0.85203236, + "learning_rate": 3.146266460153554e-06, + "loss": 0.87365127, + "num_input_tokens_seen": 58523885, + "step": 2716, + "time_per_iteration": 3.3400070667266846 + }, + { + "auxiliary_loss_clip": 0.01165621, + "auxiliary_loss_mlp": 0.0076389, + "balance_loss_clip": 1.05274141, + "balance_loss_mlp": 1.00067139, + "epoch": 0.32669993386641016, + "flos": 22710267509760.0, + "grad_norm": 1.8019803664941803, + "language_loss": 0.79855025, + "learning_rate": 3.145628035404404e-06, + "loss": 0.81784528, + "num_input_tokens_seen": 58543085, + "step": 2717, + "time_per_iteration": 2.531035900115967 + }, + { + "auxiliary_loss_clip": 0.01077913, + "auxiliary_loss_mlp": 0.01001952, + "balance_loss_clip": 1.01711452, + "balance_loss_mlp": 1.00058711, + "epoch": 0.3268201767570492, + "flos": 72105718406400.0, + "grad_norm": 0.8626224525147229, + "language_loss": 0.57516921, + "learning_rate": 3.1449894368643922e-06, + "loss": 0.59596789, + "num_input_tokens_seen": 58605400, + "step": 2718, + "time_per_iteration": 3.123241424560547 + }, + { + "auxiliary_loss_clip": 0.01151925, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.05336189, + "balance_loss_mlp": 1.02343583, + "epoch": 0.32694041964768833, + "flos": 24535175938560.0, + "grad_norm": 1.433767800586921, + "language_loss": 0.7115978, + "learning_rate": 3.1443506646303934e-06, + "loss": 0.73344189, + "num_input_tokens_seen": 58626700, + "step": 2719, + "time_per_iteration": 2.5780811309814453 + }, + { + "auxiliary_loss_clip": 0.01183286, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.05314898, + "balance_loss_mlp": 1.02038455, + "epoch": 0.32706066253832744, + "flos": 33183003755520.0, + "grad_norm": 1.9554570547083465, + "language_loss": 0.66973567, + "learning_rate": 3.1437117187993086e-06, + "loss": 0.69186592, + "num_input_tokens_seen": 58649020, + "step": 2720, + "time_per_iteration": 2.5814831256866455 + }, + { + "auxiliary_loss_clip": 0.01146446, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.04847312, + "balance_loss_mlp": 1.02634025, + "epoch": 0.3271809054289665, + "flos": 24061622008320.0, + "grad_norm": 1.6146316645045231, + "language_loss": 0.80015755, + "learning_rate": 3.143072599468065e-06, + "loss": 0.82197404, + "num_input_tokens_seen": 58668845, + "step": 2721, + "time_per_iteration": 2.5650877952575684 + }, + { + "auxiliary_loss_clip": 0.01167006, + "auxiliary_loss_mlp": 0.01027684, + "balance_loss_clip": 1.05463767, + "balance_loss_mlp": 1.01905346, + "epoch": 0.3273011483196056, + "flos": 38253769712640.0, + "grad_norm": 1.5675336047065764, + "language_loss": 0.75422573, + "learning_rate": 3.1424333067336174e-06, + "loss": 0.7761727, + "num_input_tokens_seen": 58691610, + "step": 2722, + "time_per_iteration": 2.6553666591644287 + }, + { + "auxiliary_loss_clip": 0.01185385, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.05302215, + "balance_loss_mlp": 1.02116001, + "epoch": 0.3274213912102447, + "flos": 29054389582080.0, + "grad_norm": 2.0408481163148777, + "language_loss": 0.78059494, + "learning_rate": 3.141793840692945e-06, + "loss": 0.80275631, + "num_input_tokens_seen": 58712360, + "step": 2723, + "time_per_iteration": 2.5347023010253906 + }, + { + "auxiliary_loss_clip": 0.01159556, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.05201828, + "balance_loss_mlp": 1.02399266, + "epoch": 0.32754163410088377, + "flos": 29133249891840.0, + "grad_norm": 8.740937954453335, + "language_loss": 0.61173701, + "learning_rate": 3.1411542014430553e-06, + "loss": 0.63367105, + "num_input_tokens_seen": 58733440, + "step": 2724, + "time_per_iteration": 2.575085163116455 + }, + { + "auxiliary_loss_clip": 0.01148998, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.04746938, + "balance_loss_mlp": 1.01944327, + "epoch": 0.3276618769915229, + "flos": 20631075724800.0, + "grad_norm": 1.761884134429568, + "language_loss": 0.81555158, + "learning_rate": 3.1405143890809804e-06, + "loss": 0.83731902, + "num_input_tokens_seen": 58752735, + "step": 2725, + "time_per_iteration": 2.53847074508667 + }, + { + "auxiliary_loss_clip": 0.0116511, + "auxiliary_loss_mlp": 0.01028105, + "balance_loss_clip": 1.05330873, + "balance_loss_mlp": 1.0199213, + "epoch": 0.327782119882162, + "flos": 18657425076480.0, + "grad_norm": 1.5618795700319843, + "language_loss": 0.69929993, + "learning_rate": 3.1398744037037796e-06, + "loss": 0.72123206, + "num_input_tokens_seen": 58772070, + "step": 2726, + "time_per_iteration": 2.500041961669922 + }, + { + "auxiliary_loss_clip": 0.0116664, + "auxiliary_loss_mlp": 0.01031488, + "balance_loss_clip": 1.05530238, + "balance_loss_mlp": 1.02306557, + "epoch": 0.32790236277280105, + "flos": 21795802133760.0, + "grad_norm": 1.9923204000146297, + "language_loss": 0.84092665, + "learning_rate": 3.139234245408538e-06, + "loss": 0.86290789, + "num_input_tokens_seen": 58790950, + "step": 2727, + "time_per_iteration": 2.509899139404297 + }, + { + "auxiliary_loss_clip": 0.0115562, + "auxiliary_loss_mlp": 0.00763273, + "balance_loss_clip": 1.05505037, + "balance_loss_mlp": 1.00075626, + "epoch": 0.32802260566344016, + "flos": 23331414424320.0, + "grad_norm": 2.551165177946793, + "language_loss": 0.76160979, + "learning_rate": 3.1385939142923666e-06, + "loss": 0.78079873, + "num_input_tokens_seen": 58813340, + "step": 2728, + "time_per_iteration": 2.5747344493865967 + }, + { + "auxiliary_loss_clip": 0.01169282, + "auxiliary_loss_mlp": 0.0103031, + "balance_loss_clip": 1.05262113, + "balance_loss_mlp": 1.02110684, + "epoch": 0.3281428485540792, + "flos": 24206988349440.0, + "grad_norm": 1.9820175924414678, + "language_loss": 0.78339201, + "learning_rate": 3.137953410452405e-06, + "loss": 0.80538797, + "num_input_tokens_seen": 58833610, + "step": 2729, + "time_per_iteration": 2.5413856506347656 + }, + { + "auxiliary_loss_clip": 0.01163273, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.05036521, + "balance_loss_mlp": 1.02248168, + "epoch": 0.3282630914447183, + "flos": 34128962380800.0, + "grad_norm": 2.1221133362935296, + "language_loss": 0.74244004, + "learning_rate": 3.1373127339858146e-06, + "loss": 0.76438129, + "num_input_tokens_seen": 58856210, + "step": 2730, + "time_per_iteration": 2.608194589614868 + }, + { + "auxiliary_loss_clip": 0.01145347, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.04652357, + "balance_loss_mlp": 1.02100015, + "epoch": 0.32838333433535744, + "flos": 27600726170880.0, + "grad_norm": 2.0690146127856117, + "language_loss": 0.74560702, + "learning_rate": 3.136671884989787e-06, + "loss": 0.76734602, + "num_input_tokens_seen": 58876120, + "step": 2731, + "time_per_iteration": 2.608880043029785 + }, + { + "auxiliary_loss_clip": 0.01128067, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.04907537, + "balance_loss_mlp": 1.02197933, + "epoch": 0.3285035772259965, + "flos": 12349500935040.0, + "grad_norm": 3.5404495958594357, + "language_loss": 0.87351179, + "learning_rate": 3.1360308635615383e-06, + "loss": 0.8951025, + "num_input_tokens_seen": 58894660, + "step": 2732, + "time_per_iteration": 2.624431848526001 + }, + { + "auxiliary_loss_clip": 0.01173104, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.05331719, + "balance_loss_mlp": 1.01854002, + "epoch": 0.3286238201166356, + "flos": 24316084932480.0, + "grad_norm": 9.221007978121659, + "language_loss": 0.78402007, + "learning_rate": 3.135389669798311e-06, + "loss": 0.80603415, + "num_input_tokens_seen": 58912720, + "step": 2733, + "time_per_iteration": 2.5329174995422363 + }, + { + "auxiliary_loss_clip": 0.01178229, + "auxiliary_loss_mlp": 0.00762775, + "balance_loss_clip": 1.05219889, + "balance_loss_mlp": 1.00064731, + "epoch": 0.3287440630072747, + "flos": 21392812471680.0, + "grad_norm": 1.7703876398850429, + "language_loss": 0.79878092, + "learning_rate": 3.134748303797373e-06, + "loss": 0.81819093, + "num_input_tokens_seen": 58930090, + "step": 2734, + "time_per_iteration": 2.5293099880218506 + }, + { + "auxiliary_loss_clip": 0.01142246, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.05022717, + "balance_loss_mlp": 1.02349973, + "epoch": 0.32886430589791377, + "flos": 23732536579200.0, + "grad_norm": 3.649784484599669, + "language_loss": 0.8135736, + "learning_rate": 3.1341067656560203e-06, + "loss": 0.83532596, + "num_input_tokens_seen": 58947935, + "step": 2735, + "time_per_iteration": 2.584841012954712 + }, + { + "auxiliary_loss_clip": 0.01175786, + "auxiliary_loss_mlp": 0.01027925, + "balance_loss_clip": 1.05361867, + "balance_loss_mlp": 1.01877546, + "epoch": 0.3289845487885529, + "flos": 22418708814720.0, + "grad_norm": 1.9252001770875868, + "language_loss": 0.86269391, + "learning_rate": 3.133465055471572e-06, + "loss": 0.88473094, + "num_input_tokens_seen": 58967720, + "step": 2736, + "time_per_iteration": 2.5454940795898438 + }, + { + "auxiliary_loss_clip": 0.01148171, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.04972839, + "balance_loss_mlp": 1.01839387, + "epoch": 0.329104791679192, + "flos": 19682603147520.0, + "grad_norm": 2.735990568750217, + "language_loss": 0.66060233, + "learning_rate": 3.1328231733413767e-06, + "loss": 0.68235493, + "num_input_tokens_seen": 58984360, + "step": 2737, + "time_per_iteration": 3.2627367973327637 + }, + { + "auxiliary_loss_clip": 0.01177688, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.05348146, + "balance_loss_mlp": 1.02508819, + "epoch": 0.32922503456983104, + "flos": 15997234803840.0, + "grad_norm": 2.908290414522752, + "language_loss": 0.90680486, + "learning_rate": 3.1321811193628067e-06, + "loss": 0.92892897, + "num_input_tokens_seen": 59002505, + "step": 2738, + "time_per_iteration": 2.448829174041748 + }, + { + "auxiliary_loss_clip": 0.01183221, + "auxiliary_loss_mlp": 0.0076382, + "balance_loss_clip": 1.05626762, + "balance_loss_mlp": 1.00060153, + "epoch": 0.32934527746047015, + "flos": 26834069260800.0, + "grad_norm": 1.8988373423312366, + "language_loss": 0.70013416, + "learning_rate": 3.131538893633261e-06, + "loss": 0.71960455, + "num_input_tokens_seen": 59022065, + "step": 2739, + "time_per_iteration": 2.5276527404785156 + }, + { + "auxiliary_loss_clip": 0.01196965, + "auxiliary_loss_mlp": 0.01033193, + "balance_loss_clip": 1.0562135, + "balance_loss_mlp": 1.02463353, + "epoch": 0.32946552035110926, + "flos": 23403774372480.0, + "grad_norm": 2.2813781371060458, + "language_loss": 0.78175813, + "learning_rate": 3.130896496250165e-06, + "loss": 0.80405968, + "num_input_tokens_seen": 59041890, + "step": 2740, + "time_per_iteration": 3.176656484603882 + }, + { + "auxiliary_loss_clip": 0.01197647, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.05646992, + "balance_loss_mlp": 1.02079129, + "epoch": 0.3295857632417483, + "flos": 14172470029440.0, + "grad_norm": 1.8876667187012213, + "language_loss": 0.86343688, + "learning_rate": 3.1302539273109693e-06, + "loss": 0.88571036, + "num_input_tokens_seen": 59058715, + "step": 2741, + "time_per_iteration": 3.2213900089263916 + }, + { + "auxiliary_loss_clip": 0.01161216, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.05381393, + "balance_loss_mlp": 1.02447283, + "epoch": 0.32970600613238743, + "flos": 22196708807040.0, + "grad_norm": 3.971633298452612, + "language_loss": 0.8052001, + "learning_rate": 3.1296111869131513e-06, + "loss": 0.82715422, + "num_input_tokens_seen": 59076140, + "step": 2742, + "time_per_iteration": 3.220444917678833 + }, + { + "auxiliary_loss_clip": 0.01194825, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.05545866, + "balance_loss_mlp": 1.02132308, + "epoch": 0.32982624902302654, + "flos": 22053784590720.0, + "grad_norm": 1.849692002799749, + "language_loss": 0.85697186, + "learning_rate": 3.1289682751542153e-06, + "loss": 0.87921989, + "num_input_tokens_seen": 59095700, + "step": 2743, + "time_per_iteration": 2.43133544921875 + }, + { + "auxiliary_loss_clip": 0.01179622, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.0541333, + "balance_loss_mlp": 1.02128005, + "epoch": 0.3299464919136656, + "flos": 18661626967680.0, + "grad_norm": 1.9997369560301048, + "language_loss": 0.71524131, + "learning_rate": 3.1283251921316883e-06, + "loss": 0.73733866, + "num_input_tokens_seen": 59113445, + "step": 2744, + "time_per_iteration": 2.4615464210510254 + }, + { + "auxiliary_loss_clip": 0.01136017, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.05174375, + "balance_loss_mlp": 1.02424121, + "epoch": 0.3300667348043047, + "flos": 13407357404160.0, + "grad_norm": 1.8641877584954194, + "language_loss": 0.80454659, + "learning_rate": 3.1276819379431277e-06, + "loss": 0.82624316, + "num_input_tokens_seen": 59131535, + "step": 2745, + "time_per_iteration": 2.538282632827759 + }, + { + "auxiliary_loss_clip": 0.01178047, + "auxiliary_loss_mlp": 0.00764139, + "balance_loss_clip": 1.05588245, + "balance_loss_mlp": 1.00054383, + "epoch": 0.33018697769494376, + "flos": 15742556398080.0, + "grad_norm": 1.820541862399045, + "language_loss": 0.75198758, + "learning_rate": 3.1270385126861134e-06, + "loss": 0.77140939, + "num_input_tokens_seen": 59149520, + "step": 2746, + "time_per_iteration": 2.4763944149017334 + }, + { + "auxiliary_loss_clip": 0.01198134, + "auxiliary_loss_mlp": 0.01040051, + "balance_loss_clip": 1.05665684, + "balance_loss_mlp": 1.03024626, + "epoch": 0.3303072205855829, + "flos": 18258601392000.0, + "grad_norm": 1.8000886720331322, + "language_loss": 0.81976956, + "learning_rate": 3.1263949164582533e-06, + "loss": 0.8421514, + "num_input_tokens_seen": 59169170, + "step": 2747, + "time_per_iteration": 2.4179556369781494 + }, + { + "auxiliary_loss_clip": 0.0119555, + "auxiliary_loss_mlp": 0.01026834, + "balance_loss_clip": 1.05339181, + "balance_loss_mlp": 1.01797688, + "epoch": 0.330427463476222, + "flos": 17749424148480.0, + "grad_norm": 2.303559699338341, + "language_loss": 0.78236842, + "learning_rate": 3.1257511493571797e-06, + "loss": 0.80459231, + "num_input_tokens_seen": 59187675, + "step": 2748, + "time_per_iteration": 2.4106059074401855 + }, + { + "auxiliary_loss_clip": 0.01153933, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.05153894, + "balance_loss_mlp": 1.02158403, + "epoch": 0.33054770636686104, + "flos": 27162580072320.0, + "grad_norm": 1.6869049760509292, + "language_loss": 0.78404516, + "learning_rate": 3.125107211480552e-06, + "loss": 0.80588913, + "num_input_tokens_seen": 59207610, + "step": 2749, + "time_per_iteration": 2.5942020416259766 + }, + { + "auxiliary_loss_clip": 0.01120788, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.04685295, + "balance_loss_mlp": 1.02657521, + "epoch": 0.33066794925750015, + "flos": 20117193799680.0, + "grad_norm": 1.7659177497402145, + "language_loss": 0.79705429, + "learning_rate": 3.124463102926054e-06, + "loss": 0.8186202, + "num_input_tokens_seen": 59226945, + "step": 2750, + "time_per_iteration": 2.583251953125 + }, + { + "auxiliary_loss_clip": 0.01073866, + "auxiliary_loss_mlp": 0.01005386, + "balance_loss_clip": 1.01833987, + "balance_loss_mlp": 1.00391424, + "epoch": 0.33078819214813926, + "flos": 70642609718400.0, + "grad_norm": 1.0876534535459346, + "language_loss": 0.61599857, + "learning_rate": 3.1238188237913984e-06, + "loss": 0.63679111, + "num_input_tokens_seen": 59291485, + "step": 2751, + "time_per_iteration": 3.1297686100006104 + }, + { + "auxiliary_loss_clip": 0.01203184, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.0604074, + "balance_loss_mlp": 1.02421594, + "epoch": 0.3309084350387783, + "flos": 21141940907520.0, + "grad_norm": 2.068443483577795, + "language_loss": 0.76035947, + "learning_rate": 3.1231743741743202e-06, + "loss": 0.78272754, + "num_input_tokens_seen": 59310990, + "step": 2752, + "time_per_iteration": 2.434490203857422 + }, + { + "auxiliary_loss_clip": 0.01176722, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.05196226, + "balance_loss_mlp": 1.02442336, + "epoch": 0.3310286779294174, + "flos": 14209350318720.0, + "grad_norm": 2.201309852993666, + "language_loss": 0.83492947, + "learning_rate": 3.122529754172582e-06, + "loss": 0.85702872, + "num_input_tokens_seen": 59327875, + "step": 2753, + "time_per_iteration": 2.4350156784057617 + }, + { + "auxiliary_loss_clip": 0.01183822, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.0571171, + "balance_loss_mlp": 1.02617395, + "epoch": 0.33114892082005654, + "flos": 20778130005120.0, + "grad_norm": 2.2289943341632963, + "language_loss": 0.7240442, + "learning_rate": 3.1218849638839736e-06, + "loss": 0.74623764, + "num_input_tokens_seen": 59347135, + "step": 2754, + "time_per_iteration": 2.5005574226379395 + }, + { + "auxiliary_loss_clip": 0.01140478, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.04526937, + "balance_loss_mlp": 1.02378011, + "epoch": 0.3312691637106956, + "flos": 17090750499840.0, + "grad_norm": 1.9909636651248352, + "language_loss": 0.78276849, + "learning_rate": 3.121240003406307e-06, + "loss": 0.80451095, + "num_input_tokens_seen": 59365985, + "step": 2755, + "time_per_iteration": 2.5181753635406494 + }, + { + "auxiliary_loss_clip": 0.01157636, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.05443132, + "balance_loss_mlp": 1.02201378, + "epoch": 0.3313894066013347, + "flos": 29456230008960.0, + "grad_norm": 2.3311156101318455, + "language_loss": 0.72529775, + "learning_rate": 3.120594872837425e-06, + "loss": 0.74719167, + "num_input_tokens_seen": 59384655, + "step": 2756, + "time_per_iteration": 2.60080623626709 + }, + { + "auxiliary_loss_clip": 0.0107489, + "auxiliary_loss_mlp": 0.00754626, + "balance_loss_clip": 1.01484716, + "balance_loss_mlp": 1.00035918, + "epoch": 0.3315096494919738, + "flos": 61419242280960.0, + "grad_norm": 0.8274160298475397, + "language_loss": 0.62366676, + "learning_rate": 3.1199495722751906e-06, + "loss": 0.64196193, + "num_input_tokens_seen": 59444185, + "step": 2757, + "time_per_iteration": 3.0746567249298096 + }, + { + "auxiliary_loss_clip": 0.01139573, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.04863906, + "balance_loss_mlp": 1.02612519, + "epoch": 0.33162989238261287, + "flos": 21653057485440.0, + "grad_norm": 2.5878155170851973, + "language_loss": 0.83746129, + "learning_rate": 3.1193041018174972e-06, + "loss": 0.85921115, + "num_input_tokens_seen": 59464900, + "step": 2758, + "time_per_iteration": 2.5973706245422363 + }, + { + "auxiliary_loss_clip": 0.01187239, + "auxiliary_loss_mlp": 0.01025052, + "balance_loss_clip": 1.05716693, + "balance_loss_mlp": 1.01559317, + "epoch": 0.331750135273252, + "flos": 22674787850880.0, + "grad_norm": 2.0806539944430127, + "language_loss": 0.94700098, + "learning_rate": 3.118658461562261e-06, + "loss": 0.96912384, + "num_input_tokens_seen": 59481000, + "step": 2759, + "time_per_iteration": 2.4847652912139893 + }, + { + "auxiliary_loss_clip": 0.01169892, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.0566169, + "balance_loss_mlp": 1.02211511, + "epoch": 0.33187037816389103, + "flos": 22746896403840.0, + "grad_norm": 1.6285847912223168, + "language_loss": 0.84643918, + "learning_rate": 3.118012651607426e-06, + "loss": 0.86846107, + "num_input_tokens_seen": 59502605, + "step": 2760, + "time_per_iteration": 2.526370048522949 + }, + { + "auxiliary_loss_clip": 0.01198454, + "auxiliary_loss_mlp": 0.01039406, + "balance_loss_clip": 1.05795038, + "balance_loss_mlp": 1.02912462, + "epoch": 0.33199062105453014, + "flos": 19203769918080.0, + "grad_norm": 1.9876924705126986, + "language_loss": 0.83491468, + "learning_rate": 3.1173666720509603e-06, + "loss": 0.85729337, + "num_input_tokens_seen": 59519540, + "step": 2761, + "time_per_iteration": 2.422773599624634 + }, + { + "auxiliary_loss_clip": 0.01172078, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.05278134, + "balance_loss_mlp": 1.02368319, + "epoch": 0.33211086394516925, + "flos": 31577006764800.0, + "grad_norm": 1.6952519731642108, + "language_loss": 0.68371475, + "learning_rate": 3.116720522990859e-06, + "loss": 0.70576346, + "num_input_tokens_seen": 59540415, + "step": 2762, + "time_per_iteration": 2.575589179992676 + }, + { + "auxiliary_loss_clip": 0.01126358, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.05060244, + "balance_loss_mlp": 1.02572727, + "epoch": 0.3322311068358083, + "flos": 17932496791680.0, + "grad_norm": 2.050752629920095, + "language_loss": 0.62380207, + "learning_rate": 3.116074204525142e-06, + "loss": 0.64541489, + "num_input_tokens_seen": 59558590, + "step": 2763, + "time_per_iteration": 2.614272356033325 + }, + { + "auxiliary_loss_clip": 0.01173944, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.05423427, + "balance_loss_mlp": 1.0209403, + "epoch": 0.3323513497264474, + "flos": 32269831269120.0, + "grad_norm": 1.6859074107508745, + "language_loss": 0.83667606, + "learning_rate": 3.1154277167518553e-06, + "loss": 0.85871279, + "num_input_tokens_seen": 59580205, + "step": 2764, + "time_per_iteration": 3.7032220363616943 + }, + { + "auxiliary_loss_clip": 0.01059655, + "auxiliary_loss_mlp": 0.01001942, + "balance_loss_clip": 1.01379657, + "balance_loss_mlp": 1.00043356, + "epoch": 0.33247159261708653, + "flos": 52668674588160.0, + "grad_norm": 0.776819517404951, + "language_loss": 0.59479421, + "learning_rate": 3.114781059769072e-06, + "loss": 0.61541021, + "num_input_tokens_seen": 59631530, + "step": 2765, + "time_per_iteration": 2.938133478164673 + }, + { + "auxiliary_loss_clip": 0.01166919, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.0539465, + "balance_loss_mlp": 1.02089322, + "epoch": 0.3325918355077256, + "flos": 27125232906240.0, + "grad_norm": 2.839466661643603, + "language_loss": 0.6695714, + "learning_rate": 3.1141342336748874e-06, + "loss": 0.69154334, + "num_input_tokens_seen": 59651090, + "step": 2766, + "time_per_iteration": 2.539414167404175 + }, + { + "auxiliary_loss_clip": 0.01180182, + "auxiliary_loss_mlp": 0.01031354, + "balance_loss_clip": 1.05643678, + "balance_loss_mlp": 1.02242541, + "epoch": 0.3327120783983647, + "flos": 23664414435840.0, + "grad_norm": 1.4358032431631715, + "language_loss": 0.82020867, + "learning_rate": 3.1134872385674253e-06, + "loss": 0.84232402, + "num_input_tokens_seen": 59675245, + "step": 2767, + "time_per_iteration": 3.3064076900482178 + }, + { + "auxiliary_loss_clip": 0.01169402, + "auxiliary_loss_mlp": 0.01031205, + "balance_loss_clip": 1.05055547, + "balance_loss_mlp": 1.02179384, + "epoch": 0.3328323212890038, + "flos": 19171378828800.0, + "grad_norm": 2.006591968840961, + "language_loss": 0.85361969, + "learning_rate": 3.1128400745448353e-06, + "loss": 0.87562585, + "num_input_tokens_seen": 59694625, + "step": 2768, + "time_per_iteration": 3.257232904434204 + }, + { + "auxiliary_loss_clip": 0.01183108, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.05654144, + "balance_loss_mlp": 1.02412951, + "epoch": 0.33295256417964286, + "flos": 37706347463040.0, + "grad_norm": 2.2084032097622273, + "language_loss": 0.62541157, + "learning_rate": 3.11219274170529e-06, + "loss": 0.6475721, + "num_input_tokens_seen": 59716435, + "step": 2769, + "time_per_iteration": 3.319678544998169 + }, + { + "auxiliary_loss_clip": 0.01158605, + "auxiliary_loss_mlp": 0.01035464, + "balance_loss_clip": 1.0504148, + "balance_loss_mlp": 1.02715504, + "epoch": 0.333072807070282, + "flos": 26505989412480.0, + "grad_norm": 1.59051022200319, + "language_loss": 0.81172323, + "learning_rate": 3.1115452401469903e-06, + "loss": 0.83366394, + "num_input_tokens_seen": 59736835, + "step": 2770, + "time_per_iteration": 2.5382778644561768 + }, + { + "auxiliary_loss_clip": 0.01127599, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.04588032, + "balance_loss_mlp": 1.022264, + "epoch": 0.3331930499609211, + "flos": 21430913823360.0, + "grad_norm": 2.3641673178546108, + "language_loss": 0.86370301, + "learning_rate": 3.1108975699681613e-06, + "loss": 0.88528925, + "num_input_tokens_seen": 59754230, + "step": 2771, + "time_per_iteration": 2.568330764770508 + }, + { + "auxiliary_loss_clip": 0.01150873, + "auxiliary_loss_mlp": 0.0103277, + "balance_loss_clip": 1.05190873, + "balance_loss_mlp": 1.02394879, + "epoch": 0.33331329285156014, + "flos": 20659947281280.0, + "grad_norm": 6.594183178884757, + "language_loss": 0.71611154, + "learning_rate": 3.1102497312670542e-06, + "loss": 0.73794794, + "num_input_tokens_seen": 59772235, + "step": 2772, + "time_per_iteration": 2.5261595249176025 + }, + { + "auxiliary_loss_clip": 0.01156358, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.05158055, + "balance_loss_mlp": 1.02820158, + "epoch": 0.33343353574219925, + "flos": 28001596930560.0, + "grad_norm": 1.975653713722697, + "language_loss": 0.80851769, + "learning_rate": 3.109601724141946e-06, + "loss": 0.83045399, + "num_input_tokens_seen": 59791230, + "step": 2773, + "time_per_iteration": 2.5471620559692383 + }, + { + "auxiliary_loss_clip": 0.01163587, + "auxiliary_loss_mlp": 0.01027218, + "balance_loss_clip": 1.05329001, + "balance_loss_mlp": 1.01871204, + "epoch": 0.33355377863283836, + "flos": 23764963582080.0, + "grad_norm": 1.658377033556966, + "language_loss": 0.68058264, + "learning_rate": 3.108953548691138e-06, + "loss": 0.70249063, + "num_input_tokens_seen": 59811315, + "step": 2774, + "time_per_iteration": 2.5239810943603516 + }, + { + "auxiliary_loss_clip": 0.01197075, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.05595493, + "balance_loss_mlp": 1.0226754, + "epoch": 0.3336740215234774, + "flos": 37779677078400.0, + "grad_norm": 3.232646274383352, + "language_loss": 0.73092693, + "learning_rate": 3.108305205012959e-06, + "loss": 0.75321651, + "num_input_tokens_seen": 59832010, + "step": 2775, + "time_per_iteration": 2.566007137298584 + }, + { + "auxiliary_loss_clip": 0.01163709, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.05256629, + "balance_loss_mlp": 1.01951408, + "epoch": 0.3337942644141165, + "flos": 25519056347520.0, + "grad_norm": 2.2503175041184544, + "language_loss": 0.87371236, + "learning_rate": 3.107656693205761e-06, + "loss": 0.89563531, + "num_input_tokens_seen": 59851450, + "step": 2776, + "time_per_iteration": 2.532508134841919 + }, + { + "auxiliary_loss_clip": 0.01199765, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.05680847, + "balance_loss_mlp": 1.02382135, + "epoch": 0.3339145073047556, + "flos": 25989844930560.0, + "grad_norm": 3.0228131326438232, + "language_loss": 0.70522356, + "learning_rate": 3.107008013367924e-06, + "loss": 0.72755522, + "num_input_tokens_seen": 59870245, + "step": 2777, + "time_per_iteration": 2.459268808364868 + }, + { + "auxiliary_loss_clip": 0.01151072, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.05158031, + "balance_loss_mlp": 1.02143979, + "epoch": 0.3340347501953947, + "flos": 19062569554560.0, + "grad_norm": 2.0281695013834304, + "language_loss": 0.86833531, + "learning_rate": 3.1063591655978507e-06, + "loss": 0.89015532, + "num_input_tokens_seen": 59886195, + "step": 2778, + "time_per_iteration": 2.5294137001037598 + }, + { + "auxiliary_loss_clip": 0.01124292, + "auxiliary_loss_mlp": 0.01028144, + "balance_loss_clip": 1.04460466, + "balance_loss_mlp": 1.01936436, + "epoch": 0.3341549930860338, + "flos": 18109715518080.0, + "grad_norm": 1.7695263572444149, + "language_loss": 0.79744351, + "learning_rate": 3.105710149993972e-06, + "loss": 0.81896794, + "num_input_tokens_seen": 59905525, + "step": 2779, + "time_per_iteration": 2.634417772293091 + }, + { + "auxiliary_loss_clip": 0.01196065, + "auxiliary_loss_mlp": 0.01023772, + "balance_loss_clip": 1.05518579, + "balance_loss_mlp": 1.01499832, + "epoch": 0.33427523597667286, + "flos": 22674967418880.0, + "grad_norm": 1.8780994154841402, + "language_loss": 0.85269618, + "learning_rate": 3.1050609666547427e-06, + "loss": 0.8748945, + "num_input_tokens_seen": 59925085, + "step": 2780, + "time_per_iteration": 2.50347900390625 + }, + { + "auxiliary_loss_clip": 0.01160638, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.05348706, + "balance_loss_mlp": 1.03090179, + "epoch": 0.33439547886731197, + "flos": 22638338524800.0, + "grad_norm": 2.0566901031763667, + "language_loss": 0.77478468, + "learning_rate": 3.104411615678644e-06, + "loss": 0.7967881, + "num_input_tokens_seen": 59943935, + "step": 2781, + "time_per_iteration": 2.617816686630249 + }, + { + "auxiliary_loss_clip": 0.01160876, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.05148852, + "balance_loss_mlp": 1.02222419, + "epoch": 0.3345157217579511, + "flos": 24096383395200.0, + "grad_norm": 2.5609143214363743, + "language_loss": 0.73944271, + "learning_rate": 3.1037620971641803e-06, + "loss": 0.76136965, + "num_input_tokens_seen": 59963725, + "step": 2782, + "time_per_iteration": 2.564547538757324 + }, + { + "auxiliary_loss_clip": 0.01197304, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.05652332, + "balance_loss_mlp": 1.02120829, + "epoch": 0.33463596464859013, + "flos": 18989491334400.0, + "grad_norm": 2.4751779839160224, + "language_loss": 0.64949286, + "learning_rate": 3.1031124112098844e-06, + "loss": 0.67177165, + "num_input_tokens_seen": 59981935, + "step": 2783, + "time_per_iteration": 2.500908136367798 + }, + { + "auxiliary_loss_clip": 0.01168858, + "auxiliary_loss_mlp": 0.01028811, + "balance_loss_clip": 1.0537982, + "balance_loss_mlp": 1.01990604, + "epoch": 0.33475620753922924, + "flos": 20375607219840.0, + "grad_norm": 2.7291093703767353, + "language_loss": 0.72199702, + "learning_rate": 3.1024625579143127e-06, + "loss": 0.74397367, + "num_input_tokens_seen": 59999455, + "step": 2784, + "time_per_iteration": 2.528395891189575 + }, + { + "auxiliary_loss_clip": 0.01192551, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.05430436, + "balance_loss_mlp": 1.02252817, + "epoch": 0.33487645042986836, + "flos": 18182578256640.0, + "grad_norm": 1.860290443162847, + "language_loss": 0.7289046, + "learning_rate": 3.101812537376048e-06, + "loss": 0.75114202, + "num_input_tokens_seen": 60018475, + "step": 2785, + "time_per_iteration": 2.4462618827819824 + }, + { + "auxiliary_loss_clip": 0.01159367, + "auxiliary_loss_mlp": 0.00763256, + "balance_loss_clip": 1.05157399, + "balance_loss_mlp": 1.00045025, + "epoch": 0.3349966933205074, + "flos": 25848824135040.0, + "grad_norm": 2.2245869185266516, + "language_loss": 0.84502202, + "learning_rate": 3.1011623496936973e-06, + "loss": 0.86424822, + "num_input_tokens_seen": 60036770, + "step": 2786, + "time_per_iteration": 2.5411758422851562 + }, + { + "auxiliary_loss_clip": 0.01193342, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.05574608, + "balance_loss_mlp": 1.02038312, + "epoch": 0.3351169362111465, + "flos": 28111447699200.0, + "grad_norm": 1.8198320627287339, + "language_loss": 0.70103872, + "learning_rate": 3.100511994965893e-06, + "loss": 0.72326154, + "num_input_tokens_seen": 60056725, + "step": 2787, + "time_per_iteration": 2.4958789348602295 + }, + { + "auxiliary_loss_clip": 0.01174671, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.05419672, + "balance_loss_mlp": 1.02276731, + "epoch": 0.33523717910178563, + "flos": 22673315393280.0, + "grad_norm": 1.6698142601911563, + "language_loss": 0.84157336, + "learning_rate": 3.0998614732912947e-06, + "loss": 0.86363244, + "num_input_tokens_seen": 60076100, + "step": 2788, + "time_per_iteration": 2.4920570850372314 + }, + { + "auxiliary_loss_clip": 0.0118115, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.05667675, + "balance_loss_mlp": 1.0253129, + "epoch": 0.3353574219924247, + "flos": 15669801400320.0, + "grad_norm": 1.8609673665458077, + "language_loss": 0.68414795, + "learning_rate": 3.0992107847685855e-06, + "loss": 0.70630205, + "num_input_tokens_seen": 60093815, + "step": 2789, + "time_per_iteration": 2.5370123386383057 + }, + { + "auxiliary_loss_clip": 0.01169417, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.0561955, + "balance_loss_mlp": 1.02885056, + "epoch": 0.3354776648830638, + "flos": 24790644443520.0, + "grad_norm": 1.7153876994758497, + "language_loss": 0.79374933, + "learning_rate": 3.0985599294964736e-06, + "loss": 0.8158257, + "num_input_tokens_seen": 60113370, + "step": 2790, + "time_per_iteration": 3.302722930908203 + }, + { + "auxiliary_loss_clip": 0.01161426, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.0519104, + "balance_loss_mlp": 1.02881122, + "epoch": 0.33559790777370285, + "flos": 28694852398080.0, + "grad_norm": 1.80723828922953, + "language_loss": 0.69966519, + "learning_rate": 3.097908907573695e-06, + "loss": 0.72166771, + "num_input_tokens_seen": 60131350, + "step": 2791, + "time_per_iteration": 2.5775246620178223 + }, + { + "auxiliary_loss_clip": 0.0112229, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.0505439, + "balance_loss_mlp": 1.02481878, + "epoch": 0.33571815066434196, + "flos": 22235779825920.0, + "grad_norm": 1.9227386168855174, + "language_loss": 0.89309853, + "learning_rate": 3.0972577190990067e-06, + "loss": 0.91465569, + "num_input_tokens_seen": 60149830, + "step": 2792, + "time_per_iteration": 2.5962321758270264 + }, + { + "auxiliary_loss_clip": 0.01157604, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.05242658, + "balance_loss_mlp": 1.02325869, + "epoch": 0.3358383935549811, + "flos": 23842279607040.0, + "grad_norm": 1.8388312756443639, + "language_loss": 0.79677629, + "learning_rate": 3.096606364171196e-06, + "loss": 0.81867164, + "num_input_tokens_seen": 60169620, + "step": 2793, + "time_per_iteration": 3.3267576694488525 + }, + { + "auxiliary_loss_clip": 0.01136688, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.04789472, + "balance_loss_mlp": 1.01993072, + "epoch": 0.33595863644562013, + "flos": 22267308988800.0, + "grad_norm": 1.8504680237729352, + "language_loss": 0.85316384, + "learning_rate": 3.0959548428890703e-06, + "loss": 0.87481916, + "num_input_tokens_seen": 60188490, + "step": 2794, + "time_per_iteration": 3.2673916816711426 + }, + { + "auxiliary_loss_clip": 0.01178654, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.05613935, + "balance_loss_mlp": 1.02798307, + "epoch": 0.33607887933625924, + "flos": 20119779578880.0, + "grad_norm": 1.7495317411453148, + "language_loss": 0.83980733, + "learning_rate": 3.095303155351468e-06, + "loss": 0.8619644, + "num_input_tokens_seen": 60208695, + "step": 2795, + "time_per_iteration": 3.2482402324676514 + }, + { + "auxiliary_loss_clip": 0.01128935, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.04865205, + "balance_loss_mlp": 1.0201385, + "epoch": 0.33619912222689835, + "flos": 19318109886720.0, + "grad_norm": 2.1017120701939813, + "language_loss": 0.7880587, + "learning_rate": 3.0946513016572464e-06, + "loss": 0.80963528, + "num_input_tokens_seen": 60227600, + "step": 2796, + "time_per_iteration": 2.552891969680786 + }, + { + "auxiliary_loss_clip": 0.01183804, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.05454445, + "balance_loss_mlp": 1.02365124, + "epoch": 0.3363193651175374, + "flos": 16800664262400.0, + "grad_norm": 1.9815685001560148, + "language_loss": 0.77233356, + "learning_rate": 3.0939992819052938e-06, + "loss": 0.79450029, + "num_input_tokens_seen": 60245110, + "step": 2797, + "time_per_iteration": 2.4783852100372314 + }, + { + "auxiliary_loss_clip": 0.01168759, + "auxiliary_loss_mlp": 0.01028799, + "balance_loss_clip": 1.05570984, + "balance_loss_mlp": 1.01944113, + "epoch": 0.3364396080081765, + "flos": 23550289948800.0, + "grad_norm": 1.9435047025462893, + "language_loss": 0.81554043, + "learning_rate": 3.0933470961945193e-06, + "loss": 0.83751595, + "num_input_tokens_seen": 60263405, + "step": 2798, + "time_per_iteration": 2.5222079753875732 + }, + { + "auxiliary_loss_clip": 0.01162975, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.0547955, + "balance_loss_mlp": 1.02490497, + "epoch": 0.3365598508988156, + "flos": 28037902602240.0, + "grad_norm": 1.701911349593511, + "language_loss": 0.68186045, + "learning_rate": 3.0926947446238597e-06, + "loss": 0.70382416, + "num_input_tokens_seen": 60282975, + "step": 2799, + "time_per_iteration": 2.5563158988952637 + }, + { + "auxiliary_loss_clip": 0.01182497, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.05070531, + "balance_loss_mlp": 1.01815367, + "epoch": 0.3366800937894547, + "flos": 16982767238400.0, + "grad_norm": 2.1835892486521917, + "language_loss": 0.82299548, + "learning_rate": 3.092042227292276e-06, + "loss": 0.84509528, + "num_input_tokens_seen": 60299810, + "step": 2800, + "time_per_iteration": 2.4463043212890625 + }, + { + "auxiliary_loss_clip": 0.01192033, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.05623484, + "balance_loss_mlp": 1.0186007, + "epoch": 0.3368003366800938, + "flos": 23915321913600.0, + "grad_norm": 1.6332855588479496, + "language_loss": 0.88064623, + "learning_rate": 3.0913895442987557e-06, + "loss": 0.90283489, + "num_input_tokens_seen": 60320775, + "step": 2801, + "time_per_iteration": 2.4716455936431885 + }, + { + "auxiliary_loss_clip": 0.01151576, + "auxiliary_loss_mlp": 0.00763314, + "balance_loss_clip": 1.05189085, + "balance_loss_mlp": 1.00048876, + "epoch": 0.3369205795707329, + "flos": 24791219061120.0, + "grad_norm": 1.620982573046577, + "language_loss": 0.85746223, + "learning_rate": 3.090736695742308e-06, + "loss": 0.87661111, + "num_input_tokens_seen": 60341905, + "step": 2802, + "time_per_iteration": 2.5614137649536133 + }, + { + "auxiliary_loss_clip": 0.01129967, + "auxiliary_loss_mlp": 0.01026857, + "balance_loss_clip": 1.04808879, + "balance_loss_mlp": 1.01842928, + "epoch": 0.33704082246137196, + "flos": 17931096161280.0, + "grad_norm": 2.921528959595355, + "language_loss": 0.5193584, + "learning_rate": 3.0900836817219713e-06, + "loss": 0.54092664, + "num_input_tokens_seen": 60358335, + "step": 2803, + "time_per_iteration": 2.542318344116211 + }, + { + "auxiliary_loss_clip": 0.01192377, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.0539875, + "balance_loss_mlp": 1.02061379, + "epoch": 0.33716106535201107, + "flos": 21286517149440.0, + "grad_norm": 1.5796391585000833, + "language_loss": 0.83512259, + "learning_rate": 3.089430502336807e-06, + "loss": 0.85733497, + "num_input_tokens_seen": 60378305, + "step": 2804, + "time_per_iteration": 2.451876640319824 + }, + { + "auxiliary_loss_clip": 0.01182694, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.05368209, + "balance_loss_mlp": 1.01945233, + "epoch": 0.3372813082426502, + "flos": 18402962152320.0, + "grad_norm": 2.980042675342192, + "language_loss": 0.90636277, + "learning_rate": 3.088777157685902e-06, + "loss": 0.92847598, + "num_input_tokens_seen": 60393895, + "step": 2805, + "time_per_iteration": 2.435588836669922 + }, + { + "auxiliary_loss_clip": 0.01160035, + "auxiliary_loss_mlp": 0.01025276, + "balance_loss_clip": 1.05210519, + "balance_loss_mlp": 1.01697326, + "epoch": 0.33740155113328923, + "flos": 17201391367680.0, + "grad_norm": 1.8180086365341472, + "language_loss": 0.85605407, + "learning_rate": 3.088123647868367e-06, + "loss": 0.87790728, + "num_input_tokens_seen": 60410445, + "step": 2806, + "time_per_iteration": 2.4814612865448 + }, + { + "auxiliary_loss_clip": 0.01183357, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.0536828, + "balance_loss_mlp": 1.01935911, + "epoch": 0.33752179402392835, + "flos": 29058950609280.0, + "grad_norm": 1.9432047891494473, + "language_loss": 0.81264329, + "learning_rate": 3.0874699729833405e-06, + "loss": 0.83475304, + "num_input_tokens_seen": 60431815, + "step": 2807, + "time_per_iteration": 2.5346319675445557 + }, + { + "auxiliary_loss_clip": 0.01159519, + "auxiliary_loss_mlp": 0.01026133, + "balance_loss_clip": 1.05103445, + "balance_loss_mlp": 1.0173353, + "epoch": 0.3376420369145674, + "flos": 25080730680960.0, + "grad_norm": 1.685472865214439, + "language_loss": 0.79853666, + "learning_rate": 3.086816133129983e-06, + "loss": 0.8203932, + "num_input_tokens_seen": 60452075, + "step": 2808, + "time_per_iteration": 2.5172410011291504 + }, + { + "auxiliary_loss_clip": 0.0119517, + "auxiliary_loss_mlp": 0.01027307, + "balance_loss_clip": 1.05738378, + "balance_loss_mlp": 1.01868832, + "epoch": 0.3377622798052065, + "flos": 27490624007040.0, + "grad_norm": 1.763657921669673, + "language_loss": 0.76424825, + "learning_rate": 3.0861621284074826e-06, + "loss": 0.78647304, + "num_input_tokens_seen": 60472600, + "step": 2809, + "time_per_iteration": 2.481370210647583 + }, + { + "auxiliary_loss_clip": 0.01173998, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.05541325, + "balance_loss_mlp": 1.02161241, + "epoch": 0.3378825226958456, + "flos": 21975211589760.0, + "grad_norm": 1.554605895077336, + "language_loss": 0.72844672, + "learning_rate": 3.085507958915051e-06, + "loss": 0.75048494, + "num_input_tokens_seen": 60491030, + "step": 2810, + "time_per_iteration": 2.507603168487549 + }, + { + "auxiliary_loss_clip": 0.01163574, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.05453765, + "balance_loss_mlp": 1.02282488, + "epoch": 0.3380027655864847, + "flos": 42523189200000.0, + "grad_norm": 2.0081554362838783, + "language_loss": 0.71657622, + "learning_rate": 3.084853624751925e-06, + "loss": 0.73853838, + "num_input_tokens_seen": 60512615, + "step": 2811, + "time_per_iteration": 2.6775050163269043 + }, + { + "auxiliary_loss_clip": 0.01154413, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.05399144, + "balance_loss_mlp": 1.02170348, + "epoch": 0.3381230084771238, + "flos": 26725080418560.0, + "grad_norm": 1.7209325901806252, + "language_loss": 0.85711229, + "learning_rate": 3.0841991260173668e-06, + "loss": 0.87896127, + "num_input_tokens_seen": 60532520, + "step": 2812, + "time_per_iteration": 2.600926399230957 + }, + { + "auxiliary_loss_clip": 0.01196585, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.05614567, + "balance_loss_mlp": 1.01963902, + "epoch": 0.3382432513677629, + "flos": 22710375250560.0, + "grad_norm": 2.0480715071425446, + "language_loss": 0.80364215, + "learning_rate": 3.0835444628106634e-06, + "loss": 0.8258971, + "num_input_tokens_seen": 60551500, + "step": 2813, + "time_per_iteration": 2.447578191757202 + }, + { + "auxiliary_loss_clip": 0.01193361, + "auxiliary_loss_mlp": 0.00763819, + "balance_loss_clip": 1.05497169, + "balance_loss_mlp": 1.00059795, + "epoch": 0.33836349425840195, + "flos": 22122409524480.0, + "grad_norm": 1.7782112627386923, + "language_loss": 0.82858235, + "learning_rate": 3.082889635231126e-06, + "loss": 0.84815419, + "num_input_tokens_seen": 60570160, + "step": 2814, + "time_per_iteration": 2.460442066192627 + }, + { + "auxiliary_loss_clip": 0.01167816, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.05154061, + "balance_loss_mlp": 1.01831794, + "epoch": 0.33848373714904106, + "flos": 27308090067840.0, + "grad_norm": 2.2182946634887895, + "language_loss": 0.76632744, + "learning_rate": 3.0822346433780925e-06, + "loss": 0.78828609, + "num_input_tokens_seen": 60590885, + "step": 2815, + "time_per_iteration": 2.539652109146118 + }, + { + "auxiliary_loss_clip": 0.01179624, + "auxiliary_loss_mlp": 0.01028512, + "balance_loss_clip": 1.05145371, + "balance_loss_mlp": 1.01926756, + "epoch": 0.3386039800396802, + "flos": 25848716394240.0, + "grad_norm": 2.0880702580194543, + "language_loss": 0.87021315, + "learning_rate": 3.0815794873509237e-06, + "loss": 0.89229453, + "num_input_tokens_seen": 60609170, + "step": 2816, + "time_per_iteration": 2.4881913661956787 + }, + { + "auxiliary_loss_clip": 0.0119547, + "auxiliary_loss_mlp": 0.01026632, + "balance_loss_clip": 1.05662513, + "balance_loss_mlp": 1.0177505, + "epoch": 0.33872422293031923, + "flos": 18880646146560.0, + "grad_norm": 1.7243083386827662, + "language_loss": 0.72432935, + "learning_rate": 3.0809241672490066e-06, + "loss": 0.74655044, + "num_input_tokens_seen": 60627340, + "step": 2817, + "time_per_iteration": 3.2079405784606934 + }, + { + "auxiliary_loss_clip": 0.01169284, + "auxiliary_loss_mlp": 0.01024636, + "balance_loss_clip": 1.05437398, + "balance_loss_mlp": 1.01606441, + "epoch": 0.33884446582095834, + "flos": 23146977064320.0, + "grad_norm": 1.7762620534327487, + "language_loss": 0.84928894, + "learning_rate": 3.080268683171753e-06, + "loss": 0.8712281, + "num_input_tokens_seen": 60647630, + "step": 2818, + "time_per_iteration": 2.5100321769714355 + }, + { + "auxiliary_loss_clip": 0.01179764, + "auxiliary_loss_mlp": 0.0102559, + "balance_loss_clip": 1.05281353, + "balance_loss_mlp": 1.01679254, + "epoch": 0.33896470871159745, + "flos": 15997342544640.0, + "grad_norm": 2.2119361191134153, + "language_loss": 0.89167893, + "learning_rate": 3.0796130352185985e-06, + "loss": 0.91373253, + "num_input_tokens_seen": 60664485, + "step": 2819, + "time_per_iteration": 2.455230236053467 + }, + { + "auxiliary_loss_clip": 0.0115052, + "auxiliary_loss_mlp": 0.00764166, + "balance_loss_clip": 1.04622388, + "balance_loss_mlp": 1.00067246, + "epoch": 0.3390849516022365, + "flos": 34495754112000.0, + "grad_norm": 1.8828522302480248, + "language_loss": 0.66561961, + "learning_rate": 3.0789572234890057e-06, + "loss": 0.68476653, + "num_input_tokens_seen": 60686125, + "step": 2820, + "time_per_iteration": 3.369786500930786 + }, + { + "auxiliary_loss_clip": 0.01166101, + "auxiliary_loss_mlp": 0.01030655, + "balance_loss_clip": 1.05550504, + "balance_loss_mlp": 1.02115428, + "epoch": 0.3392051944928756, + "flos": 16180307447040.0, + "grad_norm": 1.5994058529433655, + "language_loss": 0.77301955, + "learning_rate": 3.0783012480824596e-06, + "loss": 0.79498714, + "num_input_tokens_seen": 60705270, + "step": 2821, + "time_per_iteration": 3.2776005268096924 + }, + { + "auxiliary_loss_clip": 0.01194351, + "auxiliary_loss_mlp": 0.0103527, + "balance_loss_clip": 1.05468702, + "balance_loss_mlp": 1.02591777, + "epoch": 0.33932543738351467, + "flos": 17086656349440.0, + "grad_norm": 2.15283091698306, + "language_loss": 0.7447924, + "learning_rate": 3.077645109098471e-06, + "loss": 0.76708865, + "num_input_tokens_seen": 60721540, + "step": 2822, + "time_per_iteration": 3.1694374084472656 + }, + { + "auxiliary_loss_clip": 0.01137958, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.04967141, + "balance_loss_mlp": 1.02120936, + "epoch": 0.3394456802741538, + "flos": 22126970551680.0, + "grad_norm": 1.7508029972104264, + "language_loss": 0.72149694, + "learning_rate": 3.076988806636577e-06, + "loss": 0.74317622, + "num_input_tokens_seen": 60739300, + "step": 2823, + "time_per_iteration": 2.5394911766052246 + }, + { + "auxiliary_loss_clip": 0.01172915, + "auxiliary_loss_mlp": 0.00763616, + "balance_loss_clip": 1.05687618, + "balance_loss_mlp": 1.00054646, + "epoch": 0.3395659231647929, + "flos": 25226887121280.0, + "grad_norm": 1.898647894009044, + "language_loss": 0.88642913, + "learning_rate": 3.0763323407963377e-06, + "loss": 0.90579444, + "num_input_tokens_seen": 60758910, + "step": 2824, + "time_per_iteration": 2.529021739959717 + }, + { + "auxiliary_loss_clip": 0.01179606, + "auxiliary_loss_mlp": 0.0103084, + "balance_loss_clip": 1.05196631, + "balance_loss_mlp": 1.02267385, + "epoch": 0.33968616605543195, + "flos": 29096477343360.0, + "grad_norm": 1.7647684155073269, + "language_loss": 0.80052042, + "learning_rate": 3.075675711677337e-06, + "loss": 0.82262486, + "num_input_tokens_seen": 60779005, + "step": 2825, + "time_per_iteration": 2.5303430557250977 + }, + { + "auxiliary_loss_clip": 0.01161367, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.05430329, + "balance_loss_mlp": 1.02427578, + "epoch": 0.33980640894607106, + "flos": 21433966479360.0, + "grad_norm": 2.088752964946192, + "language_loss": 0.78208482, + "learning_rate": 3.0750189193791865e-06, + "loss": 0.80403674, + "num_input_tokens_seen": 60798590, + "step": 2826, + "time_per_iteration": 2.487916946411133 + }, + { + "auxiliary_loss_clip": 0.01176894, + "auxiliary_loss_mlp": 0.01027327, + "balance_loss_clip": 1.05180717, + "balance_loss_mlp": 1.01827276, + "epoch": 0.33992665183671017, + "flos": 32490035596800.0, + "grad_norm": 1.9996859240619962, + "language_loss": 0.70187426, + "learning_rate": 3.0743619640015203e-06, + "loss": 0.72391653, + "num_input_tokens_seen": 60818840, + "step": 2827, + "time_per_iteration": 2.54532527923584 + }, + { + "auxiliary_loss_clip": 0.01169218, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.0498457, + "balance_loss_mlp": 1.02131367, + "epoch": 0.3400468947273492, + "flos": 17055414495360.0, + "grad_norm": 1.8046604752336783, + "language_loss": 0.92365134, + "learning_rate": 3.073704845643999e-06, + "loss": 0.94565111, + "num_input_tokens_seen": 60835965, + "step": 2828, + "time_per_iteration": 2.490739107131958 + }, + { + "auxiliary_loss_clip": 0.01181157, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.0513711, + "balance_loss_mlp": 1.02647281, + "epoch": 0.34016713761798834, + "flos": 16872988296960.0, + "grad_norm": 2.5683273839063037, + "language_loss": 0.77425086, + "learning_rate": 3.0730475644063063e-06, + "loss": 0.79641986, + "num_input_tokens_seen": 60851065, + "step": 2829, + "time_per_iteration": 2.4292831420898438 + }, + { + "auxiliary_loss_clip": 0.01156507, + "auxiliary_loss_mlp": 0.00762665, + "balance_loss_clip": 1.04791045, + "balance_loss_mlp": 1.00064147, + "epoch": 0.34028738050862745, + "flos": 21907161273600.0, + "grad_norm": 1.7953608356090969, + "language_loss": 0.64969099, + "learning_rate": 3.072390120388151e-06, + "loss": 0.66888267, + "num_input_tokens_seen": 60869390, + "step": 2830, + "time_per_iteration": 2.4928319454193115 + }, + { + "auxiliary_loss_clip": 0.01181998, + "auxiliary_loss_mlp": 0.01027225, + "balance_loss_clip": 1.05504847, + "balance_loss_mlp": 1.01798058, + "epoch": 0.3404076233992665, + "flos": 22746034477440.0, + "grad_norm": 4.37954189337278, + "language_loss": 0.71196973, + "learning_rate": 3.071732513689267e-06, + "loss": 0.73406196, + "num_input_tokens_seen": 60887925, + "step": 2831, + "time_per_iteration": 2.473571300506592 + }, + { + "auxiliary_loss_clip": 0.01184416, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.05841494, + "balance_loss_mlp": 1.02143741, + "epoch": 0.3405278662899056, + "flos": 17052361839360.0, + "grad_norm": 2.4326893930126254, + "language_loss": 0.66981602, + "learning_rate": 3.0710747444094134e-06, + "loss": 0.6919632, + "num_input_tokens_seen": 60905955, + "step": 2832, + "time_per_iteration": 2.5303688049316406 + }, + { + "auxiliary_loss_clip": 0.01169734, + "auxiliary_loss_mlp": 0.01032364, + "balance_loss_clip": 1.0544908, + "balance_loss_mlp": 1.02314317, + "epoch": 0.3406481091805447, + "flos": 42813131783040.0, + "grad_norm": 1.728398525867113, + "language_loss": 0.64991343, + "learning_rate": 3.070416812648372e-06, + "loss": 0.67193443, + "num_input_tokens_seen": 60929405, + "step": 2833, + "time_per_iteration": 2.6866061687469482 + }, + { + "auxiliary_loss_clip": 0.0114815, + "auxiliary_loss_mlp": 0.01028457, + "balance_loss_clip": 1.04658735, + "balance_loss_mlp": 1.01992214, + "epoch": 0.3407683520711838, + "flos": 26761457917440.0, + "grad_norm": 2.6372775928677807, + "language_loss": 0.65778387, + "learning_rate": 3.069758718505951e-06, + "loss": 0.67954999, + "num_input_tokens_seen": 60951145, + "step": 2834, + "time_per_iteration": 2.581080675125122 + }, + { + "auxiliary_loss_clip": 0.01195987, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.05831909, + "balance_loss_mlp": 1.02621782, + "epoch": 0.3408885949618229, + "flos": 28767643309440.0, + "grad_norm": 1.7434404872494706, + "language_loss": 0.79925263, + "learning_rate": 3.0691004620819836e-06, + "loss": 0.82156956, + "num_input_tokens_seen": 60971275, + "step": 2835, + "time_per_iteration": 2.4990763664245605 + }, + { + "auxiliary_loss_clip": 0.01047633, + "auxiliary_loss_mlp": 0.01010401, + "balance_loss_clip": 1.02263391, + "balance_loss_mlp": 1.00880325, + "epoch": 0.341008837852462, + "flos": 63576252881280.0, + "grad_norm": 0.804935301143918, + "language_loss": 0.60201168, + "learning_rate": 3.0684420434763254e-06, + "loss": 0.62259209, + "num_input_tokens_seen": 61037460, + "step": 2836, + "time_per_iteration": 3.1547634601593018 + }, + { + "auxiliary_loss_clip": 0.01142323, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.05144668, + "balance_loss_mlp": 1.02376497, + "epoch": 0.34112908074310105, + "flos": 20812173120000.0, + "grad_norm": 2.532127692830593, + "language_loss": 0.76650447, + "learning_rate": 3.06778346278886e-06, + "loss": 0.78824484, + "num_input_tokens_seen": 61056295, + "step": 2837, + "time_per_iteration": 2.5629236698150635 + }, + { + "auxiliary_loss_clip": 0.01197743, + "auxiliary_loss_mlp": 0.01026809, + "balance_loss_clip": 1.05803323, + "balance_loss_mlp": 1.01801145, + "epoch": 0.34124932363374016, + "flos": 24976446520320.0, + "grad_norm": 1.901391275406814, + "language_loss": 0.79017985, + "learning_rate": 3.0671247201194906e-06, + "loss": 0.81242537, + "num_input_tokens_seen": 61078430, + "step": 2838, + "time_per_iteration": 2.5084943771362305 + }, + { + "auxiliary_loss_clip": 0.01153372, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.05002093, + "balance_loss_mlp": 1.02485526, + "epoch": 0.3413695665243792, + "flos": 28402970480640.0, + "grad_norm": 1.981325245154195, + "language_loss": 0.75134712, + "learning_rate": 3.066465815568151e-06, + "loss": 0.77321923, + "num_input_tokens_seen": 61099260, + "step": 2839, + "time_per_iteration": 2.5819296836853027 + }, + { + "auxiliary_loss_clip": 0.01181746, + "auxiliary_loss_mlp": 0.010247, + "balance_loss_clip": 1.05310631, + "balance_loss_mlp": 1.01633167, + "epoch": 0.34148980941501833, + "flos": 25302012416640.0, + "grad_norm": 1.676943970896905, + "language_loss": 0.68603557, + "learning_rate": 3.0658067492347947e-06, + "loss": 0.70810008, + "num_input_tokens_seen": 61121900, + "step": 2840, + "time_per_iteration": 2.52191162109375 + }, + { + "auxiliary_loss_clip": 0.01102427, + "auxiliary_loss_mlp": 0.01028357, + "balance_loss_clip": 1.04642701, + "balance_loss_mlp": 1.01899266, + "epoch": 0.34161005230565744, + "flos": 17530081747200.0, + "grad_norm": 2.1757534570510915, + "language_loss": 0.66721874, + "learning_rate": 3.065147521219402e-06, + "loss": 0.68852663, + "num_input_tokens_seen": 61141155, + "step": 2841, + "time_per_iteration": 2.621976613998413 + }, + { + "auxiliary_loss_clip": 0.01157426, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.05363345, + "balance_loss_mlp": 1.02253175, + "epoch": 0.3417302951962965, + "flos": 43650101566080.0, + "grad_norm": 1.505900205489974, + "language_loss": 0.7447449, + "learning_rate": 3.064488131621977e-06, + "loss": 0.76663226, + "num_input_tokens_seen": 61164480, + "step": 2842, + "time_per_iteration": 2.7003087997436523 + }, + { + "auxiliary_loss_clip": 0.01173072, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.05102408, + "balance_loss_mlp": 1.02116454, + "epoch": 0.3418505380869356, + "flos": 30882207012480.0, + "grad_norm": 1.9670325899209, + "language_loss": 0.74013901, + "learning_rate": 3.063828580542549e-06, + "loss": 0.76217288, + "num_input_tokens_seen": 61185675, + "step": 2843, + "time_per_iteration": 2.563671112060547 + }, + { + "auxiliary_loss_clip": 0.01164156, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.05249429, + "balance_loss_mlp": 1.02444267, + "epoch": 0.3419707809775747, + "flos": 19463871277440.0, + "grad_norm": 1.8238388582688947, + "language_loss": 0.73330635, + "learning_rate": 3.0631688680811706e-06, + "loss": 0.75527471, + "num_input_tokens_seen": 61205300, + "step": 2844, + "time_per_iteration": 2.5249104499816895 + }, + { + "auxiliary_loss_clip": 0.01196271, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.05699253, + "balance_loss_mlp": 1.0250113, + "epoch": 0.3420910238682138, + "flos": 28727818104960.0, + "grad_norm": 1.8384254467176504, + "language_loss": 0.75472414, + "learning_rate": 3.062508994337921e-06, + "loss": 0.77702457, + "num_input_tokens_seen": 61224905, + "step": 2845, + "time_per_iteration": 3.244797945022583 + }, + { + "auxiliary_loss_clip": 0.01180601, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.05151391, + "balance_loss_mlp": 1.019032, + "epoch": 0.3422112667588529, + "flos": 21397265758080.0, + "grad_norm": 1.8659524879895217, + "language_loss": 0.79738677, + "learning_rate": 3.0618489594129013e-06, + "loss": 0.81947082, + "num_input_tokens_seen": 61243045, + "step": 2846, + "time_per_iteration": 2.4746851921081543 + }, + { + "auxiliary_loss_clip": 0.01156673, + "auxiliary_loss_mlp": 0.01027428, + "balance_loss_clip": 1.05447721, + "balance_loss_mlp": 1.01917887, + "epoch": 0.342331509649492, + "flos": 13881450038400.0, + "grad_norm": 2.2217191477369993, + "language_loss": 0.71086645, + "learning_rate": 3.061188763406239e-06, + "loss": 0.73270744, + "num_input_tokens_seen": 61259190, + "step": 2847, + "time_per_iteration": 3.288712978363037 + }, + { + "auxiliary_loss_clip": 0.01160767, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.05152798, + "balance_loss_mlp": 1.02380443, + "epoch": 0.34245175254013105, + "flos": 28621450955520.0, + "grad_norm": 2.232644053188864, + "language_loss": 0.82202387, + "learning_rate": 3.060528406418085e-06, + "loss": 0.84395766, + "num_input_tokens_seen": 61279040, + "step": 2848, + "time_per_iteration": 3.3496503829956055 + }, + { + "auxiliary_loss_clip": 0.01159428, + "auxiliary_loss_mlp": 0.01030887, + "balance_loss_clip": 1.05271101, + "balance_loss_mlp": 1.0225898, + "epoch": 0.34257199543077016, + "flos": 34127058960000.0, + "grad_norm": 2.1846822642160797, + "language_loss": 0.61838329, + "learning_rate": 3.0598678885486145e-06, + "loss": 0.64028645, + "num_input_tokens_seen": 61301580, + "step": 2849, + "time_per_iteration": 3.3382015228271484 + }, + { + "auxiliary_loss_clip": 0.01154722, + "auxiliary_loss_mlp": 0.00763371, + "balance_loss_clip": 1.04985929, + "balance_loss_mlp": 1.00064027, + "epoch": 0.34269223832140927, + "flos": 19974018188160.0, + "grad_norm": 1.7196059660632055, + "language_loss": 0.74220562, + "learning_rate": 3.0592072098980282e-06, + "loss": 0.76138657, + "num_input_tokens_seen": 61321240, + "step": 2850, + "time_per_iteration": 2.517838716506958 + }, + { + "auxiliary_loss_clip": 0.0115647, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.04971373, + "balance_loss_mlp": 1.02232289, + "epoch": 0.3428124812120483, + "flos": 27235658292480.0, + "grad_norm": 2.39301587021939, + "language_loss": 0.72636414, + "learning_rate": 3.0585463705665514e-06, + "loss": 0.74823725, + "num_input_tokens_seen": 61341615, + "step": 2851, + "time_per_iteration": 2.529114007949829 + }, + { + "auxiliary_loss_clip": 0.01148109, + "auxiliary_loss_mlp": 0.01030737, + "balance_loss_clip": 1.04759347, + "balance_loss_mlp": 1.02185023, + "epoch": 0.34293272410268744, + "flos": 24570871079040.0, + "grad_norm": 2.4022085548549525, + "language_loss": 0.71231651, + "learning_rate": 3.0578853706544304e-06, + "loss": 0.73410493, + "num_input_tokens_seen": 61359005, + "step": 2852, + "time_per_iteration": 2.546445608139038 + }, + { + "auxiliary_loss_clip": 0.01154135, + "auxiliary_loss_mlp": 0.00763597, + "balance_loss_clip": 1.05195951, + "balance_loss_mlp": 1.00066793, + "epoch": 0.34305296699332655, + "flos": 21506865131520.0, + "grad_norm": 1.9874235671092233, + "language_loss": 0.65565145, + "learning_rate": 3.0572242102619404e-06, + "loss": 0.67482877, + "num_input_tokens_seen": 61376160, + "step": 2853, + "time_per_iteration": 2.533574342727661 + }, + { + "auxiliary_loss_clip": 0.01163533, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.05257344, + "balance_loss_mlp": 1.02061033, + "epoch": 0.3431732098839656, + "flos": 24056665931520.0, + "grad_norm": 1.888413133850911, + "language_loss": 0.80521393, + "learning_rate": 3.0565628894893784e-06, + "loss": 0.8271426, + "num_input_tokens_seen": 61396795, + "step": 2854, + "time_per_iteration": 2.541962146759033 + }, + { + "auxiliary_loss_clip": 0.01173105, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.05500293, + "balance_loss_mlp": 1.02445865, + "epoch": 0.3432934527746047, + "flos": 16800879744000.0, + "grad_norm": 1.652124412071531, + "language_loss": 0.74729729, + "learning_rate": 3.0559014084370655e-06, + "loss": 0.7693634, + "num_input_tokens_seen": 61415320, + "step": 2855, + "time_per_iteration": 2.451120376586914 + }, + { + "auxiliary_loss_clip": 0.01171198, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.05384779, + "balance_loss_mlp": 1.02477908, + "epoch": 0.34341369566524377, + "flos": 23439720908160.0, + "grad_norm": 1.590507137768576, + "language_loss": 0.78410745, + "learning_rate": 3.055239767205349e-06, + "loss": 0.80616236, + "num_input_tokens_seen": 61437070, + "step": 2856, + "time_per_iteration": 2.533008337020874 + }, + { + "auxiliary_loss_clip": 0.01182863, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.0596844, + "balance_loss_mlp": 1.02349186, + "epoch": 0.3435339385558829, + "flos": 17267466435840.0, + "grad_norm": 2.123952501271127, + "language_loss": 0.78046811, + "learning_rate": 3.054577965894599e-06, + "loss": 0.80262017, + "num_input_tokens_seen": 61453215, + "step": 2857, + "time_per_iteration": 2.4439613819122314 + }, + { + "auxiliary_loss_clip": 0.01176925, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.05975938, + "balance_loss_mlp": 1.01802754, + "epoch": 0.343654181446522, + "flos": 22199366413440.0, + "grad_norm": 1.6531798875423647, + "language_loss": 0.70256084, + "learning_rate": 3.0539160046052094e-06, + "loss": 0.72461212, + "num_input_tokens_seen": 61472915, + "step": 2858, + "time_per_iteration": 2.5130748748779297 + }, + { + "auxiliary_loss_clip": 0.01158663, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.05098045, + "balance_loss_mlp": 1.02420175, + "epoch": 0.34377442433716104, + "flos": 19901801894400.0, + "grad_norm": 2.029582133045863, + "language_loss": 0.70467836, + "learning_rate": 3.0532538834376003e-06, + "loss": 0.72660768, + "num_input_tokens_seen": 61492475, + "step": 2859, + "time_per_iteration": 2.4995782375335693 + }, + { + "auxiliary_loss_clip": 0.01184671, + "auxiliary_loss_mlp": 0.01035972, + "balance_loss_clip": 1.05428779, + "balance_loss_mlp": 1.02691555, + "epoch": 0.34389466722780015, + "flos": 22197678474240.0, + "grad_norm": 1.9430886281417006, + "language_loss": 0.77912956, + "learning_rate": 3.0525916024922143e-06, + "loss": 0.80133599, + "num_input_tokens_seen": 61511660, + "step": 2860, + "time_per_iteration": 2.4941091537475586 + }, + { + "auxiliary_loss_clip": 0.01165472, + "auxiliary_loss_mlp": 0.01030495, + "balance_loss_clip": 1.05243683, + "balance_loss_mlp": 1.02170372, + "epoch": 0.34401491011843927, + "flos": 18624567110400.0, + "grad_norm": 2.5838928541188526, + "language_loss": 0.83576131, + "learning_rate": 3.0519291618695193e-06, + "loss": 0.85772097, + "num_input_tokens_seen": 61529060, + "step": 2861, + "time_per_iteration": 2.4648218154907227 + }, + { + "auxiliary_loss_clip": 0.01143939, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.04797721, + "balance_loss_mlp": 1.02353394, + "epoch": 0.3441351530090783, + "flos": 17858197509120.0, + "grad_norm": 1.6306772263983482, + "language_loss": 0.75700188, + "learning_rate": 3.0512665616700065e-06, + "loss": 0.77876449, + "num_input_tokens_seen": 61548125, + "step": 2862, + "time_per_iteration": 2.509263753890991 + }, + { + "auxiliary_loss_clip": 0.01129015, + "auxiliary_loss_mlp": 0.01039081, + "balance_loss_clip": 1.04776955, + "balance_loss_mlp": 1.03077245, + "epoch": 0.34425539589971743, + "flos": 23112754381440.0, + "grad_norm": 2.3566564738114053, + "language_loss": 0.89074892, + "learning_rate": 3.0506038019941933e-06, + "loss": 0.91242981, + "num_input_tokens_seen": 61568135, + "step": 2863, + "time_per_iteration": 2.5926826000213623 + }, + { + "auxiliary_loss_clip": 0.01152188, + "auxiliary_loss_mlp": 0.01027343, + "balance_loss_clip": 1.05234241, + "balance_loss_mlp": 1.01860547, + "epoch": 0.34437563879035654, + "flos": 21907699977600.0, + "grad_norm": 2.401009723314293, + "language_loss": 0.67920655, + "learning_rate": 3.049940882942617e-06, + "loss": 0.70100188, + "num_input_tokens_seen": 61586920, + "step": 2864, + "time_per_iteration": 2.5518712997436523 + }, + { + "auxiliary_loss_clip": 0.01193904, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.05401921, + "balance_loss_mlp": 1.02460361, + "epoch": 0.3444958816809956, + "flos": 23076915586560.0, + "grad_norm": 1.9229640955338199, + "language_loss": 0.80120778, + "learning_rate": 3.0492778046158448e-06, + "loss": 0.82348394, + "num_input_tokens_seen": 61608340, + "step": 2865, + "time_per_iteration": 2.4503495693206787 + }, + { + "auxiliary_loss_clip": 0.01180552, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.05709839, + "balance_loss_mlp": 1.0235256, + "epoch": 0.3446161245716347, + "flos": 21908633731200.0, + "grad_norm": 2.2726674770985467, + "language_loss": 0.76528776, + "learning_rate": 3.0486145671144633e-06, + "loss": 0.78741145, + "num_input_tokens_seen": 61628130, + "step": 2866, + "time_per_iteration": 2.4698193073272705 + }, + { + "auxiliary_loss_clip": 0.01104164, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.04523563, + "balance_loss_mlp": 1.0235008, + "epoch": 0.3447363674622738, + "flos": 25112834461440.0, + "grad_norm": 2.5861919413039445, + "language_loss": 0.76856124, + "learning_rate": 3.047951170539086e-06, + "loss": 0.78993315, + "num_input_tokens_seen": 61647755, + "step": 2867, + "time_per_iteration": 2.6253538131713867 + }, + { + "auxiliary_loss_clip": 0.01151535, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.05722439, + "balance_loss_mlp": 1.02795339, + "epoch": 0.3448566103529129, + "flos": 11984684451840.0, + "grad_norm": 1.818736361571733, + "language_loss": 0.84449422, + "learning_rate": 3.047287614990349e-06, + "loss": 0.86637294, + "num_input_tokens_seen": 61665675, + "step": 2868, + "time_per_iteration": 2.5070178508758545 + }, + { + "auxiliary_loss_clip": 0.01160307, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.0528332, + "balance_loss_mlp": 1.02020121, + "epoch": 0.344976853243552, + "flos": 40187882465280.0, + "grad_norm": 3.8288414197554945, + "language_loss": 0.61763012, + "learning_rate": 3.046623900568914e-06, + "loss": 0.63953042, + "num_input_tokens_seen": 61688240, + "step": 2869, + "time_per_iteration": 2.663675546646118 + }, + { + "auxiliary_loss_clip": 0.01161073, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.05132258, + "balance_loss_mlp": 1.02111959, + "epoch": 0.34509709613419104, + "flos": 28723652127360.0, + "grad_norm": 2.4285785816624346, + "language_loss": 0.70211416, + "learning_rate": 3.045960027375465e-06, + "loss": 0.72402865, + "num_input_tokens_seen": 61706075, + "step": 2870, + "time_per_iteration": 2.541485071182251 + }, + { + "auxiliary_loss_clip": 0.01183639, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.0530045, + "balance_loss_mlp": 1.01969886, + "epoch": 0.34521733902483015, + "flos": 29967597982080.0, + "grad_norm": 3.1966799991610553, + "language_loss": 0.82810187, + "learning_rate": 3.045295995510711e-06, + "loss": 0.85022801, + "num_input_tokens_seen": 61723045, + "step": 2871, + "time_per_iteration": 3.2142112255096436 + }, + { + "auxiliary_loss_clip": 0.01162186, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.05269814, + "balance_loss_mlp": 1.0210377, + "epoch": 0.34533758191546926, + "flos": 27923059843200.0, + "grad_norm": 1.741775565869239, + "language_loss": 0.73754513, + "learning_rate": 3.0446318050753865e-06, + "loss": 0.75945908, + "num_input_tokens_seen": 61743525, + "step": 2872, + "time_per_iteration": 2.555711269378662 + }, + { + "auxiliary_loss_clip": 0.0117349, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.05234313, + "balance_loss_mlp": 1.02534986, + "epoch": 0.3454578248061083, + "flos": 27125879351040.0, + "grad_norm": 1.9503033197779922, + "language_loss": 0.77452099, + "learning_rate": 3.0439674561702474e-06, + "loss": 0.79659432, + "num_input_tokens_seen": 61763025, + "step": 2873, + "time_per_iteration": 2.5056686401367188 + }, + { + "auxiliary_loss_clip": 0.01177953, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.05479527, + "balance_loss_mlp": 1.02139044, + "epoch": 0.3455780676967474, + "flos": 19024899166080.0, + "grad_norm": 2.233078591609664, + "language_loss": 0.87695014, + "learning_rate": 3.043302948896076e-06, + "loss": 0.89903069, + "num_input_tokens_seen": 61781630, + "step": 2874, + "time_per_iteration": 3.96063232421875 + }, + { + "auxiliary_loss_clip": 0.01126114, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.04957259, + "balance_loss_mlp": 1.02322853, + "epoch": 0.34569831058738654, + "flos": 34496005507200.0, + "grad_norm": 2.221843418065651, + "language_loss": 0.60392714, + "learning_rate": 3.0426382833536756e-06, + "loss": 0.62551403, + "num_input_tokens_seen": 61804985, + "step": 2875, + "time_per_iteration": 3.3508224487304688 + }, + { + "auxiliary_loss_clip": 0.01145961, + "auxiliary_loss_mlp": 0.01026781, + "balance_loss_clip": 1.04783463, + "balance_loss_mlp": 1.01859164, + "epoch": 0.3458185534780256, + "flos": 31138681098240.0, + "grad_norm": 3.4498638258087397, + "language_loss": 0.77351832, + "learning_rate": 3.041973459643877e-06, + "loss": 0.79524577, + "num_input_tokens_seen": 61824440, + "step": 2876, + "time_per_iteration": 2.610325574874878 + }, + { + "auxiliary_loss_clip": 0.0112718, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.04467893, + "balance_loss_mlp": 1.0199405, + "epoch": 0.3459387963686647, + "flos": 32452508862720.0, + "grad_norm": 1.9542463769361458, + "language_loss": 0.66959, + "learning_rate": 3.0413084778675334e-06, + "loss": 0.69114923, + "num_input_tokens_seen": 61845690, + "step": 2877, + "time_per_iteration": 2.6605048179626465 + }, + { + "auxiliary_loss_clip": 0.01155939, + "auxiliary_loss_mlp": 0.00762881, + "balance_loss_clip": 1.04765248, + "balance_loss_mlp": 1.00075459, + "epoch": 0.3460590392593038, + "flos": 24675658030080.0, + "grad_norm": 1.866440939610675, + "language_loss": 0.84204972, + "learning_rate": 3.0406433381255214e-06, + "loss": 0.86123794, + "num_input_tokens_seen": 61863725, + "step": 2878, + "time_per_iteration": 2.531266689300537 + }, + { + "auxiliary_loss_clip": 0.01179018, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.05627298, + "balance_loss_mlp": 1.02150941, + "epoch": 0.34617928214994287, + "flos": 18807316531200.0, + "grad_norm": 2.0942274569896995, + "language_loss": 0.82148784, + "learning_rate": 3.0399780405187425e-06, + "loss": 0.84357274, + "num_input_tokens_seen": 61882720, + "step": 2879, + "time_per_iteration": 2.469900369644165 + }, + { + "auxiliary_loss_clip": 0.01175139, + "auxiliary_loss_mlp": 0.01026508, + "balance_loss_clip": 1.05226398, + "balance_loss_mlp": 1.01878929, + "epoch": 0.346299525040582, + "flos": 24857653265280.0, + "grad_norm": 1.7994011445703668, + "language_loss": 0.78476977, + "learning_rate": 3.0393125851481216e-06, + "loss": 0.80678624, + "num_input_tokens_seen": 61902595, + "step": 2880, + "time_per_iteration": 2.5140111446380615 + }, + { + "auxiliary_loss_clip": 0.01147224, + "auxiliary_loss_mlp": 0.01024335, + "balance_loss_clip": 1.05094981, + "balance_loss_mlp": 1.0160439, + "epoch": 0.3464197679312211, + "flos": 16434914025600.0, + "grad_norm": 2.2592347423582, + "language_loss": 0.86543477, + "learning_rate": 3.038646972114608e-06, + "loss": 0.88715041, + "num_input_tokens_seen": 61918920, + "step": 2881, + "time_per_iteration": 2.564629316329956 + }, + { + "auxiliary_loss_clip": 0.01147571, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.05250168, + "balance_loss_mlp": 1.02788949, + "epoch": 0.34654001082186014, + "flos": 22382474970240.0, + "grad_norm": 2.632703589677175, + "language_loss": 0.67288917, + "learning_rate": 3.037981201519174e-06, + "loss": 0.69472957, + "num_input_tokens_seen": 61939520, + "step": 2882, + "time_per_iteration": 2.5628559589385986 + }, + { + "auxiliary_loss_clip": 0.0117856, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.05576372, + "balance_loss_mlp": 1.02261257, + "epoch": 0.34666025371249926, + "flos": 19573901614080.0, + "grad_norm": 2.295539426339597, + "language_loss": 0.71642482, + "learning_rate": 3.0373152734628175e-06, + "loss": 0.73851907, + "num_input_tokens_seen": 61957800, + "step": 2883, + "time_per_iteration": 2.555556535720825 + }, + { + "auxiliary_loss_clip": 0.0117369, + "auxiliary_loss_mlp": 0.01025671, + "balance_loss_clip": 1.05097461, + "balance_loss_mlp": 1.01721883, + "epoch": 0.34678049660313837, + "flos": 15267637751040.0, + "grad_norm": 1.814490843707594, + "language_loss": 0.75928497, + "learning_rate": 3.0366491880465584e-06, + "loss": 0.78127849, + "num_input_tokens_seen": 61975820, + "step": 2884, + "time_per_iteration": 2.441594362258911 + }, + { + "auxiliary_loss_clip": 0.01196632, + "auxiliary_loss_mlp": 0.01030364, + "balance_loss_clip": 1.05709314, + "balance_loss_mlp": 1.02142954, + "epoch": 0.3469007394937774, + "flos": 21181550630400.0, + "grad_norm": 1.63405934323175, + "language_loss": 0.82011348, + "learning_rate": 3.035982945371443e-06, + "loss": 0.8423835, + "num_input_tokens_seen": 61997515, + "step": 2885, + "time_per_iteration": 2.4701333045959473 + }, + { + "auxiliary_loss_clip": 0.01170726, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.05226326, + "balance_loss_mlp": 1.01917577, + "epoch": 0.34702098238441653, + "flos": 22375471818240.0, + "grad_norm": 2.541085749889714, + "language_loss": 0.85459268, + "learning_rate": 3.035316545538537e-06, + "loss": 0.87658221, + "num_input_tokens_seen": 62016310, + "step": 2886, + "time_per_iteration": 2.486790895462036 + }, + { + "auxiliary_loss_clip": 0.01164861, + "auxiliary_loss_mlp": 0.01029194, + "balance_loss_clip": 1.05683661, + "balance_loss_mlp": 1.02094507, + "epoch": 0.3471412252750556, + "flos": 22929430343040.0, + "grad_norm": 2.24358098581038, + "language_loss": 0.79081726, + "learning_rate": 3.034649988648935e-06, + "loss": 0.81275785, + "num_input_tokens_seen": 62036075, + "step": 2887, + "time_per_iteration": 2.5009753704071045 + }, + { + "auxiliary_loss_clip": 0.0116696, + "auxiliary_loss_mlp": 0.01024694, + "balance_loss_clip": 1.05226016, + "balance_loss_mlp": 1.01618838, + "epoch": 0.3472614681656947, + "flos": 21324259365120.0, + "grad_norm": 1.972103109058321, + "language_loss": 0.80416858, + "learning_rate": 3.033983274803752e-06, + "loss": 0.82608509, + "num_input_tokens_seen": 62055865, + "step": 2888, + "time_per_iteration": 2.4922170639038086 + }, + { + "auxiliary_loss_clip": 0.01158647, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.04899359, + "balance_loss_mlp": 1.02322173, + "epoch": 0.3473817110563338, + "flos": 23475739271040.0, + "grad_norm": 2.231706700079561, + "language_loss": 0.71795428, + "learning_rate": 3.0333164041041283e-06, + "loss": 0.73985744, + "num_input_tokens_seen": 62072180, + "step": 2889, + "time_per_iteration": 2.4915952682495117 + }, + { + "auxiliary_loss_clip": 0.01121197, + "auxiliary_loss_mlp": 0.01024944, + "balance_loss_clip": 1.04535258, + "balance_loss_mlp": 1.01693308, + "epoch": 0.34750195394697286, + "flos": 22346025644160.0, + "grad_norm": 1.7688964068689104, + "language_loss": 0.71430612, + "learning_rate": 3.032649376651228e-06, + "loss": 0.73576754, + "num_input_tokens_seen": 62091600, + "step": 2890, + "time_per_iteration": 2.601106643676758 + }, + { + "auxiliary_loss_clip": 0.01149724, + "auxiliary_loss_mlp": 0.01028287, + "balance_loss_clip": 1.05027318, + "balance_loss_mlp": 1.01927471, + "epoch": 0.347622196837612, + "flos": 29095004885760.0, + "grad_norm": 1.5623329932587184, + "language_loss": 0.75879991, + "learning_rate": 3.031982192546238e-06, + "loss": 0.78058004, + "num_input_tokens_seen": 62114695, + "step": 2891, + "time_per_iteration": 2.6009016036987305 + }, + { + "auxiliary_loss_clip": 0.01180851, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.05389166, + "balance_loss_mlp": 1.02132034, + "epoch": 0.3477424397282511, + "flos": 22455732758400.0, + "grad_norm": 2.299443022530129, + "language_loss": 0.94334924, + "learning_rate": 3.0313148518903696e-06, + "loss": 0.9654544, + "num_input_tokens_seen": 62134520, + "step": 2892, + "time_per_iteration": 2.482887029647827 + }, + { + "auxiliary_loss_clip": 0.01168184, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.05532682, + "balance_loss_mlp": 1.01804698, + "epoch": 0.34786268261889014, + "flos": 15778790242560.0, + "grad_norm": 2.2438598632383235, + "language_loss": 0.81531549, + "learning_rate": 3.030647354784859e-06, + "loss": 0.83726382, + "num_input_tokens_seen": 62151560, + "step": 2893, + "time_per_iteration": 2.4820363521575928 + }, + { + "auxiliary_loss_clip": 0.01147992, + "auxiliary_loss_mlp": 0.01033732, + "balance_loss_clip": 1.04956007, + "balance_loss_mlp": 1.02530408, + "epoch": 0.34798292550952925, + "flos": 20777627214720.0, + "grad_norm": 1.9508901538156684, + "language_loss": 0.77212918, + "learning_rate": 3.029979701330964e-06, + "loss": 0.7939465, + "num_input_tokens_seen": 62170985, + "step": 2894, + "time_per_iteration": 2.5340960025787354 + }, + { + "auxiliary_loss_clip": 0.01169157, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.05256808, + "balance_loss_mlp": 1.0220679, + "epoch": 0.34810316840016836, + "flos": 19937820257280.0, + "grad_norm": 2.09732149988426, + "language_loss": 0.79709423, + "learning_rate": 3.029311891629966e-06, + "loss": 0.81909156, + "num_input_tokens_seen": 62189440, + "step": 2895, + "time_per_iteration": 2.5017571449279785 + }, + { + "auxiliary_loss_clip": 0.01161541, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.0529263, + "balance_loss_mlp": 1.02623141, + "epoch": 0.3482234112908074, + "flos": 23623296341760.0, + "grad_norm": 1.7290500468284633, + "language_loss": 0.74346375, + "learning_rate": 3.0286439257831744e-06, + "loss": 0.76542884, + "num_input_tokens_seen": 62208910, + "step": 2896, + "time_per_iteration": 2.525639533996582 + }, + { + "auxiliary_loss_clip": 0.01198218, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.05644393, + "balance_loss_mlp": 1.02434754, + "epoch": 0.3483436541814465, + "flos": 23986712194560.0, + "grad_norm": 2.1745056098220474, + "language_loss": 0.71632069, + "learning_rate": 3.0279758038919156e-06, + "loss": 0.73864591, + "num_input_tokens_seen": 62227135, + "step": 2897, + "time_per_iteration": 3.250478982925415 + }, + { + "auxiliary_loss_clip": 0.01180537, + "auxiliary_loss_mlp": 0.01029945, + "balance_loss_clip": 1.05506778, + "balance_loss_mlp": 1.02134705, + "epoch": 0.34846389707208564, + "flos": 22638338524800.0, + "grad_norm": 2.045968673671511, + "language_loss": 0.78465033, + "learning_rate": 3.0273075260575455e-06, + "loss": 0.80675507, + "num_input_tokens_seen": 62246035, + "step": 2898, + "time_per_iteration": 2.492673873901367 + }, + { + "auxiliary_loss_clip": 0.01168181, + "auxiliary_loss_mlp": 0.01030396, + "balance_loss_clip": 1.05336714, + "balance_loss_mlp": 1.02095509, + "epoch": 0.3485841399627247, + "flos": 21792857218560.0, + "grad_norm": 1.8857491523208423, + "language_loss": 0.80677193, + "learning_rate": 3.0266390923814396e-06, + "loss": 0.8287577, + "num_input_tokens_seen": 62264095, + "step": 2899, + "time_per_iteration": 2.497706651687622 + }, + { + "auxiliary_loss_clip": 0.01171523, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.05844331, + "balance_loss_mlp": 1.02109587, + "epoch": 0.3487043828533638, + "flos": 17019036996480.0, + "grad_norm": 1.7691304519686462, + "language_loss": 0.81762958, + "learning_rate": 3.0259705029650008e-06, + "loss": 0.83965075, + "num_input_tokens_seen": 62282025, + "step": 2900, + "time_per_iteration": 2.4797298908233643 + }, + { + "auxiliary_loss_clip": 0.01180214, + "auxiliary_loss_mlp": 0.01025323, + "balance_loss_clip": 1.0529325, + "balance_loss_mlp": 1.01727676, + "epoch": 0.34882462574400286, + "flos": 22601135013120.0, + "grad_norm": 1.6867583786971387, + "language_loss": 0.72933799, + "learning_rate": 3.025301757909652e-06, + "loss": 0.75139332, + "num_input_tokens_seen": 62302220, + "step": 2901, + "time_per_iteration": 3.9207589626312256 + }, + { + "auxiliary_loss_clip": 0.01154307, + "auxiliary_loss_mlp": 0.00764023, + "balance_loss_clip": 1.05216193, + "balance_loss_mlp": 1.00083578, + "epoch": 0.34894486863464197, + "flos": 29861518141440.0, + "grad_norm": 1.5395629920749376, + "language_loss": 0.80679584, + "learning_rate": 3.024632857316842e-06, + "loss": 0.82597917, + "num_input_tokens_seen": 62323535, + "step": 2902, + "time_per_iteration": 3.3698437213897705 + }, + { + "auxiliary_loss_clip": 0.01184507, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.0586791, + "balance_loss_mlp": 1.01893973, + "epoch": 0.3490651115252811, + "flos": 22122265870080.0, + "grad_norm": 3.109435833484366, + "language_loss": 0.7714867, + "learning_rate": 3.0239638012880412e-06, + "loss": 0.79361141, + "num_input_tokens_seen": 62343430, + "step": 2903, + "time_per_iteration": 2.516875982284546 + }, + { + "auxiliary_loss_clip": 0.01127513, + "auxiliary_loss_mlp": 0.01025843, + "balance_loss_clip": 1.04602373, + "balance_loss_mlp": 1.01674724, + "epoch": 0.34918535441592014, + "flos": 12676682943360.0, + "grad_norm": 2.2237183465939943, + "language_loss": 0.80886829, + "learning_rate": 3.0232945899247466e-06, + "loss": 0.8304019, + "num_input_tokens_seen": 62360365, + "step": 2904, + "time_per_iteration": 2.6393516063690186 + }, + { + "auxiliary_loss_clip": 0.01181155, + "auxiliary_loss_mlp": 0.01035234, + "balance_loss_clip": 1.05302548, + "balance_loss_mlp": 1.02579236, + "epoch": 0.34930559730655925, + "flos": 23185617120000.0, + "grad_norm": 2.1096444568463237, + "language_loss": 0.77372742, + "learning_rate": 3.022625223328476e-06, + "loss": 0.79589123, + "num_input_tokens_seen": 62382105, + "step": 2905, + "time_per_iteration": 2.5398383140563965 + }, + { + "auxiliary_loss_clip": 0.01188624, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.05731726, + "balance_loss_mlp": 1.0223701, + "epoch": 0.34942584019719836, + "flos": 22855023319680.0, + "grad_norm": 1.3900172053642388, + "language_loss": 0.69309574, + "learning_rate": 3.0219557016007723e-06, + "loss": 0.71531028, + "num_input_tokens_seen": 62402235, + "step": 2906, + "time_per_iteration": 2.4978513717651367 + }, + { + "auxiliary_loss_clip": 0.01177456, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.05566347, + "balance_loss_mlp": 1.02103209, + "epoch": 0.3495460830878374, + "flos": 24426043441920.0, + "grad_norm": 1.8452138626268817, + "language_loss": 0.69535267, + "learning_rate": 3.021286024843202e-06, + "loss": 0.71742922, + "num_input_tokens_seen": 62420430, + "step": 2907, + "time_per_iteration": 2.4991188049316406 + }, + { + "auxiliary_loss_clip": 0.01099158, + "auxiliary_loss_mlp": 0.01003375, + "balance_loss_clip": 1.02698827, + "balance_loss_mlp": 1.00177729, + "epoch": 0.3496663259784765, + "flos": 70008749389440.0, + "grad_norm": 1.0688307229658462, + "language_loss": 0.64837867, + "learning_rate": 3.0206161931573526e-06, + "loss": 0.66940397, + "num_input_tokens_seen": 62472980, + "step": 2908, + "time_per_iteration": 2.9564716815948486 + }, + { + "auxiliary_loss_clip": 0.01162324, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.04948294, + "balance_loss_mlp": 1.02126956, + "epoch": 0.34978656886911563, + "flos": 28692805322880.0, + "grad_norm": 1.6125963697306167, + "language_loss": 0.93101496, + "learning_rate": 3.0199462066448388e-06, + "loss": 0.95293498, + "num_input_tokens_seen": 62495175, + "step": 2909, + "time_per_iteration": 2.625577926635742 + }, + { + "auxiliary_loss_clip": 0.01181338, + "auxiliary_loss_mlp": 0.01027369, + "balance_loss_clip": 1.05695713, + "balance_loss_mlp": 1.01827288, + "epoch": 0.3499068117597547, + "flos": 21142156389120.0, + "grad_norm": 1.742618865085266, + "language_loss": 0.69488662, + "learning_rate": 3.019276065407296e-06, + "loss": 0.71697366, + "num_input_tokens_seen": 62514295, + "step": 2910, + "time_per_iteration": 2.5353353023529053 + }, + { + "auxiliary_loss_clip": 0.01139763, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.05003905, + "balance_loss_mlp": 1.02496088, + "epoch": 0.3500270546503938, + "flos": 22782699285120.0, + "grad_norm": 1.7406970800295702, + "language_loss": 0.80826402, + "learning_rate": 3.018605769546385e-06, + "loss": 0.82999986, + "num_input_tokens_seen": 62534850, + "step": 2911, + "time_per_iteration": 2.6635499000549316 + }, + { + "auxiliary_loss_clip": 0.01178094, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.0518055, + "balance_loss_mlp": 1.02328229, + "epoch": 0.3501472975410329, + "flos": 22894058424960.0, + "grad_norm": 1.9306785258517345, + "language_loss": 0.79697907, + "learning_rate": 3.017935319163788e-06, + "loss": 0.81908488, + "num_input_tokens_seen": 62553810, + "step": 2912, + "time_per_iteration": 2.4965648651123047 + }, + { + "auxiliary_loss_clip": 0.01180672, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.05506146, + "balance_loss_mlp": 1.02273965, + "epoch": 0.35026754043167196, + "flos": 25446588658560.0, + "grad_norm": 1.7356348559772652, + "language_loss": 0.70570105, + "learning_rate": 3.017264714361213e-06, + "loss": 0.72783589, + "num_input_tokens_seen": 62573460, + "step": 2913, + "time_per_iteration": 2.5187482833862305 + }, + { + "auxiliary_loss_clip": 0.01164368, + "auxiliary_loss_mlp": 0.00763999, + "balance_loss_clip": 1.05260348, + "balance_loss_mlp": 1.00088406, + "epoch": 0.3503877833223111, + "flos": 19573757959680.0, + "grad_norm": 2.759164743883132, + "language_loss": 0.82112199, + "learning_rate": 3.016593955240389e-06, + "loss": 0.8404057, + "num_input_tokens_seen": 62592150, + "step": 2914, + "time_per_iteration": 2.5254945755004883 + }, + { + "auxiliary_loss_clip": 0.01083358, + "auxiliary_loss_mlp": 0.01002033, + "balance_loss_clip": 1.02355206, + "balance_loss_mlp": 1.00048339, + "epoch": 0.3505080262129502, + "flos": 65072075880960.0, + "grad_norm": 0.8215890082987174, + "language_loss": 0.63707143, + "learning_rate": 3.015923041903071e-06, + "loss": 0.65792531, + "num_input_tokens_seen": 62658275, + "step": 2915, + "time_per_iteration": 3.11413311958313 + }, + { + "auxiliary_loss_clip": 0.01182578, + "auxiliary_loss_mlp": 0.01031794, + "balance_loss_clip": 1.05847073, + "balance_loss_mlp": 1.02225733, + "epoch": 0.35062826910358924, + "flos": 29314562768640.0, + "grad_norm": 2.054957092863907, + "language_loss": 0.8302536, + "learning_rate": 3.0152519744510347e-06, + "loss": 0.85239732, + "num_input_tokens_seen": 62678075, + "step": 2916, + "time_per_iteration": 2.5410103797912598 + }, + { + "auxiliary_loss_clip": 0.01153892, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.05183697, + "balance_loss_mlp": 1.0211916, + "epoch": 0.35074851199422835, + "flos": 23987717775360.0, + "grad_norm": 2.0007461982092316, + "language_loss": 0.82824576, + "learning_rate": 3.014580752986081e-06, + "loss": 0.85008937, + "num_input_tokens_seen": 62696950, + "step": 2917, + "time_per_iteration": 2.5614936351776123 + }, + { + "auxiliary_loss_clip": 0.01137944, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.05007589, + "balance_loss_mlp": 1.02535772, + "epoch": 0.3508687548848674, + "flos": 15224436668160.0, + "grad_norm": 2.028301814558777, + "language_loss": 0.78548515, + "learning_rate": 3.0139093776100345e-06, + "loss": 0.80720448, + "num_input_tokens_seen": 62713540, + "step": 2918, + "time_per_iteration": 2.5484001636505127 + }, + { + "auxiliary_loss_clip": 0.01191651, + "auxiliary_loss_mlp": 0.01028657, + "balance_loss_clip": 1.05457401, + "balance_loss_mlp": 1.01946545, + "epoch": 0.3509889977755065, + "flos": 21361750185600.0, + "grad_norm": 1.757032227733582, + "language_loss": 0.75583351, + "learning_rate": 3.013237848424741e-06, + "loss": 0.77803659, + "num_input_tokens_seen": 62732925, + "step": 2919, + "time_per_iteration": 2.4567196369171143 + }, + { + "auxiliary_loss_clip": 0.01167114, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.05388904, + "balance_loss_mlp": 1.02423, + "epoch": 0.35110924066614563, + "flos": 19135360465920.0, + "grad_norm": 2.9658738545001517, + "language_loss": 0.75327283, + "learning_rate": 3.012566165532072e-06, + "loss": 0.77527255, + "num_input_tokens_seen": 62751715, + "step": 2920, + "time_per_iteration": 2.491041421890259 + }, + { + "auxiliary_loss_clip": 0.01128089, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.0478723, + "balance_loss_mlp": 1.01907086, + "epoch": 0.3512294835567847, + "flos": 21980885938560.0, + "grad_norm": 2.259115742230466, + "language_loss": 0.76858199, + "learning_rate": 3.0118943290339207e-06, + "loss": 0.79014242, + "num_input_tokens_seen": 62771925, + "step": 2921, + "time_per_iteration": 2.610241174697876 + }, + { + "auxiliary_loss_clip": 0.01141564, + "auxiliary_loss_mlp": 0.0102622, + "balance_loss_clip": 1.04698241, + "balance_loss_mlp": 1.01679683, + "epoch": 0.3513497264474238, + "flos": 17817294896640.0, + "grad_norm": 2.3742067937869025, + "language_loss": 0.68295681, + "learning_rate": 3.011222339032204e-06, + "loss": 0.70463467, + "num_input_tokens_seen": 62790075, + "step": 2922, + "time_per_iteration": 2.516279935836792 + }, + { + "auxiliary_loss_clip": 0.01193754, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.0556494, + "balance_loss_mlp": 1.02075291, + "epoch": 0.3514699693380629, + "flos": 26943417239040.0, + "grad_norm": 1.806566019088795, + "language_loss": 0.69603145, + "learning_rate": 3.0105501956288626e-06, + "loss": 0.71826959, + "num_input_tokens_seen": 62810545, + "step": 2923, + "time_per_iteration": 2.5018436908721924 + }, + { + "auxiliary_loss_clip": 0.01184657, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.05351758, + "balance_loss_mlp": 1.02095151, + "epoch": 0.35159021222870196, + "flos": 15267565923840.0, + "grad_norm": 2.0380662326793386, + "language_loss": 0.72570086, + "learning_rate": 3.0098778989258602e-06, + "loss": 0.74785101, + "num_input_tokens_seen": 62829155, + "step": 2924, + "time_per_iteration": 3.310882329940796 + }, + { + "auxiliary_loss_clip": 0.01144729, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.048388, + "balance_loss_mlp": 1.02271104, + "epoch": 0.35171045511934107, + "flos": 13984154000640.0, + "grad_norm": 2.4696782085541873, + "language_loss": 0.8838051, + "learning_rate": 3.009205449025183e-06, + "loss": 0.90557128, + "num_input_tokens_seen": 62845350, + "step": 2925, + "time_per_iteration": 2.6281306743621826 + }, + { + "auxiliary_loss_clip": 0.01147914, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.04759514, + "balance_loss_mlp": 1.02293587, + "epoch": 0.3518306980099802, + "flos": 14283434119680.0, + "grad_norm": 2.037533634168966, + "language_loss": 0.6332128, + "learning_rate": 3.008532846028842e-06, + "loss": 0.65501404, + "num_input_tokens_seen": 62862110, + "step": 2926, + "time_per_iteration": 2.5037240982055664 + }, + { + "auxiliary_loss_clip": 0.01196237, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.05563378, + "balance_loss_mlp": 1.02486074, + "epoch": 0.35195094090061924, + "flos": 27052872958080.0, + "grad_norm": 2.797148268544584, + "language_loss": 0.72386122, + "learning_rate": 3.0078600900388694e-06, + "loss": 0.74617136, + "num_input_tokens_seen": 62882415, + "step": 2927, + "time_per_iteration": 3.164428234100342 + }, + { + "auxiliary_loss_clip": 0.01139407, + "auxiliary_loss_mlp": 0.01029828, + "balance_loss_clip": 1.04641426, + "balance_loss_mlp": 1.02028489, + "epoch": 0.35207118379125835, + "flos": 25629266252160.0, + "grad_norm": 2.4719103956670727, + "language_loss": 0.74218988, + "learning_rate": 3.007187181157323e-06, + "loss": 0.76388216, + "num_input_tokens_seen": 62902425, + "step": 2928, + "time_per_iteration": 4.118063688278198 + }, + { + "auxiliary_loss_clip": 0.01112754, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.04469979, + "balance_loss_mlp": 1.0205462, + "epoch": 0.35219142668189746, + "flos": 18004713085440.0, + "grad_norm": 2.2372592514393603, + "language_loss": 0.68368363, + "learning_rate": 3.006514119486282e-06, + "loss": 0.7051059, + "num_input_tokens_seen": 62919255, + "step": 2929, + "time_per_iteration": 2.6357386112213135 + }, + { + "auxiliary_loss_clip": 0.01143555, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.04823184, + "balance_loss_mlp": 1.01865637, + "epoch": 0.3523116695725365, + "flos": 14028109269120.0, + "grad_norm": 2.1325371754391305, + "language_loss": 0.69424045, + "learning_rate": 3.005840905127849e-06, + "loss": 0.71595442, + "num_input_tokens_seen": 62936160, + "step": 2930, + "time_per_iteration": 2.5993599891662598 + }, + { + "auxiliary_loss_clip": 0.01193495, + "auxiliary_loss_mlp": 0.01029188, + "balance_loss_clip": 1.05688131, + "balance_loss_mlp": 1.02008057, + "epoch": 0.3524319124631756, + "flos": 21433966479360.0, + "grad_norm": 2.098199068919238, + "language_loss": 0.86806887, + "learning_rate": 3.0051675381841516e-06, + "loss": 0.89029568, + "num_input_tokens_seen": 62953470, + "step": 2931, + "time_per_iteration": 2.456681489944458 + }, + { + "auxiliary_loss_clip": 0.01105043, + "auxiliary_loss_mlp": 0.00764441, + "balance_loss_clip": 1.04289591, + "balance_loss_mlp": 1.00089908, + "epoch": 0.3525521553538147, + "flos": 26322773114880.0, + "grad_norm": 1.556935297345776, + "language_loss": 0.77151555, + "learning_rate": 3.0044940187573363e-06, + "loss": 0.79021037, + "num_input_tokens_seen": 62974480, + "step": 2932, + "time_per_iteration": 2.68114972114563 + }, + { + "auxiliary_loss_clip": 0.01182464, + "auxiliary_loss_mlp": 0.01036334, + "balance_loss_clip": 1.05190444, + "balance_loss_mlp": 1.02723849, + "epoch": 0.3526723982444538, + "flos": 21543314457600.0, + "grad_norm": 1.7418988424933224, + "language_loss": 0.65024471, + "learning_rate": 3.003820346949578e-06, + "loss": 0.67243266, + "num_input_tokens_seen": 62992560, + "step": 2933, + "time_per_iteration": 2.484895944595337 + }, + { + "auxiliary_loss_clip": 0.01193678, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.05458057, + "balance_loss_mlp": 1.02506006, + "epoch": 0.3527926411350929, + "flos": 23733649900800.0, + "grad_norm": 1.9235227648894597, + "language_loss": 0.79110312, + "learning_rate": 3.003146522863071e-06, + "loss": 0.81338406, + "num_input_tokens_seen": 63013445, + "step": 2934, + "time_per_iteration": 2.4880247116088867 + }, + { + "auxiliary_loss_clip": 0.01164279, + "auxiliary_loss_mlp": 0.01032884, + "balance_loss_clip": 1.05473506, + "balance_loss_mlp": 1.02394903, + "epoch": 0.35291288402573195, + "flos": 30445461544320.0, + "grad_norm": 2.539468856966958, + "language_loss": 0.86193335, + "learning_rate": 3.0024725466000345e-06, + "loss": 0.88390493, + "num_input_tokens_seen": 63033400, + "step": 2935, + "time_per_iteration": 2.5903403759002686 + }, + { + "auxiliary_loss_clip": 0.0118095, + "auxiliary_loss_mlp": 0.01025514, + "balance_loss_clip": 1.0561142, + "balance_loss_mlp": 1.01709819, + "epoch": 0.35303312691637107, + "flos": 23112179763840.0, + "grad_norm": 1.7194895663354746, + "language_loss": 0.78653032, + "learning_rate": 3.0017984182627087e-06, + "loss": 0.80859494, + "num_input_tokens_seen": 63052725, + "step": 2936, + "time_per_iteration": 2.568108558654785 + }, + { + "auxiliary_loss_clip": 0.01150553, + "auxiliary_loss_mlp": 0.00764173, + "balance_loss_clip": 1.04913497, + "balance_loss_mlp": 1.00091624, + "epoch": 0.3531533698070102, + "flos": 21835699165440.0, + "grad_norm": 2.089502575316517, + "language_loss": 0.82140255, + "learning_rate": 3.00112413795336e-06, + "loss": 0.84054983, + "num_input_tokens_seen": 63072560, + "step": 2937, + "time_per_iteration": 2.563861846923828 + }, + { + "auxiliary_loss_clip": 0.0116006, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.04707634, + "balance_loss_mlp": 1.02334583, + "epoch": 0.35327361269764923, + "flos": 15778969810560.0, + "grad_norm": 1.9563094507759902, + "language_loss": 0.79935193, + "learning_rate": 3.000449705774275e-06, + "loss": 0.8212775, + "num_input_tokens_seen": 63090800, + "step": 2938, + "time_per_iteration": 2.491534948348999 + }, + { + "auxiliary_loss_clip": 0.01180544, + "auxiliary_loss_mlp": 0.0102837, + "balance_loss_clip": 1.0556711, + "balance_loss_mlp": 1.01917863, + "epoch": 0.35339385558828834, + "flos": 22090413484800.0, + "grad_norm": 2.1610588918383775, + "language_loss": 0.71308088, + "learning_rate": 2.9997751218277654e-06, + "loss": 0.73517001, + "num_input_tokens_seen": 63108955, + "step": 2939, + "time_per_iteration": 2.4800937175750732 + }, + { + "auxiliary_loss_clip": 0.01196004, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.05655324, + "balance_loss_mlp": 1.0205909, + "epoch": 0.35351409847892745, + "flos": 24165008328960.0, + "grad_norm": 1.933678978650086, + "language_loss": 0.77712452, + "learning_rate": 2.999100386216166e-06, + "loss": 0.79938447, + "num_input_tokens_seen": 63127895, + "step": 2940, + "time_per_iteration": 2.5025548934936523 + }, + { + "auxiliary_loss_clip": 0.01166447, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.05341065, + "balance_loss_mlp": 1.02160883, + "epoch": 0.3536343413695665, + "flos": 27052298340480.0, + "grad_norm": 1.6662009467643306, + "language_loss": 0.73934233, + "learning_rate": 2.998425499041831e-06, + "loss": 0.76131219, + "num_input_tokens_seen": 63148410, + "step": 2941, + "time_per_iteration": 2.6178712844848633 + }, + { + "auxiliary_loss_clip": 0.01083086, + "auxiliary_loss_mlp": 0.01001648, + "balance_loss_clip": 1.02351975, + "balance_loss_mlp": 1.00008631, + "epoch": 0.3537545842602056, + "flos": 65991066370560.0, + "grad_norm": 1.277865412599564, + "language_loss": 0.64492404, + "learning_rate": 2.997750460407142e-06, + "loss": 0.66577148, + "num_input_tokens_seen": 63209765, + "step": 2942, + "time_per_iteration": 3.102172613143921 + }, + { + "auxiliary_loss_clip": 0.01152851, + "auxiliary_loss_mlp": 0.01025208, + "balance_loss_clip": 1.04814744, + "balance_loss_mlp": 1.01558828, + "epoch": 0.35387482715084473, + "flos": 18436897526400.0, + "grad_norm": 3.7883632652060086, + "language_loss": 0.70078856, + "learning_rate": 2.997075270414501e-06, + "loss": 0.72256923, + "num_input_tokens_seen": 63226980, + "step": 2943, + "time_per_iteration": 2.53021502494812 + }, + { + "auxiliary_loss_clip": 0.010717, + "auxiliary_loss_mlp": 0.01001744, + "balance_loss_clip": 1.02472997, + "balance_loss_mlp": 1.00019407, + "epoch": 0.3539950700414838, + "flos": 65588579498880.0, + "grad_norm": 0.6999483844428628, + "language_loss": 0.57707971, + "learning_rate": 2.9963999291663347e-06, + "loss": 0.59781414, + "num_input_tokens_seen": 63292760, + "step": 2944, + "time_per_iteration": 3.0967941284179688 + }, + { + "auxiliary_loss_clip": 0.0114061, + "auxiliary_loss_mlp": 0.01033647, + "balance_loss_clip": 1.05423665, + "balance_loss_mlp": 1.0249145, + "epoch": 0.3541153129321229, + "flos": 20521655919360.0, + "grad_norm": 2.7749019869139984, + "language_loss": 0.74105537, + "learning_rate": 2.9957244367650915e-06, + "loss": 0.76279795, + "num_input_tokens_seen": 63309005, + "step": 2945, + "time_per_iteration": 2.5676310062408447 + }, + { + "auxiliary_loss_clip": 0.01130921, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.04929161, + "balance_loss_mlp": 1.02034199, + "epoch": 0.354235555822762, + "flos": 19573578391680.0, + "grad_norm": 2.832813797005831, + "language_loss": 0.83783197, + "learning_rate": 2.9950487933132425e-06, + "loss": 0.85943919, + "num_input_tokens_seen": 63326420, + "step": 2946, + "time_per_iteration": 2.5886452198028564 + }, + { + "auxiliary_loss_clip": 0.0118442, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.05441689, + "balance_loss_mlp": 1.02315891, + "epoch": 0.35435579871340106, + "flos": 20777268078720.0, + "grad_norm": 1.9671283874581984, + "language_loss": 0.71307999, + "learning_rate": 2.994372998913283e-06, + "loss": 0.73524117, + "num_input_tokens_seen": 63344925, + "step": 2947, + "time_per_iteration": 2.487884998321533 + }, + { + "auxiliary_loss_clip": 0.01166548, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.05571032, + "balance_loss_mlp": 1.01990271, + "epoch": 0.35447604160404017, + "flos": 23951807153280.0, + "grad_norm": 4.0280884712870355, + "language_loss": 0.62163466, + "learning_rate": 2.99369705366773e-06, + "loss": 0.64358813, + "num_input_tokens_seen": 63365170, + "step": 2948, + "time_per_iteration": 2.5440735816955566 + }, + { + "auxiliary_loss_clip": 0.01162289, + "auxiliary_loss_mlp": 0.01026343, + "balance_loss_clip": 1.05306792, + "balance_loss_mlp": 1.01752138, + "epoch": 0.3545962844946792, + "flos": 23435662671360.0, + "grad_norm": 2.115986406565818, + "language_loss": 0.8210367, + "learning_rate": 2.9930209576791244e-06, + "loss": 0.84292299, + "num_input_tokens_seen": 63383645, + "step": 2949, + "time_per_iteration": 2.5209121704101562 + }, + { + "auxiliary_loss_clip": 0.01177183, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.05382192, + "balance_loss_mlp": 1.02277923, + "epoch": 0.35471652738531834, + "flos": 22085134185600.0, + "grad_norm": 1.8560774614613498, + "language_loss": 0.63254881, + "learning_rate": 2.9923447110500285e-06, + "loss": 0.65463191, + "num_input_tokens_seen": 63402390, + "step": 2950, + "time_per_iteration": 2.474945306777954 + }, + { + "auxiliary_loss_clip": 0.01168885, + "auxiliary_loss_mlp": 0.0103284, + "balance_loss_clip": 1.05291057, + "balance_loss_mlp": 1.02417922, + "epoch": 0.35483677027595745, + "flos": 27341881787520.0, + "grad_norm": 1.4306212983640971, + "language_loss": 0.7556594, + "learning_rate": 2.9916683138830295e-06, + "loss": 0.77767658, + "num_input_tokens_seen": 63423055, + "step": 2951, + "time_per_iteration": 3.255747079849243 + }, + { + "auxiliary_loss_clip": 0.01160722, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.05235219, + "balance_loss_mlp": 1.02486086, + "epoch": 0.3549570131665965, + "flos": 13516166678400.0, + "grad_norm": 2.0452022724648202, + "language_loss": 0.80733633, + "learning_rate": 2.9909917662807353e-06, + "loss": 0.82928753, + "num_input_tokens_seen": 63440855, + "step": 2952, + "time_per_iteration": 2.5028767585754395 + }, + { + "auxiliary_loss_clip": 0.01175315, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.05144119, + "balance_loss_mlp": 1.02168965, + "epoch": 0.3550772560572356, + "flos": 20887549810560.0, + "grad_norm": 2.2839256672397084, + "language_loss": 0.6978454, + "learning_rate": 2.9903150683457783e-06, + "loss": 0.71990836, + "num_input_tokens_seen": 63459400, + "step": 2953, + "time_per_iteration": 2.4838714599609375 + }, + { + "auxiliary_loss_clip": 0.01165377, + "auxiliary_loss_mlp": 0.01025176, + "balance_loss_clip": 1.05061817, + "balance_loss_mlp": 1.01657546, + "epoch": 0.3551974989478747, + "flos": 20194042947840.0, + "grad_norm": 1.6514649143781757, + "language_loss": 0.64932835, + "learning_rate": 2.9896382201808126e-06, + "loss": 0.67123389, + "num_input_tokens_seen": 63476800, + "step": 2954, + "time_per_iteration": 4.005391836166382 + }, + { + "auxiliary_loss_clip": 0.0119475, + "auxiliary_loss_mlp": 0.01028377, + "balance_loss_clip": 1.05494833, + "balance_loss_mlp": 1.01952529, + "epoch": 0.3553177418385138, + "flos": 19828831415040.0, + "grad_norm": 2.21143138931372, + "language_loss": 0.81018388, + "learning_rate": 2.988961221888516e-06, + "loss": 0.83241516, + "num_input_tokens_seen": 63493475, + "step": 2955, + "time_per_iteration": 3.2053122520446777 + }, + { + "auxiliary_loss_clip": 0.01139909, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.04876518, + "balance_loss_mlp": 1.02158666, + "epoch": 0.3554379847291529, + "flos": 14829132516480.0, + "grad_norm": 2.640397780999298, + "language_loss": 0.78795302, + "learning_rate": 2.988284073571589e-06, + "loss": 0.80965704, + "num_input_tokens_seen": 63509560, + "step": 2956, + "time_per_iteration": 2.5139997005462646 + }, + { + "auxiliary_loss_clip": 0.01181499, + "auxiliary_loss_mlp": 0.00763665, + "balance_loss_clip": 1.05479956, + "balance_loss_mlp": 1.00099087, + "epoch": 0.355558227619792, + "flos": 20485350247680.0, + "grad_norm": 2.7674243625798693, + "language_loss": 0.73201692, + "learning_rate": 2.9876067753327528e-06, + "loss": 0.75146866, + "num_input_tokens_seen": 63527290, + "step": 2957, + "time_per_iteration": 2.5094165802001953 + }, + { + "auxiliary_loss_clip": 0.01182476, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.05305433, + "balance_loss_mlp": 1.02946818, + "epoch": 0.35567847051043106, + "flos": 37663613256960.0, + "grad_norm": 1.7893643607995722, + "language_loss": 0.80349624, + "learning_rate": 2.986929327274754e-06, + "loss": 0.82571101, + "num_input_tokens_seen": 63547870, + "step": 2958, + "time_per_iteration": 2.6088979244232178 + }, + { + "auxiliary_loss_clip": 0.01179597, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.05643702, + "balance_loss_mlp": 1.02199388, + "epoch": 0.35579871340107017, + "flos": 26943058103040.0, + "grad_norm": 1.6010258712772498, + "language_loss": 0.78947467, + "learning_rate": 2.9862517295003617e-06, + "loss": 0.81157732, + "num_input_tokens_seen": 63568285, + "step": 2959, + "time_per_iteration": 2.5211431980133057 + }, + { + "auxiliary_loss_clip": 0.01146679, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.04777873, + "balance_loss_mlp": 1.01926696, + "epoch": 0.3559189562917093, + "flos": 28293335193600.0, + "grad_norm": 1.4762875914256506, + "language_loss": 0.72553027, + "learning_rate": 2.9855739821123654e-06, + "loss": 0.74727237, + "num_input_tokens_seen": 63589865, + "step": 2960, + "time_per_iteration": 2.586106777191162 + }, + { + "auxiliary_loss_clip": 0.01173211, + "auxiliary_loss_mlp": 0.01026894, + "balance_loss_clip": 1.05253005, + "balance_loss_mlp": 1.01787555, + "epoch": 0.35603919918234833, + "flos": 25664063552640.0, + "grad_norm": 1.715690629277751, + "language_loss": 0.81939, + "learning_rate": 2.98489608521358e-06, + "loss": 0.84139109, + "num_input_tokens_seen": 63609805, + "step": 2961, + "time_per_iteration": 2.5166659355163574 + }, + { + "auxiliary_loss_clip": 0.01184547, + "auxiliary_loss_mlp": 0.00763424, + "balance_loss_clip": 1.05448675, + "balance_loss_mlp": 1.00103712, + "epoch": 0.35615944207298744, + "flos": 23000856537600.0, + "grad_norm": 2.1048408449056066, + "language_loss": 0.79396176, + "learning_rate": 2.9842180389068425e-06, + "loss": 0.81344146, + "num_input_tokens_seen": 63627115, + "step": 2962, + "time_per_iteration": 2.4896087646484375 + }, + { + "auxiliary_loss_clip": 0.0106199, + "auxiliary_loss_mlp": 0.01004915, + "balance_loss_clip": 1.03027737, + "balance_loss_mlp": 1.00316286, + "epoch": 0.35627968496362655, + "flos": 68251283723520.0, + "grad_norm": 0.7710288514580721, + "language_loss": 0.59268218, + "learning_rate": 2.98353984329501e-06, + "loss": 0.61335123, + "num_input_tokens_seen": 63691460, + "step": 2963, + "time_per_iteration": 3.13114857673645 + }, + { + "auxiliary_loss_clip": 0.01165649, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.05404186, + "balance_loss_mlp": 1.02071202, + "epoch": 0.3563999278542656, + "flos": 22641714403200.0, + "grad_norm": 1.6094763097317621, + "language_loss": 0.7054069, + "learning_rate": 2.982861498480965e-06, + "loss": 0.72736251, + "num_input_tokens_seen": 63713840, + "step": 2964, + "time_per_iteration": 2.558201313018799 + }, + { + "auxiliary_loss_clip": 0.01144508, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.04619324, + "balance_loss_mlp": 1.0204289, + "epoch": 0.3565201707449047, + "flos": 25952533678080.0, + "grad_norm": 1.6155271100627757, + "language_loss": 0.82426095, + "learning_rate": 2.9821830045676122e-06, + "loss": 0.84599543, + "num_input_tokens_seen": 63733540, + "step": 2965, + "time_per_iteration": 2.580876350402832 + }, + { + "auxiliary_loss_clip": 0.011957, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.05643177, + "balance_loss_mlp": 1.02447271, + "epoch": 0.3566404136355438, + "flos": 28475725478400.0, + "grad_norm": 1.6851530867551345, + "language_loss": 0.72287077, + "learning_rate": 2.9815043616578793e-06, + "loss": 0.74515629, + "num_input_tokens_seen": 63754335, + "step": 2966, + "time_per_iteration": 2.501889228820801 + }, + { + "auxiliary_loss_clip": 0.01145151, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.047521, + "balance_loss_mlp": 1.02301192, + "epoch": 0.3567606565261829, + "flos": 38363117690880.0, + "grad_norm": 1.9672735720964156, + "language_loss": 0.76793003, + "learning_rate": 2.9808255698547145e-06, + "loss": 0.78969812, + "num_input_tokens_seen": 63777135, + "step": 2967, + "time_per_iteration": 2.697685480117798 + }, + { + "auxiliary_loss_clip": 0.01180829, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.05704069, + "balance_loss_mlp": 1.01862502, + "epoch": 0.356880899416822, + "flos": 21981029592960.0, + "grad_norm": 1.8907010493744505, + "language_loss": 0.79215145, + "learning_rate": 2.9801466292610913e-06, + "loss": 0.81423175, + "num_input_tokens_seen": 63797020, + "step": 2968, + "time_per_iteration": 2.492971420288086 + }, + { + "auxiliary_loss_clip": 0.01176303, + "auxiliary_loss_mlp": 0.01023819, + "balance_loss_clip": 1.05208409, + "balance_loss_mlp": 1.01534879, + "epoch": 0.35700114230746105, + "flos": 18989132198400.0, + "grad_norm": 1.892451886096719, + "language_loss": 0.80608523, + "learning_rate": 2.979467539980003e-06, + "loss": 0.82808644, + "num_input_tokens_seen": 63813810, + "step": 2969, + "time_per_iteration": 2.468421459197998 + }, + { + "auxiliary_loss_clip": 0.01181965, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.05523312, + "balance_loss_mlp": 1.02619553, + "epoch": 0.35712138519810016, + "flos": 19756112330880.0, + "grad_norm": 1.8376733447711207, + "language_loss": 0.76900375, + "learning_rate": 2.978788302114468e-06, + "loss": 0.79117119, + "num_input_tokens_seen": 63830925, + "step": 2970, + "time_per_iteration": 2.4757986068725586 + }, + { + "auxiliary_loss_clip": 0.01176111, + "auxiliary_loss_mlp": 0.01032546, + "balance_loss_clip": 1.05230856, + "balance_loss_mlp": 1.02348626, + "epoch": 0.35724162808873927, + "flos": 35183012008320.0, + "grad_norm": 1.9127139100265984, + "language_loss": 0.83265674, + "learning_rate": 2.9781089157675255e-06, + "loss": 0.85474336, + "num_input_tokens_seen": 63849385, + "step": 2971, + "time_per_iteration": 2.5789730548858643 + }, + { + "auxiliary_loss_clip": 0.0117568, + "auxiliary_loss_mlp": 0.01029915, + "balance_loss_clip": 1.05510092, + "balance_loss_mlp": 1.02098644, + "epoch": 0.3573618709793783, + "flos": 25556726736000.0, + "grad_norm": 1.6922252461455731, + "language_loss": 0.88475376, + "learning_rate": 2.977429381042238e-06, + "loss": 0.90680975, + "num_input_tokens_seen": 63870060, + "step": 2972, + "time_per_iteration": 2.519880533218384 + }, + { + "auxiliary_loss_clip": 0.01163935, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.05183411, + "balance_loss_mlp": 1.01985276, + "epoch": 0.35748211387001744, + "flos": 29132352051840.0, + "grad_norm": 2.0589665361419196, + "language_loss": 0.89450073, + "learning_rate": 2.9767496980416913e-06, + "loss": 0.91641653, + "num_input_tokens_seen": 63889355, + "step": 2973, + "time_per_iteration": 2.5562148094177246 + }, + { + "auxiliary_loss_clip": 0.01161397, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.05072212, + "balance_loss_mlp": 1.0231812, + "epoch": 0.35760235676065655, + "flos": 13954169122560.0, + "grad_norm": 2.120709317937726, + "language_loss": 0.80698943, + "learning_rate": 2.9760698668689914e-06, + "loss": 0.82892919, + "num_input_tokens_seen": 63905580, + "step": 2974, + "time_per_iteration": 2.4723055362701416 + }, + { + "auxiliary_loss_clip": 0.01180078, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.05320978, + "balance_loss_mlp": 1.01909184, + "epoch": 0.3577225996512956, + "flos": 44018688977280.0, + "grad_norm": 1.8622284197043966, + "language_loss": 0.71415317, + "learning_rate": 2.975389887627269e-06, + "loss": 0.73622686, + "num_input_tokens_seen": 63928180, + "step": 2975, + "time_per_iteration": 2.6653809547424316 + }, + { + "auxiliary_loss_clip": 0.01154487, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.05185485, + "balance_loss_mlp": 1.02483535, + "epoch": 0.3578428425419347, + "flos": 17055199013760.0, + "grad_norm": 2.2116613149784436, + "language_loss": 0.89979887, + "learning_rate": 2.9747097604196764e-06, + "loss": 0.92167342, + "num_input_tokens_seen": 63944825, + "step": 2976, + "time_per_iteration": 2.514312982559204 + }, + { + "auxiliary_loss_clip": 0.01055667, + "auxiliary_loss_mlp": 0.01002945, + "balance_loss_clip": 1.02691388, + "balance_loss_mlp": 1.00149667, + "epoch": 0.3579630854325738, + "flos": 71676550707840.0, + "grad_norm": 0.6686711694492313, + "language_loss": 0.56685817, + "learning_rate": 2.9740294853493875e-06, + "loss": 0.58744431, + "num_input_tokens_seen": 64016385, + "step": 2977, + "time_per_iteration": 3.373563289642334 + }, + { + "auxiliary_loss_clip": 0.01140126, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.04895449, + "balance_loss_mlp": 1.02203488, + "epoch": 0.3580833283232129, + "flos": 25046651652480.0, + "grad_norm": 1.9207310028797013, + "language_loss": 0.67075092, + "learning_rate": 2.9733490625196008e-06, + "loss": 0.69245982, + "num_input_tokens_seen": 64036245, + "step": 2978, + "time_per_iteration": 3.3269331455230713 + }, + { + "auxiliary_loss_clip": 0.01137166, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.04669976, + "balance_loss_mlp": 1.02358294, + "epoch": 0.358203571213852, + "flos": 13953127628160.0, + "grad_norm": 5.468672218663334, + "language_loss": 0.75303203, + "learning_rate": 2.9726684920335353e-06, + "loss": 0.7747221, + "num_input_tokens_seen": 64054110, + "step": 2979, + "time_per_iteration": 2.5449371337890625 + }, + { + "auxiliary_loss_clip": 0.01196361, + "auxiliary_loss_mlp": 0.0076423, + "balance_loss_clip": 1.05477118, + "balance_loss_mlp": 1.00118017, + "epoch": 0.35832381410449105, + "flos": 20302457172480.0, + "grad_norm": 2.4504333636026105, + "language_loss": 0.8175661, + "learning_rate": 2.971987773994432e-06, + "loss": 0.83717197, + "num_input_tokens_seen": 64070295, + "step": 2980, + "time_per_iteration": 3.241222381591797 + }, + { + "auxiliary_loss_clip": 0.01169243, + "auxiliary_loss_mlp": 0.01022011, + "balance_loss_clip": 1.04955971, + "balance_loss_mlp": 1.01373172, + "epoch": 0.35844405699513016, + "flos": 16983234115200.0, + "grad_norm": 2.6200780662412764, + "language_loss": 0.83041722, + "learning_rate": 2.9713069085055566e-06, + "loss": 0.85232973, + "num_input_tokens_seen": 64088605, + "step": 2981, + "time_per_iteration": 3.514241933822632 + }, + { + "auxiliary_loss_clip": 0.01149458, + "auxiliary_loss_mlp": 0.01025458, + "balance_loss_clip": 1.05024314, + "balance_loss_mlp": 1.016958, + "epoch": 0.35856429988576927, + "flos": 23216858974080.0, + "grad_norm": 1.595965763978449, + "language_loss": 0.78832895, + "learning_rate": 2.9706258956701958e-06, + "loss": 0.81007808, + "num_input_tokens_seen": 64108595, + "step": 2982, + "time_per_iteration": 3.38713002204895 + }, + { + "auxiliary_loss_clip": 0.01181422, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.05318177, + "balance_loss_mlp": 1.02160382, + "epoch": 0.3586845427764083, + "flos": 23034576430080.0, + "grad_norm": 2.234041527829723, + "language_loss": 0.77602327, + "learning_rate": 2.9699447355916575e-06, + "loss": 0.79814041, + "num_input_tokens_seen": 64127405, + "step": 2983, + "time_per_iteration": 2.4963278770446777 + }, + { + "auxiliary_loss_clip": 0.01191972, + "auxiliary_loss_mlp": 0.00763152, + "balance_loss_clip": 1.05453491, + "balance_loss_mlp": 1.00099254, + "epoch": 0.35880478566704743, + "flos": 20010682995840.0, + "grad_norm": 1.9272104410992146, + "language_loss": 0.7378509, + "learning_rate": 2.969263428373275e-06, + "loss": 0.75740218, + "num_input_tokens_seen": 64145755, + "step": 2984, + "time_per_iteration": 2.416092872619629 + }, + { + "auxiliary_loss_clip": 0.01166958, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.05198956, + "balance_loss_mlp": 1.02391839, + "epoch": 0.35892502855768654, + "flos": 13699095667200.0, + "grad_norm": 1.852289430670097, + "language_loss": 0.79407018, + "learning_rate": 2.9685819741184007e-06, + "loss": 0.81606603, + "num_input_tokens_seen": 64164195, + "step": 2985, + "time_per_iteration": 2.472299337387085 + }, + { + "auxiliary_loss_clip": 0.01143603, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.04989493, + "balance_loss_mlp": 1.02271533, + "epoch": 0.3590452714483256, + "flos": 18114096977280.0, + "grad_norm": 2.494779574374427, + "language_loss": 0.69101542, + "learning_rate": 2.967900372930411e-06, + "loss": 0.71276474, + "num_input_tokens_seen": 64182705, + "step": 2986, + "time_per_iteration": 2.520228385925293 + }, + { + "auxiliary_loss_clip": 0.01158289, + "auxiliary_loss_mlp": 0.01032464, + "balance_loss_clip": 1.05070567, + "balance_loss_mlp": 1.02335072, + "epoch": 0.3591655143389647, + "flos": 17749352321280.0, + "grad_norm": 4.960883703058457, + "language_loss": 0.78636014, + "learning_rate": 2.9672186249127046e-06, + "loss": 0.80826771, + "num_input_tokens_seen": 64202170, + "step": 2987, + "time_per_iteration": 2.494638442993164 + }, + { + "auxiliary_loss_clip": 0.01162521, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.05257308, + "balance_loss_mlp": 1.02298546, + "epoch": 0.3592857572296038, + "flos": 25224409082880.0, + "grad_norm": 2.008542588110922, + "language_loss": 0.7859664, + "learning_rate": 2.9665367301687014e-06, + "loss": 0.8079052, + "num_input_tokens_seen": 64220415, + "step": 2988, + "time_per_iteration": 2.5226986408233643 + }, + { + "auxiliary_loss_clip": 0.01156587, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.05172157, + "balance_loss_mlp": 1.02112556, + "epoch": 0.3594060001202429, + "flos": 29384408764800.0, + "grad_norm": 1.906975701038394, + "language_loss": 0.7613821, + "learning_rate": 2.965854688801845e-06, + "loss": 0.78324366, + "num_input_tokens_seen": 64242475, + "step": 2989, + "time_per_iteration": 2.560519218444824 + }, + { + "auxiliary_loss_clip": 0.01174219, + "auxiliary_loss_mlp": 0.01026766, + "balance_loss_clip": 1.04797161, + "balance_loss_mlp": 1.01839733, + "epoch": 0.359526243010882, + "flos": 17052900543360.0, + "grad_norm": 2.0720914539537745, + "language_loss": 0.76311678, + "learning_rate": 2.9651725009156005e-06, + "loss": 0.78512669, + "num_input_tokens_seen": 64260220, + "step": 2990, + "time_per_iteration": 2.442748546600342 + }, + { + "auxiliary_loss_clip": 0.01156623, + "auxiliary_loss_mlp": 0.01036043, + "balance_loss_clip": 1.04807699, + "balance_loss_mlp": 1.0265069, + "epoch": 0.3596464859015211, + "flos": 22965089569920.0, + "grad_norm": 1.674863332415262, + "language_loss": 0.74161518, + "learning_rate": 2.964490166613454e-06, + "loss": 0.76354188, + "num_input_tokens_seen": 64280145, + "step": 2991, + "time_per_iteration": 2.5128822326660156 + }, + { + "auxiliary_loss_clip": 0.01102105, + "auxiliary_loss_mlp": 0.01003859, + "balance_loss_clip": 1.02999473, + "balance_loss_mlp": 1.00252938, + "epoch": 0.35976672879216015, + "flos": 54739462590720.0, + "grad_norm": 0.7581655646714275, + "language_loss": 0.57796133, + "learning_rate": 2.963807685998917e-06, + "loss": 0.59902108, + "num_input_tokens_seen": 64336010, + "step": 2992, + "time_per_iteration": 2.8690991401672363 + }, + { + "auxiliary_loss_clip": 0.01139601, + "auxiliary_loss_mlp": 0.01025058, + "balance_loss_clip": 1.04726744, + "balance_loss_mlp": 1.01668382, + "epoch": 0.35988697168279926, + "flos": 43139020901760.0, + "grad_norm": 1.8389102963405752, + "language_loss": 0.78028035, + "learning_rate": 2.9631250591755196e-06, + "loss": 0.80192697, + "num_input_tokens_seen": 64358725, + "step": 2993, + "time_per_iteration": 2.7731502056121826 + }, + { + "auxiliary_loss_clip": 0.01159776, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.0526762, + "balance_loss_mlp": 1.02059841, + "epoch": 0.36000721457343837, + "flos": 35845600239360.0, + "grad_norm": 1.8759420810592262, + "language_loss": 0.57939368, + "learning_rate": 2.962442286246817e-06, + "loss": 0.601291, + "num_input_tokens_seen": 64381555, + "step": 2994, + "time_per_iteration": 2.6461756229400635 + }, + { + "auxiliary_loss_clip": 0.01167741, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.05144024, + "balance_loss_mlp": 1.01837325, + "epoch": 0.3601274574640774, + "flos": 18291100222080.0, + "grad_norm": 2.184968785056233, + "language_loss": 0.69659621, + "learning_rate": 2.9617593673163853e-06, + "loss": 0.71854007, + "num_input_tokens_seen": 64400375, + "step": 2995, + "time_per_iteration": 2.484778881072998 + }, + { + "auxiliary_loss_clip": 0.01166768, + "auxiliary_loss_mlp": 0.01022543, + "balance_loss_clip": 1.0495764, + "balance_loss_mlp": 1.01474094, + "epoch": 0.36024770035471654, + "flos": 13333955961600.0, + "grad_norm": 2.4604765506634236, + "language_loss": 0.77232778, + "learning_rate": 2.9610763024878216e-06, + "loss": 0.79422086, + "num_input_tokens_seen": 64415880, + "step": 2996, + "time_per_iteration": 2.4713191986083984 + }, + { + "auxiliary_loss_clip": 0.01158087, + "auxiliary_loss_mlp": 0.01038232, + "balance_loss_clip": 1.0508213, + "balance_loss_mlp": 1.02937508, + "epoch": 0.3603679432453556, + "flos": 20267013427200.0, + "grad_norm": 1.6515793612720704, + "language_loss": 0.91609943, + "learning_rate": 2.960393091864747e-06, + "loss": 0.93806255, + "num_input_tokens_seen": 64434260, + "step": 2997, + "time_per_iteration": 2.5082666873931885 + }, + { + "auxiliary_loss_clip": 0.01167687, + "auxiliary_loss_mlp": 0.01021837, + "balance_loss_clip": 1.05470765, + "balance_loss_mlp": 1.01342726, + "epoch": 0.3604881861359947, + "flos": 22451135817600.0, + "grad_norm": 1.721073395514577, + "language_loss": 0.74719965, + "learning_rate": 2.959709735550804e-06, + "loss": 0.76909494, + "num_input_tokens_seen": 64453855, + "step": 2998, + "time_per_iteration": 2.5285511016845703 + }, + { + "auxiliary_loss_clip": 0.01138217, + "auxiliary_loss_mlp": 0.01026094, + "balance_loss_clip": 1.04902458, + "balance_loss_mlp": 1.01755297, + "epoch": 0.3606084290266338, + "flos": 22054251467520.0, + "grad_norm": 2.05436931736247, + "language_loss": 0.76149011, + "learning_rate": 2.9590262336496575e-06, + "loss": 0.78313327, + "num_input_tokens_seen": 64473585, + "step": 2999, + "time_per_iteration": 2.5666000843048096 + }, + { + "auxiliary_loss_clip": 0.01146954, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.05244696, + "balance_loss_mlp": 1.02529263, + "epoch": 0.36072867191727287, + "flos": 15632921111040.0, + "grad_norm": 2.0297915806796523, + "language_loss": 0.85136294, + "learning_rate": 2.9583425862649936e-06, + "loss": 0.87317753, + "num_input_tokens_seen": 64491720, + "step": 3000, + "time_per_iteration": 2.513604164123535 + }, + { + "auxiliary_loss_clip": 0.01196364, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.05697274, + "balance_loss_mlp": 1.02330375, + "epoch": 0.360848914807912, + "flos": 19677000625920.0, + "grad_norm": 2.374962428819542, + "language_loss": 0.73927605, + "learning_rate": 2.9576587935005215e-06, + "loss": 0.76155967, + "num_input_tokens_seen": 64509800, + "step": 3001, + "time_per_iteration": 2.4502017498016357 + }, + { + "auxiliary_loss_clip": 0.01179419, + "auxiliary_loss_mlp": 0.01026129, + "balance_loss_clip": 1.05162978, + "balance_loss_mlp": 1.01736116, + "epoch": 0.3609691576985511, + "flos": 18877808972160.0, + "grad_norm": 7.289746687845646, + "language_loss": 0.72044164, + "learning_rate": 2.9569748554599713e-06, + "loss": 0.74249715, + "num_input_tokens_seen": 64525410, + "step": 3002, + "time_per_iteration": 2.4753754138946533 + }, + { + "auxiliary_loss_clip": 0.01163817, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.05151999, + "balance_loss_mlp": 1.02222991, + "epoch": 0.36108940058919015, + "flos": 42224088648960.0, + "grad_norm": 2.119903577841608, + "language_loss": 0.7318235, + "learning_rate": 2.956290772247097e-06, + "loss": 0.75376797, + "num_input_tokens_seen": 64544085, + "step": 3003, + "time_per_iteration": 2.7162811756134033 + }, + { + "auxiliary_loss_clip": 0.01127654, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.04863822, + "balance_loss_mlp": 1.02190399, + "epoch": 0.36120964347982926, + "flos": 23185150243200.0, + "grad_norm": 1.8065066122504105, + "language_loss": 0.72831357, + "learning_rate": 2.9556065439656724e-06, + "loss": 0.74989212, + "num_input_tokens_seen": 64563135, + "step": 3004, + "time_per_iteration": 3.3535516262054443 + }, + { + "auxiliary_loss_clip": 0.01112704, + "auxiliary_loss_mlp": 0.01029042, + "balance_loss_clip": 1.0423696, + "balance_loss_mlp": 1.02055395, + "epoch": 0.36132988637046837, + "flos": 18113055482880.0, + "grad_norm": 1.8402972987188115, + "language_loss": 0.81524402, + "learning_rate": 2.9549221707194952e-06, + "loss": 0.83666146, + "num_input_tokens_seen": 64581985, + "step": 3005, + "time_per_iteration": 2.5822973251342773 + }, + { + "auxiliary_loss_clip": 0.01180805, + "auxiliary_loss_mlp": 0.01028146, + "balance_loss_clip": 1.05465746, + "balance_loss_mlp": 1.01970625, + "epoch": 0.3614501292611074, + "flos": 27813101333760.0, + "grad_norm": 1.9889031989041093, + "language_loss": 0.72706735, + "learning_rate": 2.954237652612384e-06, + "loss": 0.74915683, + "num_input_tokens_seen": 64601035, + "step": 3006, + "time_per_iteration": 2.5357017517089844 + }, + { + "auxiliary_loss_clip": 0.01161113, + "auxiliary_loss_mlp": 0.01023943, + "balance_loss_clip": 1.05082726, + "balance_loss_mlp": 1.01586032, + "epoch": 0.36157037215174653, + "flos": 22634926732800.0, + "grad_norm": 2.0455919601986374, + "language_loss": 0.84189796, + "learning_rate": 2.9535529897481796e-06, + "loss": 0.86374849, + "num_input_tokens_seen": 64618580, + "step": 3007, + "time_per_iteration": 4.039163589477539 + }, + { + "auxiliary_loss_clip": 0.01194095, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.0542419, + "balance_loss_mlp": 1.01810813, + "epoch": 0.36169061504238564, + "flos": 12600839376000.0, + "grad_norm": 2.2946750586251627, + "language_loss": 0.77446079, + "learning_rate": 2.9528681822307446e-06, + "loss": 0.79666901, + "num_input_tokens_seen": 64635430, + "step": 3008, + "time_per_iteration": 2.4106006622314453 + }, + { + "auxiliary_loss_clip": 0.01177201, + "auxiliary_loss_mlp": 0.00762955, + "balance_loss_clip": 1.05690312, + "balance_loss_mlp": 1.0010829, + "epoch": 0.3618108579330247, + "flos": 26684644682880.0, + "grad_norm": 2.6094297926685104, + "language_loss": 0.82253683, + "learning_rate": 2.952183230163964e-06, + "loss": 0.84193838, + "num_input_tokens_seen": 64655005, + "step": 3009, + "time_per_iteration": 3.2444658279418945 + }, + { + "auxiliary_loss_clip": 0.01144756, + "auxiliary_loss_mlp": 0.01024377, + "balance_loss_clip": 1.04775929, + "balance_loss_mlp": 1.01633048, + "epoch": 0.3619311008236638, + "flos": 22817029708800.0, + "grad_norm": 2.0116165362627996, + "language_loss": 0.72824228, + "learning_rate": 2.9514981336517448e-06, + "loss": 0.7499336, + "num_input_tokens_seen": 64674775, + "step": 3010, + "time_per_iteration": 2.5562846660614014 + }, + { + "auxiliary_loss_clip": 0.0117641, + "auxiliary_loss_mlp": 0.01028897, + "balance_loss_clip": 1.05362415, + "balance_loss_mlp": 1.02023697, + "epoch": 0.36205134371430286, + "flos": 25919603884800.0, + "grad_norm": 2.055639578545794, + "language_loss": 0.81225431, + "learning_rate": 2.950812892798015e-06, + "loss": 0.83430737, + "num_input_tokens_seen": 64695670, + "step": 3011, + "time_per_iteration": 2.50288987159729 + }, + { + "auxiliary_loss_clip": 0.01132177, + "auxiliary_loss_mlp": 0.00762941, + "balance_loss_clip": 1.05102026, + "balance_loss_mlp": 1.00105715, + "epoch": 0.362171586604942, + "flos": 26139592730880.0, + "grad_norm": 1.7843502608677646, + "language_loss": 0.87061656, + "learning_rate": 2.9501275077067256e-06, + "loss": 0.88956773, + "num_input_tokens_seen": 64716290, + "step": 3012, + "time_per_iteration": 2.61016583442688 + }, + { + "auxiliary_loss_clip": 0.01104562, + "auxiliary_loss_mlp": 0.01028619, + "balance_loss_clip": 1.04347646, + "balance_loss_mlp": 1.02066803, + "epoch": 0.3622918294955811, + "flos": 28074208273920.0, + "grad_norm": 1.5426646371693207, + "language_loss": 0.88605332, + "learning_rate": 2.949441978481848e-06, + "loss": 0.90738517, + "num_input_tokens_seen": 64737190, + "step": 3013, + "time_per_iteration": 2.6564478874206543 + }, + { + "auxiliary_loss_clip": 0.01153846, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.049703, + "balance_loss_mlp": 1.02222157, + "epoch": 0.36241207238622014, + "flos": 19828005402240.0, + "grad_norm": 2.2772051931441397, + "language_loss": 0.79984951, + "learning_rate": 2.9487563052273778e-06, + "loss": 0.82169944, + "num_input_tokens_seen": 64753950, + "step": 3014, + "time_per_iteration": 2.52813720703125 + }, + { + "auxiliary_loss_clip": 0.0117483, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.05601382, + "balance_loss_mlp": 1.02257419, + "epoch": 0.36253231527685925, + "flos": 21397158017280.0, + "grad_norm": 1.8578505001811427, + "language_loss": 0.853333, + "learning_rate": 2.94807048804733e-06, + "loss": 0.87538928, + "num_input_tokens_seen": 64773570, + "step": 3015, + "time_per_iteration": 2.4718966484069824 + }, + { + "auxiliary_loss_clip": 0.0115187, + "auxiliary_loss_mlp": 0.01029484, + "balance_loss_clip": 1.04650152, + "balance_loss_mlp": 1.02051377, + "epoch": 0.36265255816749836, + "flos": 18362885552640.0, + "grad_norm": 1.7588456577501477, + "language_loss": 0.89936584, + "learning_rate": 2.9473845270457434e-06, + "loss": 0.92117941, + "num_input_tokens_seen": 64790385, + "step": 3016, + "time_per_iteration": 2.5161244869232178 + }, + { + "auxiliary_loss_clip": 0.01154258, + "auxiliary_loss_mlp": 0.01026823, + "balance_loss_clip": 1.04842019, + "balance_loss_mlp": 1.01872253, + "epoch": 0.3627728010581374, + "flos": 18660046769280.0, + "grad_norm": 2.6651258765279926, + "language_loss": 0.69879234, + "learning_rate": 2.946698422326677e-06, + "loss": 0.72060311, + "num_input_tokens_seen": 64807845, + "step": 3017, + "time_per_iteration": 2.48779296875 + }, + { + "auxiliary_loss_clip": 0.01129531, + "auxiliary_loss_mlp": 0.01027123, + "balance_loss_clip": 1.04301262, + "balance_loss_mlp": 1.01891541, + "epoch": 0.36289304394877653, + "flos": 27524272072320.0, + "grad_norm": 2.1718563079735094, + "language_loss": 0.79315299, + "learning_rate": 2.946012173994213e-06, + "loss": 0.81471956, + "num_input_tokens_seen": 64827630, + "step": 3018, + "time_per_iteration": 2.6118555068969727 + }, + { + "auxiliary_loss_clip": 0.0117195, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.05392432, + "balance_loss_mlp": 1.01781774, + "epoch": 0.36301328683941564, + "flos": 34533244932480.0, + "grad_norm": 1.4249237960596954, + "language_loss": 0.67468995, + "learning_rate": 2.945325782152454e-06, + "loss": 0.69666958, + "num_input_tokens_seen": 64850665, + "step": 3019, + "time_per_iteration": 2.6009719371795654 + }, + { + "auxiliary_loss_clip": 0.01163061, + "auxiliary_loss_mlp": 0.01024863, + "balance_loss_clip": 1.04839861, + "balance_loss_mlp": 1.01738834, + "epoch": 0.3631335297300547, + "flos": 19025976574080.0, + "grad_norm": 1.9904886021081238, + "language_loss": 0.79073894, + "learning_rate": 2.9446392469055257e-06, + "loss": 0.81261826, + "num_input_tokens_seen": 64868700, + "step": 3020, + "time_per_iteration": 2.493896722793579 + }, + { + "auxiliary_loss_clip": 0.0114352, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.05338264, + "balance_loss_mlp": 1.01984859, + "epoch": 0.3632537726206938, + "flos": 19536769929600.0, + "grad_norm": 2.0139607658520187, + "language_loss": 0.79792511, + "learning_rate": 2.9439525683575745e-06, + "loss": 0.81963873, + "num_input_tokens_seen": 64887620, + "step": 3021, + "time_per_iteration": 2.5338191986083984 + }, + { + "auxiliary_loss_clip": 0.01197202, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.05816674, + "balance_loss_mlp": 1.01959157, + "epoch": 0.3633740155113329, + "flos": 21068611292160.0, + "grad_norm": 5.0748488845131865, + "language_loss": 0.74883235, + "learning_rate": 2.9432657466127694e-06, + "loss": 0.77108824, + "num_input_tokens_seen": 64907190, + "step": 3022, + "time_per_iteration": 2.456545829772949 + }, + { + "auxiliary_loss_clip": 0.0113551, + "auxiliary_loss_mlp": 0.01026245, + "balance_loss_clip": 1.05209208, + "balance_loss_mlp": 1.01827264, + "epoch": 0.36349425840197197, + "flos": 20298722158080.0, + "grad_norm": 1.7785154378363923, + "language_loss": 0.76676941, + "learning_rate": 2.9425787817753007e-06, + "loss": 0.788387, + "num_input_tokens_seen": 64925850, + "step": 3023, + "time_per_iteration": 2.5770018100738525 + }, + { + "auxiliary_loss_clip": 0.01149594, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.05003405, + "balance_loss_mlp": 1.02131748, + "epoch": 0.3636145012926111, + "flos": 29716762331520.0, + "grad_norm": 1.5773029959983795, + "language_loss": 0.70998418, + "learning_rate": 2.94189167394938e-06, + "loss": 0.73177218, + "num_input_tokens_seen": 64948285, + "step": 3024, + "time_per_iteration": 2.6074488162994385 + }, + { + "auxiliary_loss_clip": 0.0119417, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.05728912, + "balance_loss_mlp": 1.02518523, + "epoch": 0.3637347441832502, + "flos": 21431847576960.0, + "grad_norm": 1.6849645712699641, + "language_loss": 0.8081466, + "learning_rate": 2.941204423239241e-06, + "loss": 0.83042461, + "num_input_tokens_seen": 64967160, + "step": 3025, + "time_per_iteration": 2.4415781497955322 + }, + { + "auxiliary_loss_clip": 0.01175204, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.0534333, + "balance_loss_mlp": 1.02220094, + "epoch": 0.36385498707388925, + "flos": 29533941083520.0, + "grad_norm": 1.8636065202492489, + "language_loss": 0.75934619, + "learning_rate": 2.9405170297491395e-06, + "loss": 0.78140092, + "num_input_tokens_seen": 64987155, + "step": 3026, + "time_per_iteration": 2.5286355018615723 + }, + { + "auxiliary_loss_clip": 0.01111375, + "auxiliary_loss_mlp": 0.00763158, + "balance_loss_clip": 1.05018687, + "balance_loss_mlp": 1.00106025, + "epoch": 0.36397522996452836, + "flos": 22236569925120.0, + "grad_norm": 2.052175378037083, + "language_loss": 0.80241466, + "learning_rate": 2.939829493583353e-06, + "loss": 0.82115996, + "num_input_tokens_seen": 65003800, + "step": 3027, + "time_per_iteration": 2.6339378356933594 + }, + { + "auxiliary_loss_clip": 0.01138738, + "auxiliary_loss_mlp": 0.01027814, + "balance_loss_clip": 1.04433024, + "balance_loss_mlp": 1.01959443, + "epoch": 0.3640954728551674, + "flos": 21506505995520.0, + "grad_norm": 2.4460394944514645, + "language_loss": 0.83279574, + "learning_rate": 2.939141814846179e-06, + "loss": 0.85446131, + "num_input_tokens_seen": 65021215, + "step": 3028, + "time_per_iteration": 2.7613539695739746 + }, + { + "auxiliary_loss_clip": 0.01162869, + "auxiliary_loss_mlp": 0.01028471, + "balance_loss_clip": 1.05058944, + "balance_loss_mlp": 1.02002513, + "epoch": 0.3642157157458065, + "flos": 17712867081600.0, + "grad_norm": 1.7228242964781113, + "language_loss": 0.82304138, + "learning_rate": 2.938453993641938e-06, + "loss": 0.84495479, + "num_input_tokens_seen": 65039590, + "step": 3029, + "time_per_iteration": 2.4811513423919678 + }, + { + "auxiliary_loss_clip": 0.01163682, + "auxiliary_loss_mlp": 0.01028899, + "balance_loss_clip": 1.0546453, + "balance_loss_mlp": 1.02012527, + "epoch": 0.36433595863644563, + "flos": 17639537466240.0, + "grad_norm": 2.9564243194839452, + "language_loss": 0.70017207, + "learning_rate": 2.937766030074973e-06, + "loss": 0.72209787, + "num_input_tokens_seen": 65056845, + "step": 3030, + "time_per_iteration": 2.467878818511963 + }, + { + "auxiliary_loss_clip": 0.01153138, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.05115426, + "balance_loss_mlp": 1.02108979, + "epoch": 0.3644562015270847, + "flos": 26833279161600.0, + "grad_norm": 1.718025696969916, + "language_loss": 0.82702601, + "learning_rate": 2.937077924249646e-06, + "loss": 0.8488512, + "num_input_tokens_seen": 65079435, + "step": 3031, + "time_per_iteration": 3.3296446800231934 + }, + { + "auxiliary_loss_clip": 0.0116784, + "auxiliary_loss_mlp": 0.01027143, + "balance_loss_clip": 1.0509423, + "balance_loss_mlp": 1.018453, + "epoch": 0.3645764444177238, + "flos": 14282715847680.0, + "grad_norm": 2.365563099533592, + "language_loss": 0.75697184, + "learning_rate": 2.9363896762703443e-06, + "loss": 0.77892166, + "num_input_tokens_seen": 65096500, + "step": 3032, + "time_per_iteration": 2.4787187576293945 + }, + { + "auxiliary_loss_clip": 0.01193553, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.05577683, + "balance_loss_mlp": 1.02187181, + "epoch": 0.3646966873083629, + "flos": 20667489137280.0, + "grad_norm": 1.8528128080652155, + "language_loss": 0.8428179, + "learning_rate": 2.9357012862414725e-06, + "loss": 0.86506301, + "num_input_tokens_seen": 65115860, + "step": 3033, + "time_per_iteration": 3.1724753379821777 + }, + { + "auxiliary_loss_clip": 0.01175095, + "auxiliary_loss_mlp": 0.01027238, + "balance_loss_clip": 1.05235791, + "balance_loss_mlp": 1.01909018, + "epoch": 0.36481693019900197, + "flos": 27782613665280.0, + "grad_norm": 1.9059575029926248, + "language_loss": 0.71673876, + "learning_rate": 2.9350127542674593e-06, + "loss": 0.73876214, + "num_input_tokens_seen": 65138070, + "step": 3034, + "time_per_iteration": 3.284646987915039 + }, + { + "auxiliary_loss_clip": 0.01170848, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.05391359, + "balance_loss_mlp": 1.0226922, + "epoch": 0.3649371730896411, + "flos": 19712588025600.0, + "grad_norm": 1.828526501894972, + "language_loss": 0.76613283, + "learning_rate": 2.934324080452755e-06, + "loss": 0.78815019, + "num_input_tokens_seen": 65155860, + "step": 3035, + "time_per_iteration": 3.2370824813842773 + }, + { + "auxiliary_loss_clip": 0.0113797, + "auxiliary_loss_mlp": 0.00763723, + "balance_loss_clip": 1.04566586, + "balance_loss_mlp": 1.00110626, + "epoch": 0.3650574159802802, + "flos": 24750496016640.0, + "grad_norm": 1.6336841584527497, + "language_loss": 0.7806145, + "learning_rate": 2.9336352649018307e-06, + "loss": 0.79963148, + "num_input_tokens_seen": 65175930, + "step": 3036, + "time_per_iteration": 2.595656394958496 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.05216479, + "balance_loss_mlp": 1.02251053, + "epoch": 0.36517765887091924, + "flos": 32853487363200.0, + "grad_norm": 1.9640710108059138, + "language_loss": 0.70164788, + "learning_rate": 2.9329463077191783e-06, + "loss": 0.72360277, + "num_input_tokens_seen": 65199305, + "step": 3037, + "time_per_iteration": 2.667443037033081 + }, + { + "auxiliary_loss_clip": 0.01134535, + "auxiliary_loss_mlp": 0.01025402, + "balance_loss_clip": 1.04922581, + "balance_loss_mlp": 1.01671159, + "epoch": 0.36529790176155835, + "flos": 20120318282880.0, + "grad_norm": 1.9799017178242466, + "language_loss": 0.6392594, + "learning_rate": 2.9322572090093135e-06, + "loss": 0.66085875, + "num_input_tokens_seen": 65218010, + "step": 3038, + "time_per_iteration": 2.591963768005371 + }, + { + "auxiliary_loss_clip": 0.01134589, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.04629207, + "balance_loss_mlp": 1.01966107, + "epoch": 0.36541814465219746, + "flos": 17639573379840.0, + "grad_norm": 3.0462795059622385, + "language_loss": 0.76937914, + "learning_rate": 2.9315679688767713e-06, + "loss": 0.79100871, + "num_input_tokens_seen": 65236020, + "step": 3039, + "time_per_iteration": 2.5724377632141113 + }, + { + "auxiliary_loss_clip": 0.01157992, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.04963923, + "balance_loss_mlp": 1.02093959, + "epoch": 0.3655383875428365, + "flos": 22674356887680.0, + "grad_norm": 1.572947987706956, + "language_loss": 0.66405702, + "learning_rate": 2.9308785874261085e-06, + "loss": 0.68592918, + "num_input_tokens_seen": 65256210, + "step": 3040, + "time_per_iteration": 2.5768134593963623 + }, + { + "auxiliary_loss_clip": 0.01192121, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.0548811, + "balance_loss_mlp": 1.02209222, + "epoch": 0.36565863043347563, + "flos": 21981173247360.0, + "grad_norm": 1.6731062575429592, + "language_loss": 0.81402814, + "learning_rate": 2.9301890647619045e-06, + "loss": 0.83625525, + "num_input_tokens_seen": 65275505, + "step": 3041, + "time_per_iteration": 2.532811403274536 + }, + { + "auxiliary_loss_clip": 0.01169811, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.052742, + "balance_loss_mlp": 1.02356219, + "epoch": 0.36577887332411474, + "flos": 24827632473600.0, + "grad_norm": 4.20187609252267, + "language_loss": 0.80440903, + "learning_rate": 2.929499400988759e-06, + "loss": 0.82643163, + "num_input_tokens_seen": 65296665, + "step": 3042, + "time_per_iteration": 2.586682081222534 + }, + { + "auxiliary_loss_clip": 0.01176507, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.05420065, + "balance_loss_mlp": 1.02602911, + "epoch": 0.3658991162147538, + "flos": 28293191539200.0, + "grad_norm": 1.704863575063424, + "language_loss": 0.64945388, + "learning_rate": 2.9288095962112927e-06, + "loss": 0.67156804, + "num_input_tokens_seen": 65317370, + "step": 3043, + "time_per_iteration": 2.591489315032959 + }, + { + "auxiliary_loss_clip": 0.01191903, + "auxiliary_loss_mlp": 0.0102504, + "balance_loss_clip": 1.05467069, + "balance_loss_mlp": 1.01646268, + "epoch": 0.3660193591053929, + "flos": 17785550252160.0, + "grad_norm": 1.909316814230319, + "language_loss": 0.84847921, + "learning_rate": 2.9281196505341503e-06, + "loss": 0.87064862, + "num_input_tokens_seen": 65334540, + "step": 3044, + "time_per_iteration": 2.4473659992218018 + }, + { + "auxiliary_loss_clip": 0.01126692, + "auxiliary_loss_mlp": 0.00762912, + "balance_loss_clip": 1.0482564, + "balance_loss_mlp": 1.00100541, + "epoch": 0.36613960199603196, + "flos": 10342776839040.0, + "grad_norm": 1.8655004907883397, + "language_loss": 0.78758228, + "learning_rate": 2.9274295640619946e-06, + "loss": 0.80647838, + "num_input_tokens_seen": 65351670, + "step": 3045, + "time_per_iteration": 2.5557045936584473 + }, + { + "auxiliary_loss_clip": 0.01146941, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.04737043, + "balance_loss_mlp": 1.02072561, + "epoch": 0.36625984488667107, + "flos": 19755609540480.0, + "grad_norm": 1.669828286333152, + "language_loss": 0.7832998, + "learning_rate": 2.9267393368995103e-06, + "loss": 0.80505526, + "num_input_tokens_seen": 65370900, + "step": 3046, + "time_per_iteration": 2.5633721351623535 + }, + { + "auxiliary_loss_clip": 0.01193425, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.0559535, + "balance_loss_mlp": 1.02414143, + "epoch": 0.3663800877773102, + "flos": 17674262939520.0, + "grad_norm": 2.575268681449834, + "language_loss": 0.74821925, + "learning_rate": 2.926048969151407e-06, + "loss": 0.77047789, + "num_input_tokens_seen": 65388185, + "step": 3047, + "time_per_iteration": 2.403458595275879 + }, + { + "auxiliary_loss_clip": 0.01131763, + "auxiliary_loss_mlp": 0.01027811, + "balance_loss_clip": 1.05157542, + "balance_loss_mlp": 1.01872158, + "epoch": 0.36650033066794924, + "flos": 20303606407680.0, + "grad_norm": 2.2448845789023752, + "language_loss": 0.68375981, + "learning_rate": 2.92535846092241e-06, + "loss": 0.70535553, + "num_input_tokens_seen": 65407200, + "step": 3048, + "time_per_iteration": 2.5871474742889404 + }, + { + "auxiliary_loss_clip": 0.0116827, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.05513561, + "balance_loss_mlp": 1.02098536, + "epoch": 0.36662057355858835, + "flos": 24716237420160.0, + "grad_norm": 1.8116369391103293, + "language_loss": 0.82664728, + "learning_rate": 2.9246678123172704e-06, + "loss": 0.8486222, + "num_input_tokens_seen": 65427290, + "step": 3049, + "time_per_iteration": 2.5559206008911133 + }, + { + "auxiliary_loss_clip": 0.01195161, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.05612946, + "balance_loss_mlp": 1.02771497, + "epoch": 0.36674081644922746, + "flos": 12385267902720.0, + "grad_norm": 2.04697376952541, + "language_loss": 0.73679733, + "learning_rate": 2.9239770234407596e-06, + "loss": 0.75911236, + "num_input_tokens_seen": 65445595, + "step": 3050, + "time_per_iteration": 2.4189882278442383 + }, + { + "auxiliary_loss_clip": 0.01178112, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.05214143, + "balance_loss_mlp": 1.02027631, + "epoch": 0.3668610593398665, + "flos": 21105922544640.0, + "grad_norm": 4.954295388709076, + "language_loss": 0.68027169, + "learning_rate": 2.9232860943976686e-06, + "loss": 0.7023415, + "num_input_tokens_seen": 65466330, + "step": 3051, + "time_per_iteration": 2.5187578201293945 + }, + { + "auxiliary_loss_clip": 0.0116323, + "auxiliary_loss_mlp": 0.01026175, + "balance_loss_clip": 1.05425143, + "balance_loss_mlp": 1.01799154, + "epoch": 0.3669813022305056, + "flos": 26758082039040.0, + "grad_norm": 1.7215765354201837, + "language_loss": 0.83930767, + "learning_rate": 2.9225950252928115e-06, + "loss": 0.8612017, + "num_input_tokens_seen": 65487180, + "step": 3052, + "time_per_iteration": 2.594301223754883 + }, + { + "auxiliary_loss_clip": 0.01180322, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.0568049, + "balance_loss_mlp": 1.02393436, + "epoch": 0.36710154512114473, + "flos": 19099521671040.0, + "grad_norm": 2.7698181161057347, + "language_loss": 0.81785631, + "learning_rate": 2.9219038162310217e-06, + "loss": 0.83999121, + "num_input_tokens_seen": 65505380, + "step": 3053, + "time_per_iteration": 2.461793899536133 + }, + { + "auxiliary_loss_clip": 0.01109923, + "auxiliary_loss_mlp": 0.00763331, + "balance_loss_clip": 1.04819489, + "balance_loss_mlp": 1.00102115, + "epoch": 0.3672217880117838, + "flos": 20812029465600.0, + "grad_norm": 2.1860907294747087, + "language_loss": 0.82694322, + "learning_rate": 2.921212467317157e-06, + "loss": 0.84567571, + "num_input_tokens_seen": 65524825, + "step": 3054, + "time_per_iteration": 2.641822338104248 + }, + { + "auxiliary_loss_clip": 0.01150713, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.04902053, + "balance_loss_mlp": 1.02307725, + "epoch": 0.3673420309024229, + "flos": 13590394133760.0, + "grad_norm": 1.9200813807305483, + "language_loss": 0.80107194, + "learning_rate": 2.920520978656093e-06, + "loss": 0.82289994, + "num_input_tokens_seen": 65541790, + "step": 3055, + "time_per_iteration": 2.4931459426879883 + }, + { + "auxiliary_loss_clip": 0.01188066, + "auxiliary_loss_mlp": 0.00762946, + "balance_loss_clip": 1.05350733, + "balance_loss_mlp": 1.00100303, + "epoch": 0.367462273793062, + "flos": 28986877969920.0, + "grad_norm": 1.9058368498606395, + "language_loss": 0.77218366, + "learning_rate": 2.919829350352729e-06, + "loss": 0.79169369, + "num_input_tokens_seen": 65563395, + "step": 3056, + "time_per_iteration": 2.513169288635254 + }, + { + "auxiliary_loss_clip": 0.0109835, + "auxiliary_loss_mlp": 0.01005264, + "balance_loss_clip": 1.02573657, + "balance_loss_mlp": 1.00401235, + "epoch": 0.36758251668370107, + "flos": 62643148346880.0, + "grad_norm": 0.7579296095354364, + "language_loss": 0.600281, + "learning_rate": 2.919137582511983e-06, + "loss": 0.62131715, + "num_input_tokens_seen": 65619835, + "step": 3057, + "time_per_iteration": 3.0350253582000732 + }, + { + "auxiliary_loss_clip": 0.01159229, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.05790305, + "balance_loss_mlp": 1.02220488, + "epoch": 0.3677027595743402, + "flos": 12713886455040.0, + "grad_norm": 2.26107783397582, + "language_loss": 0.63552034, + "learning_rate": 2.918445675238797e-06, + "loss": 0.6574142, + "num_input_tokens_seen": 65636760, + "step": 3058, + "time_per_iteration": 3.299171209335327 + }, + { + "auxiliary_loss_clip": 0.01192121, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.05486989, + "balance_loss_mlp": 1.01974368, + "epoch": 0.36782300246497923, + "flos": 25046579825280.0, + "grad_norm": 1.8237414824734344, + "language_loss": 0.69290239, + "learning_rate": 2.917753628638132e-06, + "loss": 0.71510464, + "num_input_tokens_seen": 65657065, + "step": 3059, + "time_per_iteration": 2.489309310913086 + }, + { + "auxiliary_loss_clip": 0.01167095, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.05634093, + "balance_loss_mlp": 1.02236748, + "epoch": 0.36794324535561834, + "flos": 17419512706560.0, + "grad_norm": 2.6183062231459413, + "language_loss": 0.70561266, + "learning_rate": 2.9170614428149716e-06, + "loss": 0.72759342, + "num_input_tokens_seen": 65675400, + "step": 3060, + "time_per_iteration": 3.98986554145813 + }, + { + "auxiliary_loss_clip": 0.01146879, + "auxiliary_loss_mlp": 0.01036668, + "balance_loss_clip": 1.05245233, + "balance_loss_mlp": 1.02756596, + "epoch": 0.36806348824625745, + "flos": 24089128848000.0, + "grad_norm": 2.7089908882331097, + "language_loss": 0.86843646, + "learning_rate": 2.9163691178743195e-06, + "loss": 0.8902719, + "num_input_tokens_seen": 65694050, + "step": 3061, + "time_per_iteration": 2.5551674365997314 + }, + { + "auxiliary_loss_clip": 0.01174912, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.05413425, + "balance_loss_mlp": 1.01942658, + "epoch": 0.3681837311368965, + "flos": 20521871400960.0, + "grad_norm": 1.8585504719992176, + "language_loss": 0.7732482, + "learning_rate": 2.9156766539212006e-06, + "loss": 0.79527861, + "num_input_tokens_seen": 65711695, + "step": 3062, + "time_per_iteration": 3.2989916801452637 + }, + { + "auxiliary_loss_clip": 0.01181104, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.05301976, + "balance_loss_mlp": 1.03157401, + "epoch": 0.3683039740275356, + "flos": 21466644877440.0, + "grad_norm": 1.9379026572123674, + "language_loss": 0.7202903, + "learning_rate": 2.9149840510606614e-06, + "loss": 0.7425015, + "num_input_tokens_seen": 65730350, + "step": 3063, + "time_per_iteration": 2.515798330307007 + }, + { + "auxiliary_loss_clip": 0.01080985, + "auxiliary_loss_mlp": 0.00753609, + "balance_loss_clip": 1.02150583, + "balance_loss_mlp": 1.00013781, + "epoch": 0.36842421691817473, + "flos": 70380999987840.0, + "grad_norm": 1.0321286817707773, + "language_loss": 0.64245653, + "learning_rate": 2.914291309397769e-06, + "loss": 0.66080248, + "num_input_tokens_seen": 65787820, + "step": 3064, + "time_per_iteration": 3.1828489303588867 + }, + { + "auxiliary_loss_clip": 0.01108301, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.04472542, + "balance_loss_mlp": 1.0172106, + "epoch": 0.3685444598088138, + "flos": 23331378510720.0, + "grad_norm": 2.2089565006565217, + "language_loss": 0.78168494, + "learning_rate": 2.9135984290376117e-06, + "loss": 0.80303431, + "num_input_tokens_seen": 65806685, + "step": 3065, + "time_per_iteration": 2.622854232788086 + }, + { + "auxiliary_loss_clip": 0.01117638, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.0460844, + "balance_loss_mlp": 1.02641833, + "epoch": 0.3686647026994529, + "flos": 23070271570560.0, + "grad_norm": 1.6563955230769725, + "language_loss": 0.82828397, + "learning_rate": 2.9129054100853e-06, + "loss": 0.84980559, + "num_input_tokens_seen": 65825525, + "step": 3066, + "time_per_iteration": 2.6349875926971436 + }, + { + "auxiliary_loss_clip": 0.01165209, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.05267501, + "balance_loss_mlp": 1.02435291, + "epoch": 0.368784945590092, + "flos": 25119909440640.0, + "grad_norm": 1.7243759064948965, + "language_loss": 0.7583282, + "learning_rate": 2.912212252645963e-06, + "loss": 0.78031099, + "num_input_tokens_seen": 65848110, + "step": 3067, + "time_per_iteration": 2.5973150730133057 + }, + { + "auxiliary_loss_clip": 0.01182099, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.0526824, + "balance_loss_mlp": 1.02309227, + "epoch": 0.36890518848073106, + "flos": 18442284566400.0, + "grad_norm": 2.253434315331508, + "language_loss": 0.76589632, + "learning_rate": 2.9115189568247523e-06, + "loss": 0.78803635, + "num_input_tokens_seen": 65865670, + "step": 3068, + "time_per_iteration": 2.46769118309021 + }, + { + "auxiliary_loss_clip": 0.01123751, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.05180871, + "balance_loss_mlp": 1.02396727, + "epoch": 0.36902543137137017, + "flos": 16362446336640.0, + "grad_norm": 19.24089554921268, + "language_loss": 0.92110282, + "learning_rate": 2.910825522726841e-06, + "loss": 0.94266748, + "num_input_tokens_seen": 65883195, + "step": 3069, + "time_per_iteration": 2.558656692504883 + }, + { + "auxiliary_loss_clip": 0.01128305, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.04617906, + "balance_loss_mlp": 1.02136779, + "epoch": 0.3691456742620093, + "flos": 12275596702080.0, + "grad_norm": 3.2271963424618653, + "language_loss": 0.77351522, + "learning_rate": 2.9101319504574215e-06, + "loss": 0.79509723, + "num_input_tokens_seen": 65899635, + "step": 3070, + "time_per_iteration": 2.556763172149658 + }, + { + "auxiliary_loss_clip": 0.01167682, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.05020094, + "balance_loss_mlp": 1.02021325, + "epoch": 0.36926591715264834, + "flos": 17786412178560.0, + "grad_norm": 1.6424160850239697, + "language_loss": 0.7639991, + "learning_rate": 2.909438240121709e-06, + "loss": 0.7859689, + "num_input_tokens_seen": 65919910, + "step": 3071, + "time_per_iteration": 2.5164809226989746 + }, + { + "auxiliary_loss_clip": 0.01155365, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.05041385, + "balance_loss_mlp": 1.01889765, + "epoch": 0.36938616004328745, + "flos": 28948309741440.0, + "grad_norm": 1.6993263695908127, + "language_loss": 0.70246166, + "learning_rate": 2.908744391824939e-06, + "loss": 0.724289, + "num_input_tokens_seen": 65940930, + "step": 3072, + "time_per_iteration": 2.5610437393188477 + }, + { + "auxiliary_loss_clip": 0.01119523, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.04532933, + "balance_loss_mlp": 1.02071571, + "epoch": 0.36950640293392656, + "flos": 29205394358400.0, + "grad_norm": 1.7996661198555521, + "language_loss": 0.79312503, + "learning_rate": 2.908050405672367e-06, + "loss": 0.81461418, + "num_input_tokens_seen": 65960475, + "step": 3073, + "time_per_iteration": 2.673146963119507 + }, + { + "auxiliary_loss_clip": 0.01167828, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.04883337, + "balance_loss_mlp": 1.02372205, + "epoch": 0.3696266458245656, + "flos": 24827776128000.0, + "grad_norm": 1.7341463481392692, + "language_loss": 0.79336685, + "learning_rate": 2.9073562817692703e-06, + "loss": 0.81536669, + "num_input_tokens_seen": 65979160, + "step": 3074, + "time_per_iteration": 2.5473012924194336 + }, + { + "auxiliary_loss_clip": 0.01051214, + "auxiliary_loss_mlp": 0.01002923, + "balance_loss_clip": 1.01779461, + "balance_loss_mlp": 1.00151598, + "epoch": 0.3697468887152047, + "flos": 59887257264000.0, + "grad_norm": 0.7587985068506973, + "language_loss": 0.5654335, + "learning_rate": 2.9066620202209468e-06, + "loss": 0.58597481, + "num_input_tokens_seen": 66041650, + "step": 3075, + "time_per_iteration": 3.0957071781158447 + }, + { + "auxiliary_loss_clip": 0.01140094, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.04969549, + "balance_loss_mlp": 1.01971912, + "epoch": 0.3698671316058438, + "flos": 26137581569280.0, + "grad_norm": 9.14463939503411, + "language_loss": 0.77552319, + "learning_rate": 2.905967621132716e-06, + "loss": 0.79720616, + "num_input_tokens_seen": 66059260, + "step": 3076, + "time_per_iteration": 2.5599634647369385 + }, + { + "auxiliary_loss_clip": 0.01167153, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.05150652, + "balance_loss_mlp": 1.02393234, + "epoch": 0.3699873744964829, + "flos": 24607464059520.0, + "grad_norm": 2.339850335720611, + "language_loss": 0.75094903, + "learning_rate": 2.9052730846099172e-06, + "loss": 0.77294523, + "num_input_tokens_seen": 66080605, + "step": 3077, + "time_per_iteration": 2.568876028060913 + }, + { + "auxiliary_loss_clip": 0.01065614, + "auxiliary_loss_mlp": 0.01002566, + "balance_loss_clip": 1.01674652, + "balance_loss_mlp": 1.00117111, + "epoch": 0.370107617387122, + "flos": 64885340050560.0, + "grad_norm": 0.8738773065256834, + "language_loss": 0.60969317, + "learning_rate": 2.9045784107579123e-06, + "loss": 0.63037497, + "num_input_tokens_seen": 66140710, + "step": 3078, + "time_per_iteration": 3.1088595390319824 + }, + { + "auxiliary_loss_clip": 0.01191119, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.05488992, + "balance_loss_mlp": 1.02130234, + "epoch": 0.37022786027776106, + "flos": 15961683317760.0, + "grad_norm": 1.647473918267833, + "language_loss": 0.66941798, + "learning_rate": 2.9038835996820807e-06, + "loss": 0.69162834, + "num_input_tokens_seen": 66158320, + "step": 3079, + "time_per_iteration": 2.448870897293091 + }, + { + "auxiliary_loss_clip": 0.01151655, + "auxiliary_loss_mlp": 0.01029209, + "balance_loss_clip": 1.04708052, + "balance_loss_mlp": 1.02031565, + "epoch": 0.37034810316840017, + "flos": 18546927863040.0, + "grad_norm": 2.0631106939895294, + "language_loss": 0.79308009, + "learning_rate": 2.903188651487826e-06, + "loss": 0.81488872, + "num_input_tokens_seen": 66176875, + "step": 3080, + "time_per_iteration": 2.5275886058807373 + }, + { + "auxiliary_loss_clip": 0.01181462, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.05475307, + "balance_loss_mlp": 1.01866174, + "epoch": 0.3704683460590393, + "flos": 17821927751040.0, + "grad_norm": 2.117503038323308, + "language_loss": 0.86489373, + "learning_rate": 2.902493566280571e-06, + "loss": 0.88698298, + "num_input_tokens_seen": 66194980, + "step": 3081, + "time_per_iteration": 2.446384906768799 + }, + { + "auxiliary_loss_clip": 0.0116001, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.05099952, + "balance_loss_mlp": 1.01826847, + "epoch": 0.37058858894967833, + "flos": 14134081368960.0, + "grad_norm": 2.287972642185122, + "language_loss": 0.81071758, + "learning_rate": 2.9017983441657595e-06, + "loss": 0.83258957, + "num_input_tokens_seen": 66212310, + "step": 3082, + "time_per_iteration": 2.466175079345703 + }, + { + "auxiliary_loss_clip": 0.01134576, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.04540968, + "balance_loss_mlp": 1.01975834, + "epoch": 0.37070883184031744, + "flos": 13954492344960.0, + "grad_norm": 7.16382161254955, + "language_loss": 0.7527864, + "learning_rate": 2.9011029852488564e-06, + "loss": 0.77441537, + "num_input_tokens_seen": 66229545, + "step": 3083, + "time_per_iteration": 2.5379066467285156 + }, + { + "auxiliary_loss_clip": 0.01089579, + "auxiliary_loss_mlp": 0.01003155, + "balance_loss_clip": 1.01875913, + "balance_loss_mlp": 1.00172448, + "epoch": 0.37082907473095655, + "flos": 52315419306240.0, + "grad_norm": 0.9940398621656151, + "language_loss": 0.62432909, + "learning_rate": 2.9004074896353465e-06, + "loss": 0.6452564, + "num_input_tokens_seen": 66283545, + "step": 3084, + "time_per_iteration": 2.9306206703186035 + }, + { + "auxiliary_loss_clip": 0.01192401, + "auxiliary_loss_mlp": 0.01025818, + "balance_loss_clip": 1.05867946, + "balance_loss_mlp": 1.01822424, + "epoch": 0.3709493176215956, + "flos": 15998096730240.0, + "grad_norm": 1.736929490763857, + "language_loss": 0.81214768, + "learning_rate": 2.8997118574307362e-06, + "loss": 0.83432984, + "num_input_tokens_seen": 66300500, + "step": 3085, + "time_per_iteration": 3.1478543281555176 + }, + { + "auxiliary_loss_clip": 0.01153353, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.05045426, + "balance_loss_mlp": 1.02324319, + "epoch": 0.3710695605122347, + "flos": 20959837931520.0, + "grad_norm": 2.2549367575947405, + "language_loss": 0.74246365, + "learning_rate": 2.899016088740553e-06, + "loss": 0.76431638, + "num_input_tokens_seen": 66318610, + "step": 3086, + "time_per_iteration": 2.5307259559631348 + }, + { + "auxiliary_loss_clip": 0.01132169, + "auxiliary_loss_mlp": 0.01024678, + "balance_loss_clip": 1.04800367, + "balance_loss_mlp": 1.0167805, + "epoch": 0.37118980340287383, + "flos": 14355578586240.0, + "grad_norm": 2.21691805707039, + "language_loss": 0.79284656, + "learning_rate": 2.898320183670344e-06, + "loss": 0.81441504, + "num_input_tokens_seen": 66336025, + "step": 3087, + "time_per_iteration": 4.07132363319397 + }, + { + "auxiliary_loss_clip": 0.01132814, + "auxiliary_loss_mlp": 0.01025468, + "balance_loss_clip": 1.05176783, + "balance_loss_mlp": 1.01729023, + "epoch": 0.3713100462935129, + "flos": 25885381201920.0, + "grad_norm": 2.0409372384188504, + "language_loss": 0.89151645, + "learning_rate": 2.8976241423256767e-06, + "loss": 0.91309923, + "num_input_tokens_seen": 66356120, + "step": 3088, + "time_per_iteration": 2.6052331924438477 + }, + { + "auxiliary_loss_clip": 0.01157195, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.04968858, + "balance_loss_mlp": 1.02350926, + "epoch": 0.371430289184152, + "flos": 30518934814080.0, + "grad_norm": 1.8529075656328007, + "language_loss": 0.68191314, + "learning_rate": 2.896927964812142e-06, + "loss": 0.70379674, + "num_input_tokens_seen": 66376685, + "step": 3089, + "time_per_iteration": 3.3276634216308594 + }, + { + "auxiliary_loss_clip": 0.01159699, + "auxiliary_loss_mlp": 0.010291, + "balance_loss_clip": 1.05346227, + "balance_loss_mlp": 1.02029681, + "epoch": 0.37155053207479105, + "flos": 15742233175680.0, + "grad_norm": 2.5552072056504818, + "language_loss": 0.7518819, + "learning_rate": 2.8962316512353465e-06, + "loss": 0.77376992, + "num_input_tokens_seen": 66394230, + "step": 3090, + "time_per_iteration": 2.476945400238037 + }, + { + "auxiliary_loss_clip": 0.01112779, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.04287982, + "balance_loss_mlp": 1.02400005, + "epoch": 0.37167077496543016, + "flos": 23404061681280.0, + "grad_norm": 1.74675263597639, + "language_loss": 0.74659991, + "learning_rate": 2.8955352017009233e-06, + "loss": 0.7680521, + "num_input_tokens_seen": 66413475, + "step": 3091, + "time_per_iteration": 2.6104304790496826 + }, + { + "auxiliary_loss_clip": 0.0116036, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.05417323, + "balance_loss_mlp": 1.0258584, + "epoch": 0.3717910178560693, + "flos": 22088653718400.0, + "grad_norm": 2.14081391314354, + "language_loss": 0.77055526, + "learning_rate": 2.8948386163145212e-06, + "loss": 0.79250193, + "num_input_tokens_seen": 66432685, + "step": 3092, + "time_per_iteration": 2.5397090911865234 + }, + { + "auxiliary_loss_clip": 0.01180656, + "auxiliary_loss_mlp": 0.01029508, + "balance_loss_clip": 1.0541383, + "balance_loss_mlp": 1.02121663, + "epoch": 0.3719112607467083, + "flos": 26939969533440.0, + "grad_norm": 2.040249217265961, + "language_loss": 0.79463327, + "learning_rate": 2.8941418951818135e-06, + "loss": 0.81673491, + "num_input_tokens_seen": 66452245, + "step": 3093, + "time_per_iteration": 2.5063908100128174 + }, + { + "auxiliary_loss_clip": 0.01148349, + "auxiliary_loss_mlp": 0.01031231, + "balance_loss_clip": 1.04957724, + "balance_loss_mlp": 1.02311301, + "epoch": 0.37203150363734744, + "flos": 12166500119040.0, + "grad_norm": 2.0937025816388037, + "language_loss": 0.7097798, + "learning_rate": 2.8934450384084903e-06, + "loss": 0.73157561, + "num_input_tokens_seen": 66469760, + "step": 3094, + "time_per_iteration": 2.5148086547851562 + }, + { + "auxiliary_loss_clip": 0.01155097, + "auxiliary_loss_mlp": 0.01030076, + "balance_loss_clip": 1.05006075, + "balance_loss_mlp": 1.02147508, + "epoch": 0.37215174652798655, + "flos": 23697595624320.0, + "grad_norm": 2.174604496665335, + "language_loss": 0.69865584, + "learning_rate": 2.8927480461002653e-06, + "loss": 0.72050756, + "num_input_tokens_seen": 66489730, + "step": 3095, + "time_per_iteration": 2.536237955093384 + }, + { + "auxiliary_loss_clip": 0.01160198, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_clip": 1.04834914, + "balance_loss_mlp": 1.02934623, + "epoch": 0.3722719894186256, + "flos": 17887751424000.0, + "grad_norm": 2.4415883518430763, + "language_loss": 0.85568643, + "learning_rate": 2.892050918362872e-06, + "loss": 0.87767559, + "num_input_tokens_seen": 66504785, + "step": 3096, + "time_per_iteration": 2.4629268646240234 + }, + { + "auxiliary_loss_clip": 0.01019025, + "auxiliary_loss_mlp": 0.01003076, + "balance_loss_clip": 1.0169735, + "balance_loss_mlp": 1.00181794, + "epoch": 0.3723922323092647, + "flos": 62419891363200.0, + "grad_norm": 0.8472121981428948, + "language_loss": 0.55863339, + "learning_rate": 2.8913536553020626e-06, + "loss": 0.57885438, + "num_input_tokens_seen": 66558840, + "step": 3097, + "time_per_iteration": 3.3385910987854004 + }, + { + "auxiliary_loss_clip": 0.01124982, + "auxiliary_loss_mlp": 0.0102408, + "balance_loss_clip": 1.04651952, + "balance_loss_mlp": 1.01614404, + "epoch": 0.3725124751999038, + "flos": 23039747988480.0, + "grad_norm": 1.84450807744699, + "language_loss": 0.84592658, + "learning_rate": 2.8906562570236137e-06, + "loss": 0.86741722, + "num_input_tokens_seen": 66576750, + "step": 3098, + "time_per_iteration": 2.7380518913269043 + }, + { + "auxiliary_loss_clip": 0.01113445, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.04361391, + "balance_loss_mlp": 1.0227623, + "epoch": 0.3726327180905429, + "flos": 20920551431040.0, + "grad_norm": 1.4675435708787519, + "language_loss": 0.76613659, + "learning_rate": 2.889958723633318e-06, + "loss": 0.78757572, + "num_input_tokens_seen": 66595690, + "step": 3099, + "time_per_iteration": 2.653425455093384 + }, + { + "auxiliary_loss_clip": 0.01147246, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.05042577, + "balance_loss_mlp": 1.01972985, + "epoch": 0.372752960981182, + "flos": 30592156688640.0, + "grad_norm": 2.401223811205223, + "language_loss": 0.7361232, + "learning_rate": 2.889261055236992e-06, + "loss": 0.75787336, + "num_input_tokens_seen": 66617905, + "step": 3100, + "time_per_iteration": 2.6470608711242676 + }, + { + "auxiliary_loss_clip": 0.01158297, + "auxiliary_loss_mlp": 0.01024098, + "balance_loss_clip": 1.05246282, + "balance_loss_mlp": 1.01633143, + "epoch": 0.3728732038718211, + "flos": 25116749043840.0, + "grad_norm": 1.7550235127103473, + "language_loss": 0.82402718, + "learning_rate": 2.8885632519404704e-06, + "loss": 0.84585106, + "num_input_tokens_seen": 66638175, + "step": 3101, + "time_per_iteration": 2.5550262928009033 + }, + { + "auxiliary_loss_clip": 0.01161099, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.0534879, + "balance_loss_mlp": 1.01862442, + "epoch": 0.37299344676246016, + "flos": 25302048330240.0, + "grad_norm": 1.855362727662051, + "language_loss": 0.75470686, + "learning_rate": 2.8878653138496107e-06, + "loss": 0.77658737, + "num_input_tokens_seen": 66658670, + "step": 3102, + "time_per_iteration": 2.5495691299438477 + }, + { + "auxiliary_loss_clip": 0.01114086, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.04112399, + "balance_loss_mlp": 1.02000952, + "epoch": 0.37311368965309927, + "flos": 23842531002240.0, + "grad_norm": 2.378476007529622, + "language_loss": 0.76244062, + "learning_rate": 2.8871672410702878e-06, + "loss": 0.78386801, + "num_input_tokens_seen": 66676030, + "step": 3103, + "time_per_iteration": 2.672511100769043 + }, + { + "auxiliary_loss_clip": 0.01155179, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.04894245, + "balance_loss_mlp": 1.02174163, + "epoch": 0.3732339325437384, + "flos": 25811943845760.0, + "grad_norm": 1.8114104979472243, + "language_loss": 0.81988025, + "learning_rate": 2.8864690337084008e-06, + "loss": 0.84174049, + "num_input_tokens_seen": 66695305, + "step": 3104, + "time_per_iteration": 2.585179328918457 + }, + { + "auxiliary_loss_clip": 0.01170857, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.05090237, + "balance_loss_mlp": 1.0254333, + "epoch": 0.37335417543437743, + "flos": 26208433146240.0, + "grad_norm": 1.7065070238635465, + "language_loss": 0.77907956, + "learning_rate": 2.885770691869866e-06, + "loss": 0.80113173, + "num_input_tokens_seen": 66716185, + "step": 3105, + "time_per_iteration": 2.4977896213531494 + }, + { + "auxiliary_loss_clip": 0.01170466, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.05186367, + "balance_loss_mlp": 1.02447379, + "epoch": 0.37347441832501654, + "flos": 24023879792640.0, + "grad_norm": 2.2521158517455078, + "language_loss": 0.74392855, + "learning_rate": 2.8850722156606207e-06, + "loss": 0.76595449, + "num_input_tokens_seen": 66734575, + "step": 3106, + "time_per_iteration": 2.496741771697998 + }, + { + "auxiliary_loss_clip": 0.01168509, + "auxiliary_loss_mlp": 0.01032356, + "balance_loss_clip": 1.05093443, + "balance_loss_mlp": 1.02411902, + "epoch": 0.3735946612156556, + "flos": 19714922409600.0, + "grad_norm": 1.6600206666944297, + "language_loss": 0.6683315, + "learning_rate": 2.8843736051866252e-06, + "loss": 0.69034016, + "num_input_tokens_seen": 66753500, + "step": 3107, + "time_per_iteration": 2.45283579826355 + }, + { + "auxiliary_loss_clip": 0.0112752, + "auxiliary_loss_mlp": 0.0076284, + "balance_loss_clip": 1.04506874, + "balance_loss_mlp": 1.00111175, + "epoch": 0.3737149041062947, + "flos": 23039604334080.0, + "grad_norm": 1.7703690502488603, + "language_loss": 0.691046, + "learning_rate": 2.8836748605538557e-06, + "loss": 0.70994961, + "num_input_tokens_seen": 66775140, + "step": 3108, + "time_per_iteration": 2.6106255054473877 + }, + { + "auxiliary_loss_clip": 0.01165205, + "auxiliary_loss_mlp": 0.01025198, + "balance_loss_clip": 1.05067968, + "balance_loss_mlp": 1.0166266, + "epoch": 0.3738351469969338, + "flos": 34678108483200.0, + "grad_norm": 2.266363278461431, + "language_loss": 0.63532197, + "learning_rate": 2.882975981868313e-06, + "loss": 0.65722597, + "num_input_tokens_seen": 66795525, + "step": 3109, + "time_per_iteration": 2.6212661266326904 + }, + { + "auxiliary_loss_clip": 0.01173763, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.05274141, + "balance_loss_mlp": 1.02109277, + "epoch": 0.3739553898875729, + "flos": 43507967448960.0, + "grad_norm": 2.3143078493225455, + "language_loss": 0.69028062, + "learning_rate": 2.882276969236016e-06, + "loss": 0.71231091, + "num_input_tokens_seen": 66816885, + "step": 3110, + "time_per_iteration": 2.6467862129211426 + }, + { + "auxiliary_loss_clip": 0.01157995, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.05000913, + "balance_loss_mlp": 1.02133417, + "epoch": 0.374075632778212, + "flos": 12856487448960.0, + "grad_norm": 2.382380461641803, + "language_loss": 0.76268029, + "learning_rate": 2.881577822763005e-06, + "loss": 0.7845602, + "num_input_tokens_seen": 66834835, + "step": 3111, + "time_per_iteration": 3.2109649181365967 + }, + { + "auxiliary_loss_clip": 0.01172908, + "auxiliary_loss_mlp": 0.01023463, + "balance_loss_clip": 1.05087948, + "balance_loss_mlp": 1.01570797, + "epoch": 0.3741958756688511, + "flos": 26024031699840.0, + "grad_norm": 1.8805328147040214, + "language_loss": 0.87334019, + "learning_rate": 2.880878542555338e-06, + "loss": 0.89530391, + "num_input_tokens_seen": 66852600, + "step": 3112, + "time_per_iteration": 3.543414354324341 + }, + { + "auxiliary_loss_clip": 0.01189416, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.05248535, + "balance_loss_mlp": 1.02385616, + "epoch": 0.37431611855949015, + "flos": 21433894652160.0, + "grad_norm": 1.980615551311542, + "language_loss": 0.80827427, + "learning_rate": 2.8801791287190976e-06, + "loss": 0.83049309, + "num_input_tokens_seen": 66870595, + "step": 3113, + "time_per_iteration": 2.4515652656555176 + }, + { + "auxiliary_loss_clip": 0.01174889, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.0501225, + "balance_loss_mlp": 1.01996779, + "epoch": 0.37443636145012926, + "flos": 24207096090240.0, + "grad_norm": 3.548475786024503, + "language_loss": 0.8576709, + "learning_rate": 2.8794795813603817e-06, + "loss": 0.87970221, + "num_input_tokens_seen": 66886060, + "step": 3114, + "time_per_iteration": 3.480597496032715 + }, + { + "auxiliary_loss_clip": 0.01180351, + "auxiliary_loss_mlp": 0.0103119, + "balance_loss_clip": 1.05150187, + "balance_loss_mlp": 1.02258348, + "epoch": 0.3745566043407684, + "flos": 15378601841280.0, + "grad_norm": 1.748019793079824, + "language_loss": 0.81310833, + "learning_rate": 2.878779900585314e-06, + "loss": 0.83522379, + "num_input_tokens_seen": 66903900, + "step": 3115, + "time_per_iteration": 3.2120823860168457 + }, + { + "auxiliary_loss_clip": 0.0116731, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.05122995, + "balance_loss_mlp": 1.02377987, + "epoch": 0.37467684723140743, + "flos": 24608218245120.0, + "grad_norm": 1.6410729599353246, + "language_loss": 0.75142133, + "learning_rate": 2.8780800865000336e-06, + "loss": 0.77341431, + "num_input_tokens_seen": 66925210, + "step": 3116, + "time_per_iteration": 2.5497376918792725 + }, + { + "auxiliary_loss_clip": 0.01075743, + "auxiliary_loss_mlp": 0.01002733, + "balance_loss_clip": 1.01699471, + "balance_loss_mlp": 1.00141001, + "epoch": 0.37479709012204654, + "flos": 64377491610240.0, + "grad_norm": 0.9781184322427354, + "language_loss": 0.59142959, + "learning_rate": 2.877380139210702e-06, + "loss": 0.61221439, + "num_input_tokens_seen": 66983880, + "step": 3117, + "time_per_iteration": 3.000652313232422 + }, + { + "auxiliary_loss_clip": 0.01149313, + "auxiliary_loss_mlp": 0.01033953, + "balance_loss_clip": 1.04955983, + "balance_loss_mlp": 1.0243566, + "epoch": 0.37491733301268565, + "flos": 23803962773760.0, + "grad_norm": 1.7045312165378947, + "language_loss": 0.76387167, + "learning_rate": 2.876680058823501e-06, + "loss": 0.78570437, + "num_input_tokens_seen": 67004280, + "step": 3118, + "time_per_iteration": 2.5589442253112793 + }, + { + "auxiliary_loss_clip": 0.01150782, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.04802775, + "balance_loss_mlp": 1.0201292, + "epoch": 0.3750375759033247, + "flos": 32160950167680.0, + "grad_norm": 1.831709276530866, + "language_loss": 0.65802348, + "learning_rate": 2.8759798454446314e-06, + "loss": 0.67982143, + "num_input_tokens_seen": 67027445, + "step": 3119, + "time_per_iteration": 2.5808918476104736 + }, + { + "auxiliary_loss_clip": 0.01178239, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.05237317, + "balance_loss_mlp": 1.02796578, + "epoch": 0.3751578187939638, + "flos": 23367791923200.0, + "grad_norm": 1.8772422491493554, + "language_loss": 0.81195843, + "learning_rate": 2.8752794991803173e-06, + "loss": 0.83410007, + "num_input_tokens_seen": 67045130, + "step": 3120, + "time_per_iteration": 2.483297109603882 + }, + { + "auxiliary_loss_clip": 0.01161299, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.05206382, + "balance_loss_mlp": 1.02341378, + "epoch": 0.37527806168460287, + "flos": 14605731878400.0, + "grad_norm": 2.338423105447056, + "language_loss": 0.75078845, + "learning_rate": 2.8745790201367976e-06, + "loss": 0.77272105, + "num_input_tokens_seen": 67060885, + "step": 3121, + "time_per_iteration": 2.4613256454467773 + }, + { + "auxiliary_loss_clip": 0.01192591, + "auxiliary_loss_mlp": 0.01037603, + "balance_loss_clip": 1.05431247, + "balance_loss_mlp": 1.02859652, + "epoch": 0.375398304575242, + "flos": 26390823431040.0, + "grad_norm": 1.8248159912763378, + "language_loss": 0.84048373, + "learning_rate": 2.8738784084203373e-06, + "loss": 0.8627857, + "num_input_tokens_seen": 67080960, + "step": 3122, + "time_per_iteration": 2.497607469558716 + }, + { + "auxiliary_loss_clip": 0.0115226, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.04605818, + "balance_loss_mlp": 1.02155077, + "epoch": 0.3755185474658811, + "flos": 22236605838720.0, + "grad_norm": 3.1320258020678935, + "language_loss": 0.78991497, + "learning_rate": 2.873177664137216e-06, + "loss": 0.81173569, + "num_input_tokens_seen": 67101890, + "step": 3123, + "time_per_iteration": 2.520380735397339 + }, + { + "auxiliary_loss_clip": 0.01142893, + "auxiliary_loss_mlp": 0.01025592, + "balance_loss_clip": 1.05087578, + "balance_loss_mlp": 1.01692593, + "epoch": 0.37563879035652015, + "flos": 30812935633920.0, + "grad_norm": 1.7947396057319003, + "language_loss": 0.69544816, + "learning_rate": 2.8724767873937384e-06, + "loss": 0.71713305, + "num_input_tokens_seen": 67126010, + "step": 3124, + "time_per_iteration": 2.6158249378204346 + }, + { + "auxiliary_loss_clip": 0.01161605, + "auxiliary_loss_mlp": 0.01034263, + "balance_loss_clip": 1.0502063, + "balance_loss_mlp": 1.0260253, + "epoch": 0.37575903324715926, + "flos": 20773533064320.0, + "grad_norm": 2.1100093570652163, + "language_loss": 0.87425297, + "learning_rate": 2.871775778296225e-06, + "loss": 0.89621162, + "num_input_tokens_seen": 67143100, + "step": 3125, + "time_per_iteration": 2.475726842880249 + }, + { + "auxiliary_loss_clip": 0.01179852, + "auxiliary_loss_mlp": 0.01032616, + "balance_loss_clip": 1.05676675, + "balance_loss_mlp": 1.02281749, + "epoch": 0.37587927613779837, + "flos": 18697681244160.0, + "grad_norm": 2.016334580385799, + "language_loss": 0.78344429, + "learning_rate": 2.8710746369510196e-06, + "loss": 0.80556905, + "num_input_tokens_seen": 67161085, + "step": 3126, + "time_per_iteration": 2.451413869857788 + }, + { + "auxiliary_loss_clip": 0.01157423, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.05222332, + "balance_loss_mlp": 1.02325952, + "epoch": 0.3759995190284374, + "flos": 13624796384640.0, + "grad_norm": 2.290953197637681, + "language_loss": 0.83373678, + "learning_rate": 2.8703733634644846e-06, + "loss": 0.85562897, + "num_input_tokens_seen": 67175840, + "step": 3127, + "time_per_iteration": 2.452998161315918 + }, + { + "auxiliary_loss_clip": 0.01188375, + "auxiliary_loss_mlp": 0.01030287, + "balance_loss_clip": 1.05390429, + "balance_loss_mlp": 1.02178097, + "epoch": 0.37611976191907653, + "flos": 20484847457280.0, + "grad_norm": 1.64422697702506, + "language_loss": 0.79357541, + "learning_rate": 2.869671957943002e-06, + "loss": 0.81576204, + "num_input_tokens_seen": 67194995, + "step": 3128, + "time_per_iteration": 2.4322943687438965 + }, + { + "auxiliary_loss_clip": 0.01156782, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.05445397, + "balance_loss_mlp": 1.02376628, + "epoch": 0.37624000480971564, + "flos": 21141797253120.0, + "grad_norm": 1.7667082301921835, + "language_loss": 0.74015152, + "learning_rate": 2.8689704204929747e-06, + "loss": 0.76204228, + "num_input_tokens_seen": 67214175, + "step": 3129, + "time_per_iteration": 2.487802743911743 + }, + { + "auxiliary_loss_clip": 0.01190428, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.05404365, + "balance_loss_mlp": 1.01968181, + "epoch": 0.3763602477003547, + "flos": 22564470205440.0, + "grad_norm": 1.7911917298512094, + "language_loss": 0.81018579, + "learning_rate": 2.8682687512208253e-06, + "loss": 0.83237594, + "num_input_tokens_seen": 67233185, + "step": 3130, + "time_per_iteration": 2.432046413421631 + }, + { + "auxiliary_loss_clip": 0.0118088, + "auxiliary_loss_mlp": 0.01034758, + "balance_loss_clip": 1.05217648, + "balance_loss_mlp": 1.02574587, + "epoch": 0.3764804905909938, + "flos": 27526857851520.0, + "grad_norm": 1.8574780558333144, + "language_loss": 0.80431747, + "learning_rate": 2.8675669502329972e-06, + "loss": 0.82647395, + "num_input_tokens_seen": 67254715, + "step": 3131, + "time_per_iteration": 2.5049939155578613 + }, + { + "auxiliary_loss_clip": 0.01176248, + "auxiliary_loss_mlp": 0.00763442, + "balance_loss_clip": 1.05272603, + "balance_loss_mlp": 1.00078869, + "epoch": 0.3766007334816329, + "flos": 22528092706560.0, + "grad_norm": 2.3010206646813103, + "language_loss": 0.8533507, + "learning_rate": 2.866865017635952e-06, + "loss": 0.8727476, + "num_input_tokens_seen": 67272535, + "step": 3132, + "time_per_iteration": 2.464721918106079 + }, + { + "auxiliary_loss_clip": 0.01146003, + "auxiliary_loss_mlp": 0.01027387, + "balance_loss_clip": 1.05251324, + "balance_loss_mlp": 1.01844013, + "epoch": 0.376720976372272, + "flos": 25957166532480.0, + "grad_norm": 1.5672637417947253, + "language_loss": 0.793033, + "learning_rate": 2.866162953536174e-06, + "loss": 0.81476688, + "num_input_tokens_seen": 67293505, + "step": 3133, + "time_per_iteration": 2.5626220703125 + }, + { + "auxiliary_loss_clip": 0.01159484, + "auxiliary_loss_mlp": 0.00763353, + "balance_loss_clip": 1.05027246, + "balance_loss_mlp": 1.00096583, + "epoch": 0.3768412192629111, + "flos": 18041162411520.0, + "grad_norm": 1.6020651849298397, + "language_loss": 0.75022525, + "learning_rate": 2.8654607580401634e-06, + "loss": 0.76945359, + "num_input_tokens_seen": 67313240, + "step": 3134, + "time_per_iteration": 2.4999778270721436 + }, + { + "auxiliary_loss_clip": 0.01074116, + "auxiliary_loss_mlp": 0.01002365, + "balance_loss_clip": 1.01676786, + "balance_loss_mlp": 1.00098765, + "epoch": 0.3769614621535502, + "flos": 62989472304000.0, + "grad_norm": 0.8801163871902931, + "language_loss": 0.65209067, + "learning_rate": 2.8647584312544446e-06, + "loss": 0.6728555, + "num_input_tokens_seen": 67378445, + "step": 3135, + "time_per_iteration": 3.056159734725952 + }, + { + "auxiliary_loss_clip": 0.0114137, + "auxiliary_loss_mlp": 0.00763137, + "balance_loss_clip": 1.04724526, + "balance_loss_mlp": 1.00083601, + "epoch": 0.37708170504418925, + "flos": 23661685002240.0, + "grad_norm": 1.3596607528114248, + "language_loss": 0.84957659, + "learning_rate": 2.864055973285559e-06, + "loss": 0.86862159, + "num_input_tokens_seen": 67400445, + "step": 3136, + "time_per_iteration": 2.60711932182312 + }, + { + "auxiliary_loss_clip": 0.01151246, + "auxiliary_loss_mlp": 0.01036329, + "balance_loss_clip": 1.0493294, + "balance_loss_mlp": 1.02716804, + "epoch": 0.37720194793482836, + "flos": 24423170353920.0, + "grad_norm": 1.8239451927971622, + "language_loss": 0.86332643, + "learning_rate": 2.8633533842400698e-06, + "loss": 0.88520223, + "num_input_tokens_seen": 67420645, + "step": 3137, + "time_per_iteration": 3.256028413772583 + }, + { + "auxiliary_loss_clip": 0.0117802, + "auxiliary_loss_mlp": 0.00763646, + "balance_loss_clip": 1.05426228, + "balance_loss_mlp": 1.00088239, + "epoch": 0.3773221908254674, + "flos": 20996502739200.0, + "grad_norm": 1.7874299134979899, + "language_loss": 0.77068555, + "learning_rate": 2.862650664224558e-06, + "loss": 0.79010224, + "num_input_tokens_seen": 67439495, + "step": 3138, + "time_per_iteration": 2.476050853729248 + }, + { + "auxiliary_loss_clip": 0.01174828, + "auxiliary_loss_mlp": 0.01025521, + "balance_loss_clip": 1.05631542, + "balance_loss_mlp": 1.01764727, + "epoch": 0.37744243371610653, + "flos": 37631724958080.0, + "grad_norm": 1.4478709301472377, + "language_loss": 0.69828653, + "learning_rate": 2.861947813345627e-06, + "loss": 0.72029006, + "num_input_tokens_seen": 67462195, + "step": 3139, + "time_per_iteration": 3.3796896934509277 + }, + { + "auxiliary_loss_clip": 0.01193073, + "auxiliary_loss_mlp": 0.00763529, + "balance_loss_clip": 1.05666709, + "balance_loss_mlp": 1.00077462, + "epoch": 0.37756267660674564, + "flos": 26140526484480.0, + "grad_norm": 2.532623964747018, + "language_loss": 0.7262494, + "learning_rate": 2.8612448317098974e-06, + "loss": 0.74581546, + "num_input_tokens_seen": 67482530, + "step": 3140, + "time_per_iteration": 3.207472562789917 + }, + { + "auxiliary_loss_clip": 0.01149641, + "auxiliary_loss_mlp": 0.00763236, + "balance_loss_clip": 1.04840446, + "balance_loss_mlp": 1.00077355, + "epoch": 0.3776829194973847, + "flos": 19427888828160.0, + "grad_norm": 2.0913726956607452, + "language_loss": 0.83086401, + "learning_rate": 2.8605417194240114e-06, + "loss": 0.84999275, + "num_input_tokens_seen": 67500890, + "step": 3141, + "time_per_iteration": 2.5316617488861084 + }, + { + "auxiliary_loss_clip": 0.01169511, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.05030966, + "balance_loss_mlp": 1.02156448, + "epoch": 0.3778031623880238, + "flos": 17382309194880.0, + "grad_norm": 1.7710796154901607, + "language_loss": 0.79167366, + "learning_rate": 2.8598384765946315e-06, + "loss": 0.81366432, + "num_input_tokens_seen": 67519545, + "step": 3142, + "time_per_iteration": 3.2323837280273438 + }, + { + "auxiliary_loss_clip": 0.01187718, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.05157638, + "balance_loss_mlp": 1.0216558, + "epoch": 0.3779234052786629, + "flos": 27125843437440.0, + "grad_norm": 2.0295518691693313, + "language_loss": 0.71220005, + "learning_rate": 2.8591351033284377e-06, + "loss": 0.73437637, + "num_input_tokens_seen": 67539275, + "step": 3143, + "time_per_iteration": 2.615920305252075 + }, + { + "auxiliary_loss_clip": 0.01178917, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.05163288, + "balance_loss_mlp": 1.02054691, + "epoch": 0.37804364816930197, + "flos": 19682639061120.0, + "grad_norm": 2.100986322883124, + "language_loss": 0.84192884, + "learning_rate": 2.8584315997321325e-06, + "loss": 0.86400419, + "num_input_tokens_seen": 67558280, + "step": 3144, + "time_per_iteration": 2.453859567642212 + }, + { + "auxiliary_loss_clip": 0.01188, + "auxiliary_loss_mlp": 0.01027556, + "balance_loss_clip": 1.05201435, + "balance_loss_mlp": 1.01895547, + "epoch": 0.3781638910599411, + "flos": 22702905221760.0, + "grad_norm": 2.443362317468289, + "language_loss": 0.77795631, + "learning_rate": 2.8577279659124356e-06, + "loss": 0.80011183, + "num_input_tokens_seen": 67575955, + "step": 3145, + "time_per_iteration": 2.4231045246124268 + }, + { + "auxiliary_loss_clip": 0.0117053, + "auxiliary_loss_mlp": 0.01028057, + "balance_loss_clip": 1.05040359, + "balance_loss_mlp": 1.02086592, + "epoch": 0.3782841339505802, + "flos": 14647604158080.0, + "grad_norm": 1.6972181207884232, + "language_loss": 0.831743, + "learning_rate": 2.857024201976089e-06, + "loss": 0.85372895, + "num_input_tokens_seen": 67593515, + "step": 3146, + "time_per_iteration": 2.4346871376037598 + }, + { + "auxiliary_loss_clip": 0.01157952, + "auxiliary_loss_mlp": 0.01025565, + "balance_loss_clip": 1.05263507, + "balance_loss_mlp": 1.01677299, + "epoch": 0.37840437684121925, + "flos": 32818223185920.0, + "grad_norm": 2.2101119973202574, + "language_loss": 0.72749776, + "learning_rate": 2.8563203080298516e-06, + "loss": 0.74933296, + "num_input_tokens_seen": 67614290, + "step": 3147, + "time_per_iteration": 2.61379337310791 + }, + { + "auxiliary_loss_clip": 0.01160387, + "auxiliary_loss_mlp": 0.00763133, + "balance_loss_clip": 1.05141902, + "balance_loss_mlp": 1.00067282, + "epoch": 0.37852461973185836, + "flos": 18369206346240.0, + "grad_norm": 4.6106595534715105, + "language_loss": 0.89026368, + "learning_rate": 2.855616284180505e-06, + "loss": 0.90949881, + "num_input_tokens_seen": 67631340, + "step": 3148, + "time_per_iteration": 2.561791181564331 + }, + { + "auxiliary_loss_clip": 0.01077209, + "auxiliary_loss_mlp": 0.01001221, + "balance_loss_clip": 1.01652598, + "balance_loss_mlp": 0.99983233, + "epoch": 0.37864486262249747, + "flos": 59500680117120.0, + "grad_norm": 0.8731548279943586, + "language_loss": 0.66127032, + "learning_rate": 2.8549121305348477e-06, + "loss": 0.68205464, + "num_input_tokens_seen": 67691125, + "step": 3149, + "time_per_iteration": 3.0365450382232666 + }, + { + "auxiliary_loss_clip": 0.01175242, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.05207467, + "balance_loss_mlp": 1.02201903, + "epoch": 0.3787651055131365, + "flos": 23363015414400.0, + "grad_norm": 2.1049245919539814, + "language_loss": 0.83170342, + "learning_rate": 2.8542078471997006e-06, + "loss": 0.85375196, + "num_input_tokens_seen": 67708740, + "step": 3150, + "time_per_iteration": 2.4768893718719482 + }, + { + "auxiliary_loss_clip": 0.01173493, + "auxiliary_loss_mlp": 0.01025705, + "balance_loss_clip": 1.05140281, + "balance_loss_mlp": 1.01784348, + "epoch": 0.37888534840377563, + "flos": 24601394661120.0, + "grad_norm": 1.7049322380606104, + "language_loss": 0.75572044, + "learning_rate": 2.8535034342819013e-06, + "loss": 0.77771235, + "num_input_tokens_seen": 67726150, + "step": 3151, + "time_per_iteration": 2.481928825378418 + }, + { + "auxiliary_loss_clip": 0.01187051, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.05344319, + "balance_loss_mlp": 1.02428484, + "epoch": 0.37900559129441475, + "flos": 23986891762560.0, + "grad_norm": 1.49349442565046, + "language_loss": 0.72766745, + "learning_rate": 2.85279889188831e-06, + "loss": 0.74986506, + "num_input_tokens_seen": 67746525, + "step": 3152, + "time_per_iteration": 2.44193172454834 + }, + { + "auxiliary_loss_clip": 0.01146559, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.04610348, + "balance_loss_mlp": 1.0184238, + "epoch": 0.3791258341850538, + "flos": 24644667571200.0, + "grad_norm": 2.245938894107152, + "language_loss": 0.8111288, + "learning_rate": 2.852094220125805e-06, + "loss": 0.83286893, + "num_input_tokens_seen": 67766035, + "step": 3153, + "time_per_iteration": 2.5510175228118896 + }, + { + "auxiliary_loss_clip": 0.01178136, + "auxiliary_loss_mlp": 0.01034986, + "balance_loss_clip": 1.05375159, + "balance_loss_mlp": 1.02642727, + "epoch": 0.3792460770756929, + "flos": 17420841509760.0, + "grad_norm": 2.159299189886023, + "language_loss": 0.71090901, + "learning_rate": 2.8513894191012846e-06, + "loss": 0.73304021, + "num_input_tokens_seen": 67785015, + "step": 3154, + "time_per_iteration": 2.4451329708099365 + }, + { + "auxiliary_loss_clip": 0.01190883, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.05481398, + "balance_loss_mlp": 1.02235866, + "epoch": 0.37936631996633197, + "flos": 24206557386240.0, + "grad_norm": 1.5625760769897412, + "language_loss": 0.78725827, + "learning_rate": 2.8506844889216664e-06, + "loss": 0.80947846, + "num_input_tokens_seen": 67804400, + "step": 3155, + "time_per_iteration": 2.4478907585144043 + }, + { + "auxiliary_loss_clip": 0.01066894, + "auxiliary_loss_mlp": 0.01002025, + "balance_loss_clip": 1.01437306, + "balance_loss_mlp": 1.00063026, + "epoch": 0.3794865628569711, + "flos": 70297114752000.0, + "grad_norm": 0.9887199281549752, + "language_loss": 0.62887514, + "learning_rate": 2.849979429693887e-06, + "loss": 0.64956433, + "num_input_tokens_seen": 67865385, + "step": 3156, + "time_per_iteration": 3.0906295776367188 + }, + { + "auxiliary_loss_clip": 0.01187254, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.05346489, + "balance_loss_mlp": 1.01985621, + "epoch": 0.3796068057476102, + "flos": 15779364860160.0, + "grad_norm": 2.3273739612660487, + "language_loss": 0.74154675, + "learning_rate": 2.8492742415249042e-06, + "loss": 0.76370084, + "num_input_tokens_seen": 67883030, + "step": 3157, + "time_per_iteration": 2.407865047454834 + }, + { + "auxiliary_loss_clip": 0.01185958, + "auxiliary_loss_mlp": 0.01024848, + "balance_loss_clip": 1.05063379, + "balance_loss_mlp": 1.01676512, + "epoch": 0.37972704863824924, + "flos": 25191694771200.0, + "grad_norm": 1.8416808400383977, + "language_loss": 0.76109278, + "learning_rate": 2.848568924521694e-06, + "loss": 0.78320086, + "num_input_tokens_seen": 67903810, + "step": 3158, + "time_per_iteration": 2.467496395111084 + }, + { + "auxiliary_loss_clip": 0.01166156, + "auxiliary_loss_mlp": 0.01026908, + "balance_loss_clip": 1.04768264, + "balance_loss_mlp": 1.01818764, + "epoch": 0.37984729152888835, + "flos": 26210372480640.0, + "grad_norm": 1.8905754883693315, + "language_loss": 0.73526585, + "learning_rate": 2.8478634787912526e-06, + "loss": 0.75719655, + "num_input_tokens_seen": 67921865, + "step": 3159, + "time_per_iteration": 2.5172624588012695 + }, + { + "auxiliary_loss_clip": 0.01176627, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.05203128, + "balance_loss_mlp": 1.01914203, + "epoch": 0.37996753441952746, + "flos": 25629302165760.0, + "grad_norm": 3.5990998283965783, + "language_loss": 0.76533628, + "learning_rate": 2.847157904440596e-06, + "loss": 0.78737813, + "num_input_tokens_seen": 67941595, + "step": 3160, + "time_per_iteration": 2.4966588020324707 + }, + { + "auxiliary_loss_clip": 0.01175337, + "auxiliary_loss_mlp": 0.01028677, + "balance_loss_clip": 1.05186117, + "balance_loss_mlp": 1.02030849, + "epoch": 0.3800877773101665, + "flos": 20118414862080.0, + "grad_norm": 1.5411592901592224, + "language_loss": 0.73935425, + "learning_rate": 2.846452201576759e-06, + "loss": 0.76139438, + "num_input_tokens_seen": 67960970, + "step": 3161, + "time_per_iteration": 2.4609439373016357 + }, + { + "auxiliary_loss_clip": 0.01066184, + "auxiliary_loss_mlp": 0.01003256, + "balance_loss_clip": 1.01581335, + "balance_loss_mlp": 1.00184965, + "epoch": 0.38020802020080563, + "flos": 63053608037760.0, + "grad_norm": 0.866208791051278, + "language_loss": 0.62767988, + "learning_rate": 2.845746370306795e-06, + "loss": 0.64837426, + "num_input_tokens_seen": 68026160, + "step": 3162, + "time_per_iteration": 3.16306734085083 + }, + { + "auxiliary_loss_clip": 0.01177162, + "auxiliary_loss_mlp": 0.0102974, + "balance_loss_clip": 1.05346608, + "balance_loss_mlp": 1.02177107, + "epoch": 0.38032826309144474, + "flos": 21288420570240.0, + "grad_norm": 1.8770742998895633, + "language_loss": 0.78245687, + "learning_rate": 2.84504041073778e-06, + "loss": 0.80452585, + "num_input_tokens_seen": 68044575, + "step": 3163, + "time_per_iteration": 2.4679360389709473 + }, + { + "auxiliary_loss_clip": 0.01154571, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.05049682, + "balance_loss_mlp": 1.02932358, + "epoch": 0.3804485059820838, + "flos": 18954119416320.0, + "grad_norm": 1.6552211631212665, + "language_loss": 0.78906953, + "learning_rate": 2.844334322976806e-06, + "loss": 0.81099772, + "num_input_tokens_seen": 68064790, + "step": 3164, + "time_per_iteration": 3.2394626140594482 + }, + { + "auxiliary_loss_clip": 0.01133401, + "auxiliary_loss_mlp": 0.01037691, + "balance_loss_clip": 1.0464859, + "balance_loss_mlp": 1.02885163, + "epoch": 0.3805687488727229, + "flos": 21833759831040.0, + "grad_norm": 1.8288156683620507, + "language_loss": 0.83728778, + "learning_rate": 2.8436281071309866e-06, + "loss": 0.85899878, + "num_input_tokens_seen": 68083330, + "step": 3165, + "time_per_iteration": 3.23736310005188 + }, + { + "auxiliary_loss_clip": 0.01047656, + "auxiliary_loss_mlp": 0.01005312, + "balance_loss_clip": 1.01555443, + "balance_loss_mlp": 1.00388181, + "epoch": 0.380688991763362, + "flos": 58546209968640.0, + "grad_norm": 0.7256345337621618, + "language_loss": 0.53034234, + "learning_rate": 2.842921763307455e-06, + "loss": 0.55087203, + "num_input_tokens_seen": 68146140, + "step": 3166, + "time_per_iteration": 3.101109027862549 + }, + { + "auxiliary_loss_clip": 0.01152612, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.04843152, + "balance_loss_mlp": 1.02113509, + "epoch": 0.38080923465400107, + "flos": 23799509487360.0, + "grad_norm": 1.877918620692197, + "language_loss": 0.82658517, + "learning_rate": 2.842215291613361e-06, + "loss": 0.84840053, + "num_input_tokens_seen": 68164520, + "step": 3167, + "time_per_iteration": 3.2507102489471436 + }, + { + "auxiliary_loss_clip": 0.01009392, + "auxiliary_loss_mlp": 0.01007699, + "balance_loss_clip": 1.01433146, + "balance_loss_mlp": 1.00642383, + "epoch": 0.3809294775446402, + "flos": 54969866380800.0, + "grad_norm": 0.7809892694138563, + "language_loss": 0.59179974, + "learning_rate": 2.8415086921558774e-06, + "loss": 0.61197066, + "num_input_tokens_seen": 68227945, + "step": 3168, + "time_per_iteration": 4.110116720199585 + }, + { + "auxiliary_loss_clip": 0.01143417, + "auxiliary_loss_mlp": 0.01025315, + "balance_loss_clip": 1.04340196, + "balance_loss_mlp": 1.01742911, + "epoch": 0.38104972043527924, + "flos": 24643697904000.0, + "grad_norm": 1.5797470237746236, + "language_loss": 0.7851339, + "learning_rate": 2.840801965042194e-06, + "loss": 0.80682123, + "num_input_tokens_seen": 68247405, + "step": 3169, + "time_per_iteration": 2.7408645153045654 + }, + { + "auxiliary_loss_clip": 0.01149173, + "auxiliary_loss_mlp": 0.010257, + "balance_loss_clip": 1.04521787, + "balance_loss_mlp": 1.01660991, + "epoch": 0.38116996332591835, + "flos": 22856783086080.0, + "grad_norm": 1.6993328206605518, + "language_loss": 0.83599293, + "learning_rate": 2.840095110379521e-06, + "loss": 0.85774171, + "num_input_tokens_seen": 68266925, + "step": 3170, + "time_per_iteration": 2.5126190185546875 + }, + { + "auxiliary_loss_clip": 0.01039245, + "auxiliary_loss_mlp": 0.01004343, + "balance_loss_clip": 1.01573229, + "balance_loss_mlp": 1.00299597, + "epoch": 0.38129020621655746, + "flos": 60836160804480.0, + "grad_norm": 0.7326084716476218, + "language_loss": 0.53903019, + "learning_rate": 2.8393881282750884e-06, + "loss": 0.559466, + "num_input_tokens_seen": 68329755, + "step": 3171, + "time_per_iteration": 3.0684103965759277 + }, + { + "auxiliary_loss_clip": 0.01157629, + "auxiliary_loss_mlp": 0.01026547, + "balance_loss_clip": 1.0510273, + "balance_loss_mlp": 1.01781523, + "epoch": 0.3814104491071965, + "flos": 21648101408640.0, + "grad_norm": 1.8993201331274752, + "language_loss": 0.78697097, + "learning_rate": 2.838681018836144e-06, + "loss": 0.80881274, + "num_input_tokens_seen": 68347075, + "step": 3172, + "time_per_iteration": 2.505084991455078 + }, + { + "auxiliary_loss_clip": 0.01145656, + "auxiliary_loss_mlp": 0.0076262, + "balance_loss_clip": 1.04640985, + "balance_loss_mlp": 1.00063705, + "epoch": 0.3815306919978356, + "flos": 19099090707840.0, + "grad_norm": 2.28643976596327, + "language_loss": 0.78313833, + "learning_rate": 2.837973782169955e-06, + "loss": 0.80222112, + "num_input_tokens_seen": 68365450, + "step": 3173, + "time_per_iteration": 2.534075975418091 + }, + { + "auxiliary_loss_clip": 0.01085271, + "auxiliary_loss_mlp": 0.01003408, + "balance_loss_clip": 1.01607049, + "balance_loss_mlp": 1.00207829, + "epoch": 0.38165093488847474, + "flos": 67067918156160.0, + "grad_norm": 0.8068834453526245, + "language_loss": 0.59189445, + "learning_rate": 2.8372664183838096e-06, + "loss": 0.61278123, + "num_input_tokens_seen": 68428470, + "step": 3174, + "time_per_iteration": 3.082252025604248 + }, + { + "auxiliary_loss_clip": 0.01185871, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.05211544, + "balance_loss_mlp": 1.02312648, + "epoch": 0.3817711777791138, + "flos": 22341105480960.0, + "grad_norm": 2.045209278304174, + "language_loss": 0.67827439, + "learning_rate": 2.836558927585015e-06, + "loss": 0.70045, + "num_input_tokens_seen": 68445440, + "step": 3175, + "time_per_iteration": 2.4387662410736084 + }, + { + "auxiliary_loss_clip": 0.01177258, + "auxiliary_loss_mlp": 0.01035359, + "balance_loss_clip": 1.05313754, + "balance_loss_mlp": 1.02779508, + "epoch": 0.3818914206697529, + "flos": 22820621068800.0, + "grad_norm": 1.7481426922130603, + "language_loss": 0.82139122, + "learning_rate": 2.8358513098808957e-06, + "loss": 0.84351742, + "num_input_tokens_seen": 68465755, + "step": 3176, + "time_per_iteration": 2.476637601852417 + }, + { + "auxiliary_loss_clip": 0.01121367, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.04558897, + "balance_loss_mlp": 1.02015471, + "epoch": 0.382011663560392, + "flos": 24386074583040.0, + "grad_norm": 1.8662169615465982, + "language_loss": 0.7679733, + "learning_rate": 2.835143565378798e-06, + "loss": 0.78947306, + "num_input_tokens_seen": 68486220, + "step": 3177, + "time_per_iteration": 2.6047961711883545 + }, + { + "auxiliary_loss_clip": 0.01114251, + "auxiliary_loss_mlp": 0.01023195, + "balance_loss_clip": 1.04525208, + "balance_loss_mlp": 1.01493096, + "epoch": 0.38213190645103107, + "flos": 21981568296960.0, + "grad_norm": 1.7712434822843037, + "language_loss": 0.78555936, + "learning_rate": 2.8344356941860847e-06, + "loss": 0.80693382, + "num_input_tokens_seen": 68505850, + "step": 3178, + "time_per_iteration": 2.602914333343506 + }, + { + "auxiliary_loss_clip": 0.01143106, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.05021894, + "balance_loss_mlp": 1.0222609, + "epoch": 0.3822521493416702, + "flos": 35516945773440.0, + "grad_norm": 3.003331579831365, + "language_loss": 0.65981853, + "learning_rate": 2.8337276964101403e-06, + "loss": 0.68155611, + "num_input_tokens_seen": 68526290, + "step": 3179, + "time_per_iteration": 2.650407075881958 + }, + { + "auxiliary_loss_clip": 0.0117581, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.05220747, + "balance_loss_mlp": 1.02213621, + "epoch": 0.3823723922323093, + "flos": 21069904181760.0, + "grad_norm": 1.7736032645276683, + "language_loss": 0.76258481, + "learning_rate": 2.833019572158367e-06, + "loss": 0.78464627, + "num_input_tokens_seen": 68544725, + "step": 3180, + "time_per_iteration": 2.4794230461120605 + }, + { + "auxiliary_loss_clip": 0.01161069, + "auxiliary_loss_mlp": 0.0102874, + "balance_loss_clip": 1.05180693, + "balance_loss_mlp": 1.02074075, + "epoch": 0.38249263512294834, + "flos": 19789149864960.0, + "grad_norm": 3.276709976472021, + "language_loss": 0.80086541, + "learning_rate": 2.8323113215381872e-06, + "loss": 0.82276356, + "num_input_tokens_seen": 68563070, + "step": 3181, + "time_per_iteration": 2.5403943061828613 + }, + { + "auxiliary_loss_clip": 0.0114344, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.04902911, + "balance_loss_mlp": 1.02639914, + "epoch": 0.38261287801358745, + "flos": 21433930565760.0, + "grad_norm": 1.880533396872952, + "language_loss": 0.76010156, + "learning_rate": 2.831602944657042e-06, + "loss": 0.78188616, + "num_input_tokens_seen": 68581150, + "step": 3182, + "time_per_iteration": 2.534196376800537 + }, + { + "auxiliary_loss_clip": 0.01166568, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.05014265, + "balance_loss_mlp": 1.01731062, + "epoch": 0.38273312090422656, + "flos": 21981568296960.0, + "grad_norm": 2.2313793170415566, + "language_loss": 0.74304986, + "learning_rate": 2.830894441622391e-06, + "loss": 0.76496804, + "num_input_tokens_seen": 68597800, + "step": 3183, + "time_per_iteration": 2.525327682495117 + }, + { + "auxiliary_loss_clip": 0.01142759, + "auxiliary_loss_mlp": 0.00762995, + "balance_loss_clip": 1.04491472, + "balance_loss_mlp": 1.00059462, + "epoch": 0.3828533637948656, + "flos": 24790895838720.0, + "grad_norm": 1.7308028297022664, + "language_loss": 0.79919136, + "learning_rate": 2.8301858125417134e-06, + "loss": 0.81824887, + "num_input_tokens_seen": 68617640, + "step": 3184, + "time_per_iteration": 2.560779571533203 + }, + { + "auxiliary_loss_clip": 0.01161167, + "auxiliary_loss_mlp": 0.0102638, + "balance_loss_clip": 1.0539391, + "balance_loss_mlp": 1.0188098, + "epoch": 0.38297360668550473, + "flos": 22455445449600.0, + "grad_norm": 1.7797336657597636, + "language_loss": 0.73623663, + "learning_rate": 2.8294770575225082e-06, + "loss": 0.75811207, + "num_input_tokens_seen": 68637770, + "step": 3185, + "time_per_iteration": 2.496901273727417 + }, + { + "auxiliary_loss_clip": 0.01178426, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.05611134, + "balance_loss_mlp": 1.02009118, + "epoch": 0.3830938495761438, + "flos": 24896903852160.0, + "grad_norm": 1.7384096997192884, + "language_loss": 0.83674181, + "learning_rate": 2.828768176672293e-06, + "loss": 0.85881066, + "num_input_tokens_seen": 68656885, + "step": 3186, + "time_per_iteration": 2.4967262744903564 + }, + { + "auxiliary_loss_clip": 0.01142251, + "auxiliary_loss_mlp": 0.0102566, + "balance_loss_clip": 1.04794359, + "balance_loss_mlp": 1.01735139, + "epoch": 0.3832140924667829, + "flos": 33036236784000.0, + "grad_norm": 1.7222632593203098, + "language_loss": 0.71677762, + "learning_rate": 2.8280591700986044e-06, + "loss": 0.73845673, + "num_input_tokens_seen": 68678750, + "step": 3187, + "time_per_iteration": 2.625314712524414 + }, + { + "auxiliary_loss_clip": 0.01165163, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.05163944, + "balance_loss_mlp": 1.02365804, + "epoch": 0.383334335357422, + "flos": 31903721896320.0, + "grad_norm": 1.9111060376373055, + "language_loss": 0.74984473, + "learning_rate": 2.827350037908999e-06, + "loss": 0.77181756, + "num_input_tokens_seen": 68698190, + "step": 3188, + "time_per_iteration": 2.5840306282043457 + }, + { + "auxiliary_loss_clip": 0.01150755, + "auxiliary_loss_mlp": 0.01028935, + "balance_loss_clip": 1.04868817, + "balance_loss_mlp": 1.02004838, + "epoch": 0.38345457824806106, + "flos": 19791915212160.0, + "grad_norm": 2.0075226751293744, + "language_loss": 0.79067981, + "learning_rate": 2.8266407802110496e-06, + "loss": 0.81247675, + "num_input_tokens_seen": 68716445, + "step": 3189, + "time_per_iteration": 2.536592721939087 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.04265285, + "balance_loss_mlp": 1.02103519, + "epoch": 0.3835748211387002, + "flos": 22419391173120.0, + "grad_norm": 1.8423770531532775, + "language_loss": 0.76526183, + "learning_rate": 2.8259313971123515e-06, + "loss": 0.78662956, + "num_input_tokens_seen": 68737565, + "step": 3190, + "time_per_iteration": 2.66110897064209 + }, + { + "auxiliary_loss_clip": 0.01170949, + "auxiliary_loss_mlp": 0.01025673, + "balance_loss_clip": 1.05324817, + "balance_loss_mlp": 1.01795411, + "epoch": 0.3836950640293393, + "flos": 25118436983040.0, + "grad_norm": 2.067459349283177, + "language_loss": 0.78136671, + "learning_rate": 2.8252218887205166e-06, + "loss": 0.80333292, + "num_input_tokens_seen": 68758255, + "step": 3191, + "time_per_iteration": 3.246490478515625 + }, + { + "auxiliary_loss_clip": 0.01120386, + "auxiliary_loss_mlp": 0.01029218, + "balance_loss_clip": 1.0492295, + "balance_loss_mlp": 1.02131999, + "epoch": 0.38381530691997834, + "flos": 21799213925760.0, + "grad_norm": 1.912341454415238, + "language_loss": 0.80635965, + "learning_rate": 2.824512255143178e-06, + "loss": 0.82785571, + "num_input_tokens_seen": 68777490, + "step": 3192, + "time_per_iteration": 3.694352388381958 + }, + { + "auxiliary_loss_clip": 0.01148671, + "auxiliary_loss_mlp": 0.01023547, + "balance_loss_clip": 1.04973173, + "balance_loss_mlp": 1.01601899, + "epoch": 0.38393554981061745, + "flos": 21252689516160.0, + "grad_norm": 1.6832719840583708, + "language_loss": 0.79179144, + "learning_rate": 2.8238024964879855e-06, + "loss": 0.81351364, + "num_input_tokens_seen": 68798385, + "step": 3193, + "time_per_iteration": 3.386354684829712 + }, + { + "auxiliary_loss_clip": 0.01192599, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.05689216, + "balance_loss_mlp": 1.02232099, + "epoch": 0.38405579270125656, + "flos": 17019360218880.0, + "grad_norm": 2.128503737840623, + "language_loss": 0.76395535, + "learning_rate": 2.8230926128626095e-06, + "loss": 0.78619224, + "num_input_tokens_seen": 68816880, + "step": 3194, + "time_per_iteration": 3.1593387126922607 + }, + { + "auxiliary_loss_clip": 0.01156584, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.05037189, + "balance_loss_mlp": 1.02047729, + "epoch": 0.3841760355918956, + "flos": 21835375943040.0, + "grad_norm": 1.8631716530833362, + "language_loss": 0.79378551, + "learning_rate": 2.822382604374738e-06, + "loss": 0.81564772, + "num_input_tokens_seen": 68835805, + "step": 3195, + "time_per_iteration": 2.5086755752563477 + }, + { + "auxiliary_loss_clip": 0.01160662, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.05367565, + "balance_loss_mlp": 1.0255059, + "epoch": 0.3842962784825347, + "flos": 25915114684800.0, + "grad_norm": 1.833045694517602, + "language_loss": 0.65254748, + "learning_rate": 2.8216724711320793e-06, + "loss": 0.67449415, + "num_input_tokens_seen": 68854930, + "step": 3196, + "time_per_iteration": 2.5682830810546875 + }, + { + "auxiliary_loss_clip": 0.01187226, + "auxiliary_loss_mlp": 0.0076203, + "balance_loss_clip": 1.05408287, + "balance_loss_mlp": 1.00057459, + "epoch": 0.38441652137317384, + "flos": 25337492075520.0, + "grad_norm": 1.6941106336955158, + "language_loss": 0.79502535, + "learning_rate": 2.820962213242361e-06, + "loss": 0.81451786, + "num_input_tokens_seen": 68874260, + "step": 3197, + "time_per_iteration": 2.4837470054626465 + }, + { + "auxiliary_loss_clip": 0.01170498, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.0546366, + "balance_loss_mlp": 1.02571774, + "epoch": 0.3845367642638129, + "flos": 18113486446080.0, + "grad_norm": 2.0158098986431283, + "language_loss": 0.83977127, + "learning_rate": 2.8202518308133264e-06, + "loss": 0.86181533, + "num_input_tokens_seen": 68891535, + "step": 3198, + "time_per_iteration": 2.4352469444274902 + }, + { + "auxiliary_loss_clip": 0.01189035, + "auxiliary_loss_mlp": 0.01030203, + "balance_loss_clip": 1.05349898, + "balance_loss_mlp": 1.02134013, + "epoch": 0.384657007154452, + "flos": 25228395492480.0, + "grad_norm": 2.0817545060715155, + "language_loss": 0.72954822, + "learning_rate": 2.8195413239527426e-06, + "loss": 0.75174057, + "num_input_tokens_seen": 68911275, + "step": 3199, + "time_per_iteration": 2.461425304412842 + }, + { + "auxiliary_loss_clip": 0.01170922, + "auxiliary_loss_mlp": 0.01027606, + "balance_loss_clip": 1.05137038, + "balance_loss_mlp": 1.01950574, + "epoch": 0.38477725004509106, + "flos": 19865855358720.0, + "grad_norm": 1.9679412880398812, + "language_loss": 0.80538732, + "learning_rate": 2.8188306927683906e-06, + "loss": 0.82737255, + "num_input_tokens_seen": 68930745, + "step": 3200, + "time_per_iteration": 2.4712040424346924 + }, + { + "auxiliary_loss_clip": 0.01164079, + "auxiliary_loss_mlp": 0.01026071, + "balance_loss_clip": 1.05453467, + "balance_loss_mlp": 1.01833105, + "epoch": 0.38489749293573017, + "flos": 18259391491200.0, + "grad_norm": 2.004297863251266, + "language_loss": 0.75104862, + "learning_rate": 2.818119937368074e-06, + "loss": 0.77295017, + "num_input_tokens_seen": 68949380, + "step": 3201, + "time_per_iteration": 2.48164439201355 + }, + { + "auxiliary_loss_clip": 0.01179806, + "auxiliary_loss_mlp": 0.01026507, + "balance_loss_clip": 1.05305719, + "balance_loss_mlp": 1.01782882, + "epoch": 0.3850177358263693, + "flos": 24389163152640.0, + "grad_norm": 2.1094564712690023, + "language_loss": 0.65306115, + "learning_rate": 2.817409057859613e-06, + "loss": 0.67512429, + "num_input_tokens_seen": 68968370, + "step": 3202, + "time_per_iteration": 2.4926886558532715 + }, + { + "auxiliary_loss_clip": 0.01123878, + "auxiliary_loss_mlp": 0.01028141, + "balance_loss_clip": 1.0455153, + "balance_loss_mlp": 1.01949215, + "epoch": 0.38513797871700833, + "flos": 17671533505920.0, + "grad_norm": 1.8996655057402256, + "language_loss": 0.79070032, + "learning_rate": 2.8166980543508482e-06, + "loss": 0.81222057, + "num_input_tokens_seen": 68984260, + "step": 3203, + "time_per_iteration": 2.5463900566101074 + }, + { + "auxiliary_loss_clip": 0.01191942, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.05714226, + "balance_loss_mlp": 1.02154148, + "epoch": 0.38525822160764744, + "flos": 25739583897600.0, + "grad_norm": 7.248596264867761, + "language_loss": 0.80085993, + "learning_rate": 2.815986926949638e-06, + "loss": 0.82307667, + "num_input_tokens_seen": 69002760, + "step": 3204, + "time_per_iteration": 2.4806149005889893 + }, + { + "auxiliary_loss_clip": 0.01177053, + "auxiliary_loss_mlp": 0.01027443, + "balance_loss_clip": 1.05599904, + "balance_loss_mlp": 1.01984358, + "epoch": 0.38537846449828655, + "flos": 20193647898240.0, + "grad_norm": 1.8963975084712985, + "language_loss": 0.80122173, + "learning_rate": 2.8152756757638597e-06, + "loss": 0.82326669, + "num_input_tokens_seen": 69021260, + "step": 3205, + "time_per_iteration": 2.477736234664917 + }, + { + "auxiliary_loss_clip": 0.01172802, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.05343008, + "balance_loss_mlp": 1.01859629, + "epoch": 0.3854987073889256, + "flos": 23039352938880.0, + "grad_norm": 1.8365686127039333, + "language_loss": 0.84219998, + "learning_rate": 2.8145643009014093e-06, + "loss": 0.86419499, + "num_input_tokens_seen": 69039755, + "step": 3206, + "time_per_iteration": 2.4714813232421875 + }, + { + "auxiliary_loss_clip": 0.01175339, + "auxiliary_loss_mlp": 0.01026571, + "balance_loss_clip": 1.05363345, + "balance_loss_mlp": 1.01906705, + "epoch": 0.3856189502795647, + "flos": 20190631155840.0, + "grad_norm": 1.8864911287417003, + "language_loss": 0.79259455, + "learning_rate": 2.813852802470202e-06, + "loss": 0.8146137, + "num_input_tokens_seen": 69057650, + "step": 3207, + "time_per_iteration": 2.4524972438812256 + }, + { + "auxiliary_loss_clip": 0.01154794, + "auxiliary_loss_mlp": 0.01026161, + "balance_loss_clip": 1.05061734, + "balance_loss_mlp": 1.01760793, + "epoch": 0.38573919317020383, + "flos": 25702631781120.0, + "grad_norm": 2.1672101529886496, + "language_loss": 0.72634465, + "learning_rate": 2.8131411805781717e-06, + "loss": 0.74815422, + "num_input_tokens_seen": 69077775, + "step": 3208, + "time_per_iteration": 2.5233190059661865 + }, + { + "auxiliary_loss_clip": 0.01162423, + "auxiliary_loss_mlp": 0.01030454, + "balance_loss_clip": 1.05395591, + "balance_loss_mlp": 1.02187729, + "epoch": 0.3858594360608429, + "flos": 29821405628160.0, + "grad_norm": 2.208506737034783, + "language_loss": 0.6433804, + "learning_rate": 2.8124294353332707e-06, + "loss": 0.66530919, + "num_input_tokens_seen": 69096450, + "step": 3209, + "time_per_iteration": 2.551723003387451 + }, + { + "auxiliary_loss_clip": 0.0115224, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.05115402, + "balance_loss_mlp": 1.02100396, + "epoch": 0.385979678951482, + "flos": 24790428961920.0, + "grad_norm": 1.7663668954383198, + "language_loss": 0.77496374, + "learning_rate": 2.8117175668434713e-06, + "loss": 0.79677403, + "num_input_tokens_seen": 69116110, + "step": 3210, + "time_per_iteration": 2.5684077739715576 + }, + { + "auxiliary_loss_clip": 0.01189135, + "auxiliary_loss_mlp": 0.0102436, + "balance_loss_clip": 1.05399179, + "balance_loss_mlp": 1.01655209, + "epoch": 0.3860999218421211, + "flos": 21287881866240.0, + "grad_norm": 3.361303882772123, + "language_loss": 0.70240217, + "learning_rate": 2.811005575216762e-06, + "loss": 0.72453713, + "num_input_tokens_seen": 69134825, + "step": 3211, + "time_per_iteration": 2.4319911003112793 + }, + { + "auxiliary_loss_clip": 0.01138174, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.04848516, + "balance_loss_mlp": 1.02258205, + "epoch": 0.38622016473276016, + "flos": 24536720223360.0, + "grad_norm": 1.4284564901491403, + "language_loss": 0.78813976, + "learning_rate": 2.8102934605611513e-06, + "loss": 0.809829, + "num_input_tokens_seen": 69156460, + "step": 3212, + "time_per_iteration": 2.573782205581665 + }, + { + "auxiliary_loss_clip": 0.01167637, + "auxiliary_loss_mlp": 0.01027573, + "balance_loss_clip": 1.05499506, + "balance_loss_mlp": 1.01966965, + "epoch": 0.3863404076233993, + "flos": 20558212986240.0, + "grad_norm": 2.4921149517652212, + "language_loss": 0.67301804, + "learning_rate": 2.8095812229846665e-06, + "loss": 0.69497013, + "num_input_tokens_seen": 69176420, + "step": 3213, + "time_per_iteration": 2.491069793701172 + }, + { + "auxiliary_loss_clip": 0.01159569, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.05010223, + "balance_loss_mlp": 1.01982975, + "epoch": 0.3864606505140384, + "flos": 22346277039360.0, + "grad_norm": 2.929142806388339, + "language_loss": 0.68555355, + "learning_rate": 2.808868862595355e-06, + "loss": 0.70742965, + "num_input_tokens_seen": 69196665, + "step": 3214, + "time_per_iteration": 2.507185697555542 + }, + { + "auxiliary_loss_clip": 0.01177345, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.05302083, + "balance_loss_mlp": 1.02425671, + "epoch": 0.38658089340467744, + "flos": 25703601448320.0, + "grad_norm": 2.0995455321936007, + "language_loss": 0.79442453, + "learning_rate": 2.8081563795012795e-06, + "loss": 0.81652045, + "num_input_tokens_seen": 69216290, + "step": 3215, + "time_per_iteration": 2.4962172508239746 + }, + { + "auxiliary_loss_clip": 0.01166214, + "auxiliary_loss_mlp": 0.01024746, + "balance_loss_clip": 1.05090523, + "balance_loss_mlp": 1.01689577, + "epoch": 0.38670113629531655, + "flos": 33802534558080.0, + "grad_norm": 1.678800266986969, + "language_loss": 0.73683947, + "learning_rate": 2.807443773810524e-06, + "loss": 0.75874907, + "num_input_tokens_seen": 69237550, + "step": 3216, + "time_per_iteration": 2.6362569332122803 + }, + { + "auxiliary_loss_clip": 0.01145739, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.05215621, + "balance_loss_mlp": 1.02293801, + "epoch": 0.3868213791859556, + "flos": 23331522165120.0, + "grad_norm": 2.096905531403631, + "language_loss": 0.89434588, + "learning_rate": 2.80673104563119e-06, + "loss": 0.91610944, + "num_input_tokens_seen": 69258175, + "step": 3217, + "time_per_iteration": 2.5602893829345703 + }, + { + "auxiliary_loss_clip": 0.01171479, + "auxiliary_loss_mlp": 0.01023295, + "balance_loss_clip": 1.05332553, + "balance_loss_mlp": 1.01564145, + "epoch": 0.3869416220765947, + "flos": 18441530380800.0, + "grad_norm": 2.1503520618165117, + "language_loss": 0.78740221, + "learning_rate": 2.8060181950713976e-06, + "loss": 0.80934995, + "num_input_tokens_seen": 69274965, + "step": 3218, + "time_per_iteration": 3.942660093307495 + }, + { + "auxiliary_loss_clip": 0.01145032, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.04966664, + "balance_loss_mlp": 1.01953793, + "epoch": 0.3870618649672338, + "flos": 15632992938240.0, + "grad_norm": 2.1452018011343923, + "language_loss": 0.80616432, + "learning_rate": 2.805305222239286e-06, + "loss": 0.82789779, + "num_input_tokens_seen": 69292220, + "step": 3219, + "time_per_iteration": 2.5046682357788086 + }, + { + "auxiliary_loss_clip": 0.01154564, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.05003524, + "balance_loss_mlp": 1.02031446, + "epoch": 0.3871821078578729, + "flos": 23513804709120.0, + "grad_norm": 2.27420562599294, + "language_loss": 0.73998421, + "learning_rate": 2.8045921272430118e-06, + "loss": 0.76181495, + "num_input_tokens_seen": 69311900, + "step": 3220, + "time_per_iteration": 3.2807748317718506 + }, + { + "auxiliary_loss_clip": 0.01181119, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.05299664, + "balance_loss_mlp": 1.02130604, + "epoch": 0.387302350748512, + "flos": 17778259791360.0, + "grad_norm": 3.9264526609141974, + "language_loss": 0.7621215, + "learning_rate": 2.803878910190753e-06, + "loss": 0.78423214, + "num_input_tokens_seen": 69328820, + "step": 3221, + "time_per_iteration": 3.17667555809021 + }, + { + "auxiliary_loss_clip": 0.01177719, + "auxiliary_loss_mlp": 0.01027392, + "balance_loss_clip": 1.05275118, + "balance_loss_mlp": 1.01907146, + "epoch": 0.3874225936391511, + "flos": 11503409097600.0, + "grad_norm": 3.1250988261114214, + "language_loss": 0.82141954, + "learning_rate": 2.8031655711907017e-06, + "loss": 0.84347063, + "num_input_tokens_seen": 69342525, + "step": 3222, + "time_per_iteration": 2.402496814727783 + }, + { + "auxiliary_loss_clip": 0.01178493, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.05509162, + "balance_loss_mlp": 1.02271891, + "epoch": 0.38754283652979016, + "flos": 21945154884480.0, + "grad_norm": 1.9775449140909926, + "language_loss": 0.80763507, + "learning_rate": 2.8024521103510723e-06, + "loss": 0.82972908, + "num_input_tokens_seen": 69359295, + "step": 3223, + "time_per_iteration": 2.492771625518799 + }, + { + "auxiliary_loss_clip": 0.01172606, + "auxiliary_loss_mlp": 0.01026987, + "balance_loss_clip": 1.0505451, + "balance_loss_mlp": 1.01930976, + "epoch": 0.38766307942042927, + "flos": 21175984022400.0, + "grad_norm": 2.230670351550865, + "language_loss": 0.75071359, + "learning_rate": 2.8017385277800952e-06, + "loss": 0.77270949, + "num_input_tokens_seen": 69377650, + "step": 3224, + "time_per_iteration": 2.457061529159546 + }, + { + "auxiliary_loss_clip": 0.01148933, + "auxiliary_loss_mlp": 0.01029974, + "balance_loss_clip": 1.04999447, + "balance_loss_mlp": 1.02175474, + "epoch": 0.3877833223110684, + "flos": 27417294391680.0, + "grad_norm": 12.625492384132471, + "language_loss": 0.75167978, + "learning_rate": 2.8010248235860213e-06, + "loss": 0.77346885, + "num_input_tokens_seen": 69397765, + "step": 3225, + "time_per_iteration": 2.57161545753479 + }, + { + "auxiliary_loss_clip": 0.01070137, + "auxiliary_loss_mlp": 0.00753048, + "balance_loss_clip": 1.01927161, + "balance_loss_mlp": 1.00001168, + "epoch": 0.38790356520170743, + "flos": 64500019879680.0, + "grad_norm": 0.8292281331291705, + "language_loss": 0.62739354, + "learning_rate": 2.8003109978771192e-06, + "loss": 0.64562541, + "num_input_tokens_seen": 69458930, + "step": 3226, + "time_per_iteration": 3.1268928050994873 + }, + { + "auxiliary_loss_clip": 0.01134282, + "auxiliary_loss_mlp": 0.01028188, + "balance_loss_clip": 1.04307199, + "balance_loss_mlp": 1.01977229, + "epoch": 0.38802380809234654, + "flos": 22345415112960.0, + "grad_norm": 2.0090832153089084, + "language_loss": 0.78891635, + "learning_rate": 2.799597050761674e-06, + "loss": 0.81054103, + "num_input_tokens_seen": 69475135, + "step": 3227, + "time_per_iteration": 2.533569097518921 + }, + { + "auxiliary_loss_clip": 0.01188669, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.0537703, + "balance_loss_mlp": 1.02005243, + "epoch": 0.38814405098298566, + "flos": 25261361199360.0, + "grad_norm": 2.261986442750261, + "language_loss": 0.78953636, + "learning_rate": 2.7988829823479924e-06, + "loss": 0.81170487, + "num_input_tokens_seen": 69493525, + "step": 3228, + "time_per_iteration": 2.4581143856048584 + }, + { + "auxiliary_loss_clip": 0.01153515, + "auxiliary_loss_mlp": 0.01035015, + "balance_loss_clip": 1.04830551, + "balance_loss_mlp": 1.02587116, + "epoch": 0.3882642938736247, + "flos": 18841180078080.0, + "grad_norm": 1.830543672538457, + "language_loss": 0.64145052, + "learning_rate": 2.7981687927443976e-06, + "loss": 0.66333586, + "num_input_tokens_seen": 69510325, + "step": 3229, + "time_per_iteration": 2.4758455753326416 + }, + { + "auxiliary_loss_clip": 0.01171896, + "auxiliary_loss_mlp": 0.01027052, + "balance_loss_clip": 1.04956651, + "balance_loss_mlp": 1.01955974, + "epoch": 0.3883845367642638, + "flos": 21652806090240.0, + "grad_norm": 1.837970159969352, + "language_loss": 0.8565948, + "learning_rate": 2.797454482059231e-06, + "loss": 0.87858427, + "num_input_tokens_seen": 69530480, + "step": 3230, + "time_per_iteration": 2.4745049476623535 + }, + { + "auxiliary_loss_clip": 0.01192411, + "auxiliary_loss_mlp": 0.01022268, + "balance_loss_clip": 1.05577898, + "balance_loss_mlp": 1.01445973, + "epoch": 0.3885047796549029, + "flos": 20557530627840.0, + "grad_norm": 1.59163920081342, + "language_loss": 0.84359998, + "learning_rate": 2.7967400504008537e-06, + "loss": 0.86574674, + "num_input_tokens_seen": 69549780, + "step": 3231, + "time_per_iteration": 2.4382472038269043 + }, + { + "auxiliary_loss_clip": 0.01036301, + "auxiliary_loss_mlp": 0.01003845, + "balance_loss_clip": 1.01383162, + "balance_loss_mlp": 1.0025754, + "epoch": 0.388625022545542, + "flos": 64325491695360.0, + "grad_norm": 0.8023884800238352, + "language_loss": 0.57527661, + "learning_rate": 2.7960254978776456e-06, + "loss": 0.59567809, + "num_input_tokens_seen": 69611870, + "step": 3232, + "time_per_iteration": 3.1137988567352295 + }, + { + "auxiliary_loss_clip": 0.01193374, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.05680609, + "balance_loss_mlp": 1.02525973, + "epoch": 0.3887452654361811, + "flos": 18113881495680.0, + "grad_norm": 2.6616739996999725, + "language_loss": 0.81554574, + "learning_rate": 2.7953108245980006e-06, + "loss": 0.83781993, + "num_input_tokens_seen": 69630385, + "step": 3233, + "time_per_iteration": 2.4122281074523926 + }, + { + "auxiliary_loss_clip": 0.01153906, + "auxiliary_loss_mlp": 0.01030988, + "balance_loss_clip": 1.0518415, + "balance_loss_mlp": 1.02310193, + "epoch": 0.38886550832682015, + "flos": 24975261371520.0, + "grad_norm": 1.678920788923037, + "language_loss": 0.73505026, + "learning_rate": 2.7945960306703365e-06, + "loss": 0.75689924, + "num_input_tokens_seen": 69653370, + "step": 3234, + "time_per_iteration": 2.5331473350524902 + }, + { + "auxiliary_loss_clip": 0.01178417, + "auxiliary_loss_mlp": 0.01026057, + "balance_loss_clip": 1.05299115, + "balance_loss_mlp": 1.01736689, + "epoch": 0.38898575121745926, + "flos": 27199496275200.0, + "grad_norm": 1.8411007447152663, + "language_loss": 0.6553492, + "learning_rate": 2.7938811162030865e-06, + "loss": 0.67739391, + "num_input_tokens_seen": 69673635, + "step": 3235, + "time_per_iteration": 2.5246870517730713 + }, + { + "auxiliary_loss_clip": 0.0117438, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.05468535, + "balance_loss_mlp": 1.02617717, + "epoch": 0.3891059941080984, + "flos": 28763728727040.0, + "grad_norm": 1.8562889769162105, + "language_loss": 0.82174468, + "learning_rate": 2.793166081304702e-06, + "loss": 0.8438288, + "num_input_tokens_seen": 69694130, + "step": 3236, + "time_per_iteration": 2.516050338745117 + }, + { + "auxiliary_loss_clip": 0.01151755, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.04899216, + "balance_loss_mlp": 1.0210824, + "epoch": 0.38922623699873743, + "flos": 22893447893760.0, + "grad_norm": 2.2499868416004576, + "language_loss": 0.82365829, + "learning_rate": 2.7924509260836543e-06, + "loss": 0.84547162, + "num_input_tokens_seen": 69713255, + "step": 3237, + "time_per_iteration": 2.5524778366088867 + }, + { + "auxiliary_loss_clip": 0.01144305, + "auxiliary_loss_mlp": 0.01025808, + "balance_loss_clip": 1.04759359, + "balance_loss_mlp": 1.01779151, + "epoch": 0.38934647988937654, + "flos": 19792418002560.0, + "grad_norm": 1.460196046779333, + "language_loss": 0.68450689, + "learning_rate": 2.791735650648431e-06, + "loss": 0.70620799, + "num_input_tokens_seen": 69732375, + "step": 3238, + "time_per_iteration": 2.583017110824585 + }, + { + "auxiliary_loss_clip": 0.0115695, + "auxiliary_loss_mlp": 0.01026393, + "balance_loss_clip": 1.04983985, + "balance_loss_mlp": 1.01853156, + "epoch": 0.38946672278001565, + "flos": 19202081978880.0, + "grad_norm": 1.8885700919675747, + "language_loss": 0.74340498, + "learning_rate": 2.791020255107538e-06, + "loss": 0.7652384, + "num_input_tokens_seen": 69749745, + "step": 3239, + "time_per_iteration": 2.5017075538635254 + }, + { + "auxiliary_loss_clip": 0.01139798, + "auxiliary_loss_mlp": 0.01025802, + "balance_loss_clip": 1.04569006, + "balance_loss_mlp": 1.0181365, + "epoch": 0.3895869656706547, + "flos": 24936477661440.0, + "grad_norm": 1.7109603805064366, + "language_loss": 0.80775297, + "learning_rate": 2.7903047395695023e-06, + "loss": 0.829409, + "num_input_tokens_seen": 69769645, + "step": 3240, + "time_per_iteration": 2.5772907733917236 + }, + { + "auxiliary_loss_clip": 0.01174698, + "auxiliary_loss_mlp": 0.00762804, + "balance_loss_clip": 1.05441272, + "balance_loss_mlp": 1.00043464, + "epoch": 0.3897072085612938, + "flos": 24133622820480.0, + "grad_norm": 1.9577590484505931, + "language_loss": 0.90019292, + "learning_rate": 2.789589104142865e-06, + "loss": 0.91956794, + "num_input_tokens_seen": 69787270, + "step": 3241, + "time_per_iteration": 2.4981305599212646 + }, + { + "auxiliary_loss_clip": 0.01147983, + "auxiliary_loss_mlp": 0.01029391, + "balance_loss_clip": 1.05088067, + "balance_loss_mlp": 1.0212667, + "epoch": 0.3898274514519329, + "flos": 17166342672000.0, + "grad_norm": 1.8481949433691975, + "language_loss": 0.76186192, + "learning_rate": 2.7888733489361895e-06, + "loss": 0.78363568, + "num_input_tokens_seen": 69805685, + "step": 3242, + "time_per_iteration": 2.5130040645599365 + }, + { + "auxiliary_loss_clip": 0.01084344, + "auxiliary_loss_mlp": 0.01002965, + "balance_loss_clip": 1.01577473, + "balance_loss_mlp": 1.00190377, + "epoch": 0.389947694342572, + "flos": 66074807952000.0, + "grad_norm": 0.7268039947379752, + "language_loss": 0.58720362, + "learning_rate": 2.788157474058054e-06, + "loss": 0.60807681, + "num_input_tokens_seen": 69867960, + "step": 3243, + "time_per_iteration": 3.918940305709839 + }, + { + "auxiliary_loss_clip": 0.01187256, + "auxiliary_loss_mlp": 0.01024746, + "balance_loss_clip": 1.05484128, + "balance_loss_mlp": 1.01689577, + "epoch": 0.3900679372332111, + "flos": 25740912700800.0, + "grad_norm": 1.7511282338470942, + "language_loss": 0.69780844, + "learning_rate": 2.7874414796170555e-06, + "loss": 0.71992844, + "num_input_tokens_seen": 69889450, + "step": 3244, + "time_per_iteration": 2.507272958755493 + }, + { + "auxiliary_loss_clip": 0.0116973, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.04970145, + "balance_loss_mlp": 1.02252424, + "epoch": 0.3901881801238502, + "flos": 11801611808640.0, + "grad_norm": 2.4731015409778765, + "language_loss": 0.83439147, + "learning_rate": 2.7867253657218113e-06, + "loss": 0.8564043, + "num_input_tokens_seen": 69903340, + "step": 3245, + "time_per_iteration": 3.20247745513916 + }, + { + "auxiliary_loss_clip": 0.01157025, + "auxiliary_loss_mlp": 0.00762488, + "balance_loss_clip": 1.04869843, + "balance_loss_mlp": 1.00037825, + "epoch": 0.39030842301448926, + "flos": 27308951994240.0, + "grad_norm": 2.0514956719858732, + "language_loss": 0.72944283, + "learning_rate": 2.7860091324809544e-06, + "loss": 0.74863797, + "num_input_tokens_seen": 69924400, + "step": 3246, + "time_per_iteration": 3.358154535293579 + }, + { + "auxiliary_loss_clip": 0.01172715, + "auxiliary_loss_mlp": 0.01024562, + "balance_loss_clip": 1.05511057, + "balance_loss_mlp": 1.01652145, + "epoch": 0.39042866590512837, + "flos": 27163334257920.0, + "grad_norm": 1.7244623449810104, + "language_loss": 0.81484377, + "learning_rate": 2.7852927800031377e-06, + "loss": 0.83681655, + "num_input_tokens_seen": 69944565, + "step": 3247, + "time_per_iteration": 2.513159990310669 + }, + { + "auxiliary_loss_clip": 0.01164514, + "auxiliary_loss_mlp": 0.01024371, + "balance_loss_clip": 1.05084372, + "balance_loss_mlp": 1.01670575, + "epoch": 0.3905489087957674, + "flos": 29716115886720.0, + "grad_norm": 1.6961402500091034, + "language_loss": 0.82896584, + "learning_rate": 2.7845763083970298e-06, + "loss": 0.85085469, + "num_input_tokens_seen": 69964965, + "step": 3248, + "time_per_iteration": 3.3343276977539062 + }, + { + "auxiliary_loss_clip": 0.01166368, + "auxiliary_loss_mlp": 0.0102798, + "balance_loss_clip": 1.05156446, + "balance_loss_mlp": 1.01958752, + "epoch": 0.39066915168640653, + "flos": 24498618871680.0, + "grad_norm": 2.3417374675691884, + "language_loss": 0.82111371, + "learning_rate": 2.7838597177713205e-06, + "loss": 0.84305716, + "num_input_tokens_seen": 69986055, + "step": 3249, + "time_per_iteration": 2.494866132736206 + }, + { + "auxiliary_loss_clip": 0.0110427, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.04708958, + "balance_loss_mlp": 1.02646291, + "epoch": 0.39078939457704565, + "flos": 20558572122240.0, + "grad_norm": 1.7987328096892303, + "language_loss": 0.73747766, + "learning_rate": 2.7831430082347143e-06, + "loss": 0.75887859, + "num_input_tokens_seen": 70005260, + "step": 3250, + "time_per_iteration": 2.594383478164673 + }, + { + "auxiliary_loss_clip": 0.01176427, + "auxiliary_loss_mlp": 0.00761547, + "balance_loss_clip": 1.05340886, + "balance_loss_mlp": 1.00045085, + "epoch": 0.3909096374676847, + "flos": 22783417557120.0, + "grad_norm": 2.447068392171386, + "language_loss": 0.82033813, + "learning_rate": 2.7824261798959373e-06, + "loss": 0.83971786, + "num_input_tokens_seen": 70023440, + "step": 3251, + "time_per_iteration": 2.478611469268799 + }, + { + "auxiliary_loss_clip": 0.0116052, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.04854357, + "balance_loss_mlp": 1.02322435, + "epoch": 0.3910298803583238, + "flos": 23003119094400.0, + "grad_norm": 1.834267439107053, + "language_loss": 0.79226691, + "learning_rate": 2.78170923286373e-06, + "loss": 0.81418467, + "num_input_tokens_seen": 70043040, + "step": 3252, + "time_per_iteration": 2.5022308826446533 + }, + { + "auxiliary_loss_clip": 0.01094811, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.04641747, + "balance_loss_mlp": 1.02180958, + "epoch": 0.3911501232489629, + "flos": 24316264500480.0, + "grad_norm": 2.673181291766418, + "language_loss": 0.83831692, + "learning_rate": 2.780992167246854e-06, + "loss": 0.85956621, + "num_input_tokens_seen": 70060565, + "step": 3253, + "time_per_iteration": 2.6493852138519287 + }, + { + "auxiliary_loss_clip": 0.01066962, + "auxiliary_loss_mlp": 0.01001215, + "balance_loss_clip": 1.01653171, + "balance_loss_mlp": 1.00008833, + "epoch": 0.391270366139602, + "flos": 60869054684160.0, + "grad_norm": 0.9811389084125217, + "language_loss": 0.72160172, + "learning_rate": 2.7802749831540883e-06, + "loss": 0.74228346, + "num_input_tokens_seen": 70119465, + "step": 3254, + "time_per_iteration": 3.0975182056427 + }, + { + "auxiliary_loss_clip": 0.0113688, + "auxiliary_loss_mlp": 0.01027013, + "balance_loss_clip": 1.05007315, + "balance_loss_mlp": 1.0200367, + "epoch": 0.3913906090302411, + "flos": 21543494025600.0, + "grad_norm": 1.8490273475055274, + "language_loss": 0.81600785, + "learning_rate": 2.7795576806942268e-06, + "loss": 0.83764678, + "num_input_tokens_seen": 70138270, + "step": 3255, + "time_per_iteration": 2.568605899810791 + }, + { + "auxiliary_loss_clip": 0.01066615, + "auxiliary_loss_mlp": 0.01002925, + "balance_loss_clip": 1.02779222, + "balance_loss_mlp": 1.00158358, + "epoch": 0.3915108519208802, + "flos": 49839953702400.0, + "grad_norm": 0.7572656097740709, + "language_loss": 0.54898071, + "learning_rate": 2.778840259976085e-06, + "loss": 0.56967616, + "num_input_tokens_seen": 70193500, + "step": 3256, + "time_per_iteration": 3.06623911857605 + }, + { + "auxiliary_loss_clip": 0.01173793, + "auxiliary_loss_mlp": 0.01031943, + "balance_loss_clip": 1.05137181, + "balance_loss_mlp": 1.0238781, + "epoch": 0.39163109481151925, + "flos": 16506447960960.0, + "grad_norm": 2.204125050215817, + "language_loss": 0.77047038, + "learning_rate": 2.778122721108495e-06, + "loss": 0.79252774, + "num_input_tokens_seen": 70211730, + "step": 3257, + "time_per_iteration": 2.464665174484253 + }, + { + "auxiliary_loss_clip": 0.01171077, + "auxiliary_loss_mlp": 0.01027063, + "balance_loss_clip": 1.05341446, + "balance_loss_mlp": 1.01892757, + "epoch": 0.39175133770215836, + "flos": 26067484177920.0, + "grad_norm": 1.8672357136892448, + "language_loss": 0.882725, + "learning_rate": 2.7774050642003076e-06, + "loss": 0.90470636, + "num_input_tokens_seen": 70232540, + "step": 3258, + "time_per_iteration": 2.520674467086792 + }, + { + "auxiliary_loss_clip": 0.01191679, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.05539298, + "balance_loss_mlp": 1.02312267, + "epoch": 0.3918715805927975, + "flos": 21872076664320.0, + "grad_norm": 1.885725561659562, + "language_loss": 0.92974734, + "learning_rate": 2.7766872893603896e-06, + "loss": 0.9519825, + "num_input_tokens_seen": 70252515, + "step": 3259, + "time_per_iteration": 2.5070040225982666 + }, + { + "auxiliary_loss_clip": 0.01173924, + "auxiliary_loss_mlp": 0.01037117, + "balance_loss_clip": 1.05219817, + "balance_loss_mlp": 1.02967262, + "epoch": 0.39199182348343653, + "flos": 20376181837440.0, + "grad_norm": 2.1175813422825147, + "language_loss": 0.73421961, + "learning_rate": 2.7759693966976275e-06, + "loss": 0.75633001, + "num_input_tokens_seen": 70271020, + "step": 3260, + "time_per_iteration": 2.4811835289001465 + }, + { + "auxiliary_loss_clip": 0.0113944, + "auxiliary_loss_mlp": 0.010285, + "balance_loss_clip": 1.04592037, + "balance_loss_mlp": 1.01999497, + "epoch": 0.39211206637407564, + "flos": 21683545153920.0, + "grad_norm": 7.259481278652138, + "language_loss": 0.85176474, + "learning_rate": 2.7752513863209242e-06, + "loss": 0.87344414, + "num_input_tokens_seen": 70289600, + "step": 3261, + "time_per_iteration": 2.5382423400878906 + }, + { + "auxiliary_loss_clip": 0.01153812, + "auxiliary_loss_mlp": 0.00762205, + "balance_loss_clip": 1.05141902, + "balance_loss_mlp": 1.00038075, + "epoch": 0.39223230926471475, + "flos": 21066276908160.0, + "grad_norm": 1.694888748570929, + "language_loss": 0.84475338, + "learning_rate": 2.774533258339203e-06, + "loss": 0.86391354, + "num_input_tokens_seen": 70307060, + "step": 3262, + "time_per_iteration": 2.5144894123077393 + }, + { + "auxiliary_loss_clip": 0.01131034, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.04483116, + "balance_loss_mlp": 1.02343154, + "epoch": 0.3923525521553538, + "flos": 17603016312960.0, + "grad_norm": 3.0736135782484064, + "language_loss": 0.79768473, + "learning_rate": 2.7738150128614014e-06, + "loss": 0.81932342, + "num_input_tokens_seen": 70324465, + "step": 3263, + "time_per_iteration": 2.549433708190918 + }, + { + "auxiliary_loss_clip": 0.01138302, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.04997241, + "balance_loss_mlp": 1.02496505, + "epoch": 0.3924727950459929, + "flos": 20558284813440.0, + "grad_norm": 1.8157544306429125, + "language_loss": 0.89499652, + "learning_rate": 2.7730966499964777e-06, + "loss": 0.9167099, + "num_input_tokens_seen": 70341415, + "step": 3264, + "time_per_iteration": 2.5198991298675537 + }, + { + "auxiliary_loss_clip": 0.01190409, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.05457211, + "balance_loss_mlp": 1.02154803, + "epoch": 0.39259303793663197, + "flos": 16216110328320.0, + "grad_norm": 6.219106226790861, + "language_loss": 0.80706483, + "learning_rate": 2.772378169853408e-06, + "loss": 0.82926893, + "num_input_tokens_seen": 70358985, + "step": 3265, + "time_per_iteration": 2.424022674560547 + }, + { + "auxiliary_loss_clip": 0.01145431, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.05041492, + "balance_loss_mlp": 1.01901925, + "epoch": 0.3927132808272711, + "flos": 16797001075200.0, + "grad_norm": 2.236348567217187, + "language_loss": 0.74376065, + "learning_rate": 2.771659572541183e-06, + "loss": 0.76549113, + "num_input_tokens_seen": 70376915, + "step": 3266, + "time_per_iteration": 2.5079658031463623 + }, + { + "auxiliary_loss_clip": 0.01178683, + "auxiliary_loss_mlp": 0.01029312, + "balance_loss_clip": 1.05608737, + "balance_loss_mlp": 1.02126575, + "epoch": 0.3928335237179102, + "flos": 20267228908800.0, + "grad_norm": 2.0563979194892457, + "language_loss": 0.86684871, + "learning_rate": 2.7709408581688143e-06, + "loss": 0.88892865, + "num_input_tokens_seen": 70396900, + "step": 3267, + "time_per_iteration": 2.4652278423309326 + }, + { + "auxiliary_loss_clip": 0.01154357, + "auxiliary_loss_mlp": 0.01028369, + "balance_loss_clip": 1.05061054, + "balance_loss_mlp": 1.02038801, + "epoch": 0.39295376660854925, + "flos": 24973250209920.0, + "grad_norm": 1.6535363570585413, + "language_loss": 0.87832403, + "learning_rate": 2.7702220268453307e-06, + "loss": 0.90015125, + "num_input_tokens_seen": 70417260, + "step": 3268, + "time_per_iteration": 2.5706140995025635 + }, + { + "auxiliary_loss_clip": 0.01161825, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.05060267, + "balance_loss_mlp": 1.01905429, + "epoch": 0.39307400949918836, + "flos": 18697788984960.0, + "grad_norm": 2.319327931351527, + "language_loss": 0.84805, + "learning_rate": 2.7695030786797785e-06, + "loss": 0.86994398, + "num_input_tokens_seen": 70433155, + "step": 3269, + "time_per_iteration": 2.4702348709106445 + }, + { + "auxiliary_loss_clip": 0.01126041, + "auxiliary_loss_mlp": 0.01025635, + "balance_loss_clip": 1.04748535, + "balance_loss_mlp": 1.01718879, + "epoch": 0.39319425238982747, + "flos": 22415476590720.0, + "grad_norm": 2.2415945368721433, + "language_loss": 0.74607718, + "learning_rate": 2.7687840137812206e-06, + "loss": 0.76759398, + "num_input_tokens_seen": 70451240, + "step": 3270, + "time_per_iteration": 3.343665361404419 + }, + { + "auxiliary_loss_clip": 0.01068676, + "auxiliary_loss_mlp": 0.01001742, + "balance_loss_clip": 1.016312, + "balance_loss_mlp": 1.00054359, + "epoch": 0.3933144952804665, + "flos": 66192954762240.0, + "grad_norm": 0.7932749573688521, + "language_loss": 0.62130201, + "learning_rate": 2.7680648322587395e-06, + "loss": 0.64200622, + "num_input_tokens_seen": 70516115, + "step": 3271, + "time_per_iteration": 3.1011946201324463 + }, + { + "auxiliary_loss_clip": 0.01186332, + "auxiliary_loss_mlp": 0.01027198, + "balance_loss_clip": 1.05333197, + "balance_loss_mlp": 1.01965547, + "epoch": 0.39343473817110564, + "flos": 15487159720320.0, + "grad_norm": 2.5086110622961946, + "language_loss": 0.80857927, + "learning_rate": 2.7673455342214334e-06, + "loss": 0.83071458, + "num_input_tokens_seen": 70533105, + "step": 3272, + "time_per_iteration": 3.1498918533325195 + }, + { + "auxiliary_loss_clip": 0.01175204, + "auxiliary_loss_mlp": 0.01025113, + "balance_loss_clip": 1.05401397, + "balance_loss_mlp": 1.01754665, + "epoch": 0.39355498106174475, + "flos": 21324905809920.0, + "grad_norm": 2.1036112231633286, + "language_loss": 0.76147461, + "learning_rate": 2.7666261197784198e-06, + "loss": 0.78347778, + "num_input_tokens_seen": 70551920, + "step": 3273, + "time_per_iteration": 3.252664089202881 + }, + { + "auxiliary_loss_clip": 0.01155864, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.05403721, + "balance_loss_mlp": 1.02050352, + "epoch": 0.3936752239523838, + "flos": 13296357400320.0, + "grad_norm": 2.0267563882040713, + "language_loss": 0.76668888, + "learning_rate": 2.7659065890388336e-06, + "loss": 0.78853446, + "num_input_tokens_seen": 70567920, + "step": 3274, + "time_per_iteration": 3.2408175468444824 + }, + { + "auxiliary_loss_clip": 0.01163268, + "auxiliary_loss_mlp": 0.01032564, + "balance_loss_clip": 1.05225778, + "balance_loss_mlp": 1.02417171, + "epoch": 0.3937954668430229, + "flos": 16800161472000.0, + "grad_norm": 2.592259828158815, + "language_loss": 0.84895349, + "learning_rate": 2.7651869421118266e-06, + "loss": 0.87091178, + "num_input_tokens_seen": 70584530, + "step": 3275, + "time_per_iteration": 2.4859871864318848 + }, + { + "auxiliary_loss_clip": 0.01181336, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.05702877, + "balance_loss_mlp": 1.02418208, + "epoch": 0.393915709733662, + "flos": 21064229832960.0, + "grad_norm": 2.2074002028430857, + "language_loss": 0.82884693, + "learning_rate": 2.76446717910657e-06, + "loss": 0.85098243, + "num_input_tokens_seen": 70605235, + "step": 3276, + "time_per_iteration": 2.4869489669799805 + }, + { + "auxiliary_loss_clip": 0.01171685, + "auxiliary_loss_mlp": 0.01029076, + "balance_loss_clip": 1.0525229, + "balance_loss_mlp": 1.02116632, + "epoch": 0.3940359526243011, + "flos": 17165265264000.0, + "grad_norm": 2.536526060859399, + "language_loss": 0.76832962, + "learning_rate": 2.763747300132249e-06, + "loss": 0.7903372, + "num_input_tokens_seen": 70622675, + "step": 3277, + "time_per_iteration": 2.4423882961273193 + }, + { + "auxiliary_loss_clip": 0.01189673, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.05546737, + "balance_loss_mlp": 1.0211606, + "epoch": 0.3941561955149402, + "flos": 20995856294400.0, + "grad_norm": 1.6934135284573004, + "language_loss": 0.86582929, + "learning_rate": 2.7630273052980704e-06, + "loss": 0.8880167, + "num_input_tokens_seen": 70643265, + "step": 3278, + "time_per_iteration": 2.4439072608947754 + }, + { + "auxiliary_loss_clip": 0.01151141, + "auxiliary_loss_mlp": 0.01025957, + "balance_loss_clip": 1.05144954, + "balance_loss_mlp": 1.01816058, + "epoch": 0.39427643840557924, + "flos": 18843406721280.0, + "grad_norm": 2.0786149378665084, + "language_loss": 0.67233002, + "learning_rate": 2.7623071947132554e-06, + "loss": 0.69410098, + "num_input_tokens_seen": 70660295, + "step": 3279, + "time_per_iteration": 2.4801411628723145 + }, + { + "auxiliary_loss_clip": 0.01165667, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.05093813, + "balance_loss_mlp": 1.02334189, + "epoch": 0.39439668129621835, + "flos": 23258659426560.0, + "grad_norm": 1.8913293855597857, + "language_loss": 0.78825486, + "learning_rate": 2.7615869684870458e-06, + "loss": 0.81022751, + "num_input_tokens_seen": 70679605, + "step": 3280, + "time_per_iteration": 2.515402317047119 + }, + { + "auxiliary_loss_clip": 0.01172909, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.05440855, + "balance_loss_mlp": 1.02043259, + "epoch": 0.39451692418685746, + "flos": 26652289507200.0, + "grad_norm": 1.6926133783485442, + "language_loss": 0.84506148, + "learning_rate": 2.7608666267286986e-06, + "loss": 0.86707628, + "num_input_tokens_seen": 70699835, + "step": 3281, + "time_per_iteration": 2.51479172706604 + }, + { + "auxiliary_loss_clip": 0.01113709, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.04207671, + "balance_loss_mlp": 1.02058601, + "epoch": 0.3946371670774965, + "flos": 18258709132800.0, + "grad_norm": 2.649130998989593, + "language_loss": 0.86638319, + "learning_rate": 2.760146169547489e-06, + "loss": 0.88780916, + "num_input_tokens_seen": 70716600, + "step": 3282, + "time_per_iteration": 2.569162368774414 + }, + { + "auxiliary_loss_clip": 0.01162203, + "auxiliary_loss_mlp": 0.01029183, + "balance_loss_clip": 1.05493093, + "balance_loss_mlp": 1.02098107, + "epoch": 0.39475740996813563, + "flos": 24206126423040.0, + "grad_norm": 1.5308973326541964, + "language_loss": 0.76363611, + "learning_rate": 2.75942559705271e-06, + "loss": 0.78555, + "num_input_tokens_seen": 70736335, + "step": 3283, + "time_per_iteration": 2.553554058074951 + }, + { + "auxiliary_loss_clip": 0.0117071, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.0525763, + "balance_loss_mlp": 1.02479172, + "epoch": 0.39487765285877474, + "flos": 19317858491520.0, + "grad_norm": 2.483908002730313, + "language_loss": 0.8926003, + "learning_rate": 2.7587049093536713e-06, + "loss": 0.91463256, + "num_input_tokens_seen": 70752665, + "step": 3284, + "time_per_iteration": 2.4653451442718506 + }, + { + "auxiliary_loss_clip": 0.01177851, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_clip": 1.0531013, + "balance_loss_mlp": 1.02577984, + "epoch": 0.3949978957494138, + "flos": 17311744926720.0, + "grad_norm": 1.8609683879244894, + "language_loss": 0.80339694, + "learning_rate": 2.757984106559701e-06, + "loss": 0.82551491, + "num_input_tokens_seen": 70771650, + "step": 3285, + "time_per_iteration": 2.4574968814849854 + }, + { + "auxiliary_loss_clip": 0.01154849, + "auxiliary_loss_mlp": 0.01028114, + "balance_loss_clip": 1.05343187, + "balance_loss_mlp": 1.01991224, + "epoch": 0.3951181386400529, + "flos": 36317861280000.0, + "grad_norm": 3.435921973537331, + "language_loss": 0.7147857, + "learning_rate": 2.7572631887801446e-06, + "loss": 0.7366153, + "num_input_tokens_seen": 70793275, + "step": 3286, + "time_per_iteration": 2.615586042404175 + }, + { + "auxiliary_loss_clip": 0.01173754, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.05302215, + "balance_loss_mlp": 1.01976752, + "epoch": 0.395238381530692, + "flos": 23110348170240.0, + "grad_norm": 1.6400338964438717, + "language_loss": 0.76651073, + "learning_rate": 2.7565421561243654e-06, + "loss": 0.78853345, + "num_input_tokens_seen": 70811440, + "step": 3287, + "time_per_iteration": 2.5139527320861816 + }, + { + "auxiliary_loss_clip": 0.0113757, + "auxiliary_loss_mlp": 0.01025051, + "balance_loss_clip": 1.04817545, + "balance_loss_mlp": 1.016873, + "epoch": 0.3953586244213311, + "flos": 24347614095360.0, + "grad_norm": 1.9686969399302643, + "language_loss": 0.81830096, + "learning_rate": 2.7558210087017413e-06, + "loss": 0.8399272, + "num_input_tokens_seen": 70831375, + "step": 3288, + "time_per_iteration": 2.5509328842163086 + }, + { + "auxiliary_loss_clip": 0.0114329, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.0553019, + "balance_loss_mlp": 1.01776969, + "epoch": 0.3954788673119702, + "flos": 23440080044160.0, + "grad_norm": 2.083902233851938, + "language_loss": 0.73256904, + "learning_rate": 2.7550997466216724e-06, + "loss": 0.75426888, + "num_input_tokens_seen": 70849170, + "step": 3289, + "time_per_iteration": 2.540123701095581 + }, + { + "auxiliary_loss_clip": 0.01158934, + "auxiliary_loss_mlp": 0.01031376, + "balance_loss_clip": 1.05655098, + "balance_loss_mlp": 1.02314496, + "epoch": 0.3955991102026093, + "flos": 17494063384320.0, + "grad_norm": 2.238991352097241, + "language_loss": 0.81019682, + "learning_rate": 2.7543783699935714e-06, + "loss": 0.83209991, + "num_input_tokens_seen": 70867200, + "step": 3290, + "time_per_iteration": 2.4614505767822266 + }, + { + "auxiliary_loss_clip": 0.01174665, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.05785632, + "balance_loss_mlp": 1.02319491, + "epoch": 0.39571935309324835, + "flos": 18221326053120.0, + "grad_norm": 2.812448435214635, + "language_loss": 0.86176658, + "learning_rate": 2.753656878926872e-06, + "loss": 0.88382661, + "num_input_tokens_seen": 70883080, + "step": 3291, + "time_per_iteration": 2.481881856918335 + }, + { + "auxiliary_loss_clip": 0.01149204, + "auxiliary_loss_mlp": 0.01026971, + "balance_loss_clip": 1.04963636, + "balance_loss_mlp": 1.01857924, + "epoch": 0.39583959598388746, + "flos": 17748813617280.0, + "grad_norm": 2.58853945956802, + "language_loss": 0.73873657, + "learning_rate": 2.752935273531023e-06, + "loss": 0.76049834, + "num_input_tokens_seen": 70901230, + "step": 3292, + "time_per_iteration": 2.485473155975342 + }, + { + "auxiliary_loss_clip": 0.01175304, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.05456305, + "balance_loss_mlp": 1.02112007, + "epoch": 0.39595983887452657, + "flos": 19352368483200.0, + "grad_norm": 1.8373649462941501, + "language_loss": 0.78387642, + "learning_rate": 2.752213553915492e-06, + "loss": 0.80592978, + "num_input_tokens_seen": 70919585, + "step": 3293, + "time_per_iteration": 2.473907709121704 + }, + { + "auxiliary_loss_clip": 0.01062952, + "auxiliary_loss_mlp": 0.01003457, + "balance_loss_clip": 1.0180341, + "balance_loss_mlp": 1.00212812, + "epoch": 0.3960800817651656, + "flos": 60682282940160.0, + "grad_norm": 0.8146893721300985, + "language_loss": 0.66065681, + "learning_rate": 2.751491720189762e-06, + "loss": 0.68132091, + "num_input_tokens_seen": 70977695, + "step": 3294, + "time_per_iteration": 3.031970977783203 + }, + { + "auxiliary_loss_clip": 0.01160753, + "auxiliary_loss_mlp": 0.00762293, + "balance_loss_clip": 1.05386448, + "balance_loss_mlp": 1.0004499, + "epoch": 0.39620032465580474, + "flos": 16836718538880.0, + "grad_norm": 2.2867624600261176, + "language_loss": 0.91729188, + "learning_rate": 2.7507697724633364e-06, + "loss": 0.93652236, + "num_input_tokens_seen": 70994455, + "step": 3295, + "time_per_iteration": 2.5394127368927 + }, + { + "auxiliary_loss_clip": 0.01055594, + "auxiliary_loss_mlp": 0.0100528, + "balance_loss_clip": 1.02758682, + "balance_loss_mlp": 1.00365925, + "epoch": 0.3963205675464438, + "flos": 69071445941760.0, + "grad_norm": 0.7700004829472183, + "language_loss": 0.54665661, + "learning_rate": 2.7500477108457327e-06, + "loss": 0.56726533, + "num_input_tokens_seen": 71046465, + "step": 3296, + "time_per_iteration": 2.9330365657806396 + }, + { + "auxiliary_loss_clip": 0.01172782, + "auxiliary_loss_mlp": 0.01025491, + "balance_loss_clip": 1.05387557, + "balance_loss_mlp": 1.01707149, + "epoch": 0.3964408104370829, + "flos": 25667439431040.0, + "grad_norm": 1.9894239742997892, + "language_loss": 0.80694437, + "learning_rate": 2.7493255354464877e-06, + "loss": 0.82892704, + "num_input_tokens_seen": 71064275, + "step": 3297, + "time_per_iteration": 2.5229454040527344 + }, + { + "auxiliary_loss_clip": 0.01056476, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.03739834, + "balance_loss_mlp": 1.01942086, + "epoch": 0.396561053327722, + "flos": 24277480790400.0, + "grad_norm": 1.8284338356588723, + "language_loss": 0.76072466, + "learning_rate": 2.748603246375156e-06, + "loss": 0.78156358, + "num_input_tokens_seen": 71082290, + "step": 3298, + "time_per_iteration": 3.58309006690979 + }, + { + "auxiliary_loss_clip": 0.01188833, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.05668199, + "balance_loss_mlp": 1.02264297, + "epoch": 0.39668129621836107, + "flos": 20522302364160.0, + "grad_norm": 2.4566714783585026, + "language_loss": 0.69738567, + "learning_rate": 2.7478808437413055e-06, + "loss": 0.7195828, + "num_input_tokens_seen": 71101700, + "step": 3299, + "time_per_iteration": 3.179110288619995 + }, + { + "auxiliary_loss_clip": 0.011343, + "auxiliary_loss_mlp": 0.01025889, + "balance_loss_clip": 1.05562639, + "balance_loss_mlp": 1.01747859, + "epoch": 0.3968015391090002, + "flos": 27052585649280.0, + "grad_norm": 2.080970082098457, + "language_loss": 0.66052812, + "learning_rate": 2.7471583276545263e-06, + "loss": 0.68213004, + "num_input_tokens_seen": 71122360, + "step": 3300, + "time_per_iteration": 4.157140016555786 + }, + { + "auxiliary_loss_clip": 0.01159888, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.05217075, + "balance_loss_mlp": 1.02139759, + "epoch": 0.3969217819996393, + "flos": 12531819392640.0, + "grad_norm": 1.957548301449416, + "language_loss": 0.7065146, + "learning_rate": 2.7464356982244224e-06, + "loss": 0.72840637, + "num_input_tokens_seen": 71140360, + "step": 3301, + "time_per_iteration": 2.5129597187042236 + }, + { + "auxiliary_loss_clip": 0.01078083, + "auxiliary_loss_mlp": 0.01003951, + "balance_loss_clip": 1.02731037, + "balance_loss_mlp": 1.00240135, + "epoch": 0.39704202489027834, + "flos": 66241399230720.0, + "grad_norm": 0.7716940674069127, + "language_loss": 0.617203, + "learning_rate": 2.745712955560617e-06, + "loss": 0.63802326, + "num_input_tokens_seen": 71196565, + "step": 3302, + "time_per_iteration": 3.0492496490478516 + }, + { + "auxiliary_loss_clip": 0.01116622, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.04874635, + "balance_loss_mlp": 1.01981711, + "epoch": 0.39716226778091746, + "flos": 16982982720000.0, + "grad_norm": 2.4330349315988555, + "language_loss": 0.77439916, + "learning_rate": 2.7449900997727496e-06, + "loss": 0.79585445, + "num_input_tokens_seen": 71214675, + "step": 3303, + "time_per_iteration": 2.6078805923461914 + }, + { + "auxiliary_loss_clip": 0.01158845, + "auxiliary_loss_mlp": 0.01027873, + "balance_loss_clip": 1.0555321, + "balance_loss_mlp": 1.02037501, + "epoch": 0.39728251067155657, + "flos": 23477139901440.0, + "grad_norm": 1.7622823751079242, + "language_loss": 0.84114408, + "learning_rate": 2.744267130970476e-06, + "loss": 0.86301124, + "num_input_tokens_seen": 71234400, + "step": 3304, + "time_per_iteration": 2.5348589420318604 + }, + { + "auxiliary_loss_clip": 0.0115565, + "auxiliary_loss_mlp": 0.01027069, + "balance_loss_clip": 1.0534029, + "balance_loss_mlp": 1.01886749, + "epoch": 0.3974027535621956, + "flos": 20704441253760.0, + "grad_norm": 1.8457561291275235, + "language_loss": 0.7701537, + "learning_rate": 2.7435440492634697e-06, + "loss": 0.79198086, + "num_input_tokens_seen": 71253725, + "step": 3305, + "time_per_iteration": 2.527259349822998 + }, + { + "auxiliary_loss_clip": 0.01159171, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.05205595, + "balance_loss_mlp": 1.02139759, + "epoch": 0.39752299645283473, + "flos": 21543278544000.0, + "grad_norm": 2.156725030586891, + "language_loss": 0.67011982, + "learning_rate": 2.7428208547614228e-06, + "loss": 0.6920141, + "num_input_tokens_seen": 71273220, + "step": 3306, + "time_per_iteration": 2.5116069316864014 + }, + { + "auxiliary_loss_clip": 0.01176974, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.05642366, + "balance_loss_mlp": 1.02507293, + "epoch": 0.39764323934347384, + "flos": 19208295031680.0, + "grad_norm": 1.9700944684148092, + "language_loss": 0.7707026, + "learning_rate": 2.742097547574043e-06, + "loss": 0.7928021, + "num_input_tokens_seen": 71291445, + "step": 3307, + "time_per_iteration": 2.4921252727508545 + }, + { + "auxiliary_loss_clip": 0.01164649, + "auxiliary_loss_mlp": 0.00762728, + "balance_loss_clip": 1.05241549, + "balance_loss_mlp": 1.00040865, + "epoch": 0.3977634822341129, + "flos": 20850202644480.0, + "grad_norm": 1.9159188620328245, + "language_loss": 0.77526528, + "learning_rate": 2.7413741278110544e-06, + "loss": 0.79453909, + "num_input_tokens_seen": 71310135, + "step": 3308, + "time_per_iteration": 2.5261104106903076 + }, + { + "auxiliary_loss_clip": 0.01165988, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.05567241, + "balance_loss_mlp": 1.02170789, + "epoch": 0.397883725124752, + "flos": 39786042038400.0, + "grad_norm": 2.7333888532743846, + "language_loss": 0.68953776, + "learning_rate": 2.7406505955822016e-06, + "loss": 0.71150303, + "num_input_tokens_seen": 71331160, + "step": 3309, + "time_per_iteration": 2.677307367324829 + }, + { + "auxiliary_loss_clip": 0.01160191, + "auxiliary_loss_mlp": 0.01027148, + "balance_loss_clip": 1.05124772, + "balance_loss_mlp": 1.01912487, + "epoch": 0.39800396801539106, + "flos": 17379507934080.0, + "grad_norm": 2.663054352573472, + "language_loss": 0.65836632, + "learning_rate": 2.7399269509972415e-06, + "loss": 0.68023968, + "num_input_tokens_seen": 71345315, + "step": 3310, + "time_per_iteration": 2.4549624919891357 + }, + { + "auxiliary_loss_clip": 0.01152613, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.04771292, + "balance_loss_mlp": 1.01838362, + "epoch": 0.3981242109060302, + "flos": 19202764337280.0, + "grad_norm": 2.1699671629343316, + "language_loss": 0.84959233, + "learning_rate": 2.7392031941659514e-06, + "loss": 0.87139237, + "num_input_tokens_seen": 71363160, + "step": 3311, + "time_per_iteration": 2.4882733821868896 + }, + { + "auxiliary_loss_clip": 0.01160256, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.05572152, + "balance_loss_mlp": 1.02578866, + "epoch": 0.3982444537966693, + "flos": 24565124903040.0, + "grad_norm": 2.133513389274206, + "language_loss": 0.85792911, + "learning_rate": 2.7384793251981244e-06, + "loss": 0.87987506, + "num_input_tokens_seen": 71382145, + "step": 3312, + "time_per_iteration": 2.5193159580230713 + }, + { + "auxiliary_loss_clip": 0.01179032, + "auxiliary_loss_mlp": 0.01025268, + "balance_loss_clip": 1.05350542, + "balance_loss_mlp": 1.01767468, + "epoch": 0.39836469668730834, + "flos": 26213856099840.0, + "grad_norm": 2.931130188443631, + "language_loss": 0.80752569, + "learning_rate": 2.737755344203571e-06, + "loss": 0.82956862, + "num_input_tokens_seen": 71402095, + "step": 3313, + "time_per_iteration": 2.525136709213257 + }, + { + "auxiliary_loss_clip": 0.01181087, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.05852127, + "balance_loss_mlp": 1.02036238, + "epoch": 0.39848493957794745, + "flos": 27636134002560.0, + "grad_norm": 2.054232904132072, + "language_loss": 0.79899204, + "learning_rate": 2.7370312512921186e-06, + "loss": 0.82108659, + "num_input_tokens_seen": 71423875, + "step": 3314, + "time_per_iteration": 2.521677017211914 + }, + { + "auxiliary_loss_clip": 0.01162674, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.0514282, + "balance_loss_mlp": 1.02327394, + "epoch": 0.39860518246858656, + "flos": 12239326944000.0, + "grad_norm": 2.5434099138492305, + "language_loss": 0.76567411, + "learning_rate": 2.736307046573611e-06, + "loss": 0.78762376, + "num_input_tokens_seen": 71439745, + "step": 3315, + "time_per_iteration": 2.4765560626983643 + }, + { + "auxiliary_loss_clip": 0.01186616, + "auxiliary_loss_mlp": 0.01025059, + "balance_loss_clip": 1.054075, + "balance_loss_mlp": 1.01745367, + "epoch": 0.3987254253592256, + "flos": 22379135005440.0, + "grad_norm": 1.8530663957277151, + "language_loss": 0.81462198, + "learning_rate": 2.73558273015791e-06, + "loss": 0.83673871, + "num_input_tokens_seen": 71459575, + "step": 3316, + "time_per_iteration": 2.445626735687256 + }, + { + "auxiliary_loss_clip": 0.01192046, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.05575705, + "balance_loss_mlp": 1.021734, + "epoch": 0.3988456682498647, + "flos": 23514020190720.0, + "grad_norm": 2.6676061722559163, + "language_loss": 0.70846486, + "learning_rate": 2.734858302154894e-06, + "loss": 0.73069251, + "num_input_tokens_seen": 71481075, + "step": 3317, + "time_per_iteration": 2.462660312652588 + }, + { + "auxiliary_loss_clip": 0.01153183, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.05095959, + "balance_loss_mlp": 1.01866221, + "epoch": 0.39896591114050384, + "flos": 19208761908480.0, + "grad_norm": 2.0279872953881366, + "language_loss": 0.76402962, + "learning_rate": 2.734133762674457e-06, + "loss": 0.78583086, + "num_input_tokens_seen": 71500665, + "step": 3318, + "time_per_iteration": 2.4859869480133057 + }, + { + "auxiliary_loss_clip": 0.01160963, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.05278337, + "balance_loss_mlp": 1.02069438, + "epoch": 0.3990861540311429, + "flos": 28401031146240.0, + "grad_norm": 3.150370027877685, + "language_loss": 0.70471942, + "learning_rate": 2.7334091118265124e-06, + "loss": 0.7266202, + "num_input_tokens_seen": 71522560, + "step": 3319, + "time_per_iteration": 2.5645322799682617 + }, + { + "auxiliary_loss_clip": 0.0107901, + "auxiliary_loss_mlp": 0.01004414, + "balance_loss_clip": 1.01996183, + "balance_loss_mlp": 1.00322235, + "epoch": 0.399206396921782, + "flos": 61758563086080.0, + "grad_norm": 0.6745513060907466, + "language_loss": 0.57823151, + "learning_rate": 2.732684349720989e-06, + "loss": 0.59906578, + "num_input_tokens_seen": 71590520, + "step": 3320, + "time_per_iteration": 3.0778231620788574 + }, + { + "auxiliary_loss_clip": 0.0115087, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.05114448, + "balance_loss_mlp": 1.02016699, + "epoch": 0.3993266398124211, + "flos": 28074567409920.0, + "grad_norm": 1.7791256025592914, + "language_loss": 0.75540328, + "learning_rate": 2.7319594764678318e-06, + "loss": 0.77719641, + "num_input_tokens_seen": 71612620, + "step": 3321, + "time_per_iteration": 2.5943443775177 + }, + { + "auxiliary_loss_clip": 0.0113667, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.05008006, + "balance_loss_mlp": 1.02294445, + "epoch": 0.39944688270306017, + "flos": 23225083188480.0, + "grad_norm": 1.8097458456065358, + "language_loss": 0.83163452, + "learning_rate": 2.7312344921770044e-06, + "loss": 0.85331953, + "num_input_tokens_seen": 71634320, + "step": 3322, + "time_per_iteration": 2.6100685596466064 + }, + { + "auxiliary_loss_clip": 0.0116139, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.05151963, + "balance_loss_mlp": 1.02200246, + "epoch": 0.3995671255936993, + "flos": 19390433921280.0, + "grad_norm": 1.8428185212899342, + "language_loss": 0.78458834, + "learning_rate": 2.7305093969584857e-06, + "loss": 0.80650127, + "num_input_tokens_seen": 71653145, + "step": 3323, + "time_per_iteration": 2.526556968688965 + }, + { + "auxiliary_loss_clip": 0.01167466, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.05169046, + "balance_loss_mlp": 1.02016091, + "epoch": 0.3996873684843384, + "flos": 23842638743040.0, + "grad_norm": 1.7578831181860115, + "language_loss": 0.79773772, + "learning_rate": 2.729784190922272e-06, + "loss": 0.8196944, + "num_input_tokens_seen": 71674580, + "step": 3324, + "time_per_iteration": 3.2774930000305176 + }, + { + "auxiliary_loss_clip": 0.01061985, + "auxiliary_loss_mlp": 0.01005496, + "balance_loss_clip": 1.01634336, + "balance_loss_mlp": 1.00429189, + "epoch": 0.39980761137497745, + "flos": 66576877280640.0, + "grad_norm": 0.9543373549933556, + "language_loss": 0.57216817, + "learning_rate": 2.729058874178378e-06, + "loss": 0.59284294, + "num_input_tokens_seen": 71745260, + "step": 3325, + "time_per_iteration": 3.9042651653289795 + }, + { + "auxiliary_loss_clip": 0.01165019, + "auxiliary_loss_mlp": 0.01033672, + "balance_loss_clip": 1.05389953, + "balance_loss_mlp": 1.02495813, + "epoch": 0.39992785426561656, + "flos": 28549162834560.0, + "grad_norm": 2.174016048910255, + "language_loss": 0.69218481, + "learning_rate": 2.7283334468368315e-06, + "loss": 0.71417177, + "num_input_tokens_seen": 71766540, + "step": 3326, + "time_per_iteration": 3.304060220718384 + }, + { + "auxiliary_loss_clip": 0.01086587, + "auxiliary_loss_mlp": 0.01026168, + "balance_loss_clip": 1.04057956, + "balance_loss_mlp": 1.01723886, + "epoch": 0.4000480971562556, + "flos": 15049408671360.0, + "grad_norm": 2.0814819218730967, + "language_loss": 0.73365706, + "learning_rate": 2.72760790900768e-06, + "loss": 0.75478458, + "num_input_tokens_seen": 71783125, + "step": 3327, + "time_per_iteration": 3.543264389038086 + }, + { + "auxiliary_loss_clip": 0.01193598, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.05891061, + "balance_loss_mlp": 1.02269959, + "epoch": 0.4001683400468947, + "flos": 23915609222400.0, + "grad_norm": 2.150570009842076, + "language_loss": 0.78716481, + "learning_rate": 2.7268822608009875e-06, + "loss": 0.80941135, + "num_input_tokens_seen": 71802500, + "step": 3328, + "time_per_iteration": 2.680457830429077 + }, + { + "auxiliary_loss_clip": 0.01151168, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.05056453, + "balance_loss_mlp": 1.02139091, + "epoch": 0.40028858293753383, + "flos": 24352677912960.0, + "grad_norm": 2.229175024203181, + "language_loss": 0.77765095, + "learning_rate": 2.726156502326834e-06, + "loss": 0.7994591, + "num_input_tokens_seen": 71823800, + "step": 3329, + "time_per_iteration": 2.6513571739196777 + }, + { + "auxiliary_loss_clip": 0.01036, + "auxiliary_loss_mlp": 0.01006989, + "balance_loss_clip": 1.0219841, + "balance_loss_mlp": 1.00533223, + "epoch": 0.4004088258281729, + "flos": 66787025800320.0, + "grad_norm": 0.6948601876514632, + "language_loss": 0.60258204, + "learning_rate": 2.725430633695316e-06, + "loss": 0.62301195, + "num_input_tokens_seen": 71886880, + "step": 3330, + "time_per_iteration": 3.262364387512207 + }, + { + "auxiliary_loss_clip": 0.0108194, + "auxiliary_loss_mlp": 0.01004789, + "balance_loss_clip": 1.01412714, + "balance_loss_mlp": 1.00350761, + "epoch": 0.400529068718812, + "flos": 58598386473600.0, + "grad_norm": 2.080801513145063, + "language_loss": 0.57934374, + "learning_rate": 2.7247046550165485e-06, + "loss": 0.60021096, + "num_input_tokens_seen": 71939005, + "step": 3331, + "time_per_iteration": 2.8246400356292725 + }, + { + "auxiliary_loss_clip": 0.01192142, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.05786371, + "balance_loss_mlp": 1.02350664, + "epoch": 0.4006493116094511, + "flos": 25377460934400.0, + "grad_norm": 1.4002474364562716, + "language_loss": 0.75599504, + "learning_rate": 2.7239785664006606e-06, + "loss": 0.77823162, + "num_input_tokens_seen": 71962545, + "step": 3332, + "time_per_iteration": 2.5311455726623535 + }, + { + "auxiliary_loss_clip": 0.01072039, + "auxiliary_loss_mlp": 0.01002403, + "balance_loss_clip": 1.01290953, + "balance_loss_mlp": 1.00116873, + "epoch": 0.40076955450009016, + "flos": 60280729822080.0, + "grad_norm": 0.7700802835392234, + "language_loss": 0.6182704, + "learning_rate": 2.7232523679578002e-06, + "loss": 0.63901484, + "num_input_tokens_seen": 72025625, + "step": 3333, + "time_per_iteration": 3.1059629917144775 + }, + { + "auxiliary_loss_clip": 0.01171825, + "auxiliary_loss_mlp": 0.01023962, + "balance_loss_clip": 1.05510819, + "balance_loss_mlp": 1.01605892, + "epoch": 0.4008897973907293, + "flos": 16617268396800.0, + "grad_norm": 2.3961807137709674, + "language_loss": 0.79424703, + "learning_rate": 2.7225260597981295e-06, + "loss": 0.81620491, + "num_input_tokens_seen": 72043330, + "step": 3334, + "time_per_iteration": 2.496061325073242 + }, + { + "auxiliary_loss_clip": 0.01145861, + "auxiliary_loss_mlp": 0.00763264, + "balance_loss_clip": 1.05299187, + "balance_loss_mlp": 1.0006479, + "epoch": 0.4010100402813684, + "flos": 15377344865280.0, + "grad_norm": 3.098198080878277, + "language_loss": 0.78881025, + "learning_rate": 2.721799642031831e-06, + "loss": 0.8079015, + "num_input_tokens_seen": 72059500, + "step": 3335, + "time_per_iteration": 2.5402019023895264 + }, + { + "auxiliary_loss_clip": 0.01164672, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.04936671, + "balance_loss_mlp": 1.02433074, + "epoch": 0.40113028317200744, + "flos": 13298835438720.0, + "grad_norm": 1.9153628685419277, + "language_loss": 0.78004599, + "learning_rate": 2.721073114769101e-06, + "loss": 0.80201846, + "num_input_tokens_seen": 72077175, + "step": 3336, + "time_per_iteration": 2.5096514225006104 + }, + { + "auxiliary_loss_clip": 0.0114311, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.05090725, + "balance_loss_mlp": 1.0234673, + "epoch": 0.40125052606264655, + "flos": 20668027841280.0, + "grad_norm": 1.808016590669293, + "language_loss": 0.74746376, + "learning_rate": 2.7203464781201523e-06, + "loss": 0.76920998, + "num_input_tokens_seen": 72096490, + "step": 3337, + "time_per_iteration": 2.5656578540802 + }, + { + "auxiliary_loss_clip": 0.01192844, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.05860531, + "balance_loss_mlp": 1.02534878, + "epoch": 0.40137076895328566, + "flos": 24607679541120.0, + "grad_norm": 1.9038373027171485, + "language_loss": 0.77944303, + "learning_rate": 2.719619732195215e-06, + "loss": 0.80170703, + "num_input_tokens_seen": 72118130, + "step": 3338, + "time_per_iteration": 2.4942355155944824 + }, + { + "auxiliary_loss_clip": 0.01148346, + "auxiliary_loss_mlp": 0.01024267, + "balance_loss_clip": 1.05144536, + "balance_loss_mlp": 1.01655984, + "epoch": 0.4014910118439247, + "flos": 24206593299840.0, + "grad_norm": 1.4866763016886382, + "language_loss": 0.72520113, + "learning_rate": 2.7188928771045377e-06, + "loss": 0.74692726, + "num_input_tokens_seen": 72139450, + "step": 3339, + "time_per_iteration": 2.592337131500244 + }, + { + "auxiliary_loss_clip": 0.01143076, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.0509342, + "balance_loss_mlp": 1.0207448, + "epoch": 0.4016112547345638, + "flos": 26725080418560.0, + "grad_norm": 2.0161338610365456, + "language_loss": 0.80004019, + "learning_rate": 2.7181659129583815e-06, + "loss": 0.82175738, + "num_input_tokens_seen": 72159040, + "step": 3340, + "time_per_iteration": 2.6222496032714844 + }, + { + "auxiliary_loss_clip": 0.01149663, + "auxiliary_loss_mlp": 0.01025321, + "balance_loss_clip": 1.04571569, + "balance_loss_mlp": 1.01731646, + "epoch": 0.4017314976252029, + "flos": 21288025520640.0, + "grad_norm": 1.7162224431247446, + "language_loss": 0.75670826, + "learning_rate": 2.7174388398670276e-06, + "loss": 0.77845812, + "num_input_tokens_seen": 72178220, + "step": 3341, + "time_per_iteration": 2.558103322982788 + }, + { + "auxiliary_loss_clip": 0.01189937, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.05244887, + "balance_loss_mlp": 1.02617073, + "epoch": 0.401851740515842, + "flos": 25484690010240.0, + "grad_norm": 2.116026755798316, + "language_loss": 0.92116606, + "learning_rate": 2.716711657940773e-06, + "loss": 0.94340932, + "num_input_tokens_seen": 72199230, + "step": 3342, + "time_per_iteration": 2.9729599952697754 + }, + { + "auxiliary_loss_clip": 0.0104828, + "auxiliary_loss_mlp": 0.01001858, + "balance_loss_clip": 1.01201439, + "balance_loss_mlp": 1.00061262, + "epoch": 0.4019719834064811, + "flos": 55395334978560.0, + "grad_norm": 0.8137515212271276, + "language_loss": 0.56510848, + "learning_rate": 2.7159843672899284e-06, + "loss": 0.58560991, + "num_input_tokens_seen": 72263430, + "step": 3343, + "time_per_iteration": 3.2894105911254883 + }, + { + "auxiliary_loss_clip": 0.01177096, + "auxiliary_loss_mlp": 0.01027623, + "balance_loss_clip": 1.05613887, + "balance_loss_mlp": 1.01911187, + "epoch": 0.40209222629712016, + "flos": 18180100218240.0, + "grad_norm": 1.995904799377066, + "language_loss": 0.81423163, + "learning_rate": 2.715256968024825e-06, + "loss": 0.83627892, + "num_input_tokens_seen": 72280505, + "step": 3344, + "time_per_iteration": 2.462897539138794 + }, + { + "auxiliary_loss_clip": 0.01166518, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.05251837, + "balance_loss_mlp": 1.01988935, + "epoch": 0.40221246918775927, + "flos": 25961009287680.0, + "grad_norm": 1.6228542581282883, + "language_loss": 0.82006609, + "learning_rate": 2.7145294602558083e-06, + "loss": 0.84201282, + "num_input_tokens_seen": 72301215, + "step": 3345, + "time_per_iteration": 2.5736207962036133 + }, + { + "auxiliary_loss_clip": 0.01176001, + "auxiliary_loss_mlp": 0.01027288, + "balance_loss_clip": 1.05460215, + "balance_loss_mlp": 1.01820993, + "epoch": 0.4023327120783984, + "flos": 33838912056960.0, + "grad_norm": 6.6190236081884235, + "language_loss": 0.70220411, + "learning_rate": 2.713801844093241e-06, + "loss": 0.72423697, + "num_input_tokens_seen": 72322365, + "step": 3346, + "time_per_iteration": 2.5817642211914062 + }, + { + "auxiliary_loss_clip": 0.01175566, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.05441129, + "balance_loss_mlp": 1.0235846, + "epoch": 0.40245295496903744, + "flos": 26900252069760.0, + "grad_norm": 2.257300439097208, + "language_loss": 0.88455737, + "learning_rate": 2.7130741196475014e-06, + "loss": 0.90663099, + "num_input_tokens_seen": 72340495, + "step": 3347, + "time_per_iteration": 2.529357671737671 + }, + { + "auxiliary_loss_clip": 0.01164132, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.0554781, + "balance_loss_mlp": 1.02378917, + "epoch": 0.40257319785967655, + "flos": 36902738436480.0, + "grad_norm": 1.8856522106072577, + "language_loss": 0.79283142, + "learning_rate": 2.7123462870289848e-06, + "loss": 0.81479722, + "num_input_tokens_seen": 72360545, + "step": 3348, + "time_per_iteration": 2.6562905311584473 + }, + { + "auxiliary_loss_clip": 0.01160756, + "auxiliary_loss_mlp": 0.01028275, + "balance_loss_clip": 1.04919887, + "balance_loss_mlp": 1.0195787, + "epoch": 0.40269344075031566, + "flos": 24353180703360.0, + "grad_norm": 1.5333621637429775, + "language_loss": 0.80996132, + "learning_rate": 2.711618346348102e-06, + "loss": 0.83185166, + "num_input_tokens_seen": 72381070, + "step": 3349, + "time_per_iteration": 2.565377950668335 + }, + { + "auxiliary_loss_clip": 0.0115434, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.05283678, + "balance_loss_mlp": 1.02826571, + "epoch": 0.4028136836409547, + "flos": 14389657614720.0, + "grad_norm": 1.5927095704525607, + "language_loss": 0.63420039, + "learning_rate": 2.7108902977152825e-06, + "loss": 0.65611231, + "num_input_tokens_seen": 72398970, + "step": 3350, + "time_per_iteration": 3.5454392433166504 + }, + { + "auxiliary_loss_clip": 0.01169739, + "auxiliary_loss_mlp": 0.01025369, + "balance_loss_clip": 1.05140293, + "balance_loss_mlp": 1.0170244, + "epoch": 0.4029339265315938, + "flos": 26136037284480.0, + "grad_norm": 2.1746592194775363, + "language_loss": 0.75036407, + "learning_rate": 2.7101621412409704e-06, + "loss": 0.77231514, + "num_input_tokens_seen": 72418455, + "step": 3351, + "time_per_iteration": 3.325833320617676 + }, + { + "auxiliary_loss_clip": 0.01188848, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.05494964, + "balance_loss_mlp": 1.02397299, + "epoch": 0.40305416942223293, + "flos": 23256325042560.0, + "grad_norm": 1.8398116161912201, + "language_loss": 0.85598671, + "learning_rate": 2.7094338770356256e-06, + "loss": 0.87819707, + "num_input_tokens_seen": 72437540, + "step": 3352, + "time_per_iteration": 2.4549169540405273 + }, + { + "auxiliary_loss_clip": 0.01156183, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.05280566, + "balance_loss_mlp": 1.02215219, + "epoch": 0.403174412312872, + "flos": 27089645506560.0, + "grad_norm": 2.033566684808109, + "language_loss": 0.63932502, + "learning_rate": 2.708705505209726e-06, + "loss": 0.66118753, + "num_input_tokens_seen": 72458315, + "step": 3353, + "time_per_iteration": 3.4310190677642822 + }, + { + "auxiliary_loss_clip": 0.01124356, + "auxiliary_loss_mlp": 0.01024354, + "balance_loss_clip": 1.04532325, + "balance_loss_mlp": 1.016415, + "epoch": 0.4032946552035111, + "flos": 21756336065280.0, + "grad_norm": 2.150736066071613, + "language_loss": 0.91906774, + "learning_rate": 2.7079770258737646e-06, + "loss": 0.94055486, + "num_input_tokens_seen": 72476225, + "step": 3354, + "time_per_iteration": 2.6121082305908203 + }, + { + "auxiliary_loss_clip": 0.01142199, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.04831982, + "balance_loss_mlp": 1.02073979, + "epoch": 0.4034148980941502, + "flos": 17343956448000.0, + "grad_norm": 2.201689803165314, + "language_loss": 0.75071657, + "learning_rate": 2.707248439138251e-06, + "loss": 0.77243948, + "num_input_tokens_seen": 72492460, + "step": 3355, + "time_per_iteration": 2.546983003616333 + }, + { + "auxiliary_loss_clip": 0.01156409, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.05354249, + "balance_loss_mlp": 1.02106929, + "epoch": 0.40353514098478926, + "flos": 22017838055040.0, + "grad_norm": 1.7525433914089898, + "language_loss": 0.6533891, + "learning_rate": 2.7065197451137114e-06, + "loss": 0.67524153, + "num_input_tokens_seen": 72513840, + "step": 3356, + "time_per_iteration": 2.641221284866333 + }, + { + "auxiliary_loss_clip": 0.01159177, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.05222619, + "balance_loss_mlp": 1.02191854, + "epoch": 0.4036553838754284, + "flos": 14246446089600.0, + "grad_norm": 4.634573133330839, + "language_loss": 0.67784619, + "learning_rate": 2.7057909439106894e-06, + "loss": 0.69973826, + "num_input_tokens_seen": 72531695, + "step": 3357, + "time_per_iteration": 2.556687593460083 + }, + { + "auxiliary_loss_clip": 0.01166069, + "auxiliary_loss_mlp": 0.00763196, + "balance_loss_clip": 1.05235612, + "balance_loss_mlp": 1.00063968, + "epoch": 0.40377562676606743, + "flos": 24790644443520.0, + "grad_norm": 1.8197027659565153, + "language_loss": 0.78258085, + "learning_rate": 2.7050620356397417e-06, + "loss": 0.80187345, + "num_input_tokens_seen": 72550645, + "step": 3358, + "time_per_iteration": 2.601382255554199 + }, + { + "auxiliary_loss_clip": 0.01187792, + "auxiliary_loss_mlp": 0.01024552, + "balance_loss_clip": 1.05745316, + "balance_loss_mlp": 1.0167737, + "epoch": 0.40389586965670654, + "flos": 24061226958720.0, + "grad_norm": 1.7769511329795789, + "language_loss": 0.72769475, + "learning_rate": 2.7043330204114437e-06, + "loss": 0.74981821, + "num_input_tokens_seen": 72569355, + "step": 3359, + "time_per_iteration": 2.5475287437438965 + }, + { + "auxiliary_loss_clip": 0.01183028, + "auxiliary_loss_mlp": 0.0102568, + "balance_loss_clip": 1.05226421, + "balance_loss_mlp": 1.01767492, + "epoch": 0.40401611254734565, + "flos": 16399613934720.0, + "grad_norm": 2.513769609287101, + "language_loss": 0.85774803, + "learning_rate": 2.7036038983363862e-06, + "loss": 0.87983513, + "num_input_tokens_seen": 72585960, + "step": 3360, + "time_per_iteration": 2.4227218627929688 + }, + { + "auxiliary_loss_clip": 0.01168042, + "auxiliary_loss_mlp": 0.01027211, + "balance_loss_clip": 1.05344689, + "balance_loss_mlp": 1.01967692, + "epoch": 0.4041363554379847, + "flos": 23988220565760.0, + "grad_norm": 1.9699704767183963, + "language_loss": 0.83907604, + "learning_rate": 2.702874669525177e-06, + "loss": 0.86102855, + "num_input_tokens_seen": 72604440, + "step": 3361, + "time_per_iteration": 2.4971797466278076 + }, + { + "auxiliary_loss_clip": 0.01149977, + "auxiliary_loss_mlp": 0.01030198, + "balance_loss_clip": 1.05567169, + "balance_loss_mlp": 1.02178204, + "epoch": 0.4042565983286238, + "flos": 28401964899840.0, + "grad_norm": 2.995160829185528, + "language_loss": 0.69837582, + "learning_rate": 2.7021453340884394e-06, + "loss": 0.72017753, + "num_input_tokens_seen": 72622165, + "step": 3362, + "time_per_iteration": 2.684680700302124 + }, + { + "auxiliary_loss_clip": 0.01150932, + "auxiliary_loss_mlp": 0.00762455, + "balance_loss_clip": 1.05304241, + "balance_loss_mlp": 1.00068498, + "epoch": 0.40437684121926293, + "flos": 17710963660800.0, + "grad_norm": 3.768968076684646, + "language_loss": 0.72998071, + "learning_rate": 2.7014158921368125e-06, + "loss": 0.74911463, + "num_input_tokens_seen": 72640490, + "step": 3363, + "time_per_iteration": 2.6950089931488037 + }, + { + "auxiliary_loss_clip": 0.01188787, + "auxiliary_loss_mlp": 0.01033315, + "balance_loss_clip": 1.05638933, + "balance_loss_mlp": 1.02474952, + "epoch": 0.404497084109902, + "flos": 24018959629440.0, + "grad_norm": 3.003598760845803, + "language_loss": 0.85531569, + "learning_rate": 2.700686343780953e-06, + "loss": 0.87753671, + "num_input_tokens_seen": 72660360, + "step": 3364, + "time_per_iteration": 2.525210380554199 + }, + { + "auxiliary_loss_clip": 0.01159583, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.05077744, + "balance_loss_mlp": 1.01924253, + "epoch": 0.4046173270005411, + "flos": 22929861306240.0, + "grad_norm": 1.8443931757346368, + "language_loss": 0.88296902, + "learning_rate": 2.699956689131532e-06, + "loss": 0.90483749, + "num_input_tokens_seen": 72680345, + "step": 3365, + "time_per_iteration": 2.582543134689331 + }, + { + "auxiliary_loss_clip": 0.01161955, + "auxiliary_loss_mlp": 0.01030167, + "balance_loss_clip": 1.05262172, + "balance_loss_mlp": 1.02207828, + "epoch": 0.4047375698911802, + "flos": 20668135582080.0, + "grad_norm": 2.1777282076134727, + "language_loss": 0.85241216, + "learning_rate": 2.699226928299238e-06, + "loss": 0.87433338, + "num_input_tokens_seen": 72698365, + "step": 3366, + "time_per_iteration": 2.5951168537139893 + }, + { + "auxiliary_loss_clip": 0.01178453, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.05593991, + "balance_loss_mlp": 1.0252986, + "epoch": 0.40485781278181926, + "flos": 28912865996160.0, + "grad_norm": 2.545876461898942, + "language_loss": 0.7878238, + "learning_rate": 2.698497061394774e-06, + "loss": 0.80994105, + "num_input_tokens_seen": 72716850, + "step": 3367, + "time_per_iteration": 2.6000421047210693 + }, + { + "auxiliary_loss_clip": 0.01152743, + "auxiliary_loss_mlp": 0.00762473, + "balance_loss_clip": 1.05250108, + "balance_loss_mlp": 1.00057828, + "epoch": 0.40497805567245837, + "flos": 23148377694720.0, + "grad_norm": 1.74949930950621, + "language_loss": 0.80760825, + "learning_rate": 2.6977670885288627e-06, + "loss": 0.82676041, + "num_input_tokens_seen": 72738250, + "step": 3368, + "time_per_iteration": 2.6162679195404053 + }, + { + "auxiliary_loss_clip": 0.0114638, + "auxiliary_loss_mlp": 0.01031059, + "balance_loss_clip": 1.04826427, + "balance_loss_mlp": 1.02307153, + "epoch": 0.4050982985630975, + "flos": 16289404030080.0, + "grad_norm": 1.8785039459881647, + "language_loss": 0.75565028, + "learning_rate": 2.6970370098122378e-06, + "loss": 0.77742469, + "num_input_tokens_seen": 72755235, + "step": 3369, + "time_per_iteration": 2.4803152084350586 + }, + { + "auxiliary_loss_clip": 0.01185934, + "auxiliary_loss_mlp": 0.01029498, + "balance_loss_clip": 1.05309784, + "balance_loss_mlp": 1.02171946, + "epoch": 0.40521854145373654, + "flos": 34459484353920.0, + "grad_norm": 1.517422566860198, + "language_loss": 0.86573255, + "learning_rate": 2.6963068253556535e-06, + "loss": 0.88788688, + "num_input_tokens_seen": 72776620, + "step": 3370, + "time_per_iteration": 2.5501656532287598 + }, + { + "auxiliary_loss_clip": 0.01182134, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.05451584, + "balance_loss_mlp": 1.02255392, + "epoch": 0.40533878434437565, + "flos": 25331099454720.0, + "grad_norm": 2.0743434455320546, + "language_loss": 0.8571654, + "learning_rate": 2.6955765352698763e-06, + "loss": 0.87930226, + "num_input_tokens_seen": 72796765, + "step": 3371, + "time_per_iteration": 2.507810115814209 + }, + { + "auxiliary_loss_clip": 0.01188746, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.05356407, + "balance_loss_mlp": 1.02094293, + "epoch": 0.40545902723501476, + "flos": 15012061505280.0, + "grad_norm": 2.040736628151257, + "language_loss": 0.73082161, + "learning_rate": 2.6948461396656923e-06, + "loss": 0.75300694, + "num_input_tokens_seen": 72814175, + "step": 3372, + "time_per_iteration": 2.421536684036255 + }, + { + "auxiliary_loss_clip": 0.01179019, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.05413508, + "balance_loss_mlp": 1.02248538, + "epoch": 0.4055792701256538, + "flos": 25521103422720.0, + "grad_norm": 5.505246459876507, + "language_loss": 0.74623084, + "learning_rate": 2.6941156386539013e-06, + "loss": 0.76832896, + "num_input_tokens_seen": 72834125, + "step": 3373, + "time_per_iteration": 2.5127618312835693 + }, + { + "auxiliary_loss_clip": 0.0115617, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.05351293, + "balance_loss_mlp": 1.02477944, + "epoch": 0.4056995130162929, + "flos": 19574583972480.0, + "grad_norm": 3.4102113129976916, + "language_loss": 0.80934817, + "learning_rate": 2.6933850323453203e-06, + "loss": 0.83124173, + "num_input_tokens_seen": 72852570, + "step": 3374, + "time_per_iteration": 2.5012967586517334 + }, + { + "auxiliary_loss_clip": 0.01188675, + "auxiliary_loss_mlp": 0.01030222, + "balance_loss_clip": 1.05739272, + "balance_loss_mlp": 1.02186513, + "epoch": 0.405819755906932, + "flos": 15413794191360.0, + "grad_norm": 1.9696994552580973, + "language_loss": 0.7519725, + "learning_rate": 2.6926543208507806e-06, + "loss": 0.77416146, + "num_input_tokens_seen": 72871250, + "step": 3375, + "time_per_iteration": 2.4081358909606934 + }, + { + "auxiliary_loss_clip": 0.01172094, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.05341518, + "balance_loss_mlp": 1.02301526, + "epoch": 0.4059399987975711, + "flos": 21433930565760.0, + "grad_norm": 3.541814341524239, + "language_loss": 0.80036485, + "learning_rate": 2.6919235042811316e-06, + "loss": 0.82240087, + "num_input_tokens_seen": 72890035, + "step": 3376, + "time_per_iteration": 2.4642858505249023 + }, + { + "auxiliary_loss_clip": 0.01142091, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.04982281, + "balance_loss_mlp": 1.02175069, + "epoch": 0.4060602416882102, + "flos": 25556942217600.0, + "grad_norm": 1.8242513807755671, + "language_loss": 0.76261628, + "learning_rate": 2.691192582747237e-06, + "loss": 0.78434551, + "num_input_tokens_seen": 72909665, + "step": 3377, + "time_per_iteration": 3.2723262310028076 + }, + { + "auxiliary_loss_clip": 0.01191082, + "auxiliary_loss_mlp": 0.01028176, + "balance_loss_clip": 1.05693889, + "balance_loss_mlp": 1.02045703, + "epoch": 0.40618048457884925, + "flos": 23766759262080.0, + "grad_norm": 2.1210763516408337, + "language_loss": 0.74016714, + "learning_rate": 2.6904615563599765e-06, + "loss": 0.76235974, + "num_input_tokens_seen": 72929465, + "step": 3378, + "time_per_iteration": 3.3012568950653076 + }, + { + "auxiliary_loss_clip": 0.01136995, + "auxiliary_loss_mlp": 0.01025958, + "balance_loss_clip": 1.04615712, + "balance_loss_mlp": 1.01786411, + "epoch": 0.40630072746948837, + "flos": 17639681120640.0, + "grad_norm": 1.8152916060523518, + "language_loss": 0.83151984, + "learning_rate": 2.6897304252302477e-06, + "loss": 0.85314941, + "num_input_tokens_seen": 72946785, + "step": 3379, + "time_per_iteration": 2.551290988922119 + }, + { + "auxiliary_loss_clip": 0.01048165, + "auxiliary_loss_mlp": 0.01008659, + "balance_loss_clip": 1.01534498, + "balance_loss_mlp": 1.00733614, + "epoch": 0.4064209703601275, + "flos": 60836053063680.0, + "grad_norm": 0.7878742042878282, + "language_loss": 0.54784918, + "learning_rate": 2.688999189468962e-06, + "loss": 0.56841743, + "num_input_tokens_seen": 73003215, + "step": 3380, + "time_per_iteration": 4.535002708435059 + }, + { + "auxiliary_loss_clip": 0.0117292, + "auxiliary_loss_mlp": 0.01030892, + "balance_loss_clip": 1.0547663, + "balance_loss_mlp": 1.02280903, + "epoch": 0.40654121325076653, + "flos": 24024346669440.0, + "grad_norm": 2.879956396444131, + "language_loss": 0.7597326, + "learning_rate": 2.6882678491870464e-06, + "loss": 0.78177071, + "num_input_tokens_seen": 73023650, + "step": 3381, + "time_per_iteration": 2.5100486278533936 + }, + { + "auxiliary_loss_clip": 0.01177505, + "auxiliary_loss_mlp": 0.0102287, + "balance_loss_clip": 1.05418301, + "balance_loss_mlp": 1.01376295, + "epoch": 0.40666145614140564, + "flos": 27344252085120.0, + "grad_norm": 1.8367609078169695, + "language_loss": 0.71530521, + "learning_rate": 2.6875364044954453e-06, + "loss": 0.73730898, + "num_input_tokens_seen": 73043880, + "step": 3382, + "time_per_iteration": 2.5359699726104736 + }, + { + "auxiliary_loss_clip": 0.01155083, + "auxiliary_loss_mlp": 0.01026472, + "balance_loss_clip": 1.04630446, + "balance_loss_mlp": 1.01832986, + "epoch": 0.40678169903204475, + "flos": 26176724415360.0, + "grad_norm": 1.5538350475338232, + "language_loss": 0.81987649, + "learning_rate": 2.6868048555051185e-06, + "loss": 0.84169203, + "num_input_tokens_seen": 73065410, + "step": 3383, + "time_per_iteration": 2.560206174850464 + }, + { + "auxiliary_loss_clip": 0.01164935, + "auxiliary_loss_mlp": 0.01026914, + "balance_loss_clip": 1.05048323, + "balance_loss_mlp": 1.01887965, + "epoch": 0.4069019419226838, + "flos": 28622420622720.0, + "grad_norm": 3.3567499022227496, + "language_loss": 0.85436887, + "learning_rate": 2.686073202327041e-06, + "loss": 0.87628734, + "num_input_tokens_seen": 73084410, + "step": 3384, + "time_per_iteration": 2.574819564819336 + }, + { + "auxiliary_loss_clip": 0.01148739, + "auxiliary_loss_mlp": 0.01035883, + "balance_loss_clip": 1.04770923, + "balance_loss_mlp": 1.02688885, + "epoch": 0.4070221848133229, + "flos": 25229006023680.0, + "grad_norm": 1.7324597997257603, + "language_loss": 0.73131609, + "learning_rate": 2.6853414450722043e-06, + "loss": 0.75316232, + "num_input_tokens_seen": 73104075, + "step": 3385, + "time_per_iteration": 2.564838409423828 + }, + { + "auxiliary_loss_clip": 0.0117173, + "auxiliary_loss_mlp": 0.01025349, + "balance_loss_clip": 1.05288458, + "balance_loss_mlp": 1.01702213, + "epoch": 0.40714242770396203, + "flos": 18405224709120.0, + "grad_norm": 1.6292772548161176, + "language_loss": 0.85018742, + "learning_rate": 2.684609583851616e-06, + "loss": 0.87215823, + "num_input_tokens_seen": 73122250, + "step": 3386, + "time_per_iteration": 2.461782217025757 + }, + { + "auxiliary_loss_clip": 0.01131979, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.04925036, + "balance_loss_mlp": 1.02048016, + "epoch": 0.4072626705946011, + "flos": 30228920403840.0, + "grad_norm": 1.57998487975598, + "language_loss": 0.80748272, + "learning_rate": 2.683877618776297e-06, + "loss": 0.82908928, + "num_input_tokens_seen": 73144505, + "step": 3387, + "time_per_iteration": 2.675855875015259 + }, + { + "auxiliary_loss_clip": 0.01154777, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.04938114, + "balance_loss_mlp": 1.02265358, + "epoch": 0.4073829134852402, + "flos": 21834549930240.0, + "grad_norm": 2.106791882240175, + "language_loss": 0.7427578, + "learning_rate": 2.6831455499572876e-06, + "loss": 0.76462287, + "num_input_tokens_seen": 73162440, + "step": 3388, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.01189557, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.05378747, + "balance_loss_mlp": 1.02075052, + "epoch": 0.40750315637587925, + "flos": 25260211964160.0, + "grad_norm": 1.9460588460424426, + "language_loss": 0.77921486, + "learning_rate": 2.682413377505641e-06, + "loss": 0.80140603, + "num_input_tokens_seen": 73181245, + "step": 3389, + "time_per_iteration": 2.4701693058013916 + }, + { + "auxiliary_loss_clip": 0.01174005, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.05134273, + "balance_loss_mlp": 1.02021456, + "epoch": 0.40762339926651836, + "flos": 19712767593600.0, + "grad_norm": 1.746821684602618, + "language_loss": 0.76063877, + "learning_rate": 2.6816811015324284e-06, + "loss": 0.78266704, + "num_input_tokens_seen": 73199295, + "step": 3390, + "time_per_iteration": 2.4750306606292725 + }, + { + "auxiliary_loss_clip": 0.01080721, + "auxiliary_loss_mlp": 0.01002791, + "balance_loss_clip": 1.01425374, + "balance_loss_mlp": 1.00154543, + "epoch": 0.40774364215715747, + "flos": 71449307314560.0, + "grad_norm": 0.7235013260472605, + "language_loss": 0.56672317, + "learning_rate": 2.6809487221487343e-06, + "loss": 0.58755827, + "num_input_tokens_seen": 73258780, + "step": 3391, + "time_per_iteration": 2.9528729915618896 + }, + { + "auxiliary_loss_clip": 0.01166255, + "auxiliary_loss_mlp": 0.01025042, + "balance_loss_clip": 1.05072784, + "balance_loss_mlp": 1.01650655, + "epoch": 0.4078638850477965, + "flos": 15084134144640.0, + "grad_norm": 2.1466642956922035, + "language_loss": 0.82090342, + "learning_rate": 2.6802162394656605e-06, + "loss": 0.84281635, + "num_input_tokens_seen": 73275490, + "step": 3392, + "time_per_iteration": 2.446164846420288 + }, + { + "auxiliary_loss_clip": 0.01155929, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.04787469, + "balance_loss_mlp": 1.02456248, + "epoch": 0.40798412793843564, + "flos": 23842890138240.0, + "grad_norm": 2.76695444627874, + "language_loss": 0.71378803, + "learning_rate": 2.679483653594324e-06, + "loss": 0.73567402, + "num_input_tokens_seen": 73297260, + "step": 3393, + "time_per_iteration": 2.545386552810669 + }, + { + "auxiliary_loss_clip": 0.01177303, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.05362964, + "balance_loss_mlp": 1.0225389, + "epoch": 0.40810437082907475, + "flos": 21065774117760.0, + "grad_norm": 2.348562773448833, + "language_loss": 0.76341552, + "learning_rate": 2.678750964645857e-06, + "loss": 0.78549743, + "num_input_tokens_seen": 73316340, + "step": 3394, + "time_per_iteration": 2.5399010181427 + }, + { + "auxiliary_loss_clip": 0.01179296, + "auxiliary_loss_mlp": 0.01028489, + "balance_loss_clip": 1.0584774, + "balance_loss_mlp": 1.01988244, + "epoch": 0.4082246137197138, + "flos": 11321377948800.0, + "grad_norm": 2.5485368526909595, + "language_loss": 0.84008265, + "learning_rate": 2.6780181727314094e-06, + "loss": 0.86216056, + "num_input_tokens_seen": 73331245, + "step": 3395, + "time_per_iteration": 2.5160892009735107 + }, + { + "auxiliary_loss_clip": 0.0115031, + "auxiliary_loss_mlp": 0.00763064, + "balance_loss_clip": 1.05069304, + "balance_loss_mlp": 1.00093842, + "epoch": 0.4083448566103529, + "flos": 19062569554560.0, + "grad_norm": 2.7262063810145323, + "language_loss": 0.77627873, + "learning_rate": 2.6772852779621435e-06, + "loss": 0.79541248, + "num_input_tokens_seen": 73349105, + "step": 3396, + "time_per_iteration": 2.5755107402801514 + }, + { + "auxiliary_loss_clip": 0.01171115, + "auxiliary_loss_mlp": 0.00762818, + "balance_loss_clip": 1.05643058, + "balance_loss_mlp": 1.00097108, + "epoch": 0.408465099500992, + "flos": 23550254035200.0, + "grad_norm": 1.9034177985132772, + "language_loss": 0.86282933, + "learning_rate": 2.676552280449239e-06, + "loss": 0.88216865, + "num_input_tokens_seen": 73368990, + "step": 3397, + "time_per_iteration": 2.6085634231567383 + }, + { + "auxiliary_loss_clip": 0.01166316, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.05149829, + "balance_loss_mlp": 1.0216527, + "epoch": 0.4085853423916311, + "flos": 12750012558720.0, + "grad_norm": 2.4323699504329377, + "language_loss": 0.7562567, + "learning_rate": 2.6758191803038917e-06, + "loss": 0.77822673, + "num_input_tokens_seen": 73387485, + "step": 3398, + "time_per_iteration": 2.523700475692749 + }, + { + "auxiliary_loss_clip": 0.01108179, + "auxiliary_loss_mlp": 0.0102616, + "balance_loss_clip": 1.04716229, + "balance_loss_mlp": 1.01758277, + "epoch": 0.4087055852822702, + "flos": 24353072962560.0, + "grad_norm": 3.0275940700559296, + "language_loss": 0.82751322, + "learning_rate": 2.6750859776373125e-06, + "loss": 0.84885663, + "num_input_tokens_seen": 73406940, + "step": 3399, + "time_per_iteration": 2.7062489986419678 + }, + { + "auxiliary_loss_clip": 0.01031151, + "auxiliary_loss_mlp": 0.01000693, + "balance_loss_clip": 1.01309025, + "balance_loss_mlp": 0.99925613, + "epoch": 0.4088258281729093, + "flos": 66387950720640.0, + "grad_norm": 0.767527535391311, + "language_loss": 0.60357404, + "learning_rate": 2.674352672560727e-06, + "loss": 0.62389243, + "num_input_tokens_seen": 73468385, + "step": 3400, + "time_per_iteration": 3.261997699737549 + }, + { + "auxiliary_loss_clip": 0.01144754, + "auxiliary_loss_mlp": 0.01027954, + "balance_loss_clip": 1.04836059, + "balance_loss_mlp": 1.01894224, + "epoch": 0.40894607106354836, + "flos": 20449260057600.0, + "grad_norm": 1.9782434343100483, + "language_loss": 0.76856035, + "learning_rate": 2.673619265185377e-06, + "loss": 0.79028744, + "num_input_tokens_seen": 73488225, + "step": 3401, + "time_per_iteration": 2.7532010078430176 + }, + { + "auxiliary_loss_clip": 0.01175874, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.05194306, + "balance_loss_mlp": 1.0236963, + "epoch": 0.40906631395418747, + "flos": 27053627143680.0, + "grad_norm": 1.6622402653856048, + "language_loss": 0.7786988, + "learning_rate": 2.672885755622521e-06, + "loss": 0.80078387, + "num_input_tokens_seen": 73510640, + "step": 3402, + "time_per_iteration": 2.563615322113037 + }, + { + "auxiliary_loss_clip": 0.01130408, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.04636717, + "balance_loss_mlp": 1.02114654, + "epoch": 0.4091865568448266, + "flos": 25484151306240.0, + "grad_norm": 2.2898814671604923, + "language_loss": 0.69673264, + "learning_rate": 2.67215214398343e-06, + "loss": 0.71833295, + "num_input_tokens_seen": 73530655, + "step": 3403, + "time_per_iteration": 3.396601676940918 + }, + { + "auxiliary_loss_clip": 0.01134747, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.04578376, + "balance_loss_mlp": 1.02227724, + "epoch": 0.40930679973546563, + "flos": 28657864368000.0, + "grad_norm": 3.225292698512265, + "language_loss": 0.77634311, + "learning_rate": 2.671418430379393e-06, + "loss": 0.79800606, + "num_input_tokens_seen": 73549340, + "step": 3404, + "time_per_iteration": 2.643008232116699 + }, + { + "auxiliary_loss_clip": 0.01189202, + "auxiliary_loss_mlp": 0.01023139, + "balance_loss_clip": 1.05460966, + "balance_loss_mlp": 1.01481223, + "epoch": 0.40942704262610474, + "flos": 20886292834560.0, + "grad_norm": 2.309107184282275, + "language_loss": 0.83156872, + "learning_rate": 2.670684614921715e-06, + "loss": 0.85369217, + "num_input_tokens_seen": 73568315, + "step": 3405, + "time_per_iteration": 3.2745964527130127 + }, + { + "auxiliary_loss_clip": 0.01161819, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.05047178, + "balance_loss_mlp": 1.02193379, + "epoch": 0.4095472855167438, + "flos": 21618080616960.0, + "grad_norm": 2.34073979956227, + "language_loss": 0.694103, + "learning_rate": 2.6699506977217128e-06, + "loss": 0.71602786, + "num_input_tokens_seen": 73588490, + "step": 3406, + "time_per_iteration": 4.122801780700684 + }, + { + "auxiliary_loss_clip": 0.01171967, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.05481625, + "balance_loss_mlp": 1.02218819, + "epoch": 0.4096675284073829, + "flos": 27926112499200.0, + "grad_norm": 2.288117727134032, + "language_loss": 0.70290232, + "learning_rate": 2.6692166788907233e-06, + "loss": 0.72492617, + "num_input_tokens_seen": 73608685, + "step": 3407, + "time_per_iteration": 2.5622925758361816 + }, + { + "auxiliary_loss_clip": 0.01161344, + "auxiliary_loss_mlp": 0.01028624, + "balance_loss_clip": 1.05118001, + "balance_loss_mlp": 1.01988602, + "epoch": 0.409787771298022, + "flos": 19206607092480.0, + "grad_norm": 2.1197579290866195, + "language_loss": 0.77056277, + "learning_rate": 2.6684825585400957e-06, + "loss": 0.79246247, + "num_input_tokens_seen": 73627630, + "step": 3408, + "time_per_iteration": 2.503199815750122 + }, + { + "auxiliary_loss_clip": 0.01056151, + "auxiliary_loss_mlp": 0.01005147, + "balance_loss_clip": 1.01473641, + "balance_loss_mlp": 1.00376463, + "epoch": 0.4099080141886611, + "flos": 59269234832640.0, + "grad_norm": 0.8197824288229554, + "language_loss": 0.65154827, + "learning_rate": 2.6677483367811947e-06, + "loss": 0.67216122, + "num_input_tokens_seen": 73687670, + "step": 3409, + "time_per_iteration": 3.190415143966675 + }, + { + "auxiliary_loss_clip": 0.01176952, + "auxiliary_loss_mlp": 0.01023807, + "balance_loss_clip": 1.05206275, + "balance_loss_mlp": 1.01610065, + "epoch": 0.4100282570793002, + "flos": 21906443001600.0, + "grad_norm": 2.0445731237543034, + "language_loss": 0.75462395, + "learning_rate": 2.6670140137254028e-06, + "loss": 0.77663153, + "num_input_tokens_seen": 73707145, + "step": 3410, + "time_per_iteration": 2.4818191528320312 + }, + { + "auxiliary_loss_clip": 0.01125111, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.04633403, + "balance_loss_mlp": 1.01972151, + "epoch": 0.4101484999699393, + "flos": 18551596631040.0, + "grad_norm": 2.5634665694265144, + "language_loss": 0.89821774, + "learning_rate": 2.666279589484115e-06, + "loss": 0.91974789, + "num_input_tokens_seen": 73725045, + "step": 3411, + "time_per_iteration": 2.5508110523223877 + }, + { + "auxiliary_loss_clip": 0.01131196, + "auxiliary_loss_mlp": 0.01027709, + "balance_loss_clip": 1.04761612, + "balance_loss_mlp": 1.01956677, + "epoch": 0.41026874286057835, + "flos": 19094529680640.0, + "grad_norm": 2.0592856593785056, + "language_loss": 0.80735946, + "learning_rate": 2.6655450641687435e-06, + "loss": 0.8289485, + "num_input_tokens_seen": 73742610, + "step": 3412, + "time_per_iteration": 2.5557940006256104 + }, + { + "auxiliary_loss_clip": 0.0118996, + "auxiliary_loss_mlp": 0.01028472, + "balance_loss_clip": 1.05770516, + "balance_loss_mlp": 1.01988292, + "epoch": 0.41038898575121746, + "flos": 31209568588800.0, + "grad_norm": 2.6448639136807257, + "language_loss": 0.69549823, + "learning_rate": 2.664810437890715e-06, + "loss": 0.71768248, + "num_input_tokens_seen": 73764280, + "step": 3413, + "time_per_iteration": 2.5437657833099365 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01025385, + "balance_loss_clip": 1.04959929, + "balance_loss_mlp": 1.01795793, + "epoch": 0.41050922864185657, + "flos": 14355865895040.0, + "grad_norm": 2.0539130652973125, + "language_loss": 0.79385108, + "learning_rate": 2.6640757107614714e-06, + "loss": 0.81517959, + "num_input_tokens_seen": 73782375, + "step": 3414, + "time_per_iteration": 2.5709445476531982 + }, + { + "auxiliary_loss_clip": 0.01140304, + "auxiliary_loss_mlp": 0.01025376, + "balance_loss_clip": 1.05044413, + "balance_loss_mlp": 1.01674509, + "epoch": 0.4106294715324956, + "flos": 30956290813440.0, + "grad_norm": 2.1589525166816923, + "language_loss": 0.69237727, + "learning_rate": 2.6633408828924697e-06, + "loss": 0.71403408, + "num_input_tokens_seen": 73801240, + "step": 3415, + "time_per_iteration": 2.6221256256103516 + }, + { + "auxiliary_loss_clip": 0.01154148, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.05271721, + "balance_loss_mlp": 1.02241242, + "epoch": 0.41074971442313474, + "flos": 24457321209600.0, + "grad_norm": 1.6333547079799817, + "language_loss": 0.69926041, + "learning_rate": 2.662605954395185e-06, + "loss": 0.72110808, + "num_input_tokens_seen": 73821200, + "step": 3416, + "time_per_iteration": 2.5888845920562744 + }, + { + "auxiliary_loss_clip": 0.011766, + "auxiliary_loss_mlp": 0.01025112, + "balance_loss_clip": 1.05222785, + "balance_loss_mlp": 1.01677883, + "epoch": 0.41086995731377385, + "flos": 21542991235200.0, + "grad_norm": 1.6837673350531277, + "language_loss": 0.83418494, + "learning_rate": 2.6618709253811027e-06, + "loss": 0.85620201, + "num_input_tokens_seen": 73840655, + "step": 3417, + "time_per_iteration": 2.52815842628479 + }, + { + "auxiliary_loss_clip": 0.01185472, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.0554291, + "balance_loss_mlp": 1.01994085, + "epoch": 0.4109902002044129, + "flos": 20702753314560.0, + "grad_norm": 1.6793550258344474, + "language_loss": 0.87687373, + "learning_rate": 2.6611357959617277e-06, + "loss": 0.899001, + "num_input_tokens_seen": 73860275, + "step": 3418, + "time_per_iteration": 2.4705028533935547 + }, + { + "auxiliary_loss_clip": 0.01140621, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.04940045, + "balance_loss_mlp": 1.02503836, + "epoch": 0.411110443095052, + "flos": 18179992477440.0, + "grad_norm": 1.8821408728280047, + "language_loss": 0.90859193, + "learning_rate": 2.660400566248578e-06, + "loss": 0.93033606, + "num_input_tokens_seen": 73878400, + "step": 3419, + "time_per_iteration": 2.547661066055298 + }, + { + "auxiliary_loss_clip": 0.01145698, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.04919338, + "balance_loss_mlp": 1.02731156, + "epoch": 0.41123068598569107, + "flos": 14575244209920.0, + "grad_norm": 3.072109335260676, + "language_loss": 0.66615963, + "learning_rate": 2.6596652363531876e-06, + "loss": 0.68798453, + "num_input_tokens_seen": 73894275, + "step": 3420, + "time_per_iteration": 2.528951406478882 + }, + { + "auxiliary_loss_clip": 0.01188424, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.0549283, + "balance_loss_mlp": 1.01972866, + "epoch": 0.4113509288763302, + "flos": 21177995184000.0, + "grad_norm": 1.5638907938798754, + "language_loss": 0.77947402, + "learning_rate": 2.6589298063871055e-06, + "loss": 0.80163676, + "num_input_tokens_seen": 73914450, + "step": 3421, + "time_per_iteration": 2.482619524002075 + }, + { + "auxiliary_loss_clip": 0.01186769, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.05388308, + "balance_loss_mlp": 1.02182066, + "epoch": 0.4114711717669693, + "flos": 18442212739200.0, + "grad_norm": 2.080904653297268, + "language_loss": 0.69872248, + "learning_rate": 2.658194276461895e-06, + "loss": 0.7208941, + "num_input_tokens_seen": 73932375, + "step": 3422, + "time_per_iteration": 2.4291203022003174 + }, + { + "auxiliary_loss_clip": 0.01161257, + "auxiliary_loss_mlp": 0.01025921, + "balance_loss_clip": 1.04919267, + "balance_loss_mlp": 1.01704597, + "epoch": 0.41159141465760835, + "flos": 27233395735680.0, + "grad_norm": 1.9636517805485045, + "language_loss": 0.67030615, + "learning_rate": 2.6574586466891368e-06, + "loss": 0.69217795, + "num_input_tokens_seen": 73952850, + "step": 3423, + "time_per_iteration": 2.5547432899475098 + }, + { + "auxiliary_loss_clip": 0.01160122, + "auxiliary_loss_mlp": 0.00762727, + "balance_loss_clip": 1.05127549, + "balance_loss_mlp": 1.00093436, + "epoch": 0.41171165754824746, + "flos": 20006876154240.0, + "grad_norm": 2.1985992797445735, + "language_loss": 0.64934325, + "learning_rate": 2.6567229171804247e-06, + "loss": 0.66857171, + "num_input_tokens_seen": 73970735, + "step": 3424, + "time_per_iteration": 2.5145697593688965 + }, + { + "auxiliary_loss_clip": 0.01152046, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.04720902, + "balance_loss_mlp": 1.02428436, + "epoch": 0.41183190043888657, + "flos": 18004318035840.0, + "grad_norm": 2.62721766601594, + "language_loss": 0.87519068, + "learning_rate": 2.655987088047368e-06, + "loss": 0.89704877, + "num_input_tokens_seen": 73989080, + "step": 3425, + "time_per_iteration": 2.5357494354248047 + }, + { + "auxiliary_loss_clip": 0.01154778, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.04980755, + "balance_loss_mlp": 1.01971483, + "epoch": 0.4119521433295256, + "flos": 27163370171520.0, + "grad_norm": 2.0266619826274384, + "language_loss": 0.78607047, + "learning_rate": 2.6552511594015912e-06, + "loss": 0.80790281, + "num_input_tokens_seen": 74009470, + "step": 3426, + "time_per_iteration": 2.570268154144287 + }, + { + "auxiliary_loss_clip": 0.01159552, + "auxiliary_loss_mlp": 0.01027946, + "balance_loss_clip": 1.04875731, + "balance_loss_mlp": 1.01886845, + "epoch": 0.41207238622016473, + "flos": 15122020014720.0, + "grad_norm": 2.115372562519257, + "language_loss": 0.85554671, + "learning_rate": 2.654515131354735e-06, + "loss": 0.87742174, + "num_input_tokens_seen": 74027735, + "step": 3427, + "time_per_iteration": 2.556579351425171 + }, + { + "auxiliary_loss_clip": 0.01145987, + "auxiliary_loss_mlp": 0.0102836, + "balance_loss_clip": 1.05081773, + "balance_loss_mlp": 1.02053988, + "epoch": 0.41219262911080384, + "flos": 27052872958080.0, + "grad_norm": 5.5979589959061515, + "language_loss": 0.85114133, + "learning_rate": 2.653779004018453e-06, + "loss": 0.87288475, + "num_input_tokens_seen": 74048300, + "step": 3428, + "time_per_iteration": 2.6243696212768555 + }, + { + "auxiliary_loss_clip": 0.0115452, + "auxiliary_loss_mlp": 0.01022999, + "balance_loss_clip": 1.05247712, + "balance_loss_mlp": 1.01508331, + "epoch": 0.4123128720014429, + "flos": 24686360282880.0, + "grad_norm": 1.872620260758483, + "language_loss": 0.82152724, + "learning_rate": 2.653042777504417e-06, + "loss": 0.84330249, + "num_input_tokens_seen": 74070890, + "step": 3429, + "time_per_iteration": 2.5929832458496094 + }, + { + "auxiliary_loss_clip": 0.01168484, + "auxiliary_loss_mlp": 0.01024946, + "balance_loss_clip": 1.05186653, + "balance_loss_mlp": 1.01638126, + "epoch": 0.412433114892082, + "flos": 26244774731520.0, + "grad_norm": 2.59469031871355, + "language_loss": 0.80026925, + "learning_rate": 2.6523064519243105e-06, + "loss": 0.82220352, + "num_input_tokens_seen": 74090460, + "step": 3430, + "time_per_iteration": 3.329582929611206 + }, + { + "auxiliary_loss_clip": 0.01177108, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.05589819, + "balance_loss_mlp": 1.02112496, + "epoch": 0.4125533577827211, + "flos": 21361031913600.0, + "grad_norm": 2.4809419560474595, + "language_loss": 0.78626812, + "learning_rate": 2.6515700273898333e-06, + "loss": 0.80834544, + "num_input_tokens_seen": 74108335, + "step": 3431, + "time_per_iteration": 3.3137619495391846 + }, + { + "auxiliary_loss_clip": 0.01150531, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.05218363, + "balance_loss_mlp": 1.02180445, + "epoch": 0.4126736006733602, + "flos": 26067556005120.0, + "grad_norm": 1.975818722427587, + "language_loss": 0.68737942, + "learning_rate": 2.6508335040127018e-06, + "loss": 0.70919758, + "num_input_tokens_seen": 74128030, + "step": 3432, + "time_per_iteration": 4.114506006240845 + }, + { + "auxiliary_loss_clip": 0.01179899, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.05597699, + "balance_loss_mlp": 1.02267814, + "epoch": 0.4127938435639993, + "flos": 25666146541440.0, + "grad_norm": 1.6011759157930252, + "language_loss": 0.77187073, + "learning_rate": 2.6500968819046446e-06, + "loss": 0.79398066, + "num_input_tokens_seen": 74148330, + "step": 3433, + "time_per_iteration": 2.523697853088379 + }, + { + "auxiliary_loss_clip": 0.01133074, + "auxiliary_loss_mlp": 0.01032778, + "balance_loss_clip": 1.045367, + "balance_loss_mlp": 1.02430248, + "epoch": 0.4129140864546384, + "flos": 17995914253440.0, + "grad_norm": 3.6070242218080013, + "language_loss": 0.58792591, + "learning_rate": 2.649360161177408e-06, + "loss": 0.60958445, + "num_input_tokens_seen": 74163390, + "step": 3434, + "time_per_iteration": 2.510352849960327 + }, + { + "auxiliary_loss_clip": 0.01184508, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.0556457, + "balance_loss_mlp": 1.0215652, + "epoch": 0.41303432934527745, + "flos": 23732895715200.0, + "grad_norm": 3.326652894347205, + "language_loss": 0.73621893, + "learning_rate": 2.6486233419427504e-06, + "loss": 0.75836885, + "num_input_tokens_seen": 74183205, + "step": 3435, + "time_per_iteration": 2.499953508377075 + }, + { + "auxiliary_loss_clip": 0.01136742, + "auxiliary_loss_mlp": 0.01023384, + "balance_loss_clip": 1.0495373, + "balance_loss_mlp": 1.01446676, + "epoch": 0.41315457223591656, + "flos": 19755286318080.0, + "grad_norm": 2.2446074165718777, + "language_loss": 0.74869645, + "learning_rate": 2.6478864243124484e-06, + "loss": 0.77029777, + "num_input_tokens_seen": 74202870, + "step": 3436, + "time_per_iteration": 2.5414175987243652 + }, + { + "auxiliary_loss_clip": 0.0117579, + "auxiliary_loss_mlp": 0.01022029, + "balance_loss_clip": 1.05158401, + "balance_loss_mlp": 1.01428032, + "epoch": 0.4132748151265556, + "flos": 20923316778240.0, + "grad_norm": 2.31771363954229, + "language_loss": 0.85162759, + "learning_rate": 2.6471494083982903e-06, + "loss": 0.87360579, + "num_input_tokens_seen": 74222255, + "step": 3437, + "time_per_iteration": 2.499105453491211 + }, + { + "auxiliary_loss_clip": 0.01148945, + "auxiliary_loss_mlp": 0.01025664, + "balance_loss_clip": 1.04856646, + "balance_loss_mlp": 1.01762378, + "epoch": 0.4133950580171947, + "flos": 32232520016640.0, + "grad_norm": 1.6624276064415302, + "language_loss": 0.74479854, + "learning_rate": 2.6464122943120818e-06, + "loss": 0.76654464, + "num_input_tokens_seen": 74242480, + "step": 3438, + "time_per_iteration": 2.6180479526519775 + }, + { + "auxiliary_loss_clip": 0.01144039, + "auxiliary_loss_mlp": 0.01024243, + "balance_loss_clip": 1.05103528, + "balance_loss_mlp": 1.01525426, + "epoch": 0.41351530090783384, + "flos": 23292487059840.0, + "grad_norm": 3.7352269677433436, + "language_loss": 0.82428938, + "learning_rate": 2.645675082165642e-06, + "loss": 0.84597224, + "num_input_tokens_seen": 74258690, + "step": 3439, + "time_per_iteration": 2.5618815422058105 + }, + { + "auxiliary_loss_clip": 0.01161618, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.05296695, + "balance_loss_mlp": 1.02150035, + "epoch": 0.4136355437984729, + "flos": 25593571111680.0, + "grad_norm": 2.1371899138338586, + "language_loss": 0.75619322, + "learning_rate": 2.644937772070806e-06, + "loss": 0.77811539, + "num_input_tokens_seen": 74277135, + "step": 3440, + "time_per_iteration": 2.554091691970825 + }, + { + "auxiliary_loss_clip": 0.01191809, + "auxiliary_loss_mlp": 0.01027805, + "balance_loss_clip": 1.05635095, + "balance_loss_mlp": 1.01911438, + "epoch": 0.413755786689112, + "flos": 19828615933440.0, + "grad_norm": 2.2618170838188556, + "language_loss": 0.83301812, + "learning_rate": 2.6442003641394225e-06, + "loss": 0.85521424, + "num_input_tokens_seen": 74294730, + "step": 3441, + "time_per_iteration": 2.451998233795166 + }, + { + "auxiliary_loss_clip": 0.01159878, + "auxiliary_loss_mlp": 0.01027359, + "balance_loss_clip": 1.0508014, + "balance_loss_mlp": 1.01900887, + "epoch": 0.4138760295797511, + "flos": 26870446759680.0, + "grad_norm": 1.944563850336249, + "language_loss": 0.84026408, + "learning_rate": 2.643462858483356e-06, + "loss": 0.86213648, + "num_input_tokens_seen": 74315015, + "step": 3442, + "time_per_iteration": 2.565828323364258 + }, + { + "auxiliary_loss_clip": 0.01129051, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.04881001, + "balance_loss_mlp": 1.01899636, + "epoch": 0.41399627247039017, + "flos": 16399254798720.0, + "grad_norm": 1.7978710911546514, + "language_loss": 0.7218293, + "learning_rate": 2.6427252552144856e-06, + "loss": 0.74340189, + "num_input_tokens_seen": 74333665, + "step": 3443, + "time_per_iteration": 2.5774409770965576 + }, + { + "auxiliary_loss_clip": 0.01190194, + "auxiliary_loss_mlp": 0.01032007, + "balance_loss_clip": 1.05418873, + "balance_loss_mlp": 1.02322078, + "epoch": 0.4141165153610293, + "flos": 22930220442240.0, + "grad_norm": 5.370349999888561, + "language_loss": 0.75062072, + "learning_rate": 2.6419875544447044e-06, + "loss": 0.77284276, + "num_input_tokens_seen": 74355065, + "step": 3444, + "time_per_iteration": 2.4893596172332764 + }, + { + "auxiliary_loss_clip": 0.01190307, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.05368412, + "balance_loss_mlp": 1.02231371, + "epoch": 0.4142367582516684, + "flos": 25192556697600.0, + "grad_norm": 2.0599400003892376, + "language_loss": 0.71195054, + "learning_rate": 2.6412497562859218e-06, + "loss": 0.73416555, + "num_input_tokens_seen": 74376345, + "step": 3445, + "time_per_iteration": 2.4946181774139404 + }, + { + "auxiliary_loss_clip": 0.01180622, + "auxiliary_loss_mlp": 0.01027869, + "balance_loss_clip": 1.05375731, + "balance_loss_mlp": 1.01917315, + "epoch": 0.41435700114230745, + "flos": 21690476478720.0, + "grad_norm": 2.4185624792496063, + "language_loss": 0.75729215, + "learning_rate": 2.6405118608500617e-06, + "loss": 0.77937698, + "num_input_tokens_seen": 74395170, + "step": 3446, + "time_per_iteration": 2.499781608581543 + }, + { + "auxiliary_loss_clip": 0.01138678, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.05182171, + "balance_loss_mlp": 1.02151167, + "epoch": 0.41447724403294656, + "flos": 25995160143360.0, + "grad_norm": 1.7583774641793604, + "language_loss": 0.81425726, + "learning_rate": 2.6397738682490613e-06, + "loss": 0.83594227, + "num_input_tokens_seen": 74416070, + "step": 3447, + "time_per_iteration": 2.6017513275146484 + }, + { + "auxiliary_loss_clip": 0.01188759, + "auxiliary_loss_mlp": 0.01025379, + "balance_loss_clip": 1.05385566, + "balance_loss_mlp": 1.01680779, + "epoch": 0.41459748692358567, + "flos": 18259678800000.0, + "grad_norm": 1.9741175799269242, + "language_loss": 0.75492561, + "learning_rate": 2.6390357785948734e-06, + "loss": 0.77706695, + "num_input_tokens_seen": 74433185, + "step": 3448, + "time_per_iteration": 2.435663938522339 + }, + { + "auxiliary_loss_clip": 0.01178312, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.05584478, + "balance_loss_mlp": 1.02313232, + "epoch": 0.4147177298142247, + "flos": 24168456034560.0, + "grad_norm": 1.883460808037715, + "language_loss": 0.80439609, + "learning_rate": 2.6382975919994667e-06, + "loss": 0.82650065, + "num_input_tokens_seen": 74453760, + "step": 3449, + "time_per_iteration": 2.514569044113159 + }, + { + "auxiliary_loss_clip": 0.0116452, + "auxiliary_loss_mlp": 0.01026545, + "balance_loss_clip": 1.05328178, + "balance_loss_mlp": 1.01888335, + "epoch": 0.41483797270486383, + "flos": 20084659056000.0, + "grad_norm": 2.362100286580372, + "language_loss": 0.72843051, + "learning_rate": 2.637559308574822e-06, + "loss": 0.75034118, + "num_input_tokens_seen": 74473505, + "step": 3450, + "time_per_iteration": 2.513347864151001 + }, + { + "auxiliary_loss_clip": 0.01189985, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.05414605, + "balance_loss_mlp": 1.02029872, + "epoch": 0.4149582155955029, + "flos": 30081040110720.0, + "grad_norm": 2.633180002858095, + "language_loss": 0.709126, + "learning_rate": 2.6368209284329376e-06, + "loss": 0.73131227, + "num_input_tokens_seen": 74494135, + "step": 3451, + "time_per_iteration": 2.520345687866211 + }, + { + "auxiliary_loss_clip": 0.01172299, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.04958916, + "balance_loss_mlp": 1.02108264, + "epoch": 0.415078458486142, + "flos": 16764394504320.0, + "grad_norm": 1.9735396194302728, + "language_loss": 0.75512826, + "learning_rate": 2.636082451685825e-06, + "loss": 0.77715039, + "num_input_tokens_seen": 74512335, + "step": 3452, + "time_per_iteration": 2.4536478519439697 + }, + { + "auxiliary_loss_clip": 0.01165164, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.05451453, + "balance_loss_mlp": 1.02073264, + "epoch": 0.4151987013767811, + "flos": 26033692458240.0, + "grad_norm": 1.5665437396242603, + "language_loss": 0.86353528, + "learning_rate": 2.6353438784455094e-06, + "loss": 0.8854804, + "num_input_tokens_seen": 74535620, + "step": 3453, + "time_per_iteration": 2.568577289581299 + }, + { + "auxiliary_loss_clip": 0.01157677, + "auxiliary_loss_mlp": 0.01031472, + "balance_loss_clip": 1.05245972, + "balance_loss_mlp": 1.02216733, + "epoch": 0.41531894426742016, + "flos": 24608002763520.0, + "grad_norm": 2.2674039890765263, + "language_loss": 0.71252942, + "learning_rate": 2.6346052088240326e-06, + "loss": 0.7344209, + "num_input_tokens_seen": 74555140, + "step": 3454, + "time_per_iteration": 2.5342743396759033 + }, + { + "auxiliary_loss_clip": 0.01164606, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.05224299, + "balance_loss_mlp": 1.02244782, + "epoch": 0.4154391871580593, + "flos": 14975791747200.0, + "grad_norm": 2.040463534528826, + "language_loss": 0.76712239, + "learning_rate": 2.63386644293345e-06, + "loss": 0.78908777, + "num_input_tokens_seen": 74571485, + "step": 3455, + "time_per_iteration": 2.4877262115478516 + }, + { + "auxiliary_loss_clip": 0.01140245, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.04618835, + "balance_loss_mlp": 1.01984644, + "epoch": 0.4155594300486984, + "flos": 14647173194880.0, + "grad_norm": 2.231062851446935, + "language_loss": 0.83186746, + "learning_rate": 2.633127580885833e-06, + "loss": 0.85354561, + "num_input_tokens_seen": 74585985, + "step": 3456, + "time_per_iteration": 2.501678943634033 + }, + { + "auxiliary_loss_clip": 0.01190721, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.05790162, + "balance_loss_mlp": 1.0260179, + "epoch": 0.41567967293933744, + "flos": 29497276275840.0, + "grad_norm": 2.102304569738913, + "language_loss": 0.64589775, + "learning_rate": 2.632388622793265e-06, + "loss": 0.66815025, + "num_input_tokens_seen": 74605140, + "step": 3457, + "time_per_iteration": 3.291083335876465 + }, + { + "auxiliary_loss_clip": 0.01174412, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.05391014, + "balance_loss_mlp": 1.02164578, + "epoch": 0.41579991582997655, + "flos": 19238387650560.0, + "grad_norm": 1.658609990276113, + "language_loss": 0.67766488, + "learning_rate": 2.6316495687678457e-06, + "loss": 0.69970632, + "num_input_tokens_seen": 74623790, + "step": 3458, + "time_per_iteration": 3.2127020359039307 + }, + { + "auxiliary_loss_clip": 0.01125088, + "auxiliary_loss_mlp": 0.01026822, + "balance_loss_clip": 1.04655552, + "balance_loss_mlp": 1.01792336, + "epoch": 0.41592015872061566, + "flos": 24462061804800.0, + "grad_norm": 9.331168603365139, + "language_loss": 0.76401633, + "learning_rate": 2.6309104189216887e-06, + "loss": 0.7855354, + "num_input_tokens_seen": 74641355, + "step": 3459, + "time_per_iteration": 3.4677317142486572 + }, + { + "auxiliary_loss_clip": 0.01132949, + "auxiliary_loss_mlp": 0.00763744, + "balance_loss_clip": 1.04670978, + "balance_loss_mlp": 1.00101507, + "epoch": 0.4160404016112547, + "flos": 20775651966720.0, + "grad_norm": 2.2877990173276435, + "language_loss": 0.74585223, + "learning_rate": 2.630171173366923e-06, + "loss": 0.76481915, + "num_input_tokens_seen": 74657155, + "step": 3460, + "time_per_iteration": 2.5467512607574463 + }, + { + "auxiliary_loss_clip": 0.01130984, + "auxiliary_loss_mlp": 0.01031716, + "balance_loss_clip": 1.04767704, + "balance_loss_mlp": 1.02295387, + "epoch": 0.41616064450189383, + "flos": 13916462820480.0, + "grad_norm": 2.702301844786639, + "language_loss": 0.74477947, + "learning_rate": 2.629431832215691e-06, + "loss": 0.76640648, + "num_input_tokens_seen": 74671960, + "step": 3461, + "time_per_iteration": 2.556220769882202 + }, + { + "auxiliary_loss_clip": 0.01156754, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.05076814, + "balance_loss_mlp": 1.01860046, + "epoch": 0.41628088739253294, + "flos": 20010826650240.0, + "grad_norm": 2.0613190240429127, + "language_loss": 0.86791301, + "learning_rate": 2.628692395580151e-06, + "loss": 0.88975108, + "num_input_tokens_seen": 74692050, + "step": 3462, + "time_per_iteration": 2.5221316814422607 + }, + { + "auxiliary_loss_clip": 0.01099916, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.04486358, + "balance_loss_mlp": 1.02165115, + "epoch": 0.416401130283172, + "flos": 29168801377920.0, + "grad_norm": 1.839447696257545, + "language_loss": 0.79159367, + "learning_rate": 2.6279528635724747e-06, + "loss": 0.81289566, + "num_input_tokens_seen": 74712205, + "step": 3463, + "time_per_iteration": 2.6796391010284424 + }, + { + "auxiliary_loss_clip": 0.0117419, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.05193007, + "balance_loss_mlp": 1.01882648, + "epoch": 0.4165213731738111, + "flos": 16246813478400.0, + "grad_norm": 2.715601156266373, + "language_loss": 0.7848922, + "learning_rate": 2.627213236304848e-06, + "loss": 0.80691171, + "num_input_tokens_seen": 74729005, + "step": 3464, + "time_per_iteration": 2.4726943969726562 + }, + { + "auxiliary_loss_clip": 0.01177684, + "auxiliary_loss_mlp": 0.01027916, + "balance_loss_clip": 1.05385947, + "balance_loss_mlp": 1.0194881, + "epoch": 0.4166416160644502, + "flos": 33765438787200.0, + "grad_norm": 2.7449656616161553, + "language_loss": 0.70561326, + "learning_rate": 2.626473513889472e-06, + "loss": 0.7276693, + "num_input_tokens_seen": 74751385, + "step": 3465, + "time_per_iteration": 2.60478138923645 + }, + { + "auxiliary_loss_clip": 0.01167656, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.05309749, + "balance_loss_mlp": 1.02557349, + "epoch": 0.41676185895508927, + "flos": 20917498775040.0, + "grad_norm": 1.9001674434772267, + "language_loss": 0.82632542, + "learning_rate": 2.625733696438562e-06, + "loss": 0.84834307, + "num_input_tokens_seen": 74768890, + "step": 3466, + "time_per_iteration": 2.4906673431396484 + }, + { + "auxiliary_loss_clip": 0.01157938, + "auxiliary_loss_mlp": 0.01034228, + "balance_loss_clip": 1.0513196, + "balance_loss_mlp": 1.02578771, + "epoch": 0.4168821018457284, + "flos": 18406122549120.0, + "grad_norm": 2.1702257087218872, + "language_loss": 0.75085616, + "learning_rate": 2.6249937840643476e-06, + "loss": 0.77277786, + "num_input_tokens_seen": 74787195, + "step": 3467, + "time_per_iteration": 2.5407562255859375 + }, + { + "auxiliary_loss_clip": 0.01189682, + "auxiliary_loss_mlp": 0.00763093, + "balance_loss_clip": 1.05568016, + "balance_loss_mlp": 1.00104451, + "epoch": 0.41700234473636744, + "flos": 18698399516160.0, + "grad_norm": 1.8715868159004367, + "language_loss": 0.66421157, + "learning_rate": 2.6242537768790733e-06, + "loss": 0.6837393, + "num_input_tokens_seen": 74806350, + "step": 3468, + "time_per_iteration": 2.454354763031006 + }, + { + "auxiliary_loss_clip": 0.01176558, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.05520415, + "balance_loss_mlp": 1.02064252, + "epoch": 0.41712258762700655, + "flos": 31033283616000.0, + "grad_norm": 2.038563304210909, + "language_loss": 0.68517339, + "learning_rate": 2.6235136749949975e-06, + "loss": 0.70723265, + "num_input_tokens_seen": 74829800, + "step": 3469, + "time_per_iteration": 2.598973274230957 + }, + { + "auxiliary_loss_clip": 0.01187551, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.05353045, + "balance_loss_mlp": 1.02230096, + "epoch": 0.41724283051764566, + "flos": 35914763877120.0, + "grad_norm": 2.4419294828880487, + "language_loss": 0.61288643, + "learning_rate": 2.6227734785243924e-06, + "loss": 0.63507891, + "num_input_tokens_seen": 74849760, + "step": 3470, + "time_per_iteration": 2.5549545288085938 + }, + { + "auxiliary_loss_clip": 0.01111716, + "auxiliary_loss_mlp": 0.01028187, + "balance_loss_clip": 1.04584169, + "balance_loss_mlp": 1.01998568, + "epoch": 0.4173630734082847, + "flos": 25333649320320.0, + "grad_norm": 1.8840772629605942, + "language_loss": 0.79315943, + "learning_rate": 2.6220331875795466e-06, + "loss": 0.81455851, + "num_input_tokens_seen": 74869110, + "step": 3471, + "time_per_iteration": 2.666322708129883 + }, + { + "auxiliary_loss_clip": 0.01171999, + "auxiliary_loss_mlp": 0.01033987, + "balance_loss_clip": 1.05335581, + "balance_loss_mlp": 1.02521324, + "epoch": 0.4174833162989238, + "flos": 26685398868480.0, + "grad_norm": 1.9210752676351268, + "language_loss": 0.75105059, + "learning_rate": 2.62129280227276e-06, + "loss": 0.77311039, + "num_input_tokens_seen": 74889110, + "step": 3472, + "time_per_iteration": 2.5295073986053467 + }, + { + "auxiliary_loss_clip": 0.0118069, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.05491567, + "balance_loss_mlp": 1.02647305, + "epoch": 0.41760355918956293, + "flos": 74739584010240.0, + "grad_norm": 3.7829807651617147, + "language_loss": 0.68234503, + "learning_rate": 2.62055232271635e-06, + "loss": 0.70450693, + "num_input_tokens_seen": 74916260, + "step": 3473, + "time_per_iteration": 2.9050538539886475 + }, + { + "auxiliary_loss_clip": 0.01133526, + "auxiliary_loss_mlp": 0.01027503, + "balance_loss_clip": 1.04614854, + "balance_loss_mlp": 1.0192064, + "epoch": 0.417723802080202, + "flos": 14317513148160.0, + "grad_norm": 2.7645154966832517, + "language_loss": 0.87595415, + "learning_rate": 2.619811749022646e-06, + "loss": 0.89756447, + "num_input_tokens_seen": 74931570, + "step": 3474, + "time_per_iteration": 2.5031561851501465 + }, + { + "auxiliary_loss_clip": 0.01175842, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.05421603, + "balance_loss_mlp": 1.02250803, + "epoch": 0.4178440449708411, + "flos": 14643797316480.0, + "grad_norm": 4.8183255778805245, + "language_loss": 0.71306741, + "learning_rate": 2.6190710813039917e-06, + "loss": 0.73513848, + "num_input_tokens_seen": 74944695, + "step": 3475, + "time_per_iteration": 2.4628403186798096 + }, + { + "auxiliary_loss_clip": 0.01123567, + "auxiliary_loss_mlp": 0.00764121, + "balance_loss_clip": 1.04365325, + "balance_loss_mlp": 1.00102484, + "epoch": 0.4179642878614802, + "flos": 21507296094720.0, + "grad_norm": 2.8863694495539973, + "language_loss": 0.83734798, + "learning_rate": 2.618330319672747e-06, + "loss": 0.85622483, + "num_input_tokens_seen": 74964115, + "step": 3476, + "time_per_iteration": 2.6058709621429443 + }, + { + "auxiliary_loss_clip": 0.01192055, + "auxiliary_loss_mlp": 0.0102607, + "balance_loss_clip": 1.05630863, + "balance_loss_mlp": 1.0175885, + "epoch": 0.41808453075211927, + "flos": 18441997257600.0, + "grad_norm": 2.4610044931564534, + "language_loss": 0.92091203, + "learning_rate": 2.617589464241284e-06, + "loss": 0.94309318, + "num_input_tokens_seen": 74978515, + "step": 3477, + "time_per_iteration": 2.46171498298645 + }, + { + "auxiliary_loss_clip": 0.01150226, + "auxiliary_loss_mlp": 0.0102418, + "balance_loss_clip": 1.05105114, + "balance_loss_mlp": 1.01629376, + "epoch": 0.4182047736427584, + "flos": 20301020628480.0, + "grad_norm": 1.9483303480914416, + "language_loss": 0.74687004, + "learning_rate": 2.6168485151219914e-06, + "loss": 0.76861411, + "num_input_tokens_seen": 74998135, + "step": 3478, + "time_per_iteration": 2.551893472671509 + }, + { + "auxiliary_loss_clip": 0.01175574, + "auxiliary_loss_mlp": 0.01025942, + "balance_loss_clip": 1.05421329, + "balance_loss_mlp": 1.01767492, + "epoch": 0.4183250165333975, + "flos": 18876623823360.0, + "grad_norm": 2.299363239499081, + "language_loss": 0.71469885, + "learning_rate": 2.616107472427269e-06, + "loss": 0.73671401, + "num_input_tokens_seen": 75012830, + "step": 3479, + "time_per_iteration": 2.464343547821045 + }, + { + "auxiliary_loss_clip": 0.01179618, + "auxiliary_loss_mlp": 0.01023755, + "balance_loss_clip": 1.05265951, + "balance_loss_mlp": 1.01520145, + "epoch": 0.41844525942403654, + "flos": 17740050698880.0, + "grad_norm": 2.727757836844151, + "language_loss": 0.76294971, + "learning_rate": 2.615366336269533e-06, + "loss": 0.78498346, + "num_input_tokens_seen": 75026495, + "step": 3480, + "time_per_iteration": 2.4509074687957764 + }, + { + "auxiliary_loss_clip": 0.01191037, + "auxiliary_loss_mlp": 0.01034239, + "balance_loss_clip": 1.05402267, + "balance_loss_mlp": 1.02473807, + "epoch": 0.41856550231467565, + "flos": 18361377181440.0, + "grad_norm": 2.3598214760850484, + "language_loss": 0.80205107, + "learning_rate": 2.6146251067612126e-06, + "loss": 0.82430387, + "num_input_tokens_seen": 75041970, + "step": 3481, + "time_per_iteration": 2.435164451599121 + }, + { + "auxiliary_loss_clip": 0.01175409, + "auxiliary_loss_mlp": 0.01026528, + "balance_loss_clip": 1.05578566, + "balance_loss_mlp": 1.01809096, + "epoch": 0.41868574520531476, + "flos": 22781801445120.0, + "grad_norm": 1.598876626495541, + "language_loss": 0.82450414, + "learning_rate": 2.6138837840147525e-06, + "loss": 0.84652352, + "num_input_tokens_seen": 75061005, + "step": 3482, + "time_per_iteration": 2.4863033294677734 + }, + { + "auxiliary_loss_clip": 0.01142537, + "auxiliary_loss_mlp": 0.01023846, + "balance_loss_clip": 1.0487628, + "balance_loss_mlp": 1.01559711, + "epoch": 0.4188059880959538, + "flos": 13699167494400.0, + "grad_norm": 2.0579916483577603, + "language_loss": 0.75930727, + "learning_rate": 2.6131423681426103e-06, + "loss": 0.78097117, + "num_input_tokens_seen": 75076920, + "step": 3483, + "time_per_iteration": 2.53536057472229 + }, + { + "auxiliary_loss_clip": 0.01189831, + "auxiliary_loss_mlp": 0.01029078, + "balance_loss_clip": 1.05591559, + "balance_loss_mlp": 1.02130008, + "epoch": 0.41892623098659293, + "flos": 37818281220480.0, + "grad_norm": 2.266913966403171, + "language_loss": 0.72712296, + "learning_rate": 2.6124008592572587e-06, + "loss": 0.74931204, + "num_input_tokens_seen": 75100905, + "step": 3484, + "time_per_iteration": 3.4063210487365723 + }, + { + "auxiliary_loss_clip": 0.01193637, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.05554807, + "balance_loss_mlp": 1.01933658, + "epoch": 0.419046473877232, + "flos": 23258874908160.0, + "grad_norm": 2.2173286397977057, + "language_loss": 0.81458151, + "learning_rate": 2.6116592574711835e-06, + "loss": 0.83680248, + "num_input_tokens_seen": 75119205, + "step": 3485, + "time_per_iteration": 4.065755844116211 + }, + { + "auxiliary_loss_clip": 0.01194668, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.05684423, + "balance_loss_mlp": 1.02651632, + "epoch": 0.4191667167678711, + "flos": 20741034234240.0, + "grad_norm": 1.8827815591875772, + "language_loss": 0.84146267, + "learning_rate": 2.6109175628968853e-06, + "loss": 0.86376441, + "num_input_tokens_seen": 75138970, + "step": 3486, + "time_per_iteration": 2.4712445735931396 + }, + { + "auxiliary_loss_clip": 0.01166116, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.05160439, + "balance_loss_mlp": 1.02109838, + "epoch": 0.4192869596585102, + "flos": 23586416052480.0, + "grad_norm": 1.9851027735929678, + "language_loss": 0.82739055, + "learning_rate": 2.610175775646878e-06, + "loss": 0.84934223, + "num_input_tokens_seen": 75157550, + "step": 3487, + "time_per_iteration": 2.520092725753784 + }, + { + "auxiliary_loss_clip": 0.01156899, + "auxiliary_loss_mlp": 0.01027873, + "balance_loss_clip": 1.04882836, + "balance_loss_mlp": 1.01938581, + "epoch": 0.41940720254914926, + "flos": 25081269384960.0, + "grad_norm": 2.4490729230421486, + "language_loss": 0.72809613, + "learning_rate": 2.6094338958336907e-06, + "loss": 0.74994385, + "num_input_tokens_seen": 75176220, + "step": 3488, + "time_per_iteration": 2.5395095348358154 + }, + { + "auxiliary_loss_clip": 0.01161786, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.05337429, + "balance_loss_mlp": 1.02052879, + "epoch": 0.41952744543978837, + "flos": 15554132628480.0, + "grad_norm": 1.8951299767257388, + "language_loss": 0.82117403, + "learning_rate": 2.608691923569867e-06, + "loss": 0.84308213, + "num_input_tokens_seen": 75193095, + "step": 3489, + "time_per_iteration": 2.4990618228912354 + }, + { + "auxiliary_loss_clip": 0.01177873, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.05430293, + "balance_loss_mlp": 1.02410841, + "epoch": 0.4196476883304275, + "flos": 24644775312000.0, + "grad_norm": 1.6147117197637222, + "language_loss": 0.75685179, + "learning_rate": 2.6079498589679616e-06, + "loss": 0.77895778, + "num_input_tokens_seen": 75214185, + "step": 3490, + "time_per_iteration": 2.5129334926605225 + }, + { + "auxiliary_loss_clip": 0.01110892, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.04302645, + "balance_loss_mlp": 1.0240854, + "epoch": 0.41976793122106654, + "flos": 24531333183360.0, + "grad_norm": 1.7830113594350796, + "language_loss": 0.75915766, + "learning_rate": 2.6072077021405465e-06, + "loss": 0.78060579, + "num_input_tokens_seen": 75233020, + "step": 3491, + "time_per_iteration": 2.614131450653076 + }, + { + "auxiliary_loss_clip": 0.01156719, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.05162644, + "balance_loss_mlp": 1.02335596, + "epoch": 0.41988817411170565, + "flos": 21175301664000.0, + "grad_norm": 2.1163672858063567, + "language_loss": 0.69011343, + "learning_rate": 2.6064654532002054e-06, + "loss": 0.71199489, + "num_input_tokens_seen": 75252030, + "step": 3492, + "time_per_iteration": 2.5539956092834473 + }, + { + "auxiliary_loss_clip": 0.0119167, + "auxiliary_loss_mlp": 0.01032436, + "balance_loss_clip": 1.0567143, + "balance_loss_mlp": 1.02382922, + "epoch": 0.42000841700234476, + "flos": 31649402626560.0, + "grad_norm": 1.4450134985771452, + "language_loss": 0.75386608, + "learning_rate": 2.6057231122595375e-06, + "loss": 0.77610713, + "num_input_tokens_seen": 75273340, + "step": 3493, + "time_per_iteration": 2.5253312587738037 + }, + { + "auxiliary_loss_clip": 0.01159667, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.04780209, + "balance_loss_mlp": 1.02245545, + "epoch": 0.4201286598929838, + "flos": 21281525159040.0, + "grad_norm": 1.7089358698449764, + "language_loss": 0.72760445, + "learning_rate": 2.604980679431154e-06, + "loss": 0.7495178, + "num_input_tokens_seen": 75291580, + "step": 3494, + "time_per_iteration": 2.519972801208496 + }, + { + "auxiliary_loss_clip": 0.01177382, + "auxiliary_loss_mlp": 0.01027077, + "balance_loss_clip": 1.0515542, + "balance_loss_mlp": 1.01850557, + "epoch": 0.4202489027836229, + "flos": 18546532813440.0, + "grad_norm": 2.0336143283287176, + "language_loss": 0.74710155, + "learning_rate": 2.604238154827684e-06, + "loss": 0.7691462, + "num_input_tokens_seen": 75308205, + "step": 3495, + "time_per_iteration": 2.4846532344818115 + }, + { + "auxiliary_loss_clip": 0.01176325, + "auxiliary_loss_mlp": 0.01024711, + "balance_loss_clip": 1.05256784, + "balance_loss_mlp": 1.01654553, + "epoch": 0.42036914567426203, + "flos": 19317643009920.0, + "grad_norm": 2.0980661855429314, + "language_loss": 0.72188562, + "learning_rate": 2.6034955385617656e-06, + "loss": 0.74389601, + "num_input_tokens_seen": 75326535, + "step": 3496, + "time_per_iteration": 2.471580743789673 + }, + { + "auxiliary_loss_clip": 0.01054153, + "auxiliary_loss_mlp": 0.01004044, + "balance_loss_clip": 1.01591682, + "balance_loss_mlp": 1.00257778, + "epoch": 0.4204893885649011, + "flos": 67842942935040.0, + "grad_norm": 0.7333953532477733, + "language_loss": 0.61687207, + "learning_rate": 2.6027528307460544e-06, + "loss": 0.63745403, + "num_input_tokens_seen": 75390540, + "step": 3497, + "time_per_iteration": 3.199315309524536 + }, + { + "auxiliary_loss_clip": 0.01191256, + "auxiliary_loss_mlp": 0.01025887, + "balance_loss_clip": 1.05574179, + "balance_loss_mlp": 1.01796269, + "epoch": 0.4206096314555402, + "flos": 21908777385600.0, + "grad_norm": 2.12285768972535, + "language_loss": 0.8647393, + "learning_rate": 2.602010031493217e-06, + "loss": 0.88691068, + "num_input_tokens_seen": 75408770, + "step": 3498, + "time_per_iteration": 2.4900784492492676 + }, + { + "auxiliary_loss_clip": 0.0114328, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.05007291, + "balance_loss_mlp": 1.02215338, + "epoch": 0.42072987434617926, + "flos": 29278185269760.0, + "grad_norm": 2.291255916031736, + "language_loss": 0.86735177, + "learning_rate": 2.6012671409159367e-06, + "loss": 0.88909125, + "num_input_tokens_seen": 75430105, + "step": 3499, + "time_per_iteration": 2.6590232849121094 + }, + { + "auxiliary_loss_clip": 0.01154102, + "auxiliary_loss_mlp": 0.01033193, + "balance_loss_clip": 1.05083954, + "balance_loss_mlp": 1.02416277, + "epoch": 0.42085011723681837, + "flos": 27600726170880.0, + "grad_norm": 1.7181104998745889, + "language_loss": 0.81492782, + "learning_rate": 2.6005241591269097e-06, + "loss": 0.83680081, + "num_input_tokens_seen": 75449475, + "step": 3500, + "time_per_iteration": 2.575469732284546 + }, + { + "auxiliary_loss_clip": 0.01142654, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.05152833, + "balance_loss_mlp": 1.02288532, + "epoch": 0.4209703601274575, + "flos": 27818632028160.0, + "grad_norm": 1.6887969528336675, + "language_loss": 0.79595047, + "learning_rate": 2.5997810862388454e-06, + "loss": 0.81769049, + "num_input_tokens_seen": 75469315, + "step": 3501, + "time_per_iteration": 2.600532293319702 + }, + { + "auxiliary_loss_clip": 0.01161014, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.04907227, + "balance_loss_mlp": 1.02103353, + "epoch": 0.42109060301809653, + "flos": 27525529048320.0, + "grad_norm": 2.046016184157551, + "language_loss": 0.7552889, + "learning_rate": 2.599037922364467e-06, + "loss": 0.7771942, + "num_input_tokens_seen": 75488215, + "step": 3502, + "time_per_iteration": 2.5647120475769043 + }, + { + "auxiliary_loss_clip": 0.01143284, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.05208302, + "balance_loss_mlp": 1.01845479, + "epoch": 0.42121084590873564, + "flos": 29314275459840.0, + "grad_norm": 2.3460773174421696, + "language_loss": 0.75430435, + "learning_rate": 2.5982946676165112e-06, + "loss": 0.77600533, + "num_input_tokens_seen": 75507985, + "step": 3503, + "time_per_iteration": 2.5956649780273438 + }, + { + "auxiliary_loss_clip": 0.01052277, + "auxiliary_loss_mlp": 0.01012446, + "balance_loss_clip": 1.02293074, + "balance_loss_mlp": 1.0110091, + "epoch": 0.42133108879937475, + "flos": 67398835178880.0, + "grad_norm": 0.7310233881376569, + "language_loss": 0.57607746, + "learning_rate": 2.5975513221077313e-06, + "loss": 0.59672463, + "num_input_tokens_seen": 75571955, + "step": 3504, + "time_per_iteration": 3.2074835300445557 + }, + { + "auxiliary_loss_clip": 0.01150946, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.04931688, + "balance_loss_mlp": 1.02561343, + "epoch": 0.4214513316900138, + "flos": 23106038538240.0, + "grad_norm": 2.481812360177097, + "language_loss": 0.8865, + "learning_rate": 2.5968078859508897e-06, + "loss": 0.90835398, + "num_input_tokens_seen": 75589155, + "step": 3505, + "time_per_iteration": 2.505711317062378 + }, + { + "auxiliary_loss_clip": 0.0117435, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.05252457, + "balance_loss_mlp": 1.02300525, + "epoch": 0.4215715745806529, + "flos": 15336190857600.0, + "grad_norm": 1.953511522354087, + "language_loss": 0.79631031, + "learning_rate": 2.5960643592587673e-06, + "loss": 0.81836945, + "num_input_tokens_seen": 75606565, + "step": 3506, + "time_per_iteration": 2.448652505874634 + }, + { + "auxiliary_loss_clip": 0.01146847, + "auxiliary_loss_mlp": 0.01025325, + "balance_loss_clip": 1.0494771, + "balance_loss_mlp": 1.01715279, + "epoch": 0.42169181747129203, + "flos": 22127257860480.0, + "grad_norm": 2.1833551887855838, + "language_loss": 0.81234121, + "learning_rate": 2.5953207421441553e-06, + "loss": 0.83406293, + "num_input_tokens_seen": 75625165, + "step": 3507, + "time_per_iteration": 2.5624711513519287 + }, + { + "auxiliary_loss_clip": 0.01149346, + "auxiliary_loss_mlp": 0.01035414, + "balance_loss_clip": 1.05182385, + "balance_loss_mlp": 1.02715909, + "epoch": 0.4218120603619311, + "flos": 22630724841600.0, + "grad_norm": 3.849736724148446, + "language_loss": 0.74571514, + "learning_rate": 2.5945770347198603e-06, + "loss": 0.76756275, + "num_input_tokens_seen": 75643320, + "step": 3508, + "time_per_iteration": 2.5645337104797363 + }, + { + "auxiliary_loss_clip": 0.01156803, + "auxiliary_loss_mlp": 0.01024123, + "balance_loss_clip": 1.04918838, + "balance_loss_mlp": 1.01634455, + "epoch": 0.4219323032525702, + "flos": 19682818629120.0, + "grad_norm": 1.98499449679653, + "language_loss": 0.81629097, + "learning_rate": 2.593833237098701e-06, + "loss": 0.83810025, + "num_input_tokens_seen": 75660920, + "step": 3509, + "time_per_iteration": 2.5085511207580566 + }, + { + "auxiliary_loss_clip": 0.0117187, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.04954505, + "balance_loss_mlp": 1.02109146, + "epoch": 0.4220525461432093, + "flos": 30190747224960.0, + "grad_norm": 3.751532826431559, + "language_loss": 0.62619823, + "learning_rate": 2.593089349393512e-06, + "loss": 0.64821649, + "num_input_tokens_seen": 75681410, + "step": 3510, + "time_per_iteration": 3.288443088531494 + }, + { + "auxiliary_loss_clip": 0.01173089, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.05476713, + "balance_loss_mlp": 1.01707137, + "epoch": 0.42217278903384836, + "flos": 24315941278080.0, + "grad_norm": 1.9909224260930234, + "language_loss": 0.83453238, + "learning_rate": 2.592345371717141e-06, + "loss": 0.85651726, + "num_input_tokens_seen": 75700940, + "step": 3511, + "time_per_iteration": 3.3189971446990967 + }, + { + "auxiliary_loss_clip": 0.01175861, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.05731058, + "balance_loss_mlp": 1.02242088, + "epoch": 0.42229303192448747, + "flos": 17092474352640.0, + "grad_norm": 2.1103998523997527, + "language_loss": 0.71999228, + "learning_rate": 2.591601304182448e-06, + "loss": 0.7420609, + "num_input_tokens_seen": 75718910, + "step": 3512, + "time_per_iteration": 4.003085613250732 + }, + { + "auxiliary_loss_clip": 0.01162635, + "auxiliary_loss_mlp": 0.0102833, + "balance_loss_clip": 1.05576611, + "balance_loss_mlp": 1.02047431, + "epoch": 0.4224132748151266, + "flos": 22784530878720.0, + "grad_norm": 2.0464092764950115, + "language_loss": 0.79513341, + "learning_rate": 2.5908571469023067e-06, + "loss": 0.81704313, + "num_input_tokens_seen": 75738395, + "step": 3513, + "time_per_iteration": 2.5183985233306885 + }, + { + "auxiliary_loss_clip": 0.01189004, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.05447102, + "balance_loss_mlp": 1.02236533, + "epoch": 0.42253351770576564, + "flos": 17819090576640.0, + "grad_norm": 2.273941266220984, + "language_loss": 0.75347197, + "learning_rate": 2.5901128999896067e-06, + "loss": 0.77566671, + "num_input_tokens_seen": 75753825, + "step": 3514, + "time_per_iteration": 2.4072694778442383 + }, + { + "auxiliary_loss_clip": 0.01173109, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.05427957, + "balance_loss_mlp": 1.02204597, + "epoch": 0.42265376059640475, + "flos": 28512390286080.0, + "grad_norm": 4.542278998507587, + "language_loss": 0.68032622, + "learning_rate": 2.5893685635572487e-06, + "loss": 0.70235932, + "num_input_tokens_seen": 75774675, + "step": 3515, + "time_per_iteration": 2.533892869949341 + }, + { + "auxiliary_loss_clip": 0.01159244, + "auxiliary_loss_mlp": 0.01027857, + "balance_loss_clip": 1.05318165, + "balance_loss_mlp": 1.01909542, + "epoch": 0.4227740034870438, + "flos": 16253349753600.0, + "grad_norm": 2.055861926004233, + "language_loss": 0.69263786, + "learning_rate": 2.5886241377181483e-06, + "loss": 0.71450889, + "num_input_tokens_seen": 75793545, + "step": 3516, + "time_per_iteration": 2.4837658405303955 + }, + { + "auxiliary_loss_clip": 0.01179812, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.05507624, + "balance_loss_mlp": 1.01967955, + "epoch": 0.4228942463776829, + "flos": 25295691623040.0, + "grad_norm": 1.6658863291425987, + "language_loss": 0.811966, + "learning_rate": 2.587879622585234e-06, + "loss": 0.83405149, + "num_input_tokens_seen": 75812145, + "step": 3517, + "time_per_iteration": 2.5101120471954346 + }, + { + "auxiliary_loss_clip": 0.01174665, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.05477643, + "balance_loss_mlp": 1.02627182, + "epoch": 0.423014489268322, + "flos": 26395779507840.0, + "grad_norm": 1.9197067490702595, + "language_loss": 0.76246238, + "learning_rate": 2.5871350182714486e-06, + "loss": 0.78455257, + "num_input_tokens_seen": 75833025, + "step": 3518, + "time_per_iteration": 2.543936014175415 + }, + { + "auxiliary_loss_clip": 0.01187116, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.05428982, + "balance_loss_mlp": 1.02091694, + "epoch": 0.4231347321589611, + "flos": 17274002711040.0, + "grad_norm": 2.018556620146434, + "language_loss": 0.8042022, + "learning_rate": 2.586390324889748e-06, + "loss": 0.82636124, + "num_input_tokens_seen": 75848925, + "step": 3519, + "time_per_iteration": 2.416271686553955 + }, + { + "auxiliary_loss_clip": 0.01174204, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.05541825, + "balance_loss_mlp": 1.02213788, + "epoch": 0.4232549750496002, + "flos": 22999635475200.0, + "grad_norm": 2.0782301853949696, + "language_loss": 0.67419446, + "learning_rate": 2.5856455425531003e-06, + "loss": 0.69623774, + "num_input_tokens_seen": 75870400, + "step": 3520, + "time_per_iteration": 2.526453971862793 + }, + { + "auxiliary_loss_clip": 0.01176119, + "auxiliary_loss_mlp": 0.01023267, + "balance_loss_clip": 1.05637157, + "balance_loss_mlp": 1.01513672, + "epoch": 0.4233752179402393, + "flos": 21248343970560.0, + "grad_norm": 1.7816349043083086, + "language_loss": 0.80859518, + "learning_rate": 2.5849006713744902e-06, + "loss": 0.83058906, + "num_input_tokens_seen": 75889195, + "step": 3521, + "time_per_iteration": 2.4756033420562744 + }, + { + "auxiliary_loss_clip": 0.01158112, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.05154943, + "balance_loss_mlp": 1.01877439, + "epoch": 0.42349546083087836, + "flos": 20704297599360.0, + "grad_norm": 2.218890376939721, + "language_loss": 0.72718036, + "learning_rate": 2.5841557114669135e-06, + "loss": 0.74903351, + "num_input_tokens_seen": 75906055, + "step": 3522, + "time_per_iteration": 2.506930112838745 + }, + { + "auxiliary_loss_clip": 0.01190865, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.05310369, + "balance_loss_mlp": 1.0208447, + "epoch": 0.42361570372151747, + "flos": 18585065128320.0, + "grad_norm": 2.5921318421451516, + "language_loss": 0.67329425, + "learning_rate": 2.58341066294338e-06, + "loss": 0.6955018, + "num_input_tokens_seen": 75922720, + "step": 3523, + "time_per_iteration": 2.4292924404144287 + }, + { + "auxiliary_loss_clip": 0.01137617, + "auxiliary_loss_mlp": 0.00763498, + "balance_loss_clip": 1.04998827, + "balance_loss_mlp": 1.00096464, + "epoch": 0.4237359466121566, + "flos": 20959478795520.0, + "grad_norm": 2.446936630389301, + "language_loss": 0.84996665, + "learning_rate": 2.5826655259169124e-06, + "loss": 0.86897779, + "num_input_tokens_seen": 75941375, + "step": 3524, + "time_per_iteration": 2.6585588455200195 + }, + { + "auxiliary_loss_clip": 0.01191313, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.05708718, + "balance_loss_mlp": 1.02401209, + "epoch": 0.42385618950279563, + "flos": 18038181582720.0, + "grad_norm": 1.8160008095532152, + "language_loss": 0.90343839, + "learning_rate": 2.5819203005005475e-06, + "loss": 0.92567742, + "num_input_tokens_seen": 75958710, + "step": 3525, + "time_per_iteration": 2.4367117881774902 + }, + { + "auxiliary_loss_clip": 0.0115578, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.05326915, + "balance_loss_mlp": 1.02375197, + "epoch": 0.42397643239343474, + "flos": 23769129559680.0, + "grad_norm": 1.5441465003315147, + "language_loss": 0.78568864, + "learning_rate": 2.581174986807336e-06, + "loss": 0.80756474, + "num_input_tokens_seen": 75978945, + "step": 3526, + "time_per_iteration": 2.562203884124756 + }, + { + "auxiliary_loss_clip": 0.0116726, + "auxiliary_loss_mlp": 0.00763172, + "balance_loss_clip": 1.05318248, + "balance_loss_mlp": 1.00088191, + "epoch": 0.42409667528407385, + "flos": 16545088016640.0, + "grad_norm": 2.239414521223409, + "language_loss": 0.90856451, + "learning_rate": 2.580429584950341e-06, + "loss": 0.92786884, + "num_input_tokens_seen": 75994695, + "step": 3527, + "time_per_iteration": 2.464878559112549 + }, + { + "auxiliary_loss_clip": 0.01152516, + "auxiliary_loss_mlp": 0.01025346, + "balance_loss_clip": 1.05227792, + "balance_loss_mlp": 1.01614869, + "epoch": 0.4242169181747129, + "flos": 16034186920320.0, + "grad_norm": 2.043706137452233, + "language_loss": 0.65804565, + "learning_rate": 2.5796840950426397e-06, + "loss": 0.67982423, + "num_input_tokens_seen": 76011780, + "step": 3528, + "time_per_iteration": 2.545253038406372 + }, + { + "auxiliary_loss_clip": 0.01165652, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.05196035, + "balance_loss_mlp": 1.02057171, + "epoch": 0.424337161065352, + "flos": 20084012611200.0, + "grad_norm": 1.6796992742549495, + "language_loss": 0.66212428, + "learning_rate": 2.578938517197322e-06, + "loss": 0.68406516, + "num_input_tokens_seen": 76029875, + "step": 3529, + "time_per_iteration": 2.479879140853882 + }, + { + "auxiliary_loss_clip": 0.01151123, + "auxiliary_loss_mlp": 0.01030508, + "balance_loss_clip": 1.05118203, + "balance_loss_mlp": 1.02207363, + "epoch": 0.4244574039559911, + "flos": 23878369797120.0, + "grad_norm": 2.333138365302593, + "language_loss": 0.62785226, + "learning_rate": 2.5781928515274916e-06, + "loss": 0.64966857, + "num_input_tokens_seen": 76048595, + "step": 3530, + "time_per_iteration": 2.5389153957366943 + }, + { + "auxiliary_loss_clip": 0.01179545, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.0565778, + "balance_loss_mlp": 1.02170908, + "epoch": 0.4245776468466302, + "flos": 17565920542080.0, + "grad_norm": 1.8436793409413998, + "language_loss": 0.67584121, + "learning_rate": 2.577447098146265e-06, + "loss": 0.69793391, + "num_input_tokens_seen": 76065770, + "step": 3531, + "time_per_iteration": 2.4602129459381104 + }, + { + "auxiliary_loss_clip": 0.01148056, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.0519042, + "balance_loss_mlp": 1.025738, + "epoch": 0.4246978897372693, + "flos": 27776256958080.0, + "grad_norm": 2.9936698923247627, + "language_loss": 0.79057491, + "learning_rate": 2.5767012571667724e-06, + "loss": 0.81239659, + "num_input_tokens_seen": 76085250, + "step": 3532, + "time_per_iteration": 2.601968288421631 + }, + { + "auxiliary_loss_clip": 0.01175602, + "auxiliary_loss_mlp": 0.01027039, + "balance_loss_clip": 1.05193365, + "balance_loss_mlp": 1.01775861, + "epoch": 0.42481813262790835, + "flos": 15596615439360.0, + "grad_norm": 1.6924606226362067, + "language_loss": 0.679928, + "learning_rate": 2.5759553287021587e-06, + "loss": 0.70195442, + "num_input_tokens_seen": 76103580, + "step": 3533, + "time_per_iteration": 2.460476875305176 + }, + { + "auxiliary_loss_clip": 0.01161065, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.05514598, + "balance_loss_mlp": 1.019503, + "epoch": 0.42493837551854746, + "flos": 23951088881280.0, + "grad_norm": 1.7849189462973698, + "language_loss": 0.77626801, + "learning_rate": 2.5752093128655786e-06, + "loss": 0.7981658, + "num_input_tokens_seen": 76121825, + "step": 3534, + "time_per_iteration": 2.598724842071533 + }, + { + "auxiliary_loss_clip": 0.01153934, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.05036569, + "balance_loss_mlp": 1.02086961, + "epoch": 0.4250586184091866, + "flos": 20813466009600.0, + "grad_norm": 1.806640764347127, + "language_loss": 0.73998928, + "learning_rate": 2.574463209770204e-06, + "loss": 0.7618264, + "num_input_tokens_seen": 76141140, + "step": 3535, + "time_per_iteration": 2.506723165512085 + }, + { + "auxiliary_loss_clip": 0.01143821, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.04928553, + "balance_loss_mlp": 1.02236891, + "epoch": 0.42517886129982563, + "flos": 30371018607360.0, + "grad_norm": 1.6364828685314328, + "language_loss": 0.79331428, + "learning_rate": 2.5737170195292165e-06, + "loss": 0.81506419, + "num_input_tokens_seen": 76164475, + "step": 3536, + "time_per_iteration": 2.6222691535949707 + }, + { + "auxiliary_loss_clip": 0.01146593, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.04977441, + "balance_loss_mlp": 1.01899838, + "epoch": 0.42529910419046474, + "flos": 20080636732800.0, + "grad_norm": 2.6320260934259445, + "language_loss": 0.78109586, + "learning_rate": 2.572970742255814e-06, + "loss": 0.80284202, + "num_input_tokens_seen": 76182965, + "step": 3537, + "time_per_iteration": 3.2968976497650146 + }, + { + "auxiliary_loss_clip": 0.01175508, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.05660224, + "balance_loss_mlp": 1.02299583, + "epoch": 0.42541934708110385, + "flos": 22632448694400.0, + "grad_norm": 1.7159919769069822, + "language_loss": 0.81548345, + "learning_rate": 2.5722243780632046e-06, + "loss": 0.83754969, + "num_input_tokens_seen": 76201230, + "step": 3538, + "time_per_iteration": 4.093797445297241 + }, + { + "auxiliary_loss_clip": 0.01046086, + "auxiliary_loss_mlp": 0.01009579, + "balance_loss_clip": 1.01934218, + "balance_loss_mlp": 1.00824952, + "epoch": 0.4255395899717429, + "flos": 66200676186240.0, + "grad_norm": 0.7908389020356351, + "language_loss": 0.60495532, + "learning_rate": 2.5714779270646125e-06, + "loss": 0.625512, + "num_input_tokens_seen": 76262000, + "step": 3539, + "time_per_iteration": 3.812236785888672 + }, + { + "auxiliary_loss_clip": 0.0116563, + "auxiliary_loss_mlp": 0.00763209, + "balance_loss_clip": 1.05451441, + "balance_loss_mlp": 1.00076604, + "epoch": 0.425659832862382, + "flos": 17931814433280.0, + "grad_norm": 2.7506810506086445, + "language_loss": 0.77832073, + "learning_rate": 2.5707313893732735e-06, + "loss": 0.79760915, + "num_input_tokens_seen": 76280540, + "step": 3540, + "time_per_iteration": 2.508814573287964 + }, + { + "auxiliary_loss_clip": 0.01095815, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.04161859, + "balance_loss_mlp": 1.02249038, + "epoch": 0.4257800757530211, + "flos": 24022550989440.0, + "grad_norm": 1.7240976511884503, + "language_loss": 0.77135223, + "learning_rate": 2.5699847651024364e-06, + "loss": 0.79262233, + "num_input_tokens_seen": 76301180, + "step": 3541, + "time_per_iteration": 2.8240203857421875 + }, + { + "auxiliary_loss_clip": 0.01173899, + "auxiliary_loss_mlp": 0.01028549, + "balance_loss_clip": 1.05604601, + "balance_loss_mlp": 1.02028167, + "epoch": 0.4259003186436602, + "flos": 23696015425920.0, + "grad_norm": 2.3392060534688524, + "language_loss": 0.77000767, + "learning_rate": 2.5692380543653627e-06, + "loss": 0.79203212, + "num_input_tokens_seen": 76319335, + "step": 3542, + "time_per_iteration": 2.89711856842041 + }, + { + "auxiliary_loss_clip": 0.01179223, + "auxiliary_loss_mlp": 0.00763428, + "balance_loss_clip": 1.05503583, + "balance_loss_mlp": 1.00086522, + "epoch": 0.4260205615342993, + "flos": 15259772672640.0, + "grad_norm": 1.9039222131022138, + "language_loss": 0.69902748, + "learning_rate": 2.5684912572753293e-06, + "loss": 0.718454, + "num_input_tokens_seen": 76335010, + "step": 3543, + "time_per_iteration": 2.50866961479187 + }, + { + "auxiliary_loss_clip": 0.01186568, + "auxiliary_loss_mlp": 0.01024491, + "balance_loss_clip": 1.05500758, + "balance_loss_mlp": 1.01618814, + "epoch": 0.4261408044249384, + "flos": 30665306736000.0, + "grad_norm": 1.6956378590732561, + "language_loss": 0.83982515, + "learning_rate": 2.5677443739456245e-06, + "loss": 0.86193579, + "num_input_tokens_seen": 76356670, + "step": 3544, + "time_per_iteration": 2.551555633544922 + }, + { + "auxiliary_loss_clip": 0.01162175, + "auxiliary_loss_mlp": 0.01025805, + "balance_loss_clip": 1.05476117, + "balance_loss_mlp": 1.01735318, + "epoch": 0.42626104731557746, + "flos": 23257905240960.0, + "grad_norm": 2.5758888004786167, + "language_loss": 0.79791993, + "learning_rate": 2.5669974044895495e-06, + "loss": 0.81979972, + "num_input_tokens_seen": 76373065, + "step": 3545, + "time_per_iteration": 2.519752264022827 + }, + { + "auxiliary_loss_clip": 0.01154224, + "auxiliary_loss_mlp": 0.01029088, + "balance_loss_clip": 1.05045414, + "balance_loss_mlp": 1.02025437, + "epoch": 0.42638129020621657, + "flos": 25884770670720.0, + "grad_norm": 1.708559155724473, + "language_loss": 0.79179156, + "learning_rate": 2.5662503490204187e-06, + "loss": 0.81362468, + "num_input_tokens_seen": 76393230, + "step": 3546, + "time_per_iteration": 2.599933624267578 + }, + { + "auxiliary_loss_clip": 0.01157036, + "auxiliary_loss_mlp": 0.01025324, + "balance_loss_clip": 1.04910755, + "balance_loss_mlp": 1.0170269, + "epoch": 0.4265015330968556, + "flos": 26502362138880.0, + "grad_norm": 3.9837957416182843, + "language_loss": 0.76178068, + "learning_rate": 2.5655032076515603e-06, + "loss": 0.78360426, + "num_input_tokens_seen": 76412555, + "step": 3547, + "time_per_iteration": 2.5743513107299805 + }, + { + "auxiliary_loss_clip": 0.01161605, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.05158854, + "balance_loss_mlp": 1.02013803, + "epoch": 0.42662177598749473, + "flos": 24389522288640.0, + "grad_norm": 1.9460039034635452, + "language_loss": 0.81945848, + "learning_rate": 2.5647559804963155e-06, + "loss": 0.84136009, + "num_input_tokens_seen": 76432485, + "step": 3548, + "time_per_iteration": 2.5392303466796875 + }, + { + "auxiliary_loss_clip": 0.01141008, + "auxiliary_loss_mlp": 0.01034287, + "balance_loss_clip": 1.05158806, + "balance_loss_mlp": 1.02612114, + "epoch": 0.42674201887813384, + "flos": 23148629089920.0, + "grad_norm": 2.166845106776401, + "language_loss": 0.78636414, + "learning_rate": 2.5640086676680364e-06, + "loss": 0.80811709, + "num_input_tokens_seen": 76453980, + "step": 3549, + "time_per_iteration": 2.601253032684326 + }, + { + "auxiliary_loss_clip": 0.01178474, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.05597687, + "balance_loss_mlp": 1.02091599, + "epoch": 0.4268622617687729, + "flos": 21689614552320.0, + "grad_norm": 2.021476734226666, + "language_loss": 0.81005144, + "learning_rate": 2.5632612692800923e-06, + "loss": 0.83213603, + "num_input_tokens_seen": 76473045, + "step": 3550, + "time_per_iteration": 2.502255439758301 + }, + { + "auxiliary_loss_clip": 0.01147838, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.05058324, + "balance_loss_mlp": 1.02731681, + "epoch": 0.426982504659412, + "flos": 23440151871360.0, + "grad_norm": 2.2492731144866087, + "language_loss": 0.75448591, + "learning_rate": 2.5625137854458603e-06, + "loss": 0.77633202, + "num_input_tokens_seen": 76492060, + "step": 3551, + "time_per_iteration": 2.5537657737731934 + }, + { + "auxiliary_loss_clip": 0.011641, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.05386424, + "balance_loss_mlp": 1.02236414, + "epoch": 0.4271027475500511, + "flos": 18916556768640.0, + "grad_norm": 1.8952889413386307, + "language_loss": 0.80163133, + "learning_rate": 2.561766216278735e-06, + "loss": 0.82357705, + "num_input_tokens_seen": 76509655, + "step": 3552, + "time_per_iteration": 2.5150306224823 + }, + { + "auxiliary_loss_clip": 0.01132104, + "auxiliary_loss_mlp": 0.01031452, + "balance_loss_clip": 1.05022907, + "balance_loss_mlp": 1.02251697, + "epoch": 0.4272229904406902, + "flos": 26870554500480.0, + "grad_norm": 1.993301261180141, + "language_loss": 0.81249309, + "learning_rate": 2.561018561892121e-06, + "loss": 0.83412862, + "num_input_tokens_seen": 76528795, + "step": 3553, + "time_per_iteration": 2.643104314804077 + }, + { + "auxiliary_loss_clip": 0.01157248, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.04910541, + "balance_loss_mlp": 1.0236845, + "epoch": 0.4273432333313293, + "flos": 23951376190080.0, + "grad_norm": 2.003270104340414, + "language_loss": 0.76630217, + "learning_rate": 2.5602708223994363e-06, + "loss": 0.78819269, + "num_input_tokens_seen": 76550660, + "step": 3554, + "time_per_iteration": 2.5413594245910645 + }, + { + "auxiliary_loss_clip": 0.01145909, + "auxiliary_loss_mlp": 0.01028081, + "balance_loss_clip": 1.0463531, + "balance_loss_mlp": 1.01993275, + "epoch": 0.4274634762219684, + "flos": 29570354496000.0, + "grad_norm": 2.680941506577205, + "language_loss": 0.67009354, + "learning_rate": 2.559522997914115e-06, + "loss": 0.69183344, + "num_input_tokens_seen": 76570240, + "step": 3555, + "time_per_iteration": 2.60569167137146 + }, + { + "auxiliary_loss_clip": 0.01187839, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.05702758, + "balance_loss_mlp": 1.02339315, + "epoch": 0.42758371911260745, + "flos": 21434146047360.0, + "grad_norm": 2.1613410191122164, + "language_loss": 0.84247726, + "learning_rate": 2.558775088549599e-06, + "loss": 0.86466974, + "num_input_tokens_seen": 76589820, + "step": 3556, + "time_per_iteration": 2.464864492416382 + }, + { + "auxiliary_loss_clip": 0.01180378, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.05405354, + "balance_loss_mlp": 1.02162886, + "epoch": 0.42770396200324656, + "flos": 14752822072320.0, + "grad_norm": 2.527884912499418, + "language_loss": 0.66423124, + "learning_rate": 2.5580270944193467e-06, + "loss": 0.6863364, + "num_input_tokens_seen": 76606640, + "step": 3557, + "time_per_iteration": 2.442399024963379 + }, + { + "auxiliary_loss_clip": 0.01086301, + "auxiliary_loss_mlp": 0.01001949, + "balance_loss_clip": 1.0195719, + "balance_loss_mlp": 1.00058413, + "epoch": 0.4278242048938857, + "flos": 70654712601600.0, + "grad_norm": 0.7501552567736547, + "language_loss": 0.55486923, + "learning_rate": 2.557279015636827e-06, + "loss": 0.57575172, + "num_input_tokens_seen": 76667050, + "step": 3558, + "time_per_iteration": 3.025799512863159 + }, + { + "auxiliary_loss_clip": 0.01070983, + "auxiliary_loss_mlp": 0.01001544, + "balance_loss_clip": 1.01891494, + "balance_loss_mlp": 1.00025642, + "epoch": 0.42794444778452473, + "flos": 69366165033600.0, + "grad_norm": 0.7651693105828318, + "language_loss": 0.61282784, + "learning_rate": 2.5565308523155245e-06, + "loss": 0.63355309, + "num_input_tokens_seen": 76726650, + "step": 3559, + "time_per_iteration": 2.9872488975524902 + }, + { + "auxiliary_loss_clip": 0.01127029, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.0497241, + "balance_loss_mlp": 1.01895356, + "epoch": 0.42806469067516384, + "flos": 18215328481920.0, + "grad_norm": 2.4549283053145383, + "language_loss": 0.81956893, + "learning_rate": 2.5557826045689336e-06, + "loss": 0.84111243, + "num_input_tokens_seen": 76742890, + "step": 3560, + "time_per_iteration": 2.5713448524475098 + }, + { + "auxiliary_loss_clip": 0.01051977, + "auxiliary_loss_mlp": 0.01002156, + "balance_loss_clip": 1.02244353, + "balance_loss_mlp": 1.00089288, + "epoch": 0.4281849335658029, + "flos": 54535814432640.0, + "grad_norm": 0.852857577156619, + "language_loss": 0.58900601, + "learning_rate": 2.5550342725105643e-06, + "loss": 0.60954732, + "num_input_tokens_seen": 76801055, + "step": 3561, + "time_per_iteration": 3.0844850540161133 + }, + { + "auxiliary_loss_clip": 0.01176495, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.0572257, + "balance_loss_mlp": 1.02738047, + "epoch": 0.428305176456442, + "flos": 17274828723840.0, + "grad_norm": 1.970653701870008, + "language_loss": 0.80933803, + "learning_rate": 2.554285856253937e-06, + "loss": 0.83146107, + "num_input_tokens_seen": 76819890, + "step": 3562, + "time_per_iteration": 2.4988064765930176 + }, + { + "auxiliary_loss_clip": 0.01157881, + "auxiliary_loss_mlp": 0.01031704, + "balance_loss_clip": 1.0532788, + "balance_loss_mlp": 1.02314472, + "epoch": 0.4284254193470811, + "flos": 26359509749760.0, + "grad_norm": 2.0710510790640306, + "language_loss": 0.77540159, + "learning_rate": 2.5535373559125855e-06, + "loss": 0.79729736, + "num_input_tokens_seen": 76840255, + "step": 3563, + "time_per_iteration": 2.5640928745269775 + }, + { + "auxiliary_loss_clip": 0.01102681, + "auxiliary_loss_mlp": 0.01024263, + "balance_loss_clip": 1.04380846, + "balance_loss_mlp": 1.01500654, + "epoch": 0.42854566223772017, + "flos": 29714248379520.0, + "grad_norm": 1.6083680009454338, + "language_loss": 0.81882542, + "learning_rate": 2.552788771600057e-06, + "loss": 0.84009486, + "num_input_tokens_seen": 76860565, + "step": 3564, + "time_per_iteration": 3.523402214050293 + }, + { + "auxiliary_loss_clip": 0.0115028, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.05432725, + "balance_loss_mlp": 1.025226, + "epoch": 0.4286659051283593, + "flos": 22018161277440.0, + "grad_norm": 1.8794331573325378, + "language_loss": 0.81823325, + "learning_rate": 2.5520401034299118e-06, + "loss": 0.84007978, + "num_input_tokens_seen": 76878325, + "step": 3565, + "time_per_iteration": 4.247715711593628 + }, + { + "auxiliary_loss_clip": 0.01177738, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.05476904, + "balance_loss_mlp": 1.02110267, + "epoch": 0.4287861480189984, + "flos": 13334422838400.0, + "grad_norm": 2.1645049597680686, + "language_loss": 0.88124537, + "learning_rate": 2.551291351515722e-06, + "loss": 0.90332127, + "num_input_tokens_seen": 76895340, + "step": 3566, + "time_per_iteration": 2.4879322052001953 + }, + { + "auxiliary_loss_clip": 0.0114086, + "auxiliary_loss_mlp": 0.00764102, + "balance_loss_clip": 1.04679751, + "balance_loss_mlp": 1.00079131, + "epoch": 0.42890639090963745, + "flos": 26651535321600.0, + "grad_norm": 1.6340418285127332, + "language_loss": 0.85763371, + "learning_rate": 2.5505425159710726e-06, + "loss": 0.87668335, + "num_input_tokens_seen": 76915150, + "step": 3567, + "time_per_iteration": 2.5875203609466553 + }, + { + "auxiliary_loss_clip": 0.01166514, + "auxiliary_loss_mlp": 0.00763466, + "balance_loss_clip": 1.05092442, + "balance_loss_mlp": 1.00074458, + "epoch": 0.42902663380027656, + "flos": 24055768091520.0, + "grad_norm": 2.113159540320061, + "language_loss": 0.82754731, + "learning_rate": 2.549793596909561e-06, + "loss": 0.84684706, + "num_input_tokens_seen": 76933770, + "step": 3568, + "time_per_iteration": 2.5692074298858643 + }, + { + "auxiliary_loss_clip": 0.0115801, + "auxiliary_loss_mlp": 0.01027317, + "balance_loss_clip": 1.05392718, + "balance_loss_mlp": 1.0185492, + "epoch": 0.42914687669091567, + "flos": 15632561975040.0, + "grad_norm": 2.2845898615076163, + "language_loss": 0.66028476, + "learning_rate": 2.5490445944447976e-06, + "loss": 0.68213809, + "num_input_tokens_seen": 76952265, + "step": 3569, + "time_per_iteration": 2.48679518699646 + }, + { + "auxiliary_loss_clip": 0.01172922, + "auxiliary_loss_mlp": 0.01027135, + "balance_loss_clip": 1.05240059, + "balance_loss_mlp": 1.01905274, + "epoch": 0.4292671195815547, + "flos": 31467802440960.0, + "grad_norm": 2.0249316320053197, + "language_loss": 0.64870954, + "learning_rate": 2.548295508690406e-06, + "loss": 0.67071009, + "num_input_tokens_seen": 76973560, + "step": 3570, + "time_per_iteration": 2.556165933609009 + }, + { + "auxiliary_loss_clip": 0.01175953, + "auxiliary_loss_mlp": 0.01026689, + "balance_loss_clip": 1.05287075, + "balance_loss_mlp": 1.01818979, + "epoch": 0.42938736247219383, + "flos": 30257756046720.0, + "grad_norm": 1.860625048063495, + "language_loss": 0.7618739, + "learning_rate": 2.5475463397600217e-06, + "loss": 0.78390032, + "num_input_tokens_seen": 76993640, + "step": 3571, + "time_per_iteration": 2.555751085281372 + }, + { + "auxiliary_loss_clip": 0.01194602, + "auxiliary_loss_mlp": 0.01028095, + "balance_loss_clip": 1.05841196, + "balance_loss_mlp": 1.01950002, + "epoch": 0.42950760536283294, + "flos": 29349683291520.0, + "grad_norm": 1.8621323109174717, + "language_loss": 0.76970553, + "learning_rate": 2.546797087767293e-06, + "loss": 0.79193252, + "num_input_tokens_seen": 77013765, + "step": 3572, + "time_per_iteration": 2.5080058574676514 + }, + { + "auxiliary_loss_clip": 0.01129557, + "auxiliary_loss_mlp": 0.01034076, + "balance_loss_clip": 1.04971027, + "balance_loss_mlp": 1.02524829, + "epoch": 0.429627848253472, + "flos": 26869943969280.0, + "grad_norm": 1.7623197241242514, + "language_loss": 0.86981213, + "learning_rate": 2.546047752825881e-06, + "loss": 0.89144844, + "num_input_tokens_seen": 77034370, + "step": 3573, + "time_per_iteration": 2.614128828048706 + }, + { + "auxiliary_loss_clip": 0.01135752, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.0489285, + "balance_loss_mlp": 1.02466154, + "epoch": 0.4297480911441111, + "flos": 13881270470400.0, + "grad_norm": 2.1231181559495487, + "language_loss": 0.92988396, + "learning_rate": 2.5452983350494595e-06, + "loss": 0.95157486, + "num_input_tokens_seen": 77049925, + "step": 3574, + "time_per_iteration": 2.5671660900115967 + }, + { + "auxiliary_loss_clip": 0.01175247, + "auxiliary_loss_mlp": 0.00763276, + "balance_loss_clip": 1.05444968, + "balance_loss_mlp": 1.0008986, + "epoch": 0.4298683340347502, + "flos": 20741141975040.0, + "grad_norm": 2.0457065869958173, + "language_loss": 0.6489557, + "learning_rate": 2.544548834551713e-06, + "loss": 0.66834092, + "num_input_tokens_seen": 77068930, + "step": 3575, + "time_per_iteration": 2.5002431869506836 + }, + { + "auxiliary_loss_clip": 0.01142323, + "auxiliary_loss_mlp": 0.00763539, + "balance_loss_clip": 1.05028892, + "balance_loss_mlp": 1.00094366, + "epoch": 0.4299885769253893, + "flos": 20882126856960.0, + "grad_norm": 2.346920074118686, + "language_loss": 0.94666654, + "learning_rate": 2.5437992514463424e-06, + "loss": 0.96572518, + "num_input_tokens_seen": 77082255, + "step": 3576, + "time_per_iteration": 2.547612190246582 + }, + { + "auxiliary_loss_clip": 0.01176247, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.05546415, + "balance_loss_mlp": 1.02084541, + "epoch": 0.4301088198160284, + "flos": 25484618183040.0, + "grad_norm": 1.79016896397431, + "language_loss": 0.87678808, + "learning_rate": 2.5430495858470565e-06, + "loss": 0.89885139, + "num_input_tokens_seen": 77101725, + "step": 3577, + "time_per_iteration": 2.521101951599121 + }, + { + "auxiliary_loss_clip": 0.01174295, + "auxiliary_loss_mlp": 0.01027959, + "balance_loss_clip": 1.05631936, + "balance_loss_mlp": 1.01947784, + "epoch": 0.43022906270666744, + "flos": 18259427404800.0, + "grad_norm": 2.7491543959471008, + "language_loss": 0.77179706, + "learning_rate": 2.54229983786758e-06, + "loss": 0.79381967, + "num_input_tokens_seen": 77119670, + "step": 3578, + "time_per_iteration": 2.4567689895629883 + }, + { + "auxiliary_loss_clip": 0.01159351, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.05049658, + "balance_loss_mlp": 1.01989317, + "epoch": 0.43034930559730655, + "flos": 23399536567680.0, + "grad_norm": 1.8179057046001015, + "language_loss": 0.84877324, + "learning_rate": 2.541550007621651e-06, + "loss": 0.87065661, + "num_input_tokens_seen": 77138160, + "step": 3579, + "time_per_iteration": 2.531923532485962 + }, + { + "auxiliary_loss_clip": 0.01173692, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.05580008, + "balance_loss_mlp": 1.02089596, + "epoch": 0.43046954848794566, + "flos": 28184382264960.0, + "grad_norm": 1.7505892953080777, + "language_loss": 0.79961473, + "learning_rate": 2.5408000952230156e-06, + "loss": 0.8216393, + "num_input_tokens_seen": 77156950, + "step": 3580, + "time_per_iteration": 2.619529962539673 + }, + { + "auxiliary_loss_clip": 0.01155658, + "auxiliary_loss_mlp": 0.01028021, + "balance_loss_clip": 1.0495261, + "balance_loss_mlp": 1.01903868, + "epoch": 0.4305897913785847, + "flos": 28580476515840.0, + "grad_norm": 2.125148865616275, + "language_loss": 0.90619218, + "learning_rate": 2.5400501007854357e-06, + "loss": 0.92802906, + "num_input_tokens_seen": 77176395, + "step": 3581, + "time_per_iteration": 2.6639652252197266 + }, + { + "auxiliary_loss_clip": 0.01130697, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.04603958, + "balance_loss_mlp": 1.02443695, + "epoch": 0.43071003426922383, + "flos": 20448721353600.0, + "grad_norm": 2.6056719451792447, + "language_loss": 0.75566447, + "learning_rate": 2.539300024422685e-06, + "loss": 0.77729625, + "num_input_tokens_seen": 77194340, + "step": 3582, + "time_per_iteration": 2.6800506114959717 + }, + { + "auxiliary_loss_clip": 0.01048508, + "auxiliary_loss_mlp": 0.01005281, + "balance_loss_clip": 1.01508141, + "balance_loss_mlp": 1.00398183, + "epoch": 0.43083027715986294, + "flos": 51997969883520.0, + "grad_norm": 0.7884406122568653, + "language_loss": 0.6098237, + "learning_rate": 2.538549866248549e-06, + "loss": 0.63036156, + "num_input_tokens_seen": 77249320, + "step": 3583, + "time_per_iteration": 3.0076229572296143 + }, + { + "auxiliary_loss_clip": 0.0117598, + "auxiliary_loss_mlp": 0.01028609, + "balance_loss_clip": 1.05262423, + "balance_loss_mlp": 1.01969862, + "epoch": 0.430950520050502, + "flos": 16690885320960.0, + "grad_norm": 1.9715963419466505, + "language_loss": 0.81306481, + "learning_rate": 2.5377996263768274e-06, + "loss": 0.83511072, + "num_input_tokens_seen": 77267400, + "step": 3584, + "time_per_iteration": 2.5533456802368164 + }, + { + "auxiliary_loss_clip": 0.01173631, + "auxiliary_loss_mlp": 0.01033645, + "balance_loss_clip": 1.05408645, + "balance_loss_mlp": 1.02488899, + "epoch": 0.4310707629411411, + "flos": 24608433726720.0, + "grad_norm": 1.802014657603254, + "language_loss": 0.68220663, + "learning_rate": 2.5370493049213293e-06, + "loss": 0.70427942, + "num_input_tokens_seen": 77287045, + "step": 3585, + "time_per_iteration": 2.571535110473633 + }, + { + "auxiliary_loss_clip": 0.0108167, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.0437274, + "balance_loss_mlp": 1.018273, + "epoch": 0.4311910058317802, + "flos": 26432983019520.0, + "grad_norm": 2.0239083543836434, + "language_loss": 0.80124938, + "learning_rate": 2.536298901995878e-06, + "loss": 0.82233804, + "num_input_tokens_seen": 77306255, + "step": 3586, + "time_per_iteration": 2.8800737857818604 + }, + { + "auxiliary_loss_clip": 0.01162684, + "auxiliary_loss_mlp": 0.01026602, + "balance_loss_clip": 1.05375767, + "balance_loss_mlp": 1.01813245, + "epoch": 0.43131124872241927, + "flos": 25155891889920.0, + "grad_norm": 1.6428715217712708, + "language_loss": 0.80230314, + "learning_rate": 2.535548417714311e-06, + "loss": 0.82419598, + "num_input_tokens_seen": 77325555, + "step": 3587, + "time_per_iteration": 3.1103615760803223 + }, + { + "auxiliary_loss_clip": 0.01180398, + "auxiliary_loss_mlp": 0.01030576, + "balance_loss_clip": 1.05354679, + "balance_loss_mlp": 1.0214206, + "epoch": 0.4314314916130584, + "flos": 21614812479360.0, + "grad_norm": 1.895885336678126, + "language_loss": 0.87263393, + "learning_rate": 2.534797852190474e-06, + "loss": 0.89474374, + "num_input_tokens_seen": 77345735, + "step": 3588, + "time_per_iteration": 2.4910190105438232 + }, + { + "auxiliary_loss_clip": 0.01172685, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.05305362, + "balance_loss_mlp": 1.0279758, + "epoch": 0.4315517345036975, + "flos": 19275016544640.0, + "grad_norm": 1.9214529020130318, + "language_loss": 0.81729043, + "learning_rate": 2.5340472055382283e-06, + "loss": 0.83938348, + "num_input_tokens_seen": 77361765, + "step": 3589, + "time_per_iteration": 2.473909378051758 + }, + { + "auxiliary_loss_clip": 0.01142795, + "auxiliary_loss_mlp": 0.01025235, + "balance_loss_clip": 1.04677069, + "balance_loss_mlp": 1.01726902, + "epoch": 0.43167197739433655, + "flos": 24273853516800.0, + "grad_norm": 2.781656336428968, + "language_loss": 0.80906343, + "learning_rate": 2.5332964778714468e-06, + "loss": 0.83074373, + "num_input_tokens_seen": 77378950, + "step": 3590, + "time_per_iteration": 2.5622973442077637 + }, + { + "auxiliary_loss_clip": 0.0114533, + "auxiliary_loss_mlp": 0.01026086, + "balance_loss_clip": 1.05345297, + "balance_loss_mlp": 1.01803339, + "epoch": 0.43179222028497566, + "flos": 16867816738560.0, + "grad_norm": 2.582154843067964, + "language_loss": 0.66378891, + "learning_rate": 2.5325456693040123e-06, + "loss": 0.68550313, + "num_input_tokens_seen": 77396145, + "step": 3591, + "time_per_iteration": 4.062847375869751 + }, + { + "auxiliary_loss_clip": 0.01181919, + "auxiliary_loss_mlp": 0.0102392, + "balance_loss_clip": 1.05348694, + "balance_loss_mlp": 1.01515806, + "epoch": 0.43191246317561477, + "flos": 17639214243840.0, + "grad_norm": 2.260475643126832, + "language_loss": 0.74788469, + "learning_rate": 2.531794779949824e-06, + "loss": 0.76994306, + "num_input_tokens_seen": 77414045, + "step": 3592, + "time_per_iteration": 4.2467100620269775 + }, + { + "auxiliary_loss_clip": 0.01139026, + "auxiliary_loss_mlp": 0.0102674, + "balance_loss_clip": 1.05037892, + "balance_loss_mlp": 1.01887202, + "epoch": 0.4320327060662538, + "flos": 23878800760320.0, + "grad_norm": 2.384466274965046, + "language_loss": 0.87612528, + "learning_rate": 2.5310438099227903e-06, + "loss": 0.89778292, + "num_input_tokens_seen": 77431310, + "step": 3593, + "time_per_iteration": 2.5849857330322266 + }, + { + "auxiliary_loss_clip": 0.01071393, + "auxiliary_loss_mlp": 0.01007169, + "balance_loss_clip": 1.01522338, + "balance_loss_mlp": 1.00587547, + "epoch": 0.43215294895689293, + "flos": 66394917959040.0, + "grad_norm": 0.7962678798907534, + "language_loss": 0.53351188, + "learning_rate": 2.530292759336833e-06, + "loss": 0.55429757, + "num_input_tokens_seen": 77492045, + "step": 3594, + "time_per_iteration": 3.1064646244049072 + }, + { + "auxiliary_loss_clip": 0.01157438, + "auxiliary_loss_mlp": 0.01027258, + "balance_loss_clip": 1.05313063, + "balance_loss_mlp": 1.01872897, + "epoch": 0.432273191847532, + "flos": 20594267262720.0, + "grad_norm": 2.0833003888120554, + "language_loss": 0.69560504, + "learning_rate": 2.5295416283058855e-06, + "loss": 0.71745205, + "num_input_tokens_seen": 77510910, + "step": 3595, + "time_per_iteration": 2.5023372173309326 + }, + { + "auxiliary_loss_clip": 0.01155858, + "auxiliary_loss_mlp": 0.00763119, + "balance_loss_clip": 1.05149794, + "balance_loss_mlp": 1.00097954, + "epoch": 0.4323934347381711, + "flos": 19282127437440.0, + "grad_norm": 1.6550958640246884, + "language_loss": 0.66046226, + "learning_rate": 2.5287904169438943e-06, + "loss": 0.67965204, + "num_input_tokens_seen": 77530115, + "step": 3596, + "time_per_iteration": 2.5143041610717773 + }, + { + "auxiliary_loss_clip": 0.01113699, + "auxiliary_loss_mlp": 0.0103813, + "balance_loss_clip": 1.0498054, + "balance_loss_mlp": 1.02847385, + "epoch": 0.4325136776288102, + "flos": 21726315273600.0, + "grad_norm": 4.456272226288392, + "language_loss": 0.63985974, + "learning_rate": 2.528039125364817e-06, + "loss": 0.66137803, + "num_input_tokens_seen": 77548920, + "step": 3597, + "time_per_iteration": 2.6986124515533447 + }, + { + "auxiliary_loss_clip": 0.01147828, + "auxiliary_loss_mlp": 0.01028875, + "balance_loss_clip": 1.05018473, + "balance_loss_mlp": 1.01994634, + "epoch": 0.43263392051944927, + "flos": 22340746344960.0, + "grad_norm": 2.6280826679866895, + "language_loss": 0.75492954, + "learning_rate": 2.5272877536826246e-06, + "loss": 0.77669662, + "num_input_tokens_seen": 77567715, + "step": 3598, + "time_per_iteration": 2.7387547492980957 + }, + { + "auxiliary_loss_clip": 0.01133504, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.04551411, + "balance_loss_mlp": 1.02277172, + "epoch": 0.4327541634100884, + "flos": 29168406328320.0, + "grad_norm": 2.488662082258157, + "language_loss": 0.69918466, + "learning_rate": 2.5265363020112986e-06, + "loss": 0.72083366, + "num_input_tokens_seen": 77588035, + "step": 3599, + "time_per_iteration": 2.668870449066162 + }, + { + "auxiliary_loss_clip": 0.01175836, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.0561831, + "balance_loss_mlp": 1.02670527, + "epoch": 0.4328744063007275, + "flos": 26067448264320.0, + "grad_norm": 1.8845720127141063, + "language_loss": 0.83931732, + "learning_rate": 2.5257847704648344e-06, + "loss": 0.86143309, + "num_input_tokens_seen": 77609265, + "step": 3600, + "time_per_iteration": 2.523301601409912 + }, + { + "auxiliary_loss_clip": 0.01186761, + "auxiliary_loss_mlp": 0.01028523, + "balance_loss_clip": 1.05455494, + "balance_loss_mlp": 1.02048206, + "epoch": 0.43299464919136654, + "flos": 16581357774720.0, + "grad_norm": 1.7716669013467503, + "language_loss": 0.75687003, + "learning_rate": 2.525033159157239e-06, + "loss": 0.77902281, + "num_input_tokens_seen": 77625580, + "step": 3601, + "time_per_iteration": 2.4308271408081055 + }, + { + "auxiliary_loss_clip": 0.01172653, + "auxiliary_loss_mlp": 0.01039843, + "balance_loss_clip": 1.05417931, + "balance_loss_mlp": 1.03074765, + "epoch": 0.43311489208200565, + "flos": 16107265140480.0, + "grad_norm": 1.6755562373143098, + "language_loss": 0.76956749, + "learning_rate": 2.52428146820253e-06, + "loss": 0.7916925, + "num_input_tokens_seen": 77643835, + "step": 3602, + "time_per_iteration": 2.4573476314544678 + }, + { + "auxiliary_loss_clip": 0.01148586, + "auxiliary_loss_mlp": 0.01028433, + "balance_loss_clip": 1.05211258, + "balance_loss_mlp": 1.01906288, + "epoch": 0.43323513497264476, + "flos": 22930220442240.0, + "grad_norm": 1.7313800684400367, + "language_loss": 0.81860596, + "learning_rate": 2.52352969771474e-06, + "loss": 0.84037614, + "num_input_tokens_seen": 77663060, + "step": 3603, + "time_per_iteration": 2.699051856994629 + }, + { + "auxiliary_loss_clip": 0.01162332, + "auxiliary_loss_mlp": 0.01029896, + "balance_loss_clip": 1.05273771, + "balance_loss_mlp": 1.02150917, + "epoch": 0.4333553778632838, + "flos": 25299031587840.0, + "grad_norm": 2.0128399877500214, + "language_loss": 0.88539636, + "learning_rate": 2.5227778478079106e-06, + "loss": 0.90731871, + "num_input_tokens_seen": 77682470, + "step": 3604, + "time_per_iteration": 2.6738884449005127 + }, + { + "auxiliary_loss_clip": 0.01170154, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.05217481, + "balance_loss_mlp": 1.02393794, + "epoch": 0.43347562075392293, + "flos": 19387165783680.0, + "grad_norm": 1.5984201730015817, + "language_loss": 0.76519328, + "learning_rate": 2.522025918596098e-06, + "loss": 0.78721499, + "num_input_tokens_seen": 77700770, + "step": 3605, + "time_per_iteration": 2.4863343238830566 + }, + { + "auxiliary_loss_clip": 0.01177606, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.05498242, + "balance_loss_mlp": 1.02110076, + "epoch": 0.43359586364456204, + "flos": 26325969425280.0, + "grad_norm": 1.3863928500819878, + "language_loss": 0.65235567, + "learning_rate": 2.521273910193368e-06, + "loss": 0.67442274, + "num_input_tokens_seen": 77723950, + "step": 3606, + "time_per_iteration": 2.5447587966918945 + }, + { + "auxiliary_loss_clip": 0.01181592, + "auxiliary_loss_mlp": 0.01026993, + "balance_loss_clip": 1.05592251, + "balance_loss_mlp": 1.01885664, + "epoch": 0.4337161065352011, + "flos": 15989261984640.0, + "grad_norm": 2.3122672213116244, + "language_loss": 0.86968464, + "learning_rate": 2.5205218227138006e-06, + "loss": 0.89177048, + "num_input_tokens_seen": 77736905, + "step": 3607, + "time_per_iteration": 2.4545371532440186 + }, + { + "auxiliary_loss_clip": 0.01189181, + "auxiliary_loss_mlp": 0.0102505, + "balance_loss_clip": 1.05592036, + "balance_loss_mlp": 1.01690757, + "epoch": 0.4338363494258402, + "flos": 20224710184320.0, + "grad_norm": 3.115647373073245, + "language_loss": 0.79094398, + "learning_rate": 2.519769656271486e-06, + "loss": 0.81308627, + "num_input_tokens_seen": 77754325, + "step": 3608, + "time_per_iteration": 2.4514801502227783 + }, + { + "auxiliary_loss_clip": 0.01120225, + "auxiliary_loss_mlp": 0.01029298, + "balance_loss_clip": 1.04712176, + "balance_loss_mlp": 1.02051187, + "epoch": 0.43395659231647926, + "flos": 20083904870400.0, + "grad_norm": 2.0980201960115052, + "language_loss": 0.6759907, + "learning_rate": 2.5190174109805285e-06, + "loss": 0.69748598, + "num_input_tokens_seen": 77774150, + "step": 3609, + "time_per_iteration": 2.561511754989624 + }, + { + "auxiliary_loss_clip": 0.01149662, + "auxiliary_loss_mlp": 0.01027679, + "balance_loss_clip": 1.04916596, + "balance_loss_mlp": 1.01874399, + "epoch": 0.43407683520711837, + "flos": 19901801894400.0, + "grad_norm": 1.8310014329147213, + "language_loss": 0.63620895, + "learning_rate": 2.518265086955042e-06, + "loss": 0.65798235, + "num_input_tokens_seen": 77791870, + "step": 3610, + "time_per_iteration": 2.5113365650177 + }, + { + "auxiliary_loss_clip": 0.01188503, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.05453634, + "balance_loss_mlp": 1.02683949, + "epoch": 0.4341970780977575, + "flos": 23108732058240.0, + "grad_norm": 1.8824899945624796, + "language_loss": 0.83890343, + "learning_rate": 2.5175126843091534e-06, + "loss": 0.86113757, + "num_input_tokens_seen": 77811240, + "step": 3611, + "time_per_iteration": 2.4656591415405273 + }, + { + "auxiliary_loss_clip": 0.0116217, + "auxiliary_loss_mlp": 0.01025396, + "balance_loss_clip": 1.0510664, + "balance_loss_mlp": 1.01721835, + "epoch": 0.43431732098839654, + "flos": 37408288406400.0, + "grad_norm": 1.9989194219490038, + "language_loss": 0.75761902, + "learning_rate": 2.5167602031570034e-06, + "loss": 0.77949464, + "num_input_tokens_seen": 77831425, + "step": 3612, + "time_per_iteration": 2.6311330795288086 + }, + { + "auxiliary_loss_clip": 0.01188591, + "auxiliary_loss_mlp": 0.01027424, + "balance_loss_clip": 1.05566108, + "balance_loss_mlp": 1.01922274, + "epoch": 0.43443756387903565, + "flos": 31868206323840.0, + "grad_norm": 1.7886001006449956, + "language_loss": 0.73300385, + "learning_rate": 2.51600764361274e-06, + "loss": 0.75516403, + "num_input_tokens_seen": 77852950, + "step": 3613, + "time_per_iteration": 2.5321552753448486 + }, + { + "auxiliary_loss_clip": 0.01189383, + "auxiliary_loss_mlp": 0.01024225, + "balance_loss_clip": 1.05551887, + "balance_loss_mlp": 1.01614857, + "epoch": 0.43455780676967476, + "flos": 23477139901440.0, + "grad_norm": 3.061168452608522, + "language_loss": 0.78446949, + "learning_rate": 2.5152550057905283e-06, + "loss": 0.80660564, + "num_input_tokens_seen": 77872840, + "step": 3614, + "time_per_iteration": 2.485994577407837 + }, + { + "auxiliary_loss_clip": 0.01176785, + "auxiliary_loss_mlp": 0.00763286, + "balance_loss_clip": 1.05619431, + "balance_loss_mlp": 1.0010668, + "epoch": 0.4346780496603138, + "flos": 24207060176640.0, + "grad_norm": 2.63152046475893, + "language_loss": 0.76724696, + "learning_rate": 2.5145022898045415e-06, + "loss": 0.78664768, + "num_input_tokens_seen": 77892025, + "step": 3615, + "time_per_iteration": 2.5125367641448975 + }, + { + "auxiliary_loss_clip": 0.01160471, + "auxiliary_loss_mlp": 0.01032472, + "balance_loss_clip": 1.04918551, + "balance_loss_mlp": 1.02359116, + "epoch": 0.4347982925509529, + "flos": 17092366611840.0, + "grad_norm": 2.3674461486701333, + "language_loss": 0.89844716, + "learning_rate": 2.5137494957689664e-06, + "loss": 0.92037654, + "num_input_tokens_seen": 77907635, + "step": 3616, + "time_per_iteration": 2.468151807785034 + }, + { + "auxiliary_loss_clip": 0.01061563, + "auxiliary_loss_mlp": 0.01003751, + "balance_loss_clip": 1.01676595, + "balance_loss_mlp": 1.00255287, + "epoch": 0.43491853544159204, + "flos": 60945544696320.0, + "grad_norm": 0.7662077309802043, + "language_loss": 0.57317322, + "learning_rate": 2.5129966237980016e-06, + "loss": 0.59382635, + "num_input_tokens_seen": 77970630, + "step": 3617, + "time_per_iteration": 4.1156134605407715 + }, + { + "auxiliary_loss_clip": 0.01145022, + "auxiliary_loss_mlp": 0.01024589, + "balance_loss_clip": 1.04775465, + "balance_loss_mlp": 1.01657176, + "epoch": 0.4350387783322311, + "flos": 21944652094080.0, + "grad_norm": 1.8511183248158212, + "language_loss": 0.78008586, + "learning_rate": 2.512243674005857e-06, + "loss": 0.80178201, + "num_input_tokens_seen": 77989995, + "step": 3618, + "time_per_iteration": 3.5941507816314697 + }, + { + "auxiliary_loss_clip": 0.01114338, + "auxiliary_loss_mlp": 0.01031534, + "balance_loss_clip": 1.04700279, + "balance_loss_mlp": 1.02330828, + "epoch": 0.4351590212228702, + "flos": 25082705928960.0, + "grad_norm": 2.0229219633648636, + "language_loss": 0.85986203, + "learning_rate": 2.5114906465067537e-06, + "loss": 0.88132071, + "num_input_tokens_seen": 78010980, + "step": 3619, + "time_per_iteration": 4.220703840255737 + }, + { + "auxiliary_loss_clip": 0.0117302, + "auxiliary_loss_mlp": 0.01023746, + "balance_loss_clip": 1.0500313, + "balance_loss_mlp": 1.0156163, + "epoch": 0.4352792641135093, + "flos": 21506541909120.0, + "grad_norm": 2.1111680774684833, + "language_loss": 0.75054479, + "learning_rate": 2.5107375414149264e-06, + "loss": 0.77251244, + "num_input_tokens_seen": 78030225, + "step": 3620, + "time_per_iteration": 2.5348122119903564 + }, + { + "auxiliary_loss_clip": 0.01123651, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.04463911, + "balance_loss_mlp": 1.02064323, + "epoch": 0.43539950700414837, + "flos": 16253457494400.0, + "grad_norm": 2.360480711516488, + "language_loss": 0.71477342, + "learning_rate": 2.5099843588446197e-06, + "loss": 0.73630548, + "num_input_tokens_seen": 78048545, + "step": 3621, + "time_per_iteration": 2.579052686691284 + }, + { + "auxiliary_loss_clip": 0.0113952, + "auxiliary_loss_mlp": 0.01029847, + "balance_loss_clip": 1.05127275, + "balance_loss_mlp": 1.02165723, + "epoch": 0.4355197498947875, + "flos": 16691819074560.0, + "grad_norm": 1.6365046167696908, + "language_loss": 0.61305642, + "learning_rate": 2.509231098910091e-06, + "loss": 0.63475013, + "num_input_tokens_seen": 78068415, + "step": 3622, + "time_per_iteration": 2.6133010387420654 + }, + { + "auxiliary_loss_clip": 0.01157272, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.05511928, + "balance_loss_mlp": 1.01789618, + "epoch": 0.4356399927854266, + "flos": 16362733645440.0, + "grad_norm": 4.262992717606915, + "language_loss": 0.74554563, + "learning_rate": 2.508477761725611e-06, + "loss": 0.76738501, + "num_input_tokens_seen": 78086690, + "step": 3623, + "time_per_iteration": 2.5307250022888184 + }, + { + "auxiliary_loss_clip": 0.01177026, + "auxiliary_loss_mlp": 0.01025201, + "balance_loss_clip": 1.05400741, + "balance_loss_mlp": 1.01702905, + "epoch": 0.43576023567606564, + "flos": 17202037812480.0, + "grad_norm": 2.021003707947955, + "language_loss": 0.80206394, + "learning_rate": 2.507724347405458e-06, + "loss": 0.82408619, + "num_input_tokens_seen": 78104640, + "step": 3624, + "time_per_iteration": 2.4751169681549072 + }, + { + "auxiliary_loss_clip": 0.0112268, + "auxiliary_loss_mlp": 0.01027609, + "balance_loss_clip": 1.04432869, + "balance_loss_mlp": 1.01960444, + "epoch": 0.43588047856670475, + "flos": 15917656222080.0, + "grad_norm": 1.8897988961449161, + "language_loss": 0.81745028, + "learning_rate": 2.5069708560639243e-06, + "loss": 0.8389532, + "num_input_tokens_seen": 78122550, + "step": 3625, + "time_per_iteration": 2.586461305618286 + }, + { + "auxiliary_loss_clip": 0.01144625, + "auxiliary_loss_mlp": 0.01025485, + "balance_loss_clip": 1.04854274, + "balance_loss_mlp": 1.01708698, + "epoch": 0.4360007214573438, + "flos": 23659566099840.0, + "grad_norm": 2.079247945066852, + "language_loss": 0.61604166, + "learning_rate": 2.5062172878153158e-06, + "loss": 0.63774276, + "num_input_tokens_seen": 78141825, + "step": 3626, + "time_per_iteration": 2.5757083892822266 + }, + { + "auxiliary_loss_clip": 0.0112509, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.04881847, + "balance_loss_mlp": 1.0201776, + "epoch": 0.4361209643479829, + "flos": 21978767036160.0, + "grad_norm": 1.8918838982563875, + "language_loss": 0.87617743, + "learning_rate": 2.505463642773947e-06, + "loss": 0.89771873, + "num_input_tokens_seen": 78161790, + "step": 3627, + "time_per_iteration": 2.6861870288848877 + }, + { + "auxiliary_loss_clip": 0.0114651, + "auxiliary_loss_mlp": 0.00763065, + "balance_loss_clip": 1.05148315, + "balance_loss_mlp": 1.00099134, + "epoch": 0.43624120723862203, + "flos": 17420159151360.0, + "grad_norm": 2.220933959478786, + "language_loss": 0.75184739, + "learning_rate": 2.504709921054146e-06, + "loss": 0.77094316, + "num_input_tokens_seen": 78178605, + "step": 3628, + "time_per_iteration": 2.5630338191986084 + }, + { + "auxiliary_loss_clip": 0.01139945, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.04564631, + "balance_loss_mlp": 1.02190387, + "epoch": 0.4363614501292611, + "flos": 17895293280000.0, + "grad_norm": 1.9777551833602096, + "language_loss": 0.83724046, + "learning_rate": 2.50395612277025e-06, + "loss": 0.85894597, + "num_input_tokens_seen": 78194460, + "step": 3629, + "time_per_iteration": 2.547746181488037 + }, + { + "auxiliary_loss_clip": 0.01160312, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.0496037, + "balance_loss_mlp": 1.01936412, + "epoch": 0.4364816930199002, + "flos": 20302888135680.0, + "grad_norm": 2.152056421754016, + "language_loss": 0.72902846, + "learning_rate": 2.503202248036612e-06, + "loss": 0.75090647, + "num_input_tokens_seen": 78213315, + "step": 3630, + "time_per_iteration": 2.5831878185272217 + }, + { + "auxiliary_loss_clip": 0.0118441, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.05273855, + "balance_loss_mlp": 1.02377748, + "epoch": 0.4366019359105393, + "flos": 24061334699520.0, + "grad_norm": 2.595875352256163, + "language_loss": 0.73058152, + "learning_rate": 2.5024482969675927e-06, + "loss": 0.75274825, + "num_input_tokens_seen": 78233270, + "step": 3631, + "time_per_iteration": 2.4993038177490234 + }, + { + "auxiliary_loss_clip": 0.01133959, + "auxiliary_loss_mlp": 0.01024745, + "balance_loss_clip": 1.04735863, + "balance_loss_mlp": 1.0171454, + "epoch": 0.43672217880117836, + "flos": 21754109422080.0, + "grad_norm": 2.457261700320629, + "language_loss": 0.84437376, + "learning_rate": 2.501694269677566e-06, + "loss": 0.86596084, + "num_input_tokens_seen": 78251040, + "step": 3632, + "time_per_iteration": 2.593393325805664 + }, + { + "auxiliary_loss_clip": 0.01175539, + "auxiliary_loss_mlp": 0.01025275, + "balance_loss_clip": 1.05175805, + "balance_loss_mlp": 1.01730013, + "epoch": 0.4368424216918175, + "flos": 18035200753920.0, + "grad_norm": 3.7752919016787523, + "language_loss": 0.804681, + "learning_rate": 2.500940166280918e-06, + "loss": 0.82668912, + "num_input_tokens_seen": 78269470, + "step": 3633, + "time_per_iteration": 2.500882863998413 + }, + { + "auxiliary_loss_clip": 0.0116847, + "auxiliary_loss_mlp": 0.01026848, + "balance_loss_clip": 1.05122209, + "balance_loss_mlp": 1.01881945, + "epoch": 0.4369626645824566, + "flos": 25447127362560.0, + "grad_norm": 1.7616203628198737, + "language_loss": 0.7891469, + "learning_rate": 2.500185986892045e-06, + "loss": 0.81110007, + "num_input_tokens_seen": 78288955, + "step": 3634, + "time_per_iteration": 2.5174083709716797 + }, + { + "auxiliary_loss_clip": 0.01167705, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.05053806, + "balance_loss_mlp": 1.0230372, + "epoch": 0.43708290747309564, + "flos": 25302694775040.0, + "grad_norm": 2.295443329527029, + "language_loss": 0.77415931, + "learning_rate": 2.499431731625355e-06, + "loss": 0.79614806, + "num_input_tokens_seen": 78307980, + "step": 3635, + "time_per_iteration": 2.533064842224121 + }, + { + "auxiliary_loss_clip": 0.01186642, + "auxiliary_loss_mlp": 0.01026779, + "balance_loss_clip": 1.05328345, + "balance_loss_mlp": 1.01813054, + "epoch": 0.43720315036373475, + "flos": 31575103344000.0, + "grad_norm": 9.85080068793762, + "language_loss": 0.79656434, + "learning_rate": 2.4986774005952686e-06, + "loss": 0.81869853, + "num_input_tokens_seen": 78330355, + "step": 3636, + "time_per_iteration": 2.5505666732788086 + }, + { + "auxiliary_loss_clip": 0.01170862, + "auxiliary_loss_mlp": 0.01025304, + "balance_loss_clip": 1.05475628, + "balance_loss_mlp": 1.01762128, + "epoch": 0.43732339325437386, + "flos": 23112000195840.0, + "grad_norm": 2.424742936642961, + "language_loss": 0.84503591, + "learning_rate": 2.4979229939162166e-06, + "loss": 0.8669976, + "num_input_tokens_seen": 78349135, + "step": 3637, + "time_per_iteration": 2.5708019733428955 + }, + { + "auxiliary_loss_clip": 0.01168112, + "auxiliary_loss_mlp": 0.01024249, + "balance_loss_clip": 1.05337572, + "balance_loss_mlp": 1.01661408, + "epoch": 0.4374436361450129, + "flos": 27746272080000.0, + "grad_norm": 1.7492333203691137, + "language_loss": 0.80315125, + "learning_rate": 2.4971685117026433e-06, + "loss": 0.82507491, + "num_input_tokens_seen": 78368900, + "step": 3638, + "time_per_iteration": 2.5272703170776367 + }, + { + "auxiliary_loss_clip": 0.01173102, + "auxiliary_loss_mlp": 0.0102451, + "balance_loss_clip": 1.05275822, + "balance_loss_mlp": 1.01640964, + "epoch": 0.437563879035652, + "flos": 24172370616960.0, + "grad_norm": 2.2775220872922444, + "language_loss": 0.76611459, + "learning_rate": 2.4964139540690018e-06, + "loss": 0.78809071, + "num_input_tokens_seen": 78392235, + "step": 3639, + "time_per_iteration": 2.570781707763672 + }, + { + "auxiliary_loss_clip": 0.01143737, + "auxiliary_loss_mlp": 0.01026747, + "balance_loss_clip": 1.04837251, + "balance_loss_mlp": 1.01829529, + "epoch": 0.4376841219262911, + "flos": 23477211728640.0, + "grad_norm": 3.45164196704111, + "language_loss": 0.72713393, + "learning_rate": 2.495659321129758e-06, + "loss": 0.74883878, + "num_input_tokens_seen": 78409980, + "step": 3640, + "time_per_iteration": 2.6368091106414795 + }, + { + "auxiliary_loss_clip": 0.01166591, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.04923725, + "balance_loss_mlp": 1.02273297, + "epoch": 0.4378043648169302, + "flos": 25447809720960.0, + "grad_norm": 2.239554227582977, + "language_loss": 0.75213873, + "learning_rate": 2.494904612999389e-06, + "loss": 0.77410841, + "num_input_tokens_seen": 78428690, + "step": 3641, + "time_per_iteration": 2.5573673248291016 + }, + { + "auxiliary_loss_clip": 0.01066901, + "auxiliary_loss_mlp": 0.01003019, + "balance_loss_clip": 1.01387191, + "balance_loss_mlp": 1.00168943, + "epoch": 0.4379246077075693, + "flos": 53914056986880.0, + "grad_norm": 0.7442459627992538, + "language_loss": 0.56537509, + "learning_rate": 2.4941498297923843e-06, + "loss": 0.58607429, + "num_input_tokens_seen": 78489260, + "step": 3642, + "time_per_iteration": 3.0121705532073975 + }, + { + "auxiliary_loss_clip": 0.01168649, + "auxiliary_loss_mlp": 0.01025485, + "balance_loss_clip": 1.05156064, + "balance_loss_mlp": 1.01773071, + "epoch": 0.43804485059820836, + "flos": 20588305605120.0, + "grad_norm": 1.7853857968269768, + "language_loss": 0.69626528, + "learning_rate": 2.4933949716232424e-06, + "loss": 0.71820664, + "num_input_tokens_seen": 78506785, + "step": 3643, + "time_per_iteration": 3.2825655937194824 + }, + { + "auxiliary_loss_clip": 0.01142728, + "auxiliary_loss_mlp": 0.01025687, + "balance_loss_clip": 1.05096233, + "balance_loss_mlp": 1.0175271, + "epoch": 0.43816509348884747, + "flos": 23876214981120.0, + "grad_norm": 2.116870828353088, + "language_loss": 0.73904252, + "learning_rate": 2.492640038606476e-06, + "loss": 0.76072669, + "num_input_tokens_seen": 78525150, + "step": 3644, + "time_per_iteration": 3.3740460872650146 + }, + { + "auxiliary_loss_clip": 0.01170123, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.04998016, + "balance_loss_mlp": 1.01968396, + "epoch": 0.4382853363794866, + "flos": 14684448533760.0, + "grad_norm": 1.9441837685561105, + "language_loss": 0.78655189, + "learning_rate": 2.491885030856608e-06, + "loss": 0.80853391, + "num_input_tokens_seen": 78543245, + "step": 3645, + "time_per_iteration": 2.523808240890503 + }, + { + "auxiliary_loss_clip": 0.01158554, + "auxiliary_loss_mlp": 0.0102551, + "balance_loss_clip": 1.05147529, + "balance_loss_mlp": 1.01761258, + "epoch": 0.43840557927012563, + "flos": 17165301177600.0, + "grad_norm": 11.20642491841528, + "language_loss": 0.8258056, + "learning_rate": 2.4911299484881713e-06, + "loss": 0.84764624, + "num_input_tokens_seen": 78560775, + "step": 3646, + "time_per_iteration": 3.961313486099243 + }, + { + "auxiliary_loss_clip": 0.01150571, + "auxiliary_loss_mlp": 0.01023762, + "balance_loss_clip": 1.04839671, + "balance_loss_mlp": 1.01582265, + "epoch": 0.43852582216076474, + "flos": 19390685316480.0, + "grad_norm": 1.6402037704421604, + "language_loss": 0.80898076, + "learning_rate": 2.490374791615712e-06, + "loss": 0.83072412, + "num_input_tokens_seen": 78580800, + "step": 3647, + "time_per_iteration": 2.653379201889038 + }, + { + "auxiliary_loss_clip": 0.01191553, + "auxiliary_loss_mlp": 0.00762991, + "balance_loss_clip": 1.0557754, + "balance_loss_mlp": 1.00096965, + "epoch": 0.43864606505140386, + "flos": 18075133699200.0, + "grad_norm": 2.5319044065166936, + "language_loss": 0.77278507, + "learning_rate": 2.4896195603537867e-06, + "loss": 0.7923305, + "num_input_tokens_seen": 78595410, + "step": 3648, + "time_per_iteration": 2.4322571754455566 + }, + { + "auxiliary_loss_clip": 0.01124417, + "auxiliary_loss_mlp": 0.01023804, + "balance_loss_clip": 1.05056286, + "balance_loss_mlp": 1.01566172, + "epoch": 0.4387663079420429, + "flos": 19644896845440.0, + "grad_norm": 1.7853968236520714, + "language_loss": 0.7425313, + "learning_rate": 2.488864254816964e-06, + "loss": 0.76401353, + "num_input_tokens_seen": 78614100, + "step": 3649, + "time_per_iteration": 2.6201717853546143 + }, + { + "auxiliary_loss_clip": 0.01172774, + "auxiliary_loss_mlp": 0.01035167, + "balance_loss_clip": 1.05354595, + "balance_loss_mlp": 1.02673876, + "epoch": 0.438886550832682, + "flos": 19719339782400.0, + "grad_norm": 2.404680695233135, + "language_loss": 0.68455541, + "learning_rate": 2.4881088751198218e-06, + "loss": 0.70663488, + "num_input_tokens_seen": 78632260, + "step": 3650, + "time_per_iteration": 2.4715523719787598 + }, + { + "auxiliary_loss_clip": 0.01157846, + "auxiliary_loss_mlp": 0.01025243, + "balance_loss_clip": 1.04843926, + "balance_loss_mlp": 1.01706505, + "epoch": 0.43900679372332113, + "flos": 14536675981440.0, + "grad_norm": 2.5837730257461593, + "language_loss": 0.64134473, + "learning_rate": 2.4873534213769517e-06, + "loss": 0.66317558, + "num_input_tokens_seen": 78647490, + "step": 3651, + "time_per_iteration": 2.5342273712158203 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.05003428, + "balance_loss_mlp": 1.0203948, + "epoch": 0.4391270366139602, + "flos": 24056234968320.0, + "grad_norm": 1.7869093475850568, + "language_loss": 0.71853435, + "learning_rate": 2.4865978937029547e-06, + "loss": 0.74018216, + "num_input_tokens_seen": 78666470, + "step": 3652, + "time_per_iteration": 2.5829286575317383 + }, + { + "auxiliary_loss_clip": 0.0111927, + "auxiliary_loss_mlp": 0.01031829, + "balance_loss_clip": 1.04741883, + "balance_loss_mlp": 1.02337754, + "epoch": 0.4392472795045993, + "flos": 31538510363520.0, + "grad_norm": 1.897101178982169, + "language_loss": 0.66380948, + "learning_rate": 2.485842292212445e-06, + "loss": 0.6853205, + "num_input_tokens_seen": 78687685, + "step": 3653, + "time_per_iteration": 2.7081780433654785 + }, + { + "auxiliary_loss_clip": 0.01186716, + "auxiliary_loss_mlp": 0.01031334, + "balance_loss_clip": 1.05411744, + "balance_loss_mlp": 1.02313805, + "epoch": 0.4393675223952384, + "flos": 14866300114560.0, + "grad_norm": 1.9167261233997483, + "language_loss": 0.80071169, + "learning_rate": 2.485086617020045e-06, + "loss": 0.82289219, + "num_input_tokens_seen": 78706180, + "step": 3654, + "time_per_iteration": 2.434624433517456 + }, + { + "auxiliary_loss_clip": 0.01149023, + "auxiliary_loss_mlp": 0.01022036, + "balance_loss_clip": 1.04801023, + "balance_loss_mlp": 1.01367366, + "epoch": 0.43948776528587746, + "flos": 14825900292480.0, + "grad_norm": 2.3516229889782556, + "language_loss": 0.81515968, + "learning_rate": 2.4843308682403903e-06, + "loss": 0.83687025, + "num_input_tokens_seen": 78723095, + "step": 3655, + "time_per_iteration": 2.5429933071136475 + }, + { + "auxiliary_loss_clip": 0.01183775, + "auxiliary_loss_mlp": 0.01026764, + "balance_loss_clip": 1.05255687, + "balance_loss_mlp": 1.01906872, + "epoch": 0.4396080081765166, + "flos": 13914523486080.0, + "grad_norm": 1.7384254872870264, + "language_loss": 0.82481277, + "learning_rate": 2.4835750459881294e-06, + "loss": 0.84691823, + "num_input_tokens_seen": 78739720, + "step": 3656, + "time_per_iteration": 2.4180355072021484 + }, + { + "auxiliary_loss_clip": 0.01149602, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.04817128, + "balance_loss_mlp": 1.02441549, + "epoch": 0.43972825106715563, + "flos": 18222978078720.0, + "grad_norm": 1.8648655488741956, + "language_loss": 0.82069302, + "learning_rate": 2.4828191503779177e-06, + "loss": 0.84252119, + "num_input_tokens_seen": 78757820, + "step": 3657, + "time_per_iteration": 2.5231990814208984 + }, + { + "auxiliary_loss_clip": 0.0113981, + "auxiliary_loss_mlp": 0.01024762, + "balance_loss_clip": 1.04679561, + "balance_loss_mlp": 1.0163995, + "epoch": 0.43984849395779474, + "flos": 16873239692160.0, + "grad_norm": 2.213352219364349, + "language_loss": 0.89260304, + "learning_rate": 2.482063181524425e-06, + "loss": 0.91424876, + "num_input_tokens_seen": 78773720, + "step": 3658, + "time_per_iteration": 2.499063014984131 + }, + { + "auxiliary_loss_clip": 0.01187436, + "auxiliary_loss_mlp": 0.01034373, + "balance_loss_clip": 1.05435598, + "balance_loss_mlp": 1.02604055, + "epoch": 0.43996873684843385, + "flos": 18691504104960.0, + "grad_norm": 2.3963257457471556, + "language_loss": 0.81337357, + "learning_rate": 2.4813071395423307e-06, + "loss": 0.83559161, + "num_input_tokens_seen": 78791285, + "step": 3659, + "time_per_iteration": 2.472607374191284 + }, + { + "auxiliary_loss_clip": 0.0117055, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.0514189, + "balance_loss_mlp": 1.02124619, + "epoch": 0.4400889797390729, + "flos": 23653460787840.0, + "grad_norm": 1.7173190274939754, + "language_loss": 0.6425283, + "learning_rate": 2.4805510245463263e-06, + "loss": 0.66453636, + "num_input_tokens_seen": 78811440, + "step": 3660, + "time_per_iteration": 2.5268502235412598 + }, + { + "auxiliary_loss_clip": 0.0117018, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.05035126, + "balance_loss_mlp": 1.02100265, + "epoch": 0.440209222629712, + "flos": 23149203707520.0, + "grad_norm": 2.0810536263100867, + "language_loss": 0.59642947, + "learning_rate": 2.4797948366511137e-06, + "loss": 0.61842537, + "num_input_tokens_seen": 78831150, + "step": 3661, + "time_per_iteration": 2.5582566261291504 + }, + { + "auxiliary_loss_clip": 0.01142228, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.04714608, + "balance_loss_mlp": 1.02218294, + "epoch": 0.4403294655203511, + "flos": 24823394668800.0, + "grad_norm": 2.1958751190535732, + "language_loss": 0.76200855, + "learning_rate": 2.4790385759714055e-06, + "loss": 0.78373367, + "num_input_tokens_seen": 78850215, + "step": 3662, + "time_per_iteration": 2.5873031616210938 + }, + { + "auxiliary_loss_clip": 0.01171225, + "auxiliary_loss_mlp": 0.01028072, + "balance_loss_clip": 1.05589151, + "balance_loss_mlp": 1.02007294, + "epoch": 0.4404497084109902, + "flos": 22565080736640.0, + "grad_norm": 2.0053236597744863, + "language_loss": 0.71200073, + "learning_rate": 2.478282242621926e-06, + "loss": 0.73399365, + "num_input_tokens_seen": 78870675, + "step": 3663, + "time_per_iteration": 2.5775856971740723 + }, + { + "auxiliary_loss_clip": 0.01049105, + "auxiliary_loss_mlp": 0.01001351, + "balance_loss_clip": 1.01540351, + "balance_loss_mlp": 1.00000405, + "epoch": 0.4405699513016293, + "flos": 64967073448320.0, + "grad_norm": 0.8342674023161435, + "language_loss": 0.59588355, + "learning_rate": 2.477525836717411e-06, + "loss": 0.61638814, + "num_input_tokens_seen": 78938440, + "step": 3664, + "time_per_iteration": 3.2807576656341553 + }, + { + "auxiliary_loss_clip": 0.01168627, + "auxiliary_loss_mlp": 0.01027091, + "balance_loss_clip": 1.04937458, + "balance_loss_mlp": 1.01906812, + "epoch": 0.4406901941922684, + "flos": 35661952978560.0, + "grad_norm": 3.4488844858826795, + "language_loss": 0.79299021, + "learning_rate": 2.476769358372606e-06, + "loss": 0.81494743, + "num_input_tokens_seen": 78960090, + "step": 3665, + "time_per_iteration": 2.6583526134490967 + }, + { + "auxiliary_loss_clip": 0.01139013, + "auxiliary_loss_mlp": 0.01025363, + "balance_loss_clip": 1.05059946, + "balance_loss_mlp": 1.01810932, + "epoch": 0.44081043708290746, + "flos": 18040767361920.0, + "grad_norm": 2.0630928425824715, + "language_loss": 0.74562895, + "learning_rate": 2.4760128077022683e-06, + "loss": 0.76727271, + "num_input_tokens_seen": 78978225, + "step": 3666, + "time_per_iteration": 2.5342626571655273 + }, + { + "auxiliary_loss_clip": 0.01121764, + "auxiliary_loss_mlp": 0.01023342, + "balance_loss_clip": 1.04983139, + "balance_loss_mlp": 1.01563501, + "epoch": 0.44093067997354657, + "flos": 30153507799680.0, + "grad_norm": 1.5160058059876906, + "language_loss": 0.68164217, + "learning_rate": 2.4752561848211672e-06, + "loss": 0.70309323, + "num_input_tokens_seen": 79000625, + "step": 3667, + "time_per_iteration": 2.6499717235565186 + }, + { + "auxiliary_loss_clip": 0.01171049, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.05676365, + "balance_loss_mlp": 1.02369201, + "epoch": 0.4410509228641857, + "flos": 23255068066560.0, + "grad_norm": 1.9330302266982509, + "language_loss": 0.71117651, + "learning_rate": 2.4744994898440797e-06, + "loss": 0.7332058, + "num_input_tokens_seen": 79019415, + "step": 3668, + "time_per_iteration": 2.5080621242523193 + }, + { + "auxiliary_loss_clip": 0.01146239, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.04872656, + "balance_loss_mlp": 1.02285814, + "epoch": 0.44117116575482473, + "flos": 19500571998720.0, + "grad_norm": 4.004353888370738, + "language_loss": 0.83547121, + "learning_rate": 2.473742722885797e-06, + "loss": 0.85724986, + "num_input_tokens_seen": 79038435, + "step": 3669, + "time_per_iteration": 2.5301356315612793 + }, + { + "auxiliary_loss_clip": 0.01173227, + "auxiliary_loss_mlp": 0.00763045, + "balance_loss_clip": 1.05578077, + "balance_loss_mlp": 1.00108886, + "epoch": 0.44129140864546385, + "flos": 27053124353280.0, + "grad_norm": 3.651437640540076, + "language_loss": 0.65362, + "learning_rate": 2.4729858840611197e-06, + "loss": 0.67298275, + "num_input_tokens_seen": 79057345, + "step": 3670, + "time_per_iteration": 3.2535464763641357 + }, + { + "auxiliary_loss_clip": 0.01185202, + "auxiliary_loss_mlp": 0.01026332, + "balance_loss_clip": 1.05463433, + "balance_loss_mlp": 1.01827991, + "epoch": 0.4414116515361029, + "flos": 26102101910400.0, + "grad_norm": 18.895538519980732, + "language_loss": 0.72678542, + "learning_rate": 2.4722289734848605e-06, + "loss": 0.74890071, + "num_input_tokens_seen": 79077810, + "step": 3671, + "time_per_iteration": 3.3236656188964844 + }, + { + "auxiliary_loss_clip": 0.01141637, + "auxiliary_loss_mlp": 0.01028102, + "balance_loss_clip": 1.05332184, + "balance_loss_mlp": 1.01959085, + "epoch": 0.441531894426742, + "flos": 21906083865600.0, + "grad_norm": 2.0086615819674054, + "language_loss": 0.77848721, + "learning_rate": 2.471471991271841e-06, + "loss": 0.80018461, + "num_input_tokens_seen": 79094935, + "step": 3672, + "time_per_iteration": 3.2673275470733643 + }, + { + "auxiliary_loss_clip": 0.01164714, + "auxiliary_loss_mlp": 0.01022577, + "balance_loss_clip": 1.05168915, + "balance_loss_mlp": 1.01434517, + "epoch": 0.4416521373173811, + "flos": 23437099215360.0, + "grad_norm": 1.8734151892321949, + "language_loss": 0.79352969, + "learning_rate": 2.470714937536896e-06, + "loss": 0.81540263, + "num_input_tokens_seen": 79113660, + "step": 3673, + "time_per_iteration": 3.235666275024414 + }, + { + "auxiliary_loss_clip": 0.01126381, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.04822385, + "balance_loss_mlp": 1.02251601, + "epoch": 0.4417723802080202, + "flos": 20334345471360.0, + "grad_norm": 1.906123576142502, + "language_loss": 0.7031883, + "learning_rate": 2.469957812394868e-06, + "loss": 0.72476113, + "num_input_tokens_seen": 79132470, + "step": 3674, + "time_per_iteration": 2.562448501586914 + }, + { + "auxiliary_loss_clip": 0.01185031, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_clip": 1.05577469, + "balance_loss_mlp": 1.01951945, + "epoch": 0.4418926230986593, + "flos": 18880682060160.0, + "grad_norm": 2.014583485922744, + "language_loss": 0.76072431, + "learning_rate": 2.4692006159606148e-06, + "loss": 0.78284979, + "num_input_tokens_seen": 79150000, + "step": 3675, + "time_per_iteration": 2.454901933670044 + }, + { + "auxiliary_loss_clip": 0.01186138, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.05505228, + "balance_loss_mlp": 1.01946771, + "epoch": 0.4420128659892984, + "flos": 19464409981440.0, + "grad_norm": 2.206368717475748, + "language_loss": 0.78355205, + "learning_rate": 2.468443348349e-06, + "loss": 0.80568939, + "num_input_tokens_seen": 79167875, + "step": 3676, + "time_per_iteration": 2.4531409740448 + }, + { + "auxiliary_loss_clip": 0.01127596, + "auxiliary_loss_mlp": 0.01027989, + "balance_loss_clip": 1.04799747, + "balance_loss_mlp": 1.01893544, + "epoch": 0.44213310887993745, + "flos": 17894359526400.0, + "grad_norm": 2.5919450178990844, + "language_loss": 0.82125032, + "learning_rate": 2.467686009674902e-06, + "loss": 0.84280622, + "num_input_tokens_seen": 79182325, + "step": 3677, + "time_per_iteration": 2.56907057762146 + }, + { + "auxiliary_loss_clip": 0.01166478, + "auxiliary_loss_mlp": 0.0102698, + "balance_loss_clip": 1.05003583, + "balance_loss_mlp": 1.01846814, + "epoch": 0.44225335177057656, + "flos": 19204667758080.0, + "grad_norm": 2.3453879253106464, + "language_loss": 0.85462546, + "learning_rate": 2.466928600053209e-06, + "loss": 0.87656009, + "num_input_tokens_seen": 79197630, + "step": 3678, + "time_per_iteration": 2.4917778968811035 + }, + { + "auxiliary_loss_clip": 0.01155868, + "auxiliary_loss_mlp": 0.01026488, + "balance_loss_clip": 1.05031204, + "balance_loss_mlp": 1.01838803, + "epoch": 0.4423735946612157, + "flos": 23471321898240.0, + "grad_norm": 2.5774450864154423, + "language_loss": 0.71773267, + "learning_rate": 2.466171119598818e-06, + "loss": 0.73955625, + "num_input_tokens_seen": 79217600, + "step": 3679, + "time_per_iteration": 2.635673999786377 + }, + { + "auxiliary_loss_clip": 0.011767, + "auxiliary_loss_mlp": 0.01034047, + "balance_loss_clip": 1.05092132, + "balance_loss_mlp": 1.02538645, + "epoch": 0.44249383755185473, + "flos": 26685398868480.0, + "grad_norm": 2.0970139793008222, + "language_loss": 0.77242517, + "learning_rate": 2.465413568426639e-06, + "loss": 0.79453266, + "num_input_tokens_seen": 79238550, + "step": 3680, + "time_per_iteration": 2.5305044651031494 + }, + { + "auxiliary_loss_clip": 0.01164319, + "auxiliary_loss_mlp": 0.010222, + "balance_loss_clip": 1.05071378, + "balance_loss_mlp": 1.01485705, + "epoch": 0.44261408044249384, + "flos": 23147659422720.0, + "grad_norm": 1.69248722816639, + "language_loss": 0.81201935, + "learning_rate": 2.464655946651591e-06, + "loss": 0.83388448, + "num_input_tokens_seen": 79257555, + "step": 3681, + "time_per_iteration": 2.4880049228668213 + }, + { + "auxiliary_loss_clip": 0.01175097, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.05392647, + "balance_loss_mlp": 1.0211823, + "epoch": 0.44273432333313295, + "flos": 24462564595200.0, + "grad_norm": 1.7989483464787672, + "language_loss": 0.81008714, + "learning_rate": 2.4638982543886065e-06, + "loss": 0.8321349, + "num_input_tokens_seen": 79277595, + "step": 3682, + "time_per_iteration": 2.518897771835327 + }, + { + "auxiliary_loss_clip": 0.01175817, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.05526721, + "balance_loss_mlp": 1.02370524, + "epoch": 0.442854566223772, + "flos": 17528932512000.0, + "grad_norm": 3.969333833877155, + "language_loss": 0.87480175, + "learning_rate": 2.4631404917526254e-06, + "loss": 0.89688152, + "num_input_tokens_seen": 79294550, + "step": 3683, + "time_per_iteration": 2.45359468460083 + }, + { + "auxiliary_loss_clip": 0.01165165, + "auxiliary_loss_mlp": 0.01025928, + "balance_loss_clip": 1.05046737, + "balance_loss_mlp": 1.0182364, + "epoch": 0.4429748091144111, + "flos": 24896293320960.0, + "grad_norm": 1.6923987672756866, + "language_loss": 0.78999799, + "learning_rate": 2.4623826588586e-06, + "loss": 0.81190896, + "num_input_tokens_seen": 79314820, + "step": 3684, + "time_per_iteration": 2.508892774581909 + }, + { + "auxiliary_loss_clip": 0.01152292, + "auxiliary_loss_mlp": 0.01029236, + "balance_loss_clip": 1.04835343, + "balance_loss_mlp": 1.02020025, + "epoch": 0.4430950520050502, + "flos": 21614704738560.0, + "grad_norm": 1.539078075406758, + "language_loss": 0.82669181, + "learning_rate": 2.461624755821492e-06, + "loss": 0.84850711, + "num_input_tokens_seen": 79334300, + "step": 3685, + "time_per_iteration": 2.5164449214935303 + }, + { + "auxiliary_loss_clip": 0.01142108, + "auxiliary_loss_mlp": 0.01024968, + "balance_loss_clip": 1.04881752, + "balance_loss_mlp": 1.01733279, + "epoch": 0.4432152948956893, + "flos": 24572271709440.0, + "grad_norm": 1.728230573475544, + "language_loss": 0.76319766, + "learning_rate": 2.4608667827562763e-06, + "loss": 0.78486842, + "num_input_tokens_seen": 79353630, + "step": 3686, + "time_per_iteration": 2.567831039428711 + }, + { + "auxiliary_loss_clip": 0.01177852, + "auxiliary_loss_mlp": 0.01027481, + "balance_loss_clip": 1.05558908, + "balance_loss_mlp": 1.01905262, + "epoch": 0.4433355377863284, + "flos": 21762261809280.0, + "grad_norm": 2.308062198843961, + "language_loss": 0.90178633, + "learning_rate": 2.460108739777936e-06, + "loss": 0.92383969, + "num_input_tokens_seen": 79372765, + "step": 3687, + "time_per_iteration": 2.495622158050537 + }, + { + "auxiliary_loss_clip": 0.01157233, + "auxiliary_loss_mlp": 0.01027074, + "balance_loss_clip": 1.05376327, + "balance_loss_mlp": 1.01874685, + "epoch": 0.44345578067696745, + "flos": 20084479488000.0, + "grad_norm": 1.6417739206907631, + "language_loss": 0.76667941, + "learning_rate": 2.4593506270014656e-06, + "loss": 0.78852248, + "num_input_tokens_seen": 79391735, + "step": 3688, + "time_per_iteration": 2.5313634872436523 + }, + { + "auxiliary_loss_clip": 0.01159865, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.04774332, + "balance_loss_mlp": 1.02080965, + "epoch": 0.44357602356760656, + "flos": 24169497528960.0, + "grad_norm": 1.5256739137076971, + "language_loss": 0.81878138, + "learning_rate": 2.45859244454187e-06, + "loss": 0.84066993, + "num_input_tokens_seen": 79411525, + "step": 3689, + "time_per_iteration": 2.55718994140625 + }, + { + "auxiliary_loss_clip": 0.01169388, + "auxiliary_loss_mlp": 0.01026954, + "balance_loss_clip": 1.05301714, + "balance_loss_mlp": 1.01926529, + "epoch": 0.44369626645824567, + "flos": 22707717644160.0, + "grad_norm": 1.8442170236383653, + "language_loss": 0.66023189, + "learning_rate": 2.4578341925141655e-06, + "loss": 0.68219531, + "num_input_tokens_seen": 79430740, + "step": 3690, + "time_per_iteration": 2.49944806098938 + }, + { + "auxiliary_loss_clip": 0.01179213, + "auxiliary_loss_mlp": 0.01025429, + "balance_loss_clip": 1.05356419, + "balance_loss_mlp": 1.01713169, + "epoch": 0.4438165093488847, + "flos": 38030225420160.0, + "grad_norm": 2.7773661089092756, + "language_loss": 0.72537309, + "learning_rate": 2.457075871033378e-06, + "loss": 0.74741954, + "num_input_tokens_seen": 79452615, + "step": 3691, + "time_per_iteration": 2.6270432472229004 + }, + { + "auxiliary_loss_clip": 0.01140988, + "auxiliary_loss_mlp": 0.01025142, + "balance_loss_clip": 1.04884505, + "balance_loss_mlp": 1.01692867, + "epoch": 0.44393675223952384, + "flos": 15523213996800.0, + "grad_norm": 2.179152535185978, + "language_loss": 0.88103139, + "learning_rate": 2.4563174802145445e-06, + "loss": 0.90269268, + "num_input_tokens_seen": 79469865, + "step": 3692, + "time_per_iteration": 2.520759344100952 + }, + { + "auxiliary_loss_clip": 0.01062828, + "auxiliary_loss_mlp": 0.0100075, + "balance_loss_clip": 1.01732206, + "balance_loss_mlp": 0.99951631, + "epoch": 0.44405699513016295, + "flos": 64574893779840.0, + "grad_norm": 0.6387990750543886, + "language_loss": 0.48657721, + "learning_rate": 2.455559020172712e-06, + "loss": 0.507213, + "num_input_tokens_seen": 79537220, + "step": 3693, + "time_per_iteration": 3.1723289489746094 + }, + { + "auxiliary_loss_clip": 0.01134298, + "auxiliary_loss_mlp": 0.01038165, + "balance_loss_clip": 1.0535543, + "balance_loss_mlp": 1.03004694, + "epoch": 0.444177238020802, + "flos": 23987394552960.0, + "grad_norm": 1.9687172400303308, + "language_loss": 0.8961271, + "learning_rate": 2.4548004910229385e-06, + "loss": 0.91785169, + "num_input_tokens_seen": 79554795, + "step": 3694, + "time_per_iteration": 2.5971620082855225 + }, + { + "auxiliary_loss_clip": 0.01174203, + "auxiliary_loss_mlp": 0.00763347, + "balance_loss_clip": 1.0533967, + "balance_loss_mlp": 1.00113714, + "epoch": 0.4442974809114411, + "flos": 22563069575040.0, + "grad_norm": 2.263028480673852, + "language_loss": 0.86778724, + "learning_rate": 2.4540418928802913e-06, + "loss": 0.8871628, + "num_input_tokens_seen": 79573530, + "step": 3695, + "time_per_iteration": 2.503726005554199 + }, + { + "auxiliary_loss_clip": 0.01158559, + "auxiliary_loss_mlp": 0.01029304, + "balance_loss_clip": 1.0514673, + "balance_loss_mlp": 1.02048254, + "epoch": 0.4444177238020802, + "flos": 17675699483520.0, + "grad_norm": 2.2316207472432965, + "language_loss": 0.66515034, + "learning_rate": 2.4532832258598506e-06, + "loss": 0.68702888, + "num_input_tokens_seen": 79591360, + "step": 3696, + "time_per_iteration": 2.4810051918029785 + }, + { + "auxiliary_loss_clip": 0.01184331, + "auxiliary_loss_mlp": 0.0102384, + "balance_loss_clip": 1.05440581, + "balance_loss_mlp": 1.01572156, + "epoch": 0.4445379666927193, + "flos": 28621594609920.0, + "grad_norm": 1.6934592007271958, + "language_loss": 0.80433381, + "learning_rate": 2.4525244900767047e-06, + "loss": 0.82641554, + "num_input_tokens_seen": 79612175, + "step": 3697, + "time_per_iteration": 4.251093626022339 + }, + { + "auxiliary_loss_clip": 0.0106844, + "auxiliary_loss_mlp": 0.01002289, + "balance_loss_clip": 1.01861727, + "balance_loss_mlp": 1.00106668, + "epoch": 0.4446582095833584, + "flos": 70487370115200.0, + "grad_norm": 0.7653794317204278, + "language_loss": 0.60457045, + "learning_rate": 2.4517656856459536e-06, + "loss": 0.62527776, + "num_input_tokens_seen": 79678020, + "step": 3698, + "time_per_iteration": 3.1677353382110596 + }, + { + "auxiliary_loss_clip": 0.0117077, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.05076122, + "balance_loss_mlp": 1.02122974, + "epoch": 0.4447784524739975, + "flos": 26505199313280.0, + "grad_norm": 1.6868393364186658, + "language_loss": 0.678105, + "learning_rate": 2.4510068126827073e-06, + "loss": 0.70010459, + "num_input_tokens_seen": 79699020, + "step": 3699, + "time_per_iteration": 3.4156439304351807 + }, + { + "auxiliary_loss_clip": 0.01157505, + "auxiliary_loss_mlp": 0.01035098, + "balance_loss_clip": 1.0514816, + "balance_loss_mlp": 1.02700949, + "epoch": 0.44489869536463655, + "flos": 11656209553920.0, + "grad_norm": 2.287557304991087, + "language_loss": 0.81624317, + "learning_rate": 2.450247871302086e-06, + "loss": 0.83816916, + "num_input_tokens_seen": 79716795, + "step": 3700, + "time_per_iteration": 2.5261268615722656 + }, + { + "auxiliary_loss_clip": 0.01176728, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.05423951, + "balance_loss_mlp": 1.01975894, + "epoch": 0.44501893825527566, + "flos": 20448469958400.0, + "grad_norm": 3.3351549287535143, + "language_loss": 0.8280136, + "learning_rate": 2.44948886161922e-06, + "loss": 0.85005695, + "num_input_tokens_seen": 79735810, + "step": 3701, + "time_per_iteration": 2.5061447620391846 + }, + { + "auxiliary_loss_clip": 0.01173655, + "auxiliary_loss_mlp": 0.01026141, + "balance_loss_clip": 1.05308807, + "balance_loss_mlp": 1.01861048, + "epoch": 0.4451391811459148, + "flos": 18261079430400.0, + "grad_norm": 1.6520079475904499, + "language_loss": 0.84643602, + "learning_rate": 2.4487297837492524e-06, + "loss": 0.86843401, + "num_input_tokens_seen": 79754975, + "step": 3702, + "time_per_iteration": 2.486421823501587 + }, + { + "auxiliary_loss_clip": 0.01140515, + "auxiliary_loss_mlp": 0.01027589, + "balance_loss_clip": 1.04842901, + "balance_loss_mlp": 1.01917863, + "epoch": 0.44525942403655383, + "flos": 16910155895040.0, + "grad_norm": 2.3778325551524113, + "language_loss": 0.62149262, + "learning_rate": 2.4479706378073323e-06, + "loss": 0.64317369, + "num_input_tokens_seen": 79773515, + "step": 3703, + "time_per_iteration": 2.5334553718566895 + }, + { + "auxiliary_loss_clip": 0.0113377, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.04695976, + "balance_loss_mlp": 1.01877618, + "epoch": 0.44537966692719294, + "flos": 23258838994560.0, + "grad_norm": 1.630253800030373, + "language_loss": 0.83667994, + "learning_rate": 2.447211423908623e-06, + "loss": 0.85828519, + "num_input_tokens_seen": 79793560, + "step": 3704, + "time_per_iteration": 2.5876448154449463 + }, + { + "auxiliary_loss_clip": 0.01173591, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.05248618, + "balance_loss_mlp": 1.02210259, + "epoch": 0.445499909817832, + "flos": 21724160457600.0, + "grad_norm": 1.9142539895790645, + "language_loss": 0.74731648, + "learning_rate": 2.4464521421682966e-06, + "loss": 0.76935327, + "num_input_tokens_seen": 79811150, + "step": 3705, + "time_per_iteration": 2.4882283210754395 + }, + { + "auxiliary_loss_clip": 0.01165002, + "auxiliary_loss_mlp": 0.01023849, + "balance_loss_clip": 1.05247331, + "balance_loss_mlp": 1.01657772, + "epoch": 0.4456201527084711, + "flos": 23987969170560.0, + "grad_norm": 1.3561734571072717, + "language_loss": 0.87657183, + "learning_rate": 2.4456927927015345e-06, + "loss": 0.89846039, + "num_input_tokens_seen": 79832190, + "step": 3706, + "time_per_iteration": 2.524193048477173 + }, + { + "auxiliary_loss_clip": 0.01162726, + "auxiliary_loss_mlp": 0.01028592, + "balance_loss_clip": 1.05380368, + "balance_loss_mlp": 1.020051, + "epoch": 0.4457403955991102, + "flos": 18807065136000.0, + "grad_norm": 2.6263783064131703, + "language_loss": 0.76170599, + "learning_rate": 2.4449333756235307e-06, + "loss": 0.78361917, + "num_input_tokens_seen": 79848905, + "step": 3707, + "time_per_iteration": 2.497467517852783 + }, + { + "auxiliary_loss_clip": 0.01177752, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.05532622, + "balance_loss_mlp": 1.02445197, + "epoch": 0.4458606384897493, + "flos": 19207756327680.0, + "grad_norm": 2.7197149305623176, + "language_loss": 0.78645122, + "learning_rate": 2.4441738910494876e-06, + "loss": 0.80855674, + "num_input_tokens_seen": 79863640, + "step": 3708, + "time_per_iteration": 2.4795079231262207 + }, + { + "auxiliary_loss_clip": 0.01162983, + "auxiliary_loss_mlp": 0.01033393, + "balance_loss_clip": 1.04941607, + "balance_loss_mlp": 1.02562702, + "epoch": 0.4459808813803884, + "flos": 21361283308800.0, + "grad_norm": 1.8220040188784998, + "language_loss": 0.8201139, + "learning_rate": 2.4434143390946176e-06, + "loss": 0.84207767, + "num_input_tokens_seen": 79882450, + "step": 3709, + "time_per_iteration": 2.5214312076568604 + }, + { + "auxiliary_loss_clip": 0.01140065, + "auxiliary_loss_mlp": 0.01029536, + "balance_loss_clip": 1.04788446, + "balance_loss_mlp": 1.02152491, + "epoch": 0.4461011242710275, + "flos": 23288967527040.0, + "grad_norm": 1.8845129218249341, + "language_loss": 0.85344696, + "learning_rate": 2.4426547198741457e-06, + "loss": 0.87514305, + "num_input_tokens_seen": 79900655, + "step": 3710, + "time_per_iteration": 2.5798499584198 + }, + { + "auxiliary_loss_clip": 0.0112993, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.05163264, + "balance_loss_mlp": 1.0263797, + "epoch": 0.44622136716166655, + "flos": 20193001453440.0, + "grad_norm": 2.0798719127479317, + "language_loss": 0.74298322, + "learning_rate": 2.441895033503305e-06, + "loss": 0.76462823, + "num_input_tokens_seen": 79918575, + "step": 3711, + "time_per_iteration": 2.569758176803589 + }, + { + "auxiliary_loss_clip": 0.01169247, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.05139267, + "balance_loss_mlp": 1.02060807, + "epoch": 0.44634161005230566, + "flos": 21283033530240.0, + "grad_norm": 1.7054267741403497, + "language_loss": 0.81553555, + "learning_rate": 2.4411352800973375e-06, + "loss": 0.83751976, + "num_input_tokens_seen": 79937010, + "step": 3712, + "time_per_iteration": 2.477693557739258 + }, + { + "auxiliary_loss_clip": 0.0113722, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.04651976, + "balance_loss_mlp": 1.01777184, + "epoch": 0.44646185294294477, + "flos": 22929358515840.0, + "grad_norm": 2.4119785650077357, + "language_loss": 0.75168568, + "learning_rate": 2.4403754597715005e-06, + "loss": 0.77332389, + "num_input_tokens_seen": 79956455, + "step": 3713, + "time_per_iteration": 2.566734552383423 + }, + { + "auxiliary_loss_clip": 0.01158966, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.04733562, + "balance_loss_mlp": 1.0254631, + "epoch": 0.4465820958335838, + "flos": 22637692080000.0, + "grad_norm": 2.1204060532456257, + "language_loss": 0.92439157, + "learning_rate": 2.4396155726410553e-06, + "loss": 0.9463253, + "num_input_tokens_seen": 79975065, + "step": 3714, + "time_per_iteration": 2.5298476219177246 + }, + { + "auxiliary_loss_clip": 0.01175295, + "auxiliary_loss_mlp": 0.01026241, + "balance_loss_clip": 1.05045128, + "balance_loss_mlp": 1.01858163, + "epoch": 0.44670233872422294, + "flos": 22672525294080.0, + "grad_norm": 2.544655582669896, + "language_loss": 0.91009068, + "learning_rate": 2.438855618821278e-06, + "loss": 0.93210602, + "num_input_tokens_seen": 79990865, + "step": 3715, + "time_per_iteration": 2.4794702529907227 + }, + { + "auxiliary_loss_clip": 0.0115969, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.04620743, + "balance_loss_mlp": 1.02130401, + "epoch": 0.44682258161486205, + "flos": 23582178247680.0, + "grad_norm": 1.8705565709339231, + "language_loss": 0.67755759, + "learning_rate": 2.4380955984274517e-06, + "loss": 0.69945288, + "num_input_tokens_seen": 80009520, + "step": 3716, + "time_per_iteration": 2.5032546520233154 + }, + { + "auxiliary_loss_clip": 0.01169631, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.05081761, + "balance_loss_mlp": 1.02510357, + "epoch": 0.4469428245055011, + "flos": 26501356558080.0, + "grad_norm": 3.4609074367474975, + "language_loss": 0.76860845, + "learning_rate": 2.4373355115748716e-06, + "loss": 0.79063451, + "num_input_tokens_seen": 80030350, + "step": 3717, + "time_per_iteration": 2.5246150493621826 + }, + { + "auxiliary_loss_clip": 0.01151625, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.05128646, + "balance_loss_mlp": 1.02572346, + "epoch": 0.4470630673961402, + "flos": 21504925797120.0, + "grad_norm": 1.8537523171985857, + "language_loss": 0.72111791, + "learning_rate": 2.436575358378842e-06, + "loss": 0.74297494, + "num_input_tokens_seen": 80049840, + "step": 3718, + "time_per_iteration": 2.5317256450653076 + }, + { + "auxiliary_loss_clip": 0.01164819, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.05181468, + "balance_loss_mlp": 1.02279425, + "epoch": 0.44718331028677927, + "flos": 16173986653440.0, + "grad_norm": 2.7015878969059486, + "language_loss": 0.83165389, + "learning_rate": 2.4358151389546782e-06, + "loss": 0.85361856, + "num_input_tokens_seen": 80066525, + "step": 3719, + "time_per_iteration": 2.4966166019439697 + }, + { + "auxiliary_loss_clip": 0.0118438, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.0531981, + "balance_loss_mlp": 1.0215404, + "epoch": 0.4473035531774184, + "flos": 19681238430720.0, + "grad_norm": 2.424375245522688, + "language_loss": 0.7591967, + "learning_rate": 2.4350548534177035e-06, + "loss": 0.78133774, + "num_input_tokens_seen": 80083355, + "step": 3720, + "time_per_iteration": 2.42197847366333 + }, + { + "auxiliary_loss_clip": 0.01139943, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.04715157, + "balance_loss_mlp": 1.0250349, + "epoch": 0.4474237960680575, + "flos": 41427590515200.0, + "grad_norm": 1.629153179499626, + "language_loss": 0.66596258, + "learning_rate": 2.434294501883254e-06, + "loss": 0.68769109, + "num_input_tokens_seen": 80106450, + "step": 3721, + "time_per_iteration": 2.727205991744995 + }, + { + "auxiliary_loss_clip": 0.01147386, + "auxiliary_loss_mlp": 0.0102756, + "balance_loss_clip": 1.04695988, + "balance_loss_mlp": 1.01903045, + "epoch": 0.44754403895869654, + "flos": 22891328991360.0, + "grad_norm": 1.7227799616800592, + "language_loss": 0.65845042, + "learning_rate": 2.433534084466674e-06, + "loss": 0.68019992, + "num_input_tokens_seen": 80125670, + "step": 3722, + "time_per_iteration": 2.515187978744507 + }, + { + "auxiliary_loss_clip": 0.01181451, + "auxiliary_loss_mlp": 0.0102689, + "balance_loss_clip": 1.05286002, + "balance_loss_mlp": 1.01918912, + "epoch": 0.44766428184933565, + "flos": 25630271832960.0, + "grad_norm": 1.4625862018617437, + "language_loss": 0.70492649, + "learning_rate": 2.4327736012833178e-06, + "loss": 0.72700989, + "num_input_tokens_seen": 80147390, + "step": 3723, + "time_per_iteration": 3.2620458602905273 + }, + { + "auxiliary_loss_clip": 0.01171568, + "auxiliary_loss_mlp": 0.01029017, + "balance_loss_clip": 1.05356276, + "balance_loss_mlp": 1.0209105, + "epoch": 0.44778452473997477, + "flos": 20448972748800.0, + "grad_norm": 2.095383494222623, + "language_loss": 0.76471847, + "learning_rate": 2.4320130524485506e-06, + "loss": 0.78672433, + "num_input_tokens_seen": 80166185, + "step": 3724, + "time_per_iteration": 3.2452681064605713 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.01022329, + "balance_loss_clip": 1.05411303, + "balance_loss_mlp": 1.01522994, + "epoch": 0.4479047676306138, + "flos": 21975462984960.0, + "grad_norm": 1.8680086759214134, + "language_loss": 0.79612243, + "learning_rate": 2.431252438077746e-06, + "loss": 0.81785154, + "num_input_tokens_seen": 80185685, + "step": 3725, + "time_per_iteration": 2.519768238067627 + }, + { + "auxiliary_loss_clip": 0.01173629, + "auxiliary_loss_mlp": 0.00763077, + "balance_loss_clip": 1.05078328, + "balance_loss_mlp": 1.00109482, + "epoch": 0.44802501052125293, + "flos": 21467219495040.0, + "grad_norm": 2.5549038487077778, + "language_loss": 0.7718066, + "learning_rate": 2.4304917582862906e-06, + "loss": 0.79117364, + "num_input_tokens_seen": 80204865, + "step": 3726, + "time_per_iteration": 3.362299680709839 + }, + { + "auxiliary_loss_clip": 0.01184131, + "auxiliary_loss_mlp": 0.01025132, + "balance_loss_clip": 1.05309319, + "balance_loss_mlp": 1.01709139, + "epoch": 0.44814525341189204, + "flos": 22126970551680.0, + "grad_norm": 1.8926506982205589, + "language_loss": 0.87526488, + "learning_rate": 2.4297310131895774e-06, + "loss": 0.89735758, + "num_input_tokens_seen": 80223410, + "step": 3727, + "time_per_iteration": 2.4442331790924072 + }, + { + "auxiliary_loss_clip": 0.01168978, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.05078506, + "balance_loss_mlp": 1.02210224, + "epoch": 0.4482654963025311, + "flos": 16653933204480.0, + "grad_norm": 1.9819595953526872, + "language_loss": 0.74292016, + "learning_rate": 2.4289702029030113e-06, + "loss": 0.76491523, + "num_input_tokens_seen": 80240880, + "step": 3728, + "time_per_iteration": 2.607011556625366 + }, + { + "auxiliary_loss_clip": 0.01172054, + "auxiliary_loss_mlp": 0.01027086, + "balance_loss_clip": 1.05557394, + "balance_loss_mlp": 1.01892662, + "epoch": 0.4483857391931702, + "flos": 18841251905280.0, + "grad_norm": 1.9413602089933866, + "language_loss": 0.82940733, + "learning_rate": 2.4282093275420057e-06, + "loss": 0.85139871, + "num_input_tokens_seen": 80259910, + "step": 3729, + "time_per_iteration": 2.4701099395751953 + }, + { + "auxiliary_loss_clip": 0.01174453, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.05320108, + "balance_loss_mlp": 1.02220893, + "epoch": 0.4485059820838093, + "flos": 20372590477440.0, + "grad_norm": 2.210423358115156, + "language_loss": 0.70486093, + "learning_rate": 2.4274483872219863e-06, + "loss": 0.726906, + "num_input_tokens_seen": 80277270, + "step": 3730, + "time_per_iteration": 2.466702699661255 + }, + { + "auxiliary_loss_clip": 0.0116792, + "auxiliary_loss_mlp": 0.0102636, + "balance_loss_clip": 1.05118179, + "balance_loss_mlp": 1.01833725, + "epoch": 0.4486262249744484, + "flos": 20047742853120.0, + "grad_norm": 1.7454530163240416, + "language_loss": 0.93713659, + "learning_rate": 2.426687382058386e-06, + "loss": 0.95907933, + "num_input_tokens_seen": 80295550, + "step": 3731, + "time_per_iteration": 2.4585182666778564 + }, + { + "auxiliary_loss_clip": 0.01067467, + "auxiliary_loss_mlp": 0.01006457, + "balance_loss_clip": 1.01850903, + "balance_loss_mlp": 1.00516403, + "epoch": 0.4487464678650875, + "flos": 64595684776320.0, + "grad_norm": 0.8590520474081823, + "language_loss": 0.5984298, + "learning_rate": 2.425926312166649e-06, + "loss": 0.61916906, + "num_input_tokens_seen": 80348425, + "step": 3732, + "time_per_iteration": 2.9267184734344482 + }, + { + "auxiliary_loss_clip": 0.01160521, + "auxiliary_loss_mlp": 0.01021658, + "balance_loss_clip": 1.05208969, + "balance_loss_mlp": 1.01323056, + "epoch": 0.4488667107557266, + "flos": 20769798049920.0, + "grad_norm": 6.679673392191002, + "language_loss": 0.73042786, + "learning_rate": 2.42516517766223e-06, + "loss": 0.75224972, + "num_input_tokens_seen": 80366505, + "step": 3733, + "time_per_iteration": 2.4960789680480957 + }, + { + "auxiliary_loss_clip": 0.01183955, + "auxiliary_loss_mlp": 0.01024717, + "balance_loss_clip": 1.05496144, + "balance_loss_mlp": 1.01664627, + "epoch": 0.44898695364636565, + "flos": 23951735326080.0, + "grad_norm": 1.9761527297277013, + "language_loss": 0.68137336, + "learning_rate": 2.4244039786605907e-06, + "loss": 0.7034601, + "num_input_tokens_seen": 80387510, + "step": 3734, + "time_per_iteration": 2.4876041412353516 + }, + { + "auxiliary_loss_clip": 0.01129353, + "auxiliary_loss_mlp": 0.01025759, + "balance_loss_clip": 1.04687738, + "balance_loss_mlp": 1.01719952, + "epoch": 0.44910719653700476, + "flos": 18624351628800.0, + "grad_norm": 2.4086885357995347, + "language_loss": 0.82092834, + "learning_rate": 2.4236427152772055e-06, + "loss": 0.84247947, + "num_input_tokens_seen": 80405915, + "step": 3735, + "time_per_iteration": 2.5946853160858154 + }, + { + "auxiliary_loss_clip": 0.01037067, + "auxiliary_loss_mlp": 0.01002955, + "balance_loss_clip": 1.01624715, + "balance_loss_mlp": 1.00168562, + "epoch": 0.4492274394276438, + "flos": 57033435749760.0, + "grad_norm": 0.8293255790934682, + "language_loss": 0.57375425, + "learning_rate": 2.422881387627557e-06, + "loss": 0.59415454, + "num_input_tokens_seen": 80458365, + "step": 3736, + "time_per_iteration": 2.8463284969329834 + }, + { + "auxiliary_loss_clip": 0.01158857, + "auxiliary_loss_mlp": 0.01025173, + "balance_loss_clip": 1.05162191, + "balance_loss_mlp": 1.01723993, + "epoch": 0.4493476823182829, + "flos": 23254888498560.0, + "grad_norm": 1.7227491789603777, + "language_loss": 0.77247453, + "learning_rate": 2.422119995827139e-06, + "loss": 0.79431486, + "num_input_tokens_seen": 80478490, + "step": 3737, + "time_per_iteration": 2.5243077278137207 + }, + { + "auxiliary_loss_clip": 0.01176617, + "auxiliary_loss_mlp": 0.01026417, + "balance_loss_clip": 1.05473435, + "balance_loss_mlp": 1.01835823, + "epoch": 0.44946792520892204, + "flos": 15815131827840.0, + "grad_norm": 3.7111782120196093, + "language_loss": 0.73649776, + "learning_rate": 2.4213585399914528e-06, + "loss": 0.75852811, + "num_input_tokens_seen": 80495695, + "step": 3738, + "time_per_iteration": 2.468519926071167 + }, + { + "auxiliary_loss_clip": 0.01168802, + "auxiliary_loss_mlp": 0.01025363, + "balance_loss_clip": 1.0515393, + "balance_loss_mlp": 1.01734066, + "epoch": 0.4495881680995611, + "flos": 19610063631360.0, + "grad_norm": 1.8606439953643148, + "language_loss": 0.85420942, + "learning_rate": 2.4205970202360113e-06, + "loss": 0.87615108, + "num_input_tokens_seen": 80515260, + "step": 3739, + "time_per_iteration": 2.4823338985443115 + }, + { + "auxiliary_loss_clip": 0.0111885, + "auxiliary_loss_mlp": 0.01024503, + "balance_loss_clip": 1.04669237, + "balance_loss_mlp": 1.01606297, + "epoch": 0.4497084109902002, + "flos": 26031465815040.0, + "grad_norm": 2.063321858019845, + "language_loss": 0.78055829, + "learning_rate": 2.4198354366763354e-06, + "loss": 0.80199182, + "num_input_tokens_seen": 80533900, + "step": 3740, + "time_per_iteration": 2.605041265487671 + }, + { + "auxiliary_loss_clip": 0.01157755, + "auxiliary_loss_mlp": 0.01025193, + "balance_loss_clip": 1.04937935, + "balance_loss_mlp": 1.01673508, + "epoch": 0.4498286538808393, + "flos": 14793688771200.0, + "grad_norm": 2.8012099692571133, + "language_loss": 0.78789192, + "learning_rate": 2.4190737894279587e-06, + "loss": 0.80972135, + "num_input_tokens_seen": 80551270, + "step": 3741, + "time_per_iteration": 2.6010689735412598 + }, + { + "auxiliary_loss_clip": 0.01131117, + "auxiliary_loss_mlp": 0.01023322, + "balance_loss_clip": 1.04328048, + "balance_loss_mlp": 1.01546645, + "epoch": 0.44994889677147837, + "flos": 15450171690240.0, + "grad_norm": 2.3422488481332198, + "language_loss": 0.80865079, + "learning_rate": 2.4183120786064203e-06, + "loss": 0.83019519, + "num_input_tokens_seen": 80568145, + "step": 3742, + "time_per_iteration": 2.5368831157684326 + }, + { + "auxiliary_loss_clip": 0.01169678, + "auxiliary_loss_mlp": 0.00762594, + "balance_loss_clip": 1.05367708, + "balance_loss_mlp": 1.00108171, + "epoch": 0.4500691396621175, + "flos": 21798316085760.0, + "grad_norm": 2.384977892114976, + "language_loss": 0.85216409, + "learning_rate": 2.417550304327273e-06, + "loss": 0.87148678, + "num_input_tokens_seen": 80586185, + "step": 3743, + "time_per_iteration": 2.4989261627197266 + }, + { + "auxiliary_loss_clip": 0.01188, + "auxiliary_loss_mlp": 0.01033257, + "balance_loss_clip": 1.0553745, + "balance_loss_mlp": 1.02455449, + "epoch": 0.4501893825527566, + "flos": 32382016421760.0, + "grad_norm": 1.571051479951227, + "language_loss": 0.75958061, + "learning_rate": 2.4167884667060763e-06, + "loss": 0.78179318, + "num_input_tokens_seen": 80608895, + "step": 3744, + "time_per_iteration": 2.587252616882324 + }, + { + "auxiliary_loss_clip": 0.0115776, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.05077147, + "balance_loss_mlp": 1.02397776, + "epoch": 0.45030962544339564, + "flos": 16544944362240.0, + "grad_norm": 2.072541814647317, + "language_loss": 0.87478578, + "learning_rate": 2.4160265658584e-06, + "loss": 0.89668989, + "num_input_tokens_seen": 80623785, + "step": 3745, + "time_per_iteration": 2.4939701557159424 + }, + { + "auxiliary_loss_clip": 0.01175536, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.05430782, + "balance_loss_mlp": 1.01945853, + "epoch": 0.45042986833403476, + "flos": 19573039687680.0, + "grad_norm": 1.932321717943956, + "language_loss": 0.68001223, + "learning_rate": 2.4152646018998253e-06, + "loss": 0.7020461, + "num_input_tokens_seen": 80642735, + "step": 3746, + "time_per_iteration": 2.471222162246704 + }, + { + "auxiliary_loss_clip": 0.0116815, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.05150545, + "balance_loss_mlp": 1.02244496, + "epoch": 0.45055011122467387, + "flos": 23112467072640.0, + "grad_norm": 1.9366916915023034, + "language_loss": 0.71604025, + "learning_rate": 2.4145025749459403e-06, + "loss": 0.73802614, + "num_input_tokens_seen": 80663760, + "step": 3747, + "time_per_iteration": 2.5087404251098633 + }, + { + "auxiliary_loss_clip": 0.01099669, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.04708028, + "balance_loss_mlp": 1.02405715, + "epoch": 0.4506703541153129, + "flos": 19934623946880.0, + "grad_norm": 1.9239265678863988, + "language_loss": 0.70189524, + "learning_rate": 2.413740485112344e-06, + "loss": 0.72322083, + "num_input_tokens_seen": 80682100, + "step": 3748, + "time_per_iteration": 2.9243733882904053 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01025302, + "balance_loss_clip": 1.05158389, + "balance_loss_mlp": 1.01684368, + "epoch": 0.45079059700595203, + "flos": 19499530504320.0, + "grad_norm": 1.6229205417550048, + "language_loss": 0.81866008, + "learning_rate": 2.412978332514646e-06, + "loss": 0.84041512, + "num_input_tokens_seen": 80700880, + "step": 3749, + "time_per_iteration": 2.7466907501220703 + }, + { + "auxiliary_loss_clip": 0.01158646, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.05189157, + "balance_loss_mlp": 1.02208579, + "epoch": 0.4509108398965911, + "flos": 27636313570560.0, + "grad_norm": 2.2119337159457535, + "language_loss": 0.72291756, + "learning_rate": 2.4122161172684623e-06, + "loss": 0.74481583, + "num_input_tokens_seen": 80721675, + "step": 3750, + "time_per_iteration": 3.3403773307800293 + }, + { + "auxiliary_loss_clip": 0.01159567, + "auxiliary_loss_mlp": 0.01034491, + "balance_loss_clip": 1.05174088, + "balance_loss_mlp": 1.02564597, + "epoch": 0.4510310827872302, + "flos": 20995712640000.0, + "grad_norm": 2.1625822947518025, + "language_loss": 0.84358662, + "learning_rate": 2.4114538394894216e-06, + "loss": 0.86552715, + "num_input_tokens_seen": 80739315, + "step": 3751, + "time_per_iteration": 3.458739995956421 + }, + { + "auxiliary_loss_clip": 0.01153223, + "auxiliary_loss_mlp": 0.0102488, + "balance_loss_clip": 1.04638326, + "balance_loss_mlp": 1.01681268, + "epoch": 0.4511513256778693, + "flos": 16216684945920.0, + "grad_norm": 1.8254439226027996, + "language_loss": 0.82928562, + "learning_rate": 2.410691499293161e-06, + "loss": 0.85106659, + "num_input_tokens_seen": 80757470, + "step": 3752, + "time_per_iteration": 4.042910575866699 + }, + { + "auxiliary_loss_clip": 0.011696, + "auxiliary_loss_mlp": 0.01030384, + "balance_loss_clip": 1.05161226, + "balance_loss_mlp": 1.02201581, + "epoch": 0.45127156856850836, + "flos": 25186702780800.0, + "grad_norm": 1.5709518140959784, + "language_loss": 0.73562324, + "learning_rate": 2.409929096795326e-06, + "loss": 0.75762308, + "num_input_tokens_seen": 80777840, + "step": 3753, + "time_per_iteration": 2.5137381553649902 + }, + { + "auxiliary_loss_clip": 0.01170471, + "auxiliary_loss_mlp": 0.01028193, + "balance_loss_clip": 1.05150127, + "balance_loss_mlp": 1.01957989, + "epoch": 0.4513918114591475, + "flos": 20412523422720.0, + "grad_norm": 1.8551457961333016, + "language_loss": 0.79209471, + "learning_rate": 2.409166632111573e-06, + "loss": 0.81408131, + "num_input_tokens_seen": 80795975, + "step": 3754, + "time_per_iteration": 2.475938558578491 + }, + { + "auxiliary_loss_clip": 0.01178995, + "auxiliary_loss_mlp": 0.01022083, + "balance_loss_clip": 1.05363142, + "balance_loss_mlp": 1.01315987, + "epoch": 0.4515120543497866, + "flos": 26648482665600.0, + "grad_norm": 1.8103435870234974, + "language_loss": 0.80255926, + "learning_rate": 2.4084041053575674e-06, + "loss": 0.82457006, + "num_input_tokens_seen": 80815395, + "step": 3755, + "time_per_iteration": 2.5258824825286865 + }, + { + "auxiliary_loss_clip": 0.01162796, + "auxiliary_loss_mlp": 0.01025666, + "balance_loss_clip": 1.05391181, + "balance_loss_mlp": 1.01749444, + "epoch": 0.45163229724042564, + "flos": 20595093275520.0, + "grad_norm": 2.0421881568986096, + "language_loss": 0.72608399, + "learning_rate": 2.4076415166489834e-06, + "loss": 0.74796867, + "num_input_tokens_seen": 80834805, + "step": 3756, + "time_per_iteration": 2.512739896774292 + }, + { + "auxiliary_loss_clip": 0.01134188, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.04872036, + "balance_loss_mlp": 1.02252066, + "epoch": 0.45175254013106475, + "flos": 21689004021120.0, + "grad_norm": 1.8024042824236737, + "language_loss": 0.79231262, + "learning_rate": 2.406878866101506e-06, + "loss": 0.81396043, + "num_input_tokens_seen": 80853770, + "step": 3757, + "time_per_iteration": 2.597973585128784 + }, + { + "auxiliary_loss_clip": 0.01185426, + "auxiliary_loss_mlp": 0.0102436, + "balance_loss_clip": 1.05587435, + "balance_loss_mlp": 1.01655757, + "epoch": 0.45187278302170386, + "flos": 18878850466560.0, + "grad_norm": 3.895163206210726, + "language_loss": 0.78340054, + "learning_rate": 2.4061161538308273e-06, + "loss": 0.80549836, + "num_input_tokens_seen": 80870615, + "step": 3758, + "time_per_iteration": 2.4360644817352295 + }, + { + "auxiliary_loss_clip": 0.01170109, + "auxiliary_loss_mlp": 0.01024014, + "balance_loss_clip": 1.05297446, + "balance_loss_mlp": 1.01568735, + "epoch": 0.4519930259123429, + "flos": 18582479349120.0, + "grad_norm": 1.9301838786840155, + "language_loss": 0.89062703, + "learning_rate": 2.4053533799526523e-06, + "loss": 0.91256827, + "num_input_tokens_seen": 80886335, + "step": 3759, + "time_per_iteration": 2.442965030670166 + }, + { + "auxiliary_loss_clip": 0.01152308, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.05148864, + "balance_loss_mlp": 1.02131653, + "epoch": 0.452113268802982, + "flos": 25192377129600.0, + "grad_norm": 1.7067164861256747, + "language_loss": 0.86274189, + "learning_rate": 2.404590544582691e-06, + "loss": 0.88456011, + "num_input_tokens_seen": 80904570, + "step": 3760, + "time_per_iteration": 2.5384647846221924 + }, + { + "auxiliary_loss_clip": 0.01130464, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.04317689, + "balance_loss_mlp": 1.0208714, + "epoch": 0.45223351169362114, + "flos": 39378922312320.0, + "grad_norm": 1.6623132144517105, + "language_loss": 0.80983478, + "learning_rate": 2.403827647836666e-06, + "loss": 0.83143228, + "num_input_tokens_seen": 80925125, + "step": 3761, + "time_per_iteration": 2.747931957244873 + }, + { + "auxiliary_loss_clip": 0.0118636, + "auxiliary_loss_mlp": 0.01029778, + "balance_loss_clip": 1.05330157, + "balance_loss_mlp": 1.02107, + "epoch": 0.4523537545842602, + "flos": 21582169994880.0, + "grad_norm": 1.8962078429385896, + "language_loss": 0.69320178, + "learning_rate": 2.4030646898303075e-06, + "loss": 0.71536314, + "num_input_tokens_seen": 80946615, + "step": 3762, + "time_per_iteration": 2.4746901988983154 + }, + { + "auxiliary_loss_clip": 0.01160385, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.0502826, + "balance_loss_mlp": 1.02385974, + "epoch": 0.4524739974748993, + "flos": 28439527547520.0, + "grad_norm": 2.4537052550876837, + "language_loss": 0.82091081, + "learning_rate": 2.4023016706793566e-06, + "loss": 0.84283721, + "num_input_tokens_seen": 80966410, + "step": 3763, + "time_per_iteration": 2.5834741592407227 + }, + { + "auxiliary_loss_clip": 0.01051717, + "auxiliary_loss_mlp": 0.0100653, + "balance_loss_clip": 1.0119648, + "balance_loss_mlp": 1.00515938, + "epoch": 0.4525942403655384, + "flos": 61556492148480.0, + "grad_norm": 0.7618802389366243, + "language_loss": 0.56869155, + "learning_rate": 2.401538590499561e-06, + "loss": 0.58927405, + "num_input_tokens_seen": 81026865, + "step": 3764, + "time_per_iteration": 3.1445624828338623 + }, + { + "auxiliary_loss_clip": 0.01173418, + "auxiliary_loss_mlp": 0.00763228, + "balance_loss_clip": 1.05290341, + "balance_loss_mlp": 1.00113273, + "epoch": 0.45271448325617747, + "flos": 27529838680320.0, + "grad_norm": 1.8537469799106148, + "language_loss": 0.71815532, + "learning_rate": 2.400775449406682e-06, + "loss": 0.73752177, + "num_input_tokens_seen": 81050060, + "step": 3765, + "time_per_iteration": 2.57112717628479 + }, + { + "auxiliary_loss_clip": 0.01169249, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.04965043, + "balance_loss_mlp": 1.02004886, + "epoch": 0.4528347261468166, + "flos": 22452608275200.0, + "grad_norm": 1.9645406092297686, + "language_loss": 0.72786027, + "learning_rate": 2.400012247516485e-06, + "loss": 0.7498337, + "num_input_tokens_seen": 81070625, + "step": 3766, + "time_per_iteration": 2.506150007247925 + }, + { + "auxiliary_loss_clip": 0.01146299, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.04746234, + "balance_loss_mlp": 1.02121735, + "epoch": 0.45295496903745563, + "flos": 21103875469440.0, + "grad_norm": 2.427905669369554, + "language_loss": 0.90125823, + "learning_rate": 2.3992489849447484e-06, + "loss": 0.92301667, + "num_input_tokens_seen": 81089080, + "step": 3767, + "time_per_iteration": 2.5660502910614014 + }, + { + "auxiliary_loss_clip": 0.0114777, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.04859447, + "balance_loss_mlp": 1.01940656, + "epoch": 0.45307521192809475, + "flos": 23221168606080.0, + "grad_norm": 1.627105619564824, + "language_loss": 0.78802955, + "learning_rate": 2.3984856618072584e-06, + "loss": 0.80978358, + "num_input_tokens_seen": 81109115, + "step": 3768, + "time_per_iteration": 2.5684854984283447 + }, + { + "auxiliary_loss_clip": 0.01146346, + "auxiliary_loss_mlp": 0.01029719, + "balance_loss_clip": 1.04948604, + "balance_loss_mlp": 1.02142763, + "epoch": 0.45319545481873386, + "flos": 15560094286080.0, + "grad_norm": 2.1016198605783796, + "language_loss": 0.73658037, + "learning_rate": 2.3977222782198098e-06, + "loss": 0.75834101, + "num_input_tokens_seen": 81127750, + "step": 3769, + "time_per_iteration": 2.580767869949341 + }, + { + "auxiliary_loss_clip": 0.01132964, + "auxiliary_loss_mlp": 0.01034897, + "balance_loss_clip": 1.04704392, + "balance_loss_mlp": 1.02558637, + "epoch": 0.4533156977093729, + "flos": 21944759834880.0, + "grad_norm": 1.9003067310195882, + "language_loss": 0.75299728, + "learning_rate": 2.3969588342982077e-06, + "loss": 0.77467591, + "num_input_tokens_seen": 81147125, + "step": 3770, + "time_per_iteration": 2.583207368850708 + }, + { + "auxiliary_loss_clip": 0.01170276, + "auxiliary_loss_mlp": 0.01028146, + "balance_loss_clip": 1.054703, + "balance_loss_mlp": 1.01971185, + "epoch": 0.453435940600012, + "flos": 24242180699520.0, + "grad_norm": 1.6372065367677662, + "language_loss": 0.72546375, + "learning_rate": 2.396195330158267e-06, + "loss": 0.74744791, + "num_input_tokens_seen": 81167015, + "step": 3771, + "time_per_iteration": 2.5195019245147705 + }, + { + "auxiliary_loss_clip": 0.01186085, + "auxiliary_loss_mlp": 0.01026434, + "balance_loss_clip": 1.05400765, + "balance_loss_mlp": 1.01817286, + "epoch": 0.45355618349065113, + "flos": 23440367352960.0, + "grad_norm": 2.696660634704043, + "language_loss": 0.79439187, + "learning_rate": 2.3954317659158094e-06, + "loss": 0.81651706, + "num_input_tokens_seen": 81187350, + "step": 3772, + "time_per_iteration": 2.480041027069092 + }, + { + "auxiliary_loss_clip": 0.01080928, + "auxiliary_loss_mlp": 0.01002106, + "balance_loss_clip": 1.01489878, + "balance_loss_mlp": 1.00081241, + "epoch": 0.4536764263812902, + "flos": 66903161448960.0, + "grad_norm": 0.8916204632967881, + "language_loss": 0.56986022, + "learning_rate": 2.394668141686667e-06, + "loss": 0.59069061, + "num_input_tokens_seen": 81249315, + "step": 3773, + "time_per_iteration": 3.0579044818878174 + }, + { + "auxiliary_loss_clip": 0.01166981, + "auxiliary_loss_mlp": 0.0102544, + "balance_loss_clip": 1.05021739, + "balance_loss_mlp": 1.01754224, + "epoch": 0.4537966692719293, + "flos": 42739766254080.0, + "grad_norm": 1.7900573149658712, + "language_loss": 0.69327211, + "learning_rate": 2.3939044575866813e-06, + "loss": 0.71519631, + "num_input_tokens_seen": 81272065, + "step": 3774, + "time_per_iteration": 2.6814446449279785 + }, + { + "auxiliary_loss_clip": 0.01152601, + "auxiliary_loss_mlp": 0.00763094, + "balance_loss_clip": 1.04951715, + "balance_loss_mlp": 1.00103617, + "epoch": 0.4539169121625684, + "flos": 35549480517120.0, + "grad_norm": 4.877670164451679, + "language_loss": 0.75689042, + "learning_rate": 2.3931407137317024e-06, + "loss": 0.77604735, + "num_input_tokens_seen": 81292220, + "step": 3775, + "time_per_iteration": 2.627772569656372 + }, + { + "auxiliary_loss_clip": 0.01141251, + "auxiliary_loss_mlp": 0.01034075, + "balance_loss_clip": 1.04727447, + "balance_loss_mlp": 1.02527165, + "epoch": 0.45403715505320746, + "flos": 18514716341760.0, + "grad_norm": 1.9955866743395956, + "language_loss": 0.85194933, + "learning_rate": 2.3923769102375907e-06, + "loss": 0.87370259, + "num_input_tokens_seen": 81311085, + "step": 3776, + "time_per_iteration": 2.539416551589966 + }, + { + "auxiliary_loss_clip": 0.01141731, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.04842997, + "balance_loss_mlp": 1.02333748, + "epoch": 0.4541573979438466, + "flos": 25045825639680.0, + "grad_norm": 1.9487410289083456, + "language_loss": 0.78535193, + "learning_rate": 2.391613047220213e-06, + "loss": 0.80709243, + "num_input_tokens_seen": 81330985, + "step": 3777, + "time_per_iteration": 3.3955883979797363 + }, + { + "auxiliary_loss_clip": 0.01134833, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.0482316, + "balance_loss_mlp": 1.01751566, + "epoch": 0.4542776408344857, + "flos": 18332397884160.0, + "grad_norm": 1.93808479275615, + "language_loss": 0.790851, + "learning_rate": 2.390849124795447e-06, + "loss": 0.81245768, + "num_input_tokens_seen": 81346985, + "step": 3778, + "time_per_iteration": 3.3574955463409424 + }, + { + "auxiliary_loss_clip": 0.01184826, + "auxiliary_loss_mlp": 0.01024439, + "balance_loss_clip": 1.0527904, + "balance_loss_mlp": 1.01669645, + "epoch": 0.45439788372512474, + "flos": 20701173116160.0, + "grad_norm": 1.9675016088722534, + "language_loss": 0.84010541, + "learning_rate": 2.3900851430791804e-06, + "loss": 0.86219811, + "num_input_tokens_seen": 81365005, + "step": 3779, + "time_per_iteration": 3.3166849613189697 + }, + { + "auxiliary_loss_clip": 0.01189605, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.05474687, + "balance_loss_mlp": 1.02142501, + "epoch": 0.45451812661576385, + "flos": 22309432663680.0, + "grad_norm": 2.148842755041505, + "language_loss": 0.84864902, + "learning_rate": 2.389321102187307e-06, + "loss": 0.87084842, + "num_input_tokens_seen": 81383785, + "step": 3780, + "time_per_iteration": 2.4629969596862793 + }, + { + "auxiliary_loss_clip": 0.01159044, + "auxiliary_loss_mlp": 0.0076354, + "balance_loss_clip": 1.05280018, + "balance_loss_mlp": 1.00107145, + "epoch": 0.4546383695064029, + "flos": 21763303303680.0, + "grad_norm": 1.7954402258464124, + "language_loss": 0.81634498, + "learning_rate": 2.3885570022357326e-06, + "loss": 0.83557075, + "num_input_tokens_seen": 81402915, + "step": 3781, + "time_per_iteration": 2.5696513652801514 + }, + { + "auxiliary_loss_clip": 0.01054354, + "auxiliary_loss_mlp": 0.01004395, + "balance_loss_clip": 1.01459157, + "balance_loss_mlp": 1.00311983, + "epoch": 0.454758612397042, + "flos": 64242755694720.0, + "grad_norm": 0.7973788126056941, + "language_loss": 0.60869634, + "learning_rate": 2.38779284334037e-06, + "loss": 0.62928385, + "num_input_tokens_seen": 81467890, + "step": 3782, + "time_per_iteration": 3.1680099964141846 + }, + { + "auxiliary_loss_clip": 0.01115517, + "auxiliary_loss_mlp": 0.01028994, + "balance_loss_clip": 1.04463506, + "balance_loss_mlp": 1.02058947, + "epoch": 0.4548788552876811, + "flos": 27304175485440.0, + "grad_norm": 1.8958010701524415, + "language_loss": 0.78706783, + "learning_rate": 2.387028625617141e-06, + "loss": 0.80851293, + "num_input_tokens_seen": 81487105, + "step": 3783, + "time_per_iteration": 2.6238291263580322 + }, + { + "auxiliary_loss_clip": 0.0114234, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.04632902, + "balance_loss_mlp": 1.01986241, + "epoch": 0.4549990981783202, + "flos": 22857142222080.0, + "grad_norm": 2.5683731266840346, + "language_loss": 0.84741384, + "learning_rate": 2.3862643491819766e-06, + "loss": 0.86911708, + "num_input_tokens_seen": 81505670, + "step": 3784, + "time_per_iteration": 2.530148506164551 + }, + { + "auxiliary_loss_clip": 0.01167161, + "auxiliary_loss_mlp": 0.01025265, + "balance_loss_clip": 1.04913592, + "balance_loss_mlp": 1.01724815, + "epoch": 0.4551193410689593, + "flos": 23258587599360.0, + "grad_norm": 1.7283996951520264, + "language_loss": 0.84445333, + "learning_rate": 2.3855000141508186e-06, + "loss": 0.86637765, + "num_input_tokens_seen": 81525825, + "step": 3785, + "time_per_iteration": 2.5085561275482178 + }, + { + "auxiliary_loss_clip": 0.0116221, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.05470455, + "balance_loss_mlp": 1.01877713, + "epoch": 0.4552395839595984, + "flos": 20777519473920.0, + "grad_norm": 2.9708052690350364, + "language_loss": 0.83610135, + "learning_rate": 2.3847356206396143e-06, + "loss": 0.85800076, + "num_input_tokens_seen": 81543135, + "step": 3786, + "time_per_iteration": 2.5143332481384277 + }, + { + "auxiliary_loss_clip": 0.01185695, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.0538044, + "balance_loss_mlp": 1.01903844, + "epoch": 0.45535982685023746, + "flos": 23257510191360.0, + "grad_norm": 5.2294463740143, + "language_loss": 0.78453231, + "learning_rate": 2.3839711687643227e-06, + "loss": 0.80666196, + "num_input_tokens_seen": 81564360, + "step": 3787, + "time_per_iteration": 2.4759509563446045 + }, + { + "auxiliary_loss_clip": 0.01170346, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.0520997, + "balance_loss_mlp": 1.02131772, + "epoch": 0.45548006974087657, + "flos": 19646117907840.0, + "grad_norm": 2.322081252632029, + "language_loss": 0.74065268, + "learning_rate": 2.38320665864091e-06, + "loss": 0.76266092, + "num_input_tokens_seen": 81583710, + "step": 3788, + "time_per_iteration": 2.4734272956848145 + }, + { + "auxiliary_loss_clip": 0.01116062, + "auxiliary_loss_mlp": 0.01026678, + "balance_loss_clip": 1.04444742, + "balance_loss_mlp": 1.01785624, + "epoch": 0.4556003126315157, + "flos": 20047778766720.0, + "grad_norm": 1.5910185325190516, + "language_loss": 0.8184799, + "learning_rate": 2.3824420903853516e-06, + "loss": 0.83990729, + "num_input_tokens_seen": 81602175, + "step": 3789, + "time_per_iteration": 2.6015727519989014 + }, + { + "auxiliary_loss_clip": 0.0117192, + "auxiliary_loss_mlp": 0.01027061, + "balance_loss_clip": 1.05459905, + "balance_loss_mlp": 1.01884782, + "epoch": 0.45572055552215474, + "flos": 22959738443520.0, + "grad_norm": 2.2219521386909538, + "language_loss": 0.8229177, + "learning_rate": 2.3816774641136324e-06, + "loss": 0.84490752, + "num_input_tokens_seen": 81619430, + "step": 3790, + "time_per_iteration": 2.486428737640381 + }, + { + "auxiliary_loss_clip": 0.01170746, + "auxiliary_loss_mlp": 0.00763214, + "balance_loss_clip": 1.0530833, + "balance_loss_mlp": 1.00111771, + "epoch": 0.45584079841279385, + "flos": 33109925535360.0, + "grad_norm": 2.219251069282314, + "language_loss": 0.71097648, + "learning_rate": 2.380912779941745e-06, + "loss": 0.73031604, + "num_input_tokens_seen": 81642550, + "step": 3791, + "time_per_iteration": 2.5875277519226074 + }, + { + "auxiliary_loss_clip": 0.0117379, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.05048442, + "balance_loss_mlp": 1.02475333, + "epoch": 0.45596104130343296, + "flos": 27272179445760.0, + "grad_norm": 2.293564128110959, + "language_loss": 0.82992077, + "learning_rate": 2.3801480379856918e-06, + "loss": 0.85200173, + "num_input_tokens_seen": 81664260, + "step": 3792, + "time_per_iteration": 2.5449390411376953 + }, + { + "auxiliary_loss_clip": 0.01161464, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.05318105, + "balance_loss_mlp": 1.02306688, + "epoch": 0.456081284194072, + "flos": 21579799697280.0, + "grad_norm": 1.6322981107712597, + "language_loss": 0.83290493, + "learning_rate": 2.379383238361484e-06, + "loss": 0.85483247, + "num_input_tokens_seen": 81683620, + "step": 3793, + "time_per_iteration": 2.5320584774017334 + }, + { + "auxiliary_loss_clip": 0.011695, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.05093026, + "balance_loss_mlp": 1.02036285, + "epoch": 0.4562015270847111, + "flos": 35918822113920.0, + "grad_norm": 1.9106788099382603, + "language_loss": 0.79763007, + "learning_rate": 2.3786183811851407e-06, + "loss": 0.8196137, + "num_input_tokens_seen": 81704325, + "step": 3794, + "time_per_iteration": 2.6068520545959473 + }, + { + "auxiliary_loss_clip": 0.01188485, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.05659413, + "balance_loss_mlp": 1.02264953, + "epoch": 0.45632176997535023, + "flos": 13589783602560.0, + "grad_norm": 2.022486346209857, + "language_loss": 0.79931951, + "learning_rate": 2.3778534665726892e-06, + "loss": 0.82151353, + "num_input_tokens_seen": 81721155, + "step": 3795, + "time_per_iteration": 2.421072483062744 + }, + { + "auxiliary_loss_clip": 0.01161323, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.05132866, + "balance_loss_mlp": 1.0248183, + "epoch": 0.4564420128659893, + "flos": 32635401937920.0, + "grad_norm": 1.711898497650298, + "language_loss": 0.72497964, + "learning_rate": 2.377088494640168e-06, + "loss": 0.74691975, + "num_input_tokens_seen": 81742905, + "step": 3796, + "time_per_iteration": 2.582486629486084 + }, + { + "auxiliary_loss_clip": 0.01166234, + "auxiliary_loss_mlp": 0.01028027, + "balance_loss_clip": 1.05265045, + "balance_loss_mlp": 1.01988542, + "epoch": 0.4565622557566284, + "flos": 20377690208640.0, + "grad_norm": 1.7384837973893488, + "language_loss": 0.78213203, + "learning_rate": 2.3763234655036216e-06, + "loss": 0.80407459, + "num_input_tokens_seen": 81762105, + "step": 3797, + "time_per_iteration": 2.487492799758911 + }, + { + "auxiliary_loss_clip": 0.01138936, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.04432416, + "balance_loss_mlp": 1.02087259, + "epoch": 0.45668249864726745, + "flos": 25374372364800.0, + "grad_norm": 2.453159007806204, + "language_loss": 0.87288356, + "learning_rate": 2.3755583792791046e-06, + "loss": 0.8945657, + "num_input_tokens_seen": 81781975, + "step": 3798, + "time_per_iteration": 2.5710980892181396 + }, + { + "auxiliary_loss_clip": 0.01173467, + "auxiliary_loss_mlp": 0.01022243, + "balance_loss_clip": 1.05282807, + "balance_loss_mlp": 1.0144403, + "epoch": 0.45680274153790656, + "flos": 15559806977280.0, + "grad_norm": 2.0139197868242396, + "language_loss": 0.74323344, + "learning_rate": 2.3747932360826803e-06, + "loss": 0.76519048, + "num_input_tokens_seen": 81798905, + "step": 3799, + "time_per_iteration": 2.467055082321167 + }, + { + "auxiliary_loss_clip": 0.0116877, + "auxiliary_loss_mlp": 0.01026016, + "balance_loss_clip": 1.05217099, + "balance_loss_mlp": 1.01739764, + "epoch": 0.4569229844285457, + "flos": 19792884879360.0, + "grad_norm": 3.4472106711480905, + "language_loss": 0.81986177, + "learning_rate": 2.3740280360304205e-06, + "loss": 0.84180963, + "num_input_tokens_seen": 81816630, + "step": 3800, + "time_per_iteration": 2.4792532920837402 + }, + { + "auxiliary_loss_clip": 0.01142101, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.05142987, + "balance_loss_mlp": 1.02032638, + "epoch": 0.45704322731918473, + "flos": 24093941270400.0, + "grad_norm": 1.652243551898501, + "language_loss": 0.67889154, + "learning_rate": 2.3732627792384038e-06, + "loss": 0.70060468, + "num_input_tokens_seen": 81837700, + "step": 3801, + "time_per_iteration": 2.575906753540039 + }, + { + "auxiliary_loss_clip": 0.01185367, + "auxiliary_loss_mlp": 0.01028899, + "balance_loss_clip": 1.05279469, + "balance_loss_mlp": 1.02029216, + "epoch": 0.45716347020982384, + "flos": 31317803245440.0, + "grad_norm": 2.065932389314388, + "language_loss": 0.75069058, + "learning_rate": 2.3724974658227207e-06, + "loss": 0.77283323, + "num_input_tokens_seen": 81858490, + "step": 3802, + "time_per_iteration": 2.5347228050231934 + }, + { + "auxiliary_loss_clip": 0.01156654, + "auxiliary_loss_mlp": 0.00763387, + "balance_loss_clip": 1.05278063, + "balance_loss_mlp": 1.00107169, + "epoch": 0.45728371310046295, + "flos": 26501392471680.0, + "grad_norm": 1.9296820270294237, + "language_loss": 0.71028113, + "learning_rate": 2.3717320958994687e-06, + "loss": 0.72948158, + "num_input_tokens_seen": 81876050, + "step": 3803, + "time_per_iteration": 2.5438599586486816 + }, + { + "auxiliary_loss_clip": 0.01140662, + "auxiliary_loss_mlp": 0.01025292, + "balance_loss_clip": 1.04383111, + "balance_loss_mlp": 1.01765072, + "epoch": 0.457403955991102, + "flos": 17929408222080.0, + "grad_norm": 1.852953896375088, + "language_loss": 0.70588982, + "learning_rate": 2.3709666695847534e-06, + "loss": 0.72754931, + "num_input_tokens_seen": 81894230, + "step": 3804, + "time_per_iteration": 3.2624685764312744 + }, + { + "auxiliary_loss_clip": 0.01121388, + "auxiliary_loss_mlp": 0.01028779, + "balance_loss_clip": 1.04548311, + "balance_loss_mlp": 1.02044594, + "epoch": 0.4575241988817411, + "flos": 42230660837760.0, + "grad_norm": 1.762395536493893, + "language_loss": 0.69906682, + "learning_rate": 2.370201186994689e-06, + "loss": 0.72056848, + "num_input_tokens_seen": 81917915, + "step": 3805, + "time_per_iteration": 3.5883049964904785 + }, + { + "auxiliary_loss_clip": 0.01148612, + "auxiliary_loss_mlp": 0.01028592, + "balance_loss_clip": 1.05082273, + "balance_loss_mlp": 1.02044368, + "epoch": 0.45764444177238023, + "flos": 30117309868800.0, + "grad_norm": 2.2583271794853514, + "language_loss": 0.69762599, + "learning_rate": 2.369435648245399e-06, + "loss": 0.71939802, + "num_input_tokens_seen": 81938130, + "step": 3806, + "time_per_iteration": 3.4811604022979736 + }, + { + "auxiliary_loss_clip": 0.01155475, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.05039847, + "balance_loss_mlp": 1.02552009, + "epoch": 0.4577646846630193, + "flos": 24060293205120.0, + "grad_norm": 2.1844116626357417, + "language_loss": 0.8503893, + "learning_rate": 2.368670053453015e-06, + "loss": 0.87228835, + "num_input_tokens_seen": 81959820, + "step": 3807, + "time_per_iteration": 2.5478711128234863 + }, + { + "auxiliary_loss_clip": 0.01180554, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.05699468, + "balance_loss_mlp": 1.01876998, + "epoch": 0.4578849275536584, + "flos": 17418578952960.0, + "grad_norm": 2.186982270867477, + "language_loss": 0.73981822, + "learning_rate": 2.3679044027336757e-06, + "loss": 0.76190329, + "num_input_tokens_seen": 81975710, + "step": 3808, + "time_per_iteration": 2.458109140396118 + }, + { + "auxiliary_loss_clip": 0.01186359, + "auxiliary_loss_mlp": 0.01026974, + "balance_loss_clip": 1.05385721, + "balance_loss_mlp": 1.01776457, + "epoch": 0.4580051704442975, + "flos": 13510169107200.0, + "grad_norm": 2.5348765843538064, + "language_loss": 0.6897074, + "learning_rate": 2.3671386962035326e-06, + "loss": 0.71184063, + "num_input_tokens_seen": 81993180, + "step": 3809, + "time_per_iteration": 2.465155601501465 + }, + { + "auxiliary_loss_clip": 0.01171438, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.05200005, + "balance_loss_mlp": 1.02407193, + "epoch": 0.45812541333493656, + "flos": 18037606965120.0, + "grad_norm": 2.8124139846066285, + "language_loss": 0.68512332, + "learning_rate": 2.3663729339787405e-06, + "loss": 0.70716178, + "num_input_tokens_seen": 82010115, + "step": 3810, + "time_per_iteration": 2.4524624347686768 + }, + { + "auxiliary_loss_clip": 0.01185299, + "auxiliary_loss_mlp": 0.01027442, + "balance_loss_clip": 1.05308008, + "balance_loss_mlp": 1.01911569, + "epoch": 0.45824565622557567, + "flos": 20222196232320.0, + "grad_norm": 3.1113632840572643, + "language_loss": 0.73469621, + "learning_rate": 2.365607116175466e-06, + "loss": 0.75682366, + "num_input_tokens_seen": 82025540, + "step": 3811, + "time_per_iteration": 2.4052023887634277 + }, + { + "auxiliary_loss_clip": 0.01184029, + "auxiliary_loss_mlp": 0.0102695, + "balance_loss_clip": 1.05366874, + "balance_loss_mlp": 1.01893282, + "epoch": 0.4583658991162148, + "flos": 19864885691520.0, + "grad_norm": 2.5033275447230174, + "language_loss": 0.6672222, + "learning_rate": 2.3648412429098825e-06, + "loss": 0.68933201, + "num_input_tokens_seen": 82043890, + "step": 3812, + "time_per_iteration": 2.437814235687256 + }, + { + "auxiliary_loss_clip": 0.01136724, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.04809904, + "balance_loss_mlp": 1.02718604, + "epoch": 0.45848614200685384, + "flos": 21029935322880.0, + "grad_norm": 1.777860059220046, + "language_loss": 0.81999928, + "learning_rate": 2.364075314298172e-06, + "loss": 0.84173089, + "num_input_tokens_seen": 82061345, + "step": 3813, + "time_per_iteration": 2.5303571224212646 + }, + { + "auxiliary_loss_clip": 0.01175458, + "auxiliary_loss_mlp": 0.00763309, + "balance_loss_clip": 1.05323553, + "balance_loss_mlp": 1.00098157, + "epoch": 0.45860638489749295, + "flos": 21069293650560.0, + "grad_norm": 2.005253650598378, + "language_loss": 0.70031315, + "learning_rate": 2.3633093304565267e-06, + "loss": 0.71970087, + "num_input_tokens_seen": 82080400, + "step": 3814, + "time_per_iteration": 2.4949448108673096 + }, + { + "auxiliary_loss_clip": 0.01189875, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.05565274, + "balance_loss_mlp": 1.02165437, + "epoch": 0.458726627788132, + "flos": 26833889692800.0, + "grad_norm": 2.137607702257952, + "language_loss": 0.62508571, + "learning_rate": 2.3625432915011443e-06, + "loss": 0.6472857, + "num_input_tokens_seen": 82102310, + "step": 3815, + "time_per_iteration": 2.5441720485687256 + }, + { + "auxiliary_loss_clip": 0.011528, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.05098331, + "balance_loss_mlp": 1.02197206, + "epoch": 0.4588468706787711, + "flos": 24097927680000.0, + "grad_norm": 2.110034719797862, + "language_loss": 0.65091646, + "learning_rate": 2.3617771975482334e-06, + "loss": 0.67275047, + "num_input_tokens_seen": 82121140, + "step": 3816, + "time_per_iteration": 2.523953914642334 + }, + { + "auxiliary_loss_clip": 0.01123361, + "auxiliary_loss_mlp": 0.01024759, + "balance_loss_clip": 1.04744613, + "balance_loss_mlp": 1.01695049, + "epoch": 0.4589671135694102, + "flos": 17889331622400.0, + "grad_norm": 1.5900883624492286, + "language_loss": 0.74421918, + "learning_rate": 2.3610110487140083e-06, + "loss": 0.76570034, + "num_input_tokens_seen": 82139575, + "step": 3817, + "time_per_iteration": 2.622431755065918 + }, + { + "auxiliary_loss_clip": 0.01156432, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.05170798, + "balance_loss_mlp": 1.02355504, + "epoch": 0.4590873564600493, + "flos": 25626967781760.0, + "grad_norm": 1.6782792795399193, + "language_loss": 0.8056767, + "learning_rate": 2.360244845114695e-06, + "loss": 0.82756186, + "num_input_tokens_seen": 82159195, + "step": 3818, + "time_per_iteration": 2.5536482334136963 + }, + { + "auxiliary_loss_clip": 0.01148926, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.05071926, + "balance_loss_mlp": 1.01939404, + "epoch": 0.4592075993506884, + "flos": 18514788168960.0, + "grad_norm": 2.0687068822331063, + "language_loss": 0.68487084, + "learning_rate": 2.3594785868665245e-06, + "loss": 0.70664328, + "num_input_tokens_seen": 82175500, + "step": 3819, + "time_per_iteration": 2.554961681365967 + }, + { + "auxiliary_loss_clip": 0.01143146, + "auxiliary_loss_mlp": 0.00763442, + "balance_loss_clip": 1.04841459, + "balance_loss_mlp": 1.00103259, + "epoch": 0.4593278422413275, + "flos": 20631111638400.0, + "grad_norm": 4.50669604057574, + "language_loss": 0.80327129, + "learning_rate": 2.3587122740857386e-06, + "loss": 0.82233727, + "num_input_tokens_seen": 82192600, + "step": 3820, + "time_per_iteration": 2.5499517917633057 + }, + { + "auxiliary_loss_clip": 0.01168459, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.05014312, + "balance_loss_mlp": 1.02051902, + "epoch": 0.45944808513196655, + "flos": 21358517961600.0, + "grad_norm": 1.639040220610083, + "language_loss": 0.77943093, + "learning_rate": 2.357945906888586e-06, + "loss": 0.80139762, + "num_input_tokens_seen": 82212040, + "step": 3821, + "time_per_iteration": 2.538455009460449 + }, + { + "auxiliary_loss_clip": 0.01174026, + "auxiliary_loss_mlp": 0.01031066, + "balance_loss_clip": 1.05387831, + "balance_loss_mlp": 1.02217877, + "epoch": 0.45956832802260567, + "flos": 21427789340160.0, + "grad_norm": 2.5311943406435637, + "language_loss": 0.79488242, + "learning_rate": 2.357179485391324e-06, + "loss": 0.81693333, + "num_input_tokens_seen": 82229895, + "step": 3822, + "time_per_iteration": 2.500826358795166 + }, + { + "auxiliary_loss_clip": 0.01183153, + "auxiliary_loss_mlp": 0.01026162, + "balance_loss_clip": 1.05330348, + "balance_loss_mlp": 1.01767421, + "epoch": 0.4596885709132448, + "flos": 22382654538240.0, + "grad_norm": 1.979403854952371, + "language_loss": 0.86339903, + "learning_rate": 2.3564130097102173e-06, + "loss": 0.88549215, + "num_input_tokens_seen": 82249550, + "step": 3823, + "time_per_iteration": 2.5028653144836426 + }, + { + "auxiliary_loss_clip": 0.01150539, + "auxiliary_loss_mlp": 0.01026208, + "balance_loss_clip": 1.05264044, + "balance_loss_mlp": 1.01742816, + "epoch": 0.45980881380388383, + "flos": 28981957806720.0, + "grad_norm": 1.764813109346763, + "language_loss": 0.75042325, + "learning_rate": 2.355646479961541e-06, + "loss": 0.77219075, + "num_input_tokens_seen": 82268860, + "step": 3824, + "time_per_iteration": 2.5680360794067383 + }, + { + "auxiliary_loss_clip": 0.01183974, + "auxiliary_loss_mlp": 0.01025319, + "balance_loss_clip": 1.05324697, + "balance_loss_mlp": 1.01626515, + "epoch": 0.45992905669452294, + "flos": 33396599980800.0, + "grad_norm": 1.8795672804052275, + "language_loss": 0.71396816, + "learning_rate": 2.354879896261576e-06, + "loss": 0.7360611, + "num_input_tokens_seen": 82289070, + "step": 3825, + "time_per_iteration": 2.6062655448913574 + }, + { + "auxiliary_loss_clip": 0.01138587, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.05055487, + "balance_loss_mlp": 1.02036881, + "epoch": 0.46004929958516205, + "flos": 36318184502400.0, + "grad_norm": 1.8804040799686086, + "language_loss": 0.56753772, + "learning_rate": 2.3541132587266133e-06, + "loss": 0.58920896, + "num_input_tokens_seen": 82311790, + "step": 3826, + "time_per_iteration": 2.707261323928833 + }, + { + "auxiliary_loss_clip": 0.01148344, + "auxiliary_loss_mlp": 0.01026314, + "balance_loss_clip": 1.0490675, + "balance_loss_mlp": 1.01799977, + "epoch": 0.4601695424758011, + "flos": 17238451224960.0, + "grad_norm": 2.215376080216147, + "language_loss": 0.69204199, + "learning_rate": 2.3533465674729515e-06, + "loss": 0.71378863, + "num_input_tokens_seen": 82329020, + "step": 3827, + "time_per_iteration": 2.6277542114257812 + }, + { + "auxiliary_loss_clip": 0.01184877, + "auxiliary_loss_mlp": 0.01033114, + "balance_loss_clip": 1.05310237, + "balance_loss_mlp": 1.02420342, + "epoch": 0.4602897853664402, + "flos": 15888425529600.0, + "grad_norm": 1.956432455205805, + "language_loss": 0.73053432, + "learning_rate": 2.352579822616895e-06, + "loss": 0.75271422, + "num_input_tokens_seen": 82346455, + "step": 3828, + "time_per_iteration": 2.435443639755249 + }, + { + "auxiliary_loss_clip": 0.01160404, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.05247569, + "balance_loss_mlp": 1.01917839, + "epoch": 0.4604100282570793, + "flos": 25412617370880.0, + "grad_norm": 1.736652348569789, + "language_loss": 0.77528667, + "learning_rate": 2.351813024274761e-06, + "loss": 0.79716384, + "num_input_tokens_seen": 82367810, + "step": 3829, + "time_per_iteration": 2.604726552963257 + }, + { + "auxiliary_loss_clip": 0.01148615, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.05138004, + "balance_loss_mlp": 1.02250814, + "epoch": 0.4605302711477184, + "flos": 27630711048960.0, + "grad_norm": 1.6676665972091207, + "language_loss": 0.73599809, + "learning_rate": 2.3510461725628693e-06, + "loss": 0.75779319, + "num_input_tokens_seen": 82388275, + "step": 3830, + "time_per_iteration": 3.3838343620300293 + }, + { + "auxiliary_loss_clip": 0.01144111, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.04860473, + "balance_loss_mlp": 1.01922989, + "epoch": 0.4606505140383575, + "flos": 23839657914240.0, + "grad_norm": 1.8858757974463702, + "language_loss": 0.7089783, + "learning_rate": 2.350279267597554e-06, + "loss": 0.73069561, + "num_input_tokens_seen": 82408915, + "step": 3831, + "time_per_iteration": 3.3336188793182373 + }, + { + "auxiliary_loss_clip": 0.01171532, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_clip": 1.05386066, + "balance_loss_mlp": 1.02105093, + "epoch": 0.46077075692899655, + "flos": 16107013745280.0, + "grad_norm": 2.600281992718426, + "language_loss": 0.82817781, + "learning_rate": 2.3495123094951515e-06, + "loss": 0.85018742, + "num_input_tokens_seen": 82427260, + "step": 3832, + "time_per_iteration": 2.4579901695251465 + }, + { + "auxiliary_loss_clip": 0.01149228, + "auxiliary_loss_mlp": 0.01022074, + "balance_loss_clip": 1.05004859, + "balance_loss_mlp": 1.01372385, + "epoch": 0.46089099981963566, + "flos": 48798147634560.0, + "grad_norm": 4.352959572367825, + "language_loss": 0.76183784, + "learning_rate": 2.34874529837201e-06, + "loss": 0.78355086, + "num_input_tokens_seen": 82450805, + "step": 3833, + "time_per_iteration": 4.239989757537842 + }, + { + "auxiliary_loss_clip": 0.01109082, + "auxiliary_loss_mlp": 0.01024552, + "balance_loss_clip": 1.04308951, + "balance_loss_mlp": 1.01642239, + "epoch": 0.46101124271027477, + "flos": 19099234362240.0, + "grad_norm": 1.9161475482591606, + "language_loss": 0.78401434, + "learning_rate": 2.347978234344483e-06, + "loss": 0.80535078, + "num_input_tokens_seen": 82467010, + "step": 3834, + "time_per_iteration": 2.6001031398773193 + }, + { + "auxiliary_loss_clip": 0.0117498, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.05387282, + "balance_loss_mlp": 1.0216763, + "epoch": 0.4611314856009138, + "flos": 39347931853440.0, + "grad_norm": 1.817459481655564, + "language_loss": 0.6904192, + "learning_rate": 2.347211117528935e-06, + "loss": 0.71247685, + "num_input_tokens_seen": 82489310, + "step": 3835, + "time_per_iteration": 2.6847920417785645 + }, + { + "auxiliary_loss_clip": 0.01152935, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.05365491, + "balance_loss_mlp": 1.02143943, + "epoch": 0.46125172849155294, + "flos": 20810772489600.0, + "grad_norm": 2.3144097664822914, + "language_loss": 0.71774387, + "learning_rate": 2.3464439480417374e-06, + "loss": 0.73957479, + "num_input_tokens_seen": 82508830, + "step": 3836, + "time_per_iteration": 2.6416471004486084 + }, + { + "auxiliary_loss_clip": 0.01171949, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.05221343, + "balance_loss_mlp": 1.02431428, + "epoch": 0.46137197138219205, + "flos": 17930808852480.0, + "grad_norm": 4.268713589450237, + "language_loss": 0.76944977, + "learning_rate": 2.3456767259992676e-06, + "loss": 0.79149806, + "num_input_tokens_seen": 82526475, + "step": 3837, + "time_per_iteration": 2.5514307022094727 + }, + { + "auxiliary_loss_clip": 0.01185374, + "auxiliary_loss_mlp": 0.00763598, + "balance_loss_clip": 1.05290151, + "balance_loss_mlp": 1.00099301, + "epoch": 0.4614922142728311, + "flos": 16836610798080.0, + "grad_norm": 2.274241204965949, + "language_loss": 0.88726342, + "learning_rate": 2.3449094515179135e-06, + "loss": 0.90675312, + "num_input_tokens_seen": 82543935, + "step": 3838, + "time_per_iteration": 2.5093882083892822 + }, + { + "auxiliary_loss_clip": 0.01162503, + "auxiliary_loss_mlp": 0.010291, + "balance_loss_clip": 1.05044687, + "balance_loss_mlp": 1.02040994, + "epoch": 0.4616124571634702, + "flos": 26614906427520.0, + "grad_norm": 1.7888199668256353, + "language_loss": 0.81584197, + "learning_rate": 2.34414212471407e-06, + "loss": 0.837758, + "num_input_tokens_seen": 82563730, + "step": 3839, + "time_per_iteration": 2.6540603637695312 + }, + { + "auxiliary_loss_clip": 0.0117855, + "auxiliary_loss_mlp": 0.01024473, + "balance_loss_clip": 1.05323565, + "balance_loss_mlp": 1.01577687, + "epoch": 0.4617327000541093, + "flos": 20340127560960.0, + "grad_norm": 2.1057658388960703, + "language_loss": 0.72625226, + "learning_rate": 2.3433747457041394e-06, + "loss": 0.74828255, + "num_input_tokens_seen": 82582435, + "step": 3840, + "time_per_iteration": 2.488163709640503 + }, + { + "auxiliary_loss_clip": 0.01142952, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.05040407, + "balance_loss_mlp": 1.0199151, + "epoch": 0.4618529429447484, + "flos": 29570749545600.0, + "grad_norm": 1.7995683296768377, + "language_loss": 0.84334362, + "learning_rate": 2.342607314604533e-06, + "loss": 0.86506033, + "num_input_tokens_seen": 82602185, + "step": 3841, + "time_per_iteration": 2.6281909942626953 + }, + { + "auxiliary_loss_clip": 0.01171526, + "auxiliary_loss_mlp": 0.01027745, + "balance_loss_clip": 1.05519676, + "balance_loss_mlp": 1.01927519, + "epoch": 0.4619731858353875, + "flos": 19787030962560.0, + "grad_norm": 1.7417179504219005, + "language_loss": 0.84206319, + "learning_rate": 2.3418398315316694e-06, + "loss": 0.86405593, + "num_input_tokens_seen": 82620005, + "step": 3842, + "time_per_iteration": 2.479717254638672 + }, + { + "auxiliary_loss_clip": 0.01188512, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.05804431, + "balance_loss_mlp": 1.026106, + "epoch": 0.4620934287260266, + "flos": 18951138587520.0, + "grad_norm": 2.5456917897279183, + "language_loss": 0.78580201, + "learning_rate": 2.3410722966019755e-06, + "loss": 0.80803561, + "num_input_tokens_seen": 82635120, + "step": 3843, + "time_per_iteration": 2.4385809898376465 + }, + { + "auxiliary_loss_clip": 0.01169961, + "auxiliary_loss_mlp": 0.01025817, + "balance_loss_clip": 1.05184269, + "balance_loss_mlp": 1.01720428, + "epoch": 0.46221367161666566, + "flos": 37341674634240.0, + "grad_norm": 1.8765939430526237, + "language_loss": 0.65669405, + "learning_rate": 2.3403047099318848e-06, + "loss": 0.67865181, + "num_input_tokens_seen": 82659190, + "step": 3844, + "time_per_iteration": 2.617234945297241 + }, + { + "auxiliary_loss_clip": 0.01125595, + "auxiliary_loss_mlp": 0.01026737, + "balance_loss_clip": 1.04693973, + "balance_loss_mlp": 1.01835632, + "epoch": 0.46233391450730477, + "flos": 14428549065600.0, + "grad_norm": 2.3829777004402013, + "language_loss": 0.75030506, + "learning_rate": 2.3395370716378405e-06, + "loss": 0.77182829, + "num_input_tokens_seen": 82676635, + "step": 3845, + "time_per_iteration": 2.549920082092285 + }, + { + "auxiliary_loss_clip": 0.01174665, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.05274272, + "balance_loss_mlp": 1.02335572, + "epoch": 0.4624541573979438, + "flos": 22493044010880.0, + "grad_norm": 2.5408856460463536, + "language_loss": 0.72633898, + "learning_rate": 2.338769381836292e-06, + "loss": 0.74839985, + "num_input_tokens_seen": 82696245, + "step": 3846, + "time_per_iteration": 2.489337682723999 + }, + { + "auxiliary_loss_clip": 0.01139206, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.05110586, + "balance_loss_mlp": 1.02588558, + "epoch": 0.46257440028858293, + "flos": 14465070218880.0, + "grad_norm": 2.3152882272761484, + "language_loss": 0.7313664, + "learning_rate": 2.3380016406436984e-06, + "loss": 0.75310004, + "num_input_tokens_seen": 82713725, + "step": 3847, + "time_per_iteration": 2.5463552474975586 + }, + { + "auxiliary_loss_clip": 0.01127737, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.05174208, + "balance_loss_mlp": 1.02264142, + "epoch": 0.46269464317922204, + "flos": 23332204523520.0, + "grad_norm": 2.2385976034909696, + "language_loss": 0.81261694, + "learning_rate": 2.337233848176524e-06, + "loss": 0.83420837, + "num_input_tokens_seen": 82731495, + "step": 3848, + "time_per_iteration": 2.6000545024871826 + }, + { + "auxiliary_loss_clip": 0.01122695, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.04756796, + "balance_loss_mlp": 1.02078366, + "epoch": 0.4628148860698611, + "flos": 18552027594240.0, + "grad_norm": 2.108883413229591, + "language_loss": 0.83065462, + "learning_rate": 2.3364660045512435e-06, + "loss": 0.85217452, + "num_input_tokens_seen": 82750255, + "step": 3849, + "time_per_iteration": 2.5837385654449463 + }, + { + "auxiliary_loss_clip": 0.01064431, + "auxiliary_loss_mlp": 0.01008618, + "balance_loss_clip": 1.02211046, + "balance_loss_mlp": 1.00730062, + "epoch": 0.4629351289605002, + "flos": 70667569670400.0, + "grad_norm": 0.7610018898295714, + "language_loss": 0.58214754, + "learning_rate": 2.335698109884337e-06, + "loss": 0.60287803, + "num_input_tokens_seen": 82815460, + "step": 3850, + "time_per_iteration": 3.247786045074463 + }, + { + "auxiliary_loss_clip": 0.0104103, + "auxiliary_loss_mlp": 0.01007645, + "balance_loss_clip": 1.02139902, + "balance_loss_mlp": 1.00601232, + "epoch": 0.4630553718511393, + "flos": 59687200465920.0, + "grad_norm": 0.7918547308996485, + "language_loss": 0.59882319, + "learning_rate": 2.334930164292294e-06, + "loss": 0.61931002, + "num_input_tokens_seen": 82878010, + "step": 3851, + "time_per_iteration": 3.2624716758728027 + }, + { + "auxiliary_loss_clip": 0.01123206, + "auxiliary_loss_mlp": 0.01026636, + "balance_loss_clip": 1.04730678, + "balance_loss_mlp": 1.01880407, + "epoch": 0.4631756147417784, + "flos": 15960605909760.0, + "grad_norm": 2.0128419550626346, + "language_loss": 0.79534495, + "learning_rate": 2.334162167891612e-06, + "loss": 0.81684339, + "num_input_tokens_seen": 82895275, + "step": 3852, + "time_per_iteration": 2.585947036743164 + }, + { + "auxiliary_loss_clip": 0.01159568, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.05061495, + "balance_loss_mlp": 1.02368569, + "epoch": 0.4632958576324175, + "flos": 16472907636480.0, + "grad_norm": 2.9654729196608147, + "language_loss": 0.75227356, + "learning_rate": 2.333394120798795e-06, + "loss": 0.77419472, + "num_input_tokens_seen": 82914010, + "step": 3853, + "time_per_iteration": 2.5249991416931152 + }, + { + "auxiliary_loss_clip": 0.01156688, + "auxiliary_loss_mlp": 0.01024654, + "balance_loss_clip": 1.04957545, + "balance_loss_mlp": 1.01567745, + "epoch": 0.4634161005230566, + "flos": 22346492520960.0, + "grad_norm": 2.422891981458628, + "language_loss": 0.71709096, + "learning_rate": 2.3326260231303545e-06, + "loss": 0.73890436, + "num_input_tokens_seen": 82932610, + "step": 3854, + "time_per_iteration": 2.54032301902771 + }, + { + "auxiliary_loss_clip": 0.0118522, + "auxiliary_loss_mlp": 0.01025053, + "balance_loss_clip": 1.05610657, + "balance_loss_mlp": 1.01677442, + "epoch": 0.46353634341369565, + "flos": 15742233175680.0, + "grad_norm": 1.9086107386450508, + "language_loss": 0.86679405, + "learning_rate": 2.331857875002811e-06, + "loss": 0.88889682, + "num_input_tokens_seen": 82951210, + "step": 3855, + "time_per_iteration": 2.424968719482422 + }, + { + "auxiliary_loss_clip": 0.01160584, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.0559721, + "balance_loss_mlp": 1.02797687, + "epoch": 0.46365658630433476, + "flos": 28329820433280.0, + "grad_norm": 1.6461069036307594, + "language_loss": 0.7636469, + "learning_rate": 2.3310896765326916e-06, + "loss": 0.78561336, + "num_input_tokens_seen": 82972210, + "step": 3856, + "time_per_iteration": 2.570831298828125 + }, + { + "auxiliary_loss_clip": 0.01139764, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.04992533, + "balance_loss_mlp": 1.02371812, + "epoch": 0.46377682919497387, + "flos": 24608074590720.0, + "grad_norm": 1.5421362104864564, + "language_loss": 0.83689618, + "learning_rate": 2.330321427836531e-06, + "loss": 0.85862136, + "num_input_tokens_seen": 82994080, + "step": 3857, + "time_per_iteration": 3.3699898719787598 + }, + { + "auxiliary_loss_clip": 0.01168138, + "auxiliary_loss_mlp": 0.01024878, + "balance_loss_clip": 1.05221474, + "balance_loss_mlp": 1.01576459, + "epoch": 0.4638970720856129, + "flos": 19060953442560.0, + "grad_norm": 1.6974406211480293, + "language_loss": 0.82886839, + "learning_rate": 2.3295531290308733e-06, + "loss": 0.85079855, + "num_input_tokens_seen": 83012230, + "step": 3858, + "time_per_iteration": 3.299046039581299 + }, + { + "auxiliary_loss_clip": 0.01188189, + "auxiliary_loss_mlp": 0.00763576, + "balance_loss_clip": 1.05522513, + "balance_loss_mlp": 1.00094247, + "epoch": 0.46401731497625204, + "flos": 18471012468480.0, + "grad_norm": 2.6527832845508925, + "language_loss": 0.75685275, + "learning_rate": 2.3287847802322678e-06, + "loss": 0.77637041, + "num_input_tokens_seen": 83027800, + "step": 3859, + "time_per_iteration": 3.9202687740325928 + }, + { + "auxiliary_loss_clip": 0.0116564, + "auxiliary_loss_mlp": 0.01029977, + "balance_loss_clip": 1.05462122, + "balance_loss_mlp": 1.0209707, + "epoch": 0.4641375578668911, + "flos": 26067053214720.0, + "grad_norm": 1.8335321418848698, + "language_loss": 0.83905274, + "learning_rate": 2.3280163815572723e-06, + "loss": 0.86100888, + "num_input_tokens_seen": 83048395, + "step": 3860, + "time_per_iteration": 2.56221342086792 + }, + { + "auxiliary_loss_clip": 0.01150525, + "auxiliary_loss_mlp": 0.01022837, + "balance_loss_clip": 1.05057454, + "balance_loss_mlp": 1.01444221, + "epoch": 0.4642578007575302, + "flos": 19570382081280.0, + "grad_norm": 2.240716898917825, + "language_loss": 0.77354729, + "learning_rate": 2.3272479331224522e-06, + "loss": 0.79528093, + "num_input_tokens_seen": 83065825, + "step": 3861, + "time_per_iteration": 2.5321383476257324 + }, + { + "auxiliary_loss_clip": 0.01188248, + "auxiliary_loss_mlp": 0.01027805, + "balance_loss_clip": 1.05521131, + "balance_loss_mlp": 1.01960909, + "epoch": 0.4643780436481693, + "flos": 28186249772160.0, + "grad_norm": 1.9255133317341533, + "language_loss": 0.78206986, + "learning_rate": 2.3264794350443817e-06, + "loss": 0.80423033, + "num_input_tokens_seen": 83087920, + "step": 3862, + "time_per_iteration": 2.546922445297241 + }, + { + "auxiliary_loss_clip": 0.01172577, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.04979825, + "balance_loss_mlp": 1.02020383, + "epoch": 0.46449828653880837, + "flos": 25375270204800.0, + "grad_norm": 1.848366953202521, + "language_loss": 0.78939283, + "learning_rate": 2.3257108874396396e-06, + "loss": 0.81140685, + "num_input_tokens_seen": 83109015, + "step": 3863, + "time_per_iteration": 2.548980951309204 + }, + { + "auxiliary_loss_clip": 0.01156101, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.04931307, + "balance_loss_mlp": 1.02483368, + "epoch": 0.4646185294294475, + "flos": 16034330574720.0, + "grad_norm": 2.153601497035227, + "language_loss": 0.73924994, + "learning_rate": 2.3249422904248152e-06, + "loss": 0.76114833, + "num_input_tokens_seen": 83127450, + "step": 3864, + "time_per_iteration": 2.5068795680999756 + }, + { + "auxiliary_loss_clip": 0.01174482, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.05223489, + "balance_loss_mlp": 1.0248276, + "epoch": 0.4647387723200866, + "flos": 26363101109760.0, + "grad_norm": 1.4918497648869953, + "language_loss": 0.87269819, + "learning_rate": 2.324173644116504e-06, + "loss": 0.89476919, + "num_input_tokens_seen": 83150300, + "step": 3865, + "time_per_iteration": 2.562601089477539 + }, + { + "auxiliary_loss_clip": 0.01170142, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.05535269, + "balance_loss_mlp": 1.01910853, + "epoch": 0.46485901521072565, + "flos": 27160209774720.0, + "grad_norm": 1.709879478249082, + "language_loss": 0.81414068, + "learning_rate": 2.3234049486313087e-06, + "loss": 0.83611107, + "num_input_tokens_seen": 83171750, + "step": 3866, + "time_per_iteration": 2.538662910461426 + }, + { + "auxiliary_loss_clip": 0.01172113, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.05490255, + "balance_loss_mlp": 1.02075338, + "epoch": 0.46497925810136476, + "flos": 24279851088000.0, + "grad_norm": 2.2113046324021526, + "language_loss": 0.75516474, + "learning_rate": 2.322636204085839e-06, + "loss": 0.77717173, + "num_input_tokens_seen": 83191820, + "step": 3867, + "time_per_iteration": 2.543057680130005 + }, + { + "auxiliary_loss_clip": 0.01149323, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.04865181, + "balance_loss_mlp": 1.02472973, + "epoch": 0.46509950099200387, + "flos": 16253134272000.0, + "grad_norm": 2.284936534898148, + "language_loss": 0.78752649, + "learning_rate": 2.3218674105967143e-06, + "loss": 0.80935472, + "num_input_tokens_seen": 83210085, + "step": 3868, + "time_per_iteration": 2.4971303939819336 + }, + { + "auxiliary_loss_clip": 0.01151488, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.0492537, + "balance_loss_mlp": 1.01962566, + "epoch": 0.4652197438826429, + "flos": 23442270773760.0, + "grad_norm": 1.871680505004283, + "language_loss": 0.83513993, + "learning_rate": 2.3210985682805593e-06, + "loss": 0.85693514, + "num_input_tokens_seen": 83231865, + "step": 3869, + "time_per_iteration": 2.5768353939056396 + }, + { + "auxiliary_loss_clip": 0.01189086, + "auxiliary_loss_mlp": 0.01027205, + "balance_loss_clip": 1.05702257, + "balance_loss_mlp": 1.01889634, + "epoch": 0.46533998677328203, + "flos": 16216397637120.0, + "grad_norm": 2.417363521594835, + "language_loss": 0.68301749, + "learning_rate": 2.320329677254007e-06, + "loss": 0.70518041, + "num_input_tokens_seen": 83249195, + "step": 3870, + "time_per_iteration": 2.4394614696502686 + }, + { + "auxiliary_loss_clip": 0.01186249, + "auxiliary_loss_mlp": 0.01027772, + "balance_loss_clip": 1.0546155, + "balance_loss_mlp": 1.01905227, + "epoch": 0.46546022966392114, + "flos": 21141869080320.0, + "grad_norm": 2.0865214265587766, + "language_loss": 0.72546327, + "learning_rate": 2.319560737633697e-06, + "loss": 0.74760348, + "num_input_tokens_seen": 83267915, + "step": 3871, + "time_per_iteration": 2.449795722961426 + }, + { + "auxiliary_loss_clip": 0.01148312, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.04806411, + "balance_loss_mlp": 1.02163458, + "epoch": 0.4655804725545602, + "flos": 41171942442240.0, + "grad_norm": 1.5838451795654485, + "language_loss": 0.67827994, + "learning_rate": 2.3187917495362775e-06, + "loss": 0.70006859, + "num_input_tokens_seen": 83292325, + "step": 3872, + "time_per_iteration": 2.7101778984069824 + }, + { + "auxiliary_loss_clip": 0.01129707, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.05009222, + "balance_loss_mlp": 1.02684188, + "epoch": 0.4657007154451993, + "flos": 19570956698880.0, + "grad_norm": 3.0492821605503897, + "language_loss": 0.76949644, + "learning_rate": 2.318022713078403e-06, + "loss": 0.79115152, + "num_input_tokens_seen": 83306905, + "step": 3873, + "time_per_iteration": 2.5658650398254395 + }, + { + "auxiliary_loss_clip": 0.01156496, + "auxiliary_loss_mlp": 0.01034278, + "balance_loss_clip": 1.05094934, + "balance_loss_mlp": 1.02545655, + "epoch": 0.4658209583358384, + "flos": 15517826956800.0, + "grad_norm": 7.959295894973777, + "language_loss": 0.85449076, + "learning_rate": 2.3172536283767354e-06, + "loss": 0.8763985, + "num_input_tokens_seen": 83320665, + "step": 3874, + "time_per_iteration": 2.49249005317688 + }, + { + "auxiliary_loss_clip": 0.01140444, + "auxiliary_loss_mlp": 0.01026205, + "balance_loss_clip": 1.05053306, + "balance_loss_mlp": 1.01672173, + "epoch": 0.4659412012264775, + "flos": 14903180403840.0, + "grad_norm": 2.1812579923216275, + "language_loss": 0.80921459, + "learning_rate": 2.3164844955479447e-06, + "loss": 0.83088106, + "num_input_tokens_seen": 83336475, + "step": 3875, + "time_per_iteration": 2.5767595767974854 + }, + { + "auxiliary_loss_clip": 0.01139385, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.05045617, + "balance_loss_mlp": 1.02116919, + "epoch": 0.4660614441171166, + "flos": 24425612478720.0, + "grad_norm": 1.8121579205679483, + "language_loss": 0.70559531, + "learning_rate": 2.3157153147087082e-06, + "loss": 0.72729242, + "num_input_tokens_seen": 83358365, + "step": 3876, + "time_per_iteration": 2.6223678588867188 + }, + { + "auxiliary_loss_clip": 0.01139346, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.05302751, + "balance_loss_mlp": 1.02107739, + "epoch": 0.46618168700775564, + "flos": 22091095843200.0, + "grad_norm": 1.7908368288243544, + "language_loss": 0.83006161, + "learning_rate": 2.314946085975709e-06, + "loss": 0.85175174, + "num_input_tokens_seen": 83377345, + "step": 3877, + "time_per_iteration": 2.5895214080810547 + }, + { + "auxiliary_loss_clip": 0.01134228, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.05097318, + "balance_loss_mlp": 1.02159441, + "epoch": 0.46630192989839475, + "flos": 26176975810560.0, + "grad_norm": 1.7978554297864546, + "language_loss": 0.82365751, + "learning_rate": 2.3141768094656393e-06, + "loss": 0.84530187, + "num_input_tokens_seen": 83395920, + "step": 3878, + "time_per_iteration": 2.5838611125946045 + }, + { + "auxiliary_loss_clip": 0.01108871, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.04392815, + "balance_loss_mlp": 1.01973426, + "epoch": 0.46642217278903386, + "flos": 11509622150400.0, + "grad_norm": 2.3285181806341964, + "language_loss": 0.82736552, + "learning_rate": 2.3134074852951966e-06, + "loss": 0.84873295, + "num_input_tokens_seen": 83412510, + "step": 3879, + "time_per_iteration": 2.6894168853759766 + }, + { + "auxiliary_loss_clip": 0.01121712, + "auxiliary_loss_mlp": 0.01030374, + "balance_loss_clip": 1.04368722, + "balance_loss_mlp": 1.02190447, + "epoch": 0.4665424156796729, + "flos": 32306819299200.0, + "grad_norm": 1.6274174052701096, + "language_loss": 0.77357167, + "learning_rate": 2.312638113581088e-06, + "loss": 0.79509258, + "num_input_tokens_seen": 83432995, + "step": 3880, + "time_per_iteration": 2.9000189304351807 + }, + { + "auxiliary_loss_clip": 0.01171068, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.05055809, + "balance_loss_mlp": 1.01908445, + "epoch": 0.46666265857031203, + "flos": 18436179254400.0, + "grad_norm": 3.041909533035715, + "language_loss": 0.78398108, + "learning_rate": 2.311868694440027e-06, + "loss": 0.80596709, + "num_input_tokens_seen": 83447415, + "step": 3881, + "time_per_iteration": 2.4504470825195312 + }, + { + "auxiliary_loss_clip": 0.0108498, + "auxiliary_loss_mlp": 0.01001534, + "balance_loss_clip": 1.01908886, + "balance_loss_mlp": 1.000175, + "epoch": 0.46678290146095114, + "flos": 68438989221120.0, + "grad_norm": 0.7338529661476942, + "language_loss": 0.62493116, + "learning_rate": 2.3110992279887323e-06, + "loss": 0.6457963, + "num_input_tokens_seen": 83519340, + "step": 3882, + "time_per_iteration": 3.235837697982788 + }, + { + "auxiliary_loss_clip": 0.01150537, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.05301833, + "balance_loss_mlp": 1.02094316, + "epoch": 0.4669031443515902, + "flos": 17712507945600.0, + "grad_norm": 3.179798087679495, + "language_loss": 0.85020196, + "learning_rate": 2.310329714343932e-06, + "loss": 0.87200677, + "num_input_tokens_seen": 83535490, + "step": 3883, + "time_per_iteration": 2.5507760047912598 + }, + { + "auxiliary_loss_clip": 0.01152602, + "auxiliary_loss_mlp": 0.01024288, + "balance_loss_clip": 1.05005252, + "balance_loss_mlp": 1.01617599, + "epoch": 0.4670233872422293, + "flos": 23947748916480.0, + "grad_norm": 1.900319155565472, + "language_loss": 0.81793249, + "learning_rate": 2.309560153622361e-06, + "loss": 0.83970141, + "num_input_tokens_seen": 83552400, + "step": 3884, + "time_per_iteration": 4.289768934249878 + }, + { + "auxiliary_loss_clip": 0.01145725, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.05218196, + "balance_loss_mlp": 1.01942003, + "epoch": 0.4671436301328684, + "flos": 28111268131200.0, + "grad_norm": 2.251722940900377, + "language_loss": 0.74693477, + "learning_rate": 2.3087905459407602e-06, + "loss": 0.76867759, + "num_input_tokens_seen": 83571340, + "step": 3885, + "time_per_iteration": 3.4185678958892822 + }, + { + "auxiliary_loss_clip": 0.01074, + "auxiliary_loss_mlp": 0.01001427, + "balance_loss_clip": 1.01708817, + "balance_loss_mlp": 1.00015116, + "epoch": 0.46726387302350747, + "flos": 69369684566400.0, + "grad_norm": 0.7972510719336078, + "language_loss": 0.62918979, + "learning_rate": 2.3080208914158795e-06, + "loss": 0.64994407, + "num_input_tokens_seen": 83634340, + "step": 3886, + "time_per_iteration": 3.811882734298706 + }, + { + "auxiliary_loss_clip": 0.01158347, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.05453098, + "balance_loss_mlp": 1.017838, + "epoch": 0.4673841159141466, + "flos": 25519666878720.0, + "grad_norm": 3.1608940328000252, + "language_loss": 0.72071838, + "learning_rate": 2.3072511901644753e-06, + "loss": 0.74256116, + "num_input_tokens_seen": 83653410, + "step": 3887, + "time_per_iteration": 2.5506579875946045 + }, + { + "auxiliary_loss_clip": 0.01185384, + "auxiliary_loss_mlp": 0.01025243, + "balance_loss_clip": 1.05517483, + "balance_loss_mlp": 1.01710093, + "epoch": 0.4675043588047857, + "flos": 24499265316480.0, + "grad_norm": 2.034609953320813, + "language_loss": 0.80788273, + "learning_rate": 2.306481442303309e-06, + "loss": 0.82998896, + "num_input_tokens_seen": 83672985, + "step": 3888, + "time_per_iteration": 2.4789021015167236 + }, + { + "auxiliary_loss_clip": 0.01174656, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.0525856, + "balance_loss_mlp": 1.0209738, + "epoch": 0.46762460169542475, + "flos": 20960771685120.0, + "grad_norm": 1.9315849591042826, + "language_loss": 0.73343223, + "learning_rate": 2.3057116479491515e-06, + "loss": 0.75547731, + "num_input_tokens_seen": 83692395, + "step": 3889, + "time_per_iteration": 2.526035785675049 + }, + { + "auxiliary_loss_clip": 0.01164991, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.0481739, + "balance_loss_mlp": 1.02063847, + "epoch": 0.46774484458606386, + "flos": 19171666137600.0, + "grad_norm": 2.076559305821489, + "language_loss": 0.75986159, + "learning_rate": 2.30494180721878e-06, + "loss": 0.78179979, + "num_input_tokens_seen": 83709735, + "step": 3890, + "time_per_iteration": 2.4608423709869385 + }, + { + "auxiliary_loss_clip": 0.01169565, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.05119276, + "balance_loss_mlp": 1.02541518, + "epoch": 0.4678650874767029, + "flos": 17967689141760.0, + "grad_norm": 3.0075894180971248, + "language_loss": 0.89972115, + "learning_rate": 2.3041719202289794e-06, + "loss": 0.92175144, + "num_input_tokens_seen": 83725910, + "step": 3891, + "time_per_iteration": 2.469712972640991 + }, + { + "auxiliary_loss_clip": 0.01170958, + "auxiliary_loss_mlp": 0.01031105, + "balance_loss_clip": 1.0528276, + "balance_loss_mlp": 1.02300143, + "epoch": 0.467985330367342, + "flos": 21360816432000.0, + "grad_norm": 1.895692616418478, + "language_loss": 0.80343759, + "learning_rate": 2.30340198709654e-06, + "loss": 0.82545823, + "num_input_tokens_seen": 83745745, + "step": 3892, + "time_per_iteration": 2.4777379035949707 + }, + { + "auxiliary_loss_clip": 0.01161481, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.04940438, + "balance_loss_mlp": 1.02201962, + "epoch": 0.46810557325798113, + "flos": 20521835487360.0, + "grad_norm": 4.384827110951784, + "language_loss": 0.74272597, + "learning_rate": 2.3026320079382605e-06, + "loss": 0.76464307, + "num_input_tokens_seen": 83762680, + "step": 3893, + "time_per_iteration": 2.495185613632202 + }, + { + "auxiliary_loss_clip": 0.01184089, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.0542779, + "balance_loss_mlp": 1.02023089, + "epoch": 0.4682258161486202, + "flos": 30117848572800.0, + "grad_norm": 2.3991734261358784, + "language_loss": 0.76039088, + "learning_rate": 2.3018619828709454e-06, + "loss": 0.78251958, + "num_input_tokens_seen": 83784220, + "step": 3894, + "time_per_iteration": 2.5153274536132812 + }, + { + "auxiliary_loss_clip": 0.01169391, + "auxiliary_loss_mlp": 0.00763308, + "balance_loss_clip": 1.05571663, + "balance_loss_mlp": 1.00106668, + "epoch": 0.4683460590392593, + "flos": 25293357239040.0, + "grad_norm": 2.3952997040502537, + "language_loss": 0.82023036, + "learning_rate": 2.3010919120114084e-06, + "loss": 0.83955735, + "num_input_tokens_seen": 83800750, + "step": 3895, + "time_per_iteration": 2.4955766201019287 + }, + { + "auxiliary_loss_clip": 0.01165504, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.0463239, + "balance_loss_mlp": 1.02425754, + "epoch": 0.4684663019298984, + "flos": 15368330551680.0, + "grad_norm": 2.136206212658361, + "language_loss": 0.65906143, + "learning_rate": 2.3003217954764672e-06, + "loss": 0.68104112, + "num_input_tokens_seen": 83815455, + "step": 3896, + "time_per_iteration": 2.4174294471740723 + }, + { + "auxiliary_loss_clip": 0.01171501, + "auxiliary_loss_mlp": 0.01024956, + "balance_loss_clip": 1.04816544, + "balance_loss_mlp": 1.01647973, + "epoch": 0.46858654482053747, + "flos": 27778842737280.0, + "grad_norm": 2.0195488076185955, + "language_loss": 0.79432738, + "learning_rate": 2.299551633382949e-06, + "loss": 0.81629193, + "num_input_tokens_seen": 83835765, + "step": 3897, + "time_per_iteration": 2.537306547164917 + }, + { + "auxiliary_loss_clip": 0.01150222, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.04940403, + "balance_loss_mlp": 1.02134013, + "epoch": 0.4687067877111766, + "flos": 18040623707520.0, + "grad_norm": 1.960588216800312, + "language_loss": 0.8532356, + "learning_rate": 2.2987814258476854e-06, + "loss": 0.87503672, + "num_input_tokens_seen": 83853565, + "step": 3898, + "time_per_iteration": 2.501908779144287 + }, + { + "auxiliary_loss_clip": 0.0112911, + "auxiliary_loss_mlp": 0.01025405, + "balance_loss_clip": 1.04418039, + "balance_loss_mlp": 1.01672077, + "epoch": 0.4688270306018157, + "flos": 16977380198400.0, + "grad_norm": 2.7094441251910237, + "language_loss": 0.67999649, + "learning_rate": 2.2980111729875177e-06, + "loss": 0.7015416, + "num_input_tokens_seen": 83869815, + "step": 3899, + "time_per_iteration": 2.5614476203918457 + }, + { + "auxiliary_loss_clip": 0.01151599, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.05172729, + "balance_loss_mlp": 1.02974701, + "epoch": 0.46894727349245474, + "flos": 17821640442240.0, + "grad_norm": 1.7252171698950065, + "language_loss": 0.82025468, + "learning_rate": 2.2972408749192917e-06, + "loss": 0.84215081, + "num_input_tokens_seen": 83887545, + "step": 3900, + "time_per_iteration": 2.489886999130249 + }, + { + "auxiliary_loss_clip": 0.01169317, + "auxiliary_loss_mlp": 0.00762577, + "balance_loss_clip": 1.05313408, + "balance_loss_mlp": 1.00103319, + "epoch": 0.46906751638309385, + "flos": 21471349559040.0, + "grad_norm": 1.7193928955795308, + "language_loss": 0.66897368, + "learning_rate": 2.296470531759861e-06, + "loss": 0.68829262, + "num_input_tokens_seen": 83905645, + "step": 3901, + "time_per_iteration": 2.4887218475341797 + }, + { + "auxiliary_loss_clip": 0.01137927, + "auxiliary_loss_mlp": 0.01025419, + "balance_loss_clip": 1.04659367, + "balance_loss_mlp": 1.01616287, + "epoch": 0.46918775927373296, + "flos": 20337829090560.0, + "grad_norm": 1.8675888067048279, + "language_loss": 0.79181206, + "learning_rate": 2.2957001436260866e-06, + "loss": 0.81344557, + "num_input_tokens_seen": 83922705, + "step": 3902, + "time_per_iteration": 2.5314321517944336 + }, + { + "auxiliary_loss_clip": 0.01153431, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.05017543, + "balance_loss_mlp": 1.02441001, + "epoch": 0.469308002164372, + "flos": 18403249461120.0, + "grad_norm": 1.5344949605748113, + "language_loss": 0.72974038, + "learning_rate": 2.294929710634836e-06, + "loss": 0.75160229, + "num_input_tokens_seen": 83940795, + "step": 3903, + "time_per_iteration": 2.49879789352417 + }, + { + "auxiliary_loss_clip": 0.01167978, + "auxiliary_loss_mlp": 0.01033518, + "balance_loss_clip": 1.0490737, + "balance_loss_mlp": 1.02467847, + "epoch": 0.46942824505501113, + "flos": 37962067363200.0, + "grad_norm": 2.4228145675920993, + "language_loss": 0.61010426, + "learning_rate": 2.2941592329029823e-06, + "loss": 0.63211924, + "num_input_tokens_seen": 83961900, + "step": 3904, + "time_per_iteration": 2.620600938796997 + }, + { + "auxiliary_loss_clip": 0.01166282, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.05073273, + "balance_loss_mlp": 1.02318537, + "epoch": 0.46954848794565024, + "flos": 21872507627520.0, + "grad_norm": 1.8394657448428398, + "language_loss": 0.79107463, + "learning_rate": 2.2933887105474067e-06, + "loss": 0.81305867, + "num_input_tokens_seen": 83980075, + "step": 3905, + "time_per_iteration": 2.4907572269439697 + }, + { + "auxiliary_loss_clip": 0.01167607, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.05282903, + "balance_loss_mlp": 1.02252698, + "epoch": 0.4696687308362893, + "flos": 22016545165440.0, + "grad_norm": 1.635301465675978, + "language_loss": 0.8144666, + "learning_rate": 2.2926181436849974e-06, + "loss": 0.83645046, + "num_input_tokens_seen": 83999430, + "step": 3906, + "time_per_iteration": 2.497793674468994 + }, + { + "auxiliary_loss_clip": 0.01170028, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.05334401, + "balance_loss_mlp": 1.02118933, + "epoch": 0.4697889737269284, + "flos": 21613663244160.0, + "grad_norm": 1.574899707055541, + "language_loss": 0.72447211, + "learning_rate": 2.2918475324326478e-06, + "loss": 0.7464751, + "num_input_tokens_seen": 84019150, + "step": 3907, + "time_per_iteration": 2.50534987449646 + }, + { + "auxiliary_loss_clip": 0.01173731, + "auxiliary_loss_mlp": 0.00763309, + "balance_loss_clip": 1.05338383, + "balance_loss_mlp": 1.00102258, + "epoch": 0.46990921661756746, + "flos": 25228323665280.0, + "grad_norm": 2.7635634026479208, + "language_loss": 0.91661859, + "learning_rate": 2.2910768769072603e-06, + "loss": 0.93598914, + "num_input_tokens_seen": 84037930, + "step": 3908, + "time_per_iteration": 2.514024496078491 + }, + { + "auxiliary_loss_clip": 0.01163906, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.05005538, + "balance_loss_mlp": 1.02351999, + "epoch": 0.47002945950820657, + "flos": 13844031045120.0, + "grad_norm": 1.9377127653444945, + "language_loss": 0.76173598, + "learning_rate": 2.2903061772257417e-06, + "loss": 0.78369355, + "num_input_tokens_seen": 84055915, + "step": 3909, + "time_per_iteration": 2.4838593006134033 + }, + { + "auxiliary_loss_clip": 0.01168662, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.05136919, + "balance_loss_mlp": 1.02204275, + "epoch": 0.4701497023988457, + "flos": 26247001374720.0, + "grad_norm": 1.575958110293281, + "language_loss": 0.78513384, + "learning_rate": 2.289535433505007e-06, + "loss": 0.80712599, + "num_input_tokens_seen": 84077270, + "step": 3910, + "time_per_iteration": 3.3169257640838623 + }, + { + "auxiliary_loss_clip": 0.01158989, + "auxiliary_loss_mlp": 0.01026273, + "balance_loss_clip": 1.05011749, + "balance_loss_mlp": 1.0178926, + "epoch": 0.47026994528948474, + "flos": 25629517647360.0, + "grad_norm": 1.9077784592721239, + "language_loss": 0.63906217, + "learning_rate": 2.2887646458619767e-06, + "loss": 0.66091478, + "num_input_tokens_seen": 84098635, + "step": 3911, + "time_per_iteration": 3.3754525184631348 + }, + { + "auxiliary_loss_clip": 0.01150778, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.05168891, + "balance_loss_mlp": 1.02506566, + "epoch": 0.47039018818012385, + "flos": 20554406144640.0, + "grad_norm": 1.8978296820455465, + "language_loss": 0.76459676, + "learning_rate": 2.2879938144135797e-06, + "loss": 0.7864455, + "num_input_tokens_seen": 84114740, + "step": 3912, + "time_per_iteration": 3.3869776725769043 + }, + { + "auxiliary_loss_clip": 0.01140374, + "auxiliary_loss_mlp": 0.00762677, + "balance_loss_clip": 1.04814339, + "balance_loss_mlp": 1.00101268, + "epoch": 0.47051043107076296, + "flos": 21577249831680.0, + "grad_norm": 2.2069767899920434, + "language_loss": 0.74725664, + "learning_rate": 2.2872229392767496e-06, + "loss": 0.76628709, + "num_input_tokens_seen": 84134845, + "step": 3913, + "time_per_iteration": 2.549499034881592 + }, + { + "auxiliary_loss_clip": 0.01175046, + "auxiliary_loss_mlp": 0.01030552, + "balance_loss_clip": 1.05335689, + "balance_loss_mlp": 1.02249336, + "epoch": 0.470630673961402, + "flos": 18953185662720.0, + "grad_norm": 1.6064915662151849, + "language_loss": 0.74691296, + "learning_rate": 2.286452020568428e-06, + "loss": 0.76896894, + "num_input_tokens_seen": 84152920, + "step": 3914, + "time_per_iteration": 2.468733310699463 + }, + { + "auxiliary_loss_clip": 0.01191018, + "auxiliary_loss_mlp": 0.01030126, + "balance_loss_clip": 1.05506182, + "balance_loss_mlp": 1.02080989, + "epoch": 0.4707509168520411, + "flos": 19938969492480.0, + "grad_norm": 2.419183401308264, + "language_loss": 0.7290802, + "learning_rate": 2.2856810584055637e-06, + "loss": 0.75129163, + "num_input_tokens_seen": 84170455, + "step": 3915, + "time_per_iteration": 2.4494223594665527 + }, + { + "auxiliary_loss_clip": 0.01170658, + "auxiliary_loss_mlp": 0.01023842, + "balance_loss_clip": 1.0511353, + "balance_loss_mlp": 1.01551509, + "epoch": 0.47087115974268023, + "flos": 40118754741120.0, + "grad_norm": 1.5696334523141082, + "language_loss": 0.67756897, + "learning_rate": 2.2849100529051085e-06, + "loss": 0.69951397, + "num_input_tokens_seen": 84197390, + "step": 3916, + "time_per_iteration": 2.7078371047973633 + }, + { + "auxiliary_loss_clip": 0.01184092, + "auxiliary_loss_mlp": 0.01030048, + "balance_loss_clip": 1.05436039, + "balance_loss_mlp": 1.02124405, + "epoch": 0.4709914026333193, + "flos": 13552723745280.0, + "grad_norm": 2.5470852474807635, + "language_loss": 0.79886806, + "learning_rate": 2.284139004184026e-06, + "loss": 0.82100946, + "num_input_tokens_seen": 84214620, + "step": 3917, + "time_per_iteration": 2.400747537612915 + }, + { + "auxiliary_loss_clip": 0.01187525, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.05415583, + "balance_loss_mlp": 1.01850033, + "epoch": 0.4711116455239584, + "flos": 19974628719360.0, + "grad_norm": 2.1084345253755554, + "language_loss": 0.74239242, + "learning_rate": 2.2833679123592814e-06, + "loss": 0.76453853, + "num_input_tokens_seen": 84231880, + "step": 3918, + "time_per_iteration": 2.414987564086914 + }, + { + "auxiliary_loss_clip": 0.0115497, + "auxiliary_loss_mlp": 0.01028541, + "balance_loss_clip": 1.05006599, + "balance_loss_mlp": 1.01921868, + "epoch": 0.4712318884145975, + "flos": 32124824064000.0, + "grad_norm": 1.7107402432613679, + "language_loss": 0.6352573, + "learning_rate": 2.2825967775478508e-06, + "loss": 0.65709245, + "num_input_tokens_seen": 84252980, + "step": 3919, + "time_per_iteration": 2.581728458404541 + }, + { + "auxiliary_loss_clip": 0.01182655, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.05057836, + "balance_loss_mlp": 1.01976848, + "epoch": 0.47135213130523657, + "flos": 20047850593920.0, + "grad_norm": 2.258040518178833, + "language_loss": 0.83380151, + "learning_rate": 2.2818255998667135e-06, + "loss": 0.8559109, + "num_input_tokens_seen": 84271490, + "step": 3920, + "time_per_iteration": 2.43392276763916 + }, + { + "auxiliary_loss_clip": 0.01170339, + "auxiliary_loss_mlp": 0.01026077, + "balance_loss_clip": 1.0532428, + "balance_loss_mlp": 1.01826525, + "epoch": 0.4714723741958757, + "flos": 19426990988160.0, + "grad_norm": 1.7770146691330486, + "language_loss": 0.78677619, + "learning_rate": 2.2810543794328566e-06, + "loss": 0.80874026, + "num_input_tokens_seen": 84290525, + "step": 3921, + "time_per_iteration": 2.4521031379699707 + }, + { + "auxiliary_loss_clip": 0.01174211, + "auxiliary_loss_mlp": 0.01034197, + "balance_loss_clip": 1.05231142, + "balance_loss_mlp": 1.02591228, + "epoch": 0.4715926170865148, + "flos": 20373883367040.0, + "grad_norm": 1.7501721663822023, + "language_loss": 0.82475066, + "learning_rate": 2.2802831163632735e-06, + "loss": 0.84683472, + "num_input_tokens_seen": 84309245, + "step": 3922, + "time_per_iteration": 2.485379934310913 + }, + { + "auxiliary_loss_clip": 0.011169, + "auxiliary_loss_mlp": 0.01028877, + "balance_loss_clip": 1.04796648, + "balance_loss_mlp": 1.01997852, + "epoch": 0.47171285997715384, + "flos": 22672884430080.0, + "grad_norm": 1.6905538417180093, + "language_loss": 0.74326032, + "learning_rate": 2.279511810774965e-06, + "loss": 0.76471806, + "num_input_tokens_seen": 84330775, + "step": 3923, + "time_per_iteration": 2.6111972332000732 + }, + { + "auxiliary_loss_clip": 0.01186459, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.05418921, + "balance_loss_mlp": 1.02056015, + "epoch": 0.47183310286779295, + "flos": 21105419754240.0, + "grad_norm": 1.9837447210026333, + "language_loss": 0.7156443, + "learning_rate": 2.2787404627849364e-06, + "loss": 0.73779815, + "num_input_tokens_seen": 84349985, + "step": 3924, + "time_per_iteration": 2.5492537021636963 + }, + { + "auxiliary_loss_clip": 0.01154776, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.0494715, + "balance_loss_mlp": 1.02231681, + "epoch": 0.471953345758432, + "flos": 21726566668800.0, + "grad_norm": 1.69589166744865, + "language_loss": 0.79572678, + "learning_rate": 2.277969072510202e-06, + "loss": 0.8175813, + "num_input_tokens_seen": 84368965, + "step": 3925, + "time_per_iteration": 2.5521223545074463 + }, + { + "auxiliary_loss_clip": 0.01156737, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.05131102, + "balance_loss_mlp": 1.01868176, + "epoch": 0.4720735886490711, + "flos": 19861078849920.0, + "grad_norm": 1.6273029777248258, + "language_loss": 0.8126632, + "learning_rate": 2.2771976400677803e-06, + "loss": 0.83450001, + "num_input_tokens_seen": 84387795, + "step": 3926, + "time_per_iteration": 2.5193252563476562 + }, + { + "auxiliary_loss_clip": 0.01117866, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.04536724, + "balance_loss_mlp": 1.01874971, + "epoch": 0.47219383153971023, + "flos": 19171809792000.0, + "grad_norm": 2.1953479796271447, + "language_loss": 0.7916587, + "learning_rate": 2.2764261655746965e-06, + "loss": 0.81310749, + "num_input_tokens_seen": 84405290, + "step": 3927, + "time_per_iteration": 2.5929343700408936 + }, + { + "auxiliary_loss_clip": 0.0113954, + "auxiliary_loss_mlp": 0.01025576, + "balance_loss_clip": 1.04811883, + "balance_loss_mlp": 1.0168674, + "epoch": 0.4723140744303493, + "flos": 23224005780480.0, + "grad_norm": 1.8736410799977397, + "language_loss": 0.75723279, + "learning_rate": 2.2756546491479832e-06, + "loss": 0.77888393, + "num_input_tokens_seen": 84426205, + "step": 3928, + "time_per_iteration": 2.5752274990081787 + }, + { + "auxiliary_loss_clip": 0.0118508, + "auxiliary_loss_mlp": 0.00763182, + "balance_loss_clip": 1.05216146, + "balance_loss_mlp": 1.00108087, + "epoch": 0.4724343173209884, + "flos": 18223265387520.0, + "grad_norm": 3.899181336011399, + "language_loss": 0.8058939, + "learning_rate": 2.274883090904679e-06, + "loss": 0.82537651, + "num_input_tokens_seen": 84443970, + "step": 3929, + "time_per_iteration": 2.438016891479492 + }, + { + "auxiliary_loss_clip": 0.01189375, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.05714035, + "balance_loss_mlp": 1.02057445, + "epoch": 0.4725545602116275, + "flos": 21251037490560.0, + "grad_norm": 2.281988346801082, + "language_loss": 0.67641485, + "learning_rate": 2.2741114909618283e-06, + "loss": 0.69859904, + "num_input_tokens_seen": 84459865, + "step": 3930, + "time_per_iteration": 2.46445894241333 + }, + { + "auxiliary_loss_clip": 0.01144584, + "auxiliary_loss_mlp": 0.0102409, + "balance_loss_clip": 1.0500927, + "balance_loss_mlp": 1.01573944, + "epoch": 0.47267480310226656, + "flos": 21434002392960.0, + "grad_norm": 2.47730353090478, + "language_loss": 0.71948814, + "learning_rate": 2.2733398494364828e-06, + "loss": 0.74117482, + "num_input_tokens_seen": 84479110, + "step": 3931, + "time_per_iteration": 2.5802714824676514 + }, + { + "auxiliary_loss_clip": 0.01150273, + "auxiliary_loss_mlp": 0.01026756, + "balance_loss_clip": 1.05179369, + "balance_loss_mlp": 1.0186615, + "epoch": 0.47279504599290567, + "flos": 18770508069120.0, + "grad_norm": 2.1085040113380975, + "language_loss": 0.84302682, + "learning_rate": 2.272568166445699e-06, + "loss": 0.86479712, + "num_input_tokens_seen": 84497675, + "step": 3932, + "time_per_iteration": 2.5071794986724854 + }, + { + "auxiliary_loss_clip": 0.01172175, + "auxiliary_loss_mlp": 0.0102275, + "balance_loss_clip": 1.05157852, + "balance_loss_mlp": 1.01406574, + "epoch": 0.4729152888835448, + "flos": 21105742976640.0, + "grad_norm": 2.457968920991937, + "language_loss": 0.64108062, + "learning_rate": 2.271796442106541e-06, + "loss": 0.66302985, + "num_input_tokens_seen": 84517030, + "step": 3933, + "time_per_iteration": 2.505333662033081 + }, + { + "auxiliary_loss_clip": 0.01043746, + "auxiliary_loss_mlp": 0.01003998, + "balance_loss_clip": 1.012779, + "balance_loss_mlp": 1.0027045, + "epoch": 0.47303553177418384, + "flos": 70201877840640.0, + "grad_norm": 0.8059211126205432, + "language_loss": 0.56445384, + "learning_rate": 2.271024676536079e-06, + "loss": 0.58493125, + "num_input_tokens_seen": 84577290, + "step": 3934, + "time_per_iteration": 3.074902057647705 + }, + { + "auxiliary_loss_clip": 0.01165166, + "auxiliary_loss_mlp": 0.01028383, + "balance_loss_clip": 1.05659449, + "balance_loss_mlp": 1.01907873, + "epoch": 0.47315577466482295, + "flos": 22455122227200.0, + "grad_norm": 1.901255476163418, + "language_loss": 0.73186195, + "learning_rate": 2.2702528698513894e-06, + "loss": 0.75379741, + "num_input_tokens_seen": 84598415, + "step": 3935, + "time_per_iteration": 2.5602760314941406 + }, + { + "auxiliary_loss_clip": 0.01157117, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.04856133, + "balance_loss_mlp": 1.02190602, + "epoch": 0.47327601755546206, + "flos": 24352857480960.0, + "grad_norm": 1.8904608004103072, + "language_loss": 0.78845137, + "learning_rate": 2.269481022169554e-06, + "loss": 0.81032604, + "num_input_tokens_seen": 84617010, + "step": 3936, + "time_per_iteration": 2.5390491485595703 + }, + { + "auxiliary_loss_clip": 0.01163495, + "auxiliary_loss_mlp": 0.01025583, + "balance_loss_clip": 1.04992616, + "balance_loss_mlp": 1.01672006, + "epoch": 0.4733962604461011, + "flos": 22926772736640.0, + "grad_norm": 1.8695599366049065, + "language_loss": 0.80280679, + "learning_rate": 2.2687091336076614e-06, + "loss": 0.82469761, + "num_input_tokens_seen": 84636350, + "step": 3937, + "time_per_iteration": 3.264953851699829 + }, + { + "auxiliary_loss_clip": 0.01170563, + "auxiliary_loss_mlp": 0.01032694, + "balance_loss_clip": 1.05280066, + "balance_loss_mlp": 1.02442694, + "epoch": 0.4735165033367402, + "flos": 18327369980160.0, + "grad_norm": 2.264712176014261, + "language_loss": 0.79939836, + "learning_rate": 2.267937204282807e-06, + "loss": 0.82143092, + "num_input_tokens_seen": 84653490, + "step": 3938, + "time_per_iteration": 3.2362310886383057 + }, + { + "auxiliary_loss_clip": 0.01179576, + "auxiliary_loss_mlp": 0.01029712, + "balance_loss_clip": 1.05523002, + "balance_loss_mlp": 1.02086067, + "epoch": 0.4736367462273793, + "flos": 23037018554880.0, + "grad_norm": 2.8256280088376213, + "language_loss": 0.78809446, + "learning_rate": 2.2671652343120926e-06, + "loss": 0.81018734, + "num_input_tokens_seen": 84673965, + "step": 3939, + "time_per_iteration": 4.2765889167785645 + }, + { + "auxiliary_loss_clip": 0.0118628, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.0556339, + "balance_loss_mlp": 1.02182126, + "epoch": 0.4737569891180184, + "flos": 25374336451200.0, + "grad_norm": 2.1970691620524954, + "language_loss": 0.80469441, + "learning_rate": 2.2663932238126236e-06, + "loss": 0.82685691, + "num_input_tokens_seen": 84692525, + "step": 3940, + "time_per_iteration": 2.470597267150879 + }, + { + "auxiliary_loss_clip": 0.01171708, + "auxiliary_loss_mlp": 0.01025165, + "balance_loss_clip": 1.05112755, + "balance_loss_mlp": 1.01659441, + "epoch": 0.4738772320086575, + "flos": 25849326925440.0, + "grad_norm": 1.4047547658902153, + "language_loss": 0.80069321, + "learning_rate": 2.265621172901515e-06, + "loss": 0.82266188, + "num_input_tokens_seen": 84715640, + "step": 3941, + "time_per_iteration": 2.615795612335205 + }, + { + "auxiliary_loss_clip": 0.01190723, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.05870438, + "balance_loss_mlp": 1.02539599, + "epoch": 0.47399747489929656, + "flos": 27564420499200.0, + "grad_norm": 3.1809830214006136, + "language_loss": 0.71346724, + "learning_rate": 2.2648490816958854e-06, + "loss": 0.73571533, + "num_input_tokens_seen": 84736635, + "step": 3942, + "time_per_iteration": 2.4922237396240234 + }, + { + "auxiliary_loss_clip": 0.01170322, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.05015206, + "balance_loss_mlp": 1.0199362, + "epoch": 0.47411771778993567, + "flos": 24863650836480.0, + "grad_norm": 2.288808527978002, + "language_loss": 0.7309463, + "learning_rate": 2.264076950312861e-06, + "loss": 0.7529462, + "num_input_tokens_seen": 84755445, + "step": 3943, + "time_per_iteration": 2.549288272857666 + }, + { + "auxiliary_loss_clip": 0.01163794, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.05231416, + "balance_loss_mlp": 1.02339709, + "epoch": 0.4742379606805748, + "flos": 22748009725440.0, + "grad_norm": 3.2470638164743226, + "language_loss": 0.82375771, + "learning_rate": 2.2633047788695727e-06, + "loss": 0.84572101, + "num_input_tokens_seen": 84775750, + "step": 3944, + "time_per_iteration": 2.543494939804077 + }, + { + "auxiliary_loss_clip": 0.01157334, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.05249655, + "balance_loss_mlp": 1.02372181, + "epoch": 0.47435820357121383, + "flos": 19681130689920.0, + "grad_norm": 1.7663522546875994, + "language_loss": 0.63862389, + "learning_rate": 2.262532567483159e-06, + "loss": 0.66051251, + "num_input_tokens_seen": 84794310, + "step": 3945, + "time_per_iteration": 2.524092197418213 + }, + { + "auxiliary_loss_clip": 0.01190008, + "auxiliary_loss_mlp": 0.00763706, + "balance_loss_clip": 1.05637956, + "balance_loss_mlp": 1.00118947, + "epoch": 0.47447844646185294, + "flos": 25228718714880.0, + "grad_norm": 2.5677480288358003, + "language_loss": 0.80135918, + "learning_rate": 2.2617603162707635e-06, + "loss": 0.82089627, + "num_input_tokens_seen": 84814720, + "step": 3946, + "time_per_iteration": 2.4764559268951416 + }, + { + "auxiliary_loss_clip": 0.0118545, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.05496192, + "balance_loss_mlp": 1.01900494, + "epoch": 0.47459868935249205, + "flos": 24570619683840.0, + "grad_norm": 1.6103788506427592, + "language_loss": 0.82362378, + "learning_rate": 2.2609880253495363e-06, + "loss": 0.84575081, + "num_input_tokens_seen": 84834355, + "step": 3947, + "time_per_iteration": 2.5416409969329834 + }, + { + "auxiliary_loss_clip": 0.01152281, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.04879367, + "balance_loss_mlp": 1.02509582, + "epoch": 0.4747189322431311, + "flos": 20558500295040.0, + "grad_norm": 2.4372798064348324, + "language_loss": 0.86385518, + "learning_rate": 2.260215694836633e-06, + "loss": 0.88571644, + "num_input_tokens_seen": 84853530, + "step": 3948, + "time_per_iteration": 2.536519765853882 + }, + { + "auxiliary_loss_clip": 0.01131334, + "auxiliary_loss_mlp": 0.00763623, + "balance_loss_clip": 1.04625809, + "balance_loss_mlp": 1.0011543, + "epoch": 0.4748391751337702, + "flos": 25995231970560.0, + "grad_norm": 1.9719580604090101, + "language_loss": 0.64771843, + "learning_rate": 2.2594433248492157e-06, + "loss": 0.66666806, + "num_input_tokens_seen": 84872505, + "step": 3949, + "time_per_iteration": 2.6254489421844482 + }, + { + "auxiliary_loss_clip": 0.01176961, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.05352545, + "balance_loss_mlp": 1.02311873, + "epoch": 0.47495941802440933, + "flos": 22821052032000.0, + "grad_norm": 1.6681336459023572, + "language_loss": 0.80055147, + "learning_rate": 2.2586709155044527e-06, + "loss": 0.82263446, + "num_input_tokens_seen": 84893105, + "step": 3950, + "time_per_iteration": 2.4892799854278564 + }, + { + "auxiliary_loss_clip": 0.01188509, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.05654168, + "balance_loss_mlp": 1.01880157, + "epoch": 0.4750796609150484, + "flos": 27891782075520.0, + "grad_norm": 2.3766313165224084, + "language_loss": 0.76058662, + "learning_rate": 2.2578984669195167e-06, + "loss": 0.78274548, + "num_input_tokens_seen": 84914070, + "step": 3951, + "time_per_iteration": 2.5137388706207275 + }, + { + "auxiliary_loss_clip": 0.01169079, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.04992807, + "balance_loss_mlp": 1.02082944, + "epoch": 0.4751999038056875, + "flos": 35660085471360.0, + "grad_norm": 1.9849046345433643, + "language_loss": 0.67663378, + "learning_rate": 2.2571259792115887e-06, + "loss": 0.69861352, + "num_input_tokens_seen": 84935290, + "step": 3952, + "time_per_iteration": 2.602705955505371 + }, + { + "auxiliary_loss_clip": 0.0116646, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.05176818, + "balance_loss_mlp": 1.02334404, + "epoch": 0.4753201466963266, + "flos": 22090880361600.0, + "grad_norm": 1.7488173641248974, + "language_loss": 0.79129195, + "learning_rate": 2.2563534524978544e-06, + "loss": 0.81326568, + "num_input_tokens_seen": 84952760, + "step": 3953, + "time_per_iteration": 2.5082902908325195 + }, + { + "auxiliary_loss_clip": 0.01138423, + "auxiliary_loss_mlp": 0.01024674, + "balance_loss_clip": 1.05234194, + "balance_loss_mlp": 1.01717615, + "epoch": 0.47544038958696566, + "flos": 30190854965760.0, + "grad_norm": 1.7655945748177642, + "language_loss": 0.70506728, + "learning_rate": 2.2555808868955052e-06, + "loss": 0.72669828, + "num_input_tokens_seen": 84974890, + "step": 3954, + "time_per_iteration": 2.603116750717163 + }, + { + "auxiliary_loss_clip": 0.01130936, + "auxiliary_loss_mlp": 0.01030617, + "balance_loss_clip": 1.05005598, + "balance_loss_mlp": 1.0218854, + "epoch": 0.47556063247760477, + "flos": 23472219738240.0, + "grad_norm": 7.7472256288491375, + "language_loss": 0.73861599, + "learning_rate": 2.254808282521738e-06, + "loss": 0.76023149, + "num_input_tokens_seen": 84993640, + "step": 3955, + "time_per_iteration": 2.6175308227539062 + }, + { + "auxiliary_loss_clip": 0.01146945, + "auxiliary_loss_mlp": 0.00763617, + "balance_loss_clip": 1.05018485, + "balance_loss_mlp": 1.00117278, + "epoch": 0.4756808753682438, + "flos": 25155209531520.0, + "grad_norm": 1.7503208463245088, + "language_loss": 0.81085849, + "learning_rate": 2.2540356394937573e-06, + "loss": 0.8299641, + "num_input_tokens_seen": 85012340, + "step": 3956, + "time_per_iteration": 2.565983295440674 + }, + { + "auxiliary_loss_clip": 0.01148297, + "auxiliary_loss_mlp": 0.01026674, + "balance_loss_clip": 1.04974699, + "balance_loss_mlp": 1.01813829, + "epoch": 0.47580111825888294, + "flos": 15669729573120.0, + "grad_norm": 2.156724803545884, + "language_loss": 0.83695728, + "learning_rate": 2.253262957928772e-06, + "loss": 0.85870701, + "num_input_tokens_seen": 85029225, + "step": 3957, + "time_per_iteration": 2.5100200176239014 + }, + { + "auxiliary_loss_clip": 0.0115101, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.04886603, + "balance_loss_mlp": 1.0184412, + "epoch": 0.47592136114952205, + "flos": 17636556637440.0, + "grad_norm": 1.6423244097393466, + "language_loss": 0.72003567, + "learning_rate": 2.2524902379439976e-06, + "loss": 0.74181682, + "num_input_tokens_seen": 85047895, + "step": 3958, + "time_per_iteration": 2.487999439239502 + }, + { + "auxiliary_loss_clip": 0.01027847, + "auxiliary_loss_mlp": 0.01010023, + "balance_loss_clip": 1.02166319, + "balance_loss_mlp": 1.00846148, + "epoch": 0.4760416040401611, + "flos": 61417159292160.0, + "grad_norm": 0.7555301352078916, + "language_loss": 0.63698328, + "learning_rate": 2.251717479656655e-06, + "loss": 0.65736198, + "num_input_tokens_seen": 85112690, + "step": 3959, + "time_per_iteration": 3.45175838470459 + }, + { + "auxiliary_loss_clip": 0.01187165, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.05484891, + "balance_loss_mlp": 1.02082944, + "epoch": 0.4761618469308002, + "flos": 18405871153920.0, + "grad_norm": 2.4451734265478087, + "language_loss": 0.76149899, + "learning_rate": 2.2509446831839704e-06, + "loss": 0.78366828, + "num_input_tokens_seen": 85132130, + "step": 3960, + "time_per_iteration": 2.6824309825897217 + }, + { + "auxiliary_loss_clip": 0.01158924, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.04977381, + "balance_loss_mlp": 1.0238961, + "epoch": 0.4762820898214393, + "flos": 18040911016320.0, + "grad_norm": 2.332376005385212, + "language_loss": 0.82407176, + "learning_rate": 2.250171848643177e-06, + "loss": 0.8459872, + "num_input_tokens_seen": 85149420, + "step": 3961, + "time_per_iteration": 2.4860422611236572 + }, + { + "auxiliary_loss_clip": 0.01155801, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.05286419, + "balance_loss_mlp": 1.02327764, + "epoch": 0.4764023327120784, + "flos": 19318253541120.0, + "grad_norm": 4.70389401345793, + "language_loss": 0.86024487, + "learning_rate": 2.249398976151513e-06, + "loss": 0.88211191, + "num_input_tokens_seen": 85166970, + "step": 3962, + "time_per_iteration": 2.4850995540618896 + }, + { + "auxiliary_loss_clip": 0.01184772, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.05474329, + "balance_loss_mlp": 1.02222633, + "epoch": 0.4765225756027175, + "flos": 22747255539840.0, + "grad_norm": 2.4564164110509843, + "language_loss": 0.78420484, + "learning_rate": 2.248626065826223e-06, + "loss": 0.80635655, + "num_input_tokens_seen": 85185175, + "step": 3963, + "time_per_iteration": 2.446481227874756 + }, + { + "auxiliary_loss_clip": 0.01082187, + "auxiliary_loss_mlp": 0.01001692, + "balance_loss_clip": 1.01751113, + "balance_loss_mlp": 1.00043488, + "epoch": 0.4766428184933566, + "flos": 65933392106880.0, + "grad_norm": 0.7570812521196986, + "language_loss": 0.62526309, + "learning_rate": 2.2478531177845564e-06, + "loss": 0.64610195, + "num_input_tokens_seen": 85246170, + "step": 3964, + "time_per_iteration": 3.6735103130340576 + }, + { + "auxiliary_loss_clip": 0.01159685, + "auxiliary_loss_mlp": 0.01025032, + "balance_loss_clip": 1.05221295, + "balance_loss_mlp": 1.01740599, + "epoch": 0.47676306138399566, + "flos": 24136495908480.0, + "grad_norm": 1.9014065471747013, + "language_loss": 0.85079837, + "learning_rate": 2.247080132143769e-06, + "loss": 0.8726455, + "num_input_tokens_seen": 85268525, + "step": 3965, + "time_per_iteration": 3.4240031242370605 + }, + { + "auxiliary_loss_clip": 0.0114075, + "auxiliary_loss_mlp": 0.01025683, + "balance_loss_clip": 1.04544032, + "balance_loss_mlp": 1.0169332, + "epoch": 0.47688330427463477, + "flos": 12604322995200.0, + "grad_norm": 2.2154098129040127, + "language_loss": 0.69141799, + "learning_rate": 2.246307109021121e-06, + "loss": 0.71308231, + "num_input_tokens_seen": 85285930, + "step": 3966, + "time_per_iteration": 4.040900468826294 + }, + { + "auxiliary_loss_clip": 0.01153356, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.04819846, + "balance_loss_mlp": 1.02625513, + "epoch": 0.4770035471652739, + "flos": 21390585828480.0, + "grad_norm": 1.6340952306882015, + "language_loss": 0.82467616, + "learning_rate": 2.2455340485338817e-06, + "loss": 0.84655774, + "num_input_tokens_seen": 85303565, + "step": 3967, + "time_per_iteration": 2.510218381881714 + }, + { + "auxiliary_loss_clip": 0.01172583, + "auxiliary_loss_mlp": 0.01025928, + "balance_loss_clip": 1.05231261, + "balance_loss_mlp": 1.0174942, + "epoch": 0.47712379005591293, + "flos": 25156251025920.0, + "grad_norm": 5.648057760057541, + "language_loss": 0.67824304, + "learning_rate": 2.244760950799322e-06, + "loss": 0.7002281, + "num_input_tokens_seen": 85321835, + "step": 3968, + "time_per_iteration": 2.5096890926361084 + }, + { + "auxiliary_loss_clip": 0.01128427, + "auxiliary_loss_mlp": 0.01026267, + "balance_loss_clip": 1.0481993, + "balance_loss_mlp": 1.01817882, + "epoch": 0.47724403294655204, + "flos": 22054323294720.0, + "grad_norm": 5.657364221396888, + "language_loss": 0.72441423, + "learning_rate": 2.2439878159347203e-06, + "loss": 0.74596113, + "num_input_tokens_seen": 85341260, + "step": 3969, + "time_per_iteration": 2.560011863708496 + }, + { + "auxiliary_loss_clip": 0.01081751, + "auxiliary_loss_mlp": 0.01006393, + "balance_loss_clip": 1.0171113, + "balance_loss_mlp": 1.00509405, + "epoch": 0.4773642758371911, + "flos": 70229387658240.0, + "grad_norm": 0.7320177983577608, + "language_loss": 0.55241841, + "learning_rate": 2.2432146440573616e-06, + "loss": 0.57329988, + "num_input_tokens_seen": 85407220, + "step": 3970, + "time_per_iteration": 3.127586603164673 + }, + { + "auxiliary_loss_clip": 0.01156896, + "auxiliary_loss_mlp": 0.01025386, + "balance_loss_clip": 1.05292273, + "balance_loss_mlp": 1.01730394, + "epoch": 0.4774845187278302, + "flos": 23548602009600.0, + "grad_norm": 1.7882889946513547, + "language_loss": 0.66444355, + "learning_rate": 2.242441435284534e-06, + "loss": 0.68626642, + "num_input_tokens_seen": 85426095, + "step": 3971, + "time_per_iteration": 2.5367238521575928 + }, + { + "auxiliary_loss_clip": 0.01176631, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.05670321, + "balance_loss_mlp": 1.02263451, + "epoch": 0.4776047616184693, + "flos": 23075371301760.0, + "grad_norm": 2.73952413880156, + "language_loss": 0.85288602, + "learning_rate": 2.2416681897335337e-06, + "loss": 0.8749727, + "num_input_tokens_seen": 85444245, + "step": 3972, + "time_per_iteration": 2.5440890789031982 + }, + { + "auxiliary_loss_clip": 0.01132366, + "auxiliary_loss_mlp": 0.0103368, + "balance_loss_clip": 1.05335736, + "balance_loss_mlp": 1.02518666, + "epoch": 0.4777250045091084, + "flos": 31898119374720.0, + "grad_norm": 4.469773436149562, + "language_loss": 0.6675511, + "learning_rate": 2.240894907521661e-06, + "loss": 0.68921161, + "num_input_tokens_seen": 85463325, + "step": 3973, + "time_per_iteration": 2.6620609760284424 + }, + { + "auxiliary_loss_clip": 0.01156094, + "auxiliary_loss_mlp": 0.01023899, + "balance_loss_clip": 1.05079663, + "balance_loss_mlp": 1.01592374, + "epoch": 0.4778452473997475, + "flos": 24278163148800.0, + "grad_norm": 1.816020260250139, + "language_loss": 0.63940871, + "learning_rate": 2.240121588766223e-06, + "loss": 0.66120863, + "num_input_tokens_seen": 85483375, + "step": 3974, + "time_per_iteration": 2.5428290367126465 + }, + { + "auxiliary_loss_clip": 0.01151225, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.04980528, + "balance_loss_mlp": 1.02039409, + "epoch": 0.4779654902903866, + "flos": 31575031516800.0, + "grad_norm": 1.9384248281837673, + "language_loss": 0.71075135, + "learning_rate": 2.239348233584531e-06, + "loss": 0.73255002, + "num_input_tokens_seen": 85504230, + "step": 3975, + "time_per_iteration": 2.5895838737487793 + }, + { + "auxiliary_loss_clip": 0.01172367, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.05305934, + "balance_loss_mlp": 1.02379727, + "epoch": 0.47808573318102565, + "flos": 19500428344320.0, + "grad_norm": 1.8383424692299628, + "language_loss": 0.81022573, + "learning_rate": 2.2385748420939013e-06, + "loss": 0.83227372, + "num_input_tokens_seen": 85523425, + "step": 3976, + "time_per_iteration": 2.4615306854248047 + }, + { + "auxiliary_loss_clip": 0.01188132, + "auxiliary_loss_mlp": 0.01028044, + "balance_loss_clip": 1.0600431, + "balance_loss_mlp": 1.01992011, + "epoch": 0.47820597607166476, + "flos": 22601135013120.0, + "grad_norm": 1.6266814385083956, + "language_loss": 0.72175592, + "learning_rate": 2.2378014144116583e-06, + "loss": 0.74391764, + "num_input_tokens_seen": 85542235, + "step": 3977, + "time_per_iteration": 2.479724884033203 + }, + { + "auxiliary_loss_clip": 0.01189988, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.05664182, + "balance_loss_mlp": 1.02446115, + "epoch": 0.4783262189623039, + "flos": 23003011353600.0, + "grad_norm": 1.9831183828831853, + "language_loss": 0.79686451, + "learning_rate": 2.23702795065513e-06, + "loss": 0.81908512, + "num_input_tokens_seen": 85561815, + "step": 3978, + "time_per_iteration": 2.487156867980957 + }, + { + "auxiliary_loss_clip": 0.01073295, + "auxiliary_loss_mlp": 0.01002713, + "balance_loss_clip": 1.01822829, + "balance_loss_mlp": 1.0014137, + "epoch": 0.47844646185294293, + "flos": 49772801226240.0, + "grad_norm": 0.9873804409205034, + "language_loss": 0.67458367, + "learning_rate": 2.2362544509416493e-06, + "loss": 0.69534373, + "num_input_tokens_seen": 85613930, + "step": 3979, + "time_per_iteration": 2.931875705718994 + }, + { + "auxiliary_loss_clip": 0.01149185, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.04938376, + "balance_loss_mlp": 1.02480471, + "epoch": 0.47856670474358204, + "flos": 20229558520320.0, + "grad_norm": 2.4221522443648635, + "language_loss": 0.82556736, + "learning_rate": 2.2354809153885572e-06, + "loss": 0.84739017, + "num_input_tokens_seen": 85631000, + "step": 3980, + "time_per_iteration": 2.4895663261413574 + }, + { + "auxiliary_loss_clip": 0.01170488, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.05155241, + "balance_loss_mlp": 1.02255642, + "epoch": 0.47868694763422115, + "flos": 20990936131200.0, + "grad_norm": 1.924611359749461, + "language_loss": 0.83347321, + "learning_rate": 2.234707344113197e-06, + "loss": 0.85548675, + "num_input_tokens_seen": 85649095, + "step": 3981, + "time_per_iteration": 2.4829602241516113 + }, + { + "auxiliary_loss_clip": 0.01184147, + "auxiliary_loss_mlp": 0.01028038, + "balance_loss_clip": 1.05536425, + "balance_loss_mlp": 1.01990807, + "epoch": 0.4788071905248602, + "flos": 19026551191680.0, + "grad_norm": 1.6136820242751273, + "language_loss": 0.7772404, + "learning_rate": 2.233933737232919e-06, + "loss": 0.79936224, + "num_input_tokens_seen": 85666875, + "step": 3982, + "time_per_iteration": 2.412339210510254 + }, + { + "auxiliary_loss_clip": 0.01121305, + "auxiliary_loss_mlp": 0.00762701, + "balance_loss_clip": 1.04722488, + "balance_loss_mlp": 1.00121808, + "epoch": 0.4789274334154993, + "flos": 23002221254400.0, + "grad_norm": 1.9151088052299572, + "language_loss": 0.78215021, + "learning_rate": 2.2331600948650793e-06, + "loss": 0.80099028, + "num_input_tokens_seen": 85687020, + "step": 3983, + "time_per_iteration": 2.589496374130249 + }, + { + "auxiliary_loss_clip": 0.01131975, + "auxiliary_loss_mlp": 0.00763815, + "balance_loss_clip": 1.05009294, + "balance_loss_mlp": 1.00111842, + "epoch": 0.4790476763061384, + "flos": 23075586783360.0, + "grad_norm": 1.577291920154657, + "language_loss": 0.80205178, + "learning_rate": 2.2323864171270386e-06, + "loss": 0.8210097, + "num_input_tokens_seen": 85708290, + "step": 3984, + "time_per_iteration": 2.611584186553955 + }, + { + "auxiliary_loss_clip": 0.01145885, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.04903316, + "balance_loss_mlp": 1.01936662, + "epoch": 0.4791679191967775, + "flos": 21179288073600.0, + "grad_norm": 1.7571689664644574, + "language_loss": 0.72002494, + "learning_rate": 2.231612704136164e-06, + "loss": 0.74176896, + "num_input_tokens_seen": 85728660, + "step": 3985, + "time_per_iteration": 2.5553925037384033 + }, + { + "auxiliary_loss_clip": 0.0116901, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.05279517, + "balance_loss_mlp": 1.02494597, + "epoch": 0.4792881620874166, + "flos": 22301495758080.0, + "grad_norm": 3.651770987215003, + "language_loss": 0.74834251, + "learning_rate": 2.2308389560098253e-06, + "loss": 0.77036816, + "num_input_tokens_seen": 85745035, + "step": 3986, + "time_per_iteration": 2.4697351455688477 + }, + { + "auxiliary_loss_clip": 0.01146836, + "auxiliary_loss_mlp": 0.01028952, + "balance_loss_clip": 1.05266142, + "balance_loss_mlp": 1.02005935, + "epoch": 0.47940840497805565, + "flos": 17420877423360.0, + "grad_norm": 1.9848512774865092, + "language_loss": 0.77092183, + "learning_rate": 2.2300651728654008e-06, + "loss": 0.79267979, + "num_input_tokens_seen": 85760295, + "step": 3987, + "time_per_iteration": 2.508781909942627 + }, + { + "auxiliary_loss_clip": 0.01065005, + "auxiliary_loss_mlp": 0.00752996, + "balance_loss_clip": 1.01599646, + "balance_loss_mlp": 1.00053298, + "epoch": 0.47952864786869476, + "flos": 65358175708800.0, + "grad_norm": 0.7568150239673535, + "language_loss": 0.60213393, + "learning_rate": 2.229291354820272e-06, + "loss": 0.62031394, + "num_input_tokens_seen": 85821305, + "step": 3988, + "time_per_iteration": 3.0682573318481445 + }, + { + "auxiliary_loss_clip": 0.01169283, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.05158436, + "balance_loss_mlp": 1.02346301, + "epoch": 0.47964889075933387, + "flos": 16799802336000.0, + "grad_norm": 17.006024527022372, + "language_loss": 0.75681174, + "learning_rate": 2.228517501991828e-06, + "loss": 0.77882332, + "num_input_tokens_seen": 85840105, + "step": 3989, + "time_per_iteration": 2.4831626415252686 + }, + { + "auxiliary_loss_clip": 0.01055702, + "auxiliary_loss_mlp": 0.01001371, + "balance_loss_clip": 1.016747, + "balance_loss_mlp": 0.99994034, + "epoch": 0.4797691336499729, + "flos": 70079244808320.0, + "grad_norm": 0.8069814311621979, + "language_loss": 0.61086237, + "learning_rate": 2.22774361449746e-06, + "loss": 0.63143301, + "num_input_tokens_seen": 85896585, + "step": 3990, + "time_per_iteration": 3.8004941940307617 + }, + { + "auxiliary_loss_clip": 0.01116412, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.0496527, + "balance_loss_mlp": 1.02058244, + "epoch": 0.47988937654061203, + "flos": 18953329317120.0, + "grad_norm": 3.8989387559137065, + "language_loss": 0.708018, + "learning_rate": 2.2269696924545668e-06, + "loss": 0.72947162, + "num_input_tokens_seen": 85914415, + "step": 3991, + "time_per_iteration": 2.587409496307373 + }, + { + "auxiliary_loss_clip": 0.01147547, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.05498672, + "balance_loss_mlp": 1.02185535, + "epoch": 0.48000961943125114, + "flos": 14461981649280.0, + "grad_norm": 2.478936853786927, + "language_loss": 0.7793448, + "learning_rate": 2.2261957359805523e-06, + "loss": 0.80111963, + "num_input_tokens_seen": 85931650, + "step": 3992, + "time_per_iteration": 4.908561706542969 + }, + { + "auxiliary_loss_clip": 0.01187746, + "auxiliary_loss_mlp": 0.0102479, + "balance_loss_clip": 1.05610037, + "balance_loss_mlp": 1.01653457, + "epoch": 0.4801298623218902, + "flos": 27051149105280.0, + "grad_norm": 3.2815407262587244, + "language_loss": 0.73958009, + "learning_rate": 2.225421745192823e-06, + "loss": 0.76170552, + "num_input_tokens_seen": 85951805, + "step": 3993, + "time_per_iteration": 2.478771209716797 + }, + { + "auxiliary_loss_clip": 0.01170467, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.05434072, + "balance_loss_mlp": 1.02279937, + "epoch": 0.4802501052125293, + "flos": 26355236031360.0, + "grad_norm": 2.0490086116355655, + "language_loss": 0.78207862, + "learning_rate": 2.2246477202087955e-06, + "loss": 0.80409557, + "num_input_tokens_seen": 85972485, + "step": 3994, + "time_per_iteration": 2.513536214828491 + }, + { + "auxiliary_loss_clip": 0.01160523, + "auxiliary_loss_mlp": 0.01029811, + "balance_loss_clip": 1.05291867, + "balance_loss_mlp": 1.0223366, + "epoch": 0.4803703481031684, + "flos": 20993916960000.0, + "grad_norm": 1.6905923988438096, + "language_loss": 0.82985258, + "learning_rate": 2.223873661145887e-06, + "loss": 0.85175598, + "num_input_tokens_seen": 85992540, + "step": 3995, + "time_per_iteration": 2.516037940979004 + }, + { + "auxiliary_loss_clip": 0.01156677, + "auxiliary_loss_mlp": 0.00762833, + "balance_loss_clip": 1.05618787, + "balance_loss_mlp": 1.00109053, + "epoch": 0.4804905909938075, + "flos": 20703722981760.0, + "grad_norm": 1.5783941425529924, + "language_loss": 0.71068484, + "learning_rate": 2.2230995681215226e-06, + "loss": 0.72987998, + "num_input_tokens_seen": 86012065, + "step": 3996, + "time_per_iteration": 2.5360305309295654 + }, + { + "auxiliary_loss_clip": 0.01140375, + "auxiliary_loss_mlp": 0.01024934, + "balance_loss_clip": 1.05123198, + "balance_loss_mlp": 1.0170784, + "epoch": 0.4806108338844466, + "flos": 16654831044480.0, + "grad_norm": 1.9907120174429613, + "language_loss": 0.78113991, + "learning_rate": 2.2223254412531305e-06, + "loss": 0.80279303, + "num_input_tokens_seen": 86029435, + "step": 3997, + "time_per_iteration": 2.517875909805298 + }, + { + "auxiliary_loss_clip": 0.01143783, + "auxiliary_loss_mlp": 0.010228, + "balance_loss_clip": 1.04783177, + "balance_loss_mlp": 1.01500392, + "epoch": 0.4807310767750857, + "flos": 20011329440640.0, + "grad_norm": 1.9356769415675852, + "language_loss": 0.82213223, + "learning_rate": 2.221551280658146e-06, + "loss": 0.84379804, + "num_input_tokens_seen": 86048495, + "step": 3998, + "time_per_iteration": 2.49727201461792 + }, + { + "auxiliary_loss_clip": 0.01125852, + "auxiliary_loss_mlp": 0.01025989, + "balance_loss_clip": 1.04939485, + "balance_loss_mlp": 1.0180316, + "epoch": 0.48085131966572475, + "flos": 23185257984000.0, + "grad_norm": 1.5964496052482238, + "language_loss": 0.74135441, + "learning_rate": 2.2207770864540085e-06, + "loss": 0.76287282, + "num_input_tokens_seen": 86067470, + "step": 3999, + "time_per_iteration": 2.592019557952881 + }, + { + "auxiliary_loss_clip": 0.01150168, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.05169392, + "balance_loss_mlp": 1.01883805, + "epoch": 0.48097156255636386, + "flos": 20558643949440.0, + "grad_norm": 2.8586037176817705, + "language_loss": 0.72546291, + "learning_rate": 2.220002858758162e-06, + "loss": 0.74723595, + "num_input_tokens_seen": 86085460, + "step": 4000, + "time_per_iteration": 2.4975180625915527 + }, + { + "auxiliary_loss_clip": 0.0107421, + "auxiliary_loss_mlp": 0.01002454, + "balance_loss_clip": 1.01924598, + "balance_loss_mlp": 1.00113034, + "epoch": 0.481091805447003, + "flos": 70511608817280.0, + "grad_norm": 0.8819114821958366, + "language_loss": 0.60865355, + "learning_rate": 2.2192285976880573e-06, + "loss": 0.62942016, + "num_input_tokens_seen": 86149715, + "step": 4001, + "time_per_iteration": 3.0493931770324707 + }, + { + "auxiliary_loss_clip": 0.01146799, + "auxiliary_loss_mlp": 0.00762197, + "balance_loss_clip": 1.05128455, + "balance_loss_mlp": 1.00117433, + "epoch": 0.48121204833764203, + "flos": 36428214839040.0, + "grad_norm": 1.7887200453390508, + "language_loss": 0.81031519, + "learning_rate": 2.2184543033611485e-06, + "loss": 0.82940519, + "num_input_tokens_seen": 86170795, + "step": 4002, + "time_per_iteration": 2.6921818256378174 + }, + { + "auxiliary_loss_clip": 0.01174868, + "auxiliary_loss_mlp": 0.01027104, + "balance_loss_clip": 1.05453551, + "balance_loss_mlp": 1.0193975, + "epoch": 0.48133229122828114, + "flos": 27490264871040.0, + "grad_norm": 3.9759282643073375, + "language_loss": 0.81713855, + "learning_rate": 2.2176799758948957e-06, + "loss": 0.83915824, + "num_input_tokens_seen": 86190955, + "step": 4003, + "time_per_iteration": 2.541100263595581 + }, + { + "auxiliary_loss_clip": 0.01151689, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.05183756, + "balance_loss_mlp": 1.02300549, + "epoch": 0.4814525341189202, + "flos": 43072802179200.0, + "grad_norm": 2.0634746985131858, + "language_loss": 0.72952735, + "learning_rate": 2.2169056154067635e-06, + "loss": 0.75135338, + "num_input_tokens_seen": 86214875, + "step": 4004, + "time_per_iteration": 2.7171976566314697 + }, + { + "auxiliary_loss_clip": 0.01174926, + "auxiliary_loss_mlp": 0.00762676, + "balance_loss_clip": 1.05681086, + "balance_loss_mlp": 1.00095391, + "epoch": 0.4815727770095593, + "flos": 24236901400320.0, + "grad_norm": 1.7598162860504147, + "language_loss": 0.82296354, + "learning_rate": 2.216131222014222e-06, + "loss": 0.84233958, + "num_input_tokens_seen": 86232950, + "step": 4005, + "time_per_iteration": 2.512775421142578 + }, + { + "auxiliary_loss_clip": 0.01137149, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.0501771, + "balance_loss_mlp": 1.02020693, + "epoch": 0.4816930199001984, + "flos": 18113630100480.0, + "grad_norm": 1.9793803576504083, + "language_loss": 0.80120224, + "learning_rate": 2.2153567958347455e-06, + "loss": 0.82286096, + "num_input_tokens_seen": 86249160, + "step": 4006, + "time_per_iteration": 2.533860445022583 + }, + { + "auxiliary_loss_clip": 0.01156189, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.05331612, + "balance_loss_mlp": 1.01902771, + "epoch": 0.48181326279083747, + "flos": 17274720983040.0, + "grad_norm": 5.401977962109008, + "language_loss": 0.79660422, + "learning_rate": 2.214582336985815e-06, + "loss": 0.81844032, + "num_input_tokens_seen": 86267060, + "step": 4007, + "time_per_iteration": 2.4850873947143555 + }, + { + "auxiliary_loss_clip": 0.01147676, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.04956138, + "balance_loss_mlp": 1.01813078, + "epoch": 0.4819335056814766, + "flos": 14903252231040.0, + "grad_norm": 2.5318158315475263, + "language_loss": 0.66174316, + "learning_rate": 2.2138078455849142e-06, + "loss": 0.68349302, + "num_input_tokens_seen": 86285055, + "step": 4008, + "time_per_iteration": 2.5234034061431885 + }, + { + "auxiliary_loss_clip": 0.01176442, + "auxiliary_loss_mlp": 0.01028918, + "balance_loss_clip": 1.05433989, + "balance_loss_mlp": 1.02104414, + "epoch": 0.4820537485721157, + "flos": 19244888012160.0, + "grad_norm": 1.9798239965103617, + "language_loss": 0.78634942, + "learning_rate": 2.2130333217495334e-06, + "loss": 0.80840302, + "num_input_tokens_seen": 86304225, + "step": 4009, + "time_per_iteration": 2.476163148880005 + }, + { + "auxiliary_loss_clip": 0.01151548, + "auxiliary_loss_mlp": 0.01026114, + "balance_loss_clip": 1.05003428, + "balance_loss_mlp": 1.01801968, + "epoch": 0.48217399146275475, + "flos": 16033791870720.0, + "grad_norm": 4.379671376727975, + "language_loss": 0.68024701, + "learning_rate": 2.2122587655971665e-06, + "loss": 0.70202363, + "num_input_tokens_seen": 86319170, + "step": 4010, + "time_per_iteration": 2.4738988876342773 + }, + { + "auxiliary_loss_clip": 0.01155606, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.05047274, + "balance_loss_mlp": 1.02192235, + "epoch": 0.48229423435339386, + "flos": 24134197438080.0, + "grad_norm": 1.7919899300001405, + "language_loss": 0.63651812, + "learning_rate": 2.211484177245314e-06, + "loss": 0.65837121, + "num_input_tokens_seen": 86338760, + "step": 4011, + "time_per_iteration": 2.5339369773864746 + }, + { + "auxiliary_loss_clip": 0.01186819, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.05515957, + "balance_loss_mlp": 1.02530003, + "epoch": 0.48241447724403297, + "flos": 23805435231360.0, + "grad_norm": 2.4899155773938473, + "language_loss": 0.72346967, + "learning_rate": 2.21070955681148e-06, + "loss": 0.74567413, + "num_input_tokens_seen": 86357865, + "step": 4012, + "time_per_iteration": 2.516618013381958 + }, + { + "auxiliary_loss_clip": 0.01132249, + "auxiliary_loss_mlp": 0.01025576, + "balance_loss_clip": 1.04841626, + "balance_loss_mlp": 1.01770771, + "epoch": 0.482534720134672, + "flos": 23110312256640.0, + "grad_norm": 1.6935244638655456, + "language_loss": 0.77961624, + "learning_rate": 2.209934904413174e-06, + "loss": 0.80119443, + "num_input_tokens_seen": 86379470, + "step": 4013, + "time_per_iteration": 2.5706331729888916 + }, + { + "auxiliary_loss_clip": 0.01107258, + "auxiliary_loss_mlp": 0.01027372, + "balance_loss_clip": 1.03835642, + "balance_loss_mlp": 1.01883101, + "epoch": 0.48265496302531113, + "flos": 20923819568640.0, + "grad_norm": 1.9258740418687212, + "language_loss": 0.71542048, + "learning_rate": 2.2091602201679095e-06, + "loss": 0.73676682, + "num_input_tokens_seen": 86399080, + "step": 4014, + "time_per_iteration": 2.642437696456909 + }, + { + "auxiliary_loss_clip": 0.01145141, + "auxiliary_loss_mlp": 0.01024246, + "balance_loss_clip": 1.05029583, + "balance_loss_mlp": 1.01609826, + "epoch": 0.48277520591595025, + "flos": 15231152511360.0, + "grad_norm": 2.142852785227502, + "language_loss": 0.83034098, + "learning_rate": 2.208385504193206e-06, + "loss": 0.85203481, + "num_input_tokens_seen": 86416580, + "step": 4015, + "time_per_iteration": 2.555332899093628 + }, + { + "auxiliary_loss_clip": 0.01184154, + "auxiliary_loss_mlp": 0.01020898, + "balance_loss_clip": 1.05370808, + "balance_loss_mlp": 1.01317978, + "epoch": 0.4828954488065893, + "flos": 17858664385920.0, + "grad_norm": 2.203787175886485, + "language_loss": 0.81058502, + "learning_rate": 2.2076107566065873e-06, + "loss": 0.83263558, + "num_input_tokens_seen": 86434365, + "step": 4016, + "time_per_iteration": 2.433837652206421 + }, + { + "auxiliary_loss_clip": 0.01175674, + "auxiliary_loss_mlp": 0.01031604, + "balance_loss_clip": 1.05489159, + "balance_loss_mlp": 1.02428424, + "epoch": 0.4830156916972284, + "flos": 32087405070720.0, + "grad_norm": 2.313430111011053, + "language_loss": 0.75473124, + "learning_rate": 2.2068359775255816e-06, + "loss": 0.77680403, + "num_input_tokens_seen": 86452675, + "step": 4017, + "time_per_iteration": 3.2859675884246826 + }, + { + "auxiliary_loss_clip": 0.01122719, + "auxiliary_loss_mlp": 0.01027031, + "balance_loss_clip": 1.04659295, + "balance_loss_mlp": 1.01891327, + "epoch": 0.48313593458786747, + "flos": 21871717528320.0, + "grad_norm": 3.0141107346096803, + "language_loss": 0.78491157, + "learning_rate": 2.206061167067723e-06, + "loss": 0.806409, + "num_input_tokens_seen": 86470785, + "step": 4018, + "time_per_iteration": 3.3674874305725098 + }, + { + "auxiliary_loss_clip": 0.01138594, + "auxiliary_loss_mlp": 0.01025579, + "balance_loss_clip": 1.04550838, + "balance_loss_mlp": 1.01650178, + "epoch": 0.4832561774785066, + "flos": 22601206840320.0, + "grad_norm": 2.651803338436259, + "language_loss": 0.79337287, + "learning_rate": 2.205286325350549e-06, + "loss": 0.81501466, + "num_input_tokens_seen": 86489850, + "step": 4019, + "time_per_iteration": 4.079021692276001 + }, + { + "auxiliary_loss_clip": 0.01125898, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.04750919, + "balance_loss_mlp": 1.02106678, + "epoch": 0.4833764203691457, + "flos": 13437342282240.0, + "grad_norm": 2.5098435429907733, + "language_loss": 0.72583973, + "learning_rate": 2.204511452491603e-06, + "loss": 0.74738455, + "num_input_tokens_seen": 86506475, + "step": 4020, + "time_per_iteration": 2.5845439434051514 + }, + { + "auxiliary_loss_clip": 0.01183429, + "auxiliary_loss_mlp": 0.0102831, + "balance_loss_clip": 1.05658627, + "balance_loss_mlp": 1.02049553, + "epoch": 0.48349666325978474, + "flos": 44128036955520.0, + "grad_norm": 1.9080535333604667, + "language_loss": 0.74868834, + "learning_rate": 2.2037365486084316e-06, + "loss": 0.77080566, + "num_input_tokens_seen": 86529715, + "step": 4021, + "time_per_iteration": 2.6882479190826416 + }, + { + "auxiliary_loss_clip": 0.01150009, + "auxiliary_loss_mlp": 0.01027513, + "balance_loss_clip": 1.04768014, + "balance_loss_mlp": 1.01951444, + "epoch": 0.48361690615042385, + "flos": 26028377245440.0, + "grad_norm": 1.9764802195079818, + "language_loss": 0.77711612, + "learning_rate": 2.2029616138185886e-06, + "loss": 0.79889131, + "num_input_tokens_seen": 86548715, + "step": 4022, + "time_per_iteration": 2.5874714851379395 + }, + { + "auxiliary_loss_clip": 0.01142156, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.0542779, + "balance_loss_mlp": 1.01812339, + "epoch": 0.48373714904106296, + "flos": 22273306560000.0, + "grad_norm": 1.626339909830215, + "language_loss": 0.82728058, + "learning_rate": 2.202186648239629e-06, + "loss": 0.84896171, + "num_input_tokens_seen": 86568650, + "step": 4023, + "time_per_iteration": 2.5781099796295166 + }, + { + "auxiliary_loss_clip": 0.01169081, + "auxiliary_loss_mlp": 0.01026173, + "balance_loss_clip": 1.05400395, + "balance_loss_mlp": 1.01848388, + "epoch": 0.483857391931702, + "flos": 28292293699200.0, + "grad_norm": 1.7218689869071404, + "language_loss": 0.71411288, + "learning_rate": 2.201411651989117e-06, + "loss": 0.73606539, + "num_input_tokens_seen": 86590630, + "step": 4024, + "time_per_iteration": 2.5486228466033936 + }, + { + "auxiliary_loss_clip": 0.01157062, + "auxiliary_loss_mlp": 0.00762175, + "balance_loss_clip": 1.05384421, + "balance_loss_mlp": 1.00089192, + "epoch": 0.48397763482234113, + "flos": 27418048577280.0, + "grad_norm": 2.002692343511475, + "language_loss": 0.78292942, + "learning_rate": 2.2006366251846167e-06, + "loss": 0.80212182, + "num_input_tokens_seen": 86611270, + "step": 4025, + "time_per_iteration": 2.596574544906616 + }, + { + "auxiliary_loss_clip": 0.01155085, + "auxiliary_loss_mlp": 0.01024447, + "balance_loss_clip": 1.05324304, + "balance_loss_mlp": 1.01710367, + "epoch": 0.48409787771298024, + "flos": 16797252470400.0, + "grad_norm": 2.4102459225170163, + "language_loss": 0.75703537, + "learning_rate": 2.1998615679436997e-06, + "loss": 0.77883065, + "num_input_tokens_seen": 86628810, + "step": 4026, + "time_per_iteration": 2.493163824081421 + }, + { + "auxiliary_loss_clip": 0.01164, + "auxiliary_loss_mlp": 0.01026931, + "balance_loss_clip": 1.05276132, + "balance_loss_mlp": 1.01828253, + "epoch": 0.4842181206036193, + "flos": 25083496028160.0, + "grad_norm": 2.7013640149759146, + "language_loss": 0.77127302, + "learning_rate": 2.199086480383942e-06, + "loss": 0.79318237, + "num_input_tokens_seen": 86648185, + "step": 4027, + "time_per_iteration": 2.5489606857299805 + }, + { + "auxiliary_loss_clip": 0.01168763, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.05427492, + "balance_loss_mlp": 1.02342629, + "epoch": 0.4843383634942584, + "flos": 30372311496960.0, + "grad_norm": 2.686318997904557, + "language_loss": 0.67442507, + "learning_rate": 2.1983113626229234e-06, + "loss": 0.69643259, + "num_input_tokens_seen": 86667435, + "step": 4028, + "time_per_iteration": 2.5680975914001465 + }, + { + "auxiliary_loss_clip": 0.01137635, + "auxiliary_loss_mlp": 0.0076257, + "balance_loss_clip": 1.04935205, + "balance_loss_mlp": 1.00111651, + "epoch": 0.4844586063848975, + "flos": 20413564917120.0, + "grad_norm": 1.6889330055967036, + "language_loss": 0.78209502, + "learning_rate": 2.1975362147782293e-06, + "loss": 0.80109704, + "num_input_tokens_seen": 86686630, + "step": 4029, + "time_per_iteration": 2.580242156982422 + }, + { + "auxiliary_loss_clip": 0.01070338, + "auxiliary_loss_mlp": 0.01008457, + "balance_loss_clip": 1.03157806, + "balance_loss_mlp": 1.00719345, + "epoch": 0.48457884927553657, + "flos": 70303722854400.0, + "grad_norm": 0.6957394107203132, + "language_loss": 0.54120624, + "learning_rate": 2.196761036967448e-06, + "loss": 0.56199419, + "num_input_tokens_seen": 86754595, + "step": 4030, + "time_per_iteration": 3.215390682220459 + }, + { + "auxiliary_loss_clip": 0.01165632, + "auxiliary_loss_mlp": 0.01022614, + "balance_loss_clip": 1.05175042, + "balance_loss_mlp": 1.01521063, + "epoch": 0.4846990921661757, + "flos": 19934516206080.0, + "grad_norm": 1.6819242033218664, + "language_loss": 0.77181792, + "learning_rate": 2.1959858293081743e-06, + "loss": 0.79370034, + "num_input_tokens_seen": 86773730, + "step": 4031, + "time_per_iteration": 2.5058960914611816 + }, + { + "auxiliary_loss_clip": 0.01138161, + "auxiliary_loss_mlp": 0.01027042, + "balance_loss_clip": 1.04999888, + "balance_loss_mlp": 1.01898384, + "epoch": 0.4848193350568148, + "flos": 23075945919360.0, + "grad_norm": 1.6259174615699303, + "language_loss": 0.75786704, + "learning_rate": 2.1952105919180056e-06, + "loss": 0.77951908, + "num_input_tokens_seen": 86792985, + "step": 4032, + "time_per_iteration": 2.595301628112793 + }, + { + "auxiliary_loss_clip": 0.01154975, + "auxiliary_loss_mlp": 0.01022151, + "balance_loss_clip": 1.05227113, + "balance_loss_mlp": 1.01405632, + "epoch": 0.48493957794745385, + "flos": 22455481363200.0, + "grad_norm": 2.3746357938557856, + "language_loss": 0.67815781, + "learning_rate": 2.1944353249145456e-06, + "loss": 0.69992906, + "num_input_tokens_seen": 86812095, + "step": 4033, + "time_per_iteration": 2.530862808227539 + }, + { + "auxiliary_loss_clip": 0.01183537, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.05502653, + "balance_loss_mlp": 1.01910686, + "epoch": 0.48505982083809296, + "flos": 25046112948480.0, + "grad_norm": 1.9792512975020602, + "language_loss": 0.74496138, + "learning_rate": 2.193660028415401e-06, + "loss": 0.76706302, + "num_input_tokens_seen": 86832875, + "step": 4034, + "time_per_iteration": 2.4865918159484863 + }, + { + "auxiliary_loss_clip": 0.01147591, + "auxiliary_loss_mlp": 0.01023673, + "balance_loss_clip": 1.04971933, + "balance_loss_mlp": 1.01570392, + "epoch": 0.485180063728732, + "flos": 26761386090240.0, + "grad_norm": 1.7879142253852731, + "language_loss": 0.82078892, + "learning_rate": 2.1928847025381852e-06, + "loss": 0.84250158, + "num_input_tokens_seen": 86853480, + "step": 4035, + "time_per_iteration": 2.5664150714874268 + }, + { + "auxiliary_loss_clip": 0.01168115, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.04974961, + "balance_loss_mlp": 1.01950729, + "epoch": 0.4853003066193711, + "flos": 24059143969920.0, + "grad_norm": 1.7154241524989966, + "language_loss": 0.8394624, + "learning_rate": 2.192109347400512e-06, + "loss": 0.86142623, + "num_input_tokens_seen": 86873695, + "step": 4036, + "time_per_iteration": 2.4981584548950195 + }, + { + "auxiliary_loss_clip": 0.01157388, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.04983604, + "balance_loss_mlp": 1.02015018, + "epoch": 0.48542054951001024, + "flos": 23076376882560.0, + "grad_norm": 1.7659796208566851, + "language_loss": 0.78810352, + "learning_rate": 2.191333963120004e-06, + "loss": 0.80996561, + "num_input_tokens_seen": 86892675, + "step": 4037, + "time_per_iteration": 2.5070948600769043 + }, + { + "auxiliary_loss_clip": 0.01158189, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.05337238, + "balance_loss_mlp": 1.02176642, + "epoch": 0.4855407924006493, + "flos": 25664889565440.0, + "grad_norm": 2.1360672682955624, + "language_loss": 0.7020123, + "learning_rate": 2.190558549814286e-06, + "loss": 0.72389567, + "num_input_tokens_seen": 86912835, + "step": 4038, + "time_per_iteration": 2.5441019535064697 + }, + { + "auxiliary_loss_clip": 0.01153214, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.04937339, + "balance_loss_mlp": 1.01899099, + "epoch": 0.4856610352912884, + "flos": 23987933256960.0, + "grad_norm": 1.795100453906707, + "language_loss": 0.7962271, + "learning_rate": 2.1897831076009872e-06, + "loss": 0.81802762, + "num_input_tokens_seen": 86932475, + "step": 4039, + "time_per_iteration": 2.5169501304626465 + }, + { + "auxiliary_loss_clip": 0.01170696, + "auxiliary_loss_mlp": 0.01024584, + "balance_loss_clip": 1.05275786, + "balance_loss_mlp": 1.01669264, + "epoch": 0.4857812781819275, + "flos": 24096814358400.0, + "grad_norm": 1.6131764928082544, + "language_loss": 0.79795551, + "learning_rate": 2.1890076365977426e-06, + "loss": 0.81990832, + "num_input_tokens_seen": 86952300, + "step": 4040, + "time_per_iteration": 2.506757974624634 + }, + { + "auxiliary_loss_clip": 0.01063363, + "auxiliary_loss_mlp": 0.01003423, + "balance_loss_clip": 1.02325833, + "balance_loss_mlp": 1.00210536, + "epoch": 0.48590152107256657, + "flos": 56266635185280.0, + "grad_norm": 1.0626654782870908, + "language_loss": 0.52870578, + "learning_rate": 2.188232136922189e-06, + "loss": 0.54937363, + "num_input_tokens_seen": 87010420, + "step": 4041, + "time_per_iteration": 3.0100150108337402 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.04716039, + "balance_loss_mlp": 1.0217452, + "epoch": 0.4860217639632057, + "flos": 20046988667520.0, + "grad_norm": 3.3479053190017054, + "language_loss": 0.75610352, + "learning_rate": 2.187456608691971e-06, + "loss": 0.77749699, + "num_input_tokens_seen": 87029295, + "step": 4042, + "time_per_iteration": 2.6242153644561768 + }, + { + "auxiliary_loss_clip": 0.01147203, + "auxiliary_loss_mlp": 0.01031346, + "balance_loss_clip": 1.05459642, + "balance_loss_mlp": 1.02299583, + "epoch": 0.4861420068538448, + "flos": 17822143232640.0, + "grad_norm": 1.9678526936507985, + "language_loss": 0.87404537, + "learning_rate": 2.1866810520247334e-06, + "loss": 0.89583087, + "num_input_tokens_seen": 87048165, + "step": 4043, + "time_per_iteration": 3.253032684326172 + }, + { + "auxiliary_loss_clip": 0.01173665, + "auxiliary_loss_mlp": 0.01023846, + "balance_loss_clip": 1.05097151, + "balance_loss_mlp": 1.01519144, + "epoch": 0.48626224974448384, + "flos": 26250125857920.0, + "grad_norm": 2.161399539930865, + "language_loss": 0.64658785, + "learning_rate": 2.185905467038129e-06, + "loss": 0.66856289, + "num_input_tokens_seen": 87067070, + "step": 4044, + "time_per_iteration": 2.5389628410339355 + }, + { + "auxiliary_loss_clip": 0.01184115, + "auxiliary_loss_mlp": 0.01026232, + "balance_loss_clip": 1.05666661, + "balance_loss_mlp": 1.01825058, + "epoch": 0.48638249263512295, + "flos": 22054502862720.0, + "grad_norm": 1.838073195984943, + "language_loss": 0.77488112, + "learning_rate": 2.1851298538498127e-06, + "loss": 0.79698461, + "num_input_tokens_seen": 87086785, + "step": 4045, + "time_per_iteration": 4.075937509536743 + }, + { + "auxiliary_loss_clip": 0.0117711, + "auxiliary_loss_mlp": 0.00763124, + "balance_loss_clip": 1.05554986, + "balance_loss_mlp": 1.00109768, + "epoch": 0.48650273552576206, + "flos": 25119945354240.0, + "grad_norm": 2.0362717953759373, + "language_loss": 0.79861271, + "learning_rate": 2.184354212577446e-06, + "loss": 0.8180151, + "num_input_tokens_seen": 87107090, + "step": 4046, + "time_per_iteration": 3.2447309494018555 + }, + { + "auxiliary_loss_clip": 0.01185118, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.05336285, + "balance_loss_mlp": 1.01790452, + "epoch": 0.4866229784164011, + "flos": 17456931699840.0, + "grad_norm": 2.893323654995752, + "language_loss": 0.6234988, + "learning_rate": 2.1835785433386907e-06, + "loss": 0.645612, + "num_input_tokens_seen": 87125905, + "step": 4047, + "time_per_iteration": 2.4282448291778564 + }, + { + "auxiliary_loss_clip": 0.01130349, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.04857433, + "balance_loss_mlp": 1.02099669, + "epoch": 0.48674322130704023, + "flos": 23331127115520.0, + "grad_norm": 1.800885139545837, + "language_loss": 0.65137851, + "learning_rate": 2.182802846251216e-06, + "loss": 0.67297518, + "num_input_tokens_seen": 87146175, + "step": 4048, + "time_per_iteration": 2.556091785430908 + }, + { + "auxiliary_loss_clip": 0.01146424, + "auxiliary_loss_mlp": 0.01025005, + "balance_loss_clip": 1.04855478, + "balance_loss_mlp": 1.01725078, + "epoch": 0.4868634641976793, + "flos": 28804344030720.0, + "grad_norm": 2.4319155358663975, + "language_loss": 0.7281692, + "learning_rate": 2.182027121432696e-06, + "loss": 0.74988347, + "num_input_tokens_seen": 87166800, + "step": 4049, + "time_per_iteration": 2.615419626235962 + }, + { + "auxiliary_loss_clip": 0.01187364, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.05490804, + "balance_loss_mlp": 1.02221441, + "epoch": 0.4869837070883184, + "flos": 19025976574080.0, + "grad_norm": 1.863719021683457, + "language_loss": 0.82362258, + "learning_rate": 2.1812513690008054e-06, + "loss": 0.84580934, + "num_input_tokens_seen": 87185920, + "step": 4050, + "time_per_iteration": 2.435861587524414 + }, + { + "auxiliary_loss_clip": 0.01176121, + "auxiliary_loss_mlp": 0.01029212, + "balance_loss_clip": 1.05394471, + "balance_loss_mlp": 1.02062905, + "epoch": 0.4871039499789575, + "flos": 15121409483520.0, + "grad_norm": 2.211116978822037, + "language_loss": 0.8002212, + "learning_rate": 2.180475589073227e-06, + "loss": 0.82227457, + "num_input_tokens_seen": 87203620, + "step": 4051, + "time_per_iteration": 2.4645848274230957 + }, + { + "auxiliary_loss_clip": 0.01159364, + "auxiliary_loss_mlp": 0.01023246, + "balance_loss_clip": 1.05000794, + "balance_loss_mlp": 1.01526475, + "epoch": 0.48722419286959656, + "flos": 26174066808960.0, + "grad_norm": 1.943776790221255, + "language_loss": 0.73121893, + "learning_rate": 2.1796997817676456e-06, + "loss": 0.75304496, + "num_input_tokens_seen": 87224630, + "step": 4052, + "time_per_iteration": 2.5141446590423584 + }, + { + "auxiliary_loss_clip": 0.01172652, + "auxiliary_loss_mlp": 0.0076204, + "balance_loss_clip": 1.0542078, + "balance_loss_mlp": 1.00107741, + "epoch": 0.4873444357602357, + "flos": 24026142349440.0, + "grad_norm": 1.7273365097378213, + "language_loss": 0.67320806, + "learning_rate": 2.1789239472017494e-06, + "loss": 0.69255501, + "num_input_tokens_seen": 87246280, + "step": 4053, + "time_per_iteration": 2.5179049968719482 + }, + { + "auxiliary_loss_clip": 0.01139933, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.0486474, + "balance_loss_mlp": 1.02172625, + "epoch": 0.4874646786508748, + "flos": 22820441500800.0, + "grad_norm": 2.0695615508792504, + "language_loss": 0.73151851, + "learning_rate": 2.1781480854932326e-06, + "loss": 0.75321651, + "num_input_tokens_seen": 87266045, + "step": 4054, + "time_per_iteration": 2.5500125885009766 + }, + { + "auxiliary_loss_clip": 0.01124802, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.04857254, + "balance_loss_mlp": 1.02377999, + "epoch": 0.48758492154151384, + "flos": 21287594557440.0, + "grad_norm": 1.9496542211590113, + "language_loss": 0.79511797, + "learning_rate": 2.1773721967597933e-06, + "loss": 0.81668162, + "num_input_tokens_seen": 87284495, + "step": 4055, + "time_per_iteration": 2.5731732845306396 + }, + { + "auxiliary_loss_clip": 0.01052008, + "auxiliary_loss_mlp": 0.01001589, + "balance_loss_clip": 1.01726389, + "balance_loss_mlp": 1.00019443, + "epoch": 0.48770516443215295, + "flos": 62244109180800.0, + "grad_norm": 0.8404567078579999, + "language_loss": 0.57331324, + "learning_rate": 2.1765962811191322e-06, + "loss": 0.5938493, + "num_input_tokens_seen": 87338960, + "step": 4056, + "time_per_iteration": 3.011725902557373 + }, + { + "auxiliary_loss_clip": 0.01034549, + "auxiliary_loss_mlp": 0.01006704, + "balance_loss_clip": 1.02072251, + "balance_loss_mlp": 1.00538111, + "epoch": 0.48782540732279206, + "flos": 66133451882880.0, + "grad_norm": 0.8301350284283346, + "language_loss": 0.6202718, + "learning_rate": 2.1758203386889566e-06, + "loss": 0.64068425, + "num_input_tokens_seen": 87401730, + "step": 4057, + "time_per_iteration": 3.167227029800415 + }, + { + "auxiliary_loss_clip": 0.01142949, + "auxiliary_loss_mlp": 0.00763114, + "balance_loss_clip": 1.0503149, + "balance_loss_mlp": 1.00113773, + "epoch": 0.4879456502134311, + "flos": 14607922608000.0, + "grad_norm": 1.9334982588732799, + "language_loss": 0.84443039, + "learning_rate": 2.1750443695869746e-06, + "loss": 0.86349106, + "num_input_tokens_seen": 87417300, + "step": 4058, + "time_per_iteration": 2.5169620513916016 + }, + { + "auxiliary_loss_clip": 0.01171266, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.05217028, + "balance_loss_mlp": 1.02219093, + "epoch": 0.4880658931040702, + "flos": 19500464257920.0, + "grad_norm": 1.8165092482810168, + "language_loss": 0.85813296, + "learning_rate": 2.174268373930901e-06, + "loss": 0.88014686, + "num_input_tokens_seen": 87434815, + "step": 4059, + "time_per_iteration": 2.4905292987823486 + }, + { + "auxiliary_loss_clip": 0.01136007, + "auxiliary_loss_mlp": 0.00763388, + "balance_loss_clip": 1.05180883, + "balance_loss_mlp": 1.00108659, + "epoch": 0.48818613599470934, + "flos": 16723060928640.0, + "grad_norm": 2.312371667887521, + "language_loss": 0.79723018, + "learning_rate": 2.1734923518384537e-06, + "loss": 0.8162241, + "num_input_tokens_seen": 87451420, + "step": 4060, + "time_per_iteration": 2.5136823654174805 + }, + { + "auxiliary_loss_clip": 0.01126141, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.04966462, + "balance_loss_mlp": 1.0253849, + "epoch": 0.4883063788853484, + "flos": 26756932803840.0, + "grad_norm": 1.7166913165396094, + "language_loss": 0.82115889, + "learning_rate": 2.1727163034273547e-06, + "loss": 0.84275031, + "num_input_tokens_seen": 87469585, + "step": 4061, + "time_per_iteration": 2.582867383956909 + }, + { + "auxiliary_loss_clip": 0.01170889, + "auxiliary_loss_mlp": 0.0102381, + "balance_loss_clip": 1.05074775, + "balance_loss_mlp": 1.01544118, + "epoch": 0.4884266217759875, + "flos": 16763388923520.0, + "grad_norm": 2.3821124602703145, + "language_loss": 0.78977025, + "learning_rate": 2.17194022881533e-06, + "loss": 0.81171715, + "num_input_tokens_seen": 87485675, + "step": 4062, + "time_per_iteration": 2.5906808376312256 + }, + { + "auxiliary_loss_clip": 0.01159104, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.05037808, + "balance_loss_mlp": 1.02271152, + "epoch": 0.4885468646666266, + "flos": 24207132003840.0, + "grad_norm": 2.0898252666058927, + "language_loss": 0.67928243, + "learning_rate": 2.1711641281201092e-06, + "loss": 0.70118475, + "num_input_tokens_seen": 87505605, + "step": 4063, + "time_per_iteration": 2.546323776245117 + }, + { + "auxiliary_loss_clip": 0.011687, + "auxiliary_loss_mlp": 0.01025238, + "balance_loss_clip": 1.05382347, + "balance_loss_mlp": 1.01711953, + "epoch": 0.48866710755726567, + "flos": 14610795696000.0, + "grad_norm": 2.03546370745362, + "language_loss": 0.79512888, + "learning_rate": 2.1703880014594264e-06, + "loss": 0.81706822, + "num_input_tokens_seen": 87523195, + "step": 4064, + "time_per_iteration": 2.466646909713745 + }, + { + "auxiliary_loss_clip": 0.0112073, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.04992867, + "balance_loss_mlp": 1.02330852, + "epoch": 0.4887873504479048, + "flos": 28804451771520.0, + "grad_norm": 2.7077158741934757, + "language_loss": 0.73981357, + "learning_rate": 2.1696118489510182e-06, + "loss": 0.76133078, + "num_input_tokens_seen": 87544125, + "step": 4065, + "time_per_iteration": 2.625756025314331 + }, + { + "auxiliary_loss_clip": 0.01145041, + "auxiliary_loss_mlp": 0.00762972, + "balance_loss_clip": 1.05019569, + "balance_loss_mlp": 1.00108325, + "epoch": 0.48890759333854383, + "flos": 22784387224320.0, + "grad_norm": 1.7150131014522823, + "language_loss": 0.7286489, + "learning_rate": 2.1688356707126286e-06, + "loss": 0.74772906, + "num_input_tokens_seen": 87563745, + "step": 4066, + "time_per_iteration": 2.6222708225250244 + }, + { + "auxiliary_loss_clip": 0.01139187, + "auxiliary_loss_mlp": 0.01026227, + "balance_loss_clip": 1.05034661, + "balance_loss_mlp": 1.01803732, + "epoch": 0.48902783622918294, + "flos": 17786088956160.0, + "grad_norm": 2.939397531756088, + "language_loss": 0.69878298, + "learning_rate": 2.168059466862001e-06, + "loss": 0.72043705, + "num_input_tokens_seen": 87581895, + "step": 4067, + "time_per_iteration": 2.527578830718994 + }, + { + "auxiliary_loss_clip": 0.01154873, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.04773855, + "balance_loss_mlp": 1.02183688, + "epoch": 0.48914807911982205, + "flos": 22310294590080.0, + "grad_norm": 2.0450124068131803, + "language_loss": 0.81420648, + "learning_rate": 2.167283237516887e-06, + "loss": 0.83605015, + "num_input_tokens_seen": 87600170, + "step": 4068, + "time_per_iteration": 2.5157320499420166 + }, + { + "auxiliary_loss_clip": 0.01158684, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.0515542, + "balance_loss_mlp": 1.02739334, + "epoch": 0.4892683220104611, + "flos": 16363020954240.0, + "grad_norm": 1.7613835925897419, + "language_loss": 0.74417078, + "learning_rate": 2.1665069827950383e-06, + "loss": 0.76611519, + "num_input_tokens_seen": 87617455, + "step": 4069, + "time_per_iteration": 2.5177013874053955 + }, + { + "auxiliary_loss_clip": 0.01156496, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.05142736, + "balance_loss_mlp": 1.0189774, + "epoch": 0.4893885649011002, + "flos": 15739144606080.0, + "grad_norm": 1.7904742157695765, + "language_loss": 0.86503506, + "learning_rate": 2.1657307028142126e-06, + "loss": 0.88686603, + "num_input_tokens_seen": 87634995, + "step": 4070, + "time_per_iteration": 3.3069605827331543 + }, + { + "auxiliary_loss_clip": 0.01155601, + "auxiliary_loss_mlp": 0.01027643, + "balance_loss_clip": 1.05199242, + "balance_loss_mlp": 1.01897693, + "epoch": 0.48950880779173933, + "flos": 28581984887040.0, + "grad_norm": 1.9270043551654914, + "language_loss": 0.67098647, + "learning_rate": 2.164954397692171e-06, + "loss": 0.69281888, + "num_input_tokens_seen": 87654420, + "step": 4071, + "time_per_iteration": 2.5814785957336426 + }, + { + "auxiliary_loss_clip": 0.01062161, + "auxiliary_loss_mlp": 0.01005385, + "balance_loss_clip": 1.01780903, + "balance_loss_mlp": 1.00407934, + "epoch": 0.4896290506823784, + "flos": 66186310746240.0, + "grad_norm": 1.0687477978333177, + "language_loss": 0.77359563, + "learning_rate": 2.164178067546678e-06, + "loss": 0.79427111, + "num_input_tokens_seen": 87713585, + "step": 4072, + "time_per_iteration": 4.692058563232422 + }, + { + "auxiliary_loss_clip": 0.01161462, + "auxiliary_loss_mlp": 0.01027382, + "balance_loss_clip": 1.05039334, + "balance_loss_mlp": 1.01927614, + "epoch": 0.4897492935730175, + "flos": 12531065207040.0, + "grad_norm": 1.9339903011883162, + "language_loss": 0.91075945, + "learning_rate": 2.163401712495504e-06, + "loss": 0.93264788, + "num_input_tokens_seen": 87731280, + "step": 4073, + "time_per_iteration": 3.270004987716675 + }, + { + "auxiliary_loss_clip": 0.01129219, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.04855204, + "balance_loss_mlp": 1.02242279, + "epoch": 0.4898695364636566, + "flos": 23476816679040.0, + "grad_norm": 3.008613366047532, + "language_loss": 0.79047358, + "learning_rate": 2.1626253326564194e-06, + "loss": 0.81207502, + "num_input_tokens_seen": 87750230, + "step": 4074, + "time_per_iteration": 2.62384295463562 + }, + { + "auxiliary_loss_clip": 0.01153776, + "auxiliary_loss_mlp": 0.01030646, + "balance_loss_clip": 1.04879069, + "balance_loss_mlp": 1.02231896, + "epoch": 0.48998977935429566, + "flos": 27160209774720.0, + "grad_norm": 1.6910957714972306, + "language_loss": 0.76933908, + "learning_rate": 2.161848928147201e-06, + "loss": 0.79118329, + "num_input_tokens_seen": 87770500, + "step": 4075, + "time_per_iteration": 2.568398952484131 + }, + { + "auxiliary_loss_clip": 0.01170375, + "auxiliary_loss_mlp": 0.01023179, + "balance_loss_clip": 1.05478251, + "balance_loss_mlp": 1.01518047, + "epoch": 0.4901100222449348, + "flos": 20339588856960.0, + "grad_norm": 2.1109603251678664, + "language_loss": 0.80377555, + "learning_rate": 2.161072499085629e-06, + "loss": 0.82571107, + "num_input_tokens_seen": 87789495, + "step": 4076, + "time_per_iteration": 2.4966092109680176 + }, + { + "auxiliary_loss_clip": 0.01148796, + "auxiliary_loss_mlp": 0.01025289, + "balance_loss_clip": 1.05240679, + "balance_loss_mlp": 1.01739693, + "epoch": 0.4902302651355739, + "flos": 30446359384320.0, + "grad_norm": 1.8889688233494577, + "language_loss": 0.82889009, + "learning_rate": 2.160296045589487e-06, + "loss": 0.850631, + "num_input_tokens_seen": 87812955, + "step": 4077, + "time_per_iteration": 2.6216163635253906 + }, + { + "auxiliary_loss_clip": 0.01168518, + "auxiliary_loss_mlp": 0.01022951, + "balance_loss_clip": 1.053262, + "balance_loss_mlp": 1.01467764, + "epoch": 0.49035050802621294, + "flos": 19174180089600.0, + "grad_norm": 2.195938307712388, + "language_loss": 0.69553548, + "learning_rate": 2.159519567776562e-06, + "loss": 0.71745014, + "num_input_tokens_seen": 87832605, + "step": 4078, + "time_per_iteration": 2.486017942428589 + }, + { + "auxiliary_loss_clip": 0.01130361, + "auxiliary_loss_mlp": 0.01024898, + "balance_loss_clip": 1.04451108, + "balance_loss_mlp": 1.0168246, + "epoch": 0.49047075091685205, + "flos": 22228489365120.0, + "grad_norm": 3.2335055566388355, + "language_loss": 0.70122564, + "learning_rate": 2.1587430657646463e-06, + "loss": 0.72277826, + "num_input_tokens_seen": 87846040, + "step": 4079, + "time_per_iteration": 2.551574468612671 + }, + { + "auxiliary_loss_clip": 0.01155204, + "auxiliary_loss_mlp": 0.01023607, + "balance_loss_clip": 1.05257726, + "balance_loss_mlp": 1.01585853, + "epoch": 0.4905909938074911, + "flos": 20156516213760.0, + "grad_norm": 1.762889690955025, + "language_loss": 0.77995205, + "learning_rate": 2.157966539671533e-06, + "loss": 0.80174011, + "num_input_tokens_seen": 87865680, + "step": 4080, + "time_per_iteration": 2.599757432937622 + }, + { + "auxiliary_loss_clip": 0.01141845, + "auxiliary_loss_mlp": 0.01024605, + "balance_loss_clip": 1.04830694, + "balance_loss_mlp": 1.01701474, + "epoch": 0.4907112366981302, + "flos": 17202217380480.0, + "grad_norm": 2.0602145480640677, + "language_loss": 0.67428142, + "learning_rate": 2.157189989615021e-06, + "loss": 0.69594592, + "num_input_tokens_seen": 87884270, + "step": 4081, + "time_per_iteration": 2.59912371635437 + }, + { + "auxiliary_loss_clip": 0.01170677, + "auxiliary_loss_mlp": 0.00763192, + "balance_loss_clip": 1.05129349, + "balance_loss_mlp": 1.0010128, + "epoch": 0.4908314795887693, + "flos": 21688968107520.0, + "grad_norm": 2.832664166734897, + "language_loss": 0.74896646, + "learning_rate": 2.156413415712913e-06, + "loss": 0.76830518, + "num_input_tokens_seen": 87906320, + "step": 4082, + "time_per_iteration": 2.5230844020843506 + }, + { + "auxiliary_loss_clip": 0.01162797, + "auxiliary_loss_mlp": 0.00763154, + "balance_loss_clip": 1.05404246, + "balance_loss_mlp": 1.00108254, + "epoch": 0.4909517224794084, + "flos": 26213676531840.0, + "grad_norm": 1.720137084456026, + "language_loss": 0.78604436, + "learning_rate": 2.155636818083014e-06, + "loss": 0.80530387, + "num_input_tokens_seen": 87927690, + "step": 4083, + "time_per_iteration": 2.590153694152832 + }, + { + "auxiliary_loss_clip": 0.01153646, + "auxiliary_loss_mlp": 0.01023895, + "balance_loss_clip": 1.05304003, + "balance_loss_mlp": 1.01667666, + "epoch": 0.4910719653700475, + "flos": 23148377694720.0, + "grad_norm": 2.5629943432706224, + "language_loss": 0.84343719, + "learning_rate": 2.154860196843134e-06, + "loss": 0.86521256, + "num_input_tokens_seen": 87946885, + "step": 4084, + "time_per_iteration": 2.534717082977295 + }, + { + "auxiliary_loss_clip": 0.01184592, + "auxiliary_loss_mlp": 0.01027154, + "balance_loss_clip": 1.05457032, + "balance_loss_mlp": 1.01928592, + "epoch": 0.4911922082606866, + "flos": 23331845387520.0, + "grad_norm": 1.8543534607663859, + "language_loss": 0.76975381, + "learning_rate": 2.154083552111085e-06, + "loss": 0.79187131, + "num_input_tokens_seen": 87966055, + "step": 4085, + "time_per_iteration": 2.4947783946990967 + }, + { + "auxiliary_loss_clip": 0.01185413, + "auxiliary_loss_mlp": 0.01024259, + "balance_loss_clip": 1.05302787, + "balance_loss_mlp": 1.01621187, + "epoch": 0.49131245115132566, + "flos": 29203239542400.0, + "grad_norm": 2.0054307907291418, + "language_loss": 0.81928223, + "learning_rate": 2.1533068840046834e-06, + "loss": 0.84137899, + "num_input_tokens_seen": 87986320, + "step": 4086, + "time_per_iteration": 2.5350632667541504 + }, + { + "auxiliary_loss_clip": 0.01149643, + "auxiliary_loss_mlp": 0.00763521, + "balance_loss_clip": 1.04943597, + "balance_loss_mlp": 1.00107932, + "epoch": 0.49143269404196477, + "flos": 20147465986560.0, + "grad_norm": 3.180094333122146, + "language_loss": 0.61765528, + "learning_rate": 2.152530192641749e-06, + "loss": 0.63678694, + "num_input_tokens_seen": 88001230, + "step": 4087, + "time_per_iteration": 2.512162446975708 + }, + { + "auxiliary_loss_clip": 0.01174713, + "auxiliary_loss_mlp": 0.01029097, + "balance_loss_clip": 1.05345488, + "balance_loss_mlp": 1.02061558, + "epoch": 0.4915529369326039, + "flos": 24389809597440.0, + "grad_norm": 1.9640240947935297, + "language_loss": 0.71953577, + "learning_rate": 2.1517534781401068e-06, + "loss": 0.74157393, + "num_input_tokens_seen": 88019110, + "step": 4088, + "time_per_iteration": 2.545927047729492 + }, + { + "auxiliary_loss_clip": 0.01170338, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.0537293, + "balance_loss_mlp": 1.0184288, + "epoch": 0.49167317982324293, + "flos": 10524305197440.0, + "grad_norm": 2.030438744720595, + "language_loss": 0.68955004, + "learning_rate": 2.150976740617581e-06, + "loss": 0.71152133, + "num_input_tokens_seen": 88035670, + "step": 4089, + "time_per_iteration": 2.530869960784912 + }, + { + "auxiliary_loss_clip": 0.01162262, + "auxiliary_loss_mlp": 0.01027633, + "balance_loss_clip": 1.05397391, + "balance_loss_mlp": 1.01956248, + "epoch": 0.49179342271388204, + "flos": 25593427457280.0, + "grad_norm": 2.2610573511167633, + "language_loss": 0.7123245, + "learning_rate": 2.150199980192006e-06, + "loss": 0.73422343, + "num_input_tokens_seen": 88054790, + "step": 4090, + "time_per_iteration": 2.585406541824341 + }, + { + "auxiliary_loss_clip": 0.01148666, + "auxiliary_loss_mlp": 0.0102379, + "balance_loss_clip": 1.04899955, + "balance_loss_mlp": 1.01583612, + "epoch": 0.49191366560452116, + "flos": 21102043875840.0, + "grad_norm": 2.127132627216119, + "language_loss": 0.80868995, + "learning_rate": 2.1494231969812114e-06, + "loss": 0.83041453, + "num_input_tokens_seen": 88073780, + "step": 4091, + "time_per_iteration": 2.558048725128174 + }, + { + "auxiliary_loss_clip": 0.01147343, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.05267191, + "balance_loss_mlp": 1.01999474, + "epoch": 0.4920339084951602, + "flos": 26067520091520.0, + "grad_norm": 2.1571752331795278, + "language_loss": 0.80993307, + "learning_rate": 2.1486463911030372e-06, + "loss": 0.83169079, + "num_input_tokens_seen": 88094430, + "step": 4092, + "time_per_iteration": 2.586113452911377 + }, + { + "auxiliary_loss_clip": 0.01152507, + "auxiliary_loss_mlp": 0.01030316, + "balance_loss_clip": 1.04830611, + "balance_loss_mlp": 1.02219796, + "epoch": 0.4921541513857993, + "flos": 25081269384960.0, + "grad_norm": 2.11780242005935, + "language_loss": 0.74620765, + "learning_rate": 2.147869562675324e-06, + "loss": 0.76803589, + "num_input_tokens_seen": 88113400, + "step": 4093, + "time_per_iteration": 2.541212320327759 + }, + { + "auxiliary_loss_clip": 0.0116893, + "auxiliary_loss_mlp": 0.01026317, + "balance_loss_clip": 1.05276728, + "balance_loss_mlp": 1.01841307, + "epoch": 0.49227439427643843, + "flos": 24389809597440.0, + "grad_norm": 1.9521607761426731, + "language_loss": 0.7209481, + "learning_rate": 2.147092711815915e-06, + "loss": 0.74290055, + "num_input_tokens_seen": 88132750, + "step": 4094, + "time_per_iteration": 2.520172119140625 + }, + { + "auxiliary_loss_clip": 0.01139448, + "auxiliary_loss_mlp": 0.01023293, + "balance_loss_clip": 1.05054641, + "balance_loss_mlp": 1.01605129, + "epoch": 0.4923946371670775, + "flos": 11363753018880.0, + "grad_norm": 2.5621202938040297, + "language_loss": 0.86265206, + "learning_rate": 2.1463158386426593e-06, + "loss": 0.88427955, + "num_input_tokens_seen": 88150560, + "step": 4095, + "time_per_iteration": 2.5106353759765625 + }, + { + "auxiliary_loss_clip": 0.01163556, + "auxiliary_loss_mlp": 0.01028239, + "balance_loss_clip": 1.05330002, + "balance_loss_mlp": 1.01979935, + "epoch": 0.4925148800577166, + "flos": 30445964334720.0, + "grad_norm": 2.496246846407157, + "language_loss": 0.77497292, + "learning_rate": 2.145538943273407e-06, + "loss": 0.79689091, + "num_input_tokens_seen": 88170835, + "step": 4096, + "time_per_iteration": 2.5783615112304688 + }, + { + "auxiliary_loss_clip": 0.01185068, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.05488575, + "balance_loss_mlp": 1.02302396, + "epoch": 0.49263512294835565, + "flos": 20850454039680.0, + "grad_norm": 2.0943925539783166, + "language_loss": 0.71861506, + "learning_rate": 2.144762025826013e-06, + "loss": 0.74077821, + "num_input_tokens_seen": 88189925, + "step": 4097, + "time_per_iteration": 3.138936996459961 + }, + { + "auxiliary_loss_clip": 0.01173718, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.05207825, + "balance_loss_mlp": 1.02340901, + "epoch": 0.49275536583899476, + "flos": 23767477534080.0, + "grad_norm": 2.098530237709984, + "language_loss": 0.86916041, + "learning_rate": 2.143985086418334e-06, + "loss": 0.89121282, + "num_input_tokens_seen": 88205105, + "step": 4098, + "time_per_iteration": 2.478965997695923 + }, + { + "auxiliary_loss_clip": 0.01159434, + "auxiliary_loss_mlp": 0.01024393, + "balance_loss_clip": 1.0525434, + "balance_loss_mlp": 1.01662612, + "epoch": 0.4928756087296339, + "flos": 22273522041600.0, + "grad_norm": 1.4210714601979717, + "language_loss": 0.76299262, + "learning_rate": 2.1432081251682324e-06, + "loss": 0.78483087, + "num_input_tokens_seen": 88225475, + "step": 4099, + "time_per_iteration": 4.831442832946777 + }, + { + "auxiliary_loss_clip": 0.01172486, + "auxiliary_loss_mlp": 0.01026859, + "balance_loss_clip": 1.05847108, + "balance_loss_mlp": 1.01846707, + "epoch": 0.49299585162027293, + "flos": 19645471463040.0, + "grad_norm": 1.8012910822008628, + "language_loss": 0.87356484, + "learning_rate": 2.142431142193572e-06, + "loss": 0.89555836, + "num_input_tokens_seen": 88243255, + "step": 4100, + "time_per_iteration": 2.4614784717559814 + }, + { + "auxiliary_loss_clip": 0.01183966, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.05535567, + "balance_loss_mlp": 1.02138495, + "epoch": 0.49311609451091204, + "flos": 38837138497920.0, + "grad_norm": 2.092447468316713, + "language_loss": 0.71725142, + "learning_rate": 2.1416541376122207e-06, + "loss": 0.73938626, + "num_input_tokens_seen": 88263435, + "step": 4101, + "time_per_iteration": 2.575352907180786 + }, + { + "auxiliary_loss_clip": 0.01184304, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.05308306, + "balance_loss_mlp": 1.01923585, + "epoch": 0.49323633740155115, + "flos": 28329102161280.0, + "grad_norm": 1.8508386019212972, + "language_loss": 0.73218465, + "learning_rate": 2.1408771115420496e-06, + "loss": 0.75430727, + "num_input_tokens_seen": 88283295, + "step": 4102, + "time_per_iteration": 2.48590350151062 + }, + { + "auxiliary_loss_clip": 0.01128701, + "auxiliary_loss_mlp": 0.0102567, + "balance_loss_clip": 1.05114222, + "balance_loss_mlp": 1.01776648, + "epoch": 0.4933565802921902, + "flos": 21135584200320.0, + "grad_norm": 1.966160199415806, + "language_loss": 0.64908803, + "learning_rate": 2.140100064100932e-06, + "loss": 0.67063177, + "num_input_tokens_seen": 88299270, + "step": 4103, + "time_per_iteration": 2.577543258666992 + }, + { + "auxiliary_loss_clip": 0.01166416, + "auxiliary_loss_mlp": 0.01023371, + "balance_loss_clip": 1.05220222, + "balance_loss_mlp": 1.01585448, + "epoch": 0.4934768231828293, + "flos": 18039007595520.0, + "grad_norm": 2.657523298970842, + "language_loss": 0.7554698, + "learning_rate": 2.139322995406746e-06, + "loss": 0.77736759, + "num_input_tokens_seen": 88316905, + "step": 4104, + "time_per_iteration": 2.450674533843994 + }, + { + "auxiliary_loss_clip": 0.01186801, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.05652368, + "balance_loss_mlp": 1.02274966, + "epoch": 0.4935970660734684, + "flos": 23469957181440.0, + "grad_norm": 3.9816284843111736, + "language_loss": 0.79736221, + "learning_rate": 2.1385459055773727e-06, + "loss": 0.8195461, + "num_input_tokens_seen": 88335095, + "step": 4105, + "time_per_iteration": 2.4578769207000732 + }, + { + "auxiliary_loss_clip": 0.01113717, + "auxiliary_loss_mlp": 0.00762841, + "balance_loss_clip": 1.04501867, + "balance_loss_mlp": 1.00113916, + "epoch": 0.4937173089641075, + "flos": 64479258840960.0, + "grad_norm": 1.9225835441694286, + "language_loss": 0.73867083, + "learning_rate": 2.137768794730696e-06, + "loss": 0.75743639, + "num_input_tokens_seen": 88358545, + "step": 4106, + "time_per_iteration": 3.021045446395874 + }, + { + "auxiliary_loss_clip": 0.01162206, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.05427623, + "balance_loss_mlp": 1.02194893, + "epoch": 0.4938375518547466, + "flos": 22346025644160.0, + "grad_norm": 1.8043781867363249, + "language_loss": 0.79993069, + "learning_rate": 2.1369916629846026e-06, + "loss": 0.82185638, + "num_input_tokens_seen": 88378295, + "step": 4107, + "time_per_iteration": 2.536414384841919 + }, + { + "auxiliary_loss_clip": 0.01154948, + "auxiliary_loss_mlp": 0.01024047, + "balance_loss_clip": 1.04809308, + "balance_loss_mlp": 1.01609552, + "epoch": 0.4939577947453857, + "flos": 17858700299520.0, + "grad_norm": 1.7778482350677307, + "language_loss": 0.7463752, + "learning_rate": 2.136214510456983e-06, + "loss": 0.76816511, + "num_input_tokens_seen": 88396750, + "step": 4108, + "time_per_iteration": 2.4734764099121094 + }, + { + "auxiliary_loss_clip": 0.01047737, + "auxiliary_loss_mlp": 0.00753032, + "balance_loss_clip": 1.02088583, + "balance_loss_mlp": 1.00053108, + "epoch": 0.49407803763602476, + "flos": 70066746875520.0, + "grad_norm": 0.8914381976028932, + "language_loss": 0.63066429, + "learning_rate": 2.1354373372657296e-06, + "loss": 0.64867198, + "num_input_tokens_seen": 88455190, + "step": 4109, + "time_per_iteration": 3.1658551692962646 + }, + { + "auxiliary_loss_clip": 0.01183175, + "auxiliary_loss_mlp": 0.01027126, + "balance_loss_clip": 1.05478621, + "balance_loss_mlp": 1.01964879, + "epoch": 0.49419828052666387, + "flos": 24317485562880.0, + "grad_norm": 1.5674119760874528, + "language_loss": 0.7103864, + "learning_rate": 2.1346601435287404e-06, + "loss": 0.73248947, + "num_input_tokens_seen": 88477460, + "step": 4110, + "time_per_iteration": 2.4789693355560303 + }, + { + "auxiliary_loss_clip": 0.0115273, + "auxiliary_loss_mlp": 0.0102573, + "balance_loss_clip": 1.04811919, + "balance_loss_mlp": 1.01814795, + "epoch": 0.494318523417303, + "flos": 29386060790400.0, + "grad_norm": 3.4581253206519835, + "language_loss": 0.80305636, + "learning_rate": 2.1338829293639144e-06, + "loss": 0.82484096, + "num_input_tokens_seen": 88497820, + "step": 4111, + "time_per_iteration": 2.591294050216675 + }, + { + "auxiliary_loss_clip": 0.01123982, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.0473423, + "balance_loss_mlp": 1.02279091, + "epoch": 0.49443876630794203, + "flos": 15268284195840.0, + "grad_norm": 2.095574593836752, + "language_loss": 0.83059192, + "learning_rate": 2.1331056948891547e-06, + "loss": 0.85214579, + "num_input_tokens_seen": 88514920, + "step": 4112, + "time_per_iteration": 2.535151720046997 + }, + { + "auxiliary_loss_clip": 0.01149851, + "auxiliary_loss_mlp": 0.01023315, + "balance_loss_clip": 1.04947555, + "balance_loss_mlp": 1.01500559, + "epoch": 0.49455900919858115, + "flos": 12347453859840.0, + "grad_norm": 2.4193947095074617, + "language_loss": 0.76031262, + "learning_rate": 2.1323284402223666e-06, + "loss": 0.78204429, + "num_input_tokens_seen": 88530910, + "step": 4113, + "time_per_iteration": 2.468132734298706 + }, + { + "auxiliary_loss_clip": 0.01183693, + "auxiliary_loss_mlp": 0.00761953, + "balance_loss_clip": 1.05752337, + "balance_loss_mlp": 1.00107765, + "epoch": 0.4946792520892202, + "flos": 22779610715520.0, + "grad_norm": 1.900237146305738, + "language_loss": 0.87942982, + "learning_rate": 2.1315511654814597e-06, + "loss": 0.89888632, + "num_input_tokens_seen": 88549320, + "step": 4114, + "time_per_iteration": 2.4581797122955322 + }, + { + "auxiliary_loss_clip": 0.01148851, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.05190706, + "balance_loss_mlp": 1.01780343, + "epoch": 0.4947994949798593, + "flos": 23148126299520.0, + "grad_norm": 3.5825584688232737, + "language_loss": 0.78252399, + "learning_rate": 2.1307738707843456e-06, + "loss": 0.80426353, + "num_input_tokens_seen": 88568985, + "step": 4115, + "time_per_iteration": 2.5153353214263916 + }, + { + "auxiliary_loss_clip": 0.01175506, + "auxiliary_loss_mlp": 0.01025219, + "balance_loss_clip": 1.05454016, + "balance_loss_mlp": 1.01754236, + "epoch": 0.4949197378704984, + "flos": 23659997063040.0, + "grad_norm": 2.3771109908926444, + "language_loss": 0.69300699, + "learning_rate": 2.1299965562489385e-06, + "loss": 0.71501422, + "num_input_tokens_seen": 88588790, + "step": 4116, + "time_per_iteration": 2.490352153778076 + }, + { + "auxiliary_loss_clip": 0.011664, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.05008292, + "balance_loss_mlp": 1.01937985, + "epoch": 0.4950399807611375, + "flos": 26911493026560.0, + "grad_norm": 4.160400489067851, + "language_loss": 0.78975135, + "learning_rate": 2.129219221993158e-06, + "loss": 0.81168902, + "num_input_tokens_seen": 88613575, + "step": 4117, + "time_per_iteration": 2.5677616596221924 + }, + { + "auxiliary_loss_clip": 0.01046649, + "auxiliary_loss_mlp": 0.01001699, + "balance_loss_clip": 1.01794112, + "balance_loss_mlp": 1.00045288, + "epoch": 0.4951602236517766, + "flos": 67315270187520.0, + "grad_norm": 0.7949111467859981, + "language_loss": 0.59935081, + "learning_rate": 2.128441868134924e-06, + "loss": 0.61983424, + "num_input_tokens_seen": 88675510, + "step": 4118, + "time_per_iteration": 3.1944422721862793 + }, + { + "auxiliary_loss_clip": 0.01142954, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.04725957, + "balance_loss_mlp": 1.02026939, + "epoch": 0.4952804665424157, + "flos": 19901442758400.0, + "grad_norm": 2.1361089026007956, + "language_loss": 0.82681596, + "learning_rate": 2.1276644947921606e-06, + "loss": 0.84852803, + "num_input_tokens_seen": 88694425, + "step": 4119, + "time_per_iteration": 2.582153081893921 + }, + { + "auxiliary_loss_clip": 0.01168171, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.05138826, + "balance_loss_mlp": 1.01816857, + "epoch": 0.49540070943305475, + "flos": 18806813740800.0, + "grad_norm": 2.1193853533700167, + "language_loss": 0.82447696, + "learning_rate": 2.126887102082795e-06, + "loss": 0.84642977, + "num_input_tokens_seen": 88714450, + "step": 4120, + "time_per_iteration": 2.4700841903686523 + }, + { + "auxiliary_loss_clip": 0.01140784, + "auxiliary_loss_mlp": 0.01028154, + "balance_loss_clip": 1.04704189, + "balance_loss_mlp": 1.02030122, + "epoch": 0.49552095232369386, + "flos": 24934179191040.0, + "grad_norm": 1.791609642984421, + "language_loss": 0.70100754, + "learning_rate": 2.126109690124757e-06, + "loss": 0.7226969, + "num_input_tokens_seen": 88735265, + "step": 4121, + "time_per_iteration": 2.5699429512023926 + }, + { + "auxiliary_loss_clip": 0.01128036, + "auxiliary_loss_mlp": 0.01027295, + "balance_loss_clip": 1.04647624, + "balance_loss_mlp": 1.01984406, + "epoch": 0.495641195214333, + "flos": 22857249962880.0, + "grad_norm": 1.7008516697259501, + "language_loss": 0.70637131, + "learning_rate": 2.1253322590359786e-06, + "loss": 0.72792459, + "num_input_tokens_seen": 88754600, + "step": 4122, + "time_per_iteration": 2.5839927196502686 + }, + { + "auxiliary_loss_clip": 0.0116585, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.05001414, + "balance_loss_mlp": 1.02170467, + "epoch": 0.49576143810497203, + "flos": 25769748343680.0, + "grad_norm": 1.7147836738610238, + "language_loss": 0.7421816, + "learning_rate": 2.124554808934397e-06, + "loss": 0.76413286, + "num_input_tokens_seen": 88775180, + "step": 4123, + "time_per_iteration": 3.248758554458618 + }, + { + "auxiliary_loss_clip": 0.01119408, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.04491878, + "balance_loss_mlp": 1.02194464, + "epoch": 0.49588168099561114, + "flos": 22128838058880.0, + "grad_norm": 2.099976662406231, + "language_loss": 0.73157054, + "learning_rate": 2.1237773399379496e-06, + "loss": 0.75306582, + "num_input_tokens_seen": 88796145, + "step": 4124, + "time_per_iteration": 2.624962329864502 + }, + { + "auxiliary_loss_clip": 0.01157408, + "auxiliary_loss_mlp": 0.01024825, + "balance_loss_clip": 1.04541016, + "balance_loss_mlp": 1.01621556, + "epoch": 0.49600192388625025, + "flos": 24387331559040.0, + "grad_norm": 2.3795521436935614, + "language_loss": 0.86752307, + "learning_rate": 2.122999852164578e-06, + "loss": 0.88934541, + "num_input_tokens_seen": 88816765, + "step": 4125, + "time_per_iteration": 3.312451124191284 + }, + { + "auxiliary_loss_clip": 0.0112422, + "auxiliary_loss_mlp": 0.01023081, + "balance_loss_clip": 1.04739451, + "balance_loss_mlp": 1.0148139, + "epoch": 0.4961221667768893, + "flos": 22857429530880.0, + "grad_norm": 2.459939710639832, + "language_loss": 0.58258259, + "learning_rate": 2.122222345732227e-06, + "loss": 0.60405564, + "num_input_tokens_seen": 88836680, + "step": 4126, + "time_per_iteration": 4.097977876663208 + }, + { + "auxiliary_loss_clip": 0.01141743, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.04780519, + "balance_loss_mlp": 1.02078021, + "epoch": 0.4962424096675284, + "flos": 17858089768320.0, + "grad_norm": 2.4494514896419437, + "language_loss": 0.83162868, + "learning_rate": 2.121444820758843e-06, + "loss": 0.85333693, + "num_input_tokens_seen": 88855320, + "step": 4127, + "time_per_iteration": 2.5199851989746094 + }, + { + "auxiliary_loss_clip": 0.01123381, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.04815626, + "balance_loss_mlp": 1.02377725, + "epoch": 0.49636265255816747, + "flos": 21793611404160.0, + "grad_norm": 2.0792698011601667, + "language_loss": 0.78766662, + "learning_rate": 2.120667277362376e-06, + "loss": 0.80922222, + "num_input_tokens_seen": 88874035, + "step": 4128, + "time_per_iteration": 2.5826902389526367 + }, + { + "auxiliary_loss_clip": 0.01185946, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.05537605, + "balance_loss_mlp": 1.02731776, + "epoch": 0.4964828954488066, + "flos": 16358603581440.0, + "grad_norm": 2.41073277998235, + "language_loss": 0.84937322, + "learning_rate": 2.1198897156607796e-06, + "loss": 0.87159175, + "num_input_tokens_seen": 88891390, + "step": 4129, + "time_per_iteration": 2.424842357635498 + }, + { + "auxiliary_loss_clip": 0.01173925, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.0514791, + "balance_loss_mlp": 1.02100861, + "epoch": 0.4966031383394457, + "flos": 24711101775360.0, + "grad_norm": 1.88039057283844, + "language_loss": 0.73960644, + "learning_rate": 2.1191121357720085e-06, + "loss": 0.76163542, + "num_input_tokens_seen": 88909450, + "step": 4130, + "time_per_iteration": 2.5549721717834473 + }, + { + "auxiliary_loss_clip": 0.01116889, + "auxiliary_loss_mlp": 0.01024484, + "balance_loss_clip": 1.0454185, + "balance_loss_mlp": 1.01637173, + "epoch": 0.49672338123008475, + "flos": 22930615491840.0, + "grad_norm": 1.8001381617149137, + "language_loss": 0.7467227, + "learning_rate": 2.1183345378140206e-06, + "loss": 0.76813644, + "num_input_tokens_seen": 88929195, + "step": 4131, + "time_per_iteration": 2.590911865234375 + }, + { + "auxiliary_loss_clip": 0.01072115, + "auxiliary_loss_mlp": 0.01001668, + "balance_loss_clip": 1.01733148, + "balance_loss_mlp": 1.00039279, + "epoch": 0.49684362412072386, + "flos": 65976736844160.0, + "grad_norm": 0.8595199510744767, + "language_loss": 0.62001455, + "learning_rate": 2.1175569219047783e-06, + "loss": 0.64075238, + "num_input_tokens_seen": 88990635, + "step": 4132, + "time_per_iteration": 3.191307544708252 + }, + { + "auxiliary_loss_clip": 0.01182997, + "auxiliary_loss_mlp": 0.01027175, + "balance_loss_clip": 1.05301142, + "balance_loss_mlp": 1.01967645, + "epoch": 0.49696386701136297, + "flos": 19971288754560.0, + "grad_norm": 1.5705840929011408, + "language_loss": 0.73518836, + "learning_rate": 2.1167792881622437e-06, + "loss": 0.75729012, + "num_input_tokens_seen": 89009655, + "step": 4133, + "time_per_iteration": 2.5357041358947754 + }, + { + "auxiliary_loss_clip": 0.01149695, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.05001831, + "balance_loss_mlp": 1.0261687, + "epoch": 0.497084109902002, + "flos": 24750819239040.0, + "grad_norm": 1.541156604097342, + "language_loss": 0.80898684, + "learning_rate": 2.116001636704384e-06, + "loss": 0.83082086, + "num_input_tokens_seen": 89030040, + "step": 4134, + "time_per_iteration": 2.53039288520813 + }, + { + "auxiliary_loss_clip": 0.01136394, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.0479387, + "balance_loss_mlp": 1.02400219, + "epoch": 0.49720435279264114, + "flos": 21871825269120.0, + "grad_norm": 1.961655020691588, + "language_loss": 0.8016988, + "learning_rate": 2.1152239676491685e-06, + "loss": 0.82338488, + "num_input_tokens_seen": 89048145, + "step": 4135, + "time_per_iteration": 2.5748109817504883 + }, + { + "auxiliary_loss_clip": 0.01158892, + "auxiliary_loss_mlp": 0.01026821, + "balance_loss_clip": 1.04787254, + "balance_loss_mlp": 1.01900709, + "epoch": 0.49732459568328025, + "flos": 23805794367360.0, + "grad_norm": 1.641663281757897, + "language_loss": 0.73385823, + "learning_rate": 2.114446281114569e-06, + "loss": 0.75571537, + "num_input_tokens_seen": 89067165, + "step": 4136, + "time_per_iteration": 2.5277528762817383 + }, + { + "auxiliary_loss_clip": 0.01146176, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.04881763, + "balance_loss_mlp": 1.01827204, + "epoch": 0.4974448385739193, + "flos": 20047742853120.0, + "grad_norm": 2.077240814850994, + "language_loss": 0.75857502, + "learning_rate": 2.1136685772185587e-06, + "loss": 0.78030318, + "num_input_tokens_seen": 89086190, + "step": 4137, + "time_per_iteration": 2.505084753036499 + }, + { + "auxiliary_loss_clip": 0.01152409, + "auxiliary_loss_mlp": 0.00763351, + "balance_loss_clip": 1.04371226, + "balance_loss_mlp": 1.00090361, + "epoch": 0.4975650814645584, + "flos": 24821347593600.0, + "grad_norm": 1.6871605262695266, + "language_loss": 0.77577454, + "learning_rate": 2.1128908560791163e-06, + "loss": 0.79493213, + "num_input_tokens_seen": 89106020, + "step": 4138, + "time_per_iteration": 2.5467522144317627 + }, + { + "auxiliary_loss_clip": 0.01183123, + "auxiliary_loss_mlp": 0.01027118, + "balance_loss_clip": 1.05397606, + "balance_loss_mlp": 1.01922059, + "epoch": 0.4976853243551975, + "flos": 19829477859840.0, + "grad_norm": 2.299139070413666, + "language_loss": 0.78207576, + "learning_rate": 2.1121131178142203e-06, + "loss": 0.80417818, + "num_input_tokens_seen": 89125385, + "step": 4139, + "time_per_iteration": 2.443603038787842 + }, + { + "auxiliary_loss_clip": 0.01155919, + "auxiliary_loss_mlp": 0.01023849, + "balance_loss_clip": 1.04877591, + "balance_loss_mlp": 1.01635706, + "epoch": 0.4978055672458366, + "flos": 23142990654720.0, + "grad_norm": 1.5456270085323398, + "language_loss": 0.82336998, + "learning_rate": 2.1113353625418544e-06, + "loss": 0.84516764, + "num_input_tokens_seen": 89143935, + "step": 4140, + "time_per_iteration": 2.5364387035369873 + }, + { + "auxiliary_loss_clip": 0.01162378, + "auxiliary_loss_mlp": 0.01027585, + "balance_loss_clip": 1.05299711, + "balance_loss_mlp": 1.02039051, + "epoch": 0.4979258101364757, + "flos": 15559914718080.0, + "grad_norm": 1.7282821990601283, + "language_loss": 0.78896809, + "learning_rate": 2.1105575903800017e-06, + "loss": 0.81086773, + "num_input_tokens_seen": 89162655, + "step": 4141, + "time_per_iteration": 2.446129083633423 + }, + { + "auxiliary_loss_clip": 0.01172108, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.04994023, + "balance_loss_mlp": 1.0159322, + "epoch": 0.4980460530271148, + "flos": 26356169784960.0, + "grad_norm": 1.8421184960704151, + "language_loss": 0.84717149, + "learning_rate": 2.1097798014466502e-06, + "loss": 0.869133, + "num_input_tokens_seen": 89182255, + "step": 4142, + "time_per_iteration": 2.541637897491455 + }, + { + "auxiliary_loss_clip": 0.01171599, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.05143237, + "balance_loss_mlp": 1.02136636, + "epoch": 0.49816629591775385, + "flos": 17274541415040.0, + "grad_norm": 2.247800493110876, + "language_loss": 0.59010959, + "learning_rate": 2.109001995859791e-06, + "loss": 0.61212373, + "num_input_tokens_seen": 89201155, + "step": 4143, + "time_per_iteration": 2.4417178630828857 + }, + { + "auxiliary_loss_clip": 0.01057819, + "auxiliary_loss_mlp": 0.01006246, + "balance_loss_clip": 1.01674414, + "balance_loss_mlp": 1.00505376, + "epoch": 0.49828653880839296, + "flos": 64930947344640.0, + "grad_norm": 0.7912449873712613, + "language_loss": 0.60107982, + "learning_rate": 2.108224173737415e-06, + "loss": 0.62172055, + "num_input_tokens_seen": 89264455, + "step": 4144, + "time_per_iteration": 3.0815281867980957 + }, + { + "auxiliary_loss_clip": 0.01149271, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.04625177, + "balance_loss_mlp": 1.02014184, + "epoch": 0.498406781699032, + "flos": 27484806003840.0, + "grad_norm": 1.9305115616528459, + "language_loss": 0.75944149, + "learning_rate": 2.1074463351975183e-06, + "loss": 0.7812233, + "num_input_tokens_seen": 89283340, + "step": 4145, + "time_per_iteration": 2.5545029640197754 + }, + { + "auxiliary_loss_clip": 0.01144376, + "auxiliary_loss_mlp": 0.01024069, + "balance_loss_clip": 1.0479393, + "balance_loss_mlp": 1.01621008, + "epoch": 0.49852702458967113, + "flos": 31499870307840.0, + "grad_norm": 1.8819829138390596, + "language_loss": 0.71382451, + "learning_rate": 2.106668480358098e-06, + "loss": 0.73550904, + "num_input_tokens_seen": 89303565, + "step": 4146, + "time_per_iteration": 2.6524927616119385 + }, + { + "auxiliary_loss_clip": 0.01150873, + "auxiliary_loss_mlp": 0.01024729, + "balance_loss_clip": 1.04633653, + "balance_loss_mlp": 1.01618791, + "epoch": 0.49864726748031024, + "flos": 22852868503680.0, + "grad_norm": 1.7038136804866477, + "language_loss": 0.70785964, + "learning_rate": 2.105890609337154e-06, + "loss": 0.72961569, + "num_input_tokens_seen": 89322080, + "step": 4147, + "time_per_iteration": 2.5522513389587402 + }, + { + "auxiliary_loss_clip": 0.01081594, + "auxiliary_loss_mlp": 0.01001909, + "balance_loss_clip": 1.01814008, + "balance_loss_mlp": 1.00062132, + "epoch": 0.4987675103709493, + "flos": 70405708544640.0, + "grad_norm": 0.6997613974781141, + "language_loss": 0.6385017, + "learning_rate": 2.1051127222526883e-06, + "loss": 0.65933669, + "num_input_tokens_seen": 89394195, + "step": 4148, + "time_per_iteration": 3.121534824371338 + }, + { + "auxiliary_loss_clip": 0.01166835, + "auxiliary_loss_mlp": 0.0102448, + "balance_loss_clip": 1.05340862, + "balance_loss_mlp": 1.01660657, + "epoch": 0.4988877532615884, + "flos": 28767571482240.0, + "grad_norm": 1.6572534893200803, + "language_loss": 0.8087796, + "learning_rate": 2.1043348192227067e-06, + "loss": 0.83069271, + "num_input_tokens_seen": 89414565, + "step": 4149, + "time_per_iteration": 2.543250322341919 + }, + { + "auxiliary_loss_clip": 0.0112999, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.0478766, + "balance_loss_mlp": 1.02300584, + "epoch": 0.4990079961522275, + "flos": 16872700988160.0, + "grad_norm": 1.8453994376288574, + "language_loss": 0.6156776, + "learning_rate": 2.1035569003652156e-06, + "loss": 0.63728356, + "num_input_tokens_seen": 89433195, + "step": 4150, + "time_per_iteration": 3.2850828170776367 + }, + { + "auxiliary_loss_clip": 0.01123583, + "auxiliary_loss_mlp": 0.01038994, + "balance_loss_clip": 1.04640031, + "balance_loss_mlp": 1.02945113, + "epoch": 0.4991282390428666, + "flos": 13291042187520.0, + "grad_norm": 2.1889938662486865, + "language_loss": 0.81720376, + "learning_rate": 2.1027789657982255e-06, + "loss": 0.83882952, + "num_input_tokens_seen": 89447410, + "step": 4151, + "time_per_iteration": 3.3802101612091064 + }, + { + "auxiliary_loss_clip": 0.01127296, + "auxiliary_loss_mlp": 0.01031487, + "balance_loss_clip": 1.04743552, + "balance_loss_mlp": 1.02335131, + "epoch": 0.4992484819335057, + "flos": 21537496454400.0, + "grad_norm": 1.9511104602440021, + "language_loss": 0.77083546, + "learning_rate": 2.1020010156397482e-06, + "loss": 0.79242337, + "num_input_tokens_seen": 89464630, + "step": 4152, + "time_per_iteration": 3.334953546524048 + }, + { + "auxiliary_loss_clip": 0.01168213, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.05016565, + "balance_loss_mlp": 1.0210191, + "epoch": 0.4993687248241448, + "flos": 24860095390080.0, + "grad_norm": 1.6877848344744621, + "language_loss": 0.77375948, + "learning_rate": 2.101223050007797e-06, + "loss": 0.79573023, + "num_input_tokens_seen": 89483180, + "step": 4153, + "time_per_iteration": 3.2981855869293213 + }, + { + "auxiliary_loss_clip": 0.01079757, + "auxiliary_loss_mlp": 0.01001895, + "balance_loss_clip": 1.01672006, + "balance_loss_mlp": 1.00070894, + "epoch": 0.49948896771478385, + "flos": 62941602453120.0, + "grad_norm": 0.8138083207281288, + "language_loss": 0.5379535, + "learning_rate": 2.1004450690203904e-06, + "loss": 0.55876994, + "num_input_tokens_seen": 89539260, + "step": 4154, + "time_per_iteration": 3.069300413131714 + }, + { + "auxiliary_loss_clip": 0.01079762, + "auxiliary_loss_mlp": 0.01001377, + "balance_loss_clip": 1.01671469, + "balance_loss_mlp": 1.00017345, + "epoch": 0.49960921060542296, + "flos": 68284213516800.0, + "grad_norm": 0.8529792022237408, + "language_loss": 0.6329807, + "learning_rate": 2.099667072795546e-06, + "loss": 0.65379214, + "num_input_tokens_seen": 89601380, + "step": 4155, + "time_per_iteration": 3.0684971809387207 + }, + { + "auxiliary_loss_clip": 0.01165939, + "auxiliary_loss_mlp": 0.01029781, + "balance_loss_clip": 1.04955971, + "balance_loss_mlp": 1.02164471, + "epoch": 0.49972945349606207, + "flos": 23659350618240.0, + "grad_norm": 1.8585868216430024, + "language_loss": 0.79949307, + "learning_rate": 2.0988890614512864e-06, + "loss": 0.82145023, + "num_input_tokens_seen": 89621270, + "step": 4156, + "time_per_iteration": 2.5026328563690186 + }, + { + "auxiliary_loss_clip": 0.01159941, + "auxiliary_loss_mlp": 0.01025264, + "balance_loss_clip": 1.05378652, + "balance_loss_mlp": 1.01719356, + "epoch": 0.4998496963867011, + "flos": 19755825022080.0, + "grad_norm": 1.6636484296211054, + "language_loss": 0.84176475, + "learning_rate": 2.098111035105635e-06, + "loss": 0.86361682, + "num_input_tokens_seen": 89639695, + "step": 4157, + "time_per_iteration": 2.4824917316436768 + }, + { + "auxiliary_loss_clip": 0.01126887, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.05088603, + "balance_loss_mlp": 1.02000904, + "epoch": 0.49996993927734024, + "flos": 22265728790400.0, + "grad_norm": 1.740716822077891, + "language_loss": 0.73212212, + "learning_rate": 2.0973329938766176e-06, + "loss": 0.75367093, + "num_input_tokens_seen": 89657125, + "step": 4158, + "time_per_iteration": 2.5576670169830322 + }, + { + "auxiliary_loss_clip": 0.01173356, + "auxiliary_loss_mlp": 0.01033454, + "balance_loss_clip": 1.05239797, + "balance_loss_mlp": 1.02490091, + "epoch": 0.5000901821679793, + "flos": 23327212533120.0, + "grad_norm": 2.070189780415778, + "language_loss": 0.78813958, + "learning_rate": 2.0965549378822618e-06, + "loss": 0.81020772, + "num_input_tokens_seen": 89678415, + "step": 4159, + "time_per_iteration": 2.5057904720306396 + }, + { + "auxiliary_loss_clip": 0.01079351, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.04174924, + "balance_loss_mlp": 1.01967049, + "epoch": 0.5002104250586185, + "flos": 20339014239360.0, + "grad_norm": 2.0115832009716605, + "language_loss": 0.84258151, + "learning_rate": 2.095776867240599e-06, + "loss": 0.86365533, + "num_input_tokens_seen": 89695405, + "step": 4160, + "time_per_iteration": 2.8536558151245117 + }, + { + "auxiliary_loss_clip": 0.01133034, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.04589999, + "balance_loss_mlp": 1.02231753, + "epoch": 0.5003306679492575, + "flos": 13991372634240.0, + "grad_norm": 1.9806809584618383, + "language_loss": 0.82518381, + "learning_rate": 2.094998782069661e-06, + "loss": 0.84681159, + "num_input_tokens_seen": 89713110, + "step": 4161, + "time_per_iteration": 2.7077882289886475 + }, + { + "auxiliary_loss_clip": 0.01183206, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.05434704, + "balance_loss_mlp": 1.01977324, + "epoch": 0.5004509108398966, + "flos": 27672762896640.0, + "grad_norm": 1.5893227322222259, + "language_loss": 0.75597203, + "learning_rate": 2.0942206824874845e-06, + "loss": 0.77808392, + "num_input_tokens_seen": 89735885, + "step": 4162, + "time_per_iteration": 2.533877372741699 + }, + { + "auxiliary_loss_clip": 0.01164766, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.05091619, + "balance_loss_mlp": 1.01959479, + "epoch": 0.5005711537305357, + "flos": 14976186796800.0, + "grad_norm": 6.681595029443981, + "language_loss": 0.79205137, + "learning_rate": 2.093442568612105e-06, + "loss": 0.81397265, + "num_input_tokens_seen": 89753690, + "step": 4163, + "time_per_iteration": 2.456993579864502 + }, + { + "auxiliary_loss_clip": 0.0118086, + "auxiliary_loss_mlp": 0.01022962, + "balance_loss_clip": 1.05087256, + "balance_loss_mlp": 1.01510024, + "epoch": 0.5006913966211748, + "flos": 26503259978880.0, + "grad_norm": 1.5684454402228616, + "language_loss": 0.85374856, + "learning_rate": 2.0926644405615613e-06, + "loss": 0.87578684, + "num_input_tokens_seen": 89774590, + "step": 4164, + "time_per_iteration": 2.481231451034546 + }, + { + "auxiliary_loss_clip": 0.01134171, + "auxiliary_loss_mlp": 0.0102708, + "balance_loss_clip": 1.04760957, + "balance_loss_mlp": 1.01933742, + "epoch": 0.5008116395118138, + "flos": 20449295971200.0, + "grad_norm": 1.8108541030297403, + "language_loss": 0.81064343, + "learning_rate": 2.091886298453897e-06, + "loss": 0.8322559, + "num_input_tokens_seen": 89792775, + "step": 4165, + "time_per_iteration": 2.525049924850464 + }, + { + "auxiliary_loss_clip": 0.01166478, + "auxiliary_loss_mlp": 0.01025516, + "balance_loss_clip": 1.05042279, + "balance_loss_mlp": 1.018116, + "epoch": 0.500931882402453, + "flos": 21579871524480.0, + "grad_norm": 1.8322546483187434, + "language_loss": 0.7308346, + "learning_rate": 2.091108142407153e-06, + "loss": 0.75275457, + "num_input_tokens_seen": 89811515, + "step": 4166, + "time_per_iteration": 2.506544828414917 + }, + { + "auxiliary_loss_clip": 0.01056428, + "auxiliary_loss_mlp": 0.01003686, + "balance_loss_clip": 1.01883352, + "balance_loss_mlp": 1.00223196, + "epoch": 0.5010521252930921, + "flos": 57785011925760.0, + "grad_norm": 0.8481236966898328, + "language_loss": 0.62444961, + "learning_rate": 2.090329972539377e-06, + "loss": 0.64505076, + "num_input_tokens_seen": 89870080, + "step": 4167, + "time_per_iteration": 3.1322693824768066 + }, + { + "auxiliary_loss_clip": 0.01070813, + "auxiliary_loss_mlp": 0.01029232, + "balance_loss_clip": 1.0396347, + "balance_loss_mlp": 1.0214361, + "epoch": 0.5011723681837311, + "flos": 18625500864000.0, + "grad_norm": 1.964159752782323, + "language_loss": 0.68830132, + "learning_rate": 2.089551788968616e-06, + "loss": 0.70930177, + "num_input_tokens_seen": 89888045, + "step": 4168, + "time_per_iteration": 2.7246828079223633 + }, + { + "auxiliary_loss_clip": 0.01077856, + "auxiliary_loss_mlp": 0.01001394, + "balance_loss_clip": 1.01484346, + "balance_loss_mlp": 1.00011814, + "epoch": 0.5012926110743702, + "flos": 55883146608000.0, + "grad_norm": 0.8373700639502587, + "language_loss": 0.61044234, + "learning_rate": 2.08877359181292e-06, + "loss": 0.63123482, + "num_input_tokens_seen": 89944610, + "step": 4169, + "time_per_iteration": 3.0822277069091797 + }, + { + "auxiliary_loss_clip": 0.01142686, + "auxiliary_loss_mlp": 0.01024863, + "balance_loss_clip": 1.04476202, + "balance_loss_mlp": 1.01726353, + "epoch": 0.5014128539650093, + "flos": 24238266117120.0, + "grad_norm": 2.380322840655851, + "language_loss": 0.85098124, + "learning_rate": 2.0879953811903396e-06, + "loss": 0.87265682, + "num_input_tokens_seen": 89959495, + "step": 4170, + "time_per_iteration": 2.5553605556488037 + }, + { + "auxiliary_loss_clip": 0.01167073, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.05201411, + "balance_loss_mlp": 1.02290249, + "epoch": 0.5015330968556484, + "flos": 27527468382720.0, + "grad_norm": 4.757967817475786, + "language_loss": 0.77989465, + "learning_rate": 2.08721715721893e-06, + "loss": 0.80187571, + "num_input_tokens_seen": 89978820, + "step": 4171, + "time_per_iteration": 2.534708261489868 + }, + { + "auxiliary_loss_clip": 0.01168428, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.05247545, + "balance_loss_mlp": 1.02054071, + "epoch": 0.5016533397462875, + "flos": 23800802376960.0, + "grad_norm": 1.7640714725972009, + "language_loss": 0.77173555, + "learning_rate": 2.0864389200167477e-06, + "loss": 0.79370397, + "num_input_tokens_seen": 89997075, + "step": 4172, + "time_per_iteration": 2.480888843536377 + }, + { + "auxiliary_loss_clip": 0.0117166, + "auxiliary_loss_mlp": 0.00762755, + "balance_loss_clip": 1.0523479, + "balance_loss_mlp": 1.00096059, + "epoch": 0.5017735826369266, + "flos": 25295009264640.0, + "grad_norm": 3.6904501295164516, + "language_loss": 0.78957605, + "learning_rate": 2.0856606697018504e-06, + "loss": 0.8089202, + "num_input_tokens_seen": 90015085, + "step": 4173, + "time_per_iteration": 2.5100350379943848 + }, + { + "auxiliary_loss_clip": 0.01147668, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.04747057, + "balance_loss_mlp": 1.01986098, + "epoch": 0.5018938255275657, + "flos": 16873203778560.0, + "grad_norm": 2.2692627160601053, + "language_loss": 0.73867118, + "learning_rate": 2.084882406392297e-06, + "loss": 0.76042867, + "num_input_tokens_seen": 90033045, + "step": 4174, + "time_per_iteration": 2.4659948348999023 + }, + { + "auxiliary_loss_clip": 0.01173708, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.05442786, + "balance_loss_mlp": 1.0195235, + "epoch": 0.5020140684182047, + "flos": 25515429073920.0, + "grad_norm": 2.576887279507795, + "language_loss": 0.70705783, + "learning_rate": 2.0841041302061496e-06, + "loss": 0.72906816, + "num_input_tokens_seen": 90052505, + "step": 4175, + "time_per_iteration": 2.512549877166748 + }, + { + "auxiliary_loss_clip": 0.01144115, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.04666424, + "balance_loss_mlp": 1.02528334, + "epoch": 0.5021343113088439, + "flos": 23659278791040.0, + "grad_norm": 1.9095445663890245, + "language_loss": 0.75630057, + "learning_rate": 2.083325841261473e-06, + "loss": 0.7780751, + "num_input_tokens_seen": 90071565, + "step": 4176, + "time_per_iteration": 2.513784170150757 + }, + { + "auxiliary_loss_clip": 0.0114553, + "auxiliary_loss_mlp": 0.01025412, + "balance_loss_clip": 1.04536533, + "balance_loss_mlp": 1.01788366, + "epoch": 0.502254554199483, + "flos": 24534673148160.0, + "grad_norm": 1.7850702897646258, + "language_loss": 0.66024631, + "learning_rate": 2.0825475396763322e-06, + "loss": 0.68195575, + "num_input_tokens_seen": 90092215, + "step": 4177, + "time_per_iteration": 3.2574989795684814 + }, + { + "auxiliary_loss_clip": 0.01074557, + "auxiliary_loss_mlp": 0.01026936, + "balance_loss_clip": 1.04139149, + "balance_loss_mlp": 1.01861548, + "epoch": 0.502374797090122, + "flos": 34240285607040.0, + "grad_norm": 1.5330758822242714, + "language_loss": 0.65519905, + "learning_rate": 2.081769225568796e-06, + "loss": 0.67621392, + "num_input_tokens_seen": 90114665, + "step": 4178, + "time_per_iteration": 3.752821207046509 + }, + { + "auxiliary_loss_clip": 0.01169041, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.04912782, + "balance_loss_mlp": 1.02186489, + "epoch": 0.5024950399807612, + "flos": 26031106679040.0, + "grad_norm": 1.688300521638661, + "language_loss": 0.75893009, + "learning_rate": 2.0809908990569327e-06, + "loss": 0.78092539, + "num_input_tokens_seen": 90136445, + "step": 4179, + "time_per_iteration": 4.068695306777954 + }, + { + "auxiliary_loss_clip": 0.01154675, + "auxiliary_loss_mlp": 0.0102997, + "balance_loss_clip": 1.04972005, + "balance_loss_mlp": 1.02145886, + "epoch": 0.5026152828714002, + "flos": 21252438120960.0, + "grad_norm": 1.7465074959606244, + "language_loss": 0.79263169, + "learning_rate": 2.0802125602588146e-06, + "loss": 0.81447816, + "num_input_tokens_seen": 90155710, + "step": 4180, + "time_per_iteration": 2.5609922409057617 + }, + { + "auxiliary_loss_clip": 0.01183525, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.05387616, + "balance_loss_mlp": 1.02329588, + "epoch": 0.5027355257620393, + "flos": 30956111245440.0, + "grad_norm": 2.8585795082771934, + "language_loss": 0.66177571, + "learning_rate": 2.0794342092925146e-06, + "loss": 0.6839242, + "num_input_tokens_seen": 90176845, + "step": 4181, + "time_per_iteration": 2.5146565437316895 + }, + { + "auxiliary_loss_clip": 0.01173381, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.0543679, + "balance_loss_mlp": 1.0226202, + "epoch": 0.5028557686526784, + "flos": 24791147233920.0, + "grad_norm": 1.8724847937411164, + "language_loss": 0.67738885, + "learning_rate": 2.078655846276108e-06, + "loss": 0.69943261, + "num_input_tokens_seen": 90197175, + "step": 4182, + "time_per_iteration": 2.5052218437194824 + }, + { + "auxiliary_loss_clip": 0.01149644, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.04985929, + "balance_loss_mlp": 1.01926625, + "epoch": 0.5029760115433175, + "flos": 22966992990720.0, + "grad_norm": 2.0998125901239333, + "language_loss": 0.68569952, + "learning_rate": 2.0778774713276727e-06, + "loss": 0.70747203, + "num_input_tokens_seen": 90216650, + "step": 4183, + "time_per_iteration": 2.5295181274414062 + }, + { + "auxiliary_loss_clip": 0.01163694, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.04774439, + "balance_loss_mlp": 1.01937807, + "epoch": 0.5030962544339566, + "flos": 15305164485120.0, + "grad_norm": 2.105173308760544, + "language_loss": 0.67626297, + "learning_rate": 2.077099084565287e-06, + "loss": 0.69817579, + "num_input_tokens_seen": 90234055, + "step": 4184, + "time_per_iteration": 2.4307870864868164 + }, + { + "auxiliary_loss_clip": 0.01147205, + "auxiliary_loss_mlp": 0.0102657, + "balance_loss_clip": 1.04613888, + "balance_loss_mlp": 1.01879716, + "epoch": 0.5032164973245957, + "flos": 24494847943680.0, + "grad_norm": 2.005454377797643, + "language_loss": 0.65211785, + "learning_rate": 2.0763206861070313e-06, + "loss": 0.67385566, + "num_input_tokens_seen": 90253115, + "step": 4185, + "time_per_iteration": 2.5088295936584473 + }, + { + "auxiliary_loss_clip": 0.01185166, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.05412853, + "balance_loss_mlp": 1.02452695, + "epoch": 0.5033367402152348, + "flos": 16213452721920.0, + "grad_norm": 1.9777825013029484, + "language_loss": 0.75213885, + "learning_rate": 2.0755422760709876e-06, + "loss": 0.77432096, + "num_input_tokens_seen": 90270515, + "step": 4186, + "time_per_iteration": 2.40427565574646 + }, + { + "auxiliary_loss_clip": 0.01118568, + "auxiliary_loss_mlp": 0.01037902, + "balance_loss_clip": 1.0441339, + "balance_loss_mlp": 1.02951014, + "epoch": 0.5034569831058738, + "flos": 21391375927680.0, + "grad_norm": 2.477724779626397, + "language_loss": 0.76524365, + "learning_rate": 2.0747638545752417e-06, + "loss": 0.78680837, + "num_input_tokens_seen": 90289075, + "step": 4187, + "time_per_iteration": 2.579172134399414 + }, + { + "auxiliary_loss_clip": 0.01155212, + "auxiliary_loss_mlp": 0.01025432, + "balance_loss_clip": 1.05308437, + "balance_loss_mlp": 1.01746249, + "epoch": 0.503577225996513, + "flos": 20558751690240.0, + "grad_norm": 2.0543994168345097, + "language_loss": 0.83144891, + "learning_rate": 2.073985421737878e-06, + "loss": 0.85325533, + "num_input_tokens_seen": 90306385, + "step": 4188, + "time_per_iteration": 2.5243871212005615 + }, + { + "auxiliary_loss_clip": 0.01170932, + "auxiliary_loss_mlp": 0.01024923, + "balance_loss_clip": 1.0517875, + "balance_loss_mlp": 1.01673961, + "epoch": 0.5036974688871521, + "flos": 27229157930880.0, + "grad_norm": 2.3137918070664427, + "language_loss": 0.73214936, + "learning_rate": 2.0732069776769844e-06, + "loss": 0.75410795, + "num_input_tokens_seen": 90323795, + "step": 4189, + "time_per_iteration": 2.497269868850708 + }, + { + "auxiliary_loss_clip": 0.01183209, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.05414867, + "balance_loss_mlp": 1.02121472, + "epoch": 0.5038177117777911, + "flos": 20412164286720.0, + "grad_norm": 2.174262870385668, + "language_loss": 0.73390633, + "learning_rate": 2.072428522510651e-06, + "loss": 0.7560358, + "num_input_tokens_seen": 90340360, + "step": 4190, + "time_per_iteration": 2.459739923477173 + }, + { + "auxiliary_loss_clip": 0.0113277, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.04802692, + "balance_loss_mlp": 1.02343333, + "epoch": 0.5039379546684303, + "flos": 21907987286400.0, + "grad_norm": 2.3526912264125697, + "language_loss": 0.76094472, + "learning_rate": 2.071650056356968e-06, + "loss": 0.78258264, + "num_input_tokens_seen": 90357900, + "step": 4191, + "time_per_iteration": 2.537432909011841 + }, + { + "auxiliary_loss_clip": 0.01181253, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.05246747, + "balance_loss_mlp": 1.02595448, + "epoch": 0.5040581975590693, + "flos": 20010718909440.0, + "grad_norm": 2.3872994868783417, + "language_loss": 0.8024708, + "learning_rate": 2.070871579334028e-06, + "loss": 0.82462227, + "num_input_tokens_seen": 90377010, + "step": 4192, + "time_per_iteration": 2.4234402179718018 + }, + { + "auxiliary_loss_clip": 0.01181481, + "auxiliary_loss_mlp": 0.01025612, + "balance_loss_clip": 1.05232298, + "balance_loss_mlp": 1.01768434, + "epoch": 0.5041784404497084, + "flos": 20959837931520.0, + "grad_norm": 2.5032322309300894, + "language_loss": 0.71508384, + "learning_rate": 2.0700930915599264e-06, + "loss": 0.73715484, + "num_input_tokens_seen": 90396740, + "step": 4193, + "time_per_iteration": 2.4398462772369385 + }, + { + "auxiliary_loss_clip": 0.011815, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.05179334, + "balance_loss_mlp": 1.02163386, + "epoch": 0.5042986833403476, + "flos": 12495082757760.0, + "grad_norm": 2.050558947154078, + "language_loss": 0.78321028, + "learning_rate": 2.0693145931527583e-06, + "loss": 0.80531776, + "num_input_tokens_seen": 90413220, + "step": 4194, + "time_per_iteration": 2.3926331996917725 + }, + { + "auxiliary_loss_clip": 0.01148335, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.04795575, + "balance_loss_mlp": 1.0216794, + "epoch": 0.5044189262309866, + "flos": 29202305788800.0, + "grad_norm": 1.5926902252891577, + "language_loss": 0.78079975, + "learning_rate": 2.068536084230622e-06, + "loss": 0.80258036, + "num_input_tokens_seen": 90435085, + "step": 4195, + "time_per_iteration": 2.5573251247406006 + }, + { + "auxiliary_loss_clip": 0.0117007, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.05354357, + "balance_loss_mlp": 1.02521777, + "epoch": 0.5045391691216257, + "flos": 23873198238720.0, + "grad_norm": 2.1804094293557084, + "language_loss": 0.88246143, + "learning_rate": 2.067757564911616e-06, + "loss": 0.90450978, + "num_input_tokens_seen": 90453660, + "step": 4196, + "time_per_iteration": 2.47404146194458 + }, + { + "auxiliary_loss_clip": 0.01163084, + "auxiliary_loss_mlp": 0.00763021, + "balance_loss_clip": 1.05190182, + "balance_loss_mlp": 1.00082171, + "epoch": 0.5046594120122648, + "flos": 24644990793600.0, + "grad_norm": 1.8650587737186866, + "language_loss": 0.92410278, + "learning_rate": 2.0669790353138407e-06, + "loss": 0.94336385, + "num_input_tokens_seen": 90472625, + "step": 4197, + "time_per_iteration": 2.528360605239868 + }, + { + "auxiliary_loss_clip": 0.01137551, + "auxiliary_loss_mlp": 0.00763012, + "balance_loss_clip": 1.04991722, + "balance_loss_mlp": 1.00091982, + "epoch": 0.5047796549029039, + "flos": 23362835846400.0, + "grad_norm": 2.7867927100694754, + "language_loss": 0.72978199, + "learning_rate": 2.0662004955553995e-06, + "loss": 0.74878758, + "num_input_tokens_seen": 90492325, + "step": 4198, + "time_per_iteration": 2.55450177192688 + }, + { + "auxiliary_loss_clip": 0.01150504, + "auxiliary_loss_mlp": 0.01024031, + "balance_loss_clip": 1.04849744, + "balance_loss_mlp": 1.01618099, + "epoch": 0.5048998977935429, + "flos": 17304095329920.0, + "grad_norm": 2.052863511971461, + "language_loss": 0.76965129, + "learning_rate": 2.065421945754395e-06, + "loss": 0.79139668, + "num_input_tokens_seen": 90510055, + "step": 4199, + "time_per_iteration": 2.5045900344848633 + }, + { + "auxiliary_loss_clip": 0.01127425, + "auxiliary_loss_mlp": 0.01027938, + "balance_loss_clip": 1.04905605, + "balance_loss_mlp": 1.02048159, + "epoch": 0.505020140684182, + "flos": 34856979235200.0, + "grad_norm": 1.6488579740058544, + "language_loss": 0.77768064, + "learning_rate": 2.0646433860289344e-06, + "loss": 0.79923427, + "num_input_tokens_seen": 90528980, + "step": 4200, + "time_per_iteration": 2.675868034362793 + }, + { + "auxiliary_loss_clip": 0.01173836, + "auxiliary_loss_mlp": 0.00763343, + "balance_loss_clip": 1.05179834, + "balance_loss_mlp": 1.0009141, + "epoch": 0.5051403835748212, + "flos": 24863974058880.0, + "grad_norm": 3.0938243131367353, + "language_loss": 0.82845289, + "learning_rate": 2.0638648164971233e-06, + "loss": 0.84782457, + "num_input_tokens_seen": 90547445, + "step": 4201, + "time_per_iteration": 2.489266872406006 + }, + { + "auxiliary_loss_clip": 0.01155129, + "auxiliary_loss_mlp": 0.01028004, + "balance_loss_clip": 1.05193067, + "balance_loss_mlp": 1.02057087, + "epoch": 0.5052606264654602, + "flos": 20959694277120.0, + "grad_norm": 1.9844882223083826, + "language_loss": 0.88577235, + "learning_rate": 2.06308623727707e-06, + "loss": 0.90760368, + "num_input_tokens_seen": 90567545, + "step": 4202, + "time_per_iteration": 2.5115880966186523 + }, + { + "auxiliary_loss_clip": 0.01164893, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.05063701, + "balance_loss_mlp": 1.02058363, + "epoch": 0.5053808693560993, + "flos": 19642382893440.0, + "grad_norm": 2.6001566009725887, + "language_loss": 0.76426971, + "learning_rate": 2.0623076484868846e-06, + "loss": 0.78620791, + "num_input_tokens_seen": 90585000, + "step": 4203, + "time_per_iteration": 2.470569610595703 + }, + { + "auxiliary_loss_clip": 0.01059869, + "auxiliary_loss_mlp": 0.01006322, + "balance_loss_clip": 1.02335024, + "balance_loss_mlp": 1.00529718, + "epoch": 0.5055011122467384, + "flos": 67504915019520.0, + "grad_norm": 0.8305906943621326, + "language_loss": 0.60681522, + "learning_rate": 2.061529050244679e-06, + "loss": 0.62747717, + "num_input_tokens_seen": 90644745, + "step": 4204, + "time_per_iteration": 4.434124708175659 + }, + { + "auxiliary_loss_clip": 0.01146434, + "auxiliary_loss_mlp": 0.01023583, + "balance_loss_clip": 1.04896736, + "balance_loss_mlp": 1.01489258, + "epoch": 0.5056213551373775, + "flos": 16872952383360.0, + "grad_norm": 2.039118767779479, + "language_loss": 0.74294049, + "learning_rate": 2.060750442668565e-06, + "loss": 0.76464069, + "num_input_tokens_seen": 90662500, + "step": 4205, + "time_per_iteration": 3.2569351196289062 + }, + { + "auxiliary_loss_clip": 0.01170527, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.05468202, + "balance_loss_mlp": 1.02213264, + "epoch": 0.5057415980280165, + "flos": 15334179696000.0, + "grad_norm": 3.18619490585033, + "language_loss": 0.6376285, + "learning_rate": 2.059971825876657e-06, + "loss": 0.65963858, + "num_input_tokens_seen": 90677010, + "step": 4206, + "time_per_iteration": 3.259385585784912 + }, + { + "auxiliary_loss_clip": 0.01170968, + "auxiliary_loss_mlp": 0.01024872, + "balance_loss_clip": 1.05292749, + "balance_loss_mlp": 1.01681983, + "epoch": 0.5058618409186557, + "flos": 19025976574080.0, + "grad_norm": 1.8287544210557711, + "language_loss": 0.76085579, + "learning_rate": 2.0591931999870713e-06, + "loss": 0.7828142, + "num_input_tokens_seen": 90695935, + "step": 4207, + "time_per_iteration": 2.4706177711486816 + }, + { + "auxiliary_loss_clip": 0.01068094, + "auxiliary_loss_mlp": 0.01002939, + "balance_loss_clip": 1.0219841, + "balance_loss_mlp": 1.00195527, + "epoch": 0.5059820838092948, + "flos": 63453114080640.0, + "grad_norm": 1.0090189380674661, + "language_loss": 0.57629472, + "learning_rate": 2.0584145651179234e-06, + "loss": 0.59700501, + "num_input_tokens_seen": 90751645, + "step": 4208, + "time_per_iteration": 3.061169147491455 + }, + { + "auxiliary_loss_clip": 0.01155209, + "auxiliary_loss_mlp": 0.0076224, + "balance_loss_clip": 1.05271924, + "balance_loss_mlp": 1.00097871, + "epoch": 0.5061023266999338, + "flos": 15441803821440.0, + "grad_norm": 2.642780852848987, + "language_loss": 0.80001897, + "learning_rate": 2.0576359213873327e-06, + "loss": 0.81919342, + "num_input_tokens_seen": 90766795, + "step": 4209, + "time_per_iteration": 2.449671506881714 + }, + { + "auxiliary_loss_clip": 0.01161212, + "auxiliary_loss_mlp": 0.01027015, + "balance_loss_clip": 1.0481391, + "balance_loss_mlp": 1.01892102, + "epoch": 0.506222569590573, + "flos": 22451063990400.0, + "grad_norm": 2.563741355788123, + "language_loss": 0.70216179, + "learning_rate": 2.056857268913419e-06, + "loss": 0.72404408, + "num_input_tokens_seen": 90786845, + "step": 4210, + "time_per_iteration": 2.519665479660034 + }, + { + "auxiliary_loss_clip": 0.01171222, + "auxiliary_loss_mlp": 0.01027633, + "balance_loss_clip": 1.056005, + "balance_loss_mlp": 1.02019095, + "epoch": 0.506342812481212, + "flos": 17558665994880.0, + "grad_norm": 2.4670576754443077, + "language_loss": 0.84389728, + "learning_rate": 2.056078607814303e-06, + "loss": 0.86588585, + "num_input_tokens_seen": 90802630, + "step": 4211, + "time_per_iteration": 2.4150028228759766 + }, + { + "auxiliary_loss_clip": 0.01169713, + "auxiliary_loss_mlp": 0.01021305, + "balance_loss_clip": 1.05287921, + "balance_loss_mlp": 1.0131036, + "epoch": 0.5064630553718511, + "flos": 23402050519680.0, + "grad_norm": 1.8906628803037777, + "language_loss": 0.7809931, + "learning_rate": 2.055299938208106e-06, + "loss": 0.80290329, + "num_input_tokens_seen": 90823620, + "step": 4212, + "time_per_iteration": 2.4919285774230957 + }, + { + "auxiliary_loss_clip": 0.01175174, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.05545652, + "balance_loss_mlp": 1.02293563, + "epoch": 0.5065832982624903, + "flos": 23987035416960.0, + "grad_norm": 2.4842672934739527, + "language_loss": 0.86287868, + "learning_rate": 2.0545212602129526e-06, + "loss": 0.88494384, + "num_input_tokens_seen": 90843475, + "step": 4213, + "time_per_iteration": 2.4772701263427734 + }, + { + "auxiliary_loss_clip": 0.01146447, + "auxiliary_loss_mlp": 0.01027539, + "balance_loss_clip": 1.0471611, + "balance_loss_mlp": 1.01896763, + "epoch": 0.5067035411531293, + "flos": 21503058289920.0, + "grad_norm": 2.6972889292141846, + "language_loss": 0.6634028, + "learning_rate": 2.0537425739469673e-06, + "loss": 0.6851427, + "num_input_tokens_seen": 90862410, + "step": 4214, + "time_per_iteration": 2.4762330055236816 + }, + { + "auxiliary_loss_clip": 0.01072885, + "auxiliary_loss_mlp": 0.01005111, + "balance_loss_clip": 1.01934123, + "balance_loss_mlp": 1.00388861, + "epoch": 0.5068237840437684, + "flos": 65934397687680.0, + "grad_norm": 0.8417373790649666, + "language_loss": 0.59498107, + "learning_rate": 2.052963879528276e-06, + "loss": 0.61576104, + "num_input_tokens_seen": 90922280, + "step": 4215, + "time_per_iteration": 3.0340380668640137 + }, + { + "auxiliary_loss_clip": 0.01169255, + "auxiliary_loss_mlp": 0.01024185, + "balance_loss_clip": 1.05285549, + "balance_loss_mlp": 1.01613855, + "epoch": 0.5069440269344075, + "flos": 27264206626560.0, + "grad_norm": 2.368955444256965, + "language_loss": 0.76229811, + "learning_rate": 2.052185177075007e-06, + "loss": 0.7842325, + "num_input_tokens_seen": 90941850, + "step": 4216, + "time_per_iteration": 2.5235822200775146 + }, + { + "auxiliary_loss_clip": 0.01171421, + "auxiliary_loss_mlp": 0.01035603, + "balance_loss_clip": 1.05274606, + "balance_loss_mlp": 1.02788448, + "epoch": 0.5070642698250466, + "flos": 23366319465600.0, + "grad_norm": 5.727134232501956, + "language_loss": 0.8270039, + "learning_rate": 2.051406466705288e-06, + "loss": 0.84907413, + "num_input_tokens_seen": 90961390, + "step": 4217, + "time_per_iteration": 2.46502685546875 + }, + { + "auxiliary_loss_clip": 0.01182897, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.05278635, + "balance_loss_mlp": 1.02106261, + "epoch": 0.5071845127156857, + "flos": 20340127560960.0, + "grad_norm": 2.073733320411171, + "language_loss": 0.80649513, + "learning_rate": 2.0506277485372486e-06, + "loss": 0.82861269, + "num_input_tokens_seen": 90980215, + "step": 4218, + "time_per_iteration": 2.4351587295532227 + }, + { + "auxiliary_loss_clip": 0.01164752, + "auxiliary_loss_mlp": 0.01028307, + "balance_loss_clip": 1.05307579, + "balance_loss_mlp": 1.02040982, + "epoch": 0.5073047556063248, + "flos": 12092955022080.0, + "grad_norm": 2.0339588276701606, + "language_loss": 0.66951001, + "learning_rate": 2.04984902268902e-06, + "loss": 0.69144058, + "num_input_tokens_seen": 90997415, + "step": 4219, + "time_per_iteration": 2.4349184036254883 + }, + { + "auxiliary_loss_clip": 0.01174729, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.05077767, + "balance_loss_mlp": 1.02185917, + "epoch": 0.5074249984969639, + "flos": 19682854542720.0, + "grad_norm": 2.2079044308851796, + "language_loss": 0.75846207, + "learning_rate": 2.0490702892787345e-06, + "loss": 0.78051937, + "num_input_tokens_seen": 91016475, + "step": 4220, + "time_per_iteration": 2.4674758911132812 + }, + { + "auxiliary_loss_clip": 0.01159479, + "auxiliary_loss_mlp": 0.01026337, + "balance_loss_clip": 1.04780602, + "balance_loss_mlp": 1.01832056, + "epoch": 0.5075452413876029, + "flos": 28765703975040.0, + "grad_norm": 1.6914154859841009, + "language_loss": 0.62218004, + "learning_rate": 2.0482915484245246e-06, + "loss": 0.6440382, + "num_input_tokens_seen": 91038095, + "step": 4221, + "time_per_iteration": 2.5123469829559326 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.04848647, + "balance_loss_mlp": 1.02382779, + "epoch": 0.5076654842782421, + "flos": 20339445202560.0, + "grad_norm": 2.2169825770215166, + "language_loss": 0.840294, + "learning_rate": 2.047512800244526e-06, + "loss": 0.86182034, + "num_input_tokens_seen": 91053360, + "step": 4222, + "time_per_iteration": 2.5358285903930664 + }, + { + "auxiliary_loss_clip": 0.01168717, + "auxiliary_loss_mlp": 0.01026008, + "balance_loss_clip": 1.05239296, + "balance_loss_mlp": 1.01784825, + "epoch": 0.5077857271688812, + "flos": 26359653404160.0, + "grad_norm": 1.8969926239477768, + "language_loss": 0.78472698, + "learning_rate": 2.046734044856873e-06, + "loss": 0.80667424, + "num_input_tokens_seen": 91072770, + "step": 4223, + "time_per_iteration": 2.4905500411987305 + }, + { + "auxiliary_loss_clip": 0.01168341, + "auxiliary_loss_mlp": 0.01026333, + "balance_loss_clip": 1.05187702, + "balance_loss_mlp": 1.01844692, + "epoch": 0.5079059700595202, + "flos": 21798962530560.0, + "grad_norm": 1.9629353212380312, + "language_loss": 0.81165516, + "learning_rate": 2.045955282379702e-06, + "loss": 0.83360195, + "num_input_tokens_seen": 91091430, + "step": 4224, + "time_per_iteration": 2.467928886413574 + }, + { + "auxiliary_loss_clip": 0.01166484, + "auxiliary_loss_mlp": 0.01026377, + "balance_loss_clip": 1.0498538, + "balance_loss_mlp": 1.01775789, + "epoch": 0.5080262129501594, + "flos": 13187943175680.0, + "grad_norm": 2.918770316643873, + "language_loss": 0.74734586, + "learning_rate": 2.045176512931152e-06, + "loss": 0.76927447, + "num_input_tokens_seen": 91106060, + "step": 4225, + "time_per_iteration": 2.446270227432251 + }, + { + "auxiliary_loss_clip": 0.01144671, + "auxiliary_loss_mlp": 0.01024411, + "balance_loss_clip": 1.04959524, + "balance_loss_mlp": 1.01660848, + "epoch": 0.5081464558407984, + "flos": 25301473712640.0, + "grad_norm": 1.8428354956959778, + "language_loss": 0.76310766, + "learning_rate": 2.0443977366293604e-06, + "loss": 0.7847985, + "num_input_tokens_seen": 91124100, + "step": 4226, + "time_per_iteration": 2.546116352081299 + }, + { + "auxiliary_loss_clip": 0.01112314, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.0451051, + "balance_loss_mlp": 1.02295852, + "epoch": 0.5082666987314375, + "flos": 30951226995840.0, + "grad_norm": 1.48570894617364, + "language_loss": 0.76831317, + "learning_rate": 2.043618953592468e-06, + "loss": 0.78975588, + "num_input_tokens_seen": 91146555, + "step": 4227, + "time_per_iteration": 2.715646743774414 + }, + { + "auxiliary_loss_clip": 0.01157005, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.05269051, + "balance_loss_mlp": 1.02159524, + "epoch": 0.5083869416220766, + "flos": 19682495406720.0, + "grad_norm": 1.834448825684967, + "language_loss": 0.81115061, + "learning_rate": 2.0428401639386144e-06, + "loss": 0.83302462, + "num_input_tokens_seen": 91167120, + "step": 4228, + "time_per_iteration": 2.6656742095947266 + }, + { + "auxiliary_loss_clip": 0.0105498, + "auxiliary_loss_mlp": 0.01003274, + "balance_loss_clip": 1.01842308, + "balance_loss_mlp": 1.00196314, + "epoch": 0.5085071845127157, + "flos": 71817535589760.0, + "grad_norm": 0.829237386505675, + "language_loss": 0.58152968, + "learning_rate": 2.042061367785943e-06, + "loss": 0.60211223, + "num_input_tokens_seen": 91220260, + "step": 4229, + "time_per_iteration": 3.047353744506836 + }, + { + "auxiliary_loss_clip": 0.01143835, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.04781663, + "balance_loss_mlp": 1.02029061, + "epoch": 0.5086274274033548, + "flos": 35951608252800.0, + "grad_norm": 2.247070177465219, + "language_loss": 0.75443614, + "learning_rate": 2.041282565252594e-06, + "loss": 0.77615792, + "num_input_tokens_seen": 91240425, + "step": 4230, + "time_per_iteration": 3.4077749252319336 + }, + { + "auxiliary_loss_clip": 0.011401, + "auxiliary_loss_mlp": 0.01026449, + "balance_loss_clip": 1.04790616, + "balance_loss_mlp": 1.01863503, + "epoch": 0.5087476702939938, + "flos": 23513732881920.0, + "grad_norm": 2.1016766918423158, + "language_loss": 0.77098846, + "learning_rate": 2.040503756456714e-06, + "loss": 0.79265392, + "num_input_tokens_seen": 91259635, + "step": 4231, + "time_per_iteration": 3.3278355598449707 + }, + { + "auxiliary_loss_clip": 0.01163221, + "auxiliary_loss_mlp": 0.01029385, + "balance_loss_clip": 1.05014348, + "balance_loss_mlp": 1.02084982, + "epoch": 0.508867913184633, + "flos": 15122091841920.0, + "grad_norm": 1.9375114457239129, + "language_loss": 0.78731668, + "learning_rate": 2.0397249415164456e-06, + "loss": 0.80924273, + "num_input_tokens_seen": 91276990, + "step": 4232, + "time_per_iteration": 3.1772685050964355 + }, + { + "auxiliary_loss_clip": 0.0114746, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.04699111, + "balance_loss_mlp": 1.02012074, + "epoch": 0.508988156075272, + "flos": 25885309374720.0, + "grad_norm": 1.9965281086496427, + "language_loss": 0.79759747, + "learning_rate": 2.0389461205499354e-06, + "loss": 0.81935596, + "num_input_tokens_seen": 91296125, + "step": 4233, + "time_per_iteration": 3.301868438720703 + }, + { + "auxiliary_loss_clip": 0.01141359, + "auxiliary_loss_mlp": 0.01027423, + "balance_loss_clip": 1.0479573, + "balance_loss_mlp": 1.01917422, + "epoch": 0.5091083989659111, + "flos": 13844857057920.0, + "grad_norm": 1.8231459841935835, + "language_loss": 0.73562992, + "learning_rate": 2.03816729367533e-06, + "loss": 0.75731778, + "num_input_tokens_seen": 91314280, + "step": 4234, + "time_per_iteration": 2.504221200942993 + }, + { + "auxiliary_loss_clip": 0.01155511, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.05112362, + "balance_loss_mlp": 1.02365351, + "epoch": 0.5092286418565503, + "flos": 21104881050240.0, + "grad_norm": 2.241705372997693, + "language_loss": 0.71170878, + "learning_rate": 2.0373884610107765e-06, + "loss": 0.73358107, + "num_input_tokens_seen": 91334595, + "step": 4235, + "time_per_iteration": 2.499324083328247 + }, + { + "auxiliary_loss_clip": 0.01169553, + "auxiliary_loss_mlp": 0.01024789, + "balance_loss_clip": 1.04834628, + "balance_loss_mlp": 1.01688504, + "epoch": 0.5093488847471893, + "flos": 18621298972800.0, + "grad_norm": 2.70591677017026, + "language_loss": 0.69492334, + "learning_rate": 2.0366096226744225e-06, + "loss": 0.71686673, + "num_input_tokens_seen": 91349790, + "step": 4236, + "time_per_iteration": 2.4149091243743896 + }, + { + "auxiliary_loss_clip": 0.01159849, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.05042136, + "balance_loss_mlp": 1.02829731, + "epoch": 0.5094691276378284, + "flos": 23803783205760.0, + "grad_norm": 1.7685232602319183, + "language_loss": 0.76631594, + "learning_rate": 2.035830778784418e-06, + "loss": 0.7882809, + "num_input_tokens_seen": 91370465, + "step": 4237, + "time_per_iteration": 2.4819819927215576 + }, + { + "auxiliary_loss_clip": 0.01158551, + "auxiliary_loss_mlp": 0.01024397, + "balance_loss_clip": 1.05361462, + "balance_loss_mlp": 1.01587963, + "epoch": 0.5095893705284675, + "flos": 17420410546560.0, + "grad_norm": 2.0845586792838966, + "language_loss": 0.79720569, + "learning_rate": 2.0350519294589134e-06, + "loss": 0.81903517, + "num_input_tokens_seen": 91388505, + "step": 4238, + "time_per_iteration": 2.4739432334899902 + }, + { + "auxiliary_loss_clip": 0.01120103, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.04452837, + "balance_loss_mlp": 1.01686525, + "epoch": 0.5097096134191066, + "flos": 25849362839040.0, + "grad_norm": 1.8041823294953354, + "language_loss": 0.82487929, + "learning_rate": 2.0342730748160588e-06, + "loss": 0.84633249, + "num_input_tokens_seen": 91408970, + "step": 4239, + "time_per_iteration": 2.6111013889312744 + }, + { + "auxiliary_loss_clip": 0.01151144, + "auxiliary_loss_mlp": 0.01029143, + "balance_loss_clip": 1.04683638, + "balance_loss_mlp": 1.02109027, + "epoch": 0.5098298563097456, + "flos": 27745122844800.0, + "grad_norm": 3.611870217618892, + "language_loss": 0.70496666, + "learning_rate": 2.033494214974006e-06, + "loss": 0.72676957, + "num_input_tokens_seen": 91430115, + "step": 4240, + "time_per_iteration": 2.5434975624084473 + }, + { + "auxiliary_loss_clip": 0.01145379, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.05044413, + "balance_loss_mlp": 1.02486992, + "epoch": 0.5099500992003848, + "flos": 21358913011200.0, + "grad_norm": 1.96942659375598, + "language_loss": 0.83904207, + "learning_rate": 2.0327153500509067e-06, + "loss": 0.86082375, + "num_input_tokens_seen": 91449140, + "step": 4241, + "time_per_iteration": 2.50748610496521 + }, + { + "auxiliary_loss_clip": 0.01158298, + "auxiliary_loss_mlp": 0.0102658, + "balance_loss_clip": 1.05313516, + "balance_loss_mlp": 1.01877189, + "epoch": 0.5100703420910239, + "flos": 19865999013120.0, + "grad_norm": 1.9698346640176172, + "language_loss": 0.84878987, + "learning_rate": 2.031936480164916e-06, + "loss": 0.87063867, + "num_input_tokens_seen": 91466880, + "step": 4242, + "time_per_iteration": 2.492889881134033 + }, + { + "auxiliary_loss_clip": 0.01153091, + "auxiliary_loss_mlp": 0.01027833, + "balance_loss_clip": 1.05371547, + "balance_loss_mlp": 1.01959562, + "epoch": 0.5101905849816629, + "flos": 24648797635200.0, + "grad_norm": 2.4530080953096536, + "language_loss": 0.80615211, + "learning_rate": 2.0311576054341857e-06, + "loss": 0.82796139, + "num_input_tokens_seen": 91487495, + "step": 4243, + "time_per_iteration": 2.5618062019348145 + }, + { + "auxiliary_loss_clip": 0.01185187, + "auxiliary_loss_mlp": 0.01027047, + "balance_loss_clip": 1.05646348, + "balance_loss_mlp": 1.01876831, + "epoch": 0.5103108278723021, + "flos": 22930076787840.0, + "grad_norm": 1.6540116688250193, + "language_loss": 0.62555295, + "learning_rate": 2.0303787259768715e-06, + "loss": 0.64767528, + "num_input_tokens_seen": 91508395, + "step": 4244, + "time_per_iteration": 2.439591884613037 + }, + { + "auxiliary_loss_clip": 0.01157639, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.05364287, + "balance_loss_mlp": 1.02194691, + "epoch": 0.5104310707629411, + "flos": 21506613736320.0, + "grad_norm": 2.998349859658174, + "language_loss": 0.69262207, + "learning_rate": 2.0295998419111294e-06, + "loss": 0.71449935, + "num_input_tokens_seen": 91525685, + "step": 4245, + "time_per_iteration": 2.4962151050567627 + }, + { + "auxiliary_loss_clip": 0.01116408, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.04662263, + "balance_loss_mlp": 1.02394533, + "epoch": 0.5105513136535802, + "flos": 14903180403840.0, + "grad_norm": 2.382014395736342, + "language_loss": 0.73976779, + "learning_rate": 2.028820953355115e-06, + "loss": 0.76125824, + "num_input_tokens_seen": 91543785, + "step": 4246, + "time_per_iteration": 2.563425064086914 + }, + { + "auxiliary_loss_clip": 0.01162662, + "auxiliary_loss_mlp": 0.01028503, + "balance_loss_clip": 1.0509584, + "balance_loss_mlp": 1.0199616, + "epoch": 0.5106715565442194, + "flos": 22602212421120.0, + "grad_norm": 2.105388508106807, + "language_loss": 0.78498781, + "learning_rate": 2.0280420604269834e-06, + "loss": 0.80689943, + "num_input_tokens_seen": 91563325, + "step": 4247, + "time_per_iteration": 2.5043065547943115 + }, + { + "auxiliary_loss_clip": 0.01071678, + "auxiliary_loss_mlp": 0.01004972, + "balance_loss_clip": 1.0212754, + "balance_loss_mlp": 1.00355959, + "epoch": 0.5107917994348584, + "flos": 71027645558400.0, + "grad_norm": 0.7314951845183892, + "language_loss": 0.58926767, + "learning_rate": 2.027263163244895e-06, + "loss": 0.61003417, + "num_input_tokens_seen": 91632450, + "step": 4248, + "time_per_iteration": 3.187582492828369 + }, + { + "auxiliary_loss_clip": 0.01170144, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.05432415, + "balance_loss_mlp": 1.02363884, + "epoch": 0.5109120423254975, + "flos": 24827416992000.0, + "grad_norm": 1.7694099664230878, + "language_loss": 0.74527729, + "learning_rate": 2.026484261927005e-06, + "loss": 0.76729739, + "num_input_tokens_seen": 91651945, + "step": 4249, + "time_per_iteration": 2.495616912841797 + }, + { + "auxiliary_loss_clip": 0.01177272, + "auxiliary_loss_mlp": 0.01028845, + "balance_loss_clip": 1.05633235, + "balance_loss_mlp": 1.01968336, + "epoch": 0.5110322852161366, + "flos": 21247661612160.0, + "grad_norm": 3.0957249749281757, + "language_loss": 0.73996663, + "learning_rate": 2.025705356591475e-06, + "loss": 0.76202774, + "num_input_tokens_seen": 91669635, + "step": 4250, + "time_per_iteration": 2.461975574493408 + }, + { + "auxiliary_loss_clip": 0.01046347, + "auxiliary_loss_mlp": 0.00753167, + "balance_loss_clip": 1.01868677, + "balance_loss_mlp": 1.00069177, + "epoch": 0.5111525281067757, + "flos": 66457114358400.0, + "grad_norm": 0.7597777182273479, + "language_loss": 0.57965213, + "learning_rate": 2.024926447356462e-06, + "loss": 0.59764731, + "num_input_tokens_seen": 91731920, + "step": 4251, + "time_per_iteration": 3.016576051712036 + }, + { + "auxiliary_loss_clip": 0.0116873, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.05221677, + "balance_loss_mlp": 1.02442491, + "epoch": 0.5112727709974147, + "flos": 14866731077760.0, + "grad_norm": 2.1911837189779213, + "language_loss": 0.78376484, + "learning_rate": 2.024147534340127e-06, + "loss": 0.80578595, + "num_input_tokens_seen": 91749780, + "step": 4252, + "time_per_iteration": 2.4809114933013916 + }, + { + "auxiliary_loss_clip": 0.01149744, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.04617274, + "balance_loss_mlp": 1.02210522, + "epoch": 0.5113930138880539, + "flos": 21177600134400.0, + "grad_norm": 1.621024105015606, + "language_loss": 0.79828501, + "learning_rate": 2.02336861766063e-06, + "loss": 0.82008559, + "num_input_tokens_seen": 91768840, + "step": 4253, + "time_per_iteration": 2.509453773498535 + }, + { + "auxiliary_loss_clip": 0.01177951, + "auxiliary_loss_mlp": 0.01028982, + "balance_loss_clip": 1.05457568, + "balance_loss_mlp": 1.02061391, + "epoch": 0.511513256778693, + "flos": 20409111630720.0, + "grad_norm": 1.6287744237343944, + "language_loss": 0.78702509, + "learning_rate": 2.0225896974361327e-06, + "loss": 0.80909443, + "num_input_tokens_seen": 91788945, + "step": 4254, + "time_per_iteration": 2.473755121231079 + }, + { + "auxiliary_loss_clip": 0.01051291, + "auxiliary_loss_mlp": 0.01001662, + "balance_loss_clip": 1.02114415, + "balance_loss_mlp": 1.0004046, + "epoch": 0.511633499669332, + "flos": 69879975131520.0, + "grad_norm": 0.8567048289634928, + "language_loss": 0.59946644, + "learning_rate": 2.0218107737847962e-06, + "loss": 0.61999601, + "num_input_tokens_seen": 91850990, + "step": 4255, + "time_per_iteration": 3.1324517726898193 + }, + { + "auxiliary_loss_clip": 0.01185337, + "auxiliary_loss_mlp": 0.01029409, + "balance_loss_clip": 1.0555805, + "balance_loss_mlp": 1.02136827, + "epoch": 0.5117537425599712, + "flos": 24097855852800.0, + "grad_norm": 1.852857807302608, + "language_loss": 0.74696505, + "learning_rate": 2.0210318468247826e-06, + "loss": 0.76911247, + "num_input_tokens_seen": 91869960, + "step": 4256, + "time_per_iteration": 2.4621310234069824 + }, + { + "auxiliary_loss_clip": 0.01157154, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.05035043, + "balance_loss_mlp": 1.0181129, + "epoch": 0.5118739854506102, + "flos": 20959550622720.0, + "grad_norm": 2.093742416326461, + "language_loss": 0.81775612, + "learning_rate": 2.020252916674255e-06, + "loss": 0.83958554, + "num_input_tokens_seen": 91889075, + "step": 4257, + "time_per_iteration": 4.017560958862305 + }, + { + "auxiliary_loss_clip": 0.0117183, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.05160165, + "balance_loss_mlp": 1.02091551, + "epoch": 0.5119942283412493, + "flos": 17457326749440.0, + "grad_norm": 1.6538072680183658, + "language_loss": 0.81365573, + "learning_rate": 2.019473983451375e-06, + "loss": 0.83566713, + "num_input_tokens_seen": 91907495, + "step": 4258, + "time_per_iteration": 2.4579782485961914 + }, + { + "auxiliary_loss_clip": 0.01146514, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.04838347, + "balance_loss_mlp": 1.02285504, + "epoch": 0.5121144712318885, + "flos": 21066743784960.0, + "grad_norm": 1.8922791147537683, + "language_loss": 0.71317679, + "learning_rate": 2.0186950472743076e-06, + "loss": 0.73495114, + "num_input_tokens_seen": 91927400, + "step": 4259, + "time_per_iteration": 3.259906530380249 + }, + { + "auxiliary_loss_clip": 0.01186393, + "auxiliary_loss_mlp": 0.01024993, + "balance_loss_clip": 1.05474663, + "balance_loss_mlp": 1.01648176, + "epoch": 0.5122347141225275, + "flos": 19860791541120.0, + "grad_norm": 1.7328534011188554, + "language_loss": 0.73936999, + "learning_rate": 2.0179161082612162e-06, + "loss": 0.76148391, + "num_input_tokens_seen": 91946790, + "step": 4260, + "time_per_iteration": 3.1491429805755615 + }, + { + "auxiliary_loss_clip": 0.01149479, + "auxiliary_loss_mlp": 0.01027633, + "balance_loss_clip": 1.04672837, + "balance_loss_mlp": 1.01987243, + "epoch": 0.5123549570131666, + "flos": 22528487756160.0, + "grad_norm": 2.775184270895797, + "language_loss": 0.72811699, + "learning_rate": 2.017137166530266e-06, + "loss": 0.74988812, + "num_input_tokens_seen": 91966325, + "step": 4261, + "time_per_iteration": 2.490036964416504 + }, + { + "auxiliary_loss_clip": 0.01157269, + "auxiliary_loss_mlp": 0.010282, + "balance_loss_clip": 1.0498631, + "balance_loss_mlp": 1.02054095, + "epoch": 0.5124751999038056, + "flos": 20333375804160.0, + "grad_norm": 2.0555546710782733, + "language_loss": 0.80274832, + "learning_rate": 2.0163582221996213e-06, + "loss": 0.82460302, + "num_input_tokens_seen": 91984700, + "step": 4262, + "time_per_iteration": 2.4872806072235107 + }, + { + "auxiliary_loss_clip": 0.01156688, + "auxiliary_loss_mlp": 0.01029832, + "balance_loss_clip": 1.05133319, + "balance_loss_mlp": 1.02165461, + "epoch": 0.5125954427944448, + "flos": 39785970211200.0, + "grad_norm": 1.8697910241618945, + "language_loss": 0.67692775, + "learning_rate": 2.015579275387446e-06, + "loss": 0.69879293, + "num_input_tokens_seen": 92010020, + "step": 4263, + "time_per_iteration": 2.6868345737457275 + }, + { + "auxiliary_loss_clip": 0.01148141, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.05122793, + "balance_loss_mlp": 1.01996756, + "epoch": 0.5127156856850839, + "flos": 29205394358400.0, + "grad_norm": 1.9367407940247354, + "language_loss": 0.68343353, + "learning_rate": 2.0148003262119085e-06, + "loss": 0.7051962, + "num_input_tokens_seen": 92030990, + "step": 4264, + "time_per_iteration": 2.633316993713379 + }, + { + "auxiliary_loss_clip": 0.01141106, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.04924917, + "balance_loss_mlp": 1.01929641, + "epoch": 0.5128359285757229, + "flos": 13553693412480.0, + "grad_norm": 1.8359710173625394, + "language_loss": 0.76265055, + "learning_rate": 2.0140213747911728e-06, + "loss": 0.78434002, + "num_input_tokens_seen": 92049525, + "step": 4265, + "time_per_iteration": 2.5202460289001465 + }, + { + "auxiliary_loss_clip": 0.01139108, + "auxiliary_loss_mlp": 0.0103218, + "balance_loss_clip": 1.05035496, + "balance_loss_mlp": 1.02335262, + "epoch": 0.5129561714663621, + "flos": 25192089820800.0, + "grad_norm": 2.058404832475701, + "language_loss": 0.80482584, + "learning_rate": 2.013242421243406e-06, + "loss": 0.82653874, + "num_input_tokens_seen": 92068430, + "step": 4266, + "time_per_iteration": 2.573199987411499 + }, + { + "auxiliary_loss_clip": 0.01127838, + "auxiliary_loss_mlp": 0.01022594, + "balance_loss_clip": 1.04987991, + "balance_loss_mlp": 1.01472604, + "epoch": 0.5130764143570011, + "flos": 18150223080960.0, + "grad_norm": 1.6123420357796852, + "language_loss": 0.78948236, + "learning_rate": 2.012463465686774e-06, + "loss": 0.8109867, + "num_input_tokens_seen": 92088180, + "step": 4267, + "time_per_iteration": 2.556691884994507 + }, + { + "auxiliary_loss_clip": 0.01039913, + "auxiliary_loss_mlp": 0.01003142, + "balance_loss_clip": 1.02150798, + "balance_loss_mlp": 1.00171745, + "epoch": 0.5131966572476402, + "flos": 59794896418560.0, + "grad_norm": 0.7598985504349409, + "language_loss": 0.54722655, + "learning_rate": 2.0116845082394446e-06, + "loss": 0.56765711, + "num_input_tokens_seen": 92153015, + "step": 4268, + "time_per_iteration": 3.137085199356079 + }, + { + "auxiliary_loss_clip": 0.01174424, + "auxiliary_loss_mlp": 0.01024301, + "balance_loss_clip": 1.05223298, + "balance_loss_mlp": 1.01593208, + "epoch": 0.5133169001382794, + "flos": 18515219132160.0, + "grad_norm": 3.450456568589192, + "language_loss": 0.78993022, + "learning_rate": 2.0109055490195836e-06, + "loss": 0.81191742, + "num_input_tokens_seen": 92171470, + "step": 4269, + "time_per_iteration": 2.456197738647461 + }, + { + "auxiliary_loss_clip": 0.01115531, + "auxiliary_loss_mlp": 0.01025294, + "balance_loss_clip": 1.04094148, + "balance_loss_mlp": 1.0175575, + "epoch": 0.5134371430289184, + "flos": 15523537219200.0, + "grad_norm": 2.289197351862812, + "language_loss": 0.64247084, + "learning_rate": 2.0101265881453605e-06, + "loss": 0.6638791, + "num_input_tokens_seen": 92189945, + "step": 4270, + "time_per_iteration": 2.591341733932495 + }, + { + "auxiliary_loss_clip": 0.0115126, + "auxiliary_loss_mlp": 0.01031285, + "balance_loss_clip": 1.05270398, + "balance_loss_mlp": 1.02338719, + "epoch": 0.5135573859195575, + "flos": 21433786911360.0, + "grad_norm": 2.9830241243601687, + "language_loss": 0.78056633, + "learning_rate": 2.009347625734941e-06, + "loss": 0.80239177, + "num_input_tokens_seen": 92209855, + "step": 4271, + "time_per_iteration": 2.509723663330078 + }, + { + "auxiliary_loss_clip": 0.01189673, + "auxiliary_loss_mlp": 0.01026164, + "balance_loss_clip": 1.05817533, + "balance_loss_mlp": 1.01818252, + "epoch": 0.5136776288101966, + "flos": 17712651600000.0, + "grad_norm": 8.599307600756287, + "language_loss": 0.74845517, + "learning_rate": 2.0085686619064954e-06, + "loss": 0.77061355, + "num_input_tokens_seen": 92226295, + "step": 4272, + "time_per_iteration": 2.410330057144165 + }, + { + "auxiliary_loss_clip": 0.01176373, + "auxiliary_loss_mlp": 0.01028776, + "balance_loss_clip": 1.05602872, + "balance_loss_mlp": 1.02029443, + "epoch": 0.5137978717008357, + "flos": 16581680997120.0, + "grad_norm": 2.123203295235104, + "language_loss": 0.83115613, + "learning_rate": 2.00778969677819e-06, + "loss": 0.85320759, + "num_input_tokens_seen": 92243330, + "step": 4273, + "time_per_iteration": 2.42494797706604 + }, + { + "auxiliary_loss_clip": 0.01151987, + "auxiliary_loss_mlp": 0.01024397, + "balance_loss_clip": 1.04837012, + "balance_loss_mlp": 1.01636255, + "epoch": 0.5139181145914747, + "flos": 20668243322880.0, + "grad_norm": 16.273235014614713, + "language_loss": 0.63830173, + "learning_rate": 2.0070107304681934e-06, + "loss": 0.66006559, + "num_input_tokens_seen": 92262285, + "step": 4274, + "time_per_iteration": 2.487851858139038 + }, + { + "auxiliary_loss_clip": 0.01139814, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.05185354, + "balance_loss_mlp": 1.02181375, + "epoch": 0.5140383574821139, + "flos": 32926996546560.0, + "grad_norm": 1.7395945074414922, + "language_loss": 0.78105909, + "learning_rate": 2.006231763094675e-06, + "loss": 0.80275726, + "num_input_tokens_seen": 92283305, + "step": 4275, + "time_per_iteration": 2.622424840927124 + }, + { + "auxiliary_loss_clip": 0.01147605, + "auxiliary_loss_mlp": 0.01029646, + "balance_loss_clip": 1.05064178, + "balance_loss_mlp": 1.02139711, + "epoch": 0.514158600372753, + "flos": 19537093152000.0, + "grad_norm": 2.713356997687572, + "language_loss": 0.87646604, + "learning_rate": 2.0054527947758027e-06, + "loss": 0.8982386, + "num_input_tokens_seen": 92302105, + "step": 4276, + "time_per_iteration": 2.518834114074707 + }, + { + "auxiliary_loss_clip": 0.01068262, + "auxiliary_loss_mlp": 0.01000964, + "balance_loss_clip": 1.01835632, + "balance_loss_mlp": 0.99961644, + "epoch": 0.514278843263392, + "flos": 62523855279360.0, + "grad_norm": 0.7338668288819111, + "language_loss": 0.55983812, + "learning_rate": 2.004673825629746e-06, + "loss": 0.58053041, + "num_input_tokens_seen": 92362885, + "step": 4277, + "time_per_iteration": 3.0284054279327393 + }, + { + "auxiliary_loss_clip": 0.01149941, + "auxiliary_loss_mlp": 0.01025548, + "balance_loss_clip": 1.04911089, + "balance_loss_mlp": 1.01787722, + "epoch": 0.5143990861540312, + "flos": 25882328545920.0, + "grad_norm": 1.7213259595690227, + "language_loss": 0.72671747, + "learning_rate": 2.0038948557746744e-06, + "loss": 0.74847239, + "num_input_tokens_seen": 92384740, + "step": 4278, + "time_per_iteration": 2.5315308570861816 + }, + { + "auxiliary_loss_clip": 0.01165334, + "auxiliary_loss_mlp": 0.01029358, + "balance_loss_clip": 1.05199766, + "balance_loss_mlp": 1.02140641, + "epoch": 0.5145193290446702, + "flos": 23330660238720.0, + "grad_norm": 1.6630276258762988, + "language_loss": 0.75169647, + "learning_rate": 2.0031158853287558e-06, + "loss": 0.77364337, + "num_input_tokens_seen": 92405175, + "step": 4279, + "time_per_iteration": 2.4733145236968994 + }, + { + "auxiliary_loss_clip": 0.0115694, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.0540874, + "balance_loss_mlp": 1.02140749, + "epoch": 0.5146395719353093, + "flos": 22856603518080.0, + "grad_norm": 2.1225880752218327, + "language_loss": 0.7016269, + "learning_rate": 2.0023369144101593e-06, + "loss": 0.72349072, + "num_input_tokens_seen": 92423345, + "step": 4280, + "time_per_iteration": 2.4924206733703613 + }, + { + "auxiliary_loss_clip": 0.01145013, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.04746985, + "balance_loss_mlp": 1.02347374, + "epoch": 0.5147598148259485, + "flos": 26391577616640.0, + "grad_norm": 1.957873742325433, + "language_loss": 0.76748872, + "learning_rate": 2.0015579431370555e-06, + "loss": 0.7892518, + "num_input_tokens_seen": 92445025, + "step": 4281, + "time_per_iteration": 2.54952335357666 + }, + { + "auxiliary_loss_clip": 0.01166148, + "auxiliary_loss_mlp": 0.01028537, + "balance_loss_clip": 1.05245173, + "balance_loss_mlp": 1.02010918, + "epoch": 0.5148800577165875, + "flos": 29965694561280.0, + "grad_norm": 1.9114748392859429, + "language_loss": 0.69666427, + "learning_rate": 2.000778971627612e-06, + "loss": 0.71861112, + "num_input_tokens_seen": 92464490, + "step": 4282, + "time_per_iteration": 2.5264322757720947 + }, + { + "auxiliary_loss_clip": 0.01148837, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.04966879, + "balance_loss_mlp": 1.02493715, + "epoch": 0.5150003006072266, + "flos": 17931383470080.0, + "grad_norm": 1.7339772954260837, + "language_loss": 0.89998102, + "learning_rate": 2e-06, + "loss": 0.92180169, + "num_input_tokens_seen": 92482085, + "step": 4283, + "time_per_iteration": 2.4817631244659424 + }, + { + "auxiliary_loss_clip": 0.01182358, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.05434561, + "balance_loss_mlp": 1.02488863, + "epoch": 0.5151205434978657, + "flos": 18478733892480.0, + "grad_norm": 1.8252566795062057, + "language_loss": 0.8571924, + "learning_rate": 1.9992210283723878e-06, + "loss": 0.87934345, + "num_input_tokens_seen": 92499325, + "step": 4284, + "time_per_iteration": 3.8876686096191406 + }, + { + "auxiliary_loss_clip": 0.01183511, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.05555582, + "balance_loss_mlp": 1.02263165, + "epoch": 0.5152407863885048, + "flos": 25341263003520.0, + "grad_norm": 1.9559068782451292, + "language_loss": 0.7949118, + "learning_rate": 1.9984420568629448e-06, + "loss": 0.81705058, + "num_input_tokens_seen": 92522090, + "step": 4285, + "time_per_iteration": 2.4909989833831787 + }, + { + "auxiliary_loss_clip": 0.01169279, + "auxiliary_loss_mlp": 0.01026249, + "balance_loss_clip": 1.05168402, + "balance_loss_mlp": 1.01855421, + "epoch": 0.5153610292791438, + "flos": 18329740277760.0, + "grad_norm": 2.3341508306764447, + "language_loss": 0.77907807, + "learning_rate": 1.9976630855898405e-06, + "loss": 0.80103332, + "num_input_tokens_seen": 92539845, + "step": 4286, + "time_per_iteration": 4.124485969543457 + }, + { + "auxiliary_loss_clip": 0.01147341, + "auxiliary_loss_mlp": 0.01025195, + "balance_loss_clip": 1.04475331, + "balance_loss_mlp": 1.01736927, + "epoch": 0.515481272169783, + "flos": 30409945971840.0, + "grad_norm": 2.078819615209203, + "language_loss": 0.7457974, + "learning_rate": 1.9968841146712445e-06, + "loss": 0.76752275, + "num_input_tokens_seen": 92559460, + "step": 4287, + "time_per_iteration": 2.5651204586029053 + }, + { + "auxiliary_loss_clip": 0.01108758, + "auxiliary_loss_mlp": 0.00762392, + "balance_loss_clip": 1.04444146, + "balance_loss_mlp": 1.00115049, + "epoch": 0.5156015150604221, + "flos": 23037305863680.0, + "grad_norm": 1.6203862086029734, + "language_loss": 0.71297801, + "learning_rate": 1.996105144225326e-06, + "loss": 0.73168945, + "num_input_tokens_seen": 92579695, + "step": 4288, + "time_per_iteration": 2.6475913524627686 + }, + { + "auxiliary_loss_clip": 0.01170885, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.05461013, + "balance_loss_mlp": 1.02117825, + "epoch": 0.5157217579510611, + "flos": 17858556645120.0, + "grad_norm": 1.7655440428589788, + "language_loss": 0.78786099, + "learning_rate": 1.995326174370254e-06, + "loss": 0.80985761, + "num_input_tokens_seen": 92598795, + "step": 4289, + "time_per_iteration": 2.431386947631836 + }, + { + "auxiliary_loss_clip": 0.01165379, + "auxiliary_loss_mlp": 0.00762501, + "balance_loss_clip": 1.05046451, + "balance_loss_mlp": 1.00122035, + "epoch": 0.5158420008417003, + "flos": 19171486569600.0, + "grad_norm": 1.6009767529082317, + "language_loss": 0.72832721, + "learning_rate": 1.994547205224197e-06, + "loss": 0.74760604, + "num_input_tokens_seen": 92617700, + "step": 4290, + "time_per_iteration": 2.457494020462036 + }, + { + "auxiliary_loss_clip": 0.01147588, + "auxiliary_loss_mlp": 0.01025481, + "balance_loss_clip": 1.04976892, + "balance_loss_mlp": 1.01710081, + "epoch": 0.5159622437323393, + "flos": 22419534827520.0, + "grad_norm": 2.030350465556487, + "language_loss": 0.6753794, + "learning_rate": 1.993768236905325e-06, + "loss": 0.69711006, + "num_input_tokens_seen": 92638370, + "step": 4291, + "time_per_iteration": 2.503051996231079 + }, + { + "auxiliary_loss_clip": 0.01149814, + "auxiliary_loss_mlp": 0.01024735, + "balance_loss_clip": 1.04891849, + "balance_loss_mlp": 1.01645017, + "epoch": 0.5160824866229784, + "flos": 24603010773120.0, + "grad_norm": 3.030207906103901, + "language_loss": 0.66460657, + "learning_rate": 1.992989269531807e-06, + "loss": 0.68635207, + "num_input_tokens_seen": 92657180, + "step": 4292, + "time_per_iteration": 2.5193285942077637 + }, + { + "auxiliary_loss_clip": 0.01155153, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.04961371, + "balance_loss_mlp": 1.01934528, + "epoch": 0.5162027295136175, + "flos": 18002737837440.0, + "grad_norm": 4.234044743260658, + "language_loss": 0.68012679, + "learning_rate": 1.99221030322181e-06, + "loss": 0.7019546, + "num_input_tokens_seen": 92673985, + "step": 4293, + "time_per_iteration": 2.453245162963867 + }, + { + "auxiliary_loss_clip": 0.01157227, + "auxiliary_loss_mlp": 0.01030327, + "balance_loss_clip": 1.0503521, + "balance_loss_mlp": 1.02241158, + "epoch": 0.5163229724042566, + "flos": 27344611221120.0, + "grad_norm": 1.7433641357529006, + "language_loss": 0.80863488, + "learning_rate": 1.991431338093505e-06, + "loss": 0.83051038, + "num_input_tokens_seen": 92696340, + "step": 4294, + "time_per_iteration": 2.548217535018921 + }, + { + "auxiliary_loss_clip": 0.0115602, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.05521441, + "balance_loss_mlp": 1.02293217, + "epoch": 0.5164432152948957, + "flos": 21762764599680.0, + "grad_norm": 1.926271319362548, + "language_loss": 0.79378033, + "learning_rate": 1.9906523742650587e-06, + "loss": 0.81564295, + "num_input_tokens_seen": 92715200, + "step": 4295, + "time_per_iteration": 2.515872001647949 + }, + { + "auxiliary_loss_clip": 0.01182718, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.05136895, + "balance_loss_mlp": 1.02659035, + "epoch": 0.5165634581855347, + "flos": 25550334115200.0, + "grad_norm": 2.0815549553362525, + "language_loss": 0.77551997, + "learning_rate": 1.9898734118546397e-06, + "loss": 0.79769921, + "num_input_tokens_seen": 92735150, + "step": 4296, + "time_per_iteration": 2.4697105884552 + }, + { + "auxiliary_loss_clip": 0.01102936, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.04563737, + "balance_loss_mlp": 1.01959932, + "epoch": 0.5166837010761739, + "flos": 19901191363200.0, + "grad_norm": 1.628271525195498, + "language_loss": 0.80356407, + "learning_rate": 1.989094450980416e-06, + "loss": 0.82487708, + "num_input_tokens_seen": 92755250, + "step": 4297, + "time_per_iteration": 2.634310245513916 + }, + { + "auxiliary_loss_clip": 0.01166916, + "auxiliary_loss_mlp": 0.01024368, + "balance_loss_clip": 1.05143642, + "balance_loss_mlp": 1.01666141, + "epoch": 0.516803943966813, + "flos": 26646076454400.0, + "grad_norm": 2.3479276796710384, + "language_loss": 0.76592594, + "learning_rate": 1.9883154917605556e-06, + "loss": 0.78783876, + "num_input_tokens_seen": 92774460, + "step": 4298, + "time_per_iteration": 2.5094151496887207 + }, + { + "auxiliary_loss_clip": 0.01181252, + "auxiliary_loss_mlp": 0.01023633, + "balance_loss_clip": 1.05238223, + "balance_loss_mlp": 1.01624238, + "epoch": 0.516924186857452, + "flos": 19682854542720.0, + "grad_norm": 1.7408076908878303, + "language_loss": 0.8339808, + "learning_rate": 1.9875365343132262e-06, + "loss": 0.85602969, + "num_input_tokens_seen": 92791580, + "step": 4299, + "time_per_iteration": 2.4376399517059326 + }, + { + "auxiliary_loss_clip": 0.01169198, + "auxiliary_loss_mlp": 0.0076252, + "balance_loss_clip": 1.05442381, + "balance_loss_mlp": 1.00115287, + "epoch": 0.5170444297480912, + "flos": 15956583586560.0, + "grad_norm": 2.0498255011926, + "language_loss": 0.84619457, + "learning_rate": 1.9867575787565946e-06, + "loss": 0.86551172, + "num_input_tokens_seen": 92806240, + "step": 4300, + "time_per_iteration": 2.4498982429504395 + }, + { + "auxiliary_loss_clip": 0.01171578, + "auxiliary_loss_mlp": 0.01024171, + "balance_loss_clip": 1.05382872, + "balance_loss_mlp": 1.01564205, + "epoch": 0.5171646726387302, + "flos": 14174157968640.0, + "grad_norm": 1.7482975207481142, + "language_loss": 0.86132205, + "learning_rate": 1.9859786252088275e-06, + "loss": 0.88327956, + "num_input_tokens_seen": 92823420, + "step": 4301, + "time_per_iteration": 2.48504900932312 + }, + { + "auxiliary_loss_clip": 0.01144787, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.05018604, + "balance_loss_mlp": 1.02279782, + "epoch": 0.5172849155293693, + "flos": 23578550974080.0, + "grad_norm": 2.614293716648407, + "language_loss": 0.66733187, + "learning_rate": 1.9851996737880914e-06, + "loss": 0.68909621, + "num_input_tokens_seen": 92838605, + "step": 4302, + "time_per_iteration": 2.517916679382324 + }, + { + "auxiliary_loss_clip": 0.01173479, + "auxiliary_loss_mlp": 0.01031965, + "balance_loss_clip": 1.05328131, + "balance_loss_mlp": 1.02310777, + "epoch": 0.5174051584200084, + "flos": 14283541860480.0, + "grad_norm": 1.9411322544822995, + "language_loss": 0.74524879, + "learning_rate": 1.9844207246125537e-06, + "loss": 0.76730323, + "num_input_tokens_seen": 92855185, + "step": 4303, + "time_per_iteration": 2.4536843299865723 + }, + { + "auxiliary_loss_clip": 0.01151801, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.04945052, + "balance_loss_mlp": 1.02026057, + "epoch": 0.5175254013106475, + "flos": 37889384192640.0, + "grad_norm": 1.7784982953747475, + "language_loss": 0.68487847, + "learning_rate": 1.983641777800379e-06, + "loss": 0.70667189, + "num_input_tokens_seen": 92877830, + "step": 4304, + "time_per_iteration": 2.639650821685791 + }, + { + "auxiliary_loss_clip": 0.01064025, + "auxiliary_loss_mlp": 0.01003673, + "balance_loss_clip": 1.01955771, + "balance_loss_mlp": 1.00227821, + "epoch": 0.5176456442012866, + "flos": 68549737829760.0, + "grad_norm": 0.7522740503940162, + "language_loss": 0.58766669, + "learning_rate": 1.9828628334697343e-06, + "loss": 0.60834366, + "num_input_tokens_seen": 92945040, + "step": 4305, + "time_per_iteration": 3.2332921028137207 + }, + { + "auxiliary_loss_clip": 0.01060804, + "auxiliary_loss_mlp": 0.01003177, + "balance_loss_clip": 1.01603961, + "balance_loss_mlp": 1.0017699, + "epoch": 0.5177658870919257, + "flos": 64084137235200.0, + "grad_norm": 0.7691597561438883, + "language_loss": 0.54694211, + "learning_rate": 1.982083891738784e-06, + "loss": 0.56758189, + "num_input_tokens_seen": 93005910, + "step": 4306, + "time_per_iteration": 3.1484103202819824 + }, + { + "auxiliary_loss_clip": 0.01148104, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.051718, + "balance_loss_mlp": 1.02118564, + "epoch": 0.5178861299825648, + "flos": 26651248012800.0, + "grad_norm": 1.9309157280685676, + "language_loss": 0.82691967, + "learning_rate": 1.9813049527256923e-06, + "loss": 0.84869266, + "num_input_tokens_seen": 93026305, + "step": 4307, + "time_per_iteration": 2.534921407699585 + }, + { + "auxiliary_loss_clip": 0.011377, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.04596567, + "balance_loss_mlp": 1.0213908, + "epoch": 0.5180063728732038, + "flos": 17931886260480.0, + "grad_norm": 2.6260453228584146, + "language_loss": 0.82351363, + "learning_rate": 1.9805260165486252e-06, + "loss": 0.84518331, + "num_input_tokens_seen": 93045675, + "step": 4308, + "time_per_iteration": 2.5155649185180664 + }, + { + "auxiliary_loss_clip": 0.01169125, + "auxiliary_loss_mlp": 0.01023256, + "balance_loss_clip": 1.05419886, + "balance_loss_mlp": 1.01547766, + "epoch": 0.518126615763843, + "flos": 19500895221120.0, + "grad_norm": 2.2892407674309325, + "language_loss": 0.86445451, + "learning_rate": 1.9797470833257457e-06, + "loss": 0.88637829, + "num_input_tokens_seen": 93065375, + "step": 4309, + "time_per_iteration": 2.459787130355835 + }, + { + "auxiliary_loss_clip": 0.01170147, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.05391705, + "balance_loss_mlp": 1.02313364, + "epoch": 0.5182468586544821, + "flos": 20704082117760.0, + "grad_norm": 2.172968483503679, + "language_loss": 0.77093643, + "learning_rate": 1.9789681531752177e-06, + "loss": 0.79295522, + "num_input_tokens_seen": 93085595, + "step": 4310, + "time_per_iteration": 3.962002992630005 + }, + { + "auxiliary_loss_clip": 0.01121912, + "auxiliary_loss_mlp": 0.01025032, + "balance_loss_clip": 1.04827702, + "balance_loss_mlp": 1.01761699, + "epoch": 0.5183671015451211, + "flos": 23112107936640.0, + "grad_norm": 1.8012916028612995, + "language_loss": 0.72585028, + "learning_rate": 1.978189226215204e-06, + "loss": 0.7473197, + "num_input_tokens_seen": 93106140, + "step": 4311, + "time_per_iteration": 2.5666751861572266 + }, + { + "auxiliary_loss_clip": 0.01184013, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.05537724, + "balance_loss_mlp": 1.01759648, + "epoch": 0.5184873444357603, + "flos": 17597090568960.0, + "grad_norm": 3.0976245540117726, + "language_loss": 0.77410263, + "learning_rate": 1.9774103025638675e-06, + "loss": 0.7962029, + "num_input_tokens_seen": 93124265, + "step": 4312, + "time_per_iteration": 3.214688777923584 + }, + { + "auxiliary_loss_clip": 0.01130636, + "auxiliary_loss_mlp": 0.01021476, + "balance_loss_clip": 1.05384207, + "balance_loss_mlp": 1.01366234, + "epoch": 0.5186075873263993, + "flos": 24936800883840.0, + "grad_norm": 1.925629806533987, + "language_loss": 0.76642346, + "learning_rate": 1.9766313823393696e-06, + "loss": 0.78794456, + "num_input_tokens_seen": 93145130, + "step": 4313, + "time_per_iteration": 3.4056339263916016 + }, + { + "auxiliary_loss_clip": 0.01121749, + "auxiliary_loss_mlp": 0.01028224, + "balance_loss_clip": 1.04514265, + "balance_loss_mlp": 1.02002263, + "epoch": 0.5187278302170384, + "flos": 15190106244480.0, + "grad_norm": 2.3054767724214402, + "language_loss": 0.69417441, + "learning_rate": 1.975852465659873e-06, + "loss": 0.71567416, + "num_input_tokens_seen": 93161110, + "step": 4314, + "time_per_iteration": 2.5106568336486816 + }, + { + "auxiliary_loss_clip": 0.01171042, + "auxiliary_loss_mlp": 0.010346, + "balance_loss_clip": 1.05490589, + "balance_loss_mlp": 1.0266484, + "epoch": 0.5188480731076776, + "flos": 25009412227200.0, + "grad_norm": 2.1080021969818623, + "language_loss": 0.69832277, + "learning_rate": 1.9750735526435377e-06, + "loss": 0.72037917, + "num_input_tokens_seen": 93178055, + "step": 4315, + "time_per_iteration": 2.4819862842559814 + }, + { + "auxiliary_loss_clip": 0.01152789, + "auxiliary_loss_mlp": 0.01024409, + "balance_loss_clip": 1.05212545, + "balance_loss_mlp": 1.01631451, + "epoch": 0.5189683159983166, + "flos": 24790141653120.0, + "grad_norm": 3.16464961397824, + "language_loss": 0.79578602, + "learning_rate": 1.974294643408525e-06, + "loss": 0.81755799, + "num_input_tokens_seen": 93195850, + "step": 4316, + "time_per_iteration": 2.508026599884033 + }, + { + "auxiliary_loss_clip": 0.01171713, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.05120313, + "balance_loss_mlp": 1.02214551, + "epoch": 0.5190885588889557, + "flos": 24754266944640.0, + "grad_norm": 2.143946243110122, + "language_loss": 0.67100811, + "learning_rate": 1.9735157380729947e-06, + "loss": 0.69302374, + "num_input_tokens_seen": 93216260, + "step": 4317, + "time_per_iteration": 2.4895224571228027 + }, + { + "auxiliary_loss_clip": 0.01156048, + "auxiliary_loss_mlp": 0.0102404, + "balance_loss_clip": 1.05011129, + "balance_loss_mlp": 1.01678967, + "epoch": 0.5192088017795948, + "flos": 24712646060160.0, + "grad_norm": 1.8081672773283672, + "language_loss": 0.8415677, + "learning_rate": 1.9727368367551053e-06, + "loss": 0.86336857, + "num_input_tokens_seen": 93234810, + "step": 4318, + "time_per_iteration": 2.528599500656128 + }, + { + "auxiliary_loss_clip": 0.01140495, + "auxiliary_loss_mlp": 0.01026142, + "balance_loss_clip": 1.04719925, + "balance_loss_mlp": 1.01848841, + "epoch": 0.5193290446702339, + "flos": 27229588894080.0, + "grad_norm": 1.783297877779568, + "language_loss": 0.68130887, + "learning_rate": 1.9719579395730164e-06, + "loss": 0.70297527, + "num_input_tokens_seen": 93254185, + "step": 4319, + "time_per_iteration": 2.5353586673736572 + }, + { + "auxiliary_loss_clip": 0.01186157, + "auxiliary_loss_mlp": 0.01025732, + "balance_loss_clip": 1.05793905, + "balance_loss_mlp": 1.01785231, + "epoch": 0.5194492875608729, + "flos": 11473352392320.0, + "grad_norm": 2.7194186337444863, + "language_loss": 0.93341887, + "learning_rate": 1.9711790466448854e-06, + "loss": 0.9555378, + "num_input_tokens_seen": 93268205, + "step": 4320, + "time_per_iteration": 2.3843204975128174 + }, + { + "auxiliary_loss_clip": 0.01132993, + "auxiliary_loss_mlp": 0.01037905, + "balance_loss_clip": 1.05105686, + "balance_loss_mlp": 1.02953672, + "epoch": 0.5195695304515121, + "flos": 20338906498560.0, + "grad_norm": 2.327607954044515, + "language_loss": 0.71255684, + "learning_rate": 1.9704001580888704e-06, + "loss": 0.7342658, + "num_input_tokens_seen": 93286945, + "step": 4321, + "time_per_iteration": 2.575695276260376 + }, + { + "auxiliary_loss_clip": 0.01147486, + "auxiliary_loss_mlp": 0.00762681, + "balance_loss_clip": 1.04696488, + "balance_loss_mlp": 1.00118661, + "epoch": 0.5196897733421512, + "flos": 20048317470720.0, + "grad_norm": 2.044955063587946, + "language_loss": 0.8655889, + "learning_rate": 1.9696212740231283e-06, + "loss": 0.88469052, + "num_input_tokens_seen": 93305595, + "step": 4322, + "time_per_iteration": 2.503593921661377 + }, + { + "auxiliary_loss_clip": 0.01174145, + "auxiliary_loss_mlp": 0.01028931, + "balance_loss_clip": 1.05135584, + "balance_loss_mlp": 1.02029419, + "epoch": 0.5198100162327902, + "flos": 23805507058560.0, + "grad_norm": 2.1194391564242285, + "language_loss": 0.82128316, + "learning_rate": 1.9688423945658146e-06, + "loss": 0.84331393, + "num_input_tokens_seen": 93326460, + "step": 4323, + "time_per_iteration": 2.492277145385742 + }, + { + "auxiliary_loss_clip": 0.01115255, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.04132485, + "balance_loss_mlp": 1.02018178, + "epoch": 0.5199302591234293, + "flos": 24023951619840.0, + "grad_norm": 2.535710957989351, + "language_loss": 0.7177617, + "learning_rate": 1.9680635198350845e-06, + "loss": 0.73919857, + "num_input_tokens_seen": 93346170, + "step": 4324, + "time_per_iteration": 2.568620204925537 + }, + { + "auxiliary_loss_clip": 0.01168542, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.05092514, + "balance_loss_mlp": 1.02541625, + "epoch": 0.5200505020140684, + "flos": 26359366095360.0, + "grad_norm": 2.180665461900524, + "language_loss": 0.72586119, + "learning_rate": 1.967284649949093e-06, + "loss": 0.74788702, + "num_input_tokens_seen": 93365380, + "step": 4325, + "time_per_iteration": 2.547022581100464 + }, + { + "auxiliary_loss_clip": 0.01135384, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.04554653, + "balance_loss_mlp": 1.02234912, + "epoch": 0.5201707449047075, + "flos": 39604262284800.0, + "grad_norm": 4.040305585212235, + "language_loss": 0.72375184, + "learning_rate": 1.966505785025994e-06, + "loss": 0.7454108, + "num_input_tokens_seen": 93387285, + "step": 4326, + "time_per_iteration": 2.6775615215301514 + }, + { + "auxiliary_loss_clip": 0.01139995, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.05006576, + "balance_loss_mlp": 1.02114904, + "epoch": 0.5202909877953465, + "flos": 53682788292480.0, + "grad_norm": 2.080341639166474, + "language_loss": 0.76063704, + "learning_rate": 1.965726925183941e-06, + "loss": 0.78233182, + "num_input_tokens_seen": 93410390, + "step": 4327, + "time_per_iteration": 2.8272764682769775 + }, + { + "auxiliary_loss_clip": 0.01184086, + "auxiliary_loss_mlp": 0.01025637, + "balance_loss_clip": 1.05602908, + "balance_loss_mlp": 1.01803184, + "epoch": 0.5204112306859857, + "flos": 19537021324800.0, + "grad_norm": 1.7780200750154354, + "language_loss": 0.84809518, + "learning_rate": 1.964948070541087e-06, + "loss": 0.87019247, + "num_input_tokens_seen": 93429050, + "step": 4328, + "time_per_iteration": 2.4266016483306885 + }, + { + "auxiliary_loss_clip": 0.01157768, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.04866004, + "balance_loss_mlp": 1.02160239, + "epoch": 0.5205314735766248, + "flos": 15304697608320.0, + "grad_norm": 2.348432075139801, + "language_loss": 0.69418484, + "learning_rate": 1.9641692212155816e-06, + "loss": 0.71605837, + "num_input_tokens_seen": 93446815, + "step": 4329, + "time_per_iteration": 2.439492702484131 + }, + { + "auxiliary_loss_clip": 0.01125738, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.05230308, + "balance_loss_mlp": 1.02285576, + "epoch": 0.5206517164672638, + "flos": 59263701160320.0, + "grad_norm": 1.928720902710859, + "language_loss": 0.72607028, + "learning_rate": 1.9633903773255777e-06, + "loss": 0.74763936, + "num_input_tokens_seen": 93469130, + "step": 4330, + "time_per_iteration": 2.901442050933838 + }, + { + "auxiliary_loss_clip": 0.01180079, + "auxiliary_loss_mlp": 0.01025555, + "balance_loss_clip": 1.05098104, + "balance_loss_mlp": 1.01763964, + "epoch": 0.520771959357903, + "flos": 26871129118080.0, + "grad_norm": 2.0004540693868837, + "language_loss": 0.74754113, + "learning_rate": 1.9626115389892237e-06, + "loss": 0.76959747, + "num_input_tokens_seen": 93489920, + "step": 4331, + "time_per_iteration": 2.4910600185394287 + }, + { + "auxiliary_loss_clip": 0.01146302, + "auxiliary_loss_mlp": 0.01026928, + "balance_loss_clip": 1.04986691, + "balance_loss_mlp": 1.01876187, + "epoch": 0.520892202248542, + "flos": 26907075653760.0, + "grad_norm": 2.364076782006252, + "language_loss": 0.85188651, + "learning_rate": 1.96183270632467e-06, + "loss": 0.87361884, + "num_input_tokens_seen": 93509770, + "step": 4332, + "time_per_iteration": 2.561065435409546 + }, + { + "auxiliary_loss_clip": 0.01133944, + "auxiliary_loss_mlp": 0.00763234, + "balance_loss_clip": 1.04693699, + "balance_loss_mlp": 1.00114119, + "epoch": 0.5210124451391811, + "flos": 25849434666240.0, + "grad_norm": 4.423371487420675, + "language_loss": 0.79210019, + "learning_rate": 1.9610538794500644e-06, + "loss": 0.81107199, + "num_input_tokens_seen": 93529320, + "step": 4333, + "time_per_iteration": 2.564530372619629 + }, + { + "auxiliary_loss_clip": 0.0105242, + "auxiliary_loss_mlp": 0.01005277, + "balance_loss_clip": 1.01997709, + "balance_loss_mlp": 1.00384045, + "epoch": 0.5211326880298203, + "flos": 70553804319360.0, + "grad_norm": 0.7747172601442663, + "language_loss": 0.59402382, + "learning_rate": 1.9602750584835542e-06, + "loss": 0.61460078, + "num_input_tokens_seen": 93595255, + "step": 4334, + "time_per_iteration": 3.199118137359619 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01022508, + "balance_loss_clip": 1.04929781, + "balance_loss_mlp": 1.01459217, + "epoch": 0.5212529309204593, + "flos": 15628898787840.0, + "grad_norm": 2.0962540754400796, + "language_loss": 0.82569122, + "learning_rate": 1.959496243543286e-06, + "loss": 0.84742391, + "num_input_tokens_seen": 93613135, + "step": 4335, + "time_per_iteration": 2.4704973697662354 + }, + { + "auxiliary_loss_clip": 0.01173032, + "auxiliary_loss_mlp": 0.01037758, + "balance_loss_clip": 1.0571506, + "balance_loss_mlp": 1.02929974, + "epoch": 0.5213731738110984, + "flos": 26242655829120.0, + "grad_norm": 2.304193038376994, + "language_loss": 0.79169029, + "learning_rate": 1.9587174347474057e-06, + "loss": 0.81379819, + "num_input_tokens_seen": 93629645, + "step": 4336, + "time_per_iteration": 2.4885642528533936 + }, + { + "auxiliary_loss_clip": 0.01111999, + "auxiliary_loss_mlp": 0.01031937, + "balance_loss_clip": 1.0449959, + "balance_loss_mlp": 1.02384841, + "epoch": 0.5214934167017375, + "flos": 19418407637760.0, + "grad_norm": 7.341602128491673, + "language_loss": 0.81936133, + "learning_rate": 1.9579386322140574e-06, + "loss": 0.84080058, + "num_input_tokens_seen": 93645325, + "step": 4337, + "time_per_iteration": 3.3153884410858154 + }, + { + "auxiliary_loss_clip": 0.01185979, + "auxiliary_loss_mlp": 0.00763308, + "balance_loss_clip": 1.05567634, + "balance_loss_mlp": 1.0012095, + "epoch": 0.5216136595923766, + "flos": 30955788023040.0, + "grad_norm": 1.8437824724916956, + "language_loss": 0.8127681, + "learning_rate": 1.9571598360613854e-06, + "loss": 0.83226097, + "num_input_tokens_seen": 93668200, + "step": 4338, + "time_per_iteration": 3.2419564723968506 + }, + { + "auxiliary_loss_clip": 0.01138931, + "auxiliary_loss_mlp": 0.01023595, + "balance_loss_clip": 1.04594946, + "balance_loss_mlp": 1.01580489, + "epoch": 0.5217339024830157, + "flos": 21945047143680.0, + "grad_norm": 4.313094568643801, + "language_loss": 0.69812417, + "learning_rate": 1.956381046407532e-06, + "loss": 0.71974945, + "num_input_tokens_seen": 93688495, + "step": 4339, + "time_per_iteration": 3.265094757080078 + }, + { + "auxiliary_loss_clip": 0.0113534, + "auxiliary_loss_mlp": 0.01032938, + "balance_loss_clip": 1.04725134, + "balance_loss_mlp": 1.02461684, + "epoch": 0.5218541453736548, + "flos": 20923209037440.0, + "grad_norm": 1.7588379020326972, + "language_loss": 0.86033738, + "learning_rate": 1.9556022633706394e-06, + "loss": 0.88202018, + "num_input_tokens_seen": 93707285, + "step": 4340, + "time_per_iteration": 3.291384696960449 + }, + { + "auxiliary_loss_clip": 0.01147706, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.05035162, + "balance_loss_mlp": 1.0241096, + "epoch": 0.5219743882642939, + "flos": 23951663498880.0, + "grad_norm": 1.7481999464173943, + "language_loss": 0.80052876, + "learning_rate": 1.954823487068848e-06, + "loss": 0.8223266, + "num_input_tokens_seen": 93727495, + "step": 4341, + "time_per_iteration": 2.517284393310547 + }, + { + "auxiliary_loss_clip": 0.01170437, + "auxiliary_loss_mlp": 0.01029595, + "balance_loss_clip": 1.05505204, + "balance_loss_mlp": 1.0217154, + "epoch": 0.5220946311549329, + "flos": 28799280213120.0, + "grad_norm": 2.2701317007809183, + "language_loss": 0.81243801, + "learning_rate": 1.9540447176202976e-06, + "loss": 0.83443832, + "num_input_tokens_seen": 93748740, + "step": 4342, + "time_per_iteration": 2.5257985591888428 + }, + { + "auxiliary_loss_clip": 0.0106864, + "auxiliary_loss_mlp": 0.0100365, + "balance_loss_clip": 1.01925981, + "balance_loss_mlp": 1.00226688, + "epoch": 0.5222148740455721, + "flos": 67189369017600.0, + "grad_norm": 0.8841404808611882, + "language_loss": 0.60694313, + "learning_rate": 1.9532659551431272e-06, + "loss": 0.627666, + "num_input_tokens_seen": 93815770, + "step": 4343, + "time_per_iteration": 3.205874443054199 + }, + { + "auxiliary_loss_clip": 0.01169863, + "auxiliary_loss_mlp": 0.01027038, + "balance_loss_clip": 1.05245578, + "balance_loss_mlp": 1.01918769, + "epoch": 0.5223351169362112, + "flos": 61856164339200.0, + "grad_norm": 1.7979523362041796, + "language_loss": 0.67392033, + "learning_rate": 1.9524871997554744e-06, + "loss": 0.69588935, + "num_input_tokens_seen": 93843530, + "step": 4344, + "time_per_iteration": 2.8448314666748047 + }, + { + "auxiliary_loss_clip": 0.01171092, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.05449414, + "balance_loss_mlp": 1.02382314, + "epoch": 0.5224553598268502, + "flos": 14647388676480.0, + "grad_norm": 2.229760813981262, + "language_loss": 0.8051427, + "learning_rate": 1.951708451575475e-06, + "loss": 0.8271724, + "num_input_tokens_seen": 93860595, + "step": 4345, + "time_per_iteration": 2.436274528503418 + }, + { + "auxiliary_loss_clip": 0.01148203, + "auxiliary_loss_mlp": 0.01029074, + "balance_loss_clip": 1.04785442, + "balance_loss_mlp": 1.02119422, + "epoch": 0.5225756027174894, + "flos": 14826043946880.0, + "grad_norm": 3.9008199223702333, + "language_loss": 0.81980526, + "learning_rate": 1.9509297107212657e-06, + "loss": 0.84157807, + "num_input_tokens_seen": 93877365, + "step": 4346, + "time_per_iteration": 2.5011889934539795 + }, + { + "auxiliary_loss_clip": 0.01181168, + "auxiliary_loss_mlp": 0.01027287, + "balance_loss_clip": 1.05388188, + "balance_loss_mlp": 1.01936519, + "epoch": 0.5226958456081284, + "flos": 23512009029120.0, + "grad_norm": 1.7465049292583485, + "language_loss": 0.79297876, + "learning_rate": 1.95015097731098e-06, + "loss": 0.81506324, + "num_input_tokens_seen": 93896855, + "step": 4347, + "time_per_iteration": 2.4576752185821533 + }, + { + "auxiliary_loss_clip": 0.01182507, + "auxiliary_loss_mlp": 0.01024775, + "balance_loss_clip": 1.05445814, + "balance_loss_mlp": 1.01698434, + "epoch": 0.5228160884987675, + "flos": 19062928690560.0, + "grad_norm": 9.810497952227907, + "language_loss": 0.81702006, + "learning_rate": 1.949372251462751e-06, + "loss": 0.83909285, + "num_input_tokens_seen": 93914270, + "step": 4348, + "time_per_iteration": 2.4164507389068604 + }, + { + "auxiliary_loss_clip": 0.01140565, + "auxiliary_loss_mlp": 0.00762261, + "balance_loss_clip": 1.05096602, + "balance_loss_mlp": 1.00111437, + "epoch": 0.5229363313894067, + "flos": 21063224252160.0, + "grad_norm": 1.8744842586335146, + "language_loss": 0.82712626, + "learning_rate": 1.9485935332947124e-06, + "loss": 0.84615457, + "num_input_tokens_seen": 93932180, + "step": 4349, + "time_per_iteration": 2.5799262523651123 + }, + { + "auxiliary_loss_clip": 0.01147492, + "auxiliary_loss_mlp": 0.01026836, + "balance_loss_clip": 1.04997253, + "balance_loss_mlp": 1.01943278, + "epoch": 0.5230565742800457, + "flos": 14830389492480.0, + "grad_norm": 2.933719361962067, + "language_loss": 0.83144867, + "learning_rate": 1.947814822924993e-06, + "loss": 0.85319197, + "num_input_tokens_seen": 93949690, + "step": 4350, + "time_per_iteration": 2.490199565887451 + }, + { + "auxiliary_loss_clip": 0.01181891, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.05604458, + "balance_loss_mlp": 1.02220213, + "epoch": 0.5231768171706848, + "flos": 25813021253760.0, + "grad_norm": 1.868259488906551, + "language_loss": 0.83078277, + "learning_rate": 1.9470361204717236e-06, + "loss": 0.85289764, + "num_input_tokens_seen": 93968830, + "step": 4351, + "time_per_iteration": 2.4720118045806885 + }, + { + "auxiliary_loss_clip": 0.01144426, + "auxiliary_loss_mlp": 0.00762773, + "balance_loss_clip": 1.0515666, + "balance_loss_mlp": 1.00118661, + "epoch": 0.5232970600613239, + "flos": 22743807834240.0, + "grad_norm": 2.4899613450485565, + "language_loss": 0.8099547, + "learning_rate": 1.9462574260530326e-06, + "loss": 0.8290267, + "num_input_tokens_seen": 93989110, + "step": 4352, + "time_per_iteration": 2.5582191944122314 + }, + { + "auxiliary_loss_clip": 0.01158754, + "auxiliary_loss_mlp": 0.01027505, + "balance_loss_clip": 1.0506711, + "balance_loss_mlp": 1.01947033, + "epoch": 0.523417302951963, + "flos": 17310703432320.0, + "grad_norm": 2.389743011163683, + "language_loss": 0.80829996, + "learning_rate": 1.9454787397870472e-06, + "loss": 0.83016253, + "num_input_tokens_seen": 94006430, + "step": 4353, + "time_per_iteration": 2.461106538772583 + }, + { + "auxiliary_loss_clip": 0.01103787, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.04655361, + "balance_loss_mlp": 1.02260184, + "epoch": 0.523537545842602, + "flos": 18551740285440.0, + "grad_norm": 5.112867677468753, + "language_loss": 0.7200402, + "learning_rate": 1.944700061791894e-06, + "loss": 0.74138415, + "num_input_tokens_seen": 94024825, + "step": 4354, + "time_per_iteration": 2.5614349842071533 + }, + { + "auxiliary_loss_clip": 0.01166963, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.05353606, + "balance_loss_mlp": 1.02211392, + "epoch": 0.5236577887332411, + "flos": 19719267955200.0, + "grad_norm": 2.4711455714344215, + "language_loss": 0.65442604, + "learning_rate": 1.943921392185698e-06, + "loss": 0.67639494, + "num_input_tokens_seen": 94043450, + "step": 4355, + "time_per_iteration": 2.4719629287719727 + }, + { + "auxiliary_loss_clip": 0.01156331, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.04974627, + "balance_loss_mlp": 1.02016032, + "epoch": 0.5237780316238803, + "flos": 23550218121600.0, + "grad_norm": 2.015464497409114, + "language_loss": 0.77067238, + "learning_rate": 1.9431427310865814e-06, + "loss": 0.7925173, + "num_input_tokens_seen": 94063055, + "step": 4356, + "time_per_iteration": 2.5503029823303223 + }, + { + "auxiliary_loss_clip": 0.01120924, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.04482961, + "balance_loss_mlp": 1.0210669, + "epoch": 0.5238982745145193, + "flos": 22491894775680.0, + "grad_norm": 2.010861087111963, + "language_loss": 0.78460169, + "learning_rate": 1.942364078612667e-06, + "loss": 0.80610442, + "num_input_tokens_seen": 94081785, + "step": 4357, + "time_per_iteration": 2.5300567150115967 + }, + { + "auxiliary_loss_clip": 0.01145953, + "auxiliary_loss_mlp": 0.01023588, + "balance_loss_clip": 1.04877949, + "balance_loss_mlp": 1.01602447, + "epoch": 0.5240185174051584, + "flos": 27088927234560.0, + "grad_norm": 1.9102534880030595, + "language_loss": 0.75228024, + "learning_rate": 1.9415854348820765e-06, + "loss": 0.77397561, + "num_input_tokens_seen": 94101635, + "step": 4358, + "time_per_iteration": 2.563305616378784 + }, + { + "auxiliary_loss_clip": 0.01172465, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.05187774, + "balance_loss_mlp": 1.02122211, + "epoch": 0.5241387602957975, + "flos": 22674680110080.0, + "grad_norm": 2.510978880339871, + "language_loss": 0.68147099, + "learning_rate": 1.940806800012929e-06, + "loss": 0.70349431, + "num_input_tokens_seen": 94121705, + "step": 4359, + "time_per_iteration": 2.470839738845825 + }, + { + "auxiliary_loss_clip": 0.01123043, + "auxiliary_loss_mlp": 0.00762611, + "balance_loss_clip": 1.04959846, + "balance_loss_mlp": 1.00106883, + "epoch": 0.5242590031864366, + "flos": 40553453134080.0, + "grad_norm": 1.4967025947925152, + "language_loss": 0.63473296, + "learning_rate": 1.9400281741233432e-06, + "loss": 0.65358949, + "num_input_tokens_seen": 94146595, + "step": 4360, + "time_per_iteration": 2.737917900085449 + }, + { + "auxiliary_loss_clip": 0.01041698, + "auxiliary_loss_mlp": 0.01001583, + "balance_loss_clip": 1.01749563, + "balance_loss_mlp": 1.00023603, + "epoch": 0.5243792460770756, + "flos": 66676313105280.0, + "grad_norm": 0.6732496805601916, + "language_loss": 0.52576208, + "learning_rate": 1.939249557331435e-06, + "loss": 0.54619491, + "num_input_tokens_seen": 94212410, + "step": 4361, + "time_per_iteration": 3.149351119995117 + }, + { + "auxiliary_loss_clip": 0.01148795, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.05136824, + "balance_loss_mlp": 1.02235687, + "epoch": 0.5244994889677148, + "flos": 28183663992960.0, + "grad_norm": 2.231990331705942, + "language_loss": 0.72760916, + "learning_rate": 1.938470949755321e-06, + "loss": 0.74939066, + "num_input_tokens_seen": 94232290, + "step": 4362, + "time_per_iteration": 2.660339832305908 + }, + { + "auxiliary_loss_clip": 0.01047338, + "auxiliary_loss_mlp": 0.01005105, + "balance_loss_clip": 1.01667476, + "balance_loss_mlp": 1.00374591, + "epoch": 0.5246197318583539, + "flos": 65950379239680.0, + "grad_norm": 0.8145100321220925, + "language_loss": 0.55693346, + "learning_rate": 1.937692351513115e-06, + "loss": 0.5774579, + "num_input_tokens_seen": 94291285, + "step": 4363, + "time_per_iteration": 3.108194351196289 + }, + { + "auxiliary_loss_clip": 0.01173031, + "auxiliary_loss_mlp": 0.01026248, + "balance_loss_clip": 1.05313635, + "balance_loss_mlp": 1.01860976, + "epoch": 0.5247399747489929, + "flos": 21033490769280.0, + "grad_norm": 1.6126490413385668, + "language_loss": 0.80519438, + "learning_rate": 1.9369137627229297e-06, + "loss": 0.82718718, + "num_input_tokens_seen": 94309685, + "step": 4364, + "time_per_iteration": 4.030983209609985 + }, + { + "auxiliary_loss_clip": 0.01166897, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.05255151, + "balance_loss_mlp": 1.02319908, + "epoch": 0.5248602176396321, + "flos": 19025940660480.0, + "grad_norm": 2.626236181391415, + "language_loss": 0.88134658, + "learning_rate": 1.936135183502877e-06, + "loss": 0.90332758, + "num_input_tokens_seen": 94326985, + "step": 4365, + "time_per_iteration": 2.464048147201538 + }, + { + "auxiliary_loss_clip": 0.01143226, + "auxiliary_loss_mlp": 0.01026643, + "balance_loss_clip": 1.04927337, + "balance_loss_mlp": 1.01867342, + "epoch": 0.5249804605302711, + "flos": 22200084685440.0, + "grad_norm": 2.5846829207530457, + "language_loss": 0.80609226, + "learning_rate": 1.935356613971066e-06, + "loss": 0.82779092, + "num_input_tokens_seen": 94347645, + "step": 4366, + "time_per_iteration": 3.446340322494507 + }, + { + "auxiliary_loss_clip": 0.0115176, + "auxiliary_loss_mlp": 0.00762595, + "balance_loss_clip": 1.05065823, + "balance_loss_mlp": 1.00113249, + "epoch": 0.5251007034209102, + "flos": 23805686626560.0, + "grad_norm": 1.9076958929282737, + "language_loss": 0.76651895, + "learning_rate": 1.9345780542456047e-06, + "loss": 0.78566247, + "num_input_tokens_seen": 94367020, + "step": 4367, + "time_per_iteration": 2.567326784133911 + }, + { + "auxiliary_loss_clip": 0.01157517, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.0504818, + "balance_loss_mlp": 1.02108955, + "epoch": 0.5252209463115494, + "flos": 23294605962240.0, + "grad_norm": 2.2217012946475085, + "language_loss": 0.71329939, + "learning_rate": 1.9337995044446007e-06, + "loss": 0.73516238, + "num_input_tokens_seen": 94385860, + "step": 4368, + "time_per_iteration": 2.519347667694092 + }, + { + "auxiliary_loss_clip": 0.01171477, + "auxiliary_loss_mlp": 0.01026357, + "balance_loss_clip": 1.05239832, + "balance_loss_mlp": 1.01835811, + "epoch": 0.5253411892021884, + "flos": 19828687760640.0, + "grad_norm": 2.073648139722042, + "language_loss": 0.79997104, + "learning_rate": 1.9330209646861596e-06, + "loss": 0.82194936, + "num_input_tokens_seen": 94405010, + "step": 4369, + "time_per_iteration": 2.5079355239868164 + }, + { + "auxiliary_loss_clip": 0.01151731, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.05069685, + "balance_loss_mlp": 1.02455926, + "epoch": 0.5254614320928275, + "flos": 24133730561280.0, + "grad_norm": 1.6493022873558139, + "language_loss": 0.77719051, + "learning_rate": 1.9322424350883843e-06, + "loss": 0.79902804, + "num_input_tokens_seen": 94426845, + "step": 4370, + "time_per_iteration": 2.5976126194000244 + }, + { + "auxiliary_loss_clip": 0.01153841, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.04950559, + "balance_loss_mlp": 1.02216864, + "epoch": 0.5255816749834666, + "flos": 24644954880000.0, + "grad_norm": 1.7228476329127531, + "language_loss": 0.7877599, + "learning_rate": 1.931463915769379e-06, + "loss": 0.80959713, + "num_input_tokens_seen": 94446960, + "step": 4371, + "time_per_iteration": 2.543860673904419 + }, + { + "auxiliary_loss_clip": 0.01120186, + "auxiliary_loss_mlp": 0.010253, + "balance_loss_clip": 1.04497468, + "balance_loss_mlp": 1.01761103, + "epoch": 0.5257019178741057, + "flos": 14136595320960.0, + "grad_norm": 2.6994140715420394, + "language_loss": 0.73562908, + "learning_rate": 1.930685406847242e-06, + "loss": 0.75708389, + "num_input_tokens_seen": 94461535, + "step": 4372, + "time_per_iteration": 2.537132501602173 + }, + { + "auxiliary_loss_clip": 0.01148799, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.04890406, + "balance_loss_mlp": 1.01921749, + "epoch": 0.5258221607647448, + "flos": 23548961145600.0, + "grad_norm": 1.5745349545527902, + "language_loss": 0.81768429, + "learning_rate": 1.9299069084400734e-06, + "loss": 0.83943951, + "num_input_tokens_seen": 94482395, + "step": 4373, + "time_per_iteration": 2.564203977584839 + }, + { + "auxiliary_loss_clip": 0.01137133, + "auxiliary_loss_mlp": 0.01028755, + "balance_loss_clip": 1.05227959, + "balance_loss_mlp": 1.02054191, + "epoch": 0.5259424036553839, + "flos": 24966103403520.0, + "grad_norm": 2.3060975833085937, + "language_loss": 0.69821215, + "learning_rate": 1.9291284206659717e-06, + "loss": 0.71987104, + "num_input_tokens_seen": 94500580, + "step": 4374, + "time_per_iteration": 2.5788283348083496 + }, + { + "auxiliary_loss_clip": 0.01185334, + "auxiliary_loss_mlp": 0.01025906, + "balance_loss_clip": 1.05650413, + "balance_loss_mlp": 1.01746023, + "epoch": 0.526062646546023, + "flos": 28763908295040.0, + "grad_norm": 32.6708223012958, + "language_loss": 0.71544254, + "learning_rate": 1.928349943643032e-06, + "loss": 0.73755491, + "num_input_tokens_seen": 94519680, + "step": 4375, + "time_per_iteration": 2.499790906906128 + }, + { + "auxiliary_loss_clip": 0.01164888, + "auxiliary_loss_mlp": 0.01029742, + "balance_loss_clip": 1.05334389, + "balance_loss_mlp": 1.02164745, + "epoch": 0.526182889436662, + "flos": 22821375254400.0, + "grad_norm": 1.659188907504883, + "language_loss": 0.8147586, + "learning_rate": 1.9275714774893493e-06, + "loss": 0.83670485, + "num_input_tokens_seen": 94539135, + "step": 4376, + "time_per_iteration": 2.478975534439087 + }, + { + "auxiliary_loss_clip": 0.01127624, + "auxiliary_loss_mlp": 0.01026333, + "balance_loss_clip": 1.0454731, + "balance_loss_mlp": 1.01782775, + "epoch": 0.5263031323273012, + "flos": 22929466256640.0, + "grad_norm": 2.699573483562077, + "language_loss": 0.72838229, + "learning_rate": 1.9267930223230154e-06, + "loss": 0.74992192, + "num_input_tokens_seen": 94557610, + "step": 4377, + "time_per_iteration": 2.5464954376220703 + }, + { + "auxiliary_loss_clip": 0.01156617, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.05259264, + "balance_loss_mlp": 1.02022278, + "epoch": 0.5264233752179402, + "flos": 17748634049280.0, + "grad_norm": 2.222182154821806, + "language_loss": 0.78206849, + "learning_rate": 1.9260145782621224e-06, + "loss": 0.80391407, + "num_input_tokens_seen": 94575390, + "step": 4378, + "time_per_iteration": 2.4880621433258057 + }, + { + "auxiliary_loss_clip": 0.01150847, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.05250764, + "balance_loss_mlp": 1.01816416, + "epoch": 0.5265436181085793, + "flos": 24421626069120.0, + "grad_norm": 1.8244532395509638, + "language_loss": 0.87701607, + "learning_rate": 1.925236145424758e-06, + "loss": 0.89878643, + "num_input_tokens_seen": 94594210, + "step": 4379, + "time_per_iteration": 2.5409321784973145 + }, + { + "auxiliary_loss_clip": 0.01067786, + "auxiliary_loss_mlp": 0.01005428, + "balance_loss_clip": 1.01565421, + "balance_loss_mlp": 1.00417662, + "epoch": 0.5266638609992185, + "flos": 69207298156800.0, + "grad_norm": 0.6963769137815407, + "language_loss": 0.57572269, + "learning_rate": 1.924457723929012e-06, + "loss": 0.59645486, + "num_input_tokens_seen": 94665020, + "step": 4380, + "time_per_iteration": 3.177259922027588 + }, + { + "auxiliary_loss_clip": 0.01166768, + "auxiliary_loss_mlp": 0.01025248, + "balance_loss_clip": 1.05211544, + "balance_loss_mlp": 1.01733267, + "epoch": 0.5267841038898575, + "flos": 20738699850240.0, + "grad_norm": 1.7068679165038712, + "language_loss": 0.8286531, + "learning_rate": 1.9236793138929685e-06, + "loss": 0.8505733, + "num_input_tokens_seen": 94684290, + "step": 4381, + "time_per_iteration": 2.4763705730438232 + }, + { + "auxiliary_loss_clip": 0.01171154, + "auxiliary_loss_mlp": 0.01025419, + "balance_loss_clip": 1.05102527, + "balance_loss_mlp": 1.01770675, + "epoch": 0.5269043467804966, + "flos": 17234392988160.0, + "grad_norm": 2.2761723342074207, + "language_loss": 0.80987442, + "learning_rate": 1.9229009154347133e-06, + "loss": 0.83184016, + "num_input_tokens_seen": 94701880, + "step": 4382, + "time_per_iteration": 2.4462623596191406 + }, + { + "auxiliary_loss_clip": 0.01110426, + "auxiliary_loss_mlp": 0.00762575, + "balance_loss_clip": 1.04479027, + "balance_loss_mlp": 1.00120449, + "epoch": 0.5270245896711357, + "flos": 18223157646720.0, + "grad_norm": 7.608576127665937, + "language_loss": 0.80614233, + "learning_rate": 1.922122528672327e-06, + "loss": 0.82487231, + "num_input_tokens_seen": 94720545, + "step": 4383, + "time_per_iteration": 2.581200122833252 + }, + { + "auxiliary_loss_clip": 0.01177863, + "auxiliary_loss_mlp": 0.0102533, + "balance_loss_clip": 1.05254388, + "balance_loss_mlp": 1.01771832, + "epoch": 0.5271448325617748, + "flos": 21287558643840.0, + "grad_norm": 2.4672435868029474, + "language_loss": 0.78157574, + "learning_rate": 1.9213441537238914e-06, + "loss": 0.80360764, + "num_input_tokens_seen": 94737420, + "step": 4384, + "time_per_iteration": 2.4447977542877197 + }, + { + "auxiliary_loss_clip": 0.01027072, + "auxiliary_loss_mlp": 0.0100782, + "balance_loss_clip": 1.01702309, + "balance_loss_mlp": 1.00661588, + "epoch": 0.5272650754524139, + "flos": 65495497403520.0, + "grad_norm": 0.8333042258178599, + "language_loss": 0.57359612, + "learning_rate": 1.920565790707485e-06, + "loss": 0.59394503, + "num_input_tokens_seen": 94802810, + "step": 4385, + "time_per_iteration": 3.2777605056762695 + }, + { + "auxiliary_loss_clip": 0.01132186, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.04733479, + "balance_loss_mlp": 1.01870632, + "epoch": 0.527385318343053, + "flos": 19676426008320.0, + "grad_norm": 2.4214297296905682, + "language_loss": 0.65829968, + "learning_rate": 1.9197874397411853e-06, + "loss": 0.67989349, + "num_input_tokens_seen": 94819440, + "step": 4386, + "time_per_iteration": 2.580073833465576 + }, + { + "auxiliary_loss_clip": 0.01139363, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.04558492, + "balance_loss_mlp": 1.0260911, + "epoch": 0.5275055612336921, + "flos": 12712018947840.0, + "grad_norm": 3.106086952791269, + "language_loss": 0.67557997, + "learning_rate": 1.919009100943067e-06, + "loss": 0.69732171, + "num_input_tokens_seen": 94835130, + "step": 4387, + "time_per_iteration": 2.5095889568328857 + }, + { + "auxiliary_loss_clip": 0.01136305, + "auxiliary_loss_mlp": 0.01025636, + "balance_loss_clip": 1.04874361, + "balance_loss_mlp": 1.01719594, + "epoch": 0.5276258041243311, + "flos": 17749029098880.0, + "grad_norm": 2.4335551504708817, + "language_loss": 0.65859038, + "learning_rate": 1.9182307744312043e-06, + "loss": 0.68020982, + "num_input_tokens_seen": 94852235, + "step": 4388, + "time_per_iteration": 2.569767951965332 + }, + { + "auxiliary_loss_clip": 0.01155384, + "auxiliary_loss_mlp": 0.0103346, + "balance_loss_clip": 1.0485301, + "balance_loss_mlp": 1.025419, + "epoch": 0.5277460470149702, + "flos": 22710447077760.0, + "grad_norm": 4.011718417676472, + "language_loss": 0.76266617, + "learning_rate": 1.9174524603236676e-06, + "loss": 0.7845546, + "num_input_tokens_seen": 94871185, + "step": 4389, + "time_per_iteration": 2.5820553302764893 + }, + { + "auxiliary_loss_clip": 0.01154322, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.05065286, + "balance_loss_mlp": 1.02052641, + "epoch": 0.5278662899056094, + "flos": 19902699734400.0, + "grad_norm": 2.10730270892603, + "language_loss": 0.76341343, + "learning_rate": 1.916674158738527e-06, + "loss": 0.78524917, + "num_input_tokens_seen": 94890090, + "step": 4390, + "time_per_iteration": 2.5181922912597656 + }, + { + "auxiliary_loss_clip": 0.01133418, + "auxiliary_loss_mlp": 0.00763598, + "balance_loss_clip": 1.04951572, + "balance_loss_mlp": 1.00114989, + "epoch": 0.5279865327962484, + "flos": 18005215875840.0, + "grad_norm": 1.9134427740746471, + "language_loss": 0.59907007, + "learning_rate": 1.9158958697938506e-06, + "loss": 0.6180402, + "num_input_tokens_seen": 94908470, + "step": 4391, + "time_per_iteration": 4.1151885986328125 + }, + { + "auxiliary_loss_clip": 0.01147657, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.04814625, + "balance_loss_mlp": 1.02211702, + "epoch": 0.5281067756868875, + "flos": 15924443892480.0, + "grad_norm": 2.883107786201956, + "language_loss": 0.86281049, + "learning_rate": 1.9151175936077032e-06, + "loss": 0.88459122, + "num_input_tokens_seen": 94923440, + "step": 4392, + "time_per_iteration": 3.311213493347168 + }, + { + "auxiliary_loss_clip": 0.01161511, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.05076575, + "balance_loss_mlp": 1.0247426, + "epoch": 0.5282270185775266, + "flos": 19426488197760.0, + "grad_norm": 1.6110645086540971, + "language_loss": 0.79032302, + "learning_rate": 1.9143393302981507e-06, + "loss": 0.81226891, + "num_input_tokens_seen": 94941125, + "step": 4393, + "time_per_iteration": 3.388991117477417 + }, + { + "auxiliary_loss_clip": 0.01156142, + "auxiliary_loss_mlp": 0.01025772, + "balance_loss_clip": 1.05051792, + "balance_loss_mlp": 1.01767802, + "epoch": 0.5283472614681657, + "flos": 16399613934720.0, + "grad_norm": 1.8923963440601597, + "language_loss": 0.83449268, + "learning_rate": 1.913561079983252e-06, + "loss": 0.8563118, + "num_input_tokens_seen": 94959950, + "step": 4394, + "time_per_iteration": 2.570316791534424 + }, + { + "auxiliary_loss_clip": 0.01157031, + "auxiliary_loss_mlp": 0.01037442, + "balance_loss_clip": 1.04950237, + "balance_loss_mlp": 1.02820921, + "epoch": 0.5284675043588047, + "flos": 26760524163840.0, + "grad_norm": 2.4581185546966493, + "language_loss": 0.745031, + "learning_rate": 1.9127828427810693e-06, + "loss": 0.76697576, + "num_input_tokens_seen": 94980515, + "step": 4395, + "time_per_iteration": 2.5850462913513184 + }, + { + "auxiliary_loss_clip": 0.0114747, + "auxiliary_loss_mlp": 0.01026292, + "balance_loss_clip": 1.04999077, + "balance_loss_mlp": 1.01816833, + "epoch": 0.5285877472494439, + "flos": 19899898473600.0, + "grad_norm": 1.9338376381925149, + "language_loss": 0.80929911, + "learning_rate": 1.9120046188096607e-06, + "loss": 0.83103669, + "num_input_tokens_seen": 94998560, + "step": 4396, + "time_per_iteration": 2.563671350479126 + }, + { + "auxiliary_loss_clip": 0.01152597, + "auxiliary_loss_mlp": 0.01037721, + "balance_loss_clip": 1.0524354, + "balance_loss_mlp": 1.02982283, + "epoch": 0.528707990140083, + "flos": 20011257613440.0, + "grad_norm": 2.361219798184842, + "language_loss": 0.74140781, + "learning_rate": 1.9112264081870804e-06, + "loss": 0.76331103, + "num_input_tokens_seen": 95016950, + "step": 4397, + "time_per_iteration": 2.5236756801605225 + }, + { + "auxiliary_loss_clip": 0.0113586, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.04961538, + "balance_loss_mlp": 1.02183819, + "epoch": 0.528828233030722, + "flos": 20667956014080.0, + "grad_norm": 2.0903332801120382, + "language_loss": 0.75370002, + "learning_rate": 1.9104482110313843e-06, + "loss": 0.77536368, + "num_input_tokens_seen": 95036540, + "step": 4398, + "time_per_iteration": 2.559474468231201 + }, + { + "auxiliary_loss_clip": 0.01165895, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.05265236, + "balance_loss_mlp": 1.02052546, + "epoch": 0.5289484759213612, + "flos": 25192448956800.0, + "grad_norm": 1.9683249211918432, + "language_loss": 0.74195766, + "learning_rate": 1.909670027460623e-06, + "loss": 0.7638998, + "num_input_tokens_seen": 95053840, + "step": 4399, + "time_per_iteration": 2.5017213821411133 + }, + { + "auxiliary_loss_clip": 0.01167721, + "auxiliary_loss_mlp": 0.01026918, + "balance_loss_clip": 1.05242908, + "balance_loss_mlp": 1.01882398, + "epoch": 0.5290687188120002, + "flos": 31139255715840.0, + "grad_norm": 1.7569308874175626, + "language_loss": 0.71554649, + "learning_rate": 1.908891857592847e-06, + "loss": 0.7374928, + "num_input_tokens_seen": 95074910, + "step": 4400, + "time_per_iteration": 2.5745108127593994 + }, + { + "auxiliary_loss_clip": 0.01130555, + "auxiliary_loss_mlp": 0.01024996, + "balance_loss_clip": 1.04911792, + "balance_loss_mlp": 1.01657403, + "epoch": 0.5291889617026393, + "flos": 20119851406080.0, + "grad_norm": 2.488590125434079, + "language_loss": 0.89692754, + "learning_rate": 1.9081137015461034e-06, + "loss": 0.91848308, + "num_input_tokens_seen": 95090985, + "step": 4401, + "time_per_iteration": 2.5047669410705566 + }, + { + "auxiliary_loss_clip": 0.01117889, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.04769731, + "balance_loss_mlp": 1.02053618, + "epoch": 0.5293092045932785, + "flos": 19643747610240.0, + "grad_norm": 1.86377370133453, + "language_loss": 0.90437102, + "learning_rate": 1.9073355594384383e-06, + "loss": 0.92583251, + "num_input_tokens_seen": 95109225, + "step": 4402, + "time_per_iteration": 2.5517418384552 + }, + { + "auxiliary_loss_clip": 0.01131576, + "auxiliary_loss_mlp": 0.01037721, + "balance_loss_clip": 1.04968023, + "balance_loss_mlp": 1.02951384, + "epoch": 0.5294294474839175, + "flos": 24317736958080.0, + "grad_norm": 2.3488641949383178, + "language_loss": 0.80717057, + "learning_rate": 1.906557431387895e-06, + "loss": 0.8288635, + "num_input_tokens_seen": 95128215, + "step": 4403, + "time_per_iteration": 2.5738296508789062 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.01028729, + "balance_loss_clip": 1.05125237, + "balance_loss_mlp": 1.01997948, + "epoch": 0.5295496903745566, + "flos": 18875941464960.0, + "grad_norm": 1.9517124533821018, + "language_loss": 0.78626925, + "learning_rate": 1.905779317512516e-06, + "loss": 0.80788851, + "num_input_tokens_seen": 95145760, + "step": 4404, + "time_per_iteration": 2.5248570442199707 + }, + { + "auxiliary_loss_clip": 0.01165209, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.05151308, + "balance_loss_mlp": 1.01994538, + "epoch": 0.5296699332651957, + "flos": 20923101296640.0, + "grad_norm": 2.358912235853505, + "language_loss": 0.80307734, + "learning_rate": 1.9050012179303385e-06, + "loss": 0.82500744, + "num_input_tokens_seen": 95164270, + "step": 4405, + "time_per_iteration": 2.4780194759368896 + }, + { + "auxiliary_loss_clip": 0.01168365, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.05102921, + "balance_loss_mlp": 1.02068853, + "epoch": 0.5297901761558348, + "flos": 22046745525120.0, + "grad_norm": 2.279102009311256, + "language_loss": 0.68780565, + "learning_rate": 1.904223132759401e-06, + "loss": 0.70977986, + "num_input_tokens_seen": 95182870, + "step": 4406, + "time_per_iteration": 2.5144782066345215 + }, + { + "auxiliary_loss_clip": 0.0116815, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.051741, + "balance_loss_mlp": 1.02371812, + "epoch": 0.5299104190464738, + "flos": 21798495653760.0, + "grad_norm": 2.125406268071738, + "language_loss": 0.68898296, + "learning_rate": 1.9034450621177383e-06, + "loss": 0.71098596, + "num_input_tokens_seen": 95201190, + "step": 4407, + "time_per_iteration": 2.472564220428467 + }, + { + "auxiliary_loss_clip": 0.01166886, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.05335808, + "balance_loss_mlp": 1.03007913, + "epoch": 0.530030661937113, + "flos": 14720790119040.0, + "grad_norm": 2.3552605050413478, + "language_loss": 0.7034905, + "learning_rate": 1.9026670061233824e-06, + "loss": 0.72554767, + "num_input_tokens_seen": 95218625, + "step": 4408, + "time_per_iteration": 2.494894027709961 + }, + { + "auxiliary_loss_clip": 0.01149674, + "auxiliary_loss_mlp": 0.01027477, + "balance_loss_clip": 1.05213428, + "balance_loss_mlp": 1.0196805, + "epoch": 0.5301509048277521, + "flos": 21251504367360.0, + "grad_norm": 1.767728858846868, + "language_loss": 0.80518186, + "learning_rate": 1.901888964894365e-06, + "loss": 0.82695335, + "num_input_tokens_seen": 95237665, + "step": 4409, + "time_per_iteration": 2.50641131401062 + }, + { + "auxiliary_loss_clip": 0.01182518, + "auxiliary_loss_mlp": 0.01028068, + "balance_loss_clip": 1.05249238, + "balance_loss_mlp": 1.02020645, + "epoch": 0.5302711477183911, + "flos": 25957058791680.0, + "grad_norm": 1.9139794338233997, + "language_loss": 0.67602336, + "learning_rate": 1.9011109385487134e-06, + "loss": 0.6981293, + "num_input_tokens_seen": 95258915, + "step": 4410, + "time_per_iteration": 2.537675619125366 + }, + { + "auxiliary_loss_clip": 0.01182643, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.05259919, + "balance_loss_mlp": 1.01967466, + "epoch": 0.5303913906090303, + "flos": 22273126992000.0, + "grad_norm": 4.057977505712608, + "language_loss": 0.6681096, + "learning_rate": 1.900332927204454e-06, + "loss": 0.69021726, + "num_input_tokens_seen": 95277365, + "step": 4411, + "time_per_iteration": 2.4944915771484375 + }, + { + "auxiliary_loss_clip": 0.01160002, + "auxiliary_loss_mlp": 0.01025494, + "balance_loss_clip": 1.05138469, + "balance_loss_mlp": 1.01728654, + "epoch": 0.5305116334996693, + "flos": 24936010784640.0, + "grad_norm": 5.327322798893866, + "language_loss": 0.76554096, + "learning_rate": 1.8995549309796097e-06, + "loss": 0.78739589, + "num_input_tokens_seen": 95296670, + "step": 4412, + "time_per_iteration": 2.533430814743042 + }, + { + "auxiliary_loss_clip": 0.01173836, + "auxiliary_loss_mlp": 0.01028694, + "balance_loss_clip": 1.05372405, + "balance_loss_mlp": 1.02088022, + "epoch": 0.5306318763903084, + "flos": 20189338266240.0, + "grad_norm": 1.9570650245163836, + "language_loss": 0.76490188, + "learning_rate": 1.8987769499922028e-06, + "loss": 0.78692722, + "num_input_tokens_seen": 95315640, + "step": 4413, + "time_per_iteration": 2.475717306137085 + }, + { + "auxiliary_loss_clip": 0.01168029, + "auxiliary_loss_mlp": 0.00762759, + "balance_loss_clip": 1.0529741, + "balance_loss_mlp": 1.00124025, + "epoch": 0.5307521192809476, + "flos": 20266366982400.0, + "grad_norm": 3.042790432287652, + "language_loss": 0.70374823, + "learning_rate": 1.897998984360252e-06, + "loss": 0.72305608, + "num_input_tokens_seen": 95334610, + "step": 4414, + "time_per_iteration": 2.488725185394287 + }, + { + "auxiliary_loss_clip": 0.01150147, + "auxiliary_loss_mlp": 0.01026169, + "balance_loss_clip": 1.05038333, + "balance_loss_mlp": 1.01812255, + "epoch": 0.5308723621715866, + "flos": 28844276976000.0, + "grad_norm": 2.1946286394309973, + "language_loss": 0.78559935, + "learning_rate": 1.897221034201775e-06, + "loss": 0.8073625, + "num_input_tokens_seen": 95358350, + "step": 4415, + "time_per_iteration": 2.5908243656158447 + }, + { + "auxiliary_loss_clip": 0.01138438, + "auxiliary_loss_mlp": 0.01028648, + "balance_loss_clip": 1.04644513, + "balance_loss_mlp": 1.02116442, + "epoch": 0.5309926050622257, + "flos": 27457766040960.0, + "grad_norm": 1.8419973403779202, + "language_loss": 0.66721576, + "learning_rate": 1.8964430996347842e-06, + "loss": 0.68888664, + "num_input_tokens_seen": 95379900, + "step": 4416, + "time_per_iteration": 2.6077888011932373 + }, + { + "auxiliary_loss_clip": 0.01152794, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.05084872, + "balance_loss_mlp": 1.01702607, + "epoch": 0.5311128479528648, + "flos": 20514545026560.0, + "grad_norm": 1.7241393429033296, + "language_loss": 0.82737809, + "learning_rate": 1.8956651807772931e-06, + "loss": 0.84916061, + "num_input_tokens_seen": 95397935, + "step": 4417, + "time_per_iteration": 3.2439565658569336 + }, + { + "auxiliary_loss_clip": 0.01165147, + "auxiliary_loss_mlp": 0.01023407, + "balance_loss_clip": 1.05209279, + "balance_loss_mlp": 1.01624262, + "epoch": 0.5312330908435039, + "flos": 21397660807680.0, + "grad_norm": 1.6648788575911593, + "language_loss": 0.83845484, + "learning_rate": 1.8948872777473115e-06, + "loss": 0.86034036, + "num_input_tokens_seen": 95415890, + "step": 4418, + "time_per_iteration": 3.20731520652771 + }, + { + "auxiliary_loss_clip": 0.01153735, + "auxiliary_loss_mlp": 0.01026408, + "balance_loss_clip": 1.051337, + "balance_loss_mlp": 1.01885629, + "epoch": 0.531353333734143, + "flos": 24717350741760.0, + "grad_norm": 1.6019822316909011, + "language_loss": 0.63091469, + "learning_rate": 1.8941093906628458e-06, + "loss": 0.65271616, + "num_input_tokens_seen": 95433675, + "step": 4419, + "time_per_iteration": 3.2504990100860596 + }, + { + "auxiliary_loss_clip": 0.01146064, + "auxiliary_loss_mlp": 0.01023073, + "balance_loss_clip": 1.04823279, + "balance_loss_mlp": 1.01572633, + "epoch": 0.531473576624782, + "flos": 30480689808000.0, + "grad_norm": 1.8591179351712759, + "language_loss": 0.70790291, + "learning_rate": 1.893331519641902e-06, + "loss": 0.72959429, + "num_input_tokens_seen": 95455820, + "step": 4420, + "time_per_iteration": 3.283900737762451 + }, + { + "auxiliary_loss_clip": 0.01124245, + "auxiliary_loss_mlp": 0.01026812, + "balance_loss_clip": 1.04288077, + "balance_loss_mlp": 1.01865196, + "epoch": 0.5315938195154212, + "flos": 23002975440000.0, + "grad_norm": 2.2594464091319955, + "language_loss": 0.73784113, + "learning_rate": 1.8925536648024815e-06, + "loss": 0.75935173, + "num_input_tokens_seen": 95473240, + "step": 4421, + "time_per_iteration": 2.5310518741607666 + }, + { + "auxiliary_loss_clip": 0.01181939, + "auxiliary_loss_mlp": 0.01026557, + "balance_loss_clip": 1.05352044, + "balance_loss_mlp": 1.01831985, + "epoch": 0.5317140624060602, + "flos": 22748584343040.0, + "grad_norm": 1.8375473507144116, + "language_loss": 0.75776064, + "learning_rate": 1.8917758262625849e-06, + "loss": 0.7798456, + "num_input_tokens_seen": 95493480, + "step": 4422, + "time_per_iteration": 2.4786200523376465 + }, + { + "auxiliary_loss_clip": 0.01148672, + "auxiliary_loss_mlp": 0.01028077, + "balance_loss_clip": 1.05159426, + "balance_loss_mlp": 1.02028084, + "epoch": 0.5318343052966993, + "flos": 22821087945600.0, + "grad_norm": 1.7888072565595303, + "language_loss": 0.80847311, + "learning_rate": 1.8909980041402089e-06, + "loss": 0.83024061, + "num_input_tokens_seen": 95512075, + "step": 4423, + "time_per_iteration": 2.52940034866333 + }, + { + "auxiliary_loss_clip": 0.01162262, + "auxiliary_loss_mlp": 0.01025353, + "balance_loss_clip": 1.05005407, + "balance_loss_mlp": 1.01700187, + "epoch": 0.5319545481873384, + "flos": 13626089274240.0, + "grad_norm": 2.880800449731631, + "language_loss": 0.65635931, + "learning_rate": 1.8902201985533494e-06, + "loss": 0.67823547, + "num_input_tokens_seen": 95529340, + "step": 4424, + "time_per_iteration": 2.456606149673462 + }, + { + "auxiliary_loss_clip": 0.01152908, + "auxiliary_loss_mlp": 0.01021419, + "balance_loss_clip": 1.05107999, + "balance_loss_mlp": 1.01403427, + "epoch": 0.5320747910779775, + "flos": 22162522037760.0, + "grad_norm": 3.2334807281671516, + "language_loss": 0.75047672, + "learning_rate": 1.8894424096199983e-06, + "loss": 0.77222002, + "num_input_tokens_seen": 95548545, + "step": 4425, + "time_per_iteration": 2.5400710105895996 + }, + { + "auxiliary_loss_clip": 0.01172106, + "auxiliary_loss_mlp": 0.01026894, + "balance_loss_clip": 1.05597305, + "balance_loss_mlp": 1.01837683, + "epoch": 0.5321950339686166, + "flos": 18588081870720.0, + "grad_norm": 2.3230786749555397, + "language_loss": 0.85987991, + "learning_rate": 1.8886646374581463e-06, + "loss": 0.88186991, + "num_input_tokens_seen": 95567770, + "step": 4426, + "time_per_iteration": 2.4725961685180664 + }, + { + "auxiliary_loss_clip": 0.01165539, + "auxiliary_loss_mlp": 0.01025414, + "balance_loss_clip": 1.05070055, + "balance_loss_mlp": 1.01694989, + "epoch": 0.5323152768592557, + "flos": 22856818999680.0, + "grad_norm": 1.6502334666976557, + "language_loss": 0.70900762, + "learning_rate": 1.8878868821857795e-06, + "loss": 0.7309171, + "num_input_tokens_seen": 95587420, + "step": 4427, + "time_per_iteration": 2.4756345748901367 + }, + { + "auxiliary_loss_clip": 0.01119453, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.04341364, + "balance_loss_mlp": 1.0219903, + "epoch": 0.5324355197498948, + "flos": 33948690998400.0, + "grad_norm": 2.4183938850847757, + "language_loss": 0.75045604, + "learning_rate": 1.8871091439208838e-06, + "loss": 0.77195722, + "num_input_tokens_seen": 95609030, + "step": 4428, + "time_per_iteration": 2.6782219409942627 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.04757094, + "balance_loss_mlp": 1.0203234, + "epoch": 0.5325557626405338, + "flos": 23256720092160.0, + "grad_norm": 2.2979828491624694, + "language_loss": 0.77060479, + "learning_rate": 1.8863314227814414e-06, + "loss": 0.79212898, + "num_input_tokens_seen": 95627340, + "step": 4429, + "time_per_iteration": 2.5854806900024414 + }, + { + "auxiliary_loss_clip": 0.01172595, + "auxiliary_loss_mlp": 0.01028662, + "balance_loss_clip": 1.05429411, + "balance_loss_mlp": 1.02038002, + "epoch": 0.532676005531173, + "flos": 26718687797760.0, + "grad_norm": 2.205799361193026, + "language_loss": 0.47999921, + "learning_rate": 1.8855537188854313e-06, + "loss": 0.50201172, + "num_input_tokens_seen": 95646315, + "step": 4430, + "time_per_iteration": 2.5468027591705322 + }, + { + "auxiliary_loss_clip": 0.01166802, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.04866958, + "balance_loss_mlp": 1.02152383, + "epoch": 0.5327962484218121, + "flos": 17894610921600.0, + "grad_norm": 2.1212300534062076, + "language_loss": 0.78394997, + "learning_rate": 1.8847760323508315e-06, + "loss": 0.80591059, + "num_input_tokens_seen": 95665220, + "step": 4431, + "time_per_iteration": 2.457033157348633 + }, + { + "auxiliary_loss_clip": 0.01149642, + "auxiliary_loss_mlp": 0.01026625, + "balance_loss_clip": 1.05159402, + "balance_loss_mlp": 1.01907921, + "epoch": 0.5329164913124511, + "flos": 17925385898880.0, + "grad_norm": 1.6984632724484208, + "language_loss": 0.75193715, + "learning_rate": 1.883998363295616e-06, + "loss": 0.77369988, + "num_input_tokens_seen": 95682700, + "step": 4432, + "time_per_iteration": 2.4856202602386475 + }, + { + "auxiliary_loss_clip": 0.01054802, + "auxiliary_loss_mlp": 0.01012925, + "balance_loss_clip": 1.01593018, + "balance_loss_mlp": 1.0115304, + "epoch": 0.5330367342030903, + "flos": 57254178781440.0, + "grad_norm": 0.873782215851889, + "language_loss": 0.62620705, + "learning_rate": 1.8832207118377565e-06, + "loss": 0.64688432, + "num_input_tokens_seen": 95738070, + "step": 4433, + "time_per_iteration": 2.997492790222168 + }, + { + "auxiliary_loss_clip": 0.01178915, + "auxiliary_loss_mlp": 0.01023193, + "balance_loss_clip": 1.05262101, + "balance_loss_mlp": 1.01598108, + "epoch": 0.5331569770937293, + "flos": 17420518287360.0, + "grad_norm": 2.3909725048453927, + "language_loss": 0.6971131, + "learning_rate": 1.882443078095222e-06, + "loss": 0.71913415, + "num_input_tokens_seen": 95756950, + "step": 4434, + "time_per_iteration": 2.4277052879333496 + }, + { + "auxiliary_loss_clip": 0.01043501, + "auxiliary_loss_mlp": 0.01008337, + "balance_loss_clip": 1.02150393, + "balance_loss_mlp": 1.00704992, + "epoch": 0.5332772199843684, + "flos": 56750783627520.0, + "grad_norm": 0.8563343392759823, + "language_loss": 0.66777635, + "learning_rate": 1.8816654621859794e-06, + "loss": 0.68829471, + "num_input_tokens_seen": 95816615, + "step": 4435, + "time_per_iteration": 3.048809051513672 + }, + { + "auxiliary_loss_clip": 0.01178416, + "auxiliary_loss_mlp": 0.01027137, + "balance_loss_clip": 1.05231166, + "balance_loss_mlp": 1.01881623, + "epoch": 0.5333974628750076, + "flos": 18697753071360.0, + "grad_norm": 2.531137939472157, + "language_loss": 0.72115844, + "learning_rate": 1.8808878642279915e-06, + "loss": 0.74321395, + "num_input_tokens_seen": 95832020, + "step": 4436, + "time_per_iteration": 2.438107490539551 + }, + { + "auxiliary_loss_clip": 0.01138693, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.0445888, + "balance_loss_mlp": 1.02747083, + "epoch": 0.5335177057656466, + "flos": 23805507058560.0, + "grad_norm": 2.3893033150078846, + "language_loss": 0.65330637, + "learning_rate": 1.8801102843392209e-06, + "loss": 0.67505282, + "num_input_tokens_seen": 95851425, + "step": 4437, + "time_per_iteration": 2.562403917312622 + }, + { + "auxiliary_loss_clip": 0.01136116, + "auxiliary_loss_mlp": 0.01027327, + "balance_loss_clip": 1.04652905, + "balance_loss_mlp": 1.01963782, + "epoch": 0.5336379486562857, + "flos": 25078683605760.0, + "grad_norm": 1.5847753124846062, + "language_loss": 0.85069054, + "learning_rate": 1.8793327226376238e-06, + "loss": 0.87232494, + "num_input_tokens_seen": 95870745, + "step": 4438, + "time_per_iteration": 2.5854194164276123 + }, + { + "auxiliary_loss_clip": 0.01158695, + "auxiliary_loss_mlp": 0.01028295, + "balance_loss_clip": 1.0498935, + "balance_loss_mlp": 1.02056432, + "epoch": 0.5337581915469248, + "flos": 21396691140480.0, + "grad_norm": 2.6564979290678217, + "language_loss": 0.80291444, + "learning_rate": 1.8785551792411569e-06, + "loss": 0.82478434, + "num_input_tokens_seen": 95889755, + "step": 4439, + "time_per_iteration": 2.5167396068573 + }, + { + "auxiliary_loss_clip": 0.01155113, + "auxiliary_loss_mlp": 0.01027581, + "balance_loss_clip": 1.05270326, + "balance_loss_mlp": 1.02015162, + "epoch": 0.5338784344375639, + "flos": 14865905064960.0, + "grad_norm": 2.0411747008753762, + "language_loss": 0.82512909, + "learning_rate": 1.8777776542677733e-06, + "loss": 0.84695601, + "num_input_tokens_seen": 95907805, + "step": 4440, + "time_per_iteration": 2.4818034172058105 + }, + { + "auxiliary_loss_clip": 0.01136345, + "auxiliary_loss_mlp": 0.01025802, + "balance_loss_clip": 1.04566026, + "balance_loss_mlp": 1.01771951, + "epoch": 0.5339986773282029, + "flos": 20813501923200.0, + "grad_norm": 2.278333384203696, + "language_loss": 0.72941154, + "learning_rate": 1.8770001478354216e-06, + "loss": 0.75103301, + "num_input_tokens_seen": 95927480, + "step": 4441, + "time_per_iteration": 2.5426857471466064 + }, + { + "auxiliary_loss_clip": 0.01163173, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.05063379, + "balance_loss_mlp": 1.02447867, + "epoch": 0.5341189202188421, + "flos": 17969089772160.0, + "grad_norm": 2.35584756636217, + "language_loss": 0.83898842, + "learning_rate": 1.8762226600620504e-06, + "loss": 0.86094797, + "num_input_tokens_seen": 95946095, + "step": 4442, + "time_per_iteration": 2.4690327644348145 + }, + { + "auxiliary_loss_clip": 0.011591, + "auxiliary_loss_mlp": 0.01029346, + "balance_loss_clip": 1.04992437, + "balance_loss_mlp": 1.02093589, + "epoch": 0.5342391631094812, + "flos": 11031866328960.0, + "grad_norm": 4.333111074044416, + "language_loss": 0.5888921, + "learning_rate": 1.8754451910656031e-06, + "loss": 0.61077648, + "num_input_tokens_seen": 95959995, + "step": 4443, + "time_per_iteration": 2.4630236625671387 + }, + { + "auxiliary_loss_clip": 0.01132965, + "auxiliary_loss_mlp": 0.01028453, + "balance_loss_clip": 1.04879427, + "balance_loss_mlp": 1.02028465, + "epoch": 0.5343594060001202, + "flos": 15339135772800.0, + "grad_norm": 2.1441751530961817, + "language_loss": 0.82786071, + "learning_rate": 1.8746677409640212e-06, + "loss": 0.84947491, + "num_input_tokens_seen": 95977095, + "step": 4444, + "time_per_iteration": 3.397054433822632 + }, + { + "auxiliary_loss_clip": 0.01172803, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.05575728, + "balance_loss_mlp": 1.02014554, + "epoch": 0.5344796488907594, + "flos": 26900898514560.0, + "grad_norm": 1.800701380336439, + "language_loss": 0.84740818, + "learning_rate": 1.8738903098752432e-06, + "loss": 0.86941814, + "num_input_tokens_seen": 95996225, + "step": 4445, + "time_per_iteration": 3.284879207611084 + }, + { + "auxiliary_loss_clip": 0.01152678, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.0510509, + "balance_loss_mlp": 1.02784491, + "epoch": 0.5345998917813984, + "flos": 25411216740480.0, + "grad_norm": 2.156058667524017, + "language_loss": 0.73302984, + "learning_rate": 1.8731128979172052e-06, + "loss": 0.75490981, + "num_input_tokens_seen": 96015425, + "step": 4446, + "time_per_iteration": 3.274986743927002 + }, + { + "auxiliary_loss_clip": 0.0114757, + "auxiliary_loss_mlp": 0.01027206, + "balance_loss_clip": 1.04932046, + "balance_loss_mlp": 1.01976156, + "epoch": 0.5347201346720375, + "flos": 32853379622400.0, + "grad_norm": 3.222399003971551, + "language_loss": 0.67065066, + "learning_rate": 1.8723355052078394e-06, + "loss": 0.69239843, + "num_input_tokens_seen": 96035460, + "step": 4447, + "time_per_iteration": 2.6478161811828613 + }, + { + "auxiliary_loss_clip": 0.01162628, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_clip": 1.04895544, + "balance_loss_mlp": 1.02728713, + "epoch": 0.5348403775626767, + "flos": 17967940536960.0, + "grad_norm": 2.1291122819468598, + "language_loss": 0.7729466, + "learning_rate": 1.8715581318650765e-06, + "loss": 0.79492891, + "num_input_tokens_seen": 96054515, + "step": 4448, + "time_per_iteration": 2.487852096557617 + }, + { + "auxiliary_loss_clip": 0.01146365, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.04935896, + "balance_loss_mlp": 1.02294934, + "epoch": 0.5349606204533157, + "flos": 17603339535360.0, + "grad_norm": 2.889992685840572, + "language_loss": 0.81249058, + "learning_rate": 1.8707807780068422e-06, + "loss": 0.83427572, + "num_input_tokens_seen": 96072330, + "step": 4449, + "time_per_iteration": 2.563732862472534 + }, + { + "auxiliary_loss_clip": 0.01149448, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.04934025, + "balance_loss_mlp": 1.01850224, + "epoch": 0.5350808633439548, + "flos": 29167831710720.0, + "grad_norm": 2.1727404396100107, + "language_loss": 0.66295588, + "learning_rate": 1.8700034437510611e-06, + "loss": 0.68471134, + "num_input_tokens_seen": 96092425, + "step": 4450, + "time_per_iteration": 2.5766232013702393 + }, + { + "auxiliary_loss_clip": 0.01127966, + "auxiliary_loss_mlp": 0.01027068, + "balance_loss_clip": 1.04678559, + "balance_loss_mlp": 1.01858044, + "epoch": 0.5352011062345938, + "flos": 19499997381120.0, + "grad_norm": 2.3822127584123676, + "language_loss": 0.81699085, + "learning_rate": 1.8692261292156549e-06, + "loss": 0.83854127, + "num_input_tokens_seen": 96111660, + "step": 4451, + "time_per_iteration": 2.55892276763916 + }, + { + "auxiliary_loss_clip": 0.01181931, + "auxiliary_loss_mlp": 0.01026485, + "balance_loss_clip": 1.05653214, + "balance_loss_mlp": 1.01860499, + "epoch": 0.535321349125233, + "flos": 23477642691840.0, + "grad_norm": 2.2935499150949257, + "language_loss": 0.8088181, + "learning_rate": 1.8684488345185401e-06, + "loss": 0.83090222, + "num_input_tokens_seen": 96131835, + "step": 4452, + "time_per_iteration": 2.480742931365967 + }, + { + "auxiliary_loss_clip": 0.0118314, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.05538893, + "balance_loss_mlp": 1.0215379, + "epoch": 0.535441592015872, + "flos": 20478059786880.0, + "grad_norm": 2.311672007108172, + "language_loss": 0.78634214, + "learning_rate": 1.8676715597776332e-06, + "loss": 0.80847049, + "num_input_tokens_seen": 96150180, + "step": 4453, + "time_per_iteration": 2.457456111907959 + }, + { + "auxiliary_loss_clip": 0.01116353, + "auxiliary_loss_mlp": 0.01026681, + "balance_loss_clip": 1.04480052, + "balance_loss_mlp": 1.01907301, + "epoch": 0.5355618349065111, + "flos": 19573147428480.0, + "grad_norm": 1.8608685783224879, + "language_loss": 0.76036477, + "learning_rate": 1.8668943051108455e-06, + "loss": 0.78179514, + "num_input_tokens_seen": 96167485, + "step": 4454, + "time_per_iteration": 2.5600569248199463 + }, + { + "auxiliary_loss_clip": 0.01152367, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.04999495, + "balance_loss_mlp": 1.02357554, + "epoch": 0.5356820777971503, + "flos": 24024633978240.0, + "grad_norm": 2.2954003349570926, + "language_loss": 0.76471627, + "learning_rate": 1.8661170706360856e-06, + "loss": 0.78655726, + "num_input_tokens_seen": 96186650, + "step": 4455, + "time_per_iteration": 2.5176632404327393 + }, + { + "auxiliary_loss_clip": 0.01166563, + "auxiliary_loss_mlp": 0.0102504, + "balance_loss_clip": 1.05359674, + "balance_loss_mlp": 1.0179832, + "epoch": 0.5358023206877893, + "flos": 20884676722560.0, + "grad_norm": 1.5808035995590666, + "language_loss": 0.81569147, + "learning_rate": 1.8653398564712594e-06, + "loss": 0.8376075, + "num_input_tokens_seen": 96205595, + "step": 4456, + "time_per_iteration": 2.469604969024658 + }, + { + "auxiliary_loss_clip": 0.0116361, + "auxiliary_loss_mlp": 0.01024759, + "balance_loss_clip": 1.05248559, + "balance_loss_mlp": 1.01721931, + "epoch": 0.5359225635784284, + "flos": 22418996123520.0, + "grad_norm": 1.587726689887629, + "language_loss": 0.81957394, + "learning_rate": 1.8645626627342704e-06, + "loss": 0.84145761, + "num_input_tokens_seen": 96226360, + "step": 4457, + "time_per_iteration": 2.493288516998291 + }, + { + "auxiliary_loss_clip": 0.01170034, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.05204725, + "balance_loss_mlp": 1.02191269, + "epoch": 0.5360428064690675, + "flos": 24097784025600.0, + "grad_norm": 2.3720466875049198, + "language_loss": 0.80630279, + "learning_rate": 1.8637854895430172e-06, + "loss": 0.82829952, + "num_input_tokens_seen": 96245625, + "step": 4458, + "time_per_iteration": 2.4937102794647217 + }, + { + "auxiliary_loss_clip": 0.011298, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.04711342, + "balance_loss_mlp": 1.02034378, + "epoch": 0.5361630493597066, + "flos": 21434505183360.0, + "grad_norm": 1.9744224313535592, + "language_loss": 0.69446087, + "learning_rate": 1.8630083370153978e-06, + "loss": 0.71604788, + "num_input_tokens_seen": 96265265, + "step": 4459, + "time_per_iteration": 2.531064748764038 + }, + { + "auxiliary_loss_clip": 0.0102752, + "auxiliary_loss_mlp": 0.01001946, + "balance_loss_clip": 1.01630402, + "balance_loss_mlp": 1.00061119, + "epoch": 0.5362832922503457, + "flos": 68888696520960.0, + "grad_norm": 0.7440207525166017, + "language_loss": 0.55384731, + "learning_rate": 1.8622312052693041e-06, + "loss": 0.57414198, + "num_input_tokens_seen": 96326445, + "step": 4460, + "time_per_iteration": 3.3009867668151855 + }, + { + "auxiliary_loss_clip": 0.01156858, + "auxiliary_loss_mlp": 0.0102676, + "balance_loss_clip": 1.0469507, + "balance_loss_mlp": 1.01891613, + "epoch": 0.5364035351409848, + "flos": 9793702563840.0, + "grad_norm": 2.8696953745643694, + "language_loss": 0.71469814, + "learning_rate": 1.8614540944226267e-06, + "loss": 0.7365343, + "num_input_tokens_seen": 96343115, + "step": 4461, + "time_per_iteration": 2.465428113937378 + }, + { + "auxiliary_loss_clip": 0.01148153, + "auxiliary_loss_mlp": 0.01025142, + "balance_loss_clip": 1.05151033, + "balance_loss_mlp": 1.01813817, + "epoch": 0.5365237780316239, + "flos": 23290080848640.0, + "grad_norm": 1.8111629146824257, + "language_loss": 0.67873704, + "learning_rate": 1.8606770045932537e-06, + "loss": 0.70046997, + "num_input_tokens_seen": 96362230, + "step": 4462, + "time_per_iteration": 2.5317370891571045 + }, + { + "auxiliary_loss_clip": 0.011292, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.04349601, + "balance_loss_mlp": 1.01958203, + "epoch": 0.5366440209222629, + "flos": 26578133879040.0, + "grad_norm": 2.5118439951549107, + "language_loss": 0.8180576, + "learning_rate": 1.859899935899068e-06, + "loss": 0.83963352, + "num_input_tokens_seen": 96382085, + "step": 4463, + "time_per_iteration": 2.60148024559021 + }, + { + "auxiliary_loss_clip": 0.01149671, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.05157375, + "balance_loss_mlp": 1.02157474, + "epoch": 0.5367642638129021, + "flos": 19608052469760.0, + "grad_norm": 1.6788293891849773, + "language_loss": 0.78976035, + "learning_rate": 1.8591228884579506e-06, + "loss": 0.81155038, + "num_input_tokens_seen": 96400580, + "step": 4464, + "time_per_iteration": 2.5060160160064697 + }, + { + "auxiliary_loss_clip": 0.01141589, + "auxiliary_loss_mlp": 0.01024322, + "balance_loss_clip": 1.04877186, + "balance_loss_mlp": 1.01648426, + "epoch": 0.5368845067035412, + "flos": 23915214172800.0, + "grad_norm": 2.3668898281615816, + "language_loss": 0.82151246, + "learning_rate": 1.8583458623877795e-06, + "loss": 0.8431716, + "num_input_tokens_seen": 96419680, + "step": 4465, + "time_per_iteration": 2.596750020980835 + }, + { + "auxiliary_loss_clip": 0.01168994, + "auxiliary_loss_mlp": 0.01027816, + "balance_loss_clip": 1.05391896, + "balance_loss_mlp": 1.02016878, + "epoch": 0.5370047495941802, + "flos": 16873131951360.0, + "grad_norm": 1.8706443126688936, + "language_loss": 0.7411629, + "learning_rate": 1.8575688578064281e-06, + "loss": 0.76313096, + "num_input_tokens_seen": 96437805, + "step": 4466, + "time_per_iteration": 2.521068572998047 + }, + { + "auxiliary_loss_clip": 0.01168833, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.05360854, + "balance_loss_mlp": 1.02138186, + "epoch": 0.5371249924848194, + "flos": 20740926493440.0, + "grad_norm": 1.9177699885269366, + "language_loss": 0.7666772, + "learning_rate": 1.8567918748317674e-06, + "loss": 0.7886585, + "num_input_tokens_seen": 96457155, + "step": 4467, + "time_per_iteration": 2.4760682582855225 + }, + { + "auxiliary_loss_clip": 0.01133891, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.04365122, + "balance_loss_mlp": 1.02252495, + "epoch": 0.5372452353754584, + "flos": 17968120104960.0, + "grad_norm": 3.398077225285846, + "language_loss": 0.82791841, + "learning_rate": 1.8560149135816659e-06, + "loss": 0.84956026, + "num_input_tokens_seen": 96473990, + "step": 4468, + "time_per_iteration": 2.509683847427368 + }, + { + "auxiliary_loss_clip": 0.0116017, + "auxiliary_loss_mlp": 0.01023569, + "balance_loss_clip": 1.04819143, + "balance_loss_mlp": 1.01624644, + "epoch": 0.5373654782660975, + "flos": 15377021642880.0, + "grad_norm": 2.5908109237472323, + "language_loss": 0.8407495, + "learning_rate": 1.8552379741739873e-06, + "loss": 0.86258686, + "num_input_tokens_seen": 96491335, + "step": 4469, + "time_per_iteration": 2.460700273513794 + }, + { + "auxiliary_loss_clip": 0.01042836, + "auxiliary_loss_mlp": 0.00752903, + "balance_loss_clip": 1.01483274, + "balance_loss_mlp": 1.00090015, + "epoch": 0.5374857211567367, + "flos": 69000091574400.0, + "grad_norm": 0.8991451433473153, + "language_loss": 0.55699176, + "learning_rate": 1.8544610567265935e-06, + "loss": 0.57494915, + "num_input_tokens_seen": 96545275, + "step": 4470, + "time_per_iteration": 3.8337066173553467 + }, + { + "auxiliary_loss_clip": 0.0115257, + "auxiliary_loss_mlp": 0.00762349, + "balance_loss_clip": 1.05289447, + "balance_loss_mlp": 1.00147021, + "epoch": 0.5376059640473757, + "flos": 15085355207040.0, + "grad_norm": 2.436164943030122, + "language_loss": 0.83505356, + "learning_rate": 1.853684161357341e-06, + "loss": 0.85420281, + "num_input_tokens_seen": 96562935, + "step": 4471, + "time_per_iteration": 3.2223236560821533 + }, + { + "auxiliary_loss_clip": 0.01163988, + "auxiliary_loss_mlp": 0.00762695, + "balance_loss_clip": 1.05265892, + "balance_loss_mlp": 1.00135922, + "epoch": 0.5377262069380148, + "flos": 19792597570560.0, + "grad_norm": 2.418243336961385, + "language_loss": 0.76836157, + "learning_rate": 1.852907288184085e-06, + "loss": 0.78762841, + "num_input_tokens_seen": 96581820, + "step": 4472, + "time_per_iteration": 3.4164624214172363 + }, + { + "auxiliary_loss_clip": 0.01127032, + "auxiliary_loss_mlp": 0.01025845, + "balance_loss_clip": 1.04692602, + "balance_loss_mlp": 1.01701474, + "epoch": 0.5378464498286539, + "flos": 30003077640960.0, + "grad_norm": 1.9932689310831238, + "language_loss": 0.69816554, + "learning_rate": 1.8521304373246762e-06, + "loss": 0.71969432, + "num_input_tokens_seen": 96602865, + "step": 4473, + "time_per_iteration": 3.395699977874756 + }, + { + "auxiliary_loss_clip": 0.01168828, + "auxiliary_loss_mlp": 0.01027443, + "balance_loss_clip": 1.05119205, + "balance_loss_mlp": 1.01970673, + "epoch": 0.537966692719293, + "flos": 21251217058560.0, + "grad_norm": 2.715717982667294, + "language_loss": 0.88615435, + "learning_rate": 1.8513536088969626e-06, + "loss": 0.90811706, + "num_input_tokens_seen": 96620530, + "step": 4474, + "time_per_iteration": 2.4705841541290283 + }, + { + "auxiliary_loss_clip": 0.01169441, + "auxiliary_loss_mlp": 0.01035668, + "balance_loss_clip": 1.05489135, + "balance_loss_mlp": 1.0269959, + "epoch": 0.538086935609932, + "flos": 21543170803200.0, + "grad_norm": 2.411935974068831, + "language_loss": 0.8017168, + "learning_rate": 1.8505768030187884e-06, + "loss": 0.8237679, + "num_input_tokens_seen": 96640660, + "step": 4475, + "time_per_iteration": 2.511242151260376 + }, + { + "auxiliary_loss_clip": 0.01148048, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.05134559, + "balance_loss_mlp": 1.01977515, + "epoch": 0.5382071785005712, + "flos": 22747219626240.0, + "grad_norm": 1.5303400659328725, + "language_loss": 0.80071759, + "learning_rate": 1.849800019807995e-06, + "loss": 0.82246989, + "num_input_tokens_seen": 96661885, + "step": 4476, + "time_per_iteration": 2.5383715629577637 + }, + { + "auxiliary_loss_clip": 0.01137481, + "auxiliary_loss_mlp": 0.01027537, + "balance_loss_clip": 1.04975283, + "balance_loss_mlp": 1.01997638, + "epoch": 0.5383274213912103, + "flos": 24934574240640.0, + "grad_norm": 2.030652937609397, + "language_loss": 0.70839709, + "learning_rate": 1.8490232593824186e-06, + "loss": 0.73004729, + "num_input_tokens_seen": 96678340, + "step": 4477, + "time_per_iteration": 2.588932991027832 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.05218744, + "balance_loss_mlp": 1.02179849, + "epoch": 0.5384476642818493, + "flos": 22310186849280.0, + "grad_norm": 2.2097960629529787, + "language_loss": 0.84832579, + "learning_rate": 1.8482465218598935e-06, + "loss": 0.87011415, + "num_input_tokens_seen": 96698285, + "step": 4478, + "time_per_iteration": 2.520014762878418 + }, + { + "auxiliary_loss_clip": 0.01138474, + "auxiliary_loss_mlp": 0.01028232, + "balance_loss_clip": 1.04822946, + "balance_loss_mlp": 1.01999474, + "epoch": 0.5385679071724885, + "flos": 22711021695360.0, + "grad_norm": 1.794534266304845, + "language_loss": 0.83220768, + "learning_rate": 1.8474698073582508e-06, + "loss": 0.85387468, + "num_input_tokens_seen": 96719655, + "step": 4479, + "time_per_iteration": 2.5702717304229736 + }, + { + "auxiliary_loss_clip": 0.01142564, + "auxiliary_loss_mlp": 0.0102309, + "balance_loss_clip": 1.04761481, + "balance_loss_mlp": 1.01503491, + "epoch": 0.5386881500631275, + "flos": 15953746412160.0, + "grad_norm": 2.0856940446459697, + "language_loss": 0.86681885, + "learning_rate": 1.8466931159953166e-06, + "loss": 0.88847542, + "num_input_tokens_seen": 96736290, + "step": 4480, + "time_per_iteration": 2.507686138153076 + }, + { + "auxiliary_loss_clip": 0.01158149, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.05422151, + "balance_loss_mlp": 1.02182186, + "epoch": 0.5388083929537666, + "flos": 24060041809920.0, + "grad_norm": 1.7909882953380265, + "language_loss": 0.84159827, + "learning_rate": 1.8459164478889158e-06, + "loss": 0.86347616, + "num_input_tokens_seen": 96757685, + "step": 4481, + "time_per_iteration": 2.5737433433532715 + }, + { + "auxiliary_loss_clip": 0.01127227, + "auxiliary_loss_mlp": 0.01023334, + "balance_loss_clip": 1.04452252, + "balance_loss_mlp": 1.0159905, + "epoch": 0.5389286358444056, + "flos": 22236893147520.0, + "grad_norm": 1.9908483023790802, + "language_loss": 0.7573086, + "learning_rate": 1.8451398031568663e-06, + "loss": 0.7788142, + "num_input_tokens_seen": 96777310, + "step": 4482, + "time_per_iteration": 2.5474579334259033 + }, + { + "auxiliary_loss_clip": 0.01136851, + "auxiliary_loss_mlp": 0.01023734, + "balance_loss_clip": 1.04821825, + "balance_loss_mlp": 1.01594996, + "epoch": 0.5390488787350448, + "flos": 24281718595200.0, + "grad_norm": 1.7458847340846384, + "language_loss": 0.7461468, + "learning_rate": 1.844363181916986e-06, + "loss": 0.76775265, + "num_input_tokens_seen": 96798035, + "step": 4483, + "time_per_iteration": 2.60013484954834 + }, + { + "auxiliary_loss_clip": 0.01161793, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.0493331, + "balance_loss_mlp": 1.01982665, + "epoch": 0.5391691216256839, + "flos": 16581393688320.0, + "grad_norm": 1.9798652564074526, + "language_loss": 0.8273716, + "learning_rate": 1.8435865842870868e-06, + "loss": 0.84926546, + "num_input_tokens_seen": 96815975, + "step": 4484, + "time_per_iteration": 2.486957550048828 + }, + { + "auxiliary_loss_clip": 0.01141481, + "auxiliary_loss_mlp": 0.00762636, + "balance_loss_clip": 1.04560292, + "balance_loss_mlp": 1.00149214, + "epoch": 0.5392893645163229, + "flos": 23330049707520.0, + "grad_norm": 1.797415532974992, + "language_loss": 0.71912795, + "learning_rate": 1.8428100103849787e-06, + "loss": 0.73816907, + "num_input_tokens_seen": 96835770, + "step": 4485, + "time_per_iteration": 2.5560615062713623 + }, + { + "auxiliary_loss_clip": 0.01153619, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.05378699, + "balance_loss_mlp": 1.02335, + "epoch": 0.5394096074069621, + "flos": 15669801400320.0, + "grad_norm": 2.1175505467565423, + "language_loss": 0.73303509, + "learning_rate": 1.842033460328467e-06, + "loss": 0.75488913, + "num_input_tokens_seen": 96854490, + "step": 4486, + "time_per_iteration": 2.4711577892303467 + }, + { + "auxiliary_loss_clip": 0.01151764, + "auxiliary_loss_mlp": 0.0076254, + "balance_loss_clip": 1.04708695, + "balance_loss_mlp": 1.00162947, + "epoch": 0.5395298502976011, + "flos": 22893447893760.0, + "grad_norm": 1.9962810686072572, + "language_loss": 0.75186706, + "learning_rate": 1.8412569342353541e-06, + "loss": 0.7710101, + "num_input_tokens_seen": 96874645, + "step": 4487, + "time_per_iteration": 2.5599443912506104 + }, + { + "auxiliary_loss_clip": 0.01157741, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.05317414, + "balance_loss_mlp": 1.02276123, + "epoch": 0.5396500931882402, + "flos": 23842135952640.0, + "grad_norm": 1.934159714171677, + "language_loss": 0.84970576, + "learning_rate": 1.840480432223438e-06, + "loss": 0.87159455, + "num_input_tokens_seen": 96893650, + "step": 4488, + "time_per_iteration": 2.5142815113067627 + }, + { + "auxiliary_loss_clip": 0.01153572, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.04892564, + "balance_loss_mlp": 1.02329969, + "epoch": 0.5397703360788794, + "flos": 26322988596480.0, + "grad_norm": 2.0907229416479076, + "language_loss": 0.77833629, + "learning_rate": 1.8397039544105131e-06, + "loss": 0.80018634, + "num_input_tokens_seen": 96912735, + "step": 4489, + "time_per_iteration": 2.565319538116455 + }, + { + "auxiliary_loss_clip": 0.01146689, + "auxiliary_loss_mlp": 0.0102771, + "balance_loss_clip": 1.04718113, + "balance_loss_mlp": 1.01972938, + "epoch": 0.5398905789695184, + "flos": 21214588164480.0, + "grad_norm": 1.9882293368153572, + "language_loss": 0.69645512, + "learning_rate": 1.8389275009143711e-06, + "loss": 0.71819913, + "num_input_tokens_seen": 96932475, + "step": 4490, + "time_per_iteration": 2.5265297889709473 + }, + { + "auxiliary_loss_clip": 0.01176664, + "auxiliary_loss_mlp": 0.0102768, + "balance_loss_clip": 1.05095828, + "balance_loss_mlp": 1.02022386, + "epoch": 0.5400108218601575, + "flos": 25080335631360.0, + "grad_norm": 1.843695262404593, + "language_loss": 0.73444116, + "learning_rate": 1.8381510718527988e-06, + "loss": 0.75648457, + "num_input_tokens_seen": 96952085, + "step": 4491, + "time_per_iteration": 2.459085702896118 + }, + { + "auxiliary_loss_clip": 0.01153711, + "auxiliary_loss_mlp": 0.01028493, + "balance_loss_clip": 1.04716301, + "balance_loss_mlp": 1.0203805, + "epoch": 0.5401310647507966, + "flos": 26357498588160.0, + "grad_norm": 2.334171271640039, + "language_loss": 0.63179767, + "learning_rate": 1.8373746673435812e-06, + "loss": 0.65361965, + "num_input_tokens_seen": 96973110, + "step": 4492, + "time_per_iteration": 2.5258986949920654 + }, + { + "auxiliary_loss_clip": 0.01183041, + "auxiliary_loss_mlp": 0.01028149, + "balance_loss_clip": 1.05534458, + "balance_loss_mlp": 1.02014077, + "epoch": 0.5402513076414357, + "flos": 27855332749440.0, + "grad_norm": 1.8744696329345907, + "language_loss": 0.79152662, + "learning_rate": 1.8365982875044964e-06, + "loss": 0.81363857, + "num_input_tokens_seen": 96993420, + "step": 4493, + "time_per_iteration": 2.4828436374664307 + }, + { + "auxiliary_loss_clip": 0.01170389, + "auxiliary_loss_mlp": 0.00763276, + "balance_loss_clip": 1.05213094, + "balance_loss_mlp": 1.00141668, + "epoch": 0.5403715505320748, + "flos": 22893771116160.0, + "grad_norm": 4.628718218564786, + "language_loss": 0.75337684, + "learning_rate": 1.8358219324533217e-06, + "loss": 0.77271348, + "num_input_tokens_seen": 97013685, + "step": 4494, + "time_per_iteration": 2.47296142578125 + }, + { + "auxiliary_loss_clip": 0.01144856, + "auxiliary_loss_mlp": 0.01024882, + "balance_loss_clip": 1.04697871, + "balance_loss_mlp": 1.01784909, + "epoch": 0.5404917934227139, + "flos": 30224143895040.0, + "grad_norm": 1.6414650167581435, + "language_loss": 0.70341754, + "learning_rate": 1.8350456023078292e-06, + "loss": 0.72511488, + "num_input_tokens_seen": 97036060, + "step": 4495, + "time_per_iteration": 2.5930674076080322 + }, + { + "auxiliary_loss_clip": 0.0118374, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.05322742, + "balance_loss_mlp": 1.02726173, + "epoch": 0.540612036313353, + "flos": 19938502615680.0, + "grad_norm": 2.435815170406642, + "language_loss": 0.78209341, + "learning_rate": 1.8342692971857874e-06, + "loss": 0.8042872, + "num_input_tokens_seen": 97055260, + "step": 4496, + "time_per_iteration": 2.4164798259735107 + }, + { + "auxiliary_loss_clip": 0.01148818, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.04945624, + "balance_loss_mlp": 1.0221163, + "epoch": 0.540732279203992, + "flos": 24279599692800.0, + "grad_norm": 5.424152236670788, + "language_loss": 0.71089041, + "learning_rate": 1.833493017204962e-06, + "loss": 0.73267585, + "num_input_tokens_seen": 97075365, + "step": 4497, + "time_per_iteration": 3.2657666206359863 + }, + { + "auxiliary_loss_clip": 0.01179782, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.05265403, + "balance_loss_mlp": 1.02171206, + "epoch": 0.5408525220946312, + "flos": 20193216935040.0, + "grad_norm": 1.7882521358880945, + "language_loss": 0.77685893, + "learning_rate": 1.8327167624831134e-06, + "loss": 0.79895216, + "num_input_tokens_seen": 97093095, + "step": 4498, + "time_per_iteration": 3.2036824226379395 + }, + { + "auxiliary_loss_clip": 0.01179006, + "auxiliary_loss_mlp": 0.01024122, + "balance_loss_clip": 1.05291295, + "balance_loss_mlp": 1.0164752, + "epoch": 0.5409727649852702, + "flos": 24134448833280.0, + "grad_norm": 1.6065250699573972, + "language_loss": 0.70877832, + "learning_rate": 1.831940533137999e-06, + "loss": 0.73080957, + "num_input_tokens_seen": 97112000, + "step": 4499, + "time_per_iteration": 3.2055768966674805 + }, + { + "auxiliary_loss_clip": 0.01162711, + "auxiliary_loss_mlp": 0.01024721, + "balance_loss_clip": 1.05265903, + "balance_loss_mlp": 1.01712728, + "epoch": 0.5410930078759093, + "flos": 23912700220800.0, + "grad_norm": 1.7165346372108592, + "language_loss": 0.72393042, + "learning_rate": 1.8311643292873718e-06, + "loss": 0.74580467, + "num_input_tokens_seen": 97130820, + "step": 4500, + "time_per_iteration": 2.4859158992767334 + }, + { + "auxiliary_loss_clip": 0.01163365, + "auxiliary_loss_mlp": 0.01028019, + "balance_loss_clip": 1.05295861, + "balance_loss_mlp": 1.02089942, + "epoch": 0.5412132507665485, + "flos": 21105132445440.0, + "grad_norm": 1.8355334094986597, + "language_loss": 0.88118249, + "learning_rate": 1.8303881510489818e-06, + "loss": 0.90309632, + "num_input_tokens_seen": 97149210, + "step": 4501, + "time_per_iteration": 2.455223560333252 + }, + { + "auxiliary_loss_clip": 0.01151735, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.05047917, + "balance_loss_mlp": 1.02146649, + "epoch": 0.5413334936571875, + "flos": 30227340205440.0, + "grad_norm": 1.922326552460339, + "language_loss": 0.69069397, + "learning_rate": 1.829611998540574e-06, + "loss": 0.71251696, + "num_input_tokens_seen": 97170415, + "step": 4502, + "time_per_iteration": 2.57296085357666 + }, + { + "auxiliary_loss_clip": 0.01166397, + "auxiliary_loss_mlp": 0.00762758, + "balance_loss_clip": 1.0498929, + "balance_loss_mlp": 1.00146699, + "epoch": 0.5414537365478266, + "flos": 24279635606400.0, + "grad_norm": 1.8327293798983189, + "language_loss": 0.80056906, + "learning_rate": 1.8288358718798914e-06, + "loss": 0.8198607, + "num_input_tokens_seen": 97189605, + "step": 4503, + "time_per_iteration": 2.4885506629943848 + }, + { + "auxiliary_loss_clip": 0.01160976, + "auxiliary_loss_mlp": 0.00762485, + "balance_loss_clip": 1.05203342, + "balance_loss_mlp": 1.00154543, + "epoch": 0.5415739794384657, + "flos": 16654543735680.0, + "grad_norm": 1.8330495218986405, + "language_loss": 0.72605544, + "learning_rate": 1.8280597711846703e-06, + "loss": 0.74529004, + "num_input_tokens_seen": 97207845, + "step": 4504, + "time_per_iteration": 2.4562439918518066 + }, + { + "auxiliary_loss_clip": 0.01162239, + "auxiliary_loss_mlp": 0.01025345, + "balance_loss_clip": 1.05253577, + "balance_loss_mlp": 1.0175066, + "epoch": 0.5416942223291048, + "flos": 23185724860800.0, + "grad_norm": 2.089635834317497, + "language_loss": 0.83309615, + "learning_rate": 1.8272836965726455e-06, + "loss": 0.85497189, + "num_input_tokens_seen": 97226780, + "step": 4505, + "time_per_iteration": 2.4654922485351562 + }, + { + "auxiliary_loss_clip": 0.0111087, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.04455972, + "balance_loss_mlp": 1.0235827, + "epoch": 0.5418144652197439, + "flos": 20303247271680.0, + "grad_norm": 1.805652124442668, + "language_loss": 0.78140306, + "learning_rate": 1.8265076481615461e-06, + "loss": 0.80283177, + "num_input_tokens_seen": 97246695, + "step": 4506, + "time_per_iteration": 2.5966038703918457 + }, + { + "auxiliary_loss_clip": 0.01149924, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.05162621, + "balance_loss_mlp": 1.02066743, + "epoch": 0.541934708110383, + "flos": 12458633431680.0, + "grad_norm": 2.171403475228418, + "language_loss": 0.87452465, + "learning_rate": 1.8257316260690987e-06, + "loss": 0.89631474, + "num_input_tokens_seen": 97264480, + "step": 4507, + "time_per_iteration": 2.481752395629883 + }, + { + "auxiliary_loss_clip": 0.01165907, + "auxiliary_loss_mlp": 0.01019891, + "balance_loss_clip": 1.05085254, + "balance_loss_mlp": 1.01274168, + "epoch": 0.5420549510010221, + "flos": 21253802837760.0, + "grad_norm": 1.4433271716632479, + "language_loss": 0.75910044, + "learning_rate": 1.8249556304130254e-06, + "loss": 0.78095841, + "num_input_tokens_seen": 97285760, + "step": 4508, + "time_per_iteration": 2.4786641597747803 + }, + { + "auxiliary_loss_clip": 0.01140994, + "auxiliary_loss_mlp": 0.01026245, + "balance_loss_clip": 1.04757762, + "balance_loss_mlp": 1.01817405, + "epoch": 0.5421751938916611, + "flos": 29490524519040.0, + "grad_norm": 1.9691089000781297, + "language_loss": 0.68471003, + "learning_rate": 1.824179661311044e-06, + "loss": 0.70638239, + "num_input_tokens_seen": 97304510, + "step": 4509, + "time_per_iteration": 2.586970329284668 + }, + { + "auxiliary_loss_clip": 0.01121671, + "auxiliary_loss_mlp": 0.01025997, + "balance_loss_clip": 1.04296827, + "balance_loss_mlp": 1.01792049, + "epoch": 0.5422954367823003, + "flos": 18734238311040.0, + "grad_norm": 1.9025922695822397, + "language_loss": 0.79700267, + "learning_rate": 1.823403718880868e-06, + "loss": 0.81847942, + "num_input_tokens_seen": 97323270, + "step": 4510, + "time_per_iteration": 2.6714539527893066 + }, + { + "auxiliary_loss_clip": 0.01150304, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.0463258, + "balance_loss_mlp": 1.02300739, + "epoch": 0.5424156796729394, + "flos": 39969006940800.0, + "grad_norm": 1.8578198931228, + "language_loss": 0.66370261, + "learning_rate": 1.822627803240207e-06, + "loss": 0.68551576, + "num_input_tokens_seen": 97345600, + "step": 4511, + "time_per_iteration": 2.6483590602874756 + }, + { + "auxiliary_loss_clip": 0.01142247, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.05065751, + "balance_loss_mlp": 1.02360296, + "epoch": 0.5425359225635784, + "flos": 11546538353280.0, + "grad_norm": 2.311419074567973, + "language_loss": 0.84979451, + "learning_rate": 1.8218519145067675e-06, + "loss": 0.87152791, + "num_input_tokens_seen": 97361220, + "step": 4512, + "time_per_iteration": 2.554234027862549 + }, + { + "auxiliary_loss_clip": 0.01131731, + "auxiliary_loss_mlp": 0.01031925, + "balance_loss_clip": 1.04592538, + "balance_loss_mlp": 1.02355623, + "epoch": 0.5426561654542175, + "flos": 20229702174720.0, + "grad_norm": 2.3217680143677675, + "language_loss": 0.89552146, + "learning_rate": 1.8210760527982508e-06, + "loss": 0.91715801, + "num_input_tokens_seen": 97381505, + "step": 4513, + "time_per_iteration": 2.5339155197143555 + }, + { + "auxiliary_loss_clip": 0.01152611, + "auxiliary_loss_mlp": 0.00762626, + "balance_loss_clip": 1.05169654, + "balance_loss_mlp": 1.00147414, + "epoch": 0.5427764083448566, + "flos": 21871681614720.0, + "grad_norm": 2.1369370224475834, + "language_loss": 0.75306004, + "learning_rate": 1.8203002182323552e-06, + "loss": 0.77221245, + "num_input_tokens_seen": 97399060, + "step": 4514, + "time_per_iteration": 2.5064430236816406 + }, + { + "auxiliary_loss_clip": 0.01153697, + "auxiliary_loss_mlp": 0.01026886, + "balance_loss_clip": 1.05156231, + "balance_loss_mlp": 1.0185411, + "epoch": 0.5428966512354957, + "flos": 19640946349440.0, + "grad_norm": 1.9171945498458558, + "language_loss": 0.7538352, + "learning_rate": 1.819524410926773e-06, + "loss": 0.77564108, + "num_input_tokens_seen": 97416740, + "step": 4515, + "time_per_iteration": 2.502310276031494 + }, + { + "auxiliary_loss_clip": 0.01109049, + "auxiliary_loss_mlp": 0.0102687, + "balance_loss_clip": 1.04837263, + "balance_loss_mlp": 1.01862693, + "epoch": 0.5430168941261347, + "flos": 22382187661440.0, + "grad_norm": 1.657239511104359, + "language_loss": 0.77044022, + "learning_rate": 1.8187486309991944e-06, + "loss": 0.79179937, + "num_input_tokens_seen": 97437620, + "step": 4516, + "time_per_iteration": 2.6412606239318848 + }, + { + "auxiliary_loss_clip": 0.01171935, + "auxiliary_loss_mlp": 0.01031009, + "balance_loss_clip": 1.05379796, + "balance_loss_mlp": 1.02351999, + "epoch": 0.5431371370167739, + "flos": 18764187275520.0, + "grad_norm": 1.6026893084603862, + "language_loss": 0.77565038, + "learning_rate": 1.817972878567304e-06, + "loss": 0.79767984, + "num_input_tokens_seen": 97456275, + "step": 4517, + "time_per_iteration": 2.465698003768921 + }, + { + "auxiliary_loss_clip": 0.01158082, + "auxiliary_loss_mlp": 0.01027532, + "balance_loss_clip": 1.05057001, + "balance_loss_mlp": 1.01986098, + "epoch": 0.543257379907413, + "flos": 18806023641600.0, + "grad_norm": 2.425952048255131, + "language_loss": 0.76460916, + "learning_rate": 1.8171971537487834e-06, + "loss": 0.78646529, + "num_input_tokens_seen": 97474925, + "step": 4518, + "time_per_iteration": 2.517136335372925 + }, + { + "auxiliary_loss_clip": 0.01180353, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.05221152, + "balance_loss_mlp": 1.0232935, + "epoch": 0.543377622798052, + "flos": 17493381025920.0, + "grad_norm": 1.929152870228996, + "language_loss": 0.80931497, + "learning_rate": 1.8164214566613093e-06, + "loss": 0.83143377, + "num_input_tokens_seen": 97493550, + "step": 4519, + "time_per_iteration": 2.4210543632507324 + }, + { + "auxiliary_loss_clip": 0.01178891, + "auxiliary_loss_mlp": 0.01026567, + "balance_loss_clip": 1.05232632, + "balance_loss_mlp": 1.01881194, + "epoch": 0.5434978656886912, + "flos": 18989311766400.0, + "grad_norm": 2.718985072744706, + "language_loss": 0.65127814, + "learning_rate": 1.8156457874225547e-06, + "loss": 0.67333275, + "num_input_tokens_seen": 97512010, + "step": 4520, + "time_per_iteration": 2.408815860748291 + }, + { + "auxiliary_loss_clip": 0.01140672, + "auxiliary_loss_mlp": 0.01024091, + "balance_loss_clip": 1.04838538, + "balance_loss_mlp": 1.01609182, + "epoch": 0.5436181085793302, + "flos": 17274936464640.0, + "grad_norm": 2.0470315433617845, + "language_loss": 0.8082667, + "learning_rate": 1.814870146150187e-06, + "loss": 0.82991433, + "num_input_tokens_seen": 97530120, + "step": 4521, + "time_per_iteration": 2.484025239944458 + }, + { + "auxiliary_loss_clip": 0.01157912, + "auxiliary_loss_mlp": 0.01031512, + "balance_loss_clip": 1.04848027, + "balance_loss_mlp": 1.02354312, + "epoch": 0.5437383514699693, + "flos": 19098587917440.0, + "grad_norm": 3.5588937933354607, + "language_loss": 0.78565705, + "learning_rate": 1.814094532961871e-06, + "loss": 0.80755126, + "num_input_tokens_seen": 97548695, + "step": 4522, + "time_per_iteration": 2.5091967582702637 + }, + { + "auxiliary_loss_clip": 0.01124806, + "auxiliary_loss_mlp": 0.01032763, + "balance_loss_clip": 1.04559684, + "balance_loss_mlp": 1.02447772, + "epoch": 0.5438585943606085, + "flos": 22602715211520.0, + "grad_norm": 1.9738180370106113, + "language_loss": 0.83248866, + "learning_rate": 1.8133189479752666e-06, + "loss": 0.85406435, + "num_input_tokens_seen": 97567625, + "step": 4523, + "time_per_iteration": 2.564119815826416 + }, + { + "auxiliary_loss_clip": 0.01178628, + "auxiliary_loss_mlp": 0.01024787, + "balance_loss_clip": 1.05270326, + "balance_loss_mlp": 1.01738667, + "epoch": 0.5439788372512475, + "flos": 21798495653760.0, + "grad_norm": 2.2129095978548268, + "language_loss": 0.81661201, + "learning_rate": 1.8125433913080292e-06, + "loss": 0.83864617, + "num_input_tokens_seen": 97585325, + "step": 4524, + "time_per_iteration": 3.212411642074585 + }, + { + "auxiliary_loss_clip": 0.01055883, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.03763092, + "balance_loss_mlp": 1.0205009, + "epoch": 0.5440990801418866, + "flos": 16399362539520.0, + "grad_norm": 2.086254740983938, + "language_loss": 0.82374042, + "learning_rate": 1.811767863077811e-06, + "loss": 0.84457242, + "num_input_tokens_seen": 97604275, + "step": 4525, + "time_per_iteration": 4.290956258773804 + }, + { + "auxiliary_loss_clip": 0.01098627, + "auxiliary_loss_mlp": 0.01031229, + "balance_loss_clip": 1.04433191, + "balance_loss_mlp": 1.02319396, + "epoch": 0.5442193230325257, + "flos": 21615638492160.0, + "grad_norm": 1.725860905822709, + "language_loss": 0.7830044, + "learning_rate": 1.8109923634022577e-06, + "loss": 0.80430299, + "num_input_tokens_seen": 97624300, + "step": 4526, + "time_per_iteration": 3.715850591659546 + }, + { + "auxiliary_loss_clip": 0.01183879, + "auxiliary_loss_mlp": 0.0102774, + "balance_loss_clip": 1.05453324, + "balance_loss_mlp": 1.0197767, + "epoch": 0.5443395659231648, + "flos": 15481198062720.0, + "grad_norm": 2.3782452479471226, + "language_loss": 0.86611319, + "learning_rate": 1.8102168923990128e-06, + "loss": 0.88822937, + "num_input_tokens_seen": 97637845, + "step": 4527, + "time_per_iteration": 2.4091832637786865 + }, + { + "auxiliary_loss_clip": 0.01169905, + "auxiliary_loss_mlp": 0.00762371, + "balance_loss_clip": 1.05468524, + "balance_loss_mlp": 1.00153673, + "epoch": 0.5444598088138038, + "flos": 18770436241920.0, + "grad_norm": 1.8196574498243943, + "language_loss": 0.80300891, + "learning_rate": 1.809441450185714e-06, + "loss": 0.82233167, + "num_input_tokens_seen": 97656330, + "step": 4528, + "time_per_iteration": 2.5029289722442627 + }, + { + "auxiliary_loss_clip": 0.01155232, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.04723334, + "balance_loss_mlp": 1.01853085, + "epoch": 0.544580051704443, + "flos": 21142335957120.0, + "grad_norm": 2.3128424372485146, + "language_loss": 0.7372942, + "learning_rate": 1.8086660368799958e-06, + "loss": 0.7591126, + "num_input_tokens_seen": 97674380, + "step": 4529, + "time_per_iteration": 2.5166404247283936 + }, + { + "auxiliary_loss_clip": 0.01155409, + "auxiliary_loss_mlp": 0.01026708, + "balance_loss_clip": 1.05252516, + "balance_loss_mlp": 1.01814866, + "epoch": 0.5447002945950821, + "flos": 32491508054400.0, + "grad_norm": 1.6701160933338788, + "language_loss": 0.77286351, + "learning_rate": 1.807890652599488e-06, + "loss": 0.79468471, + "num_input_tokens_seen": 97698765, + "step": 4530, + "time_per_iteration": 2.6504127979278564 + }, + { + "auxiliary_loss_clip": 0.01179844, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.05380297, + "balance_loss_mlp": 1.02126765, + "epoch": 0.5448205374857211, + "flos": 11798307757440.0, + "grad_norm": 2.4579983324521377, + "language_loss": 0.8285532, + "learning_rate": 1.8071152974618156e-06, + "loss": 0.85063541, + "num_input_tokens_seen": 97716565, + "step": 4531, + "time_per_iteration": 2.399745225906372 + }, + { + "auxiliary_loss_clip": 0.01140107, + "auxiliary_loss_mlp": 0.00762373, + "balance_loss_clip": 1.04793572, + "balance_loss_mlp": 1.00143802, + "epoch": 0.5449407803763603, + "flos": 24133766474880.0, + "grad_norm": 2.149246415832897, + "language_loss": 0.7839824, + "learning_rate": 1.806339971584599e-06, + "loss": 0.80300725, + "num_input_tokens_seen": 97733225, + "step": 4532, + "time_per_iteration": 2.581568479537964 + }, + { + "auxiliary_loss_clip": 0.01180144, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.05222476, + "balance_loss_mlp": 1.01921034, + "epoch": 0.5450610232669993, + "flos": 23258551685760.0, + "grad_norm": 1.7118710597012996, + "language_loss": 0.85089266, + "learning_rate": 1.8055646750854546e-06, + "loss": 0.87296867, + "num_input_tokens_seen": 97752735, + "step": 4533, + "time_per_iteration": 2.4403836727142334 + }, + { + "auxiliary_loss_clip": 0.01157367, + "auxiliary_loss_mlp": 0.01027256, + "balance_loss_clip": 1.05212617, + "balance_loss_mlp": 1.01892304, + "epoch": 0.5451812661576384, + "flos": 17785083375360.0, + "grad_norm": 2.857959403520116, + "language_loss": 0.81354243, + "learning_rate": 1.8047894080819945e-06, + "loss": 0.83538866, + "num_input_tokens_seen": 97769985, + "step": 4534, + "time_per_iteration": 2.503911018371582 + }, + { + "auxiliary_loss_clip": 0.0107851, + "auxiliary_loss_mlp": 0.01001592, + "balance_loss_clip": 1.01636434, + "balance_loss_mlp": 1.00038815, + "epoch": 0.5453015090482776, + "flos": 71062586513280.0, + "grad_norm": 0.723256328324059, + "language_loss": 0.63177532, + "learning_rate": 1.8040141706918258e-06, + "loss": 0.65257633, + "num_input_tokens_seen": 97831225, + "step": 4535, + "time_per_iteration": 3.1279690265655518 + }, + { + "auxiliary_loss_clip": 0.01151801, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.0506084, + "balance_loss_mlp": 1.0230695, + "epoch": 0.5454217519389166, + "flos": 25552201622400.0, + "grad_norm": 1.7412648372918527, + "language_loss": 0.76600248, + "learning_rate": 1.8032389630325525e-06, + "loss": 0.78783363, + "num_input_tokens_seen": 97849975, + "step": 4536, + "time_per_iteration": 2.5143136978149414 + }, + { + "auxiliary_loss_clip": 0.01151782, + "auxiliary_loss_mlp": 0.0103084, + "balance_loss_clip": 1.04767716, + "balance_loss_mlp": 1.0223937, + "epoch": 0.5455419948295557, + "flos": 23658345037440.0, + "grad_norm": 1.6973476964870549, + "language_loss": 0.75666451, + "learning_rate": 1.8024637852217707e-06, + "loss": 0.77849072, + "num_input_tokens_seen": 97869700, + "step": 4537, + "time_per_iteration": 2.4993975162506104 + }, + { + "auxiliary_loss_clip": 0.01153543, + "auxiliary_loss_mlp": 0.01034748, + "balance_loss_clip": 1.0513835, + "balance_loss_mlp": 1.02612329, + "epoch": 0.5456622377201948, + "flos": 23403989854080.0, + "grad_norm": 3.071679645913058, + "language_loss": 0.84801513, + "learning_rate": 1.8016886373770766e-06, + "loss": 0.86989808, + "num_input_tokens_seen": 97888215, + "step": 4538, + "time_per_iteration": 2.513484239578247 + }, + { + "auxiliary_loss_clip": 0.01152981, + "auxiliary_loss_mlp": 0.0102713, + "balance_loss_clip": 1.05130887, + "balance_loss_mlp": 1.01875544, + "epoch": 0.5457824806108339, + "flos": 23988040997760.0, + "grad_norm": 2.0728925984255167, + "language_loss": 0.79003531, + "learning_rate": 1.8009135196160579e-06, + "loss": 0.81183642, + "num_input_tokens_seen": 97907090, + "step": 4539, + "time_per_iteration": 2.5133910179138184 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.04638529, + "balance_loss_mlp": 1.02076995, + "epoch": 0.545902723501473, + "flos": 22565870835840.0, + "grad_norm": 1.7136759433762732, + "language_loss": 0.84270728, + "learning_rate": 1.8001384320563e-06, + "loss": 0.86431098, + "num_input_tokens_seen": 97927345, + "step": 4540, + "time_per_iteration": 2.5384836196899414 + }, + { + "auxiliary_loss_clip": 0.01077676, + "auxiliary_loss_mlp": 0.0100088, + "balance_loss_clip": 1.01593113, + "balance_loss_mlp": 0.99962825, + "epoch": 0.5460229663921121, + "flos": 55198399685760.0, + "grad_norm": 0.7715399913762221, + "language_loss": 0.57833654, + "learning_rate": 1.7993633748153833e-06, + "loss": 0.59912211, + "num_input_tokens_seen": 97981950, + "step": 4541, + "time_per_iteration": 2.9254770278930664 + }, + { + "auxiliary_loss_clip": 0.0116905, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.05075693, + "balance_loss_mlp": 1.02179837, + "epoch": 0.5461432092827512, + "flos": 15413866018560.0, + "grad_norm": 1.841521143946277, + "language_loss": 0.72621393, + "learning_rate": 1.7985883480108834e-06, + "loss": 0.74820262, + "num_input_tokens_seen": 97999585, + "step": 4542, + "time_per_iteration": 2.472543239593506 + }, + { + "auxiliary_loss_clip": 0.01162205, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.05097413, + "balance_loss_mlp": 1.02076781, + "epoch": 0.5462634521733902, + "flos": 24024921287040.0, + "grad_norm": 1.6216772628294822, + "language_loss": 0.71929443, + "learning_rate": 1.797813351760371e-06, + "loss": 0.74120843, + "num_input_tokens_seen": 98021290, + "step": 4543, + "time_per_iteration": 2.5020222663879395 + }, + { + "auxiliary_loss_clip": 0.01183179, + "auxiliary_loss_mlp": 0.01022846, + "balance_loss_clip": 1.05569077, + "balance_loss_mlp": 1.01450777, + "epoch": 0.5463836950640293, + "flos": 22820944291200.0, + "grad_norm": 2.1184840537429053, + "language_loss": 0.77943361, + "learning_rate": 1.7970383861814116e-06, + "loss": 0.80149388, + "num_input_tokens_seen": 98041060, + "step": 4544, + "time_per_iteration": 2.4518117904663086 + }, + { + "auxiliary_loss_clip": 0.0116578, + "auxiliary_loss_mlp": 0.0102802, + "balance_loss_clip": 1.05286467, + "balance_loss_mlp": 1.01946115, + "epoch": 0.5465039379546685, + "flos": 20448290390400.0, + "grad_norm": 2.3256919477467646, + "language_loss": 0.73758638, + "learning_rate": 1.7962634513915684e-06, + "loss": 0.75952441, + "num_input_tokens_seen": 98058410, + "step": 4545, + "time_per_iteration": 2.448909282684326 + }, + { + "auxiliary_loss_clip": 0.01178976, + "auxiliary_loss_mlp": 0.01025182, + "balance_loss_clip": 1.05344045, + "balance_loss_mlp": 1.01750803, + "epoch": 0.5466241808453075, + "flos": 17343310003200.0, + "grad_norm": 1.820426286635348, + "language_loss": 0.7914654, + "learning_rate": 1.7954885475083969e-06, + "loss": 0.81350696, + "num_input_tokens_seen": 98076080, + "step": 4546, + "time_per_iteration": 2.4668209552764893 + }, + { + "auxiliary_loss_clip": 0.01183169, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.05494952, + "balance_loss_mlp": 1.02407444, + "epoch": 0.5467444237359466, + "flos": 21617039122560.0, + "grad_norm": 4.754854904142636, + "language_loss": 0.72547793, + "learning_rate": 1.7947136746494513e-06, + "loss": 0.74763042, + "num_input_tokens_seen": 98096995, + "step": 4547, + "time_per_iteration": 2.4505510330200195 + }, + { + "auxiliary_loss_clip": 0.01164983, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.05163765, + "balance_loss_mlp": 1.0235368, + "epoch": 0.5468646666265857, + "flos": 24170467196160.0, + "grad_norm": 8.745371990378079, + "language_loss": 0.88063121, + "learning_rate": 1.793938832932277e-06, + "loss": 0.90259433, + "num_input_tokens_seen": 98115105, + "step": 4548, + "time_per_iteration": 2.4967939853668213 + }, + { + "auxiliary_loss_clip": 0.01180659, + "auxiliary_loss_mlp": 0.0102593, + "balance_loss_clip": 1.05262101, + "balance_loss_mlp": 1.018152, + "epoch": 0.5469849095172248, + "flos": 27527001505920.0, + "grad_norm": 2.0263303529772707, + "language_loss": 0.70240688, + "learning_rate": 1.7931640224744185e-06, + "loss": 0.72447276, + "num_input_tokens_seen": 98135655, + "step": 4549, + "time_per_iteration": 2.4749107360839844 + }, + { + "auxiliary_loss_clip": 0.0112292, + "auxiliary_loss_mlp": 0.0102454, + "balance_loss_clip": 1.04220271, + "balance_loss_mlp": 1.01685715, + "epoch": 0.5471051524078638, + "flos": 27964680727680.0, + "grad_norm": 1.733460203172678, + "language_loss": 0.73258305, + "learning_rate": 1.7923892433934127e-06, + "loss": 0.75405765, + "num_input_tokens_seen": 98156730, + "step": 4550, + "time_per_iteration": 3.4041264057159424 + }, + { + "auxiliary_loss_clip": 0.01153626, + "auxiliary_loss_mlp": 0.00763392, + "balance_loss_clip": 1.04998183, + "balance_loss_mlp": 1.00132465, + "epoch": 0.547225395298503, + "flos": 18150510389760.0, + "grad_norm": 2.3497663371302164, + "language_loss": 0.78536367, + "learning_rate": 1.7916144958067939e-06, + "loss": 0.80453384, + "num_input_tokens_seen": 98174590, + "step": 4551, + "time_per_iteration": 3.263478994369507 + }, + { + "auxiliary_loss_clip": 0.01167252, + "auxiliary_loss_mlp": 0.01026982, + "balance_loss_clip": 1.05084336, + "balance_loss_mlp": 1.01891744, + "epoch": 0.5473456381891421, + "flos": 21361498790400.0, + "grad_norm": 2.0188498046946095, + "language_loss": 0.78774369, + "learning_rate": 1.7908397798320905e-06, + "loss": 0.80968595, + "num_input_tokens_seen": 98194325, + "step": 4552, + "time_per_iteration": 3.2240869998931885 + }, + { + "auxiliary_loss_clip": 0.01167157, + "auxiliary_loss_mlp": 0.00763378, + "balance_loss_clip": 1.0531311, + "balance_loss_mlp": 1.00133598, + "epoch": 0.5474658810797811, + "flos": 19932145908480.0, + "grad_norm": 1.7969725921981077, + "language_loss": 0.74962497, + "learning_rate": 1.7900650955868265e-06, + "loss": 0.76893032, + "num_input_tokens_seen": 98213970, + "step": 4553, + "time_per_iteration": 3.2489943504333496 + }, + { + "auxiliary_loss_clip": 0.01166525, + "auxiliary_loss_mlp": 0.00762726, + "balance_loss_clip": 1.05365193, + "balance_loss_mlp": 1.00138474, + "epoch": 0.5475861239704203, + "flos": 50476217264640.0, + "grad_norm": 1.3926293784529673, + "language_loss": 0.76381403, + "learning_rate": 1.7892904431885202e-06, + "loss": 0.78310651, + "num_input_tokens_seen": 98241145, + "step": 4554, + "time_per_iteration": 2.7406063079833984 + }, + { + "auxiliary_loss_clip": 0.01121897, + "auxiliary_loss_mlp": 0.01025839, + "balance_loss_clip": 1.043118, + "balance_loss_mlp": 1.01809645, + "epoch": 0.5477063668610593, + "flos": 20705123612160.0, + "grad_norm": 1.8219738855332337, + "language_loss": 0.75340378, + "learning_rate": 1.788515822754686e-06, + "loss": 0.77488118, + "num_input_tokens_seen": 98261565, + "step": 4555, + "time_per_iteration": 2.565910577774048 + }, + { + "auxiliary_loss_clip": 0.01138588, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.0461452, + "balance_loss_mlp": 1.02189016, + "epoch": 0.5478266097516984, + "flos": 19609740408960.0, + "grad_norm": 1.8643845734585158, + "language_loss": 0.78191584, + "learning_rate": 1.7877412344028335e-06, + "loss": 0.80360436, + "num_input_tokens_seen": 98281370, + "step": 4556, + "time_per_iteration": 2.552619695663452 + }, + { + "auxiliary_loss_clip": 0.01167424, + "auxiliary_loss_mlp": 0.01024245, + "balance_loss_clip": 1.05053329, + "balance_loss_mlp": 1.01646042, + "epoch": 0.5479468526423376, + "flos": 12896599962240.0, + "grad_norm": 2.7171721691424455, + "language_loss": 0.77328295, + "learning_rate": 1.7869666782504668e-06, + "loss": 0.79519963, + "num_input_tokens_seen": 98297950, + "step": 4557, + "time_per_iteration": 2.4377663135528564 + }, + { + "auxiliary_loss_clip": 0.01137816, + "auxiliary_loss_mlp": 0.01024458, + "balance_loss_clip": 1.04480743, + "balance_loss_mlp": 1.01656675, + "epoch": 0.5480670955329766, + "flos": 18588800142720.0, + "grad_norm": 1.851521495307278, + "language_loss": 0.68935561, + "learning_rate": 1.7861921544150867e-06, + "loss": 0.71097833, + "num_input_tokens_seen": 98316800, + "step": 4558, + "time_per_iteration": 2.514643907546997 + }, + { + "auxiliary_loss_clip": 0.01094318, + "auxiliary_loss_mlp": 0.00762823, + "balance_loss_clip": 1.04308593, + "balance_loss_mlp": 1.00129271, + "epoch": 0.5481873384236157, + "flos": 15954608338560.0, + "grad_norm": 1.7980615331406695, + "language_loss": 0.76299453, + "learning_rate": 1.7854176630141856e-06, + "loss": 0.7815659, + "num_input_tokens_seen": 98333935, + "step": 4559, + "time_per_iteration": 2.709113359451294 + }, + { + "auxiliary_loss_clip": 0.01184949, + "auxiliary_loss_mlp": 0.01041718, + "balance_loss_clip": 1.05540061, + "balance_loss_mlp": 1.03356433, + "epoch": 0.5483075813142548, + "flos": 22783812606720.0, + "grad_norm": 2.5254815067067553, + "language_loss": 0.84257001, + "learning_rate": 1.784643204165255e-06, + "loss": 0.86483669, + "num_input_tokens_seen": 98353255, + "step": 4560, + "time_per_iteration": 2.6452043056488037 + }, + { + "auxiliary_loss_clip": 0.01159346, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.05110061, + "balance_loss_mlp": 1.01763368, + "epoch": 0.5484278242048939, + "flos": 19317212046720.0, + "grad_norm": 2.116729241488762, + "language_loss": 0.77266753, + "learning_rate": 1.7838687779857783e-06, + "loss": 0.79451561, + "num_input_tokens_seen": 98371130, + "step": 4561, + "time_per_iteration": 2.4528002738952637 + }, + { + "auxiliary_loss_clip": 0.01144936, + "auxiliary_loss_mlp": 0.01027694, + "balance_loss_clip": 1.04762244, + "balance_loss_mlp": 1.01919746, + "epoch": 0.5485480670955329, + "flos": 22816024128000.0, + "grad_norm": 2.3941573606786197, + "language_loss": 0.63801575, + "learning_rate": 1.7830943845932366e-06, + "loss": 0.65974206, + "num_input_tokens_seen": 98390455, + "step": 4562, + "time_per_iteration": 2.5072875022888184 + }, + { + "auxiliary_loss_clip": 0.01155416, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.05161393, + "balance_loss_mlp": 1.02451551, + "epoch": 0.5486683099861721, + "flos": 22671304231680.0, + "grad_norm": 1.600629784570238, + "language_loss": 0.75258112, + "learning_rate": 1.7823200241051044e-06, + "loss": 0.7744599, + "num_input_tokens_seen": 98409370, + "step": 4563, + "time_per_iteration": 2.5071866512298584 + }, + { + "auxiliary_loss_clip": 0.01180043, + "auxiliary_loss_mlp": 0.01023943, + "balance_loss_clip": 1.05240846, + "balance_loss_mlp": 1.0160749, + "epoch": 0.5487885528768112, + "flos": 23149383275520.0, + "grad_norm": 1.9013935258247538, + "language_loss": 0.80345595, + "learning_rate": 1.7815456966388513e-06, + "loss": 0.82549584, + "num_input_tokens_seen": 98428465, + "step": 4564, + "time_per_iteration": 2.428921937942505 + }, + { + "auxiliary_loss_clip": 0.01137879, + "auxiliary_loss_mlp": 0.01028875, + "balance_loss_clip": 1.0468595, + "balance_loss_mlp": 1.02115023, + "epoch": 0.5489087957674502, + "flos": 22053928245120.0, + "grad_norm": 2.1058379056741585, + "language_loss": 0.8070876, + "learning_rate": 1.780771402311943e-06, + "loss": 0.82875514, + "num_input_tokens_seen": 98447300, + "step": 4565, + "time_per_iteration": 2.522017478942871 + }, + { + "auxiliary_loss_clip": 0.01152432, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.05115187, + "balance_loss_mlp": 1.02040088, + "epoch": 0.5490290386580894, + "flos": 24315977191680.0, + "grad_norm": 1.6800334671153774, + "language_loss": 0.78524518, + "learning_rate": 1.7799971412418374e-06, + "loss": 0.80705416, + "num_input_tokens_seen": 98468695, + "step": 4566, + "time_per_iteration": 2.512957811355591 + }, + { + "auxiliary_loss_clip": 0.01139702, + "auxiliary_loss_mlp": 0.01027265, + "balance_loss_clip": 1.04985368, + "balance_loss_mlp": 1.019135, + "epoch": 0.5491492815487284, + "flos": 18294942977280.0, + "grad_norm": 2.636484934825691, + "language_loss": 0.73644495, + "learning_rate": 1.7792229135459918e-06, + "loss": 0.7581147, + "num_input_tokens_seen": 98485345, + "step": 4567, + "time_per_iteration": 2.50490665435791 + }, + { + "auxiliary_loss_clip": 0.01031792, + "auxiliary_loss_mlp": 0.01002257, + "balance_loss_clip": 1.0199523, + "balance_loss_mlp": 1.0009402, + "epoch": 0.5492695244393675, + "flos": 64550257050240.0, + "grad_norm": 0.7304462928667791, + "language_loss": 0.6163702, + "learning_rate": 1.7784487193418538e-06, + "loss": 0.63671064, + "num_input_tokens_seen": 98543195, + "step": 4568, + "time_per_iteration": 3.060385227203369 + }, + { + "auxiliary_loss_clip": 0.01121164, + "auxiliary_loss_mlp": 0.01026618, + "balance_loss_clip": 1.04220223, + "balance_loss_mlp": 1.01817191, + "epoch": 0.5493897673300067, + "flos": 17379579761280.0, + "grad_norm": 2.0414691459887, + "language_loss": 0.6079694, + "learning_rate": 1.7776745587468698e-06, + "loss": 0.62944722, + "num_input_tokens_seen": 98560620, + "step": 4569, + "time_per_iteration": 2.5166425704956055 + }, + { + "auxiliary_loss_clip": 0.01178714, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.05085802, + "balance_loss_mlp": 1.02345705, + "epoch": 0.5495100102206457, + "flos": 19901765980800.0, + "grad_norm": 2.4864490387643134, + "language_loss": 0.81564486, + "learning_rate": 1.7769004318784776e-06, + "loss": 0.83774328, + "num_input_tokens_seen": 98578265, + "step": 4570, + "time_per_iteration": 2.451190233230591 + }, + { + "auxiliary_loss_clip": 0.01166562, + "auxiliary_loss_mlp": 0.01024579, + "balance_loss_clip": 1.05079448, + "balance_loss_mlp": 1.01684856, + "epoch": 0.5496302531112848, + "flos": 16727190992640.0, + "grad_norm": 1.7108493512994327, + "language_loss": 0.80637181, + "learning_rate": 1.776126338854113e-06, + "loss": 0.82828319, + "num_input_tokens_seen": 98596055, + "step": 4571, + "time_per_iteration": 2.4627256393432617 + }, + { + "auxiliary_loss_clip": 0.01160359, + "auxiliary_loss_mlp": 0.01027652, + "balance_loss_clip": 1.0517211, + "balance_loss_mlp": 1.02008796, + "epoch": 0.5497504960019239, + "flos": 24572343536640.0, + "grad_norm": 1.9202486992610108, + "language_loss": 0.84500921, + "learning_rate": 1.7753522797912044e-06, + "loss": 0.86688936, + "num_input_tokens_seen": 98616140, + "step": 4572, + "time_per_iteration": 2.6443114280700684 + }, + { + "auxiliary_loss_clip": 0.01158778, + "auxiliary_loss_mlp": 0.01024495, + "balance_loss_clip": 1.04889154, + "balance_loss_mlp": 1.01638269, + "epoch": 0.549870738892563, + "flos": 15450494912640.0, + "grad_norm": 2.3629390417030884, + "language_loss": 0.70000029, + "learning_rate": 1.7745782548071765e-06, + "loss": 0.72183299, + "num_input_tokens_seen": 98633035, + "step": 4573, + "time_per_iteration": 2.468156576156616 + }, + { + "auxiliary_loss_clip": 0.01134205, + "auxiliary_loss_mlp": 0.01028565, + "balance_loss_clip": 1.05224109, + "balance_loss_mlp": 1.02124858, + "epoch": 0.549990981783202, + "flos": 21069114082560.0, + "grad_norm": 1.778063163413034, + "language_loss": 0.74374783, + "learning_rate": 1.7738042640194482e-06, + "loss": 0.76537561, + "num_input_tokens_seen": 98652700, + "step": 4574, + "time_per_iteration": 2.541045665740967 + }, + { + "auxiliary_loss_clip": 0.01179403, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.05197954, + "balance_loss_mlp": 1.02005422, + "epoch": 0.5501112246738411, + "flos": 21395901041280.0, + "grad_norm": 1.9476878850137496, + "language_loss": 0.70494926, + "learning_rate": 1.7730303075454335e-06, + "loss": 0.7270261, + "num_input_tokens_seen": 98671590, + "step": 4575, + "time_per_iteration": 2.4209930896759033 + }, + { + "auxiliary_loss_clip": 0.01141088, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.04754901, + "balance_loss_mlp": 1.02135706, + "epoch": 0.5502314675644803, + "flos": 17456931699840.0, + "grad_norm": 1.9403187226359555, + "language_loss": 0.84947586, + "learning_rate": 1.7722563855025402e-06, + "loss": 0.87118697, + "num_input_tokens_seen": 98689620, + "step": 4576, + "time_per_iteration": 2.517068386077881 + }, + { + "auxiliary_loss_clip": 0.01151338, + "auxiliary_loss_mlp": 0.01026227, + "balance_loss_clip": 1.04629159, + "balance_loss_mlp": 1.01820993, + "epoch": 0.5503517104551193, + "flos": 24310410583680.0, + "grad_norm": 13.442790140814878, + "language_loss": 0.71250778, + "learning_rate": 1.7714824980081721e-06, + "loss": 0.73428345, + "num_input_tokens_seen": 98708915, + "step": 4577, + "time_per_iteration": 3.3073573112487793 + }, + { + "auxiliary_loss_clip": 0.01162953, + "auxiliary_loss_mlp": 0.01024434, + "balance_loss_clip": 1.05294108, + "balance_loss_mlp": 1.01696849, + "epoch": 0.5504719533457584, + "flos": 22419427086720.0, + "grad_norm": 2.024659419974389, + "language_loss": 0.73861742, + "learning_rate": 1.7707086451797276e-06, + "loss": 0.76049125, + "num_input_tokens_seen": 98729790, + "step": 4578, + "time_per_iteration": 3.9924733638763428 + }, + { + "auxiliary_loss_clip": 0.01041398, + "auxiliary_loss_mlp": 0.01002595, + "balance_loss_clip": 1.01153278, + "balance_loss_mlp": 1.00130117, + "epoch": 0.5505921962363975, + "flos": 67294155968640.0, + "grad_norm": 0.7027947178207559, + "language_loss": 0.52323937, + "learning_rate": 1.7699348271345993e-06, + "loss": 0.54367936, + "num_input_tokens_seen": 98792415, + "step": 4579, + "time_per_iteration": 3.0425264835357666 + }, + { + "auxiliary_loss_clip": 0.01039319, + "auxiliary_loss_mlp": 0.0100405, + "balance_loss_clip": 1.01372385, + "balance_loss_mlp": 1.00270295, + "epoch": 0.5507124391270366, + "flos": 45685125578880.0, + "grad_norm": 0.704785683048479, + "language_loss": 0.54399157, + "learning_rate": 1.7691610439901753e-06, + "loss": 0.56442535, + "num_input_tokens_seen": 98855350, + "step": 4580, + "time_per_iteration": 3.9044129848480225 + }, + { + "auxiliary_loss_clip": 0.01170103, + "auxiliary_loss_mlp": 0.01026396, + "balance_loss_clip": 1.0529865, + "balance_loss_mlp": 1.01886249, + "epoch": 0.5508326820176757, + "flos": 22273845264000.0, + "grad_norm": 1.8486776441153467, + "language_loss": 0.756136, + "learning_rate": 1.7683872958638367e-06, + "loss": 0.77810097, + "num_input_tokens_seen": 98874230, + "step": 4581, + "time_per_iteration": 2.488053321838379 + }, + { + "auxiliary_loss_clip": 0.01148001, + "auxiliary_loss_mlp": 0.01025556, + "balance_loss_clip": 1.04669213, + "balance_loss_mlp": 1.01750374, + "epoch": 0.5509529249083148, + "flos": 20012442762240.0, + "grad_norm": 2.0458795188221295, + "language_loss": 0.84346366, + "learning_rate": 1.7676135828729614e-06, + "loss": 0.86519933, + "num_input_tokens_seen": 98893940, + "step": 4582, + "time_per_iteration": 2.5588250160217285 + }, + { + "auxiliary_loss_clip": 0.01165295, + "auxiliary_loss_mlp": 0.01024375, + "balance_loss_clip": 1.0517385, + "balance_loss_mlp": 1.01634002, + "epoch": 0.5510731677989539, + "flos": 21834801325440.0, + "grad_norm": 2.184038632134145, + "language_loss": 0.8288219, + "learning_rate": 1.7668399051349205e-06, + "loss": 0.85071862, + "num_input_tokens_seen": 98913620, + "step": 4583, + "time_per_iteration": 2.483574151992798 + }, + { + "auxiliary_loss_clip": 0.01135106, + "auxiliary_loss_mlp": 0.01024368, + "balance_loss_clip": 1.0482235, + "balance_loss_mlp": 1.01655436, + "epoch": 0.5511934106895929, + "flos": 21467901853440.0, + "grad_norm": 2.0938300590369265, + "language_loss": 0.8335501, + "learning_rate": 1.766066262767081e-06, + "loss": 0.85514486, + "num_input_tokens_seen": 98931460, + "step": 4584, + "time_per_iteration": 2.528848648071289 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.0102522, + "balance_loss_clip": 1.05224299, + "balance_loss_mlp": 1.01724446, + "epoch": 0.5513136535802321, + "flos": 21068934514560.0, + "grad_norm": 2.266280985078634, + "language_loss": 0.7722466, + "learning_rate": 1.765292655886803e-06, + "loss": 0.79399955, + "num_input_tokens_seen": 98950105, + "step": 4585, + "time_per_iteration": 2.485902786254883 + }, + { + "auxiliary_loss_clip": 0.01142642, + "auxiliary_loss_mlp": 0.01025878, + "balance_loss_clip": 1.04771543, + "balance_loss_mlp": 1.01786113, + "epoch": 0.5514338964708712, + "flos": 27815004754560.0, + "grad_norm": 2.18110433410557, + "language_loss": 0.70669591, + "learning_rate": 1.764519084611443e-06, + "loss": 0.7283811, + "num_input_tokens_seen": 98970560, + "step": 4586, + "time_per_iteration": 2.5655674934387207 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.01026274, + "balance_loss_clip": 1.04761338, + "balance_loss_mlp": 1.01770329, + "epoch": 0.5515541393615102, + "flos": 21908525990400.0, + "grad_norm": 6.584818879482861, + "language_loss": 0.78057575, + "learning_rate": 1.7637455490583505e-06, + "loss": 0.80234551, + "num_input_tokens_seen": 98989885, + "step": 4587, + "time_per_iteration": 2.504288673400879 + }, + { + "auxiliary_loss_clip": 0.01165628, + "auxiliary_loss_mlp": 0.01026773, + "balance_loss_clip": 1.0518508, + "balance_loss_mlp": 1.01931024, + "epoch": 0.5516743822521494, + "flos": 20485422074880.0, + "grad_norm": 1.8988990422552299, + "language_loss": 0.76988745, + "learning_rate": 1.7629720493448701e-06, + "loss": 0.79181147, + "num_input_tokens_seen": 99007180, + "step": 4588, + "time_per_iteration": 2.454793930053711 + }, + { + "auxiliary_loss_clip": 0.01160617, + "auxiliary_loss_mlp": 0.01027858, + "balance_loss_clip": 1.05120981, + "balance_loss_mlp": 1.01997828, + "epoch": 0.5517946251427884, + "flos": 14940383915520.0, + "grad_norm": 1.714934300135647, + "language_loss": 0.8506813, + "learning_rate": 1.7621985855883418e-06, + "loss": 0.87256598, + "num_input_tokens_seen": 99023880, + "step": 4589, + "time_per_iteration": 2.4670603275299072 + }, + { + "auxiliary_loss_clip": 0.01143897, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.04799366, + "balance_loss_mlp": 1.02030408, + "epoch": 0.5519148680334275, + "flos": 18404865573120.0, + "grad_norm": 2.781448037927104, + "language_loss": 0.72287858, + "learning_rate": 1.7614251579060983e-06, + "loss": 0.74460018, + "num_input_tokens_seen": 99042475, + "step": 4590, + "time_per_iteration": 2.467280626296997 + }, + { + "auxiliary_loss_clip": 0.01140628, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.04842794, + "balance_loss_mlp": 1.01932359, + "epoch": 0.5520351109240667, + "flos": 25113337251840.0, + "grad_norm": 1.9692529133282257, + "language_loss": 0.84710956, + "learning_rate": 1.76065176641547e-06, + "loss": 0.86879343, + "num_input_tokens_seen": 99065185, + "step": 4591, + "time_per_iteration": 2.5909924507141113 + }, + { + "auxiliary_loss_clip": 0.01162995, + "auxiliary_loss_mlp": 0.01023832, + "balance_loss_clip": 1.04720116, + "balance_loss_mlp": 1.01558232, + "epoch": 0.5521553538147057, + "flos": 21069545045760.0, + "grad_norm": 1.9897357161963287, + "language_loss": 0.77527559, + "learning_rate": 1.759878411233777e-06, + "loss": 0.79714382, + "num_input_tokens_seen": 99083645, + "step": 4592, + "time_per_iteration": 2.443493604660034 + }, + { + "auxiliary_loss_clip": 0.01163158, + "auxiliary_loss_mlp": 0.01024198, + "balance_loss_clip": 1.04938602, + "balance_loss_mlp": 1.01613045, + "epoch": 0.5522755967053448, + "flos": 18879999701760.0, + "grad_norm": 2.0961466312726182, + "language_loss": 0.75756836, + "learning_rate": 1.7591050924783388e-06, + "loss": 0.77944195, + "num_input_tokens_seen": 99100835, + "step": 4593, + "time_per_iteration": 2.4392054080963135 + }, + { + "auxiliary_loss_clip": 0.01031983, + "auxiliary_loss_mlp": 0.01000963, + "balance_loss_clip": 1.01201248, + "balance_loss_mlp": 0.99962199, + "epoch": 0.5523958395959839, + "flos": 64675622494080.0, + "grad_norm": 0.8380943531931558, + "language_loss": 0.57934403, + "learning_rate": 1.7583318102664661e-06, + "loss": 0.59967351, + "num_input_tokens_seen": 99168400, + "step": 4594, + "time_per_iteration": 3.166384220123291 + }, + { + "auxiliary_loss_clip": 0.01166526, + "auxiliary_loss_mlp": 0.01026761, + "balance_loss_clip": 1.04735208, + "balance_loss_mlp": 1.01901865, + "epoch": 0.552516082486623, + "flos": 10889732211840.0, + "grad_norm": 1.9728428082301663, + "language_loss": 0.78877199, + "learning_rate": 1.757558564715466e-06, + "loss": 0.81070489, + "num_input_tokens_seen": 99186475, + "step": 4595, + "time_per_iteration": 2.457122564315796 + }, + { + "auxiliary_loss_clip": 0.01165028, + "auxiliary_loss_mlp": 0.01026549, + "balance_loss_clip": 1.04839683, + "balance_loss_mlp": 1.01847303, + "epoch": 0.552636325377262, + "flos": 22199797376640.0, + "grad_norm": 2.487173335872375, + "language_loss": 0.73863614, + "learning_rate": 1.7567853559426386e-06, + "loss": 0.76055193, + "num_input_tokens_seen": 99203525, + "step": 4596, + "time_per_iteration": 2.4421820640563965 + }, + { + "auxiliary_loss_clip": 0.01168826, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.05185866, + "balance_loss_mlp": 1.02216184, + "epoch": 0.5527565682679012, + "flos": 23988184652160.0, + "grad_norm": 2.0040196539288346, + "language_loss": 0.75031757, + "learning_rate": 1.7560121840652797e-06, + "loss": 0.77230257, + "num_input_tokens_seen": 99222910, + "step": 4597, + "time_per_iteration": 2.4795563220977783 + }, + { + "auxiliary_loss_clip": 0.01126846, + "auxiliary_loss_mlp": 0.01022949, + "balance_loss_clip": 1.04741764, + "balance_loss_mlp": 1.01491976, + "epoch": 0.5528768111585403, + "flos": 19719267955200.0, + "grad_norm": 1.7820923350011413, + "language_loss": 0.68606102, + "learning_rate": 1.7552390492006782e-06, + "loss": 0.70755899, + "num_input_tokens_seen": 99241230, + "step": 4598, + "time_per_iteration": 2.510634660720825 + }, + { + "auxiliary_loss_clip": 0.01128976, + "auxiliary_loss_mlp": 0.00762597, + "balance_loss_clip": 1.04416811, + "balance_loss_mlp": 1.00128746, + "epoch": 0.5529970540491793, + "flos": 26215975002240.0, + "grad_norm": 1.9281489118999688, + "language_loss": 0.65216368, + "learning_rate": 1.7544659514661184e-06, + "loss": 0.67107946, + "num_input_tokens_seen": 99264320, + "step": 4599, + "time_per_iteration": 2.640360116958618 + }, + { + "auxiliary_loss_clip": 0.01144508, + "auxiliary_loss_mlp": 0.01024496, + "balance_loss_clip": 1.04438639, + "balance_loss_mlp": 1.01689613, + "epoch": 0.5531172969398185, + "flos": 24425971614720.0, + "grad_norm": 1.9729208464371601, + "language_loss": 0.79710865, + "learning_rate": 1.7536928909788786e-06, + "loss": 0.81879866, + "num_input_tokens_seen": 99283625, + "step": 4600, + "time_per_iteration": 2.5198330879211426 + }, + { + "auxiliary_loss_clip": 0.0103693, + "auxiliary_loss_mlp": 0.01002842, + "balance_loss_clip": 1.013376, + "balance_loss_mlp": 1.00164378, + "epoch": 0.5532375398304575, + "flos": 64907316195840.0, + "grad_norm": 0.8840152539630796, + "language_loss": 0.62013304, + "learning_rate": 1.752919867856231e-06, + "loss": 0.64053082, + "num_input_tokens_seen": 99335270, + "step": 4601, + "time_per_iteration": 3.026937961578369 + }, + { + "auxiliary_loss_clip": 0.01140722, + "auxiliary_loss_mlp": 0.01025748, + "balance_loss_clip": 1.04493666, + "balance_loss_mlp": 1.01801693, + "epoch": 0.5533577827210966, + "flos": 19683105937920.0, + "grad_norm": 1.7244282102780994, + "language_loss": 0.7885921, + "learning_rate": 1.7521468822154436e-06, + "loss": 0.81025684, + "num_input_tokens_seen": 99354185, + "step": 4602, + "time_per_iteration": 2.682513475418091 + }, + { + "auxiliary_loss_clip": 0.01144395, + "auxiliary_loss_mlp": 0.01025729, + "balance_loss_clip": 1.04914021, + "balance_loss_mlp": 1.01880026, + "epoch": 0.5534780256117358, + "flos": 32306496076800.0, + "grad_norm": 1.8712243452035837, + "language_loss": 0.7509172, + "learning_rate": 1.751373934173777e-06, + "loss": 0.77261841, + "num_input_tokens_seen": 99376930, + "step": 4603, + "time_per_iteration": 3.29949951171875 + }, + { + "auxiliary_loss_clip": 0.01179194, + "auxiliary_loss_mlp": 0.01030823, + "balance_loss_clip": 1.05010808, + "balance_loss_mlp": 1.02272868, + "epoch": 0.5535982685023748, + "flos": 23222425582080.0, + "grad_norm": 1.903138579963123, + "language_loss": 0.73246706, + "learning_rate": 1.750601023848487e-06, + "loss": 0.75456715, + "num_input_tokens_seen": 99397655, + "step": 4604, + "time_per_iteration": 2.4555492401123047 + }, + { + "auxiliary_loss_clip": 0.01177493, + "auxiliary_loss_mlp": 0.00762447, + "balance_loss_clip": 1.05271983, + "balance_loss_mlp": 1.00128901, + "epoch": 0.5537185113930139, + "flos": 24352534258560.0, + "grad_norm": 2.7988043905452296, + "language_loss": 0.73983741, + "learning_rate": 1.749828151356823e-06, + "loss": 0.75923675, + "num_input_tokens_seen": 99417850, + "step": 4605, + "time_per_iteration": 3.2179343700408936 + }, + { + "auxiliary_loss_clip": 0.01148727, + "auxiliary_loss_mlp": 0.01029102, + "balance_loss_clip": 1.04712141, + "balance_loss_mlp": 1.02183032, + "epoch": 0.553838754283653, + "flos": 23549068886400.0, + "grad_norm": 1.7604272569278347, + "language_loss": 0.75150347, + "learning_rate": 1.7490553168160297e-06, + "loss": 0.77328181, + "num_input_tokens_seen": 99438920, + "step": 4606, + "time_per_iteration": 2.5458977222442627 + }, + { + "auxiliary_loss_clip": 0.01148979, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.04803908, + "balance_loss_mlp": 1.01805472, + "epoch": 0.5539589971742921, + "flos": 17275044205440.0, + "grad_norm": 2.790278733947259, + "language_loss": 0.76491702, + "learning_rate": 1.748282520343345e-06, + "loss": 0.78667045, + "num_input_tokens_seen": 99457950, + "step": 4607, + "time_per_iteration": 3.273638963699341 + }, + { + "auxiliary_loss_clip": 0.01173541, + "auxiliary_loss_mlp": 0.01021957, + "balance_loss_clip": 1.0516026, + "balance_loss_mlp": 1.0140053, + "epoch": 0.5540792400649311, + "flos": 27564169104000.0, + "grad_norm": 2.3887041899908907, + "language_loss": 0.78662729, + "learning_rate": 1.7475097620560023e-06, + "loss": 0.80858225, + "num_input_tokens_seen": 99478015, + "step": 4608, + "time_per_iteration": 2.5686163902282715 + }, + { + "auxiliary_loss_clip": 0.01177708, + "auxiliary_loss_mlp": 0.01024331, + "balance_loss_clip": 1.05182576, + "balance_loss_mlp": 1.01666033, + "epoch": 0.5541994829555702, + "flos": 23878657105920.0, + "grad_norm": 1.7022847625071198, + "language_loss": 0.71005595, + "learning_rate": 1.746737042071228e-06, + "loss": 0.73207641, + "num_input_tokens_seen": 99496520, + "step": 4609, + "time_per_iteration": 2.4606435298919678 + }, + { + "auxiliary_loss_clip": 0.01146641, + "auxiliary_loss_mlp": 0.01025827, + "balance_loss_clip": 1.04832506, + "balance_loss_mlp": 1.01777768, + "epoch": 0.5543197258462094, + "flos": 20115721342080.0, + "grad_norm": 2.002158689208895, + "language_loss": 0.78988367, + "learning_rate": 1.7459643605062424e-06, + "loss": 0.81160831, + "num_input_tokens_seen": 99513780, + "step": 4610, + "time_per_iteration": 2.4958789348602295 + }, + { + "auxiliary_loss_clip": 0.01118482, + "auxiliary_loss_mlp": 0.01024745, + "balance_loss_clip": 1.04634297, + "balance_loss_mlp": 1.01654291, + "epoch": 0.5544399687368484, + "flos": 20916565021440.0, + "grad_norm": 1.6909739449375312, + "language_loss": 0.80469275, + "learning_rate": 1.745191717478262e-06, + "loss": 0.82612497, + "num_input_tokens_seen": 99532360, + "step": 4611, + "time_per_iteration": 2.5813252925872803 + }, + { + "auxiliary_loss_clip": 0.01143317, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.04731727, + "balance_loss_mlp": 1.02188587, + "epoch": 0.5545602116274875, + "flos": 25518661297920.0, + "grad_norm": 1.8920506130727917, + "language_loss": 0.79451752, + "learning_rate": 1.7444191131044948e-06, + "loss": 0.81624949, + "num_input_tokens_seen": 99552635, + "step": 4612, + "time_per_iteration": 2.5282704830169678 + }, + { + "auxiliary_loss_clip": 0.01150909, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.05056906, + "balance_loss_mlp": 1.02162945, + "epoch": 0.5546804545181266, + "flos": 20995568985600.0, + "grad_norm": 1.6506461855743086, + "language_loss": 0.72817957, + "learning_rate": 1.7436465475021456e-06, + "loss": 0.74998975, + "num_input_tokens_seen": 99572685, + "step": 4613, + "time_per_iteration": 2.4971354007720947 + }, + { + "auxiliary_loss_clip": 0.01126191, + "auxiliary_loss_mlp": 0.01023488, + "balance_loss_clip": 1.04600883, + "balance_loss_mlp": 1.0153873, + "epoch": 0.5548006974087657, + "flos": 26833638297600.0, + "grad_norm": 3.087519328632928, + "language_loss": 0.71443623, + "learning_rate": 1.7428740207884111e-06, + "loss": 0.73593301, + "num_input_tokens_seen": 99593565, + "step": 4614, + "time_per_iteration": 2.562157392501831 + }, + { + "auxiliary_loss_clip": 0.01123838, + "auxiliary_loss_mlp": 0.01025109, + "balance_loss_clip": 1.04752147, + "balance_loss_mlp": 1.01741076, + "epoch": 0.5549209402994048, + "flos": 33656414031360.0, + "grad_norm": 1.7239580171589366, + "language_loss": 0.60937649, + "learning_rate": 1.7421015330804833e-06, + "loss": 0.63086593, + "num_input_tokens_seen": 99613485, + "step": 4615, + "time_per_iteration": 2.651582717895508 + }, + { + "auxiliary_loss_clip": 0.01177602, + "auxiliary_loss_mlp": 0.01028618, + "balance_loss_clip": 1.05166554, + "balance_loss_mlp": 1.02084517, + "epoch": 0.5550411831900439, + "flos": 23769524609280.0, + "grad_norm": 2.067286697971353, + "language_loss": 0.72078151, + "learning_rate": 1.7413290844955475e-06, + "loss": 0.74284375, + "num_input_tokens_seen": 99633515, + "step": 4616, + "time_per_iteration": 2.4424355030059814 + }, + { + "auxiliary_loss_clip": 0.01159689, + "auxiliary_loss_mlp": 0.01028402, + "balance_loss_clip": 1.05314314, + "balance_loss_mlp": 1.02055216, + "epoch": 0.555161426080683, + "flos": 21651189978240.0, + "grad_norm": 1.7418605562893292, + "language_loss": 0.78417337, + "learning_rate": 1.7405566751507843e-06, + "loss": 0.80605423, + "num_input_tokens_seen": 99651560, + "step": 4617, + "time_per_iteration": 2.4481215476989746 + }, + { + "auxiliary_loss_clip": 0.0113599, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.04690468, + "balance_loss_mlp": 1.02507019, + "epoch": 0.555281668971322, + "flos": 49563116605440.0, + "grad_norm": 1.4744160163512459, + "language_loss": 0.67407584, + "learning_rate": 1.7397843051633668e-06, + "loss": 0.69576252, + "num_input_tokens_seen": 99674255, + "step": 4618, + "time_per_iteration": 2.7785415649414062 + }, + { + "auxiliary_loss_clip": 0.01159143, + "auxiliary_loss_mlp": 0.01026263, + "balance_loss_clip": 1.04945207, + "balance_loss_mlp": 1.01848495, + "epoch": 0.5554019118619612, + "flos": 20741608851840.0, + "grad_norm": 1.6792519789203675, + "language_loss": 0.71536326, + "learning_rate": 1.739011974650464e-06, + "loss": 0.73721743, + "num_input_tokens_seen": 99693585, + "step": 4619, + "time_per_iteration": 2.4554431438446045 + }, + { + "auxiliary_loss_clip": 0.01125758, + "auxiliary_loss_mlp": 0.01025567, + "balance_loss_clip": 1.04622269, + "balance_loss_mlp": 1.01749659, + "epoch": 0.5555221547526003, + "flos": 25483217552640.0, + "grad_norm": 2.076229126769864, + "language_loss": 0.76736093, + "learning_rate": 1.7382396837292365e-06, + "loss": 0.78887415, + "num_input_tokens_seen": 99714045, + "step": 4620, + "time_per_iteration": 2.6373496055603027 + }, + { + "auxiliary_loss_clip": 0.01179175, + "auxiliary_loss_mlp": 0.01023419, + "balance_loss_clip": 1.05340099, + "balance_loss_mlp": 1.01546156, + "epoch": 0.5556423976432393, + "flos": 21762513204480.0, + "grad_norm": 1.6332792040020054, + "language_loss": 0.73185003, + "learning_rate": 1.737467432516841e-06, + "loss": 0.75387597, + "num_input_tokens_seen": 99734145, + "step": 4621, + "time_per_iteration": 2.4401304721832275 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01022288, + "balance_loss_clip": 1.04525363, + "balance_loss_mlp": 1.01446795, + "epoch": 0.5557626405338785, + "flos": 24900171989760.0, + "grad_norm": 3.188216139676721, + "language_loss": 0.74626482, + "learning_rate": 1.7366952211304274e-06, + "loss": 0.76796591, + "num_input_tokens_seen": 99751990, + "step": 4622, + "time_per_iteration": 2.5241785049438477 + }, + { + "auxiliary_loss_clip": 0.01141504, + "auxiliary_loss_mlp": 0.01026856, + "balance_loss_clip": 1.04660058, + "balance_loss_mlp": 1.01930451, + "epoch": 0.5558828834245175, + "flos": 18697501676160.0, + "grad_norm": 2.088224322898579, + "language_loss": 0.83350098, + "learning_rate": 1.735923049687139e-06, + "loss": 0.85518461, + "num_input_tokens_seen": 99768565, + "step": 4623, + "time_per_iteration": 2.4875526428222656 + }, + { + "auxiliary_loss_clip": 0.01143677, + "auxiliary_loss_mlp": 0.01026, + "balance_loss_clip": 1.04657567, + "balance_loss_mlp": 1.01834631, + "epoch": 0.5560031263151566, + "flos": 27272179445760.0, + "grad_norm": 1.7278471695266278, + "language_loss": 0.7384972, + "learning_rate": 1.7351509183041144e-06, + "loss": 0.76019394, + "num_input_tokens_seen": 99788895, + "step": 4624, + "time_per_iteration": 2.558884620666504 + }, + { + "auxiliary_loss_clip": 0.01179393, + "auxiliary_loss_mlp": 0.01024212, + "balance_loss_clip": 1.05185795, + "balance_loss_mlp": 1.01682138, + "epoch": 0.5561233692057957, + "flos": 23403738458880.0, + "grad_norm": 2.0313211319964846, + "language_loss": 0.7129907, + "learning_rate": 1.7343788270984852e-06, + "loss": 0.73502684, + "num_input_tokens_seen": 99808035, + "step": 4625, + "time_per_iteration": 2.452871084213257 + }, + { + "auxiliary_loss_clip": 0.01148361, + "auxiliary_loss_mlp": 0.01025745, + "balance_loss_clip": 1.05082393, + "balance_loss_mlp": 1.01759684, + "epoch": 0.5562436120964348, + "flos": 37670867804160.0, + "grad_norm": 2.1818553975515864, + "language_loss": 0.74846005, + "learning_rate": 1.7336067761873764e-06, + "loss": 0.77020115, + "num_input_tokens_seen": 99830460, + "step": 4626, + "time_per_iteration": 2.6376688480377197 + }, + { + "auxiliary_loss_clip": 0.01172141, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.05177045, + "balance_loss_mlp": 1.02069879, + "epoch": 0.5563638549870739, + "flos": 25155245445120.0, + "grad_norm": 3.8245415911876894, + "language_loss": 0.76226842, + "learning_rate": 1.7328347656879076e-06, + "loss": 0.78428054, + "num_input_tokens_seen": 99850320, + "step": 4627, + "time_per_iteration": 2.4915060997009277 + }, + { + "auxiliary_loss_clip": 0.01136523, + "auxiliary_loss_mlp": 0.01024418, + "balance_loss_clip": 1.04811668, + "balance_loss_mlp": 1.01624012, + "epoch": 0.556484097877713, + "flos": 13581810783360.0, + "grad_norm": 2.772568116828185, + "language_loss": 0.68051189, + "learning_rate": 1.7320627957171927e-06, + "loss": 0.70212126, + "num_input_tokens_seen": 99864980, + "step": 4628, + "time_per_iteration": 2.479290723800659 + }, + { + "auxiliary_loss_clip": 0.01180633, + "auxiliary_loss_mlp": 0.01023575, + "balance_loss_clip": 1.05402899, + "balance_loss_mlp": 1.01619554, + "epoch": 0.5566043407683521, + "flos": 24681368292480.0, + "grad_norm": 1.8532318623641062, + "language_loss": 0.81241369, + "learning_rate": 1.7312908663923382e-06, + "loss": 0.83445573, + "num_input_tokens_seen": 99881155, + "step": 4629, + "time_per_iteration": 2.4489364624023438 + }, + { + "auxiliary_loss_clip": 0.01157451, + "auxiliary_loss_mlp": 0.01023356, + "balance_loss_clip": 1.04757357, + "balance_loss_mlp": 1.01495469, + "epoch": 0.5567245836589911, + "flos": 20588161950720.0, + "grad_norm": 2.191889183118748, + "language_loss": 0.67016858, + "learning_rate": 1.7305189778304463e-06, + "loss": 0.69197667, + "num_input_tokens_seen": 99899330, + "step": 4630, + "time_per_iteration": 3.195199966430664 + }, + { + "auxiliary_loss_clip": 0.0114991, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.0517031, + "balance_loss_mlp": 1.02220452, + "epoch": 0.5568448265496303, + "flos": 20704189858560.0, + "grad_norm": 1.8268861282210516, + "language_loss": 0.80061841, + "learning_rate": 1.729747130148611e-06, + "loss": 0.82241666, + "num_input_tokens_seen": 99918525, + "step": 4631, + "time_per_iteration": 3.961745262145996 + }, + { + "auxiliary_loss_clip": 0.01142472, + "auxiliary_loss_mlp": 0.01027479, + "balance_loss_clip": 1.04903126, + "balance_loss_mlp": 1.01886594, + "epoch": 0.5569650694402694, + "flos": 25302910256640.0, + "grad_norm": 1.82761154492927, + "language_loss": 0.7725631, + "learning_rate": 1.7289753234639208e-06, + "loss": 0.79426259, + "num_input_tokens_seen": 99937500, + "step": 4632, + "time_per_iteration": 2.59407377243042 + }, + { + "auxiliary_loss_clip": 0.01171277, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.05343485, + "balance_loss_mlp": 1.02067459, + "epoch": 0.5570853123309084, + "flos": 19712623939200.0, + "grad_norm": 1.8107183514106118, + "language_loss": 0.76537532, + "learning_rate": 1.7282035578934592e-06, + "loss": 0.7873776, + "num_input_tokens_seen": 99955665, + "step": 4633, + "time_per_iteration": 2.4517529010772705 + }, + { + "auxiliary_loss_clip": 0.01142923, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.0513078, + "balance_loss_mlp": 1.02647948, + "epoch": 0.5572055552215476, + "flos": 16108091153280.0, + "grad_norm": 2.469881134076473, + "language_loss": 0.78928787, + "learning_rate": 1.727431833554301e-06, + "loss": 0.81105691, + "num_input_tokens_seen": 99974140, + "step": 4634, + "time_per_iteration": 3.235639810562134 + }, + { + "auxiliary_loss_clip": 0.01117126, + "auxiliary_loss_mlp": 0.01024631, + "balance_loss_clip": 1.04698515, + "balance_loss_mlp": 1.01705277, + "epoch": 0.5573257981121866, + "flos": 17128815937920.0, + "grad_norm": 1.9001187744183947, + "language_loss": 0.77241361, + "learning_rate": 1.7266601505635175e-06, + "loss": 0.79383123, + "num_input_tokens_seen": 99991480, + "step": 4635, + "time_per_iteration": 2.5511603355407715 + }, + { + "auxiliary_loss_clip": 0.01165855, + "auxiliary_loss_mlp": 0.01025823, + "balance_loss_clip": 1.05238271, + "balance_loss_mlp": 1.01802087, + "epoch": 0.5574460410028257, + "flos": 18807029222400.0, + "grad_norm": 3.0309163752652766, + "language_loss": 0.75830609, + "learning_rate": 1.7258885090381717e-06, + "loss": 0.78022289, + "num_input_tokens_seen": 100009520, + "step": 4636, + "time_per_iteration": 2.4417805671691895 + }, + { + "auxiliary_loss_clip": 0.01153238, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.04965305, + "balance_loss_mlp": 1.02296519, + "epoch": 0.5575662838934649, + "flos": 29642678530560.0, + "grad_norm": 1.7933941704414211, + "language_loss": 0.78608704, + "learning_rate": 1.7251169090953213e-06, + "loss": 0.80792546, + "num_input_tokens_seen": 100029995, + "step": 4637, + "time_per_iteration": 2.591215133666992 + }, + { + "auxiliary_loss_clip": 0.01164286, + "auxiliary_loss_mlp": 0.01023508, + "balance_loss_clip": 1.04996943, + "balance_loss_mlp": 1.01531863, + "epoch": 0.5576865267841039, + "flos": 22054466949120.0, + "grad_norm": 2.6719965428474155, + "language_loss": 0.76469576, + "learning_rate": 1.7243453508520168e-06, + "loss": 0.78657377, + "num_input_tokens_seen": 100046980, + "step": 4638, + "time_per_iteration": 2.4664485454559326 + }, + { + "auxiliary_loss_clip": 0.0115054, + "auxiliary_loss_mlp": 0.01026878, + "balance_loss_clip": 1.04807615, + "balance_loss_mlp": 1.01873612, + "epoch": 0.557806769674743, + "flos": 17196040241280.0, + "grad_norm": 2.3116679470384804, + "language_loss": 0.84188604, + "learning_rate": 1.7235738344253038e-06, + "loss": 0.86366022, + "num_input_tokens_seen": 100060610, + "step": 4639, + "time_per_iteration": 2.4483301639556885 + }, + { + "auxiliary_loss_clip": 0.0116303, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.05268526, + "balance_loss_mlp": 1.01836014, + "epoch": 0.557927012565382, + "flos": 24712717887360.0, + "grad_norm": 1.800992832323297, + "language_loss": 0.82604277, + "learning_rate": 1.72280235993222e-06, + "loss": 0.84793949, + "num_input_tokens_seen": 100078915, + "step": 4640, + "time_per_iteration": 2.482396125793457 + }, + { + "auxiliary_loss_clip": 0.0116297, + "auxiliary_loss_mlp": 0.00762961, + "balance_loss_clip": 1.05157614, + "balance_loss_mlp": 1.00123668, + "epoch": 0.5580472554560212, + "flos": 16983090460800.0, + "grad_norm": 2.1572937900723432, + "language_loss": 0.69464034, + "learning_rate": 1.722030927489798e-06, + "loss": 0.71389967, + "num_input_tokens_seen": 100096195, + "step": 4641, + "time_per_iteration": 2.4436848163604736 + }, + { + "auxiliary_loss_clip": 0.01140254, + "auxiliary_loss_mlp": 0.01023432, + "balance_loss_clip": 1.05153084, + "balance_loss_mlp": 1.01528454, + "epoch": 0.5581674983466602, + "flos": 23509100027520.0, + "grad_norm": 1.6358170025340917, + "language_loss": 0.74068356, + "learning_rate": 1.7212595372150634e-06, + "loss": 0.76232046, + "num_input_tokens_seen": 100116175, + "step": 4642, + "time_per_iteration": 2.563978672027588 + }, + { + "auxiliary_loss_clip": 0.01180615, + "auxiliary_loss_mlp": 0.01024162, + "balance_loss_clip": 1.05392814, + "balance_loss_mlp": 1.01675344, + "epoch": 0.5582877412372993, + "flos": 13480291969920.0, + "grad_norm": 4.4685392603703455, + "language_loss": 0.7279228, + "learning_rate": 1.720488189225035e-06, + "loss": 0.74997056, + "num_input_tokens_seen": 100133875, + "step": 4643, + "time_per_iteration": 2.41811203956604 + }, + { + "auxiliary_loss_clip": 0.01166383, + "auxiliary_loss_mlp": 0.01025547, + "balance_loss_clip": 1.05065775, + "balance_loss_mlp": 1.01788235, + "epoch": 0.5584079841279385, + "flos": 21903605827200.0, + "grad_norm": 2.2594921116240734, + "language_loss": 0.79374731, + "learning_rate": 1.7197168836367265e-06, + "loss": 0.81566656, + "num_input_tokens_seen": 100150685, + "step": 4644, + "time_per_iteration": 2.4509847164154053 + }, + { + "auxiliary_loss_clip": 0.011611, + "auxiliary_loss_mlp": 0.00762755, + "balance_loss_clip": 1.04900002, + "balance_loss_mlp": 1.00122643, + "epoch": 0.5585282270185775, + "flos": 18843550375680.0, + "grad_norm": 2.4923375219871935, + "language_loss": 0.81837147, + "learning_rate": 1.7189456205671433e-06, + "loss": 0.83761001, + "num_input_tokens_seen": 100169530, + "step": 4645, + "time_per_iteration": 2.489499092102051 + }, + { + "auxiliary_loss_clip": 0.01171568, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.05214882, + "balance_loss_mlp": 1.02043867, + "epoch": 0.5586484699092166, + "flos": 21868449390720.0, + "grad_norm": 1.8382442222580446, + "language_loss": 0.81987512, + "learning_rate": 1.7181744001332866e-06, + "loss": 0.84187698, + "num_input_tokens_seen": 100188140, + "step": 4646, + "time_per_iteration": 2.4710960388183594 + }, + { + "auxiliary_loss_clip": 0.01178871, + "auxiliary_loss_mlp": 0.01025966, + "balance_loss_clip": 1.05414963, + "balance_loss_mlp": 1.01848006, + "epoch": 0.5587687127998557, + "flos": 22893232412160.0, + "grad_norm": 2.30249310634816, + "language_loss": 0.63281792, + "learning_rate": 1.7174032224521493e-06, + "loss": 0.65486634, + "num_input_tokens_seen": 100206850, + "step": 4647, + "time_per_iteration": 2.4334709644317627 + }, + { + "auxiliary_loss_clip": 0.01163721, + "auxiliary_loss_mlp": 0.01028577, + "balance_loss_clip": 1.05053556, + "balance_loss_mlp": 1.02083445, + "epoch": 0.5588889556904948, + "flos": 20303067703680.0, + "grad_norm": 1.5789237342101679, + "language_loss": 0.69585031, + "learning_rate": 1.7166320876407184e-06, + "loss": 0.71777326, + "num_input_tokens_seen": 100226270, + "step": 4648, + "time_per_iteration": 2.456420660018921 + }, + { + "auxiliary_loss_clip": 0.01181627, + "auxiliary_loss_mlp": 0.00762874, + "balance_loss_clip": 1.05398619, + "balance_loss_mlp": 1.00130689, + "epoch": 0.5590091985811338, + "flos": 16472153450880.0, + "grad_norm": 1.8550243371936626, + "language_loss": 0.67794675, + "learning_rate": 1.7158609958159742e-06, + "loss": 0.69739175, + "num_input_tokens_seen": 100243675, + "step": 4649, + "time_per_iteration": 2.3934805393218994 + }, + { + "auxiliary_loss_clip": 0.01114646, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.04356718, + "balance_loss_mlp": 1.0220741, + "epoch": 0.559129441471773, + "flos": 14532186781440.0, + "grad_norm": 2.1141959780225545, + "language_loss": 0.78185719, + "learning_rate": 1.7150899470948911e-06, + "loss": 0.80330086, + "num_input_tokens_seen": 100258940, + "step": 4650, + "time_per_iteration": 2.5468482971191406 + }, + { + "auxiliary_loss_clip": 0.01048659, + "auxiliary_loss_mlp": 0.00999889, + "balance_loss_clip": 1.0112505, + "balance_loss_mlp": 0.99860716, + "epoch": 0.5592496843624121, + "flos": 60521009852160.0, + "grad_norm": 0.7944325161034467, + "language_loss": 0.56597275, + "learning_rate": 1.7143189415944365e-06, + "loss": 0.58645827, + "num_input_tokens_seen": 100323400, + "step": 4651, + "time_per_iteration": 3.1045804023742676 + }, + { + "auxiliary_loss_clip": 0.01163305, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.05182064, + "balance_loss_mlp": 1.01928484, + "epoch": 0.5593699272530511, + "flos": 20886256920960.0, + "grad_norm": 1.789014135466764, + "language_loss": 0.76114094, + "learning_rate": 1.7135479794315714e-06, + "loss": 0.7830503, + "num_input_tokens_seen": 100340355, + "step": 4652, + "time_per_iteration": 2.4496946334838867 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.01024144, + "balance_loss_clip": 1.04780138, + "balance_loss_mlp": 1.01671124, + "epoch": 0.5594901701436903, + "flos": 12896743616640.0, + "grad_norm": 2.382928735101919, + "language_loss": 0.79029369, + "learning_rate": 1.7127770607232502e-06, + "loss": 0.81187594, + "num_input_tokens_seen": 100358900, + "step": 4653, + "time_per_iteration": 2.494659662246704 + }, + { + "auxiliary_loss_clip": 0.01140657, + "auxiliary_loss_mlp": 0.01026191, + "balance_loss_clip": 1.04685199, + "balance_loss_mlp": 1.01812625, + "epoch": 0.5596104130343293, + "flos": 23112107936640.0, + "grad_norm": 2.1604110539713397, + "language_loss": 0.79769027, + "learning_rate": 1.7120061855864204e-06, + "loss": 0.81935871, + "num_input_tokens_seen": 100378910, + "step": 4654, + "time_per_iteration": 2.5278806686401367 + }, + { + "auxiliary_loss_clip": 0.01164998, + "auxiliary_loss_mlp": 0.01029779, + "balance_loss_clip": 1.05293608, + "balance_loss_mlp": 1.02167249, + "epoch": 0.5597306559249684, + "flos": 25957812977280.0, + "grad_norm": 1.945922581615715, + "language_loss": 0.71005988, + "learning_rate": 1.7112353541380233e-06, + "loss": 0.73200762, + "num_input_tokens_seen": 100398770, + "step": 4655, + "time_per_iteration": 2.485323190689087 + }, + { + "auxiliary_loss_clip": 0.01152895, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.05109, + "balance_loss_mlp": 1.02298999, + "epoch": 0.5598508988156076, + "flos": 22492289825280.0, + "grad_norm": 1.6555481675938066, + "language_loss": 0.72134382, + "learning_rate": 1.7104645664949931e-06, + "loss": 0.74319255, + "num_input_tokens_seen": 100421240, + "step": 4656, + "time_per_iteration": 2.5257842540740967 + }, + { + "auxiliary_loss_clip": 0.0115149, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.04695559, + "balance_loss_mlp": 1.02152205, + "epoch": 0.5599711417062466, + "flos": 23112538899840.0, + "grad_norm": 1.7245850430701588, + "language_loss": 0.71564728, + "learning_rate": 1.7096938227742584e-06, + "loss": 0.73746103, + "num_input_tokens_seen": 100442370, + "step": 4657, + "time_per_iteration": 3.3248581886291504 + }, + { + "auxiliary_loss_clip": 0.01178384, + "auxiliary_loss_mlp": 0.01026299, + "balance_loss_clip": 1.05198395, + "balance_loss_mlp": 1.01828849, + "epoch": 0.5600913845968857, + "flos": 22339345714560.0, + "grad_norm": 2.0897749930517873, + "language_loss": 0.84273577, + "learning_rate": 1.70892312309274e-06, + "loss": 0.86478263, + "num_input_tokens_seen": 100460260, + "step": 4658, + "time_per_iteration": 3.30371356010437 + }, + { + "auxiliary_loss_clip": 0.01149262, + "auxiliary_loss_mlp": 0.01024488, + "balance_loss_clip": 1.0431118, + "balance_loss_mlp": 1.0163523, + "epoch": 0.5602116274875248, + "flos": 17633791290240.0, + "grad_norm": 2.07006915050807, + "language_loss": 0.68448025, + "learning_rate": 1.7081524675673523e-06, + "loss": 0.70621777, + "num_input_tokens_seen": 100475750, + "step": 4659, + "time_per_iteration": 3.231497049331665 + }, + { + "auxiliary_loss_clip": 0.01055877, + "auxiliary_loss_mlp": 0.01004483, + "balance_loss_clip": 1.01392221, + "balance_loss_mlp": 1.00319517, + "epoch": 0.5603318703781639, + "flos": 70115945529600.0, + "grad_norm": 0.7723558221770669, + "language_loss": 0.59648657, + "learning_rate": 1.7073818563150026e-06, + "loss": 0.61709011, + "num_input_tokens_seen": 100537830, + "step": 4660, + "time_per_iteration": 3.156444549560547 + }, + { + "auxiliary_loss_clip": 0.01161604, + "auxiliary_loss_mlp": 0.0102343, + "balance_loss_clip": 1.05008686, + "balance_loss_mlp": 1.01549697, + "epoch": 0.560452113268803, + "flos": 18545850455040.0, + "grad_norm": 2.4526237805823055, + "language_loss": 0.86453354, + "learning_rate": 1.7066112894525935e-06, + "loss": 0.88638389, + "num_input_tokens_seen": 100555910, + "step": 4661, + "time_per_iteration": 3.205564260482788 + }, + { + "auxiliary_loss_clip": 0.0114162, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.0477488, + "balance_loss_mlp": 1.02211213, + "epoch": 0.5605723561594421, + "flos": 25264665250560.0, + "grad_norm": 1.8663205881162783, + "language_loss": 0.73001164, + "learning_rate": 1.7058407670970177e-06, + "loss": 0.75172782, + "num_input_tokens_seen": 100577385, + "step": 4662, + "time_per_iteration": 2.5301949977874756 + }, + { + "auxiliary_loss_clip": 0.01168164, + "auxiliary_loss_mlp": 0.01025507, + "balance_loss_clip": 1.04927182, + "balance_loss_mlp": 1.01754999, + "epoch": 0.5606925990500812, + "flos": 20594949621120.0, + "grad_norm": 1.6633536327393899, + "language_loss": 0.60958213, + "learning_rate": 1.7050702893651643e-06, + "loss": 0.63151884, + "num_input_tokens_seen": 100596965, + "step": 4663, + "time_per_iteration": 2.4423880577087402 + }, + { + "auxiliary_loss_clip": 0.01163973, + "auxiliary_loss_mlp": 0.01028179, + "balance_loss_clip": 1.05158448, + "balance_loss_mlp": 1.01999497, + "epoch": 0.5608128419407202, + "flos": 35006044677120.0, + "grad_norm": 2.0910923759290037, + "language_loss": 0.75597966, + "learning_rate": 1.7042998563739134e-06, + "loss": 0.77790117, + "num_input_tokens_seen": 100615315, + "step": 4664, + "time_per_iteration": 2.572443723678589 + }, + { + "auxiliary_loss_clip": 0.01156752, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.04739714, + "balance_loss_mlp": 1.02375388, + "epoch": 0.5609330848313594, + "flos": 24639819235200.0, + "grad_norm": 2.0395682546807694, + "language_loss": 0.716474, + "learning_rate": 1.703529468240139e-06, + "loss": 0.73836362, + "num_input_tokens_seen": 100634185, + "step": 4665, + "time_per_iteration": 2.5149474143981934 + }, + { + "auxiliary_loss_clip": 0.01144318, + "auxiliary_loss_mlp": 0.01025336, + "balance_loss_clip": 1.0500195, + "balance_loss_mlp": 1.01750135, + "epoch": 0.5610533277219985, + "flos": 18762894385920.0, + "grad_norm": 2.066458096447469, + "language_loss": 0.7355262, + "learning_rate": 1.7027591250807088e-06, + "loss": 0.75722277, + "num_input_tokens_seen": 100651360, + "step": 4666, + "time_per_iteration": 2.4796013832092285 + }, + { + "auxiliary_loss_clip": 0.01181412, + "auxiliary_loss_mlp": 0.01026783, + "balance_loss_clip": 1.05430436, + "balance_loss_mlp": 1.01882577, + "epoch": 0.5611735706126375, + "flos": 15012384727680.0, + "grad_norm": 2.1915838076267615, + "language_loss": 0.84476101, + "learning_rate": 1.7019888270124825e-06, + "loss": 0.86684299, + "num_input_tokens_seen": 100668525, + "step": 4667, + "time_per_iteration": 2.4000558853149414 + }, + { + "auxiliary_loss_clip": 0.01169253, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.05293572, + "balance_loss_mlp": 1.02320993, + "epoch": 0.5612938135032767, + "flos": 16468167041280.0, + "grad_norm": 2.1855554333011735, + "language_loss": 0.82077992, + "learning_rate": 1.7012185741523147e-06, + "loss": 0.84278667, + "num_input_tokens_seen": 100684850, + "step": 4668, + "time_per_iteration": 2.4215643405914307 + }, + { + "auxiliary_loss_clip": 0.01179759, + "auxiliary_loss_mlp": 0.01028701, + "balance_loss_clip": 1.05327272, + "balance_loss_mlp": 1.02101815, + "epoch": 0.5614140563939157, + "flos": 25666433850240.0, + "grad_norm": 2.114155762967836, + "language_loss": 0.62516469, + "learning_rate": 1.7004483666170514e-06, + "loss": 0.64724928, + "num_input_tokens_seen": 100705345, + "step": 4669, + "time_per_iteration": 2.4724161624908447 + }, + { + "auxiliary_loss_clip": 0.01162691, + "auxiliary_loss_mlp": 0.01026654, + "balance_loss_clip": 1.04919672, + "balance_loss_mlp": 1.01914406, + "epoch": 0.5615342992845548, + "flos": 24717566223360.0, + "grad_norm": 2.069932812370489, + "language_loss": 0.80347288, + "learning_rate": 1.699678204523533e-06, + "loss": 0.82536638, + "num_input_tokens_seen": 100725210, + "step": 4670, + "time_per_iteration": 2.536151885986328 + }, + { + "auxiliary_loss_clip": 0.01155822, + "auxiliary_loss_mlp": 0.01029405, + "balance_loss_clip": 1.05345583, + "balance_loss_mlp": 1.02053595, + "epoch": 0.5616545421751938, + "flos": 22015934634240.0, + "grad_norm": 3.712135912638353, + "language_loss": 0.68528742, + "learning_rate": 1.6989080879885918e-06, + "loss": 0.70713967, + "num_input_tokens_seen": 100743070, + "step": 4671, + "time_per_iteration": 2.510531187057495 + }, + { + "auxiliary_loss_clip": 0.01042604, + "auxiliary_loss_mlp": 0.01003971, + "balance_loss_clip": 1.01339281, + "balance_loss_mlp": 1.00260627, + "epoch": 0.561774785065833, + "flos": 53760358690560.0, + "grad_norm": 0.9041320054315038, + "language_loss": 0.61006945, + "learning_rate": 1.6981380171290544e-06, + "loss": 0.63053519, + "num_input_tokens_seen": 100804095, + "step": 4672, + "time_per_iteration": 3.0785298347473145 + }, + { + "auxiliary_loss_clip": 0.01144829, + "auxiliary_loss_mlp": 0.01024424, + "balance_loss_clip": 1.04596925, + "balance_loss_mlp": 1.01669073, + "epoch": 0.5618950279564721, + "flos": 19750007018880.0, + "grad_norm": 2.4190737501612736, + "language_loss": 0.7436319, + "learning_rate": 1.6973679920617396e-06, + "loss": 0.76532441, + "num_input_tokens_seen": 100821630, + "step": 4673, + "time_per_iteration": 2.4949254989624023 + }, + { + "auxiliary_loss_clip": 0.01148175, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.05000818, + "balance_loss_mlp": 1.02018332, + "epoch": 0.5620152708471111, + "flos": 16800592435200.0, + "grad_norm": 2.315887276170271, + "language_loss": 0.85282803, + "learning_rate": 1.6965980129034603e-06, + "loss": 0.87459517, + "num_input_tokens_seen": 100839015, + "step": 4674, + "time_per_iteration": 2.4646124839782715 + }, + { + "auxiliary_loss_clip": 0.01154265, + "auxiliary_loss_mlp": 0.01027475, + "balance_loss_clip": 1.05429053, + "balance_loss_mlp": 1.01968503, + "epoch": 0.5621355137377503, + "flos": 26797799502720.0, + "grad_norm": 2.6771485105388604, + "language_loss": 0.7669372, + "learning_rate": 1.6958280797710209e-06, + "loss": 0.78875458, + "num_input_tokens_seen": 100860940, + "step": 4675, + "time_per_iteration": 2.544156312942505 + }, + { + "auxiliary_loss_clip": 0.01052704, + "auxiliary_loss_mlp": 0.01000618, + "balance_loss_clip": 1.01276624, + "balance_loss_mlp": 0.99925882, + "epoch": 0.5622557566283893, + "flos": 61207046686080.0, + "grad_norm": 0.716745126047207, + "language_loss": 0.54697561, + "learning_rate": 1.6950581927812198e-06, + "loss": 0.56750882, + "num_input_tokens_seen": 100920510, + "step": 4676, + "time_per_iteration": 2.9516711235046387 + }, + { + "auxiliary_loss_clip": 0.01164586, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.05118656, + "balance_loss_mlp": 1.01841116, + "epoch": 0.5623759995190284, + "flos": 26468534505600.0, + "grad_norm": 1.9891991139190217, + "language_loss": 0.79363477, + "learning_rate": 1.6942883520508486e-06, + "loss": 0.81554413, + "num_input_tokens_seen": 100939245, + "step": 4677, + "time_per_iteration": 2.5081777572631836 + }, + { + "auxiliary_loss_clip": 0.01167359, + "auxiliary_loss_mlp": 0.01024789, + "balance_loss_clip": 1.052194, + "balance_loss_mlp": 1.01692677, + "epoch": 0.5624962424096676, + "flos": 19390900798080.0, + "grad_norm": 1.9987554328505999, + "language_loss": 0.77541804, + "learning_rate": 1.693518557696691e-06, + "loss": 0.79733956, + "num_input_tokens_seen": 100958385, + "step": 4678, + "time_per_iteration": 2.4426236152648926 + }, + { + "auxiliary_loss_clip": 0.0116115, + "auxiliary_loss_mlp": 0.01026171, + "balance_loss_clip": 1.04806554, + "balance_loss_mlp": 1.01844025, + "epoch": 0.5626164853003066, + "flos": 20667345482880.0, + "grad_norm": 2.2728611625799062, + "language_loss": 0.88982135, + "learning_rate": 1.6927488098355252e-06, + "loss": 0.91169453, + "num_input_tokens_seen": 100976015, + "step": 4679, + "time_per_iteration": 2.460632562637329 + }, + { + "auxiliary_loss_clip": 0.0103868, + "auxiliary_loss_mlp": 0.01002175, + "balance_loss_clip": 1.01287794, + "balance_loss_mlp": 1.00074422, + "epoch": 0.5627367281909457, + "flos": 62766071665920.0, + "grad_norm": 0.9046377187970658, + "language_loss": 0.63174087, + "learning_rate": 1.6919791085841201e-06, + "loss": 0.65214944, + "num_input_tokens_seen": 101033425, + "step": 4680, + "time_per_iteration": 3.073939323425293 + }, + { + "auxiliary_loss_clip": 0.01159318, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.04746914, + "balance_loss_mlp": 1.02325988, + "epoch": 0.5628569710815848, + "flos": 12787144243200.0, + "grad_norm": 2.4097517851742234, + "language_loss": 0.78879994, + "learning_rate": 1.6912094540592396e-06, + "loss": 0.81071264, + "num_input_tokens_seen": 101048945, + "step": 4681, + "time_per_iteration": 2.4411895275115967 + }, + { + "auxiliary_loss_clip": 0.0116393, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.05051935, + "balance_loss_mlp": 1.02212954, + "epoch": 0.5629772139722239, + "flos": 13762082165760.0, + "grad_norm": 3.683443897051642, + "language_loss": 0.80981374, + "learning_rate": 1.6904398463776393e-06, + "loss": 0.83175254, + "num_input_tokens_seen": 101062745, + "step": 4682, + "time_per_iteration": 2.4572901725769043 + }, + { + "auxiliary_loss_clip": 0.01166422, + "auxiliary_loss_mlp": 0.01025758, + "balance_loss_clip": 1.04981017, + "balance_loss_mlp": 1.01782513, + "epoch": 0.5630974568628629, + "flos": 21467830026240.0, + "grad_norm": 1.7204581950473194, + "language_loss": 0.72580683, + "learning_rate": 1.6896702856560683e-06, + "loss": 0.74772859, + "num_input_tokens_seen": 101081840, + "step": 4683, + "time_per_iteration": 2.4706668853759766 + }, + { + "auxiliary_loss_clip": 0.01133607, + "auxiliary_loss_mlp": 0.01023902, + "balance_loss_clip": 1.04370475, + "balance_loss_mlp": 1.01628423, + "epoch": 0.5632176997535021, + "flos": 14245907385600.0, + "grad_norm": 3.5913999696781116, + "language_loss": 0.69461763, + "learning_rate": 1.6889007720112677e-06, + "loss": 0.71619272, + "num_input_tokens_seen": 101099585, + "step": 4684, + "time_per_iteration": 4.0847532749176025 + }, + { + "auxiliary_loss_clip": 0.01168362, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.05300927, + "balance_loss_mlp": 1.02006984, + "epoch": 0.5633379426441412, + "flos": 20812244947200.0, + "grad_norm": 2.9413868336184787, + "language_loss": 0.77393895, + "learning_rate": 1.6881313055599734e-06, + "loss": 0.79589671, + "num_input_tokens_seen": 101119515, + "step": 4685, + "time_per_iteration": 2.4671151638031006 + }, + { + "auxiliary_loss_clip": 0.01136726, + "auxiliary_loss_mlp": 0.01022321, + "balance_loss_clip": 1.04507446, + "balance_loss_mlp": 1.0139997, + "epoch": 0.5634581855347802, + "flos": 22600883617920.0, + "grad_norm": 2.2071013456594373, + "language_loss": 0.82178217, + "learning_rate": 1.6873618864189117e-06, + "loss": 0.8433727, + "num_input_tokens_seen": 101135285, + "step": 4686, + "time_per_iteration": 2.50227427482605 + }, + { + "auxiliary_loss_clip": 0.01164424, + "auxiliary_loss_mlp": 0.01029667, + "balance_loss_clip": 1.04998422, + "balance_loss_mlp": 1.02134061, + "epoch": 0.5635784284254194, + "flos": 21506972872320.0, + "grad_norm": 2.690472594680069, + "language_loss": 0.7787745, + "learning_rate": 1.686592514704803e-06, + "loss": 0.80071545, + "num_input_tokens_seen": 101152680, + "step": 4687, + "time_per_iteration": 3.2375752925872803 + }, + { + "auxiliary_loss_clip": 0.01150064, + "auxiliary_loss_mlp": 0.0102732, + "balance_loss_clip": 1.05248022, + "balance_loss_mlp": 1.01994658, + "epoch": 0.5636986713160584, + "flos": 19827466698240.0, + "grad_norm": 2.1322700367486442, + "language_loss": 0.71066701, + "learning_rate": 1.685823190534361e-06, + "loss": 0.73244083, + "num_input_tokens_seen": 101170920, + "step": 4688, + "time_per_iteration": 2.4821083545684814 + }, + { + "auxiliary_loss_clip": 0.01181627, + "auxiliary_loss_mlp": 0.01024395, + "balance_loss_clip": 1.05256748, + "balance_loss_mlp": 1.01596677, + "epoch": 0.5638189142066975, + "flos": 19792453916160.0, + "grad_norm": 1.9436232200748598, + "language_loss": 0.83922732, + "learning_rate": 1.6850539140242907e-06, + "loss": 0.86128747, + "num_input_tokens_seen": 101190180, + "step": 4689, + "time_per_iteration": 2.43100905418396 + }, + { + "auxiliary_loss_clip": 0.01167598, + "auxiliary_loss_mlp": 0.01028141, + "balance_loss_clip": 1.05032682, + "balance_loss_mlp": 1.02026153, + "epoch": 0.5639391570973367, + "flos": 22893771116160.0, + "grad_norm": 2.2935084770142242, + "language_loss": 0.82412285, + "learning_rate": 1.684284685291292e-06, + "loss": 0.84608024, + "num_input_tokens_seen": 101211825, + "step": 4690, + "time_per_iteration": 2.487133026123047 + }, + { + "auxiliary_loss_clip": 0.01178872, + "auxiliary_loss_mlp": 0.01029071, + "balance_loss_clip": 1.05157757, + "balance_loss_mlp": 1.02095914, + "epoch": 0.5640593999879757, + "flos": 23727077712000.0, + "grad_norm": 2.096781392137312, + "language_loss": 0.81181127, + "learning_rate": 1.683515504452055e-06, + "loss": 0.83389074, + "num_input_tokens_seen": 101229200, + "step": 4691, + "time_per_iteration": 2.4657557010650635 + }, + { + "auxiliary_loss_clip": 0.01128126, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.04560053, + "balance_loss_mlp": 1.02053142, + "epoch": 0.5641796428786148, + "flos": 22710123855360.0, + "grad_norm": 1.5818921153380068, + "language_loss": 0.66415143, + "learning_rate": 1.6827463716232648e-06, + "loss": 0.68572605, + "num_input_tokens_seen": 101249860, + "step": 4692, + "time_per_iteration": 2.54571270942688 + }, + { + "auxiliary_loss_clip": 0.01163033, + "auxiliary_loss_mlp": 0.00762847, + "balance_loss_clip": 1.05014253, + "balance_loss_mlp": 1.00126481, + "epoch": 0.5642998857692539, + "flos": 19791987039360.0, + "grad_norm": 2.705913084203147, + "language_loss": 0.75715393, + "learning_rate": 1.6819772869215972e-06, + "loss": 0.77641273, + "num_input_tokens_seen": 101268940, + "step": 4693, + "time_per_iteration": 2.4550957679748535 + }, + { + "auxiliary_loss_clip": 0.01156466, + "auxiliary_loss_mlp": 0.01028011, + "balance_loss_clip": 1.05114043, + "balance_loss_mlp": 1.02075684, + "epoch": 0.564420128659893, + "flos": 23185904428800.0, + "grad_norm": 1.6528239032040413, + "language_loss": 0.82048059, + "learning_rate": 1.6812082504637228e-06, + "loss": 0.84232533, + "num_input_tokens_seen": 101290260, + "step": 4694, + "time_per_iteration": 2.544856071472168 + }, + { + "auxiliary_loss_clip": 0.0116276, + "auxiliary_loss_mlp": 0.01024658, + "balance_loss_clip": 1.0523746, + "balance_loss_mlp": 1.01679611, + "epoch": 0.564540371550532, + "flos": 23258264376960.0, + "grad_norm": 1.4241583674252727, + "language_loss": 0.74061322, + "learning_rate": 1.6804392623663025e-06, + "loss": 0.76248741, + "num_input_tokens_seen": 101311465, + "step": 4695, + "time_per_iteration": 2.4856653213500977 + }, + { + "auxiliary_loss_clip": 0.01157196, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.0486083, + "balance_loss_mlp": 1.01908326, + "epoch": 0.5646606144411712, + "flos": 25010058672000.0, + "grad_norm": 1.8054465989573296, + "language_loss": 0.7799902, + "learning_rate": 1.6796703227459935e-06, + "loss": 0.8018294, + "num_input_tokens_seen": 101329420, + "step": 4696, + "time_per_iteration": 2.4811413288116455 + }, + { + "auxiliary_loss_clip": 0.01113889, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.04439735, + "balance_loss_mlp": 1.01876271, + "epoch": 0.5647808573318103, + "flos": 36539645806080.0, + "grad_norm": 1.8128082792689815, + "language_loss": 0.75972301, + "learning_rate": 1.6789014317194407e-06, + "loss": 0.78112888, + "num_input_tokens_seen": 101350900, + "step": 4697, + "time_per_iteration": 2.69112491607666 + }, + { + "auxiliary_loss_clip": 0.01158796, + "auxiliary_loss_mlp": 0.01027952, + "balance_loss_clip": 1.05227852, + "balance_loss_mlp": 1.01953554, + "epoch": 0.5649011002224493, + "flos": 22528451842560.0, + "grad_norm": 2.615180054519435, + "language_loss": 0.73211336, + "learning_rate": 1.6781325894032853e-06, + "loss": 0.75398088, + "num_input_tokens_seen": 101369860, + "step": 4698, + "time_per_iteration": 2.5154387950897217 + }, + { + "auxiliary_loss_clip": 0.01144376, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.04947662, + "balance_loss_mlp": 1.02248657, + "epoch": 0.5650213431130885, + "flos": 18515147304960.0, + "grad_norm": 1.9651805124999497, + "language_loss": 0.92237419, + "learning_rate": 1.6773637959141608e-06, + "loss": 0.94412243, + "num_input_tokens_seen": 101386835, + "step": 4699, + "time_per_iteration": 2.4696266651153564 + }, + { + "auxiliary_loss_clip": 0.01141279, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.04733789, + "balance_loss_mlp": 1.02018428, + "epoch": 0.5651415860037275, + "flos": 17526310819200.0, + "grad_norm": 3.1201835118736456, + "language_loss": 0.66509497, + "learning_rate": 1.6765950513686915e-06, + "loss": 0.68678677, + "num_input_tokens_seen": 101404945, + "step": 4700, + "time_per_iteration": 2.507430076599121 + }, + { + "auxiliary_loss_clip": 0.01122254, + "auxiliary_loss_mlp": 0.0103112, + "balance_loss_clip": 1.04245353, + "balance_loss_mlp": 1.02282941, + "epoch": 0.5652618288943666, + "flos": 25520026014720.0, + "grad_norm": 1.5991248676872118, + "language_loss": 0.76055312, + "learning_rate": 1.675826355883496e-06, + "loss": 0.78208691, + "num_input_tokens_seen": 101424160, + "step": 4701, + "time_per_iteration": 2.589725971221924 + }, + { + "auxiliary_loss_clip": 0.01143749, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.04879856, + "balance_loss_mlp": 1.02116632, + "epoch": 0.5653820717850057, + "flos": 19683105937920.0, + "grad_norm": 1.937214821636951, + "language_loss": 0.79020429, + "learning_rate": 1.6750577095751848e-06, + "loss": 0.81193662, + "num_input_tokens_seen": 101443270, + "step": 4702, + "time_per_iteration": 2.4899916648864746 + }, + { + "auxiliary_loss_clip": 0.01174733, + "auxiliary_loss_mlp": 0.0102652, + "balance_loss_clip": 1.05011225, + "balance_loss_mlp": 1.01884329, + "epoch": 0.5655023146756448, + "flos": 26979722910720.0, + "grad_norm": 1.723417358329252, + "language_loss": 0.72735012, + "learning_rate": 1.6742891125603605e-06, + "loss": 0.74936265, + "num_input_tokens_seen": 101464175, + "step": 4703, + "time_per_iteration": 2.4604766368865967 + }, + { + "auxiliary_loss_clip": 0.01162433, + "auxiliary_loss_mlp": 0.01026497, + "balance_loss_clip": 1.05052757, + "balance_loss_mlp": 1.01802742, + "epoch": 0.5656225575662839, + "flos": 27669351104640.0, + "grad_norm": 2.2031283443751897, + "language_loss": 0.72311223, + "learning_rate": 1.6735205649556185e-06, + "loss": 0.74500155, + "num_input_tokens_seen": 101484045, + "step": 4704, + "time_per_iteration": 2.519509792327881 + }, + { + "auxiliary_loss_clip": 0.01139675, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.04803503, + "balance_loss_mlp": 1.02122128, + "epoch": 0.5657428004569229, + "flos": 24349732997760.0, + "grad_norm": 1.6107948758205743, + "language_loss": 0.84687668, + "learning_rate": 1.6727520668775476e-06, + "loss": 0.86856127, + "num_input_tokens_seen": 101504330, + "step": 4705, + "time_per_iteration": 2.566103219985962 + }, + { + "auxiliary_loss_clip": 0.01180306, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.05137444, + "balance_loss_mlp": 1.01937413, + "epoch": 0.5658630433475621, + "flos": 21944041562880.0, + "grad_norm": 1.9010156565144691, + "language_loss": 0.74943292, + "learning_rate": 1.6719836184427275e-06, + "loss": 0.77151251, + "num_input_tokens_seen": 101524635, + "step": 4706, + "time_per_iteration": 2.4601480960845947 + }, + { + "auxiliary_loss_clip": 0.01148465, + "auxiliary_loss_mlp": 0.01022608, + "balance_loss_clip": 1.04791367, + "balance_loss_mlp": 1.01529467, + "epoch": 0.5659832862382012, + "flos": 30409012218240.0, + "grad_norm": 2.351188343695549, + "language_loss": 0.64401412, + "learning_rate": 1.671215219767733e-06, + "loss": 0.66572487, + "num_input_tokens_seen": 101544095, + "step": 4707, + "time_per_iteration": 2.5945775508880615 + }, + { + "auxiliary_loss_clip": 0.01123239, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.04593182, + "balance_loss_mlp": 1.02425051, + "epoch": 0.5661035291288402, + "flos": 13188194570880.0, + "grad_norm": 2.2461837585079527, + "language_loss": 0.76758611, + "learning_rate": 1.670446870969127e-06, + "loss": 0.78914261, + "num_input_tokens_seen": 101561760, + "step": 4708, + "time_per_iteration": 2.6257483959198 + }, + { + "auxiliary_loss_clip": 0.01155711, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.05097795, + "balance_loss_mlp": 1.02191341, + "epoch": 0.5662237720194794, + "flos": 16143032108160.0, + "grad_norm": 2.9404507307898675, + "language_loss": 0.79747182, + "learning_rate": 1.6696785721634685e-06, + "loss": 0.81932849, + "num_input_tokens_seen": 101576245, + "step": 4709, + "time_per_iteration": 2.4589450359344482 + }, + { + "auxiliary_loss_clip": 0.01166689, + "auxiliary_loss_mlp": 0.01032597, + "balance_loss_clip": 1.05086994, + "balance_loss_mlp": 1.02414536, + "epoch": 0.5663440149101184, + "flos": 17676848718720.0, + "grad_norm": 2.5192132968063596, + "language_loss": 0.73744118, + "learning_rate": 1.6689103234673086e-06, + "loss": 0.75943404, + "num_input_tokens_seen": 101594565, + "step": 4710, + "time_per_iteration": 3.2003731727600098 + }, + { + "auxiliary_loss_clip": 0.01149821, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.05110788, + "balance_loss_mlp": 1.02275658, + "epoch": 0.5664642578007575, + "flos": 23368330627200.0, + "grad_norm": 2.3255376062743722, + "language_loss": 0.77060366, + "learning_rate": 1.668142124997189e-06, + "loss": 0.79241121, + "num_input_tokens_seen": 101614225, + "step": 4711, + "time_per_iteration": 3.3055849075317383 + }, + { + "auxiliary_loss_clip": 0.01048776, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.0178659, + "balance_loss_mlp": 1.02836657, + "epoch": 0.5665845006913967, + "flos": 65516470945920.0, + "grad_norm": 0.8439039124629718, + "language_loss": 0.59833205, + "learning_rate": 1.6673739768696453e-06, + "loss": 0.61911476, + "num_input_tokens_seen": 101680795, + "step": 4712, + "time_per_iteration": 3.086622476577759 + }, + { + "auxiliary_loss_clip": 0.01155985, + "auxiliary_loss_mlp": 0.01023774, + "balance_loss_clip": 1.04821777, + "balance_loss_mlp": 1.01560211, + "epoch": 0.5667047435820357, + "flos": 26140885620480.0, + "grad_norm": 1.8030319117114844, + "language_loss": 0.77329987, + "learning_rate": 1.6666058792012052e-06, + "loss": 0.79509747, + "num_input_tokens_seen": 101701680, + "step": 4713, + "time_per_iteration": 3.311448812484741 + }, + { + "auxiliary_loss_clip": 0.01067221, + "auxiliary_loss_mlp": 0.0101005, + "balance_loss_clip": 1.01440489, + "balance_loss_mlp": 1.00883985, + "epoch": 0.5668249864726748, + "flos": 71866949725440.0, + "grad_norm": 0.8774411211925013, + "language_loss": 0.68760413, + "learning_rate": 1.6658378321083878e-06, + "loss": 0.70837682, + "num_input_tokens_seen": 101766010, + "step": 4714, + "time_per_iteration": 3.073148012161255 + }, + { + "auxiliary_loss_clip": 0.01114009, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.04532969, + "balance_loss_mlp": 1.02100444, + "epoch": 0.5669452293633139, + "flos": 22195667312640.0, + "grad_norm": 1.6931444497036245, + "language_loss": 0.82757401, + "learning_rate": 1.6650698357077055e-06, + "loss": 0.84899926, + "num_input_tokens_seen": 101783055, + "step": 4715, + "time_per_iteration": 2.589167594909668 + }, + { + "auxiliary_loss_clip": 0.01156925, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.04993463, + "balance_loss_mlp": 1.02219892, + "epoch": 0.567065472253953, + "flos": 18223193560320.0, + "grad_norm": 3.837701957501254, + "language_loss": 0.80410385, + "learning_rate": 1.6643018901156632e-06, + "loss": 0.82598031, + "num_input_tokens_seen": 101802150, + "step": 4716, + "time_per_iteration": 2.482914924621582 + }, + { + "auxiliary_loss_clip": 0.01156376, + "auxiliary_loss_mlp": 0.01026025, + "balance_loss_clip": 1.05057371, + "balance_loss_mlp": 1.01832366, + "epoch": 0.567185715144592, + "flos": 20371548983040.0, + "grad_norm": 2.164445742416022, + "language_loss": 0.79092544, + "learning_rate": 1.6635339954487566e-06, + "loss": 0.81274945, + "num_input_tokens_seen": 101818025, + "step": 4717, + "time_per_iteration": 2.474604368209839 + }, + { + "auxiliary_loss_clip": 0.01154886, + "auxiliary_loss_mlp": 0.01024271, + "balance_loss_clip": 1.04927349, + "balance_loss_mlp": 1.01631355, + "epoch": 0.5673059580352312, + "flos": 23221348174080.0, + "grad_norm": 2.1193780463339826, + "language_loss": 0.82061094, + "learning_rate": 1.6627661518234765e-06, + "loss": 0.84240252, + "num_input_tokens_seen": 101837280, + "step": 4718, + "time_per_iteration": 2.5784542560577393 + }, + { + "auxiliary_loss_clip": 0.01125104, + "auxiliary_loss_mlp": 0.01025804, + "balance_loss_clip": 1.04882193, + "balance_loss_mlp": 1.01740599, + "epoch": 0.5674262009258703, + "flos": 21719599430400.0, + "grad_norm": 2.3692863326686364, + "language_loss": 0.85504961, + "learning_rate": 1.661998359356302e-06, + "loss": 0.87655866, + "num_input_tokens_seen": 101856310, + "step": 4719, + "time_per_iteration": 2.6915876865386963 + }, + { + "auxiliary_loss_clip": 0.01075698, + "auxiliary_loss_mlp": 0.01001974, + "balance_loss_clip": 1.01434827, + "balance_loss_mlp": 1.00084138, + "epoch": 0.5675464438165093, + "flos": 67470369114240.0, + "grad_norm": 0.7542435849764898, + "language_loss": 0.55803704, + "learning_rate": 1.6612306181637077e-06, + "loss": 0.57881379, + "num_input_tokens_seen": 101915635, + "step": 4720, + "time_per_iteration": 3.0268001556396484 + }, + { + "auxiliary_loss_clip": 0.01134074, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.04697049, + "balance_loss_mlp": 1.02430761, + "epoch": 0.5676666867071485, + "flos": 18879173688960.0, + "grad_norm": 2.622430361957945, + "language_loss": 0.65129197, + "learning_rate": 1.6604629283621598e-06, + "loss": 0.67295367, + "num_input_tokens_seen": 101933565, + "step": 4721, + "time_per_iteration": 2.5401721000671387 + }, + { + "auxiliary_loss_clip": 0.01180299, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.05110943, + "balance_loss_mlp": 1.02333117, + "epoch": 0.5677869295977875, + "flos": 33546778744320.0, + "grad_norm": 1.7494325458327467, + "language_loss": 0.74635738, + "learning_rate": 1.6596952900681152e-06, + "loss": 0.76847684, + "num_input_tokens_seen": 101954325, + "step": 4722, + "time_per_iteration": 2.5252292156219482 + }, + { + "auxiliary_loss_clip": 0.01113679, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.04810596, + "balance_loss_mlp": 1.02227902, + "epoch": 0.5679071724884266, + "flos": 28037256157440.0, + "grad_norm": 2.294410673028093, + "language_loss": 0.82003719, + "learning_rate": 1.658927703398025e-06, + "loss": 0.84148091, + "num_input_tokens_seen": 101974390, + "step": 4723, + "time_per_iteration": 2.586087703704834 + }, + { + "auxiliary_loss_clip": 0.01120277, + "auxiliary_loss_mlp": 0.01023861, + "balance_loss_clip": 1.04018915, + "balance_loss_mlp": 1.01593924, + "epoch": 0.5680274153790658, + "flos": 23550110380800.0, + "grad_norm": 2.289296673886538, + "language_loss": 0.77565455, + "learning_rate": 1.6581601684683309e-06, + "loss": 0.79709589, + "num_input_tokens_seen": 101994815, + "step": 4724, + "time_per_iteration": 2.5751078128814697 + }, + { + "auxiliary_loss_clip": 0.01166561, + "auxiliary_loss_mlp": 0.01025974, + "balance_loss_clip": 1.05218458, + "balance_loss_mlp": 1.01862502, + "epoch": 0.5681476582697048, + "flos": 22455158140800.0, + "grad_norm": 4.028297881491654, + "language_loss": 0.68340653, + "learning_rate": 1.6573926853954674e-06, + "loss": 0.70533192, + "num_input_tokens_seen": 102012400, + "step": 4725, + "time_per_iteration": 2.487100601196289 + }, + { + "auxiliary_loss_clip": 0.01141876, + "auxiliary_loss_mlp": 0.01023114, + "balance_loss_clip": 1.04407167, + "balance_loss_mlp": 1.01499927, + "epoch": 0.5682679011603439, + "flos": 19536913584000.0, + "grad_norm": 1.7670230949344006, + "language_loss": 0.83417797, + "learning_rate": 1.6566252542958608e-06, + "loss": 0.85582793, + "num_input_tokens_seen": 102031900, + "step": 4726, + "time_per_iteration": 2.486768960952759 + }, + { + "auxiliary_loss_clip": 0.0113053, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.04738903, + "balance_loss_mlp": 1.02293801, + "epoch": 0.568388144050983, + "flos": 28765488493440.0, + "grad_norm": 1.9050240902120072, + "language_loss": 0.78392768, + "learning_rate": 1.6558578752859305e-06, + "loss": 0.80554175, + "num_input_tokens_seen": 102050860, + "step": 4727, + "time_per_iteration": 2.604414939880371 + }, + { + "auxiliary_loss_clip": 0.01134221, + "auxiliary_loss_mlp": 0.01026417, + "balance_loss_clip": 1.04615045, + "balance_loss_mlp": 1.01884103, + "epoch": 0.5685083869416221, + "flos": 21209452519680.0, + "grad_norm": 2.0271926319801645, + "language_loss": 0.78670901, + "learning_rate": 1.6550905484820865e-06, + "loss": 0.80831534, + "num_input_tokens_seen": 102069320, + "step": 4728, + "time_per_iteration": 2.5325100421905518 + }, + { + "auxiliary_loss_clip": 0.01179726, + "auxiliary_loss_mlp": 0.01026815, + "balance_loss_clip": 1.05115581, + "balance_loss_mlp": 1.01846993, + "epoch": 0.5686286298322611, + "flos": 24827021942400.0, + "grad_norm": 2.279336091815224, + "language_loss": 0.78782046, + "learning_rate": 1.6543232740007328e-06, + "loss": 0.80988586, + "num_input_tokens_seen": 102086435, + "step": 4729, + "time_per_iteration": 2.4647839069366455 + }, + { + "auxiliary_loss_clip": 0.01165646, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.049914, + "balance_loss_mlp": 1.02137113, + "epoch": 0.5687488727229003, + "flos": 26615121909120.0, + "grad_norm": 2.414104358390535, + "language_loss": 0.66998875, + "learning_rate": 1.653556051958263e-06, + "loss": 0.69194001, + "num_input_tokens_seen": 102106115, + "step": 4730, + "time_per_iteration": 2.513744592666626 + }, + { + "auxiliary_loss_clip": 0.01089584, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.04144263, + "balance_loss_mlp": 1.02142227, + "epoch": 0.5688691156135394, + "flos": 20808725414400.0, + "grad_norm": 1.8786999735008687, + "language_loss": 0.73835713, + "learning_rate": 1.6527888824710642e-06, + "loss": 0.75954962, + "num_input_tokens_seen": 102125715, + "step": 4731, + "time_per_iteration": 2.5955007076263428 + }, + { + "auxiliary_loss_clip": 0.0112935, + "auxiliary_loss_mlp": 0.01027908, + "balance_loss_clip": 1.04419208, + "balance_loss_mlp": 1.01949763, + "epoch": 0.5689893585041784, + "flos": 25880963829120.0, + "grad_norm": 2.7558946006154588, + "language_loss": 0.76692182, + "learning_rate": 1.6520217656555166e-06, + "loss": 0.78849441, + "num_input_tokens_seen": 102145005, + "step": 4732, + "time_per_iteration": 2.5926756858825684 + }, + { + "auxiliary_loss_clip": 0.01137361, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.04551268, + "balance_loss_mlp": 1.0239346, + "epoch": 0.5691096013948175, + "flos": 23477463123840.0, + "grad_norm": 1.5764552971283643, + "language_loss": 0.70861709, + "learning_rate": 1.65125470162799e-06, + "loss": 0.73031151, + "num_input_tokens_seen": 102165360, + "step": 4733, + "time_per_iteration": 2.5186827182769775 + }, + { + "auxiliary_loss_clip": 0.01136594, + "auxiliary_loss_mlp": 0.01028015, + "balance_loss_clip": 1.04384506, + "balance_loss_mlp": 1.02018321, + "epoch": 0.5692298442854566, + "flos": 18075600576000.0, + "grad_norm": 2.0068118851325787, + "language_loss": 0.69780123, + "learning_rate": 1.6504876905048485e-06, + "loss": 0.71944731, + "num_input_tokens_seen": 102182320, + "step": 4734, + "time_per_iteration": 2.488260269165039 + }, + { + "auxiliary_loss_clip": 0.01178635, + "auxiliary_loss_mlp": 0.01026414, + "balance_loss_clip": 1.05333519, + "balance_loss_mlp": 1.01898146, + "epoch": 0.5693500871760957, + "flos": 23039317025280.0, + "grad_norm": 1.6257309428356341, + "language_loss": 0.72073722, + "learning_rate": 1.6497207324024464e-06, + "loss": 0.74278772, + "num_input_tokens_seen": 102201220, + "step": 4735, + "time_per_iteration": 2.4590208530426025 + }, + { + "auxiliary_loss_clip": 0.01154981, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.04688287, + "balance_loss_mlp": 1.02157807, + "epoch": 0.5694703300667348, + "flos": 18989670902400.0, + "grad_norm": 2.05330237272868, + "language_loss": 0.82425755, + "learning_rate": 1.6489538274371305e-06, + "loss": 0.84610152, + "num_input_tokens_seen": 102219825, + "step": 4736, + "time_per_iteration": 2.4734771251678467 + }, + { + "auxiliary_loss_clip": 0.01158545, + "auxiliary_loss_mlp": 0.01027079, + "balance_loss_clip": 1.05137956, + "balance_loss_mlp": 1.019521, + "epoch": 0.5695905729573739, + "flos": 21908705558400.0, + "grad_norm": 1.9336716046972786, + "language_loss": 0.83447582, + "learning_rate": 1.6481869757252396e-06, + "loss": 0.85633206, + "num_input_tokens_seen": 102238160, + "step": 4737, + "time_per_iteration": 3.254087209701538 + }, + { + "auxiliary_loss_clip": 0.0116453, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.05153668, + "balance_loss_mlp": 1.02181077, + "epoch": 0.569710815848013, + "flos": 28476659232000.0, + "grad_norm": 1.417138221813094, + "language_loss": 0.72001028, + "learning_rate": 1.647420177383105e-06, + "loss": 0.74194896, + "num_input_tokens_seen": 102261030, + "step": 4738, + "time_per_iteration": 3.3280420303344727 + }, + { + "auxiliary_loss_clip": 0.01161071, + "auxiliary_loss_mlp": 0.01023711, + "balance_loss_clip": 1.05341029, + "balance_loss_mlp": 1.0161202, + "epoch": 0.569831058738652, + "flos": 28366162018560.0, + "grad_norm": 1.842389674719541, + "language_loss": 0.72472912, + "learning_rate": 1.646653432527049e-06, + "loss": 0.74657696, + "num_input_tokens_seen": 102281670, + "step": 4739, + "time_per_iteration": 2.6036858558654785 + }, + { + "auxiliary_loss_clip": 0.01136879, + "auxiliary_loss_mlp": 0.01027757, + "balance_loss_clip": 1.04724777, + "balance_loss_mlp": 1.02019334, + "epoch": 0.5699513016292912, + "flos": 25849973370240.0, + "grad_norm": 1.448831420836524, + "language_loss": 0.74462652, + "learning_rate": 1.645886741273387e-06, + "loss": 0.7662729, + "num_input_tokens_seen": 102303485, + "step": 4740, + "time_per_iteration": 3.4162135124206543 + }, + { + "auxiliary_loss_clip": 0.01132014, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.05009019, + "balance_loss_mlp": 1.02396131, + "epoch": 0.5700715445199303, + "flos": 18037858360320.0, + "grad_norm": 4.044285280649749, + "language_loss": 0.73816085, + "learning_rate": 1.645120103738424e-06, + "loss": 0.75980067, + "num_input_tokens_seen": 102320995, + "step": 4741, + "time_per_iteration": 2.623206615447998 + }, + { + "auxiliary_loss_clip": 0.01152233, + "auxiliary_loss_mlp": 0.00762514, + "balance_loss_clip": 1.04781568, + "balance_loss_mlp": 1.00142491, + "epoch": 0.5701917874105693, + "flos": 11473352392320.0, + "grad_norm": 2.948052143048771, + "language_loss": 0.83643115, + "learning_rate": 1.6443535200384591e-06, + "loss": 0.85557866, + "num_input_tokens_seen": 102339170, + "step": 4742, + "time_per_iteration": 2.516386032104492 + }, + { + "auxiliary_loss_clip": 0.01179345, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.05286169, + "balance_loss_mlp": 1.02220893, + "epoch": 0.5703120303012085, + "flos": 21761759018880.0, + "grad_norm": 1.988500020221854, + "language_loss": 0.70761931, + "learning_rate": 1.6435869902897827e-06, + "loss": 0.7297141, + "num_input_tokens_seen": 102357750, + "step": 4743, + "time_per_iteration": 2.4395453929901123 + }, + { + "auxiliary_loss_clip": 0.01042133, + "auxiliary_loss_mlp": 0.01001936, + "balance_loss_clip": 1.01416707, + "balance_loss_mlp": 1.00076199, + "epoch": 0.5704322731918475, + "flos": 56746258513920.0, + "grad_norm": 0.7961526151858739, + "language_loss": 0.6201728, + "learning_rate": 1.6428205146086764e-06, + "loss": 0.6406135, + "num_input_tokens_seen": 102419730, + "step": 4744, + "time_per_iteration": 3.1932694911956787 + }, + { + "auxiliary_loss_clip": 0.01155127, + "auxiliary_loss_mlp": 0.01025045, + "balance_loss_clip": 1.0482049, + "balance_loss_mlp": 1.016837, + "epoch": 0.5705525160824866, + "flos": 20741141975040.0, + "grad_norm": 1.4998390789202742, + "language_loss": 0.70392513, + "learning_rate": 1.6420540931114142e-06, + "loss": 0.72572684, + "num_input_tokens_seen": 102440320, + "step": 4745, + "time_per_iteration": 2.522035598754883 + }, + { + "auxiliary_loss_clip": 0.01152754, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_clip": 1.04807115, + "balance_loss_mlp": 1.0339272, + "epoch": 0.5706727589731257, + "flos": 18771262254720.0, + "grad_norm": 1.758806676032609, + "language_loss": 0.79148227, + "learning_rate": 1.6412877259142616e-06, + "loss": 0.81342745, + "num_input_tokens_seen": 102460240, + "step": 4746, + "time_per_iteration": 2.491891622543335 + }, + { + "auxiliary_loss_clip": 0.01149385, + "auxiliary_loss_mlp": 0.01027946, + "balance_loss_clip": 1.05065346, + "balance_loss_mlp": 1.02005446, + "epoch": 0.5707930018637648, + "flos": 27634733372160.0, + "grad_norm": 2.805191075692116, + "language_loss": 0.74195504, + "learning_rate": 1.6405214131334757e-06, + "loss": 0.76372826, + "num_input_tokens_seen": 102478765, + "step": 4747, + "time_per_iteration": 2.5611891746520996 + }, + { + "auxiliary_loss_clip": 0.01119444, + "auxiliary_loss_mlp": 0.01024091, + "balance_loss_clip": 1.04770041, + "balance_loss_mlp": 1.01639009, + "epoch": 0.5709132447544039, + "flos": 27597673514880.0, + "grad_norm": 1.6980927038914666, + "language_loss": 0.79610682, + "learning_rate": 1.6397551548853052e-06, + "loss": 0.81754214, + "num_input_tokens_seen": 102496930, + "step": 4748, + "time_per_iteration": 2.612699270248413 + }, + { + "auxiliary_loss_clip": 0.01148545, + "auxiliary_loss_mlp": 0.01026438, + "balance_loss_clip": 1.04853582, + "balance_loss_mlp": 1.01842141, + "epoch": 0.571033487645043, + "flos": 21686095019520.0, + "grad_norm": 1.9446538779645177, + "language_loss": 0.70769316, + "learning_rate": 1.6389889512859917e-06, + "loss": 0.72944295, + "num_input_tokens_seen": 102516590, + "step": 4749, + "time_per_iteration": 2.511479139328003 + }, + { + "auxiliary_loss_clip": 0.01054592, + "auxiliary_loss_mlp": 0.01001418, + "balance_loss_clip": 1.01660037, + "balance_loss_mlp": 1.00036287, + "epoch": 0.5711537305356821, + "flos": 70181445980160.0, + "grad_norm": 0.8140298812275529, + "language_loss": 0.60378158, + "learning_rate": 1.638222802451767e-06, + "loss": 0.62434173, + "num_input_tokens_seen": 102578070, + "step": 4750, + "time_per_iteration": 3.0952484607696533 + }, + { + "auxiliary_loss_clip": 0.01156071, + "auxiliary_loss_mlp": 0.01025178, + "balance_loss_clip": 1.04957199, + "balance_loss_mlp": 1.01768029, + "epoch": 0.5712739734263211, + "flos": 24717494396160.0, + "grad_norm": 10.039610957226767, + "language_loss": 0.75106359, + "learning_rate": 1.6374567084988561e-06, + "loss": 0.77287608, + "num_input_tokens_seen": 102599255, + "step": 4751, + "time_per_iteration": 2.5089073181152344 + }, + { + "auxiliary_loss_clip": 0.01156213, + "auxiliary_loss_mlp": 0.01025451, + "balance_loss_clip": 1.05252552, + "balance_loss_mlp": 1.01678443, + "epoch": 0.5713942163169603, + "flos": 26578169792640.0, + "grad_norm": 1.749608892899211, + "language_loss": 0.76634848, + "learning_rate": 1.6366906695434738e-06, + "loss": 0.78816515, + "num_input_tokens_seen": 102621775, + "step": 4752, + "time_per_iteration": 2.533428430557251 + }, + { + "auxiliary_loss_clip": 0.01166104, + "auxiliary_loss_mlp": 0.01027521, + "balance_loss_clip": 1.05370581, + "balance_loss_mlp": 1.01999891, + "epoch": 0.5715144592075994, + "flos": 21142443697920.0, + "grad_norm": 2.0809418018430037, + "language_loss": 0.86092305, + "learning_rate": 1.6359246857018275e-06, + "loss": 0.88285923, + "num_input_tokens_seen": 102639305, + "step": 4753, + "time_per_iteration": 2.4618101119995117 + }, + { + "auxiliary_loss_clip": 0.01119681, + "auxiliary_loss_mlp": 0.01025875, + "balance_loss_clip": 1.04407895, + "balance_loss_mlp": 1.018031, + "epoch": 0.5716347020982384, + "flos": 23330265189120.0, + "grad_norm": 2.496869562173045, + "language_loss": 0.78573775, + "learning_rate": 1.6351587570901178e-06, + "loss": 0.80719334, + "num_input_tokens_seen": 102659430, + "step": 4754, + "time_per_iteration": 2.5614097118377686 + }, + { + "auxiliary_loss_clip": 0.01134764, + "auxiliary_loss_mlp": 0.01026516, + "balance_loss_clip": 1.04803789, + "balance_loss_mlp": 1.01899409, + "epoch": 0.5717549449888776, + "flos": 17009555806080.0, + "grad_norm": 2.696120287254758, + "language_loss": 0.76022905, + "learning_rate": 1.634392883824534e-06, + "loss": 0.78184187, + "num_input_tokens_seen": 102671430, + "step": 4755, + "time_per_iteration": 2.4709360599517822 + }, + { + "auxiliary_loss_clip": 0.01123453, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.04521251, + "balance_loss_mlp": 1.01905251, + "epoch": 0.5718751878795166, + "flos": 35518130922240.0, + "grad_norm": 10.795774528172723, + "language_loss": 0.67849779, + "learning_rate": 1.6336270660212595e-06, + "loss": 0.7000013, + "num_input_tokens_seen": 102693025, + "step": 4756, + "time_per_iteration": 2.7105274200439453 + }, + { + "auxiliary_loss_clip": 0.01147182, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.05226445, + "balance_loss_mlp": 1.01831913, + "epoch": 0.5719954307701557, + "flos": 38613989255040.0, + "grad_norm": 1.9346778888138194, + "language_loss": 0.65510082, + "learning_rate": 1.6328613037964676e-06, + "loss": 0.67683965, + "num_input_tokens_seen": 102716090, + "step": 4757, + "time_per_iteration": 2.6822805404663086 + }, + { + "auxiliary_loss_clip": 0.01162285, + "auxiliary_loss_mlp": 0.01024281, + "balance_loss_clip": 1.04933739, + "balance_loss_mlp": 1.01643074, + "epoch": 0.5721156736607949, + "flos": 20631111638400.0, + "grad_norm": 1.9176822554306086, + "language_loss": 0.68109107, + "learning_rate": 1.6320955972663241e-06, + "loss": 0.7029568, + "num_input_tokens_seen": 102735685, + "step": 4758, + "time_per_iteration": 2.4698877334594727 + }, + { + "auxiliary_loss_clip": 0.0116386, + "auxiliary_loss_mlp": 0.01025998, + "balance_loss_clip": 1.04935861, + "balance_loss_mlp": 1.01813006, + "epoch": 0.5722359165514339, + "flos": 37415076076800.0, + "grad_norm": 2.1505438091256037, + "language_loss": 0.65338933, + "learning_rate": 1.6313299465469857e-06, + "loss": 0.67528796, + "num_input_tokens_seen": 102758415, + "step": 4759, + "time_per_iteration": 2.592468738555908 + }, + { + "auxiliary_loss_clip": 0.01160823, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.05115414, + "balance_loss_mlp": 1.0182128, + "epoch": 0.572356159442073, + "flos": 21972877205760.0, + "grad_norm": 2.4637344537144084, + "language_loss": 0.79300392, + "learning_rate": 1.6305643517546014e-06, + "loss": 0.81487912, + "num_input_tokens_seen": 102773795, + "step": 4760, + "time_per_iteration": 2.44144606590271 + }, + { + "auxiliary_loss_clip": 0.01176719, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.05247581, + "balance_loss_mlp": 1.02796745, + "epoch": 0.5724764023327121, + "flos": 19135540033920.0, + "grad_norm": 2.07702692271519, + "language_loss": 0.84394336, + "learning_rate": 1.629798813005311e-06, + "loss": 0.86606467, + "num_input_tokens_seen": 102793515, + "step": 4761, + "time_per_iteration": 2.4338488578796387 + }, + { + "auxiliary_loss_clip": 0.01123568, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.0477612, + "balance_loss_mlp": 1.02295065, + "epoch": 0.5725966452233512, + "flos": 22819759142400.0, + "grad_norm": 1.8706155467865297, + "language_loss": 0.71103954, + "learning_rate": 1.6290333304152473e-06, + "loss": 0.73257929, + "num_input_tokens_seen": 102813390, + "step": 4762, + "time_per_iteration": 2.591215133666992 + }, + { + "auxiliary_loss_clip": 0.01144959, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.05103683, + "balance_loss_mlp": 1.021945, + "epoch": 0.5727168881139902, + "flos": 41496610498560.0, + "grad_norm": 1.8108937796303037, + "language_loss": 0.56554288, + "learning_rate": 1.6282679041005314e-06, + "loss": 0.58729017, + "num_input_tokens_seen": 102838980, + "step": 4763, + "time_per_iteration": 2.6752781867980957 + }, + { + "auxiliary_loss_clip": 0.01140431, + "auxiliary_loss_mlp": 0.01022241, + "balance_loss_clip": 1.04547906, + "balance_loss_mlp": 1.01463509, + "epoch": 0.5728371310046293, + "flos": 14647675985280.0, + "grad_norm": 2.346714782083448, + "language_loss": 0.87149984, + "learning_rate": 1.6275025341772789e-06, + "loss": 0.89312655, + "num_input_tokens_seen": 102855285, + "step": 4764, + "time_per_iteration": 3.9599521160125732 + }, + { + "auxiliary_loss_clip": 0.01151285, + "auxiliary_loss_mlp": 0.01026851, + "balance_loss_clip": 1.0491215, + "balance_loss_mlp": 1.01830339, + "epoch": 0.5729573738952685, + "flos": 21506613736320.0, + "grad_norm": 2.9171452796615323, + "language_loss": 0.81297827, + "learning_rate": 1.626737220761596e-06, + "loss": 0.83475965, + "num_input_tokens_seen": 102872750, + "step": 4765, + "time_per_iteration": 2.515685796737671 + }, + { + "auxiliary_loss_clip": 0.01158858, + "auxiliary_loss_mlp": 0.01024961, + "balance_loss_clip": 1.05012071, + "balance_loss_mlp": 1.01747465, + "epoch": 0.5730776167859075, + "flos": 23621680229760.0, + "grad_norm": 2.149864177813516, + "language_loss": 0.7918247, + "learning_rate": 1.62597196396958e-06, + "loss": 0.81366289, + "num_input_tokens_seen": 102890920, + "step": 4766, + "time_per_iteration": 2.48991322517395 + }, + { + "auxiliary_loss_clip": 0.01161182, + "auxiliary_loss_mlp": 0.01024216, + "balance_loss_clip": 1.04996026, + "balance_loss_mlp": 1.01645541, + "epoch": 0.5731978596765466, + "flos": 25739224761600.0, + "grad_norm": 1.783458550795113, + "language_loss": 0.85797846, + "learning_rate": 1.6252067639173197e-06, + "loss": 0.87983245, + "num_input_tokens_seen": 102912830, + "step": 4767, + "time_per_iteration": 3.2140212059020996 + }, + { + "auxiliary_loss_clip": 0.01163966, + "auxiliary_loss_mlp": 0.01025966, + "balance_loss_clip": 1.05077887, + "balance_loss_mlp": 1.01848853, + "epoch": 0.5733181025671857, + "flos": 26359509749760.0, + "grad_norm": 1.6775686598056248, + "language_loss": 0.6961875, + "learning_rate": 1.6244416207208956e-06, + "loss": 0.71808684, + "num_input_tokens_seen": 102933765, + "step": 4768, + "time_per_iteration": 2.5291740894317627 + }, + { + "auxiliary_loss_clip": 0.01135941, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.04731846, + "balance_loss_mlp": 1.02752006, + "epoch": 0.5734383454578248, + "flos": 29423874833280.0, + "grad_norm": 1.823201346555296, + "language_loss": 0.737333, + "learning_rate": 1.6236765344963787e-06, + "loss": 0.75904357, + "num_input_tokens_seen": 102955025, + "step": 4769, + "time_per_iteration": 2.584160327911377 + }, + { + "auxiliary_loss_clip": 0.01148935, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.0487318, + "balance_loss_mlp": 1.01951599, + "epoch": 0.5735585883484638, + "flos": 34969954487040.0, + "grad_norm": 2.3460551067676483, + "language_loss": 0.6906749, + "learning_rate": 1.6229115053598322e-06, + "loss": 0.7124365, + "num_input_tokens_seen": 102976780, + "step": 4770, + "time_per_iteration": 2.613410472869873 + }, + { + "auxiliary_loss_clip": 0.01165639, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.05275774, + "balance_loss_mlp": 1.02302933, + "epoch": 0.573678831239103, + "flos": 18770759464320.0, + "grad_norm": 2.094850722794159, + "language_loss": 0.72028673, + "learning_rate": 1.6221465334273108e-06, + "loss": 0.74225259, + "num_input_tokens_seen": 102995990, + "step": 4771, + "time_per_iteration": 2.4523215293884277 + }, + { + "auxiliary_loss_clip": 0.01139612, + "auxiliary_loss_mlp": 0.01025757, + "balance_loss_clip": 1.04691648, + "balance_loss_mlp": 1.01804447, + "epoch": 0.5737990741297421, + "flos": 25702883176320.0, + "grad_norm": 2.443644190262531, + "language_loss": 0.61376095, + "learning_rate": 1.6213816188148593e-06, + "loss": 0.6354146, + "num_input_tokens_seen": 103014695, + "step": 4772, + "time_per_iteration": 2.560901641845703 + }, + { + "auxiliary_loss_clip": 0.01141118, + "auxiliary_loss_mlp": 0.01024735, + "balance_loss_clip": 1.05012047, + "balance_loss_mlp": 1.01723361, + "epoch": 0.5739193170203811, + "flos": 27269234530560.0, + "grad_norm": 1.8317061373280756, + "language_loss": 0.77008665, + "learning_rate": 1.6206167616385162e-06, + "loss": 0.79174519, + "num_input_tokens_seen": 103035760, + "step": 4773, + "time_per_iteration": 2.5362040996551514 + }, + { + "auxiliary_loss_clip": 0.01156914, + "auxiliary_loss_mlp": 0.01026963, + "balance_loss_clip": 1.05165577, + "balance_loss_mlp": 1.01882112, + "epoch": 0.5740395599110203, + "flos": 12239721993600.0, + "grad_norm": 1.974479321389316, + "language_loss": 0.73608738, + "learning_rate": 1.6198519620143078e-06, + "loss": 0.75792617, + "num_input_tokens_seen": 103052915, + "step": 4774, + "time_per_iteration": 2.482557535171509 + }, + { + "auxiliary_loss_clip": 0.01138453, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.0485723, + "balance_loss_mlp": 1.02445507, + "epoch": 0.5741598028016593, + "flos": 25921399564800.0, + "grad_norm": 1.626535366636689, + "language_loss": 0.78211379, + "learning_rate": 1.6190872200582546e-06, + "loss": 0.80381823, + "num_input_tokens_seen": 103074655, + "step": 4775, + "time_per_iteration": 2.5674030780792236 + }, + { + "auxiliary_loss_clip": 0.01142532, + "auxiliary_loss_mlp": 0.00762331, + "balance_loss_clip": 1.04703796, + "balance_loss_mlp": 1.00119162, + "epoch": 0.5742800456922984, + "flos": 19244133826560.0, + "grad_norm": 2.1214594326655973, + "language_loss": 0.7816956, + "learning_rate": 1.6183225358863676e-06, + "loss": 0.8007443, + "num_input_tokens_seen": 103091550, + "step": 4776, + "time_per_iteration": 2.494978904724121 + }, + { + "auxiliary_loss_clip": 0.01140777, + "auxiliary_loss_mlp": 0.01025319, + "balance_loss_clip": 1.04679, + "balance_loss_mlp": 1.0170753, + "epoch": 0.5744002885829376, + "flos": 30920487932160.0, + "grad_norm": 2.436413159088932, + "language_loss": 0.7226969, + "learning_rate": 1.617557909614648e-06, + "loss": 0.74435782, + "num_input_tokens_seen": 103110985, + "step": 4777, + "time_per_iteration": 2.5549447536468506 + }, + { + "auxiliary_loss_clip": 0.01131587, + "auxiliary_loss_mlp": 0.01025834, + "balance_loss_clip": 1.04460192, + "balance_loss_mlp": 1.01833868, + "epoch": 0.5745205314735766, + "flos": 23840017050240.0, + "grad_norm": 1.7964546609436236, + "language_loss": 0.85841793, + "learning_rate": 1.6167933413590899e-06, + "loss": 0.87999213, + "num_input_tokens_seen": 103129890, + "step": 4778, + "time_per_iteration": 2.5591347217559814 + }, + { + "auxiliary_loss_clip": 0.01160514, + "auxiliary_loss_mlp": 0.01028952, + "balance_loss_clip": 1.04877818, + "balance_loss_mlp": 1.02126288, + "epoch": 0.5746407743642157, + "flos": 12311902373760.0, + "grad_norm": 2.598303800187379, + "language_loss": 0.90778565, + "learning_rate": 1.6160288312356773e-06, + "loss": 0.92968035, + "num_input_tokens_seen": 103147020, + "step": 4779, + "time_per_iteration": 2.4304749965667725 + }, + { + "auxiliary_loss_clip": 0.0116608, + "auxiliary_loss_mlp": 0.01026516, + "balance_loss_clip": 1.04956961, + "balance_loss_mlp": 1.01867247, + "epoch": 0.5747610172548548, + "flos": 24133658734080.0, + "grad_norm": 1.6974161039080173, + "language_loss": 0.81723058, + "learning_rate": 1.6152643793603857e-06, + "loss": 0.83915651, + "num_input_tokens_seen": 103167370, + "step": 4780, + "time_per_iteration": 2.490206003189087 + }, + { + "auxiliary_loss_clip": 0.01176573, + "auxiliary_loss_mlp": 0.01026594, + "balance_loss_clip": 1.05202603, + "balance_loss_mlp": 1.01859522, + "epoch": 0.5748812601454939, + "flos": 25408451393280.0, + "grad_norm": 2.0982978635464504, + "language_loss": 0.8765645, + "learning_rate": 1.6144999858491815e-06, + "loss": 0.89859617, + "num_input_tokens_seen": 103186000, + "step": 4781, + "time_per_iteration": 2.5028223991394043 + }, + { + "auxiliary_loss_clip": 0.01153577, + "auxiliary_loss_mlp": 0.01025297, + "balance_loss_clip": 1.04857635, + "balance_loss_mlp": 1.01717854, + "epoch": 0.575001503036133, + "flos": 30624942827520.0, + "grad_norm": 1.6651714664109416, + "language_loss": 0.85681707, + "learning_rate": 1.6137356508180232e-06, + "loss": 0.87860584, + "num_input_tokens_seen": 103207710, + "step": 4782, + "time_per_iteration": 2.5728635787963867 + }, + { + "auxiliary_loss_clip": 0.01177333, + "auxiliary_loss_mlp": 0.00761948, + "balance_loss_clip": 1.0507443, + "balance_loss_mlp": 1.0012989, + "epoch": 0.5751217459267721, + "flos": 21726566668800.0, + "grad_norm": 1.7393297187094996, + "language_loss": 0.81245881, + "learning_rate": 1.6129713743828593e-06, + "loss": 0.8318516, + "num_input_tokens_seen": 103226720, + "step": 4783, + "time_per_iteration": 2.447719097137451 + }, + { + "auxiliary_loss_clip": 0.01149067, + "auxiliary_loss_mlp": 0.01025796, + "balance_loss_clip": 1.04632282, + "balance_loss_mlp": 1.01871252, + "epoch": 0.5752419888174112, + "flos": 21651620941440.0, + "grad_norm": 1.4620369626673007, + "language_loss": 0.7550593, + "learning_rate": 1.6122071566596306e-06, + "loss": 0.77680796, + "num_input_tokens_seen": 103246995, + "step": 4784, + "time_per_iteration": 2.507991313934326 + }, + { + "auxiliary_loss_clip": 0.01164301, + "auxiliary_loss_mlp": 0.0102715, + "balance_loss_clip": 1.05003011, + "balance_loss_mlp": 1.01894236, + "epoch": 0.5753622317080502, + "flos": 17775997234560.0, + "grad_norm": 2.1283264637476007, + "language_loss": 0.83136547, + "learning_rate": 1.6114429977642674e-06, + "loss": 0.85327995, + "num_input_tokens_seen": 103261500, + "step": 4785, + "time_per_iteration": 2.4208357334136963 + }, + { + "auxiliary_loss_clip": 0.01165005, + "auxiliary_loss_mlp": 0.01028438, + "balance_loss_clip": 1.052791, + "balance_loss_mlp": 1.02117491, + "epoch": 0.5754824745986894, + "flos": 19789616741760.0, + "grad_norm": 1.7966550692055163, + "language_loss": 0.73637027, + "learning_rate": 1.6106788978126926e-06, + "loss": 0.75830472, + "num_input_tokens_seen": 103280475, + "step": 4786, + "time_per_iteration": 2.482895851135254 + }, + { + "auxiliary_loss_clip": 0.01115946, + "auxiliary_loss_mlp": 0.01030738, + "balance_loss_clip": 1.04170692, + "balance_loss_mlp": 1.02253604, + "epoch": 0.5756027174893285, + "flos": 30985665160320.0, + "grad_norm": 2.2004484191779756, + "language_loss": 0.78964955, + "learning_rate": 1.6099148569208196e-06, + "loss": 0.8111164, + "num_input_tokens_seen": 103297695, + "step": 4787, + "time_per_iteration": 2.6224002838134766 + }, + { + "auxiliary_loss_clip": 0.01148954, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.05031443, + "balance_loss_mlp": 1.02235484, + "epoch": 0.5757229603799675, + "flos": 28546864364160.0, + "grad_norm": 2.2269929768668435, + "language_loss": 0.63013512, + "learning_rate": 1.6091508752045523e-06, + "loss": 0.6519317, + "num_input_tokens_seen": 103318575, + "step": 4788, + "time_per_iteration": 2.5476508140563965 + }, + { + "auxiliary_loss_clip": 0.01125274, + "auxiliary_loss_mlp": 0.01023952, + "balance_loss_clip": 1.0433507, + "balance_loss_mlp": 1.0164206, + "epoch": 0.5758432032706067, + "flos": 22999024944000.0, + "grad_norm": 1.6317154874099573, + "language_loss": 0.86491358, + "learning_rate": 1.608386952779787e-06, + "loss": 0.88640583, + "num_input_tokens_seen": 103337945, + "step": 4789, + "time_per_iteration": 2.5418102741241455 + }, + { + "auxiliary_loss_clip": 0.01155673, + "auxiliary_loss_mlp": 0.01027904, + "balance_loss_clip": 1.05062437, + "balance_loss_mlp": 1.02066827, + "epoch": 0.5759634461612457, + "flos": 25739727552000.0, + "grad_norm": 1.6241717272159866, + "language_loss": 0.74805444, + "learning_rate": 1.6076230897624098e-06, + "loss": 0.76989025, + "num_input_tokens_seen": 103360150, + "step": 4790, + "time_per_iteration": 3.330305576324463 + }, + { + "auxiliary_loss_clip": 0.01163925, + "auxiliary_loss_mlp": 0.01029319, + "balance_loss_clip": 1.04864287, + "balance_loss_mlp": 1.02113485, + "epoch": 0.5760836890518848, + "flos": 30591761639040.0, + "grad_norm": 3.3930244716306643, + "language_loss": 0.77110356, + "learning_rate": 1.6068592862682974e-06, + "loss": 0.79303598, + "num_input_tokens_seen": 103378305, + "step": 4791, + "time_per_iteration": 3.346389055252075 + }, + { + "auxiliary_loss_clip": 0.01151431, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.04953051, + "balance_loss_mlp": 1.01936698, + "epoch": 0.576203931942524, + "flos": 36538963447680.0, + "grad_norm": 2.3933584174453184, + "language_loss": 0.73489869, + "learning_rate": 1.6060955424133187e-06, + "loss": 0.75668252, + "num_input_tokens_seen": 103399230, + "step": 4792, + "time_per_iteration": 2.6239607334136963 + }, + { + "auxiliary_loss_clip": 0.01162196, + "auxiliary_loss_mlp": 0.01024593, + "balance_loss_clip": 1.0513413, + "balance_loss_mlp": 1.01633787, + "epoch": 0.576324174833163, + "flos": 25516937445120.0, + "grad_norm": 1.7155350517711399, + "language_loss": 0.89256418, + "learning_rate": 1.6053318583133332e-06, + "loss": 0.91443205, + "num_input_tokens_seen": 103420100, + "step": 4793, + "time_per_iteration": 3.306173086166382 + }, + { + "auxiliary_loss_clip": 0.01161379, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.0502044, + "balance_loss_mlp": 1.02085352, + "epoch": 0.5764444177238021, + "flos": 25119262995840.0, + "grad_norm": 2.039704664537911, + "language_loss": 0.74903131, + "learning_rate": 1.6045682340841907e-06, + "loss": 0.77093267, + "num_input_tokens_seen": 103439025, + "step": 4794, + "time_per_iteration": 2.498257875442505 + }, + { + "auxiliary_loss_clip": 0.01052133, + "auxiliary_loss_mlp": 0.00753044, + "balance_loss_clip": 1.02073085, + "balance_loss_mlp": 1.00091863, + "epoch": 0.5765646606144411, + "flos": 62212687758720.0, + "grad_norm": 0.7571701001839047, + "language_loss": 0.58063412, + "learning_rate": 1.6038046698417336e-06, + "loss": 0.59868592, + "num_input_tokens_seen": 103499920, + "step": 4795, + "time_per_iteration": 3.0904600620269775 + }, + { + "auxiliary_loss_clip": 0.01162379, + "auxiliary_loss_mlp": 0.01024695, + "balance_loss_clip": 1.04988813, + "balance_loss_mlp": 1.01720273, + "epoch": 0.5766849035050803, + "flos": 25118760205440.0, + "grad_norm": 8.434707351242077, + "language_loss": 0.68954772, + "learning_rate": 1.6030411657017919e-06, + "loss": 0.71141851, + "num_input_tokens_seen": 103519575, + "step": 4796, + "time_per_iteration": 2.5002939701080322 + }, + { + "auxiliary_loss_clip": 0.01154796, + "auxiliary_loss_mlp": 0.01024325, + "balance_loss_clip": 1.04893398, + "balance_loss_mlp": 1.01688635, + "epoch": 0.5768051463957193, + "flos": 15991093578240.0, + "grad_norm": 2.024960526169027, + "language_loss": 0.84323651, + "learning_rate": 1.6022777217801903e-06, + "loss": 0.86502773, + "num_input_tokens_seen": 103536530, + "step": 4797, + "time_per_iteration": 2.4384193420410156 + }, + { + "auxiliary_loss_clip": 0.01135759, + "auxiliary_loss_mlp": 0.01020618, + "balance_loss_clip": 1.05027306, + "balance_loss_mlp": 1.01317, + "epoch": 0.5769253892863584, + "flos": 22163635359360.0, + "grad_norm": 1.9005906558249475, + "language_loss": 0.73948812, + "learning_rate": 1.601514338192742e-06, + "loss": 0.76105189, + "num_input_tokens_seen": 103556460, + "step": 4798, + "time_per_iteration": 2.5485994815826416 + }, + { + "auxiliary_loss_clip": 0.01172541, + "auxiliary_loss_mlp": 0.01022944, + "balance_loss_clip": 1.0504874, + "balance_loss_mlp": 1.01624155, + "epoch": 0.5770456321769976, + "flos": 22856388036480.0, + "grad_norm": 2.145844483352257, + "language_loss": 0.71334958, + "learning_rate": 1.6007510150552514e-06, + "loss": 0.73530442, + "num_input_tokens_seen": 103574520, + "step": 4799, + "time_per_iteration": 2.4311187267303467 + }, + { + "auxiliary_loss_clip": 0.01166768, + "auxiliary_loss_mlp": 0.01025913, + "balance_loss_clip": 1.04939318, + "balance_loss_mlp": 1.01768732, + "epoch": 0.5771658750676366, + "flos": 46353672489600.0, + "grad_norm": 1.524221912001253, + "language_loss": 0.62208259, + "learning_rate": 1.599987752483515e-06, + "loss": 0.64400941, + "num_input_tokens_seen": 103598965, + "step": 4800, + "time_per_iteration": 2.6790549755096436 + }, + { + "auxiliary_loss_clip": 0.01130185, + "auxiliary_loss_mlp": 0.01027353, + "balance_loss_clip": 1.04570234, + "balance_loss_mlp": 1.0198729, + "epoch": 0.5772861179582757, + "flos": 22159972172160.0, + "grad_norm": 1.7762656907374095, + "language_loss": 0.67955375, + "learning_rate": 1.5992245505933184e-06, + "loss": 0.7011292, + "num_input_tokens_seen": 103618665, + "step": 4801, + "time_per_iteration": 2.534660816192627 + }, + { + "auxiliary_loss_clip": 0.01178337, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.05283499, + "balance_loss_mlp": 1.02021527, + "epoch": 0.5774063608489148, + "flos": 31248926916480.0, + "grad_norm": 2.067980660126704, + "language_loss": 0.70986176, + "learning_rate": 1.5984614095004388e-06, + "loss": 0.73191696, + "num_input_tokens_seen": 103639800, + "step": 4802, + "time_per_iteration": 2.5059187412261963 + }, + { + "auxiliary_loss_clip": 0.01156933, + "auxiliary_loss_mlp": 0.01027642, + "balance_loss_clip": 1.0495168, + "balance_loss_mlp": 1.02020943, + "epoch": 0.5775266037395539, + "flos": 22527123039360.0, + "grad_norm": 2.353957475063669, + "language_loss": 0.81039661, + "learning_rate": 1.5976983293206438e-06, + "loss": 0.83224237, + "num_input_tokens_seen": 103655605, + "step": 4803, + "time_per_iteration": 2.4617269039154053 + }, + { + "auxiliary_loss_clip": 0.01143536, + "auxiliary_loss_mlp": 0.01024249, + "balance_loss_clip": 1.04529154, + "balance_loss_mlp": 1.01678693, + "epoch": 0.577646846630193, + "flos": 21068790860160.0, + "grad_norm": 2.1976059942446255, + "language_loss": 0.7138983, + "learning_rate": 1.5969353101696928e-06, + "loss": 0.73557615, + "num_input_tokens_seen": 103674045, + "step": 4804, + "time_per_iteration": 2.4971988201141357 + }, + { + "auxiliary_loss_clip": 0.0116203, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.04950333, + "balance_loss_mlp": 1.02098203, + "epoch": 0.5777670895208321, + "flos": 29714284293120.0, + "grad_norm": 1.864206055592077, + "language_loss": 0.79860455, + "learning_rate": 1.5961723521633341e-06, + "loss": 0.82050741, + "num_input_tokens_seen": 103695285, + "step": 4805, + "time_per_iteration": 2.5336010456085205 + }, + { + "auxiliary_loss_clip": 0.01144607, + "auxiliary_loss_mlp": 0.01028069, + "balance_loss_clip": 1.04682469, + "balance_loss_mlp": 1.02035606, + "epoch": 0.5778873324114712, + "flos": 19500428344320.0, + "grad_norm": 2.3248439475891054, + "language_loss": 0.91001976, + "learning_rate": 1.5954094554173097e-06, + "loss": 0.9317466, + "num_input_tokens_seen": 103713275, + "step": 4806, + "time_per_iteration": 2.531205654144287 + }, + { + "auxiliary_loss_clip": 0.01155208, + "auxiliary_loss_mlp": 0.01024542, + "balance_loss_clip": 1.05124426, + "balance_loss_mlp": 1.01706743, + "epoch": 0.5780075753021102, + "flos": 14136846716160.0, + "grad_norm": 2.1066792500391065, + "language_loss": 0.79062176, + "learning_rate": 1.5946466200473482e-06, + "loss": 0.81241924, + "num_input_tokens_seen": 103731185, + "step": 4807, + "time_per_iteration": 2.573617696762085 + }, + { + "auxiliary_loss_clip": 0.01152739, + "auxiliary_loss_mlp": 0.01030941, + "balance_loss_clip": 1.04883099, + "balance_loss_mlp": 1.02377605, + "epoch": 0.5781278181927494, + "flos": 15262178883840.0, + "grad_norm": 2.074258032513797, + "language_loss": 0.82980978, + "learning_rate": 1.5938838461691723e-06, + "loss": 0.8516466, + "num_input_tokens_seen": 103748095, + "step": 4808, + "time_per_iteration": 2.515230894088745 + }, + { + "auxiliary_loss_clip": 0.01179053, + "auxiliary_loss_mlp": 0.01028739, + "balance_loss_clip": 1.05346584, + "balance_loss_mlp": 1.0213604, + "epoch": 0.5782480610833884, + "flos": 16726831856640.0, + "grad_norm": 3.617787539425044, + "language_loss": 0.82744485, + "learning_rate": 1.593121133898494e-06, + "loss": 0.84952277, + "num_input_tokens_seen": 103765300, + "step": 4809, + "time_per_iteration": 2.406355381011963 + }, + { + "auxiliary_loss_clip": 0.01168351, + "auxiliary_loss_mlp": 0.01027345, + "balance_loss_clip": 1.05068898, + "balance_loss_mlp": 1.02005577, + "epoch": 0.5783683039740275, + "flos": 25482140144640.0, + "grad_norm": 3.6560797765584985, + "language_loss": 0.79248464, + "learning_rate": 1.592358483351016e-06, + "loss": 0.81444168, + "num_input_tokens_seen": 103785475, + "step": 4810, + "time_per_iteration": 2.503455877304077 + }, + { + "auxiliary_loss_clip": 0.01158991, + "auxiliary_loss_mlp": 0.01024084, + "balance_loss_clip": 1.04887128, + "balance_loss_mlp": 1.01708055, + "epoch": 0.5784885468646667, + "flos": 18405835240320.0, + "grad_norm": 1.9581343122777792, + "language_loss": 0.7240845, + "learning_rate": 1.5915958946424326e-06, + "loss": 0.74591523, + "num_input_tokens_seen": 103804160, + "step": 4811, + "time_per_iteration": 2.4340193271636963 + }, + { + "auxiliary_loss_clip": 0.01133451, + "auxiliary_loss_mlp": 0.00762956, + "balance_loss_clip": 1.04584384, + "balance_loss_mlp": 1.00118232, + "epoch": 0.5786087897553057, + "flos": 46100717936640.0, + "grad_norm": 2.043706849012102, + "language_loss": 0.74428153, + "learning_rate": 1.5908333678884271e-06, + "loss": 0.76324558, + "num_input_tokens_seen": 103830580, + "step": 4812, + "time_per_iteration": 2.7701449394226074 + }, + { + "auxiliary_loss_clip": 0.01161057, + "auxiliary_loss_mlp": 0.0102618, + "balance_loss_clip": 1.0500443, + "balance_loss_mlp": 1.01898623, + "epoch": 0.5787290326459448, + "flos": 12385950261120.0, + "grad_norm": 2.2249589834086367, + "language_loss": 0.73933506, + "learning_rate": 1.5900709032046743e-06, + "loss": 0.76120746, + "num_input_tokens_seen": 103848655, + "step": 4813, + "time_per_iteration": 2.432630777359009 + }, + { + "auxiliary_loss_clip": 0.01145666, + "auxiliary_loss_mlp": 0.01023467, + "balance_loss_clip": 1.0511601, + "balance_loss_mlp": 1.01586139, + "epoch": 0.5788492755365839, + "flos": 23290332243840.0, + "grad_norm": 2.079775765922196, + "language_loss": 0.77973688, + "learning_rate": 1.5893085007068391e-06, + "loss": 0.8014282, + "num_input_tokens_seen": 103866215, + "step": 4814, + "time_per_iteration": 2.5248281955718994 + }, + { + "auxiliary_loss_clip": 0.01135989, + "auxiliary_loss_mlp": 0.01027768, + "balance_loss_clip": 1.04432535, + "balance_loss_mlp": 1.01972723, + "epoch": 0.578969518427223, + "flos": 24061047390720.0, + "grad_norm": 1.9614142004037214, + "language_loss": 0.70881563, + "learning_rate": 1.5885461605105786e-06, + "loss": 0.73045325, + "num_input_tokens_seen": 103887815, + "step": 4815, + "time_per_iteration": 2.518522024154663 + }, + { + "auxiliary_loss_clip": 0.01148315, + "auxiliary_loss_mlp": 0.01023924, + "balance_loss_clip": 1.04874825, + "balance_loss_mlp": 1.0162679, + "epoch": 0.579089761317862, + "flos": 21871825269120.0, + "grad_norm": 2.4040625146644974, + "language_loss": 0.76569998, + "learning_rate": 1.5877838827315375e-06, + "loss": 0.7874223, + "num_input_tokens_seen": 103906360, + "step": 4816, + "time_per_iteration": 2.506683826446533 + }, + { + "auxiliary_loss_clip": 0.01177358, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.05288279, + "balance_loss_mlp": 1.01980901, + "epoch": 0.5792100042085012, + "flos": 22929681738240.0, + "grad_norm": 2.026938352491957, + "language_loss": 0.69983202, + "learning_rate": 1.587021667485355e-06, + "loss": 0.72187817, + "num_input_tokens_seen": 103925730, + "step": 4817, + "time_per_iteration": 4.768558740615845 + }, + { + "auxiliary_loss_clip": 0.0114912, + "auxiliary_loss_mlp": 0.01021573, + "balance_loss_clip": 1.04619658, + "balance_loss_mlp": 1.0143373, + "epoch": 0.5793302470991403, + "flos": 21470056669440.0, + "grad_norm": 1.8222037559296234, + "language_loss": 0.78238791, + "learning_rate": 1.5862595148876559e-06, + "loss": 0.80409491, + "num_input_tokens_seen": 103945835, + "step": 4818, + "time_per_iteration": 2.5385515689849854 + }, + { + "auxiliary_loss_clip": 0.01121403, + "auxiliary_loss_mlp": 0.01026698, + "balance_loss_clip": 1.04551196, + "balance_loss_mlp": 1.01901472, + "epoch": 0.5794504899897793, + "flos": 12711013367040.0, + "grad_norm": 2.214161018618058, + "language_loss": 0.76307428, + "learning_rate": 1.58549742505406e-06, + "loss": 0.78455532, + "num_input_tokens_seen": 103960580, + "step": 4819, + "time_per_iteration": 2.50982666015625 + }, + { + "auxiliary_loss_clip": 0.01176904, + "auxiliary_loss_mlp": 0.01023887, + "balance_loss_clip": 1.05130696, + "balance_loss_mlp": 1.01650786, + "epoch": 0.5795707328804185, + "flos": 14867054300160.0, + "grad_norm": 2.3378390401531113, + "language_loss": 0.7580837, + "learning_rate": 1.5847353981001747e-06, + "loss": 0.78009158, + "num_input_tokens_seen": 103977760, + "step": 4820, + "time_per_iteration": 3.1811978816986084 + }, + { + "auxiliary_loss_clip": 0.01141472, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.04564476, + "balance_loss_mlp": 1.02365637, + "epoch": 0.5796909757710575, + "flos": 36430046432640.0, + "grad_norm": 5.674671730575674, + "language_loss": 0.6999774, + "learning_rate": 1.5839734341415993e-06, + "loss": 0.72170603, + "num_input_tokens_seen": 103999960, + "step": 4821, + "time_per_iteration": 2.6223456859588623 + }, + { + "auxiliary_loss_clip": 0.01158339, + "auxiliary_loss_mlp": 0.01024452, + "balance_loss_clip": 1.05382967, + "balance_loss_mlp": 1.01739764, + "epoch": 0.5798112186616966, + "flos": 23039891642880.0, + "grad_norm": 1.7296911852331123, + "language_loss": 0.76449394, + "learning_rate": 1.5832115332939238e-06, + "loss": 0.78632188, + "num_input_tokens_seen": 104018400, + "step": 4822, + "time_per_iteration": 2.4750120639801025 + }, + { + "auxiliary_loss_clip": 0.01166023, + "auxiliary_loss_mlp": 0.01029611, + "balance_loss_clip": 1.05302596, + "balance_loss_mlp": 1.02171338, + "epoch": 0.5799314615523358, + "flos": 16652604401280.0, + "grad_norm": 1.670359903771047, + "language_loss": 0.74511206, + "learning_rate": 1.5824496956727272e-06, + "loss": 0.76706839, + "num_input_tokens_seen": 104035605, + "step": 4823, + "time_per_iteration": 2.439587354660034 + }, + { + "auxiliary_loss_clip": 0.01149497, + "auxiliary_loss_mlp": 0.01026424, + "balance_loss_clip": 1.04900503, + "balance_loss_mlp": 1.01922369, + "epoch": 0.5800517044429748, + "flos": 20485673470080.0, + "grad_norm": 1.7053020437191095, + "language_loss": 0.7345891, + "learning_rate": 1.5816879213935797e-06, + "loss": 0.75634837, + "num_input_tokens_seen": 104054415, + "step": 4824, + "time_per_iteration": 2.4948105812072754 + }, + { + "auxiliary_loss_clip": 0.01159342, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.05046308, + "balance_loss_mlp": 1.022295, + "epoch": 0.5801719473336139, + "flos": 31538258968320.0, + "grad_norm": 1.6565452675706376, + "language_loss": 0.79669303, + "learning_rate": 1.5809262105720416e-06, + "loss": 0.8185783, + "num_input_tokens_seen": 104075455, + "step": 4825, + "time_per_iteration": 2.5448989868164062 + }, + { + "auxiliary_loss_clip": 0.01172598, + "auxiliary_loss_mlp": 0.01026969, + "balance_loss_clip": 1.05044818, + "balance_loss_mlp": 1.0198133, + "epoch": 0.580292190224253, + "flos": 20375966355840.0, + "grad_norm": 1.5174531942093703, + "language_loss": 0.79120767, + "learning_rate": 1.5801645633236644e-06, + "loss": 0.81320333, + "num_input_tokens_seen": 104096440, + "step": 4826, + "time_per_iteration": 2.453003406524658 + }, + { + "auxiliary_loss_clip": 0.01141212, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.04567349, + "balance_loss_mlp": 1.02149475, + "epoch": 0.5804124331148921, + "flos": 26615373304320.0, + "grad_norm": 1.912842916498899, + "language_loss": 0.76950717, + "learning_rate": 1.579402979763989e-06, + "loss": 0.79121029, + "num_input_tokens_seen": 104116775, + "step": 4827, + "time_per_iteration": 2.53704571723938 + }, + { + "auxiliary_loss_clip": 0.0111896, + "auxiliary_loss_mlp": 0.01025318, + "balance_loss_clip": 1.04802489, + "balance_loss_mlp": 1.0177002, + "epoch": 0.5805326760055312, + "flos": 13478496289920.0, + "grad_norm": 2.0718168318577384, + "language_loss": 0.8157407, + "learning_rate": 1.578641460008548e-06, + "loss": 0.83718348, + "num_input_tokens_seen": 104134510, + "step": 4828, + "time_per_iteration": 2.554082155227661 + }, + { + "auxiliary_loss_clip": 0.01161811, + "auxiliary_loss_mlp": 0.01027742, + "balance_loss_clip": 1.05088782, + "balance_loss_mlp": 1.01948678, + "epoch": 0.5806529188961702, + "flos": 12091374823680.0, + "grad_norm": 3.051738059408049, + "language_loss": 0.68094033, + "learning_rate": 1.5778800041728613e-06, + "loss": 0.70283586, + "num_input_tokens_seen": 104150800, + "step": 4829, + "time_per_iteration": 2.4331958293914795 + }, + { + "auxiliary_loss_clip": 0.0115634, + "auxiliary_loss_mlp": 0.01020298, + "balance_loss_clip": 1.04957128, + "balance_loss_mlp": 1.01291609, + "epoch": 0.5807731617868094, + "flos": 26214107495040.0, + "grad_norm": 1.5781380302930974, + "language_loss": 0.66190946, + "learning_rate": 1.577118612372443e-06, + "loss": 0.68367589, + "num_input_tokens_seen": 104172640, + "step": 4830, + "time_per_iteration": 2.5087406635284424 + }, + { + "auxiliary_loss_clip": 0.01139963, + "auxiliary_loss_mlp": 0.00762549, + "balance_loss_clip": 1.04354942, + "balance_loss_mlp": 1.00118089, + "epoch": 0.5808934046774484, + "flos": 37962139190400.0, + "grad_norm": 1.7273917313951135, + "language_loss": 0.7040211, + "learning_rate": 1.5763572847227943e-06, + "loss": 0.72304618, + "num_input_tokens_seen": 104193525, + "step": 4831, + "time_per_iteration": 2.6744751930236816 + }, + { + "auxiliary_loss_clip": 0.01158962, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.04755902, + "balance_loss_mlp": 1.02121234, + "epoch": 0.5810136475680875, + "flos": 20485853038080.0, + "grad_norm": 1.8336976179883167, + "language_loss": 0.81126475, + "learning_rate": 1.5755960213394091e-06, + "loss": 0.83313912, + "num_input_tokens_seen": 104210625, + "step": 4832, + "time_per_iteration": 2.4549496173858643 + }, + { + "auxiliary_loss_clip": 0.01136009, + "auxiliary_loss_mlp": 0.01026499, + "balance_loss_clip": 1.04625475, + "balance_loss_mlp": 1.01940334, + "epoch": 0.5811338904587267, + "flos": 17530153574400.0, + "grad_norm": 1.903145974437711, + "language_loss": 0.78663498, + "learning_rate": 1.5748348223377703e-06, + "loss": 0.80826008, + "num_input_tokens_seen": 104228180, + "step": 4833, + "time_per_iteration": 2.4982309341430664 + }, + { + "auxiliary_loss_clip": 0.01144286, + "auxiliary_loss_mlp": 0.01026597, + "balance_loss_clip": 1.04838908, + "balance_loss_mlp": 1.01967955, + "epoch": 0.5812541333493657, + "flos": 19458017360640.0, + "grad_norm": 1.6426077307059455, + "language_loss": 0.77833772, + "learning_rate": 1.5740736878333507e-06, + "loss": 0.80004656, + "num_input_tokens_seen": 104246020, + "step": 4834, + "time_per_iteration": 2.480544090270996 + }, + { + "auxiliary_loss_clip": 0.01151693, + "auxiliary_loss_mlp": 0.01023266, + "balance_loss_clip": 1.04852915, + "balance_loss_mlp": 1.01562405, + "epoch": 0.5813743762400048, + "flos": 20594949621120.0, + "grad_norm": 2.392839891845154, + "language_loss": 0.78454322, + "learning_rate": 1.5733126179416143e-06, + "loss": 0.80629277, + "num_input_tokens_seen": 104260505, + "step": 4835, + "time_per_iteration": 2.4855239391326904 + }, + { + "auxiliary_loss_clip": 0.01161454, + "auxiliary_loss_mlp": 0.01024375, + "balance_loss_clip": 1.04955578, + "balance_loss_mlp": 1.01709747, + "epoch": 0.5814946191306439, + "flos": 33178227246720.0, + "grad_norm": 1.8489623655655127, + "language_loss": 0.72013807, + "learning_rate": 1.5725516127780137e-06, + "loss": 0.74199635, + "num_input_tokens_seen": 104282640, + "step": 4836, + "time_per_iteration": 2.5563948154449463 + }, + { + "auxiliary_loss_clip": 0.01166034, + "auxiliary_loss_mlp": 0.01029304, + "balance_loss_clip": 1.04791856, + "balance_loss_mlp": 1.02119803, + "epoch": 0.581614862021283, + "flos": 16143283503360.0, + "grad_norm": 2.7635473907536756, + "language_loss": 0.87916625, + "learning_rate": 1.5717906724579943e-06, + "loss": 0.90111965, + "num_input_tokens_seen": 104299700, + "step": 4837, + "time_per_iteration": 2.423619508743286 + }, + { + "auxiliary_loss_clip": 0.01142372, + "auxiliary_loss_mlp": 0.01021939, + "balance_loss_clip": 1.04787886, + "balance_loss_mlp": 1.01475048, + "epoch": 0.581735104911922, + "flos": 33802642298880.0, + "grad_norm": 2.1528301813094237, + "language_loss": 0.68163562, + "learning_rate": 1.571029797096989e-06, + "loss": 0.70327878, + "num_input_tokens_seen": 104320805, + "step": 4838, + "time_per_iteration": 2.6292335987091064 + }, + { + "auxiliary_loss_clip": 0.01174266, + "auxiliary_loss_mlp": 0.0102746, + "balance_loss_clip": 1.05053282, + "balance_loss_mlp": 1.02000356, + "epoch": 0.5818553478025612, + "flos": 23331163029120.0, + "grad_norm": 2.000328930812378, + "language_loss": 0.78893358, + "learning_rate": 1.570268986810423e-06, + "loss": 0.81095082, + "num_input_tokens_seen": 104340700, + "step": 4839, + "time_per_iteration": 2.454040288925171 + }, + { + "auxiliary_loss_clip": 0.01143966, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.04729402, + "balance_loss_mlp": 1.01886845, + "epoch": 0.5819755906932003, + "flos": 20996143603200.0, + "grad_norm": 1.986758233651435, + "language_loss": 0.74689525, + "learning_rate": 1.5695082417137096e-06, + "loss": 0.76859349, + "num_input_tokens_seen": 104358575, + "step": 4840, + "time_per_iteration": 2.4894967079162598 + }, + { + "auxiliary_loss_clip": 0.01143894, + "auxiliary_loss_mlp": 0.01023754, + "balance_loss_clip": 1.04481816, + "balance_loss_mlp": 1.01650584, + "epoch": 0.5820958335838393, + "flos": 21431668008960.0, + "grad_norm": 1.650160760175611, + "language_loss": 0.75228143, + "learning_rate": 1.5687475619222539e-06, + "loss": 0.77395791, + "num_input_tokens_seen": 104378530, + "step": 4841, + "time_per_iteration": 2.530656337738037 + }, + { + "auxiliary_loss_clip": 0.0114123, + "auxiliary_loss_mlp": 0.01024653, + "balance_loss_clip": 1.04447174, + "balance_loss_mlp": 1.01698816, + "epoch": 0.5822160764744785, + "flos": 17967473660160.0, + "grad_norm": 2.3553670243172165, + "language_loss": 0.73453832, + "learning_rate": 1.5679869475514496e-06, + "loss": 0.75619709, + "num_input_tokens_seen": 104395465, + "step": 4842, + "time_per_iteration": 2.4664618968963623 + }, + { + "auxiliary_loss_clip": 0.01162823, + "auxiliary_loss_mlp": 0.01029906, + "balance_loss_clip": 1.04981792, + "balance_loss_mlp": 1.02186513, + "epoch": 0.5823363193651175, + "flos": 23033858158080.0, + "grad_norm": 2.1879075120561544, + "language_loss": 0.81060755, + "learning_rate": 1.567226398716682e-06, + "loss": 0.83253479, + "num_input_tokens_seen": 104415380, + "step": 4843, + "time_per_iteration": 2.480381488800049 + }, + { + "auxiliary_loss_clip": 0.01153594, + "auxiliary_loss_mlp": 0.01022533, + "balance_loss_clip": 1.04747462, + "balance_loss_mlp": 1.01403928, + "epoch": 0.5824565622557566, + "flos": 32891840110080.0, + "grad_norm": 1.7088820028381864, + "language_loss": 0.62013882, + "learning_rate": 1.566465915533326e-06, + "loss": 0.64190006, + "num_input_tokens_seen": 104437410, + "step": 4844, + "time_per_iteration": 4.164912462234497 + }, + { + "auxiliary_loss_clip": 0.01158869, + "auxiliary_loss_mlp": 0.01024263, + "balance_loss_clip": 1.04915357, + "balance_loss_mlp": 1.01674938, + "epoch": 0.5825768051463958, + "flos": 22229674513920.0, + "grad_norm": 1.8304701440139737, + "language_loss": 0.88282841, + "learning_rate": 1.5657054981167458e-06, + "loss": 0.90465975, + "num_input_tokens_seen": 104456305, + "step": 4845, + "time_per_iteration": 2.460355520248413 + }, + { + "auxiliary_loss_clip": 0.01157838, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.04904795, + "balance_loss_mlp": 1.01990223, + "epoch": 0.5826970480370348, + "flos": 28001561016960.0, + "grad_norm": 1.7644108844152218, + "language_loss": 0.67804867, + "learning_rate": 1.5649451465822965e-06, + "loss": 0.69989455, + "num_input_tokens_seen": 104477695, + "step": 4846, + "time_per_iteration": 2.5142552852630615 + }, + { + "auxiliary_loss_clip": 0.01116963, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.04604983, + "balance_loss_mlp": 1.02221584, + "epoch": 0.5828172909276739, + "flos": 17858053854720.0, + "grad_norm": 1.6851326002557172, + "language_loss": 0.83528829, + "learning_rate": 1.5641848610453218e-06, + "loss": 0.85675591, + "num_input_tokens_seen": 104496355, + "step": 4847, + "time_per_iteration": 3.3067264556884766 + }, + { + "auxiliary_loss_clip": 0.01159063, + "auxiliary_loss_mlp": 0.01024097, + "balance_loss_clip": 1.05106485, + "balance_loss_mlp": 1.01667333, + "epoch": 0.582937533818313, + "flos": 19865244827520.0, + "grad_norm": 3.2521088566318697, + "language_loss": 0.85884851, + "learning_rate": 1.563424641621158e-06, + "loss": 0.88068014, + "num_input_tokens_seen": 104515535, + "step": 4848, + "time_per_iteration": 2.4793972969055176 + }, + { + "auxiliary_loss_clip": 0.01151739, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.04837191, + "balance_loss_mlp": 1.01893723, + "epoch": 0.5830577767089521, + "flos": 26870734068480.0, + "grad_norm": 2.180038866418005, + "language_loss": 0.69894463, + "learning_rate": 1.5626644884251282e-06, + "loss": 0.72072929, + "num_input_tokens_seen": 104535055, + "step": 4849, + "time_per_iteration": 2.542965888977051 + }, + { + "auxiliary_loss_clip": 0.01173646, + "auxiliary_loss_mlp": 0.0102272, + "balance_loss_clip": 1.05003214, + "balance_loss_mlp": 1.01587391, + "epoch": 0.5831780195995911, + "flos": 25298205575040.0, + "grad_norm": 1.6171065466975727, + "language_loss": 0.88188386, + "learning_rate": 1.5619044015725488e-06, + "loss": 0.90384758, + "num_input_tokens_seen": 104554745, + "step": 4850, + "time_per_iteration": 2.4659423828125 + }, + { + "auxiliary_loss_clip": 0.01181615, + "auxiliary_loss_mlp": 0.01028653, + "balance_loss_clip": 1.05399036, + "balance_loss_mlp": 1.02053499, + "epoch": 0.5832982624902303, + "flos": 14756988049920.0, + "grad_norm": 2.663013634254192, + "language_loss": 0.87186104, + "learning_rate": 1.5611443811787224e-06, + "loss": 0.89396369, + "num_input_tokens_seen": 104568870, + "step": 4851, + "time_per_iteration": 2.429147481918335 + }, + { + "auxiliary_loss_clip": 0.01159708, + "auxiliary_loss_mlp": 0.01022993, + "balance_loss_clip": 1.04926443, + "balance_loss_mlp": 1.01568222, + "epoch": 0.5834185053808694, + "flos": 20444555376000.0, + "grad_norm": 2.114472287886864, + "language_loss": 0.68741381, + "learning_rate": 1.560384427358945e-06, + "loss": 0.70924085, + "num_input_tokens_seen": 104588415, + "step": 4852, + "time_per_iteration": 2.4506871700286865 + }, + { + "auxiliary_loss_clip": 0.01140831, + "auxiliary_loss_mlp": 0.0102501, + "balance_loss_clip": 1.04354477, + "balance_loss_mlp": 1.01766706, + "epoch": 0.5835387482715084, + "flos": 27200394115200.0, + "grad_norm": 1.4594088053360035, + "language_loss": 0.73288107, + "learning_rate": 1.5596245402284998e-06, + "loss": 0.75453949, + "num_input_tokens_seen": 104611940, + "step": 4853, + "time_per_iteration": 2.563549518585205 + }, + { + "auxiliary_loss_clip": 0.01165873, + "auxiliary_loss_mlp": 0.010261, + "balance_loss_clip": 1.05256307, + "balance_loss_mlp": 1.01845264, + "epoch": 0.5836589911621476, + "flos": 16654615562880.0, + "grad_norm": 1.8258329399430244, + "language_loss": 0.82093441, + "learning_rate": 1.5588647199026619e-06, + "loss": 0.84285414, + "num_input_tokens_seen": 104629675, + "step": 4854, + "time_per_iteration": 2.439039945602417 + }, + { + "auxiliary_loss_clip": 0.01180139, + "auxiliary_loss_mlp": 0.01025515, + "balance_loss_clip": 1.05405402, + "balance_loss_mlp": 1.01764727, + "epoch": 0.5837792340527866, + "flos": 20446817932800.0, + "grad_norm": 2.5267565539855568, + "language_loss": 0.87464964, + "learning_rate": 1.5581049664966956e-06, + "loss": 0.89670616, + "num_input_tokens_seen": 104647435, + "step": 4855, + "time_per_iteration": 2.4132070541381836 + }, + { + "auxiliary_loss_clip": 0.01028306, + "auxiliary_loss_mlp": 0.0100132, + "balance_loss_clip": 1.01587713, + "balance_loss_mlp": 1.00013435, + "epoch": 0.5838994769434257, + "flos": 65995480765440.0, + "grad_norm": 1.0005328024683895, + "language_loss": 0.65092957, + "learning_rate": 1.5573452801258545e-06, + "loss": 0.67122591, + "num_input_tokens_seen": 104694605, + "step": 4856, + "time_per_iteration": 2.9625377655029297 + }, + { + "auxiliary_loss_clip": 0.01165693, + "auxiliary_loss_mlp": 0.01035743, + "balance_loss_clip": 1.04968297, + "balance_loss_mlp": 1.02792311, + "epoch": 0.5840197198340649, + "flos": 21470523546240.0, + "grad_norm": 2.1851135465401055, + "language_loss": 0.6365847, + "learning_rate": 1.5565856609053824e-06, + "loss": 0.65859908, + "num_input_tokens_seen": 104713400, + "step": 4857, + "time_per_iteration": 2.4815542697906494 + }, + { + "auxiliary_loss_clip": 0.01175844, + "auxiliary_loss_mlp": 0.01024092, + "balance_loss_clip": 1.0515101, + "balance_loss_mlp": 1.01639152, + "epoch": 0.5841399627247039, + "flos": 19135144984320.0, + "grad_norm": 1.7611010944519492, + "language_loss": 0.79947835, + "learning_rate": 1.5558261089505127e-06, + "loss": 0.82147765, + "num_input_tokens_seen": 104732130, + "step": 4858, + "time_per_iteration": 2.4609711170196533 + }, + { + "auxiliary_loss_clip": 0.01162301, + "auxiliary_loss_mlp": 0.01024324, + "balance_loss_clip": 1.05104876, + "balance_loss_mlp": 1.0168463, + "epoch": 0.584260205615343, + "flos": 26425692558720.0, + "grad_norm": 2.0510826986766633, + "language_loss": 0.79720932, + "learning_rate": 1.5550666243764697e-06, + "loss": 0.81907552, + "num_input_tokens_seen": 104750290, + "step": 4859, + "time_per_iteration": 2.5539238452911377 + }, + { + "auxiliary_loss_clip": 0.01161741, + "auxiliary_loss_mlp": 0.01027942, + "balance_loss_clip": 1.0491817, + "balance_loss_mlp": 1.0202949, + "epoch": 0.584380448505982, + "flos": 13881809174400.0, + "grad_norm": 2.144471932876283, + "language_loss": 0.77339804, + "learning_rate": 1.554307207298465e-06, + "loss": 0.79529488, + "num_input_tokens_seen": 104768550, + "step": 4860, + "time_per_iteration": 2.432086229324341 + }, + { + "auxiliary_loss_clip": 0.01178942, + "auxiliary_loss_mlp": 0.01028365, + "balance_loss_clip": 1.0526185, + "balance_loss_mlp": 1.02051771, + "epoch": 0.5845006913966212, + "flos": 21543709507200.0, + "grad_norm": 2.3331698985494294, + "language_loss": 0.78386635, + "learning_rate": 1.553547857831704e-06, + "loss": 0.80593938, + "num_input_tokens_seen": 104785060, + "step": 4861, + "time_per_iteration": 2.4311749935150146 + }, + { + "auxiliary_loss_clip": 0.01077284, + "auxiliary_loss_mlp": 0.01001482, + "balance_loss_clip": 1.0166018, + "balance_loss_mlp": 1.000296, + "epoch": 0.5846209342872603, + "flos": 58375452712320.0, + "grad_norm": 0.8829976997818496, + "language_loss": 0.64201421, + "learning_rate": 1.5527885760913771e-06, + "loss": 0.66280186, + "num_input_tokens_seen": 104834950, + "step": 4862, + "time_per_iteration": 2.849613904953003 + }, + { + "auxiliary_loss_clip": 0.01145547, + "auxiliary_loss_mlp": 0.01026946, + "balance_loss_clip": 1.04897285, + "balance_loss_mlp": 1.01981127, + "epoch": 0.5847411771778993, + "flos": 18588045957120.0, + "grad_norm": 1.6428407415445447, + "language_loss": 0.76281846, + "learning_rate": 1.552029362192668e-06, + "loss": 0.7845434, + "num_input_tokens_seen": 104854210, + "step": 4863, + "time_per_iteration": 2.485426425933838 + }, + { + "auxiliary_loss_clip": 0.01128634, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.04543614, + "balance_loss_mlp": 1.02255249, + "epoch": 0.5848614200685385, + "flos": 24240780069120.0, + "grad_norm": 2.048926847342627, + "language_loss": 0.72615778, + "learning_rate": 1.5512702162507478e-06, + "loss": 0.74774212, + "num_input_tokens_seen": 104874525, + "step": 4864, + "time_per_iteration": 2.5534770488739014 + }, + { + "auxiliary_loss_clip": 0.01054259, + "auxiliary_loss_mlp": 0.01002034, + "balance_loss_clip": 1.01394415, + "balance_loss_mlp": 1.00077009, + "epoch": 0.5849816629591775, + "flos": 71660245933440.0, + "grad_norm": 1.1289036865822433, + "language_loss": 0.55850923, + "learning_rate": 1.5505111383807792e-06, + "loss": 0.57907218, + "num_input_tokens_seen": 104937195, + "step": 4865, + "time_per_iteration": 3.1317663192749023 + }, + { + "auxiliary_loss_clip": 0.01121952, + "auxiliary_loss_mlp": 0.01023922, + "balance_loss_clip": 1.04356539, + "balance_loss_mlp": 1.01677215, + "epoch": 0.5851019058498166, + "flos": 23802095266560.0, + "grad_norm": 4.873935567069714, + "language_loss": 0.80443043, + "learning_rate": 1.5497521286979138e-06, + "loss": 0.82588911, + "num_input_tokens_seen": 104957435, + "step": 4866, + "time_per_iteration": 2.591313362121582 + }, + { + "auxiliary_loss_clip": 0.01137909, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.04709709, + "balance_loss_mlp": 1.01888657, + "epoch": 0.5852221487404557, + "flos": 24388516707840.0, + "grad_norm": 2.2837288521437564, + "language_loss": 0.74217761, + "learning_rate": 1.5489931873172927e-06, + "loss": 0.76382858, + "num_input_tokens_seen": 104978755, + "step": 4867, + "time_per_iteration": 2.5704362392425537 + }, + { + "auxiliary_loss_clip": 0.01086643, + "auxiliary_loss_mlp": 0.01024491, + "balance_loss_clip": 1.03742981, + "balance_loss_mlp": 1.01727843, + "epoch": 0.5853423916310948, + "flos": 27271425260160.0, + "grad_norm": 1.656663592663798, + "language_loss": 0.79637551, + "learning_rate": 1.5482343143540467e-06, + "loss": 0.81748688, + "num_input_tokens_seen": 105000020, + "step": 4868, + "time_per_iteration": 2.7131106853485107 + }, + { + "auxiliary_loss_clip": 0.0113305, + "auxiliary_loss_mlp": 0.00762101, + "balance_loss_clip": 1.0451808, + "balance_loss_mlp": 1.00112557, + "epoch": 0.5854626345217339, + "flos": 11983786611840.0, + "grad_norm": 2.36476059656723, + "language_loss": 0.82898134, + "learning_rate": 1.547475509923295e-06, + "loss": 0.84793282, + "num_input_tokens_seen": 105017060, + "step": 4869, + "time_per_iteration": 2.5202643871307373 + }, + { + "auxiliary_loss_clip": 0.01036837, + "auxiliary_loss_mlp": 0.01003778, + "balance_loss_clip": 1.01355934, + "balance_loss_mlp": 1.00256848, + "epoch": 0.585582877412373, + "flos": 64342335173760.0, + "grad_norm": 0.7300492849840746, + "language_loss": 0.56130332, + "learning_rate": 1.5467167741401495e-06, + "loss": 0.58170944, + "num_input_tokens_seen": 105078540, + "step": 4870, + "time_per_iteration": 4.643794775009155 + }, + { + "auxiliary_loss_clip": 0.01143452, + "auxiliary_loss_mlp": 0.01027357, + "balance_loss_clip": 1.04388511, + "balance_loss_mlp": 1.01942337, + "epoch": 0.5857031203030121, + "flos": 17011926103680.0, + "grad_norm": 2.2408241395200728, + "language_loss": 0.70789325, + "learning_rate": 1.5459581071197083e-06, + "loss": 0.72960138, + "num_input_tokens_seen": 105094200, + "step": 4871, + "time_per_iteration": 2.481311798095703 + }, + { + "auxiliary_loss_clip": 0.01165928, + "auxiliary_loss_mlp": 0.01020884, + "balance_loss_clip": 1.05279195, + "balance_loss_mlp": 1.01338577, + "epoch": 0.5858233631936511, + "flos": 20885682303360.0, + "grad_norm": 2.0397517845485673, + "language_loss": 0.82959986, + "learning_rate": 1.5451995089770624e-06, + "loss": 0.85146803, + "num_input_tokens_seen": 105113985, + "step": 4872, + "time_per_iteration": 2.4783623218536377 + }, + { + "auxiliary_loss_clip": 0.01173713, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.05045438, + "balance_loss_mlp": 1.01948667, + "epoch": 0.5859436060842903, + "flos": 23191902000000.0, + "grad_norm": 1.3931178545973788, + "language_loss": 0.71733671, + "learning_rate": 1.5444409798272885e-06, + "loss": 0.73933768, + "num_input_tokens_seen": 105138075, + "step": 4873, + "time_per_iteration": 2.5327072143554688 + }, + { + "auxiliary_loss_clip": 0.01135609, + "auxiliary_loss_mlp": 0.01027256, + "balance_loss_clip": 1.04618526, + "balance_loss_mlp": 1.01973987, + "epoch": 0.5860638489749294, + "flos": 22492648961280.0, + "grad_norm": 2.986432413978774, + "language_loss": 0.80611026, + "learning_rate": 1.543682519785456e-06, + "loss": 0.82773888, + "num_input_tokens_seen": 105156555, + "step": 4874, + "time_per_iteration": 3.289921522140503 + }, + { + "auxiliary_loss_clip": 0.01147919, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.04789782, + "balance_loss_mlp": 1.02425611, + "epoch": 0.5861840918655684, + "flos": 17566243764480.0, + "grad_norm": 2.3370207023439096, + "language_loss": 0.79754114, + "learning_rate": 1.5429241289666219e-06, + "loss": 0.81933427, + "num_input_tokens_seen": 105174055, + "step": 4875, + "time_per_iteration": 2.455665111541748 + }, + { + "auxiliary_loss_clip": 0.0114078, + "auxiliary_loss_mlp": 0.01026412, + "balance_loss_clip": 1.04769087, + "balance_loss_mlp": 1.01925993, + "epoch": 0.5863043347562076, + "flos": 25556152118400.0, + "grad_norm": 2.737271369457767, + "language_loss": 0.6978637, + "learning_rate": 1.5421658074858342e-06, + "loss": 0.71953559, + "num_input_tokens_seen": 105192160, + "step": 4876, + "time_per_iteration": 2.5226802825927734 + }, + { + "auxiliary_loss_clip": 0.01143381, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.04776013, + "balance_loss_mlp": 1.02078438, + "epoch": 0.5864245776468466, + "flos": 20667525050880.0, + "grad_norm": 2.454147490251741, + "language_loss": 0.66288888, + "learning_rate": 1.5414075554581298e-06, + "loss": 0.68460929, + "num_input_tokens_seen": 105210205, + "step": 4877, + "time_per_iteration": 2.48879075050354 + }, + { + "auxiliary_loss_clip": 0.01177447, + "auxiliary_loss_mlp": 0.01025649, + "balance_loss_clip": 1.05099607, + "balance_loss_mlp": 1.01837087, + "epoch": 0.5865448205374857, + "flos": 28913907490560.0, + "grad_norm": 16.576924549508384, + "language_loss": 0.78227496, + "learning_rate": 1.5406493729985348e-06, + "loss": 0.80430591, + "num_input_tokens_seen": 105229400, + "step": 4878, + "time_per_iteration": 2.487704277038574 + }, + { + "auxiliary_loss_clip": 0.01123386, + "auxiliary_loss_mlp": 0.00762333, + "balance_loss_clip": 1.04722214, + "balance_loss_mlp": 1.00089705, + "epoch": 0.5866650634281249, + "flos": 25842575168640.0, + "grad_norm": 3.0220106809837506, + "language_loss": 0.71787912, + "learning_rate": 1.5398912602220644e-06, + "loss": 0.7367363, + "num_input_tokens_seen": 105248675, + "step": 4879, + "time_per_iteration": 2.5864903926849365 + }, + { + "auxiliary_loss_clip": 0.01132562, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.04713106, + "balance_loss_mlp": 1.02105331, + "epoch": 0.5867853063187639, + "flos": 17052325925760.0, + "grad_norm": 2.5787117597511413, + "language_loss": 0.78298151, + "learning_rate": 1.539133217243724e-06, + "loss": 0.80459332, + "num_input_tokens_seen": 105265695, + "step": 4880, + "time_per_iteration": 2.564591884613037 + }, + { + "auxiliary_loss_clip": 0.01137445, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.04609954, + "balance_loss_mlp": 1.02074659, + "epoch": 0.586905549209403, + "flos": 24645026707200.0, + "grad_norm": 2.3303164842754556, + "language_loss": 0.75950807, + "learning_rate": 1.5383752441785081e-06, + "loss": 0.7811721, + "num_input_tokens_seen": 105284920, + "step": 4881, + "time_per_iteration": 2.550807237625122 + }, + { + "auxiliary_loss_clip": 0.01165876, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.04987931, + "balance_loss_mlp": 1.02508736, + "epoch": 0.5870257921000421, + "flos": 14720538723840.0, + "grad_norm": 2.243897656399085, + "language_loss": 0.86097443, + "learning_rate": 1.5376173411414003e-06, + "loss": 0.8829627, + "num_input_tokens_seen": 105302960, + "step": 4882, + "time_per_iteration": 2.450958013534546 + }, + { + "auxiliary_loss_clip": 0.01147566, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.04484606, + "balance_loss_mlp": 1.02257383, + "epoch": 0.5871460349906812, + "flos": 23914998691200.0, + "grad_norm": 1.9001724103411526, + "language_loss": 0.78463399, + "learning_rate": 1.5368595082473753e-06, + "loss": 0.80641508, + "num_input_tokens_seen": 105321260, + "step": 4883, + "time_per_iteration": 2.516343116760254 + }, + { + "auxiliary_loss_clip": 0.01162383, + "auxiliary_loss_mlp": 0.01022786, + "balance_loss_clip": 1.04765415, + "balance_loss_mlp": 1.01556146, + "epoch": 0.5872662778813202, + "flos": 22164174063360.0, + "grad_norm": 1.71956153224816, + "language_loss": 0.78053224, + "learning_rate": 1.5361017456113935e-06, + "loss": 0.80238396, + "num_input_tokens_seen": 105341610, + "step": 4884, + "time_per_iteration": 2.481703758239746 + }, + { + "auxiliary_loss_clip": 0.0116335, + "auxiliary_loss_mlp": 0.01029066, + "balance_loss_clip": 1.04906106, + "balance_loss_mlp": 1.02098322, + "epoch": 0.5873865207719594, + "flos": 18441925430400.0, + "grad_norm": 2.1591138431447674, + "language_loss": 0.85414207, + "learning_rate": 1.5353440533484085e-06, + "loss": 0.87606621, + "num_input_tokens_seen": 105360465, + "step": 4885, + "time_per_iteration": 2.445906162261963 + }, + { + "auxiliary_loss_clip": 0.01150306, + "auxiliary_loss_mlp": 0.01027365, + "balance_loss_clip": 1.04866397, + "balance_loss_mlp": 1.01967049, + "epoch": 0.5875067636625985, + "flos": 54015321427200.0, + "grad_norm": 2.1014831267451966, + "language_loss": 0.66019988, + "learning_rate": 1.534586431573361e-06, + "loss": 0.68197656, + "num_input_tokens_seen": 105385405, + "step": 4886, + "time_per_iteration": 2.7837822437286377 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.01024076, + "balance_loss_clip": 1.04020214, + "balance_loss_mlp": 1.01536727, + "epoch": 0.5876270065532375, + "flos": 27995707100160.0, + "grad_norm": 2.9533006287418617, + "language_loss": 0.79269874, + "learning_rate": 1.5338288804011817e-06, + "loss": 0.81398487, + "num_input_tokens_seen": 105404905, + "step": 4887, + "time_per_iteration": 2.6393370628356934 + }, + { + "auxiliary_loss_clip": 0.01141607, + "auxiliary_loss_mlp": 0.01027892, + "balance_loss_clip": 1.04411435, + "balance_loss_mlp": 1.02000594, + "epoch": 0.5877472494438767, + "flos": 21361462876800.0, + "grad_norm": 2.0618894788974633, + "language_loss": 0.7102468, + "learning_rate": 1.533071399946791e-06, + "loss": 0.73194182, + "num_input_tokens_seen": 105423650, + "step": 4888, + "time_per_iteration": 2.5017189979553223 + }, + { + "auxiliary_loss_clip": 0.01148696, + "auxiliary_loss_mlp": 0.01025463, + "balance_loss_clip": 1.0459758, + "balance_loss_mlp": 1.01830173, + "epoch": 0.5878674923345157, + "flos": 22383013674240.0, + "grad_norm": 1.9749524520042547, + "language_loss": 0.5703544, + "learning_rate": 1.5323139903250977e-06, + "loss": 0.59209603, + "num_input_tokens_seen": 105444255, + "step": 4889, + "time_per_iteration": 2.509035587310791 + }, + { + "auxiliary_loss_clip": 0.01151412, + "auxiliary_loss_mlp": 0.01023566, + "balance_loss_clip": 1.05087233, + "balance_loss_mlp": 1.01616037, + "epoch": 0.5879877352251548, + "flos": 21868664872320.0, + "grad_norm": 1.534255904358635, + "language_loss": 0.7696355, + "learning_rate": 1.5315566516510002e-06, + "loss": 0.79138529, + "num_input_tokens_seen": 105462425, + "step": 4890, + "time_per_iteration": 2.4867095947265625 + }, + { + "auxiliary_loss_clip": 0.0117564, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.05091083, + "balance_loss_mlp": 1.02037764, + "epoch": 0.5881079781157939, + "flos": 17493811989120.0, + "grad_norm": 1.7570551024625367, + "language_loss": 0.67120552, + "learning_rate": 1.5307993840393857e-06, + "loss": 0.69324231, + "num_input_tokens_seen": 105480505, + "step": 4891, + "time_per_iteration": 2.417621612548828 + }, + { + "auxiliary_loss_clip": 0.01175127, + "auxiliary_loss_mlp": 0.01020247, + "balance_loss_clip": 1.05033004, + "balance_loss_mlp": 1.01280856, + "epoch": 0.588228221006433, + "flos": 22601853285120.0, + "grad_norm": 2.051849275658348, + "language_loss": 0.80514133, + "learning_rate": 1.530042187605132e-06, + "loss": 0.82709503, + "num_input_tokens_seen": 105499760, + "step": 4892, + "time_per_iteration": 2.444622278213501 + }, + { + "auxiliary_loss_clip": 0.0116303, + "auxiliary_loss_mlp": 0.00761905, + "balance_loss_clip": 1.05044675, + "balance_loss_mlp": 1.00104129, + "epoch": 0.5883484638970721, + "flos": 26176939896960.0, + "grad_norm": 1.3280551926846358, + "language_loss": 0.84046477, + "learning_rate": 1.5292850624631044e-06, + "loss": 0.85971409, + "num_input_tokens_seen": 105521955, + "step": 4893, + "time_per_iteration": 2.547329902648926 + }, + { + "auxiliary_loss_clip": 0.0115965, + "auxiliary_loss_mlp": 0.01028334, + "balance_loss_clip": 1.05058026, + "balance_loss_mlp": 1.02025199, + "epoch": 0.5884687067877111, + "flos": 30443737691520.0, + "grad_norm": 2.1174305585722326, + "language_loss": 0.80409253, + "learning_rate": 1.5285280087281593e-06, + "loss": 0.82597232, + "num_input_tokens_seen": 105542685, + "step": 4894, + "time_per_iteration": 2.5292441844940186 + }, + { + "auxiliary_loss_clip": 0.01055485, + "auxiliary_loss_mlp": 0.0100052, + "balance_loss_clip": 1.0138309, + "balance_loss_mlp": 0.99935782, + "epoch": 0.5885889496783503, + "flos": 70507550580480.0, + "grad_norm": 0.6398514065162237, + "language_loss": 0.56619537, + "learning_rate": 1.5277710265151398e-06, + "loss": 0.58675539, + "num_input_tokens_seen": 105612165, + "step": 4895, + "time_per_iteration": 3.218963861465454 + }, + { + "auxiliary_loss_clip": 0.0116232, + "auxiliary_loss_mlp": 0.01025719, + "balance_loss_clip": 1.05007935, + "balance_loss_mlp": 1.01767826, + "epoch": 0.5887091925689893, + "flos": 19098767485440.0, + "grad_norm": 2.7011140242999625, + "language_loss": 0.77293372, + "learning_rate": 1.5270141159388803e-06, + "loss": 0.79481411, + "num_input_tokens_seen": 105629185, + "step": 4896, + "time_per_iteration": 3.2395291328430176 + }, + { + "auxiliary_loss_clip": 0.0117542, + "auxiliary_loss_mlp": 0.01022649, + "balance_loss_clip": 1.0497272, + "balance_loss_mlp": 1.01473975, + "epoch": 0.5888294354596284, + "flos": 23294282739840.0, + "grad_norm": 1.9910131908004058, + "language_loss": 0.80161703, + "learning_rate": 1.526257277114203e-06, + "loss": 0.82359779, + "num_input_tokens_seen": 105650260, + "step": 4897, + "time_per_iteration": 3.9544591903686523 + }, + { + "auxiliary_loss_clip": 0.01142685, + "auxiliary_loss_mlp": 0.01023394, + "balance_loss_clip": 1.04765713, + "balance_loss_mlp": 1.01611614, + "epoch": 0.5889496783502676, + "flos": 21981532383360.0, + "grad_norm": 2.053437834130494, + "language_loss": 0.79654169, + "learning_rate": 1.5255005101559201e-06, + "loss": 0.81820244, + "num_input_tokens_seen": 105667870, + "step": 4898, + "time_per_iteration": 2.507202625274658 + }, + { + "auxiliary_loss_clip": 0.01165077, + "auxiliary_loss_mlp": 0.0102417, + "balance_loss_clip": 1.0496124, + "balance_loss_mlp": 1.01688027, + "epoch": 0.5890699212409066, + "flos": 21685233093120.0, + "grad_norm": 1.8666838611901881, + "language_loss": 0.7685293, + "learning_rate": 1.524743815178833e-06, + "loss": 0.79042184, + "num_input_tokens_seen": 105685830, + "step": 4899, + "time_per_iteration": 2.4464263916015625 + }, + { + "auxiliary_loss_clip": 0.01148312, + "auxiliary_loss_mlp": 0.01023251, + "balance_loss_clip": 1.04525852, + "balance_loss_mlp": 1.01594353, + "epoch": 0.5891901641315457, + "flos": 19464553635840.0, + "grad_norm": 1.7759848008338046, + "language_loss": 0.80913925, + "learning_rate": 1.5239871922977315e-06, + "loss": 0.83085489, + "num_input_tokens_seen": 105705745, + "step": 4900, + "time_per_iteration": 3.2370944023132324 + }, + { + "auxiliary_loss_clip": 0.01143719, + "auxiliary_loss_mlp": 0.01030541, + "balance_loss_clip": 1.04366827, + "balance_loss_mlp": 1.0226016, + "epoch": 0.5893104070221848, + "flos": 19609884063360.0, + "grad_norm": 1.935640699196716, + "language_loss": 0.89978021, + "learning_rate": 1.523230641627394e-06, + "loss": 0.92152286, + "num_input_tokens_seen": 105724730, + "step": 4901, + "time_per_iteration": 2.4779961109161377 + }, + { + "auxiliary_loss_clip": 0.01121279, + "auxiliary_loss_mlp": 0.01024787, + "balance_loss_clip": 1.04090261, + "balance_loss_mlp": 1.01747918, + "epoch": 0.5894306499128239, + "flos": 29060063930880.0, + "grad_norm": 2.9537831609396643, + "language_loss": 0.72745502, + "learning_rate": 1.5224741632825888e-06, + "loss": 0.74891567, + "num_input_tokens_seen": 105744920, + "step": 4902, + "time_per_iteration": 2.6248579025268555 + }, + { + "auxiliary_loss_clip": 0.01181235, + "auxiliary_loss_mlp": 0.01025677, + "balance_loss_clip": 1.0539434, + "balance_loss_mlp": 1.0175972, + "epoch": 0.589550892803463, + "flos": 42298890721920.0, + "grad_norm": 1.7401841728155099, + "language_loss": 0.69528282, + "learning_rate": 1.521717757378074e-06, + "loss": 0.71735191, + "num_input_tokens_seen": 105765465, + "step": 4903, + "time_per_iteration": 2.6238644123077393 + }, + { + "auxiliary_loss_clip": 0.01167243, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.05000448, + "balance_loss_mlp": 1.02102041, + "epoch": 0.5896711356941021, + "flos": 14137062197760.0, + "grad_norm": 2.2328469190866085, + "language_loss": 0.69149214, + "learning_rate": 1.5209614240285943e-06, + "loss": 0.71345681, + "num_input_tokens_seen": 105783120, + "step": 4904, + "time_per_iteration": 2.4341540336608887 + }, + { + "auxiliary_loss_clip": 0.0117373, + "auxiliary_loss_mlp": 0.00762469, + "balance_loss_clip": 1.0486002, + "balance_loss_mlp": 1.00110865, + "epoch": 0.5897913785847412, + "flos": 17201355454080.0, + "grad_norm": 2.363526864961352, + "language_loss": 0.84732056, + "learning_rate": 1.520205163348887e-06, + "loss": 0.86668253, + "num_input_tokens_seen": 105801055, + "step": 4905, + "time_per_iteration": 2.440687656402588 + }, + { + "auxiliary_loss_clip": 0.01046572, + "auxiliary_loss_mlp": 0.01001843, + "balance_loss_clip": 1.01296043, + "balance_loss_mlp": 1.0006392, + "epoch": 0.5899116214753802, + "flos": 48794164202880.0, + "grad_norm": 0.8869916101366119, + "language_loss": 0.57019544, + "learning_rate": 1.519448975453674e-06, + "loss": 0.59067953, + "num_input_tokens_seen": 105856155, + "step": 4906, + "time_per_iteration": 3.001115083694458 + }, + { + "auxiliary_loss_clip": 0.01162233, + "auxiliary_loss_mlp": 0.00762653, + "balance_loss_clip": 1.05066347, + "balance_loss_mlp": 1.00114775, + "epoch": 0.5900318643660194, + "flos": 21103659987840.0, + "grad_norm": 25.748228819051057, + "language_loss": 0.76292086, + "learning_rate": 1.5186928604576696e-06, + "loss": 0.78216976, + "num_input_tokens_seen": 105873350, + "step": 4907, + "time_per_iteration": 2.4864580631256104 + }, + { + "auxiliary_loss_clip": 0.01147846, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.04581451, + "balance_loss_mlp": 1.02330327, + "epoch": 0.5901521072566585, + "flos": 21178390233600.0, + "grad_norm": 2.562035422692409, + "language_loss": 0.77204418, + "learning_rate": 1.5179368184755752e-06, + "loss": 0.79383004, + "num_input_tokens_seen": 105891435, + "step": 4908, + "time_per_iteration": 2.4988465309143066 + }, + { + "auxiliary_loss_clip": 0.01146468, + "auxiliary_loss_mlp": 0.01021541, + "balance_loss_clip": 1.04826903, + "balance_loss_mlp": 1.01396561, + "epoch": 0.5902723501472975, + "flos": 20225967160320.0, + "grad_norm": 1.5880438381491628, + "language_loss": 0.825297, + "learning_rate": 1.5171808496220821e-06, + "loss": 0.84697711, + "num_input_tokens_seen": 105910190, + "step": 4909, + "time_per_iteration": 2.501002073287964 + }, + { + "auxiliary_loss_clip": 0.01152499, + "auxiliary_loss_mlp": 0.0102458, + "balance_loss_clip": 1.04778504, + "balance_loss_mlp": 1.01718903, + "epoch": 0.5903925930379367, + "flos": 22964407211520.0, + "grad_norm": 2.032475778889329, + "language_loss": 0.8156116, + "learning_rate": 1.5164249540118708e-06, + "loss": 0.83738244, + "num_input_tokens_seen": 105929315, + "step": 4910, + "time_per_iteration": 2.507903814315796 + }, + { + "auxiliary_loss_clip": 0.0110842, + "auxiliary_loss_mlp": 0.01021946, + "balance_loss_clip": 1.04219878, + "balance_loss_mlp": 1.01428354, + "epoch": 0.5905128359285757, + "flos": 23367720096000.0, + "grad_norm": 1.8135663817368428, + "language_loss": 0.83401728, + "learning_rate": 1.5156691317596093e-06, + "loss": 0.85532093, + "num_input_tokens_seen": 105950740, + "step": 4911, + "time_per_iteration": 2.6067869663238525 + }, + { + "auxiliary_loss_clip": 0.0116404, + "auxiliary_loss_mlp": 0.00762608, + "balance_loss_clip": 1.04874849, + "balance_loss_mlp": 1.00118208, + "epoch": 0.5906330788192148, + "flos": 28032335994240.0, + "grad_norm": 2.3038737797423647, + "language_loss": 0.664819, + "learning_rate": 1.5149133829799556e-06, + "loss": 0.68408549, + "num_input_tokens_seen": 105968735, + "step": 4912, + "time_per_iteration": 2.525836229324341 + }, + { + "auxiliary_loss_clip": 0.01154637, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.04744434, + "balance_loss_mlp": 1.02618027, + "epoch": 0.590753321709854, + "flos": 18477943793280.0, + "grad_norm": 1.9596020810636452, + "language_loss": 0.80123401, + "learning_rate": 1.5141577077875556e-06, + "loss": 0.82312357, + "num_input_tokens_seen": 105986060, + "step": 4913, + "time_per_iteration": 2.461843967437744 + }, + { + "auxiliary_loss_clip": 0.01164455, + "auxiliary_loss_mlp": 0.01027199, + "balance_loss_clip": 1.05008197, + "balance_loss_mlp": 1.01971245, + "epoch": 0.590873564600493, + "flos": 16873706568960.0, + "grad_norm": 2.5603999386463188, + "language_loss": 0.72577822, + "learning_rate": 1.5134021062970451e-06, + "loss": 0.74769473, + "num_input_tokens_seen": 106004440, + "step": 4914, + "time_per_iteration": 2.4424164295196533 + }, + { + "auxiliary_loss_clip": 0.0112301, + "auxiliary_loss_mlp": 0.01027168, + "balance_loss_clip": 1.04634142, + "balance_loss_mlp": 1.01957989, + "epoch": 0.5909938074911321, + "flos": 13516166678400.0, + "grad_norm": 1.934343086404142, + "language_loss": 0.80605847, + "learning_rate": 1.5126465786230483e-06, + "loss": 0.82756019, + "num_input_tokens_seen": 106021215, + "step": 4915, + "time_per_iteration": 2.4838438034057617 + }, + { + "auxiliary_loss_clip": 0.01175346, + "auxiliary_loss_mlp": 0.01025941, + "balance_loss_clip": 1.05018687, + "balance_loss_mlp": 1.01828516, + "epoch": 0.5911140503817712, + "flos": 26024067613440.0, + "grad_norm": 6.86870540816177, + "language_loss": 0.82215637, + "learning_rate": 1.5118911248801787e-06, + "loss": 0.84416926, + "num_input_tokens_seen": 106039225, + "step": 4916, + "time_per_iteration": 2.4464306831359863 + }, + { + "auxiliary_loss_clip": 0.01158666, + "auxiliary_loss_mlp": 0.01025103, + "balance_loss_clip": 1.04844928, + "balance_loss_mlp": 1.01824892, + "epoch": 0.5912342932724103, + "flos": 23258731253760.0, + "grad_norm": 2.192428026554616, + "language_loss": 0.797737, + "learning_rate": 1.5111357451830364e-06, + "loss": 0.81957471, + "num_input_tokens_seen": 106057920, + "step": 4917, + "time_per_iteration": 2.462960958480835 + }, + { + "auxiliary_loss_clip": 0.01161006, + "auxiliary_loss_mlp": 0.01026208, + "balance_loss_clip": 1.04766631, + "balance_loss_mlp": 1.0187397, + "epoch": 0.5913545361630493, + "flos": 19573039687680.0, + "grad_norm": 2.226194968667083, + "language_loss": 0.71069276, + "learning_rate": 1.5103804396462131e-06, + "loss": 0.73256493, + "num_input_tokens_seen": 106077855, + "step": 4918, + "time_per_iteration": 2.486553430557251 + }, + { + "auxiliary_loss_clip": 0.01164545, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.04740095, + "balance_loss_mlp": 1.02265692, + "epoch": 0.5914747790536885, + "flos": 26213532877440.0, + "grad_norm": 2.948223789102078, + "language_loss": 0.79932094, + "learning_rate": 1.5096252083842877e-06, + "loss": 0.82127488, + "num_input_tokens_seen": 106097065, + "step": 4919, + "time_per_iteration": 2.5005695819854736 + }, + { + "auxiliary_loss_clip": 0.0115745, + "auxiliary_loss_mlp": 0.0102473, + "balance_loss_clip": 1.0453949, + "balance_loss_mlp": 1.01720798, + "epoch": 0.5915950219443276, + "flos": 27417545786880.0, + "grad_norm": 1.7185632211285429, + "language_loss": 0.85268813, + "learning_rate": 1.5088700515118285e-06, + "loss": 0.87450981, + "num_input_tokens_seen": 106116385, + "step": 4920, + "time_per_iteration": 2.499330759048462 + }, + { + "auxiliary_loss_clip": 0.01128334, + "auxiliary_loss_mlp": 0.01024229, + "balance_loss_clip": 1.04628408, + "balance_loss_mlp": 1.01636136, + "epoch": 0.5917152648349666, + "flos": 21907879545600.0, + "grad_norm": 1.9301287524992923, + "language_loss": 0.66514665, + "learning_rate": 1.508114969143392e-06, + "loss": 0.68667227, + "num_input_tokens_seen": 106136370, + "step": 4921, + "time_per_iteration": 2.5251355171203613 + }, + { + "auxiliary_loss_clip": 0.01149048, + "auxiliary_loss_mlp": 0.01025758, + "balance_loss_clip": 1.04601669, + "balance_loss_mlp": 1.01855755, + "epoch": 0.5918355077256057, + "flos": 28109185142400.0, + "grad_norm": 1.506187451572099, + "language_loss": 0.772416, + "learning_rate": 1.5073599613935238e-06, + "loss": 0.79416406, + "num_input_tokens_seen": 106158490, + "step": 4922, + "time_per_iteration": 2.556262254714966 + }, + { + "auxiliary_loss_clip": 0.01148638, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.04738569, + "balance_loss_mlp": 1.02101111, + "epoch": 0.5919557506162448, + "flos": 28183807647360.0, + "grad_norm": 2.0604176475345812, + "language_loss": 0.57735527, + "learning_rate": 1.5066050283767574e-06, + "loss": 0.59913051, + "num_input_tokens_seen": 106179170, + "step": 4923, + "time_per_iteration": 3.3035616874694824 + }, + { + "auxiliary_loss_clip": 0.01143065, + "auxiliary_loss_mlp": 0.01024979, + "balance_loss_clip": 1.0471437, + "balance_loss_mlp": 1.01764441, + "epoch": 0.5920759935068839, + "flos": 12094355652480.0, + "grad_norm": 1.9712568366316916, + "language_loss": 0.82798088, + "learning_rate": 1.505850170207616e-06, + "loss": 0.84966135, + "num_input_tokens_seen": 106196035, + "step": 4924, + "time_per_iteration": 3.9772794246673584 + }, + { + "auxiliary_loss_clip": 0.01146645, + "auxiliary_loss_mlp": 0.010251, + "balance_loss_clip": 1.04618645, + "balance_loss_mlp": 1.01800752, + "epoch": 0.592196236397523, + "flos": 29424772673280.0, + "grad_norm": 1.9951052025985379, + "language_loss": 0.78137356, + "learning_rate": 1.505095387000611e-06, + "loss": 0.80309105, + "num_input_tokens_seen": 106218335, + "step": 4925, + "time_per_iteration": 2.5560543537139893 + }, + { + "auxiliary_loss_clip": 0.01138492, + "auxiliary_loss_mlp": 0.0102487, + "balance_loss_clip": 1.04678476, + "balance_loss_mlp": 1.01758361, + "epoch": 0.5923164792881621, + "flos": 24384709866240.0, + "grad_norm": 2.0629428363479536, + "language_loss": 0.74634778, + "learning_rate": 1.504340678870242e-06, + "loss": 0.76798141, + "num_input_tokens_seen": 106236550, + "step": 4926, + "time_per_iteration": 2.5281803607940674 + }, + { + "auxiliary_loss_clip": 0.01161282, + "auxiliary_loss_mlp": 0.01029926, + "balance_loss_clip": 1.04979157, + "balance_loss_mlp": 1.02294576, + "epoch": 0.5924367221788012, + "flos": 24024238928640.0, + "grad_norm": 2.1862472057472893, + "language_loss": 0.89781976, + "learning_rate": 1.5035860459309989e-06, + "loss": 0.91973186, + "num_input_tokens_seen": 106254265, + "step": 4927, + "time_per_iteration": 3.2275474071502686 + }, + { + "auxiliary_loss_clip": 0.0114241, + "auxiliary_loss_mlp": 0.01026557, + "balance_loss_clip": 1.04622436, + "balance_loss_mlp": 1.01856983, + "epoch": 0.5925569650694402, + "flos": 26870590414080.0, + "grad_norm": 2.0471448457840546, + "language_loss": 0.63632739, + "learning_rate": 1.5028314882973568e-06, + "loss": 0.6580171, + "num_input_tokens_seen": 106274670, + "step": 4928, + "time_per_iteration": 2.5352718830108643 + }, + { + "auxiliary_loss_clip": 0.01146284, + "auxiliary_loss_mlp": 0.01028105, + "balance_loss_clip": 1.04806602, + "balance_loss_mlp": 1.02045155, + "epoch": 0.5926772079600794, + "flos": 22302788647680.0, + "grad_norm": 1.7964779991447113, + "language_loss": 0.84856868, + "learning_rate": 1.502077006083783e-06, + "loss": 0.87031257, + "num_input_tokens_seen": 106293330, + "step": 4929, + "time_per_iteration": 2.490159273147583 + }, + { + "auxiliary_loss_clip": 0.01166257, + "auxiliary_loss_mlp": 0.00761481, + "balance_loss_clip": 1.0502069, + "balance_loss_mlp": 1.00104713, + "epoch": 0.5927974508507184, + "flos": 19865244827520.0, + "grad_norm": 1.862150629195704, + "language_loss": 0.76357383, + "learning_rate": 1.5013225994047315e-06, + "loss": 0.78285122, + "num_input_tokens_seen": 106310960, + "step": 4930, + "time_per_iteration": 2.488074779510498 + }, + { + "auxiliary_loss_clip": 0.01162978, + "auxiliary_loss_mlp": 0.00761645, + "balance_loss_clip": 1.050138, + "balance_loss_mlp": 1.0011071, + "epoch": 0.5929176937413575, + "flos": 15776743167360.0, + "grad_norm": 1.5804644459127666, + "language_loss": 0.80653191, + "learning_rate": 1.5005682683746452e-06, + "loss": 0.82577813, + "num_input_tokens_seen": 106329475, + "step": 4931, + "time_per_iteration": 2.465548038482666 + }, + { + "auxiliary_loss_clip": 0.011644, + "auxiliary_loss_mlp": 0.01027057, + "balance_loss_clip": 1.05298448, + "balance_loss_mlp": 1.01927245, + "epoch": 0.5930379366319967, + "flos": 17601472028160.0, + "grad_norm": 2.1028975009790223, + "language_loss": 0.7263664, + "learning_rate": 1.4998140131079553e-06, + "loss": 0.74828088, + "num_input_tokens_seen": 106345565, + "step": 4932, + "time_per_iteration": 2.435617208480835 + }, + { + "auxiliary_loss_clip": 0.01103524, + "auxiliary_loss_mlp": 0.00761679, + "balance_loss_clip": 1.04323411, + "balance_loss_mlp": 1.0010426, + "epoch": 0.5931581795226357, + "flos": 17704283731200.0, + "grad_norm": 1.9553651854358018, + "language_loss": 0.73327786, + "learning_rate": 1.4990598337190821e-06, + "loss": 0.75192988, + "num_input_tokens_seen": 106361920, + "step": 4933, + "time_per_iteration": 2.5541396141052246 + }, + { + "auxiliary_loss_clip": 0.01174789, + "auxiliary_loss_mlp": 0.00762294, + "balance_loss_clip": 1.05041337, + "balance_loss_mlp": 1.00099885, + "epoch": 0.5932784224132748, + "flos": 24280102483200.0, + "grad_norm": 1.7006763511209446, + "language_loss": 0.67621607, + "learning_rate": 1.4983057303224338e-06, + "loss": 0.69558692, + "num_input_tokens_seen": 106381735, + "step": 4934, + "time_per_iteration": 2.4559504985809326 + }, + { + "auxiliary_loss_clip": 0.01117581, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.04416096, + "balance_loss_mlp": 1.02145934, + "epoch": 0.5933986653039139, + "flos": 22926700909440.0, + "grad_norm": 1.6612405008481927, + "language_loss": 0.87430882, + "learning_rate": 1.4975517030324072e-06, + "loss": 0.89577472, + "num_input_tokens_seen": 106399745, + "step": 4935, + "time_per_iteration": 2.568876266479492 + }, + { + "auxiliary_loss_clip": 0.01071075, + "auxiliary_loss_mlp": 0.00753132, + "balance_loss_clip": 1.01094317, + "balance_loss_mlp": 1.00073802, + "epoch": 0.593518908194553, + "flos": 71121730256640.0, + "grad_norm": 0.7787301871543086, + "language_loss": 0.61797798, + "learning_rate": 1.4967977519633882e-06, + "loss": 0.6362201, + "num_input_tokens_seen": 106457205, + "step": 4936, + "time_per_iteration": 3.1221280097961426 + }, + { + "auxiliary_loss_clip": 0.01131161, + "auxiliary_loss_mlp": 0.01021275, + "balance_loss_clip": 1.04548597, + "balance_loss_mlp": 1.01374722, + "epoch": 0.593639151085192, + "flos": 20448649526400.0, + "grad_norm": 1.9117555366891659, + "language_loss": 0.77989686, + "learning_rate": 1.4960438772297494e-06, + "loss": 0.80142123, + "num_input_tokens_seen": 106474250, + "step": 4937, + "time_per_iteration": 2.512387752532959 + }, + { + "auxiliary_loss_clip": 0.01147511, + "auxiliary_loss_mlp": 0.01025536, + "balance_loss_clip": 1.0455842, + "balance_loss_mlp": 1.01805305, + "epoch": 0.5937593939758312, + "flos": 30883428074880.0, + "grad_norm": 2.325277358592349, + "language_loss": 0.73483431, + "learning_rate": 1.495290078945855e-06, + "loss": 0.7565648, + "num_input_tokens_seen": 106494015, + "step": 4938, + "time_per_iteration": 2.559628963470459 + }, + { + "auxiliary_loss_clip": 0.01174225, + "auxiliary_loss_mlp": 0.01026399, + "balance_loss_clip": 1.0503217, + "balance_loss_mlp": 1.01880872, + "epoch": 0.5938796368664703, + "flos": 36898069668480.0, + "grad_norm": 2.740942458497064, + "language_loss": 0.74466193, + "learning_rate": 1.4945363572260529e-06, + "loss": 0.76666814, + "num_input_tokens_seen": 106515010, + "step": 4939, + "time_per_iteration": 2.541234016418457 + }, + { + "auxiliary_loss_clip": 0.01161474, + "auxiliary_loss_mlp": 0.01025105, + "balance_loss_clip": 1.04947925, + "balance_loss_mlp": 1.01812553, + "epoch": 0.5939998797571093, + "flos": 23842926051840.0, + "grad_norm": 2.2989526476063147, + "language_loss": 0.68023026, + "learning_rate": 1.4937827121846845e-06, + "loss": 0.70209599, + "num_input_tokens_seen": 106535265, + "step": 4940, + "time_per_iteration": 2.482654094696045 + }, + { + "auxiliary_loss_clip": 0.01129353, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.04876447, + "balance_loss_mlp": 1.02471161, + "epoch": 0.5941201226477485, + "flos": 25191407462400.0, + "grad_norm": 1.6428858199600471, + "language_loss": 0.73461449, + "learning_rate": 1.4930291439360755e-06, + "loss": 0.75622791, + "num_input_tokens_seen": 106557830, + "step": 4941, + "time_per_iteration": 2.5687427520751953 + }, + { + "auxiliary_loss_clip": 0.0116386, + "auxiliary_loss_mlp": 0.01026479, + "balance_loss_clip": 1.05058849, + "balance_loss_mlp": 1.01824725, + "epoch": 0.5942403655383875, + "flos": 22418996123520.0, + "grad_norm": 2.425688943864977, + "language_loss": 0.79287946, + "learning_rate": 1.4922756525945427e-06, + "loss": 0.8147828, + "num_input_tokens_seen": 106577140, + "step": 4942, + "time_per_iteration": 2.4987354278564453 + }, + { + "auxiliary_loss_clip": 0.01062214, + "auxiliary_loss_mlp": 0.01002059, + "balance_loss_clip": 1.01114988, + "balance_loss_mlp": 1.00090253, + "epoch": 0.5943606084290266, + "flos": 67629310796160.0, + "grad_norm": 0.7739460750750374, + "language_loss": 0.59575433, + "learning_rate": 1.4915222382743894e-06, + "loss": 0.61639708, + "num_input_tokens_seen": 106635975, + "step": 4943, + "time_per_iteration": 3.081312656402588 + }, + { + "auxiliary_loss_clip": 0.01164572, + "auxiliary_loss_mlp": 0.01023769, + "balance_loss_clip": 1.05174351, + "balance_loss_mlp": 1.01590753, + "epoch": 0.5944808513196658, + "flos": 18223157646720.0, + "grad_norm": 2.183593540014685, + "language_loss": 0.72010398, + "learning_rate": 1.4907689010899085e-06, + "loss": 0.74198741, + "num_input_tokens_seen": 106653555, + "step": 4944, + "time_per_iteration": 2.4449527263641357 + }, + { + "auxiliary_loss_clip": 0.01146992, + "auxiliary_loss_mlp": 0.01021275, + "balance_loss_clip": 1.04714382, + "balance_loss_mlp": 1.0138905, + "epoch": 0.5946010942103048, + "flos": 24790824011520.0, + "grad_norm": 2.0369234447771505, + "language_loss": 0.62521201, + "learning_rate": 1.4900156411553804e-06, + "loss": 0.64689469, + "num_input_tokens_seen": 106673385, + "step": 4945, + "time_per_iteration": 2.5233378410339355 + }, + { + "auxiliary_loss_clip": 0.01151326, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.04960322, + "balance_loss_mlp": 1.01997995, + "epoch": 0.5947213371009439, + "flos": 15231619388160.0, + "grad_norm": 1.9575357498019814, + "language_loss": 0.84945118, + "learning_rate": 1.4892624585850739e-06, + "loss": 0.87124014, + "num_input_tokens_seen": 106691740, + "step": 4946, + "time_per_iteration": 2.4751195907592773 + }, + { + "auxiliary_loss_clip": 0.01176943, + "auxiliary_loss_mlp": 0.01028889, + "balance_loss_clip": 1.05128109, + "balance_loss_mlp": 1.02125406, + "epoch": 0.594841579991583, + "flos": 25848069949440.0, + "grad_norm": 1.9328802250950317, + "language_loss": 0.79734862, + "learning_rate": 1.4885093534932465e-06, + "loss": 0.81940699, + "num_input_tokens_seen": 106709705, + "step": 4947, + "time_per_iteration": 2.4731976985931396 + }, + { + "auxiliary_loss_clip": 0.01147075, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.04950464, + "balance_loss_mlp": 1.01981246, + "epoch": 0.5949618228822221, + "flos": 23981109672960.0, + "grad_norm": 2.0442476050783216, + "language_loss": 0.71255273, + "learning_rate": 1.4877563259941433e-06, + "loss": 0.73430002, + "num_input_tokens_seen": 106727560, + "step": 4948, + "time_per_iteration": 2.54600191116333 + }, + { + "auxiliary_loss_clip": 0.01168815, + "auxiliary_loss_mlp": 0.01026263, + "balance_loss_clip": 1.05139148, + "balance_loss_mlp": 1.01825774, + "epoch": 0.5950820657728612, + "flos": 40547491476480.0, + "grad_norm": 5.64963191161863, + "language_loss": 0.67513633, + "learning_rate": 1.4870033762019988e-06, + "loss": 0.69708711, + "num_input_tokens_seen": 106747725, + "step": 4949, + "time_per_iteration": 3.3868582248687744 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01026565, + "balance_loss_clip": 1.04817677, + "balance_loss_mlp": 1.01863158, + "epoch": 0.5952023086635003, + "flos": 23184467884800.0, + "grad_norm": 1.6695534947692658, + "language_loss": 0.73305058, + "learning_rate": 1.4862505042310334e-06, + "loss": 0.75478798, + "num_input_tokens_seen": 106767010, + "step": 4950, + "time_per_iteration": 3.2332756519317627 + }, + { + "auxiliary_loss_clip": 0.01139357, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.04703975, + "balance_loss_mlp": 1.02032995, + "epoch": 0.5953225515541394, + "flos": 33653289548160.0, + "grad_norm": 1.666849036661882, + "language_loss": 0.69541419, + "learning_rate": 1.4854977101954587e-06, + "loss": 0.71708369, + "num_input_tokens_seen": 106789230, + "step": 4951, + "time_per_iteration": 3.3657405376434326 + }, + { + "auxiliary_loss_clip": 0.01161355, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.04552281, + "balance_loss_mlp": 1.0193634, + "epoch": 0.5954427944447784, + "flos": 24459619680000.0, + "grad_norm": 1.8434514313628778, + "language_loss": 0.86075991, + "learning_rate": 1.4847449942094716e-06, + "loss": 0.88264388, + "num_input_tokens_seen": 106808110, + "step": 4952, + "time_per_iteration": 2.487928867340088 + }, + { + "auxiliary_loss_clip": 0.01142224, + "auxiliary_loss_mlp": 0.0102372, + "balance_loss_clip": 1.0469265, + "balance_loss_mlp": 1.01639485, + "epoch": 0.5955630373354175, + "flos": 18551848026240.0, + "grad_norm": 1.9716933350245005, + "language_loss": 0.86345029, + "learning_rate": 1.4839923563872598e-06, + "loss": 0.88510972, + "num_input_tokens_seen": 106826650, + "step": 4953, + "time_per_iteration": 2.4762191772460938 + }, + { + "auxiliary_loss_clip": 0.01132394, + "auxiliary_loss_mlp": 0.01023262, + "balance_loss_clip": 1.04690969, + "balance_loss_mlp": 1.01544774, + "epoch": 0.5956832802260567, + "flos": 19791699730560.0, + "grad_norm": 2.0753848433277935, + "language_loss": 0.75977749, + "learning_rate": 1.483239796842997e-06, + "loss": 0.78133404, + "num_input_tokens_seen": 106844680, + "step": 4954, + "time_per_iteration": 3.3018670082092285 + }, + { + "auxiliary_loss_clip": 0.01133944, + "auxiliary_loss_mlp": 0.01025286, + "balance_loss_clip": 1.04708529, + "balance_loss_mlp": 1.01815128, + "epoch": 0.5958035231166957, + "flos": 19750868945280.0, + "grad_norm": 1.7361611326090118, + "language_loss": 0.83988243, + "learning_rate": 1.4824873156908462e-06, + "loss": 0.86147475, + "num_input_tokens_seen": 106862605, + "step": 4955, + "time_per_iteration": 2.5259320735931396 + }, + { + "auxiliary_loss_clip": 0.01161596, + "auxiliary_loss_mlp": 0.00762787, + "balance_loss_clip": 1.05012465, + "balance_loss_mlp": 1.00104713, + "epoch": 0.5959237660073348, + "flos": 21652806090240.0, + "grad_norm": 1.6282489080897204, + "language_loss": 0.75437617, + "learning_rate": 1.4817349130449584e-06, + "loss": 0.77362001, + "num_input_tokens_seen": 106882325, + "step": 4956, + "time_per_iteration": 2.519418239593506 + }, + { + "auxiliary_loss_clip": 0.01158628, + "auxiliary_loss_mlp": 0.01025746, + "balance_loss_clip": 1.04879034, + "balance_loss_mlp": 1.01831961, + "epoch": 0.5960440088979739, + "flos": 21171207513600.0, + "grad_norm": 1.7613567744129646, + "language_loss": 0.83084756, + "learning_rate": 1.4809825890194717e-06, + "loss": 0.85269129, + "num_input_tokens_seen": 106900995, + "step": 4957, + "time_per_iteration": 2.4499990940093994 + }, + { + "auxiliary_loss_clip": 0.01140938, + "auxiliary_loss_mlp": 0.01023793, + "balance_loss_clip": 1.04414487, + "balance_loss_mlp": 1.01628327, + "epoch": 0.596164251788613, + "flos": 14757526753920.0, + "grad_norm": 2.5951414757980604, + "language_loss": 0.77292156, + "learning_rate": 1.4802303437285139e-06, + "loss": 0.7945689, + "num_input_tokens_seen": 106918265, + "step": 4958, + "time_per_iteration": 2.4546656608581543 + }, + { + "auxiliary_loss_clip": 0.01142419, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.04395771, + "balance_loss_mlp": 1.01910472, + "epoch": 0.596284494679252, + "flos": 20485924865280.0, + "grad_norm": 2.115131940947606, + "language_loss": 0.80834496, + "learning_rate": 1.4794781772861994e-06, + "loss": 0.83003664, + "num_input_tokens_seen": 106934760, + "step": 4959, + "time_per_iteration": 2.487410068511963 + }, + { + "auxiliary_loss_clip": 0.01144943, + "auxiliary_loss_mlp": 0.00762364, + "balance_loss_clip": 1.0467937, + "balance_loss_mlp": 1.00116301, + "epoch": 0.5964047375698912, + "flos": 31212262108800.0, + "grad_norm": 1.9937672569271734, + "language_loss": 0.67210317, + "learning_rate": 1.4787260898066324e-06, + "loss": 0.6911763, + "num_input_tokens_seen": 106954760, + "step": 4960, + "time_per_iteration": 2.5721468925476074 + }, + { + "auxiliary_loss_clip": 0.01172414, + "auxiliary_loss_mlp": 0.01024246, + "balance_loss_clip": 1.05028915, + "balance_loss_mlp": 1.0166254, + "epoch": 0.5965249804605303, + "flos": 27483620855040.0, + "grad_norm": 2.0453898282681924, + "language_loss": 0.85683548, + "learning_rate": 1.4779740814039023e-06, + "loss": 0.87880206, + "num_input_tokens_seen": 106974845, + "step": 4961, + "time_per_iteration": 2.5018112659454346 + }, + { + "auxiliary_loss_clip": 0.01173894, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.04916, + "balance_loss_mlp": 1.02152658, + "epoch": 0.5966452233511693, + "flos": 30773936442240.0, + "grad_norm": 1.8591619143984854, + "language_loss": 0.68451422, + "learning_rate": 1.4772221521920894e-06, + "loss": 0.70655078, + "num_input_tokens_seen": 106994870, + "step": 4962, + "time_per_iteration": 2.480309247970581 + }, + { + "auxiliary_loss_clip": 0.01146434, + "auxiliary_loss_mlp": 0.01021346, + "balance_loss_clip": 1.04913592, + "balance_loss_mlp": 1.01390147, + "epoch": 0.5967654662418085, + "flos": 25481170477440.0, + "grad_norm": 2.0146246872770392, + "language_loss": 0.74288446, + "learning_rate": 1.4764703022852598e-06, + "loss": 0.76456225, + "num_input_tokens_seen": 107015390, + "step": 4963, + "time_per_iteration": 2.526489734649658 + }, + { + "auxiliary_loss_clip": 0.01093425, + "auxiliary_loss_mlp": 0.01022516, + "balance_loss_clip": 1.0416925, + "balance_loss_mlp": 1.0154047, + "epoch": 0.5968857091324475, + "flos": 19099126621440.0, + "grad_norm": 5.863259605655188, + "language_loss": 0.76878047, + "learning_rate": 1.4757185317974696e-06, + "loss": 0.78993988, + "num_input_tokens_seen": 107033775, + "step": 4964, + "time_per_iteration": 2.565617799758911 + }, + { + "auxiliary_loss_clip": 0.01162948, + "auxiliary_loss_mlp": 0.01027745, + "balance_loss_clip": 1.04995012, + "balance_loss_mlp": 1.01997232, + "epoch": 0.5970059520230866, + "flos": 23692711374720.0, + "grad_norm": 2.510927986800928, + "language_loss": 0.71059179, + "learning_rate": 1.474966840842761e-06, + "loss": 0.73249871, + "num_input_tokens_seen": 107053355, + "step": 4965, + "time_per_iteration": 2.478074550628662 + }, + { + "auxiliary_loss_clip": 0.01162576, + "auxiliary_loss_mlp": 0.01025749, + "balance_loss_clip": 1.04786301, + "balance_loss_mlp": 1.0183996, + "epoch": 0.5971261949137258, + "flos": 23185545292800.0, + "grad_norm": 1.9214147507698836, + "language_loss": 0.8704282, + "learning_rate": 1.4742152295351655e-06, + "loss": 0.89231145, + "num_input_tokens_seen": 107072510, + "step": 4966, + "time_per_iteration": 2.4799327850341797 + }, + { + "auxiliary_loss_clip": 0.01160386, + "auxiliary_loss_mlp": 0.0076297, + "balance_loss_clip": 1.0480361, + "balance_loss_mlp": 1.00119305, + "epoch": 0.5972464378043648, + "flos": 20557710195840.0, + "grad_norm": 2.7206892024968163, + "language_loss": 0.64126009, + "learning_rate": 1.4734636979887016e-06, + "loss": 0.66049361, + "num_input_tokens_seen": 107089970, + "step": 4967, + "time_per_iteration": 2.4901645183563232 + }, + { + "auxiliary_loss_clip": 0.01136317, + "auxiliary_loss_mlp": 0.01028482, + "balance_loss_clip": 1.04573679, + "balance_loss_mlp": 1.02091265, + "epoch": 0.5973666806950039, + "flos": 29387030457600.0, + "grad_norm": 3.280217018432474, + "language_loss": 0.89806318, + "learning_rate": 1.4727122463173755e-06, + "loss": 0.91971117, + "num_input_tokens_seen": 107108500, + "step": 4968, + "time_per_iteration": 2.583343267440796 + }, + { + "auxiliary_loss_clip": 0.01147343, + "auxiliary_loss_mlp": 0.01024293, + "balance_loss_clip": 1.04883039, + "balance_loss_mlp": 1.01666975, + "epoch": 0.597486923585643, + "flos": 22273522041600.0, + "grad_norm": 1.6724476138625513, + "language_loss": 0.64244139, + "learning_rate": 1.471960874635183e-06, + "loss": 0.66415769, + "num_input_tokens_seen": 107128060, + "step": 4969, + "time_per_iteration": 2.5045645236968994 + }, + { + "auxiliary_loss_clip": 0.01142561, + "auxiliary_loss_mlp": 0.01024474, + "balance_loss_clip": 1.04559398, + "balance_loss_mlp": 1.01670802, + "epoch": 0.5976071664762821, + "flos": 13772461196160.0, + "grad_norm": 2.1894017122357914, + "language_loss": 0.70518279, + "learning_rate": 1.4712095830561055e-06, + "loss": 0.72685313, + "num_input_tokens_seen": 107146550, + "step": 4970, + "time_per_iteration": 2.476475715637207 + }, + { + "auxiliary_loss_clip": 0.01143454, + "auxiliary_loss_mlp": 0.01022191, + "balance_loss_clip": 1.04324961, + "balance_loss_mlp": 1.01489854, + "epoch": 0.5977274093669211, + "flos": 19098623831040.0, + "grad_norm": 1.9218526631860144, + "language_loss": 0.80564594, + "learning_rate": 1.4704583716941147e-06, + "loss": 0.82730246, + "num_input_tokens_seen": 107165415, + "step": 4971, + "time_per_iteration": 2.485135793685913 + }, + { + "auxiliary_loss_clip": 0.01155085, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.05006063, + "balance_loss_mlp": 1.0246402, + "epoch": 0.5978476522575603, + "flos": 20376002269440.0, + "grad_norm": 2.4235125664150776, + "language_loss": 0.72036242, + "learning_rate": 1.4697072406631672e-06, + "loss": 0.74223566, + "num_input_tokens_seen": 107185320, + "step": 4972, + "time_per_iteration": 2.4720139503479004 + }, + { + "auxiliary_loss_clip": 0.01122896, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.04803228, + "balance_loss_mlp": 1.01988125, + "epoch": 0.5979678951481994, + "flos": 29023147728000.0, + "grad_norm": 1.6030312388121468, + "language_loss": 0.72640443, + "learning_rate": 1.4689561900772097e-06, + "loss": 0.74791789, + "num_input_tokens_seen": 107205380, + "step": 4973, + "time_per_iteration": 2.6448493003845215 + }, + { + "auxiliary_loss_clip": 0.01142229, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.04290164, + "balance_loss_mlp": 1.02131116, + "epoch": 0.5980881380388384, + "flos": 17967689141760.0, + "grad_norm": 2.5405984110302815, + "language_loss": 0.72137374, + "learning_rate": 1.4682052200501758e-06, + "loss": 0.74308133, + "num_input_tokens_seen": 107222585, + "step": 4974, + "time_per_iteration": 2.4628167152404785 + }, + { + "auxiliary_loss_clip": 0.0117319, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.04898453, + "balance_loss_mlp": 1.02226949, + "epoch": 0.5982083809294776, + "flos": 22962827013120.0, + "grad_norm": 1.7826900018242096, + "language_loss": 0.79673046, + "learning_rate": 1.4674543306959876e-06, + "loss": 0.81876516, + "num_input_tokens_seen": 107242055, + "step": 4975, + "time_per_iteration": 2.4352152347564697 + }, + { + "auxiliary_loss_clip": 0.01151898, + "auxiliary_loss_mlp": 0.01024505, + "balance_loss_clip": 1.04922223, + "balance_loss_mlp": 1.01664281, + "epoch": 0.5983286238201166, + "flos": 20991941712000.0, + "grad_norm": 3.741498327680076, + "language_loss": 0.84638959, + "learning_rate": 1.4667035221285535e-06, + "loss": 0.86815357, + "num_input_tokens_seen": 107259695, + "step": 4976, + "time_per_iteration": 3.2381274700164795 + }, + { + "auxiliary_loss_clip": 0.01159612, + "auxiliary_loss_mlp": 0.01022865, + "balance_loss_clip": 1.05031466, + "balance_loss_mlp": 1.01565886, + "epoch": 0.5984488667107557, + "flos": 28183448511360.0, + "grad_norm": 1.788703071785846, + "language_loss": 0.74273872, + "learning_rate": 1.4659527944617715e-06, + "loss": 0.76456344, + "num_input_tokens_seen": 107279640, + "step": 4977, + "time_per_iteration": 3.2587525844573975 + }, + { + "auxiliary_loss_clip": 0.01100032, + "auxiliary_loss_mlp": 0.01024353, + "balance_loss_clip": 1.04087782, + "balance_loss_mlp": 1.01683998, + "epoch": 0.5985691096013949, + "flos": 16471794314880.0, + "grad_norm": 2.0038774772813914, + "language_loss": 0.75980324, + "learning_rate": 1.465202147809526e-06, + "loss": 0.78104711, + "num_input_tokens_seen": 107298135, + "step": 4978, + "time_per_iteration": 3.299344539642334 + }, + { + "auxiliary_loss_clip": 0.01174163, + "auxiliary_loss_mlp": 0.01023798, + "balance_loss_clip": 1.05033541, + "balance_loss_mlp": 1.01680899, + "epoch": 0.5986893524920339, + "flos": 26719046933760.0, + "grad_norm": 2.023285953213034, + "language_loss": 0.76124102, + "learning_rate": 1.4644515822856888e-06, + "loss": 0.78322065, + "num_input_tokens_seen": 107316570, + "step": 4979, + "time_per_iteration": 2.496164560317993 + }, + { + "auxiliary_loss_clip": 0.01039267, + "auxiliary_loss_mlp": 0.01003313, + "balance_loss_clip": 1.01016748, + "balance_loss_mlp": 1.0021987, + "epoch": 0.598809595382673, + "flos": 61608061100160.0, + "grad_norm": 0.7652442150416192, + "language_loss": 0.56549788, + "learning_rate": 1.4637010980041215e-06, + "loss": 0.58592367, + "num_input_tokens_seen": 107378680, + "step": 4980, + "time_per_iteration": 3.846482276916504 + }, + { + "auxiliary_loss_clip": 0.01176292, + "auxiliary_loss_mlp": 0.01025757, + "balance_loss_clip": 1.05022025, + "balance_loss_mlp": 1.01812494, + "epoch": 0.5989298382733121, + "flos": 11801719549440.0, + "grad_norm": 3.572642672283889, + "language_loss": 0.90125763, + "learning_rate": 1.4629506950786707e-06, + "loss": 0.92327809, + "num_input_tokens_seen": 107394860, + "step": 4981, + "time_per_iteration": 2.4044830799102783 + }, + { + "auxiliary_loss_clip": 0.01070258, + "auxiliary_loss_mlp": 0.01001566, + "balance_loss_clip": 1.01075864, + "balance_loss_mlp": 1.00052309, + "epoch": 0.5990500811639512, + "flos": 60025800021120.0, + "grad_norm": 0.816815182957055, + "language_loss": 0.56069505, + "learning_rate": 1.4622003736231733e-06, + "loss": 0.58141339, + "num_input_tokens_seen": 107453850, + "step": 4982, + "time_per_iteration": 3.071885347366333 + }, + { + "auxiliary_loss_clip": 0.01158171, + "auxiliary_loss_mlp": 0.01022374, + "balance_loss_clip": 1.04856694, + "balance_loss_mlp": 1.01451266, + "epoch": 0.5991703240545903, + "flos": 18222726683520.0, + "grad_norm": 1.9302914909910267, + "language_loss": 0.80804431, + "learning_rate": 1.461450133751451e-06, + "loss": 0.82984972, + "num_input_tokens_seen": 107471920, + "step": 4983, + "time_per_iteration": 2.434887647628784 + }, + { + "auxiliary_loss_clip": 0.0116394, + "auxiliary_loss_mlp": 0.01025793, + "balance_loss_clip": 1.04941523, + "balance_loss_mlp": 1.01788342, + "epoch": 0.5992905669452293, + "flos": 27709894581120.0, + "grad_norm": 2.1358023090853004, + "language_loss": 0.7615217, + "learning_rate": 1.4606999755773153e-06, + "loss": 0.78341901, + "num_input_tokens_seen": 107493125, + "step": 4984, + "time_per_iteration": 2.549564838409424 + }, + { + "auxiliary_loss_clip": 0.01173426, + "auxiliary_loss_mlp": 0.01024045, + "balance_loss_clip": 1.05031204, + "balance_loss_mlp": 1.01635623, + "epoch": 0.5994108098358685, + "flos": 20449008662400.0, + "grad_norm": 1.8445895362859577, + "language_loss": 0.82332832, + "learning_rate": 1.4599498992145643e-06, + "loss": 0.84530306, + "num_input_tokens_seen": 107513150, + "step": 4985, + "time_per_iteration": 2.5105865001678467 + }, + { + "auxiliary_loss_clip": 0.01153026, + "auxiliary_loss_mlp": 0.00762032, + "balance_loss_clip": 1.04886198, + "balance_loss_mlp": 1.0012213, + "epoch": 0.5995310527265075, + "flos": 22269966595200.0, + "grad_norm": 1.890799441449593, + "language_loss": 0.70426577, + "learning_rate": 1.4591999047769846e-06, + "loss": 0.72341633, + "num_input_tokens_seen": 107532005, + "step": 4986, + "time_per_iteration": 2.5857415199279785 + }, + { + "auxiliary_loss_clip": 0.0109997, + "auxiliary_loss_mlp": 0.0102819, + "balance_loss_clip": 1.03972673, + "balance_loss_mlp": 1.02006567, + "epoch": 0.5996512956171466, + "flos": 18916951818240.0, + "grad_norm": 1.8284500545414886, + "language_loss": 0.7494483, + "learning_rate": 1.4584499923783486e-06, + "loss": 0.7707299, + "num_input_tokens_seen": 107550585, + "step": 4987, + "time_per_iteration": 2.6429381370544434 + }, + { + "auxiliary_loss_clip": 0.011462, + "auxiliary_loss_mlp": 0.01022112, + "balance_loss_clip": 1.04737329, + "balance_loss_mlp": 1.01494741, + "epoch": 0.5997715385077858, + "flos": 15370916330880.0, + "grad_norm": 1.9261115876926653, + "language_loss": 0.75476271, + "learning_rate": 1.457700162132419e-06, + "loss": 0.77644581, + "num_input_tokens_seen": 107567575, + "step": 4988, + "time_per_iteration": 2.469963788986206 + }, + { + "auxiliary_loss_clip": 0.01114404, + "auxiliary_loss_mlp": 0.01022909, + "balance_loss_clip": 1.04473126, + "balance_loss_mlp": 1.01550031, + "epoch": 0.5998917813984248, + "flos": 25264844818560.0, + "grad_norm": 2.203401047173966, + "language_loss": 0.72246361, + "learning_rate": 1.4569504141529433e-06, + "loss": 0.74383676, + "num_input_tokens_seen": 107585410, + "step": 4989, + "time_per_iteration": 2.5713088512420654 + }, + { + "auxiliary_loss_clip": 0.01159729, + "auxiliary_loss_mlp": 0.01028988, + "balance_loss_clip": 1.05032587, + "balance_loss_mlp": 1.02089977, + "epoch": 0.6000120242890639, + "flos": 22054502862720.0, + "grad_norm": 2.252792303779004, + "language_loss": 0.71544033, + "learning_rate": 1.456200748553658e-06, + "loss": 0.73732752, + "num_input_tokens_seen": 107603405, + "step": 4990, + "time_per_iteration": 2.473705768585205 + }, + { + "auxiliary_loss_clip": 0.0117723, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.05121386, + "balance_loss_mlp": 1.0218811, + "epoch": 0.600132267179703, + "flos": 29863421562240.0, + "grad_norm": 1.5066095441742526, + "language_loss": 0.7876066, + "learning_rate": 1.455451165448287e-06, + "loss": 0.80967987, + "num_input_tokens_seen": 107626060, + "step": 4991, + "time_per_iteration": 2.530733346939087 + }, + { + "auxiliary_loss_clip": 0.01144869, + "auxiliary_loss_mlp": 0.01025745, + "balance_loss_clip": 1.04904175, + "balance_loss_mlp": 1.01793134, + "epoch": 0.6002525100703421, + "flos": 25045358762880.0, + "grad_norm": 3.509006623705884, + "language_loss": 0.7351042, + "learning_rate": 1.4547016649505407e-06, + "loss": 0.75681037, + "num_input_tokens_seen": 107644070, + "step": 4992, + "time_per_iteration": 2.523132801055908 + }, + { + "auxiliary_loss_clip": 0.01131298, + "auxiliary_loss_mlp": 0.01022265, + "balance_loss_clip": 1.04398656, + "balance_loss_mlp": 1.01405692, + "epoch": 0.6003727529609811, + "flos": 20849592113280.0, + "grad_norm": 2.072474420278194, + "language_loss": 0.8486408, + "learning_rate": 1.4539522471741193e-06, + "loss": 0.87017643, + "num_input_tokens_seen": 107661495, + "step": 4993, + "time_per_iteration": 2.613370180130005 + }, + { + "auxiliary_loss_clip": 0.01163805, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.04786992, + "balance_loss_mlp": 1.02001834, + "epoch": 0.6004929958516203, + "flos": 15594604277760.0, + "grad_norm": 2.1779602308014865, + "language_loss": 0.70898128, + "learning_rate": 1.4532029122327067e-06, + "loss": 0.73090011, + "num_input_tokens_seen": 107678280, + "step": 4994, + "time_per_iteration": 2.490605592727661 + }, + { + "auxiliary_loss_clip": 0.01126618, + "auxiliary_loss_mlp": 0.01025824, + "balance_loss_clip": 1.04826176, + "balance_loss_mlp": 1.01854289, + "epoch": 0.6006132387422594, + "flos": 21763267390080.0, + "grad_norm": 2.183618784035755, + "language_loss": 0.75421679, + "learning_rate": 1.4524536602399783e-06, + "loss": 0.77574122, + "num_input_tokens_seen": 107697370, + "step": 4995, + "time_per_iteration": 2.5793087482452393 + }, + { + "auxiliary_loss_clip": 0.01144051, + "auxiliary_loss_mlp": 0.01030545, + "balance_loss_clip": 1.04969561, + "balance_loss_mlp": 1.02324069, + "epoch": 0.6007334816328984, + "flos": 22858542852480.0, + "grad_norm": 1.5567602314065063, + "language_loss": 0.77313471, + "learning_rate": 1.4517044913095938e-06, + "loss": 0.79488069, + "num_input_tokens_seen": 107717790, + "step": 4996, + "time_per_iteration": 2.571558952331543 + }, + { + "auxiliary_loss_clip": 0.01162917, + "auxiliary_loss_mlp": 0.01025807, + "balance_loss_clip": 1.05062461, + "balance_loss_mlp": 1.01757014, + "epoch": 0.6008537245235376, + "flos": 28324577047680.0, + "grad_norm": 1.998727208150211, + "language_loss": 0.81576705, + "learning_rate": 1.4509554055552022e-06, + "loss": 0.83765429, + "num_input_tokens_seen": 107738020, + "step": 4997, + "time_per_iteration": 2.5892961025238037 + }, + { + "auxiliary_loss_clip": 0.01143488, + "auxiliary_loss_mlp": 0.01030211, + "balance_loss_clip": 1.0464381, + "balance_loss_mlp": 1.02236688, + "epoch": 0.6009739674141766, + "flos": 20886113266560.0, + "grad_norm": 11.140427563466224, + "language_loss": 0.84119827, + "learning_rate": 1.450206403090439e-06, + "loss": 0.86293519, + "num_input_tokens_seen": 107756215, + "step": 4998, + "time_per_iteration": 2.5259828567504883 + }, + { + "auxiliary_loss_clip": 0.01162058, + "auxiliary_loss_mlp": 0.01021142, + "balance_loss_clip": 1.05184174, + "balance_loss_mlp": 1.01363516, + "epoch": 0.6010942103048157, + "flos": 20481004702080.0, + "grad_norm": 6.32723287910939, + "language_loss": 0.86534727, + "learning_rate": 1.4494574840289274e-06, + "loss": 0.88717926, + "num_input_tokens_seen": 107773330, + "step": 4999, + "time_per_iteration": 2.4999587535858154 + }, + { + "auxiliary_loss_clip": 0.01164884, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.04816949, + "balance_loss_mlp": 1.02219427, + "epoch": 0.6012144531954549, + "flos": 23805973935360.0, + "grad_norm": 1.691639143087727, + "language_loss": 0.73748755, + "learning_rate": 1.4487086484842782e-06, + "loss": 0.75944227, + "num_input_tokens_seen": 107791975, + "step": 5000, + "time_per_iteration": 2.507807731628418 + }, + { + "auxiliary_loss_clip": 0.01171046, + "auxiliary_loss_mlp": 0.01025845, + "balance_loss_clip": 1.04761016, + "balance_loss_mlp": 1.01847148, + "epoch": 0.6013346960860939, + "flos": 18988378012800.0, + "grad_norm": 2.620537234773121, + "language_loss": 0.60162854, + "learning_rate": 1.4479598965700878e-06, + "loss": 0.62359744, + "num_input_tokens_seen": 107809240, + "step": 5001, + "time_per_iteration": 2.43402099609375 + }, + { + "auxiliary_loss_clip": 0.01130389, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.04379272, + "balance_loss_mlp": 1.01870751, + "epoch": 0.601454938976733, + "flos": 24025316336640.0, + "grad_norm": 3.1299986272022435, + "language_loss": 0.68529058, + "learning_rate": 1.4472112283999427e-06, + "loss": 0.70686054, + "num_input_tokens_seen": 107827895, + "step": 5002, + "time_per_iteration": 3.5202202796936035 + }, + { + "auxiliary_loss_clip": 0.01154521, + "auxiliary_loss_mlp": 0.01025049, + "balance_loss_clip": 1.04868162, + "balance_loss_mlp": 1.01815259, + "epoch": 0.6015751818673721, + "flos": 26427129102720.0, + "grad_norm": 2.653238995858655, + "language_loss": 0.6935904, + "learning_rate": 1.4464626440874143e-06, + "loss": 0.71538609, + "num_input_tokens_seen": 107847010, + "step": 5003, + "time_per_iteration": 3.4073755741119385 + }, + { + "auxiliary_loss_clip": 0.01125292, + "auxiliary_loss_mlp": 0.01027552, + "balance_loss_clip": 1.04182386, + "balance_loss_mlp": 1.01957107, + "epoch": 0.6016954247580112, + "flos": 13115260005120.0, + "grad_norm": 2.2718737240756157, + "language_loss": 0.74220377, + "learning_rate": 1.4457141437460636e-06, + "loss": 0.76373219, + "num_input_tokens_seen": 107864235, + "step": 5004, + "time_per_iteration": 3.326862335205078 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.04679608, + "balance_loss_mlp": 1.01990283, + "epoch": 0.6018156676486502, + "flos": 23768447201280.0, + "grad_norm": 1.7418652237683194, + "language_loss": 0.72917807, + "learning_rate": 1.444965727489436e-06, + "loss": 0.75094324, + "num_input_tokens_seen": 107883680, + "step": 5005, + "time_per_iteration": 2.548295259475708 + }, + { + "auxiliary_loss_clip": 0.01128726, + "auxiliary_loss_mlp": 0.01028826, + "balance_loss_clip": 1.04186606, + "balance_loss_mlp": 1.02129221, + "epoch": 0.6019359105392894, + "flos": 26469360518400.0, + "grad_norm": 2.5944826920772783, + "language_loss": 0.63159859, + "learning_rate": 1.444217395431066e-06, + "loss": 0.65317404, + "num_input_tokens_seen": 107906220, + "step": 5006, + "time_per_iteration": 3.467136859893799 + }, + { + "auxiliary_loss_clip": 0.01038638, + "auxiliary_loss_mlp": 0.01005439, + "balance_loss_clip": 1.01461852, + "balance_loss_mlp": 1.00445557, + "epoch": 0.6020561534299285, + "flos": 69190849728000.0, + "grad_norm": 0.7890048347377049, + "language_loss": 0.55820239, + "learning_rate": 1.4434691476844755e-06, + "loss": 0.5786432, + "num_input_tokens_seen": 107967195, + "step": 5007, + "time_per_iteration": 3.100051164627075 + }, + { + "auxiliary_loss_clip": 0.01142559, + "auxiliary_loss_mlp": 0.01024195, + "balance_loss_clip": 1.04890859, + "balance_loss_mlp": 1.01755476, + "epoch": 0.6021763963205675, + "flos": 21835304115840.0, + "grad_norm": 6.962025114287594, + "language_loss": 0.66838801, + "learning_rate": 1.4427209843631729e-06, + "loss": 0.69005555, + "num_input_tokens_seen": 107984245, + "step": 5008, + "time_per_iteration": 2.5308942794799805 + }, + { + "auxiliary_loss_clip": 0.01174561, + "auxiliary_loss_mlp": 0.00762223, + "balance_loss_clip": 1.05154431, + "balance_loss_mlp": 1.00116634, + "epoch": 0.6022966392112067, + "flos": 26578636669440.0, + "grad_norm": 1.9287492911343638, + "language_loss": 0.80484879, + "learning_rate": 1.4419729055806534e-06, + "loss": 0.8242166, + "num_input_tokens_seen": 108003680, + "step": 5009, + "time_per_iteration": 2.5585193634033203 + }, + { + "auxiliary_loss_clip": 0.01143703, + "auxiliary_loss_mlp": 0.00761934, + "balance_loss_clip": 1.05008352, + "balance_loss_mlp": 1.00111735, + "epoch": 0.6024168821018457, + "flos": 20703722981760.0, + "grad_norm": 1.7207903087629945, + "language_loss": 0.82407558, + "learning_rate": 1.441224911450401e-06, + "loss": 0.84313196, + "num_input_tokens_seen": 108019635, + "step": 5010, + "time_per_iteration": 2.522386312484741 + }, + { + "auxiliary_loss_clip": 0.01164158, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.04800916, + "balance_loss_mlp": 1.02532899, + "epoch": 0.6025371249924848, + "flos": 24680973242880.0, + "grad_norm": 1.8145446028765309, + "language_loss": 0.82261187, + "learning_rate": 1.4404770020858851e-06, + "loss": 0.84458548, + "num_input_tokens_seen": 108039120, + "step": 5011, + "time_per_iteration": 2.5136570930480957 + }, + { + "auxiliary_loss_clip": 0.01152731, + "auxiliary_loss_mlp": 0.01026308, + "balance_loss_clip": 1.04598951, + "balance_loss_mlp": 1.01908374, + "epoch": 0.602657367883124, + "flos": 25955801815680.0, + "grad_norm": 1.810247386279225, + "language_loss": 0.8587833, + "learning_rate": 1.439729177600563e-06, + "loss": 0.88057363, + "num_input_tokens_seen": 108059615, + "step": 5012, + "time_per_iteration": 2.5147616863250732 + }, + { + "auxiliary_loss_clip": 0.01159083, + "auxiliary_loss_mlp": 0.01026079, + "balance_loss_clip": 1.04981828, + "balance_loss_mlp": 1.01832461, + "epoch": 0.602777610773763, + "flos": 16690633925760.0, + "grad_norm": 2.0463808736943734, + "language_loss": 0.7305311, + "learning_rate": 1.4389814381078793e-06, + "loss": 0.7523827, + "num_input_tokens_seen": 108078855, + "step": 5013, + "time_per_iteration": 2.4665956497192383 + }, + { + "auxiliary_loss_clip": 0.0106214, + "auxiliary_loss_mlp": 0.01035219, + "balance_loss_clip": 1.0403229, + "balance_loss_mlp": 1.02794743, + "epoch": 0.6028978536644021, + "flos": 13334243270400.0, + "grad_norm": 3.070107051080541, + "language_loss": 0.79708397, + "learning_rate": 1.438233783721265e-06, + "loss": 0.81805754, + "num_input_tokens_seen": 108095020, + "step": 5014, + "time_per_iteration": 2.893082618713379 + }, + { + "auxiliary_loss_clip": 0.01143494, + "auxiliary_loss_mlp": 0.01026002, + "balance_loss_clip": 1.05082941, + "balance_loss_mlp": 1.01879263, + "epoch": 0.6030180965550412, + "flos": 19644825018240.0, + "grad_norm": 2.037070022711203, + "language_loss": 0.78092611, + "learning_rate": 1.43748621455414e-06, + "loss": 0.80262113, + "num_input_tokens_seen": 108111455, + "step": 5015, + "time_per_iteration": 2.6831789016723633 + }, + { + "auxiliary_loss_clip": 0.01142544, + "auxiliary_loss_mlp": 0.01027715, + "balance_loss_clip": 1.04689467, + "balance_loss_mlp": 1.01989508, + "epoch": 0.6031383394456803, + "flos": 14458390289280.0, + "grad_norm": 2.2993124146890915, + "language_loss": 0.8036505, + "learning_rate": 1.4367387307199082e-06, + "loss": 0.82535309, + "num_input_tokens_seen": 108128305, + "step": 5016, + "time_per_iteration": 2.7374465465545654 + }, + { + "auxiliary_loss_clip": 0.0115551, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.04650116, + "balance_loss_mlp": 1.01896715, + "epoch": 0.6032585823363193, + "flos": 13917791623680.0, + "grad_norm": 2.4505107056173436, + "language_loss": 0.82447433, + "learning_rate": 1.4359913323319632e-06, + "loss": 0.84629345, + "num_input_tokens_seen": 108145475, + "step": 5017, + "time_per_iteration": 2.488919973373413 + }, + { + "auxiliary_loss_clip": 0.01092498, + "auxiliary_loss_mlp": 0.01025692, + "balance_loss_clip": 1.04044199, + "balance_loss_mlp": 1.01778817, + "epoch": 0.6033788252269584, + "flos": 24353252530560.0, + "grad_norm": 1.6723443662487287, + "language_loss": 0.77443433, + "learning_rate": 1.4352440195036847e-06, + "loss": 0.79561627, + "num_input_tokens_seen": 108165650, + "step": 5018, + "time_per_iteration": 2.6714375019073486 + }, + { + "auxiliary_loss_clip": 0.01095266, + "auxiliary_loss_mlp": 0.01024685, + "balance_loss_clip": 1.03943253, + "balance_loss_mlp": 1.01727355, + "epoch": 0.6034990681175976, + "flos": 25521247077120.0, + "grad_norm": 1.7334035537618546, + "language_loss": 0.79743534, + "learning_rate": 1.4344967923484395e-06, + "loss": 0.81863487, + "num_input_tokens_seen": 108187620, + "step": 5019, + "time_per_iteration": 2.6976306438446045 + }, + { + "auxiliary_loss_clip": 0.01156325, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.04751205, + "balance_loss_mlp": 1.02333522, + "epoch": 0.6036193110082366, + "flos": 25958387594880.0, + "grad_norm": 2.8857976952072164, + "language_loss": 0.72143948, + "learning_rate": 1.433749650979581e-06, + "loss": 0.74331182, + "num_input_tokens_seen": 108207605, + "step": 5020, + "time_per_iteration": 2.5538270473480225 + }, + { + "auxiliary_loss_clip": 0.01136736, + "auxiliary_loss_mlp": 0.01025275, + "balance_loss_clip": 1.04479122, + "balance_loss_mlp": 1.01790524, + "epoch": 0.6037395538988757, + "flos": 25593427457280.0, + "grad_norm": 1.8532778224171644, + "language_loss": 0.68084311, + "learning_rate": 1.433002595510451e-06, + "loss": 0.70246315, + "num_input_tokens_seen": 108226385, + "step": 5021, + "time_per_iteration": 2.5648562908172607 + }, + { + "auxiliary_loss_clip": 0.01142188, + "auxiliary_loss_mlp": 0.00762656, + "balance_loss_clip": 1.04504311, + "balance_loss_mlp": 1.00103378, + "epoch": 0.6038597967895148, + "flos": 17816253402240.0, + "grad_norm": 2.956200872040174, + "language_loss": 0.72074866, + "learning_rate": 1.4322556260543757e-06, + "loss": 0.73979712, + "num_input_tokens_seen": 108242960, + "step": 5022, + "time_per_iteration": 2.4931817054748535 + }, + { + "auxiliary_loss_clip": 0.01043324, + "auxiliary_loss_mlp": 0.01001295, + "balance_loss_clip": 1.01304364, + "balance_loss_mlp": 1.00025773, + "epoch": 0.6039800396801539, + "flos": 65169213235200.0, + "grad_norm": 0.8941390727634155, + "language_loss": 0.62758183, + "learning_rate": 1.4315087427246703e-06, + "loss": 0.64802802, + "num_input_tokens_seen": 108296785, + "step": 5023, + "time_per_iteration": 3.0351321697235107 + }, + { + "auxiliary_loss_clip": 0.0107005, + "auxiliary_loss_mlp": 0.01001327, + "balance_loss_clip": 1.0102613, + "balance_loss_mlp": 1.00026059, + "epoch": 0.604100282570793, + "flos": 67386409073280.0, + "grad_norm": 0.8879719421176991, + "language_loss": 0.58510876, + "learning_rate": 1.4307619456346372e-06, + "loss": 0.60582256, + "num_input_tokens_seen": 108341090, + "step": 5024, + "time_per_iteration": 2.7534351348876953 + }, + { + "auxiliary_loss_clip": 0.01160935, + "auxiliary_loss_mlp": 0.01025034, + "balance_loss_clip": 1.0460248, + "balance_loss_mlp": 1.01738691, + "epoch": 0.6042205254614321, + "flos": 35297495631360.0, + "grad_norm": 2.5120209288315722, + "language_loss": 0.74110907, + "learning_rate": 1.430015234897564e-06, + "loss": 0.76296878, + "num_input_tokens_seen": 108364370, + "step": 5025, + "time_per_iteration": 2.5913898944854736 + }, + { + "auxiliary_loss_clip": 0.01173087, + "auxiliary_loss_mlp": 0.0076237, + "balance_loss_clip": 1.04837084, + "balance_loss_mlp": 1.00100863, + "epoch": 0.6043407683520712, + "flos": 45658262206080.0, + "grad_norm": 1.6784897632859497, + "language_loss": 0.66190231, + "learning_rate": 1.4292686106267274e-06, + "loss": 0.68125689, + "num_input_tokens_seen": 108387220, + "step": 5026, + "time_per_iteration": 2.653181552886963 + }, + { + "auxiliary_loss_clip": 0.0116274, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.04842937, + "balance_loss_mlp": 1.02347589, + "epoch": 0.6044610112427102, + "flos": 16180020138240.0, + "grad_norm": 1.7681962565339495, + "language_loss": 0.77155316, + "learning_rate": 1.4285220729353876e-06, + "loss": 0.79349184, + "num_input_tokens_seen": 108405760, + "step": 5027, + "time_per_iteration": 2.4455947875976562 + }, + { + "auxiliary_loss_clip": 0.01143271, + "auxiliary_loss_mlp": 0.01027733, + "balance_loss_clip": 1.04420257, + "balance_loss_mlp": 1.0203867, + "epoch": 0.6045812541333494, + "flos": 13804062186240.0, + "grad_norm": 1.8748576561175565, + "language_loss": 0.77931738, + "learning_rate": 1.4277756219367957e-06, + "loss": 0.80102742, + "num_input_tokens_seen": 108422785, + "step": 5028, + "time_per_iteration": 2.494807004928589 + }, + { + "auxiliary_loss_clip": 0.01140915, + "auxiliary_loss_mlp": 0.01025372, + "balance_loss_clip": 1.04839516, + "balance_loss_mlp": 1.01737881, + "epoch": 0.6047014970239885, + "flos": 19975059682560.0, + "grad_norm": 1.9567198271784882, + "language_loss": 0.7981137, + "learning_rate": 1.4270292577441864e-06, + "loss": 0.81977659, + "num_input_tokens_seen": 108442290, + "step": 5029, + "time_per_iteration": 3.3303167819976807 + }, + { + "auxiliary_loss_clip": 0.01162712, + "auxiliary_loss_mlp": 0.01025408, + "balance_loss_clip": 1.04649258, + "balance_loss_mlp": 1.01779008, + "epoch": 0.6048217399146275, + "flos": 25337097025920.0, + "grad_norm": 1.7279258048489456, + "language_loss": 0.71902871, + "learning_rate": 1.4262829804707836e-06, + "loss": 0.74090987, + "num_input_tokens_seen": 108464280, + "step": 5030, + "time_per_iteration": 3.3752284049987793 + }, + { + "auxiliary_loss_clip": 0.01160194, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.0453999, + "balance_loss_mlp": 1.02220249, + "epoch": 0.6049419828052667, + "flos": 26030819370240.0, + "grad_norm": 1.5921731271309492, + "language_loss": 0.69977695, + "learning_rate": 1.4255367902297958e-06, + "loss": 0.72167981, + "num_input_tokens_seen": 108485610, + "step": 5031, + "time_per_iteration": 2.5915298461914062 + }, + { + "auxiliary_loss_clip": 0.01170536, + "auxiliary_loss_mlp": 0.01025702, + "balance_loss_clip": 1.0487659, + "balance_loss_mlp": 1.01867485, + "epoch": 0.6050622256959057, + "flos": 14648106948480.0, + "grad_norm": 2.2485755283532822, + "language_loss": 0.78859353, + "learning_rate": 1.4247906871344215e-06, + "loss": 0.81055593, + "num_input_tokens_seen": 108501005, + "step": 5032, + "time_per_iteration": 2.505186080932617 + }, + { + "auxiliary_loss_clip": 0.01137593, + "auxiliary_loss_mlp": 0.01022325, + "balance_loss_clip": 1.04192829, + "balance_loss_mlp": 1.01521707, + "epoch": 0.6051824685865448, + "flos": 23331450337920.0, + "grad_norm": 2.253751772815964, + "language_loss": 0.75561202, + "learning_rate": 1.4240446712978415e-06, + "loss": 0.77721125, + "num_input_tokens_seen": 108519990, + "step": 5033, + "time_per_iteration": 3.3075037002563477 + }, + { + "auxiliary_loss_clip": 0.01164045, + "auxiliary_loss_mlp": 0.01022218, + "balance_loss_clip": 1.04926801, + "balance_loss_mlp": 1.01429629, + "epoch": 0.605302711477184, + "flos": 27563307177600.0, + "grad_norm": 2.6709949329221274, + "language_loss": 0.74540919, + "learning_rate": 1.423298742833227e-06, + "loss": 0.76727176, + "num_input_tokens_seen": 108538650, + "step": 5034, + "time_per_iteration": 2.5957117080688477 + }, + { + "auxiliary_loss_clip": 0.01135696, + "auxiliary_loss_mlp": 0.01025842, + "balance_loss_clip": 1.04329908, + "balance_loss_mlp": 1.01853466, + "epoch": 0.605422954367823, + "flos": 15154698412800.0, + "grad_norm": 2.5419086787295653, + "language_loss": 0.71709937, + "learning_rate": 1.4225529018537352e-06, + "loss": 0.73871475, + "num_input_tokens_seen": 108554155, + "step": 5035, + "time_per_iteration": 2.555943012237549 + }, + { + "auxiliary_loss_clip": 0.01173077, + "auxiliary_loss_mlp": 0.01023747, + "balance_loss_clip": 1.0493474, + "balance_loss_mlp": 1.01617742, + "epoch": 0.6055431972584621, + "flos": 27673912131840.0, + "grad_norm": 2.0162624884793305, + "language_loss": 0.78016722, + "learning_rate": 1.4218071484725082e-06, + "loss": 0.80213547, + "num_input_tokens_seen": 108576275, + "step": 5036, + "time_per_iteration": 2.493729591369629 + }, + { + "auxiliary_loss_clip": 0.01142879, + "auxiliary_loss_mlp": 0.01027318, + "balance_loss_clip": 1.04895663, + "balance_loss_mlp": 1.02009439, + "epoch": 0.6056634401491012, + "flos": 19387489006080.0, + "grad_norm": 2.310839492964316, + "language_loss": 0.76482093, + "learning_rate": 1.4210614828026786e-06, + "loss": 0.78652292, + "num_input_tokens_seen": 108594125, + "step": 5037, + "time_per_iteration": 2.489825963973999 + }, + { + "auxiliary_loss_clip": 0.01171122, + "auxiliary_loss_mlp": 0.01021608, + "balance_loss_clip": 1.04717171, + "balance_loss_mlp": 1.01428258, + "epoch": 0.6057836830397403, + "flos": 24789459294720.0, + "grad_norm": 1.6675480677389978, + "language_loss": 0.74372268, + "learning_rate": 1.4203159049573605e-06, + "loss": 0.76564997, + "num_input_tokens_seen": 108615360, + "step": 5038, + "time_per_iteration": 2.46870493888855 + }, + { + "auxiliary_loss_clip": 0.01153678, + "auxiliary_loss_mlp": 0.01027951, + "balance_loss_clip": 1.04712772, + "balance_loss_mlp": 1.02035177, + "epoch": 0.6059039259303793, + "flos": 20558248899840.0, + "grad_norm": 2.0885878790979397, + "language_loss": 0.86869681, + "learning_rate": 1.4195704150496593e-06, + "loss": 0.89051318, + "num_input_tokens_seen": 108633075, + "step": 5039, + "time_per_iteration": 2.4922938346862793 + }, + { + "auxiliary_loss_clip": 0.01145953, + "auxiliary_loss_mlp": 0.01025926, + "balance_loss_clip": 1.04758835, + "balance_loss_mlp": 1.0182786, + "epoch": 0.6060241688210185, + "flos": 21069724613760.0, + "grad_norm": 1.6954939680644043, + "language_loss": 0.74084383, + "learning_rate": 1.4188250131926639e-06, + "loss": 0.76256257, + "num_input_tokens_seen": 108651875, + "step": 5040, + "time_per_iteration": 2.4995040893554688 + }, + { + "auxiliary_loss_clip": 0.0114749, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.04733586, + "balance_loss_mlp": 1.02138007, + "epoch": 0.6061444117116576, + "flos": 16361081619840.0, + "grad_norm": 2.7160095569263847, + "language_loss": 0.80565512, + "learning_rate": 1.4180796994994525e-06, + "loss": 0.82742304, + "num_input_tokens_seen": 108669290, + "step": 5041, + "time_per_iteration": 2.466620445251465 + }, + { + "auxiliary_loss_clip": 0.01141583, + "auxiliary_loss_mlp": 0.01020351, + "balance_loss_clip": 1.04390097, + "balance_loss_mlp": 1.01273918, + "epoch": 0.6062646546022966, + "flos": 21507296094720.0, + "grad_norm": 2.1731245093484226, + "language_loss": 0.71785831, + "learning_rate": 1.4173344740830877e-06, + "loss": 0.73947763, + "num_input_tokens_seen": 108688420, + "step": 5042, + "time_per_iteration": 2.5347092151641846 + }, + { + "auxiliary_loss_clip": 0.01139947, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.04810739, + "balance_loss_mlp": 1.02240252, + "epoch": 0.6063848974929358, + "flos": 38983151283840.0, + "grad_norm": 2.1731888078752415, + "language_loss": 0.70905793, + "learning_rate": 1.4165893370566206e-06, + "loss": 0.73075497, + "num_input_tokens_seen": 108712175, + "step": 5043, + "time_per_iteration": 2.6482326984405518 + }, + { + "auxiliary_loss_clip": 0.01153968, + "auxiliary_loss_mlp": 0.01027349, + "balance_loss_clip": 1.0443747, + "balance_loss_mlp": 1.01955843, + "epoch": 0.6065051403835748, + "flos": 19646584784640.0, + "grad_norm": 1.5862931769991064, + "language_loss": 0.77396727, + "learning_rate": 1.4158442885330865e-06, + "loss": 0.79578042, + "num_input_tokens_seen": 108730745, + "step": 5044, + "time_per_iteration": 2.4632456302642822 + }, + { + "auxiliary_loss_clip": 0.01153796, + "auxiliary_loss_mlp": 0.01028253, + "balance_loss_clip": 1.04590058, + "balance_loss_mlp": 1.02032006, + "epoch": 0.6066253832742139, + "flos": 23513086437120.0, + "grad_norm": 1.9634254537168052, + "language_loss": 0.78720868, + "learning_rate": 1.4150993286255094e-06, + "loss": 0.8090291, + "num_input_tokens_seen": 108749995, + "step": 5045, + "time_per_iteration": 2.5132765769958496 + }, + { + "auxiliary_loss_clip": 0.01170971, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.04705119, + "balance_loss_mlp": 1.01847529, + "epoch": 0.6067456261648531, + "flos": 19133708440320.0, + "grad_norm": 2.3369664202585683, + "language_loss": 0.7998758, + "learning_rate": 1.4143544574468993e-06, + "loss": 0.82184184, + "num_input_tokens_seen": 108768355, + "step": 5046, + "time_per_iteration": 2.4268505573272705 + }, + { + "auxiliary_loss_clip": 0.01157218, + "auxiliary_loss_mlp": 0.01024179, + "balance_loss_clip": 1.04857516, + "balance_loss_mlp": 1.01608467, + "epoch": 0.6068658690554921, + "flos": 20520614424960.0, + "grad_norm": 8.240603596528468, + "language_loss": 0.82296801, + "learning_rate": 1.4136096751102523e-06, + "loss": 0.84478199, + "num_input_tokens_seen": 108786685, + "step": 5047, + "time_per_iteration": 2.4710874557495117 + }, + { + "auxiliary_loss_clip": 0.01149434, + "auxiliary_loss_mlp": 0.01024999, + "balance_loss_clip": 1.04876423, + "balance_loss_mlp": 1.01737857, + "epoch": 0.6069861119461312, + "flos": 27374560185600.0, + "grad_norm": 2.43315173968245, + "language_loss": 0.82850516, + "learning_rate": 1.4128649817285516e-06, + "loss": 0.85024953, + "num_input_tokens_seen": 108804820, + "step": 5048, + "time_per_iteration": 2.5515048503875732 + }, + { + "auxiliary_loss_clip": 0.01149035, + "auxiliary_loss_mlp": 0.0103732, + "balance_loss_clip": 1.04587281, + "balance_loss_mlp": 1.02924991, + "epoch": 0.6071063548367702, + "flos": 25626500904960.0, + "grad_norm": 2.4286473069103254, + "language_loss": 0.63316101, + "learning_rate": 1.412120377414766e-06, + "loss": 0.65502459, + "num_input_tokens_seen": 108825010, + "step": 5049, + "time_per_iteration": 2.5395915508270264 + }, + { + "auxiliary_loss_clip": 0.01175437, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.05187392, + "balance_loss_mlp": 1.02138543, + "epoch": 0.6072265977274094, + "flos": 24460517520000.0, + "grad_norm": 1.5974459034535566, + "language_loss": 0.7140137, + "learning_rate": 1.4113758622818522e-06, + "loss": 0.73605853, + "num_input_tokens_seen": 108845075, + "step": 5050, + "time_per_iteration": 2.4598827362060547 + }, + { + "auxiliary_loss_clip": 0.01151723, + "auxiliary_loss_mlp": 0.00761979, + "balance_loss_clip": 1.04876316, + "balance_loss_mlp": 1.00093329, + "epoch": 0.6073468406180484, + "flos": 18149253413760.0, + "grad_norm": 1.7939258701385687, + "language_loss": 0.83154958, + "learning_rate": 1.410631436442751e-06, + "loss": 0.85068661, + "num_input_tokens_seen": 108863870, + "step": 5051, + "time_per_iteration": 2.4900269508361816 + }, + { + "auxiliary_loss_clip": 0.01164116, + "auxiliary_loss_mlp": 0.01022711, + "balance_loss_clip": 1.04929852, + "balance_loss_mlp": 1.01515305, + "epoch": 0.6074670835086875, + "flos": 20697617669760.0, + "grad_norm": 2.32741857040436, + "language_loss": 0.86761463, + "learning_rate": 1.4098871000103936e-06, + "loss": 0.88948286, + "num_input_tokens_seen": 108882470, + "step": 5052, + "time_per_iteration": 2.5110909938812256 + }, + { + "auxiliary_loss_clip": 0.01145517, + "auxiliary_loss_mlp": 0.01022448, + "balance_loss_clip": 1.04514384, + "balance_loss_mlp": 1.01535773, + "epoch": 0.6075873263993267, + "flos": 23769955572480.0, + "grad_norm": 1.6297518025800937, + "language_loss": 0.82594967, + "learning_rate": 1.409142853097693e-06, + "loss": 0.84762937, + "num_input_tokens_seen": 108902710, + "step": 5053, + "time_per_iteration": 2.5653650760650635 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.01025284, + "balance_loss_clip": 1.04780853, + "balance_loss_mlp": 1.01804185, + "epoch": 0.6077075692899657, + "flos": 24454484035200.0, + "grad_norm": 1.9800821603083854, + "language_loss": 0.79534268, + "learning_rate": 1.408398695817553e-06, + "loss": 0.81707919, + "num_input_tokens_seen": 108919935, + "step": 5054, + "time_per_iteration": 2.601675271987915 + }, + { + "auxiliary_loss_clip": 0.01144207, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.04500186, + "balance_loss_mlp": 1.02301776, + "epoch": 0.6078278121806048, + "flos": 27382102041600.0, + "grad_norm": 1.6951405990305326, + "language_loss": 0.70397627, + "learning_rate": 1.4076546282828593e-06, + "loss": 0.72573531, + "num_input_tokens_seen": 108942790, + "step": 5055, + "time_per_iteration": 2.55871319770813 + }, + { + "auxiliary_loss_clip": 0.01146909, + "auxiliary_loss_mlp": 0.01028048, + "balance_loss_clip": 1.04227877, + "balance_loss_mlp": 1.02080309, + "epoch": 0.6079480550712439, + "flos": 38436447306240.0, + "grad_norm": 2.386306774092865, + "language_loss": 0.664698, + "learning_rate": 1.4069106506064874e-06, + "loss": 0.68644762, + "num_input_tokens_seen": 108964215, + "step": 5056, + "time_per_iteration": 3.436164140701294 + }, + { + "auxiliary_loss_clip": 0.01142408, + "auxiliary_loss_mlp": 0.01024217, + "balance_loss_clip": 1.04759479, + "balance_loss_mlp": 1.01670992, + "epoch": 0.608068297961883, + "flos": 25336271013120.0, + "grad_norm": 2.220441682122608, + "language_loss": 0.78227264, + "learning_rate": 1.4061667629012989e-06, + "loss": 0.80393887, + "num_input_tokens_seen": 108984885, + "step": 5057, + "time_per_iteration": 3.3665552139282227 + }, + { + "auxiliary_loss_clip": 0.0113923, + "auxiliary_loss_mlp": 0.01028006, + "balance_loss_clip": 1.04792428, + "balance_loss_mlp": 1.02048111, + "epoch": 0.608188540852522, + "flos": 24202463235840.0, + "grad_norm": 1.5939415220358613, + "language_loss": 0.83413965, + "learning_rate": 1.40542296528014e-06, + "loss": 0.85581201, + "num_input_tokens_seen": 109004545, + "step": 5058, + "time_per_iteration": 2.529470682144165 + }, + { + "auxiliary_loss_clip": 0.01159171, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.04728103, + "balance_loss_mlp": 1.02287388, + "epoch": 0.6083087837431612, + "flos": 21284146851840.0, + "grad_norm": 1.7304450002557008, + "language_loss": 0.75987029, + "learning_rate": 1.4046792578558452e-06, + "loss": 0.78176612, + "num_input_tokens_seen": 109022440, + "step": 5059, + "time_per_iteration": 3.2553634643554688 + }, + { + "auxiliary_loss_clip": 0.01142034, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.04497266, + "balance_loss_mlp": 1.0186367, + "epoch": 0.6084290266338003, + "flos": 16471435178880.0, + "grad_norm": 2.326344554255091, + "language_loss": 0.76198983, + "learning_rate": 1.4039356407412325e-06, + "loss": 0.78367531, + "num_input_tokens_seen": 109035680, + "step": 5060, + "time_per_iteration": 2.4574718475341797 + }, + { + "auxiliary_loss_clip": 0.01061992, + "auxiliary_loss_mlp": 0.01003226, + "balance_loss_clip": 1.01136708, + "balance_loss_mlp": 1.00217712, + "epoch": 0.6085492695244393, + "flos": 66443574931200.0, + "grad_norm": 0.785283083800807, + "language_loss": 0.57152855, + "learning_rate": 1.40319211404911e-06, + "loss": 0.59218073, + "num_input_tokens_seen": 109090680, + "step": 5061, + "time_per_iteration": 3.0189085006713867 + }, + { + "auxiliary_loss_clip": 0.01175059, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.050354, + "balance_loss_mlp": 1.01844847, + "epoch": 0.6086695124150785, + "flos": 23618986709760.0, + "grad_norm": 1.8611295618004335, + "language_loss": 0.90522325, + "learning_rate": 1.4024486778922691e-06, + "loss": 0.92723334, + "num_input_tokens_seen": 109108995, + "step": 5062, + "time_per_iteration": 2.466728925704956 + }, + { + "auxiliary_loss_clip": 0.01150263, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.04585099, + "balance_loss_mlp": 1.02252162, + "epoch": 0.6087897553057176, + "flos": 20157054917760.0, + "grad_norm": 1.8028853138237357, + "language_loss": 0.77416015, + "learning_rate": 1.4017053323834884e-06, + "loss": 0.79596514, + "num_input_tokens_seen": 109128825, + "step": 5063, + "time_per_iteration": 2.505993127822876 + }, + { + "auxiliary_loss_clip": 0.01147296, + "auxiliary_loss_mlp": 0.01024252, + "balance_loss_clip": 1.04517889, + "balance_loss_mlp": 1.01699543, + "epoch": 0.6089099981963566, + "flos": 25482535194240.0, + "grad_norm": 2.2325298618326315, + "language_loss": 0.76503849, + "learning_rate": 1.4009620776355333e-06, + "loss": 0.78675401, + "num_input_tokens_seen": 109150425, + "step": 5064, + "time_per_iteration": 2.5498297214508057 + }, + { + "auxiliary_loss_clip": 0.01157623, + "auxiliary_loss_mlp": 0.01022565, + "balance_loss_clip": 1.04734004, + "balance_loss_mlp": 1.01523924, + "epoch": 0.6090302410869958, + "flos": 25332895134720.0, + "grad_norm": 1.7861538574460152, + "language_loss": 0.79134959, + "learning_rate": 1.4002189137611553e-06, + "loss": 0.81315148, + "num_input_tokens_seen": 109169765, + "step": 5065, + "time_per_iteration": 2.489407777786255 + }, + { + "auxiliary_loss_clip": 0.01157494, + "auxiliary_loss_mlp": 0.01024477, + "balance_loss_clip": 1.04661107, + "balance_loss_mlp": 1.01700556, + "epoch": 0.6091504839776348, + "flos": 23987358639360.0, + "grad_norm": 2.08615944355927, + "language_loss": 0.69768763, + "learning_rate": 1.3994758408730901e-06, + "loss": 0.71950734, + "num_input_tokens_seen": 109188950, + "step": 5066, + "time_per_iteration": 2.4967920780181885 + }, + { + "auxiliary_loss_clip": 0.01148843, + "auxiliary_loss_mlp": 0.01024348, + "balance_loss_clip": 1.04937756, + "balance_loss_mlp": 1.01596761, + "epoch": 0.6092707268682739, + "flos": 29643037666560.0, + "grad_norm": 2.4884183080156275, + "language_loss": 0.7650255, + "learning_rate": 1.3987328590840629e-06, + "loss": 0.78675735, + "num_input_tokens_seen": 109209895, + "step": 5067, + "time_per_iteration": 2.585358142852783 + }, + { + "auxiliary_loss_clip": 0.01155492, + "auxiliary_loss_mlp": 0.01025494, + "balance_loss_clip": 1.04628265, + "balance_loss_mlp": 1.01827264, + "epoch": 0.609390969758913, + "flos": 24024957200640.0, + "grad_norm": 7.023162183374294, + "language_loss": 0.86077398, + "learning_rate": 1.397989968506783e-06, + "loss": 0.88258386, + "num_input_tokens_seen": 109228905, + "step": 5068, + "time_per_iteration": 2.5016214847564697 + }, + { + "auxiliary_loss_clip": 0.01179215, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.05204391, + "balance_loss_mlp": 1.0248313, + "epoch": 0.6095112126495521, + "flos": 11102143288320.0, + "grad_norm": 2.2038566463926297, + "language_loss": 0.72785795, + "learning_rate": 1.3972471692539458e-06, + "loss": 0.74997723, + "num_input_tokens_seen": 109243620, + "step": 5069, + "time_per_iteration": 2.4420711994171143 + }, + { + "auxiliary_loss_clip": 0.01142915, + "auxiliary_loss_mlp": 0.01024619, + "balance_loss_clip": 1.04680383, + "balance_loss_mlp": 1.01706123, + "epoch": 0.6096314555401912, + "flos": 17265491187840.0, + "grad_norm": 1.9906887153899553, + "language_loss": 0.75086504, + "learning_rate": 1.3965044614382348e-06, + "loss": 0.77254033, + "num_input_tokens_seen": 109259070, + "step": 5070, + "time_per_iteration": 2.5050253868103027 + }, + { + "auxiliary_loss_clip": 0.01178488, + "auxiliary_loss_mlp": 0.01024831, + "balance_loss_clip": 1.05148494, + "balance_loss_mlp": 1.01726151, + "epoch": 0.6097516984308303, + "flos": 21645910679040.0, + "grad_norm": 2.888404512110917, + "language_loss": 0.7509917, + "learning_rate": 1.3957618451723162e-06, + "loss": 0.77302492, + "num_input_tokens_seen": 109275100, + "step": 5071, + "time_per_iteration": 2.4561729431152344 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.04815888, + "balance_loss_mlp": 1.02094388, + "epoch": 0.6098719413214694, + "flos": 27199208966400.0, + "grad_norm": 1.9713347396194176, + "language_loss": 0.7117092, + "learning_rate": 1.3950193205688457e-06, + "loss": 0.73347592, + "num_input_tokens_seen": 109294825, + "step": 5072, + "time_per_iteration": 2.574273109436035 + }, + { + "auxiliary_loss_clip": 0.01143111, + "auxiliary_loss_mlp": 0.01025121, + "balance_loss_clip": 1.0469842, + "balance_loss_mlp": 1.01760507, + "epoch": 0.6099921842121084, + "flos": 20412954385920.0, + "grad_norm": 2.2147909382042017, + "language_loss": 0.83891481, + "learning_rate": 1.3942768877404627e-06, + "loss": 0.86059713, + "num_input_tokens_seen": 109313790, + "step": 5073, + "time_per_iteration": 2.5697665214538574 + }, + { + "auxiliary_loss_clip": 0.01173086, + "auxiliary_loss_mlp": 0.0102716, + "balance_loss_clip": 1.04931498, + "balance_loss_mlp": 1.02000785, + "epoch": 0.6101124271027476, + "flos": 23366139897600.0, + "grad_norm": 1.487013017789346, + "language_loss": 0.73356891, + "learning_rate": 1.393534546799795e-06, + "loss": 0.75557137, + "num_input_tokens_seen": 109333490, + "step": 5074, + "time_per_iteration": 2.493177652359009 + }, + { + "auxiliary_loss_clip": 0.01137267, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.04641533, + "balance_loss_mlp": 1.02223206, + "epoch": 0.6102326699933867, + "flos": 26687840993280.0, + "grad_norm": 2.3367251086204055, + "language_loss": 0.67623508, + "learning_rate": 1.3927922978594536e-06, + "loss": 0.697914, + "num_input_tokens_seen": 109354575, + "step": 5075, + "time_per_iteration": 2.566347122192383 + }, + { + "auxiliary_loss_clip": 0.01056732, + "auxiliary_loss_mlp": 0.01001159, + "balance_loss_clip": 1.01099062, + "balance_loss_mlp": 0.99997884, + "epoch": 0.6103529128840257, + "flos": 60644612551680.0, + "grad_norm": 0.7735945396092758, + "language_loss": 0.57383871, + "learning_rate": 1.3920501410320387e-06, + "loss": 0.59441763, + "num_input_tokens_seen": 109410690, + "step": 5076, + "time_per_iteration": 3.0285003185272217 + }, + { + "auxiliary_loss_clip": 0.01145776, + "auxiliary_loss_mlp": 0.01026071, + "balance_loss_clip": 1.04555428, + "balance_loss_mlp": 1.01817942, + "epoch": 0.6104731557746649, + "flos": 19021307806080.0, + "grad_norm": 2.3516698671581597, + "language_loss": 0.76220047, + "learning_rate": 1.3913080764301333e-06, + "loss": 0.78391898, + "num_input_tokens_seen": 109427650, + "step": 5077, + "time_per_iteration": 2.5107295513153076 + }, + { + "auxiliary_loss_clip": 0.01124844, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.04141402, + "balance_loss_mlp": 1.02493382, + "epoch": 0.6105933986653039, + "flos": 23366894083200.0, + "grad_norm": 1.7601405985583773, + "language_loss": 0.71215856, + "learning_rate": 1.3905661041663085e-06, + "loss": 0.73373103, + "num_input_tokens_seen": 109448835, + "step": 5078, + "time_per_iteration": 2.6028382778167725 + }, + { + "auxiliary_loss_clip": 0.01159064, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.0489006, + "balance_loss_mlp": 1.02294683, + "epoch": 0.610713641555943, + "flos": 34637565006720.0, + "grad_norm": 2.11198201107076, + "language_loss": 0.65204662, + "learning_rate": 1.389824224353122e-06, + "loss": 0.67394632, + "num_input_tokens_seen": 109470425, + "step": 5079, + "time_per_iteration": 2.5768003463745117 + }, + { + "auxiliary_loss_clip": 0.01160737, + "auxiliary_loss_mlp": 0.01024495, + "balance_loss_clip": 1.05047989, + "balance_loss_mlp": 1.01688015, + "epoch": 0.610833884446582, + "flos": 26646471504000.0, + "grad_norm": 1.4983654483306084, + "language_loss": 0.76745665, + "learning_rate": 1.389082437103115e-06, + "loss": 0.78930891, + "num_input_tokens_seen": 109489695, + "step": 5080, + "time_per_iteration": 2.5149295330047607 + }, + { + "auxiliary_loss_clip": 0.01129303, + "auxiliary_loss_mlp": 0.01024919, + "balance_loss_clip": 1.04269791, + "balance_loss_mlp": 1.01689005, + "epoch": 0.6109541273372212, + "flos": 21215126868480.0, + "grad_norm": 2.6117799871561607, + "language_loss": 0.77522969, + "learning_rate": 1.3883407425288172e-06, + "loss": 0.79677188, + "num_input_tokens_seen": 109510030, + "step": 5081, + "time_per_iteration": 2.542107343673706 + }, + { + "auxiliary_loss_clip": 0.01143276, + "auxiliary_loss_mlp": 0.01023549, + "balance_loss_clip": 1.0446763, + "balance_loss_mlp": 1.01578283, + "epoch": 0.6110743702278603, + "flos": 20084084438400.0, + "grad_norm": 2.516869151255249, + "language_loss": 0.79530919, + "learning_rate": 1.3875991407427417e-06, + "loss": 0.8169775, + "num_input_tokens_seen": 109528255, + "step": 5082, + "time_per_iteration": 3.312303066253662 + }, + { + "auxiliary_loss_clip": 0.01043414, + "auxiliary_loss_mlp": 0.01001235, + "balance_loss_clip": 1.01138854, + "balance_loss_mlp": 1.00009656, + "epoch": 0.6111946131184993, + "flos": 68302957438080.0, + "grad_norm": 0.7650526312814462, + "language_loss": 0.58166122, + "learning_rate": 1.38685763185739e-06, + "loss": 0.60210776, + "num_input_tokens_seen": 109581915, + "step": 5083, + "time_per_iteration": 4.612701177597046 + }, + { + "auxiliary_loss_clip": 0.01171352, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.04765952, + "balance_loss_mlp": 1.01911759, + "epoch": 0.6113148560091385, + "flos": 19937676602880.0, + "grad_norm": 2.9891448337666886, + "language_loss": 0.67350131, + "learning_rate": 1.3861162159852476e-06, + "loss": 0.69548297, + "num_input_tokens_seen": 109600050, + "step": 5084, + "time_per_iteration": 2.434312343597412 + }, + { + "auxiliary_loss_clip": 0.01150971, + "auxiliary_loss_mlp": 0.0102438, + "balance_loss_clip": 1.04844713, + "balance_loss_mlp": 1.01623225, + "epoch": 0.6114350988997775, + "flos": 23731854220800.0, + "grad_norm": 1.9852568744895442, + "language_loss": 0.79900324, + "learning_rate": 1.3853748932387875e-06, + "loss": 0.82075673, + "num_input_tokens_seen": 109620690, + "step": 5085, + "time_per_iteration": 3.2816481590270996 + }, + { + "auxiliary_loss_clip": 0.01134564, + "auxiliary_loss_mlp": 0.01021488, + "balance_loss_clip": 1.04419589, + "balance_loss_mlp": 1.01382279, + "epoch": 0.6115553417904166, + "flos": 24023700224640.0, + "grad_norm": 2.371526245427314, + "language_loss": 0.75241411, + "learning_rate": 1.3846336637304671e-06, + "loss": 0.77397466, + "num_input_tokens_seen": 109638960, + "step": 5086, + "time_per_iteration": 2.522296667098999 + }, + { + "auxiliary_loss_clip": 0.01138519, + "auxiliary_loss_mlp": 0.01021336, + "balance_loss_clip": 1.04593587, + "balance_loss_mlp": 1.01388526, + "epoch": 0.6116755846810558, + "flos": 23733542160000.0, + "grad_norm": 5.675752808155373, + "language_loss": 0.83137792, + "learning_rate": 1.3838925275727316e-06, + "loss": 0.8529765, + "num_input_tokens_seen": 109659700, + "step": 5087, + "time_per_iteration": 2.529971122741699 + }, + { + "auxiliary_loss_clip": 0.01177632, + "auxiliary_loss_mlp": 0.01023444, + "balance_loss_clip": 1.05306792, + "balance_loss_mlp": 1.01610923, + "epoch": 0.6117958275716948, + "flos": 18661626967680.0, + "grad_norm": 1.7221867742308503, + "language_loss": 0.79084492, + "learning_rate": 1.3831514848780089e-06, + "loss": 0.81285572, + "num_input_tokens_seen": 109679275, + "step": 5088, + "time_per_iteration": 2.4443085193634033 + }, + { + "auxiliary_loss_clip": 0.01155012, + "auxiliary_loss_mlp": 0.01025625, + "balance_loss_clip": 1.04732394, + "balance_loss_mlp": 1.01860619, + "epoch": 0.6119160704623339, + "flos": 16471183783680.0, + "grad_norm": 2.3229384792805465, + "language_loss": 0.91688633, + "learning_rate": 1.3824105357587152e-06, + "loss": 0.93869269, + "num_input_tokens_seen": 109696380, + "step": 5089, + "time_per_iteration": 2.4471707344055176 + }, + { + "auxiliary_loss_clip": 0.01140606, + "auxiliary_loss_mlp": 0.01023848, + "balance_loss_clip": 1.04441166, + "balance_loss_mlp": 1.01641512, + "epoch": 0.612036313352973, + "flos": 23915465568000.0, + "grad_norm": 1.5009420787135197, + "language_loss": 0.82668811, + "learning_rate": 1.381669680327253e-06, + "loss": 0.84833264, + "num_input_tokens_seen": 109718060, + "step": 5090, + "time_per_iteration": 2.5493695735931396 + }, + { + "auxiliary_loss_clip": 0.01141398, + "auxiliary_loss_mlp": 0.0102734, + "balance_loss_clip": 1.04852724, + "balance_loss_mlp": 1.01966298, + "epoch": 0.6121565562436121, + "flos": 26974766833920.0, + "grad_norm": 2.3687511614301844, + "language_loss": 0.70693433, + "learning_rate": 1.380928918696008e-06, + "loss": 0.72862166, + "num_input_tokens_seen": 109736830, + "step": 5091, + "time_per_iteration": 2.540473699569702 + }, + { + "auxiliary_loss_clip": 0.01158784, + "auxiliary_loss_mlp": 0.01025088, + "balance_loss_clip": 1.04628813, + "balance_loss_mlp": 1.01756597, + "epoch": 0.6122767991342511, + "flos": 15668867646720.0, + "grad_norm": 2.4020148164260484, + "language_loss": 0.7161032, + "learning_rate": 1.3801882509773548e-06, + "loss": 0.73794186, + "num_input_tokens_seen": 109754690, + "step": 5092, + "time_per_iteration": 2.4587152004241943 + }, + { + "auxiliary_loss_clip": 0.01153331, + "auxiliary_loss_mlp": 0.01026715, + "balance_loss_clip": 1.04460764, + "balance_loss_mlp": 1.01876378, + "epoch": 0.6123970420248903, + "flos": 27964321591680.0, + "grad_norm": 2.131457858663716, + "language_loss": 0.8152957, + "learning_rate": 1.3794476772836503e-06, + "loss": 0.83709615, + "num_input_tokens_seen": 109775790, + "step": 5093, + "time_per_iteration": 2.5212724208831787 + }, + { + "auxiliary_loss_clip": 0.01121652, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.04330456, + "balance_loss_mlp": 1.02257407, + "epoch": 0.6125172849155294, + "flos": 21468727866240.0, + "grad_norm": 1.6589641400694592, + "language_loss": 0.84122229, + "learning_rate": 1.3787071977272402e-06, + "loss": 0.86274195, + "num_input_tokens_seen": 109795050, + "step": 5094, + "time_per_iteration": 2.5735199451446533 + }, + { + "auxiliary_loss_clip": 0.01113616, + "auxiliary_loss_mlp": 0.01027904, + "balance_loss_clip": 1.04625487, + "balance_loss_mlp": 1.02048349, + "epoch": 0.6126375278061684, + "flos": 16248321849600.0, + "grad_norm": 3.420129416755712, + "language_loss": 0.72150326, + "learning_rate": 1.3779668124204535e-06, + "loss": 0.74291849, + "num_input_tokens_seen": 109811465, + "step": 5095, + "time_per_iteration": 2.5460307598114014 + }, + { + "auxiliary_loss_clip": 0.01140983, + "auxiliary_loss_mlp": 0.01023782, + "balance_loss_clip": 1.04864454, + "balance_loss_mlp": 1.01604509, + "epoch": 0.6127577706968076, + "flos": 20448865008000.0, + "grad_norm": 1.6563832337942141, + "language_loss": 0.80760014, + "learning_rate": 1.3772265214756074e-06, + "loss": 0.82924777, + "num_input_tokens_seen": 109831225, + "step": 5096, + "time_per_iteration": 2.5117568969726562 + }, + { + "auxiliary_loss_clip": 0.01160606, + "auxiliary_loss_mlp": 0.01026028, + "balance_loss_clip": 1.04513597, + "balance_loss_mlp": 1.01846385, + "epoch": 0.6128780135874466, + "flos": 18260397072000.0, + "grad_norm": 1.9943771946405042, + "language_loss": 0.75267535, + "learning_rate": 1.3764863250050025e-06, + "loss": 0.77454162, + "num_input_tokens_seen": 109849465, + "step": 5097, + "time_per_iteration": 2.479778289794922 + }, + { + "auxiliary_loss_clip": 0.01131445, + "auxiliary_loss_mlp": 0.01025279, + "balance_loss_clip": 1.04379332, + "balance_loss_mlp": 1.01813877, + "epoch": 0.6129982564780857, + "flos": 24937088192640.0, + "grad_norm": 1.981780912091678, + "language_loss": 0.80632573, + "learning_rate": 1.3757462231209272e-06, + "loss": 0.82789296, + "num_input_tokens_seen": 109869770, + "step": 5098, + "time_per_iteration": 2.662351608276367 + }, + { + "auxiliary_loss_clip": 0.01140503, + "auxiliary_loss_mlp": 0.01021938, + "balance_loss_clip": 1.04517055, + "balance_loss_mlp": 1.01402903, + "epoch": 0.6131184993687249, + "flos": 22492038430080.0, + "grad_norm": 2.0171264934790583, + "language_loss": 0.88763428, + "learning_rate": 1.3750062159356525e-06, + "loss": 0.90925872, + "num_input_tokens_seen": 109889120, + "step": 5099, + "time_per_iteration": 2.5161983966827393 + }, + { + "auxiliary_loss_clip": 0.01122971, + "auxiliary_loss_mlp": 0.01026349, + "balance_loss_clip": 1.04434502, + "balance_loss_mlp": 1.01906502, + "epoch": 0.6132387422593639, + "flos": 15885839750400.0, + "grad_norm": 2.04367757375538, + "language_loss": 0.82867253, + "learning_rate": 1.3742663035614382e-06, + "loss": 0.85016572, + "num_input_tokens_seen": 109906490, + "step": 5100, + "time_per_iteration": 2.517151355743408 + }, + { + "auxiliary_loss_clip": 0.01175285, + "auxiliary_loss_mlp": 0.01029643, + "balance_loss_clip": 1.05015898, + "balance_loss_mlp": 1.02184093, + "epoch": 0.613358985150003, + "flos": 25411539962880.0, + "grad_norm": 2.009991990939654, + "language_loss": 0.80037117, + "learning_rate": 1.3735264861105283e-06, + "loss": 0.82242042, + "num_input_tokens_seen": 109927130, + "step": 5101, + "time_per_iteration": 2.5776989459991455 + }, + { + "auxiliary_loss_clip": 0.01131299, + "auxiliary_loss_mlp": 0.01025544, + "balance_loss_clip": 1.04308724, + "balance_loss_mlp": 1.01837039, + "epoch": 0.6134792280406421, + "flos": 21361283308800.0, + "grad_norm": 1.9048278151711167, + "language_loss": 0.78469151, + "learning_rate": 1.372786763695152e-06, + "loss": 0.80625993, + "num_input_tokens_seen": 109945890, + "step": 5102, + "time_per_iteration": 2.5326650142669678 + }, + { + "auxiliary_loss_clip": 0.0115952, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.04623199, + "balance_loss_mlp": 1.02205157, + "epoch": 0.6135994709312812, + "flos": 21211248199680.0, + "grad_norm": 1.799539783501876, + "language_loss": 0.77644503, + "learning_rate": 1.3720471364275257e-06, + "loss": 0.79834002, + "num_input_tokens_seen": 109965535, + "step": 5103, + "time_per_iteration": 2.4551656246185303 + }, + { + "auxiliary_loss_clip": 0.01130285, + "auxiliary_loss_mlp": 0.00762707, + "balance_loss_clip": 1.0460124, + "balance_loss_mlp": 1.00095665, + "epoch": 0.6137197138219203, + "flos": 14794047907200.0, + "grad_norm": 1.883237500927885, + "language_loss": 0.77828056, + "learning_rate": 1.3713076044198486e-06, + "loss": 0.79721051, + "num_input_tokens_seen": 109982345, + "step": 5104, + "time_per_iteration": 2.510901927947998 + }, + { + "auxiliary_loss_clip": 0.01140271, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.04549801, + "balance_loss_mlp": 1.02297473, + "epoch": 0.6138399567125594, + "flos": 20084515401600.0, + "grad_norm": 2.997149840557895, + "language_loss": 0.80803382, + "learning_rate": 1.3705681677843086e-06, + "loss": 0.82974625, + "num_input_tokens_seen": 110000940, + "step": 5105, + "time_per_iteration": 2.498356342315674 + }, + { + "auxiliary_loss_clip": 0.01069655, + "auxiliary_loss_mlp": 0.01004952, + "balance_loss_clip": 1.00980091, + "balance_loss_mlp": 1.00376558, + "epoch": 0.6139601996031985, + "flos": 60123838193280.0, + "grad_norm": 0.7832404538040459, + "language_loss": 0.60579211, + "learning_rate": 1.3698288266330768e-06, + "loss": 0.62653816, + "num_input_tokens_seen": 110061565, + "step": 5106, + "time_per_iteration": 3.123157501220703 + }, + { + "auxiliary_loss_clip": 0.01144006, + "auxiliary_loss_mlp": 0.01026869, + "balance_loss_clip": 1.0504576, + "balance_loss_mlp": 1.01971376, + "epoch": 0.6140804424938375, + "flos": 23586703361280.0, + "grad_norm": 2.32609039371574, + "language_loss": 0.72945851, + "learning_rate": 1.3690895810783113e-06, + "loss": 0.7511673, + "num_input_tokens_seen": 110080360, + "step": 5107, + "time_per_iteration": 2.522468328475952 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.00762234, + "balance_loss_clip": 1.04188359, + "balance_loss_mlp": 1.00091434, + "epoch": 0.6142006853844767, + "flos": 21398199511680.0, + "grad_norm": 2.8139078123001973, + "language_loss": 0.71587187, + "learning_rate": 1.3683504312321543e-06, + "loss": 0.73460931, + "num_input_tokens_seen": 110100695, + "step": 5108, + "time_per_iteration": 3.3725545406341553 + }, + { + "auxiliary_loss_clip": 0.01162668, + "auxiliary_loss_mlp": 0.0102555, + "balance_loss_clip": 1.04750323, + "balance_loss_mlp": 1.0177238, + "epoch": 0.6143209282751158, + "flos": 12057367622400.0, + "grad_norm": 2.0239575321473717, + "language_loss": 0.80005813, + "learning_rate": 1.3676113772067355e-06, + "loss": 0.8219403, + "num_input_tokens_seen": 110117750, + "step": 5109, + "time_per_iteration": 2.4952569007873535 + }, + { + "auxiliary_loss_clip": 0.01123608, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.04424572, + "balance_loss_mlp": 1.02016151, + "epoch": 0.6144411711657548, + "flos": 25082274965760.0, + "grad_norm": 1.965517813736306, + "language_loss": 0.72517532, + "learning_rate": 1.3668724191141671e-06, + "loss": 0.74668813, + "num_input_tokens_seen": 110137020, + "step": 5110, + "time_per_iteration": 3.3986213207244873 + }, + { + "auxiliary_loss_clip": 0.01127527, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.05055416, + "balance_loss_mlp": 1.02338791, + "epoch": 0.6145614140563939, + "flos": 20114069316480.0, + "grad_norm": 2.150872405261927, + "language_loss": 0.66048485, + "learning_rate": 1.3661335570665493e-06, + "loss": 0.68206859, + "num_input_tokens_seen": 110154930, + "step": 5111, + "time_per_iteration": 2.5517046451568604 + }, + { + "auxiliary_loss_clip": 0.01150832, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.04953194, + "balance_loss_mlp": 1.02102852, + "epoch": 0.614681656947033, + "flos": 16800376953600.0, + "grad_norm": 2.507269727122353, + "language_loss": 0.70099467, + "learning_rate": 1.3653947911759676e-06, + "loss": 0.72278726, + "num_input_tokens_seen": 110172480, + "step": 5112, + "time_per_iteration": 3.226322650909424 + }, + { + "auxiliary_loss_clip": 0.01111914, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.04393983, + "balance_loss_mlp": 1.02545965, + "epoch": 0.6148018998376721, + "flos": 38801587011840.0, + "grad_norm": 2.3105793537484685, + "language_loss": 0.74480832, + "learning_rate": 1.3646561215544904e-06, + "loss": 0.76626521, + "num_input_tokens_seen": 110197120, + "step": 5113, + "time_per_iteration": 2.748044967651367 + }, + { + "auxiliary_loss_clip": 0.01159632, + "auxiliary_loss_mlp": 0.01021783, + "balance_loss_clip": 1.04841089, + "balance_loss_mlp": 1.01404333, + "epoch": 0.6149221427283111, + "flos": 23327032965120.0, + "grad_norm": 2.09988237561293, + "language_loss": 0.79367054, + "learning_rate": 1.363917548314176e-06, + "loss": 0.81548464, + "num_input_tokens_seen": 110216385, + "step": 5114, + "time_per_iteration": 2.49957537651062 + }, + { + "auxiliary_loss_clip": 0.01166293, + "auxiliary_loss_mlp": 0.01027291, + "balance_loss_clip": 1.0482161, + "balance_loss_mlp": 1.0199213, + "epoch": 0.6150423856189503, + "flos": 22379494141440.0, + "grad_norm": 2.387694026683991, + "language_loss": 0.73145008, + "learning_rate": 1.3631790715670626e-06, + "loss": 0.75338602, + "num_input_tokens_seen": 110234790, + "step": 5115, + "time_per_iteration": 2.4730191230773926 + }, + { + "auxiliary_loss_clip": 0.01081815, + "auxiliary_loss_mlp": 0.01022016, + "balance_loss_clip": 1.04403329, + "balance_loss_mlp": 1.01514101, + "epoch": 0.6151626285095894, + "flos": 18692078722560.0, + "grad_norm": 1.8372455900580082, + "language_loss": 0.85418034, + "learning_rate": 1.3624406914251783e-06, + "loss": 0.87521863, + "num_input_tokens_seen": 110251910, + "step": 5116, + "time_per_iteration": 2.609921932220459 + }, + { + "auxiliary_loss_clip": 0.01159681, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.04581094, + "balance_loss_mlp": 1.02277839, + "epoch": 0.6152828714002284, + "flos": 15851688894720.0, + "grad_norm": 1.9245791342039038, + "language_loss": 0.88439143, + "learning_rate": 1.3617024080005335e-06, + "loss": 0.90628427, + "num_input_tokens_seen": 110268810, + "step": 5117, + "time_per_iteration": 2.450669050216675 + }, + { + "auxiliary_loss_clip": 0.01148249, + "auxiliary_loss_mlp": 0.00762093, + "balance_loss_clip": 1.04538417, + "balance_loss_mlp": 1.00104129, + "epoch": 0.6154031142908676, + "flos": 24869792062080.0, + "grad_norm": 3.031096369072982, + "language_loss": 0.74350667, + "learning_rate": 1.3609642214051266e-06, + "loss": 0.76261014, + "num_input_tokens_seen": 110293035, + "step": 5118, + "time_per_iteration": 2.6209850311279297 + }, + { + "auxiliary_loss_clip": 0.01138878, + "auxiliary_loss_mlp": 0.01028774, + "balance_loss_clip": 1.04738593, + "balance_loss_mlp": 1.0208528, + "epoch": 0.6155233571815066, + "flos": 19244744357760.0, + "grad_norm": 2.083995204102124, + "language_loss": 0.65867937, + "learning_rate": 1.3602261317509385e-06, + "loss": 0.68035597, + "num_input_tokens_seen": 110309695, + "step": 5119, + "time_per_iteration": 2.5007104873657227 + }, + { + "auxiliary_loss_clip": 0.01162134, + "auxiliary_loss_mlp": 0.01022113, + "balance_loss_clip": 1.04835606, + "balance_loss_mlp": 1.01408982, + "epoch": 0.6156436000721457, + "flos": 18770077105920.0, + "grad_norm": 2.4172311392000236, + "language_loss": 0.81997931, + "learning_rate": 1.3594881391499387e-06, + "loss": 0.84182179, + "num_input_tokens_seen": 110328610, + "step": 5120, + "time_per_iteration": 2.4600441455841064 + }, + { + "auxiliary_loss_clip": 0.01150187, + "auxiliary_loss_mlp": 0.0102707, + "balance_loss_clip": 1.04886436, + "balance_loss_mlp": 1.01922047, + "epoch": 0.6157638429627849, + "flos": 18041198325120.0, + "grad_norm": 1.7545953805683276, + "language_loss": 0.79344785, + "learning_rate": 1.3587502437140778e-06, + "loss": 0.81522042, + "num_input_tokens_seen": 110346775, + "step": 5121, + "time_per_iteration": 2.48138165473938 + }, + { + "auxiliary_loss_clip": 0.01148893, + "auxiliary_loss_mlp": 0.01026412, + "balance_loss_clip": 1.04600823, + "balance_loss_mlp": 1.01866972, + "epoch": 0.6158840858534239, + "flos": 25556726736000.0, + "grad_norm": 2.4264165643319084, + "language_loss": 0.85047513, + "learning_rate": 1.3580124455552952e-06, + "loss": 0.87222815, + "num_input_tokens_seen": 110366140, + "step": 5122, + "time_per_iteration": 2.601832389831543 + }, + { + "auxiliary_loss_clip": 0.01160568, + "auxiliary_loss_mlp": 0.00761413, + "balance_loss_clip": 1.04889679, + "balance_loss_mlp": 1.00108898, + "epoch": 0.616004328744063, + "flos": 24640788902400.0, + "grad_norm": 1.8716399840288147, + "language_loss": 0.87615049, + "learning_rate": 1.3572747447855148e-06, + "loss": 0.89537024, + "num_input_tokens_seen": 110386550, + "step": 5123, + "time_per_iteration": 2.54042911529541 + }, + { + "auxiliary_loss_clip": 0.01178466, + "auxiliary_loss_mlp": 0.01025553, + "balance_loss_clip": 1.05306315, + "balance_loss_mlp": 1.01788211, + "epoch": 0.6161245716347021, + "flos": 21689686379520.0, + "grad_norm": 1.7569713524084278, + "language_loss": 0.69052863, + "learning_rate": 1.356537141516644e-06, + "loss": 0.71256882, + "num_input_tokens_seen": 110403970, + "step": 5124, + "time_per_iteration": 2.44518780708313 + }, + { + "auxiliary_loss_clip": 0.01161715, + "auxiliary_loss_mlp": 0.01026066, + "balance_loss_clip": 1.05088854, + "balance_loss_mlp": 1.01844275, + "epoch": 0.6162448145253412, + "flos": 35189225061120.0, + "grad_norm": 2.0645172126361926, + "language_loss": 0.61632109, + "learning_rate": 1.3557996358605775e-06, + "loss": 0.63819891, + "num_input_tokens_seen": 110423890, + "step": 5125, + "time_per_iteration": 2.5845084190368652 + }, + { + "auxiliary_loss_clip": 0.01158481, + "auxiliary_loss_mlp": 0.01028013, + "balance_loss_clip": 1.04738593, + "balance_loss_mlp": 1.02082133, + "epoch": 0.6163650574159802, + "flos": 21615279356160.0, + "grad_norm": 3.033971106659516, + "language_loss": 0.70583391, + "learning_rate": 1.3550622279291941e-06, + "loss": 0.7276988, + "num_input_tokens_seen": 110442035, + "step": 5126, + "time_per_iteration": 2.4723961353302 + }, + { + "auxiliary_loss_clip": 0.01108458, + "auxiliary_loss_mlp": 0.01024287, + "balance_loss_clip": 1.04140246, + "balance_loss_mlp": 1.01655662, + "epoch": 0.6164853003066194, + "flos": 24572163968640.0, + "grad_norm": 1.4376602960312546, + "language_loss": 0.83175713, + "learning_rate": 1.354324917834358e-06, + "loss": 0.85308456, + "num_input_tokens_seen": 110463280, + "step": 5127, + "time_per_iteration": 2.6061646938323975 + }, + { + "auxiliary_loss_clip": 0.01098352, + "auxiliary_loss_mlp": 0.00762102, + "balance_loss_clip": 1.0405606, + "balance_loss_mlp": 1.00102234, + "epoch": 0.6166055431972585, + "flos": 21835986474240.0, + "grad_norm": 2.464458056768648, + "language_loss": 0.76817596, + "learning_rate": 1.353587705687918e-06, + "loss": 0.78678048, + "num_input_tokens_seen": 110481455, + "step": 5128, + "time_per_iteration": 2.6060678958892822 + }, + { + "auxiliary_loss_clip": 0.01149546, + "auxiliary_loss_mlp": 0.01025282, + "balance_loss_clip": 1.04813445, + "balance_loss_mlp": 1.01766431, + "epoch": 0.6167257860878975, + "flos": 17785262943360.0, + "grad_norm": 3.70095349431242, + "language_loss": 0.72384924, + "learning_rate": 1.3528505916017096e-06, + "loss": 0.74559748, + "num_input_tokens_seen": 110499155, + "step": 5129, + "time_per_iteration": 2.4798290729522705 + }, + { + "auxiliary_loss_clip": 0.01159523, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.04680467, + "balance_loss_mlp": 1.02626562, + "epoch": 0.6168460289785367, + "flos": 23214811898880.0, + "grad_norm": 2.034478975280225, + "language_loss": 0.88297796, + "learning_rate": 1.3521135756875514e-06, + "loss": 0.90491462, + "num_input_tokens_seen": 110515470, + "step": 5130, + "time_per_iteration": 2.462958574295044 + }, + { + "auxiliary_loss_clip": 0.01095244, + "auxiliary_loss_mlp": 0.01022074, + "balance_loss_clip": 1.04171753, + "balance_loss_mlp": 1.01494217, + "epoch": 0.6169662718691757, + "flos": 26213281482240.0, + "grad_norm": 2.1723950211145078, + "language_loss": 0.86092842, + "learning_rate": 1.3513766580572496e-06, + "loss": 0.88210166, + "num_input_tokens_seen": 110538290, + "step": 5131, + "time_per_iteration": 2.671987533569336 + }, + { + "auxiliary_loss_clip": 0.01158708, + "auxiliary_loss_mlp": 0.01023957, + "balance_loss_clip": 1.04716825, + "balance_loss_mlp": 1.01704907, + "epoch": 0.6170865147598148, + "flos": 19026120228480.0, + "grad_norm": 2.093452949976627, + "language_loss": 0.77169865, + "learning_rate": 1.3506398388225924e-06, + "loss": 0.79352534, + "num_input_tokens_seen": 110555610, + "step": 5132, + "time_per_iteration": 2.456223726272583 + }, + { + "auxiliary_loss_clip": 0.01172374, + "auxiliary_loss_mlp": 0.01025262, + "balance_loss_clip": 1.05017352, + "balance_loss_mlp": 1.01830876, + "epoch": 0.617206757650454, + "flos": 18260361158400.0, + "grad_norm": 1.7981572918323914, + "language_loss": 0.71909374, + "learning_rate": 1.349903118095355e-06, + "loss": 0.74107015, + "num_input_tokens_seen": 110574745, + "step": 5133, + "time_per_iteration": 2.4297428131103516 + }, + { + "auxiliary_loss_clip": 0.01161814, + "auxiliary_loss_mlp": 0.01026776, + "balance_loss_clip": 1.04721999, + "balance_loss_mlp": 1.01943564, + "epoch": 0.617327000541093, + "flos": 18186959715840.0, + "grad_norm": 1.7488514873706469, + "language_loss": 0.7320624, + "learning_rate": 1.349166495987298e-06, + "loss": 0.75394833, + "num_input_tokens_seen": 110593310, + "step": 5134, + "time_per_iteration": 2.4391725063323975 + }, + { + "auxiliary_loss_clip": 0.01053607, + "auxiliary_loss_mlp": 0.01006959, + "balance_loss_clip": 1.01878321, + "balance_loss_mlp": 1.00554073, + "epoch": 0.6174472434317321, + "flos": 61833796122240.0, + "grad_norm": 0.8224879730784661, + "language_loss": 0.60894728, + "learning_rate": 1.348429972610166e-06, + "loss": 0.62955296, + "num_input_tokens_seen": 110657615, + "step": 5135, + "time_per_iteration": 3.8751726150512695 + }, + { + "auxiliary_loss_clip": 0.01029565, + "auxiliary_loss_mlp": 0.01003446, + "balance_loss_clip": 1.02107882, + "balance_loss_mlp": 1.00209868, + "epoch": 0.6175674863223712, + "flos": 71230970494080.0, + "grad_norm": 0.8512068328539234, + "language_loss": 0.5785358, + "learning_rate": 1.3476935480756897e-06, + "loss": 0.59886593, + "num_input_tokens_seen": 110714365, + "step": 5136, + "time_per_iteration": 4.518130302429199 + }, + { + "auxiliary_loss_clip": 0.01122559, + "auxiliary_loss_mlp": 0.01028338, + "balance_loss_clip": 1.04321325, + "balance_loss_mlp": 1.02054214, + "epoch": 0.6176877292130103, + "flos": 21835447770240.0, + "grad_norm": 2.1772773467052335, + "language_loss": 0.75235355, + "learning_rate": 1.346957222495583e-06, + "loss": 0.77386248, + "num_input_tokens_seen": 110732160, + "step": 5137, + "time_per_iteration": 2.638195753097534 + }, + { + "auxiliary_loss_clip": 0.01152502, + "auxiliary_loss_mlp": 0.00762161, + "balance_loss_clip": 1.04926801, + "balance_loss_mlp": 1.0010711, + "epoch": 0.6178079721036493, + "flos": 17741738638080.0, + "grad_norm": 2.3623896083624656, + "language_loss": 0.70752883, + "learning_rate": 1.3462209959815466e-06, + "loss": 0.72667545, + "num_input_tokens_seen": 110746900, + "step": 5138, + "time_per_iteration": 2.5501160621643066 + }, + { + "auxiliary_loss_clip": 0.0114949, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.04833078, + "balance_loss_mlp": 1.01929307, + "epoch": 0.6179282149942885, + "flos": 22633131052800.0, + "grad_norm": 2.275662479101894, + "language_loss": 0.7425462, + "learning_rate": 1.345484868645265e-06, + "loss": 0.76430774, + "num_input_tokens_seen": 110765710, + "step": 5139, + "time_per_iteration": 3.2801427841186523 + }, + { + "auxiliary_loss_clip": 0.01137952, + "auxiliary_loss_mlp": 0.01026571, + "balance_loss_clip": 1.04489565, + "balance_loss_mlp": 1.01850617, + "epoch": 0.6180484578849276, + "flos": 22310330503680.0, + "grad_norm": 1.8936621347926708, + "language_loss": 0.7862764, + "learning_rate": 1.3447488405984088e-06, + "loss": 0.80792165, + "num_input_tokens_seen": 110783970, + "step": 5140, + "time_per_iteration": 2.5696122646331787 + }, + { + "auxiliary_loss_clip": 0.01144825, + "auxiliary_loss_mlp": 0.01025241, + "balance_loss_clip": 1.04700136, + "balance_loss_mlp": 1.01776683, + "epoch": 0.6181687007755666, + "flos": 35225458905600.0, + "grad_norm": 2.464154437436713, + "language_loss": 0.70017338, + "learning_rate": 1.3440129119526322e-06, + "loss": 0.72187412, + "num_input_tokens_seen": 110806395, + "step": 5141, + "time_per_iteration": 2.657010078430176 + }, + { + "auxiliary_loss_clip": 0.01070557, + "auxiliary_loss_mlp": 0.01001835, + "balance_loss_clip": 1.01107216, + "balance_loss_mlp": 1.0007267, + "epoch": 0.6182889436662057, + "flos": 61547370094080.0, + "grad_norm": 0.8040977944210721, + "language_loss": 0.51220036, + "learning_rate": 1.3432770828195762e-06, + "loss": 0.53292429, + "num_input_tokens_seen": 110867380, + "step": 5142, + "time_per_iteration": 3.187882900238037 + }, + { + "auxiliary_loss_clip": 0.01120777, + "auxiliary_loss_mlp": 0.01021414, + "balance_loss_clip": 1.0414598, + "balance_loss_mlp": 1.01325965, + "epoch": 0.6184091865568448, + "flos": 19609991804160.0, + "grad_norm": 2.4413085696691144, + "language_loss": 0.70113772, + "learning_rate": 1.3425413533108635e-06, + "loss": 0.72255969, + "num_input_tokens_seen": 110885980, + "step": 5143, + "time_per_iteration": 2.5371270179748535 + }, + { + "auxiliary_loss_clip": 0.0111859, + "auxiliary_loss_mlp": 0.01023703, + "balance_loss_clip": 1.04640603, + "balance_loss_mlp": 1.01660383, + "epoch": 0.6185294294474839, + "flos": 23586882929280.0, + "grad_norm": 2.3345796296437564, + "language_loss": 0.70963466, + "learning_rate": 1.341805723538105e-06, + "loss": 0.73105752, + "num_input_tokens_seen": 110906085, + "step": 5144, + "time_per_iteration": 2.584836483001709 + }, + { + "auxiliary_loss_clip": 0.01153156, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.04889417, + "balance_loss_mlp": 1.02187634, + "epoch": 0.618649672338123, + "flos": 26762032535040.0, + "grad_norm": 1.5687018813944529, + "language_loss": 0.77499807, + "learning_rate": 1.3410701936128948e-06, + "loss": 0.79682326, + "num_input_tokens_seen": 110928865, + "step": 5145, + "time_per_iteration": 2.5616390705108643 + }, + { + "auxiliary_loss_clip": 0.01162891, + "auxiliary_loss_mlp": 0.01023668, + "balance_loss_clip": 1.05251884, + "balance_loss_mlp": 1.01644433, + "epoch": 0.6187699152287621, + "flos": 14456630522880.0, + "grad_norm": 2.68399403146229, + "language_loss": 0.85385764, + "learning_rate": 1.340334763646812e-06, + "loss": 0.87572318, + "num_input_tokens_seen": 110943000, + "step": 5146, + "time_per_iteration": 2.4834671020507812 + }, + { + "auxiliary_loss_clip": 0.01175688, + "auxiliary_loss_mlp": 0.01028196, + "balance_loss_clip": 1.04972434, + "balance_loss_mlp": 1.0197084, + "epoch": 0.6188901581194012, + "flos": 20084766796800.0, + "grad_norm": 1.9333035729169485, + "language_loss": 0.74045569, + "learning_rate": 1.3395994337514218e-06, + "loss": 0.76249456, + "num_input_tokens_seen": 110963170, + "step": 5147, + "time_per_iteration": 2.4363579750061035 + }, + { + "auxiliary_loss_clip": 0.01152595, + "auxiliary_loss_mlp": 0.01027783, + "balance_loss_clip": 1.04537332, + "balance_loss_mlp": 1.02055264, + "epoch": 0.6190104010100402, + "flos": 25700728360320.0, + "grad_norm": 1.6024589311036692, + "language_loss": 0.78608036, + "learning_rate": 1.3388642040382725e-06, + "loss": 0.8078841, + "num_input_tokens_seen": 110983595, + "step": 5148, + "time_per_iteration": 2.505571126937866 + }, + { + "auxiliary_loss_clip": 0.01131475, + "auxiliary_loss_mlp": 0.01022129, + "balance_loss_clip": 1.04032874, + "balance_loss_mlp": 1.01451731, + "epoch": 0.6191306439006794, + "flos": 30442372974720.0, + "grad_norm": 2.120590580365777, + "language_loss": 0.83949876, + "learning_rate": 1.3381290746188975e-06, + "loss": 0.86103475, + "num_input_tokens_seen": 111002965, + "step": 5149, + "time_per_iteration": 2.6071693897247314 + }, + { + "auxiliary_loss_clip": 0.01162739, + "auxiliary_loss_mlp": 0.01029414, + "balance_loss_clip": 1.05149567, + "balance_loss_mlp": 1.0215615, + "epoch": 0.6192508867913185, + "flos": 26685793918080.0, + "grad_norm": 1.7003303595989727, + "language_loss": 0.67512602, + "learning_rate": 1.3373940456048152e-06, + "loss": 0.69704759, + "num_input_tokens_seen": 111022990, + "step": 5150, + "time_per_iteration": 2.522519111633301 + }, + { + "auxiliary_loss_clip": 0.01174555, + "auxiliary_loss_mlp": 0.01023117, + "balance_loss_clip": 1.05199277, + "balance_loss_mlp": 1.01605368, + "epoch": 0.6193711296819575, + "flos": 36722036090880.0, + "grad_norm": 1.8873376450762336, + "language_loss": 0.59361368, + "learning_rate": 1.3366591171075299e-06, + "loss": 0.61559039, + "num_input_tokens_seen": 111046495, + "step": 5151, + "time_per_iteration": 2.5670816898345947 + }, + { + "auxiliary_loss_clip": 0.01145016, + "auxiliary_loss_mlp": 0.01024454, + "balance_loss_clip": 1.04702652, + "balance_loss_mlp": 1.0173198, + "epoch": 0.6194913725725967, + "flos": 25192556697600.0, + "grad_norm": 2.4301045000375914, + "language_loss": 0.90868622, + "learning_rate": 1.335924289238529e-06, + "loss": 0.93038094, + "num_input_tokens_seen": 111065705, + "step": 5152, + "time_per_iteration": 2.534996747970581 + }, + { + "auxiliary_loss_clip": 0.01159928, + "auxiliary_loss_mlp": 0.00762269, + "balance_loss_clip": 1.05174971, + "balance_loss_mlp": 1.00114846, + "epoch": 0.6196116154632357, + "flos": 21178821196800.0, + "grad_norm": 1.6573313938494292, + "language_loss": 0.76696622, + "learning_rate": 1.3351895621092859e-06, + "loss": 0.78618819, + "num_input_tokens_seen": 111086050, + "step": 5153, + "time_per_iteration": 2.4716134071350098 + }, + { + "auxiliary_loss_clip": 0.01059492, + "auxiliary_loss_mlp": 0.01028931, + "balance_loss_clip": 1.0311451, + "balance_loss_mlp": 1.02157569, + "epoch": 0.6197318583538748, + "flos": 16253744803200.0, + "grad_norm": 1.9365602551109973, + "language_loss": 0.76270759, + "learning_rate": 1.3344549358312567e-06, + "loss": 0.78359181, + "num_input_tokens_seen": 111104450, + "step": 5154, + "time_per_iteration": 2.9061784744262695 + }, + { + "auxiliary_loss_clip": 0.01165887, + "auxiliary_loss_mlp": 0.01024581, + "balance_loss_clip": 1.05149198, + "balance_loss_mlp": 1.01683187, + "epoch": 0.619852101244514, + "flos": 24425612478720.0, + "grad_norm": 2.1925532273544683, + "language_loss": 0.77916992, + "learning_rate": 1.3337204105158852e-06, + "loss": 0.80107462, + "num_input_tokens_seen": 111123320, + "step": 5155, + "time_per_iteration": 2.6941792964935303 + }, + { + "auxiliary_loss_clip": 0.01115428, + "auxiliary_loss_mlp": 0.01026964, + "balance_loss_clip": 1.03664279, + "balance_loss_mlp": 1.01916122, + "epoch": 0.619972344135153, + "flos": 16727298733440.0, + "grad_norm": 4.647783941676405, + "language_loss": 0.73057663, + "learning_rate": 1.332985986274597e-06, + "loss": 0.75200051, + "num_input_tokens_seen": 111140950, + "step": 5156, + "time_per_iteration": 2.5094850063323975 + }, + { + "auxiliary_loss_clip": 0.01096496, + "auxiliary_loss_mlp": 0.00761832, + "balance_loss_clip": 1.04552174, + "balance_loss_mlp": 1.00114286, + "epoch": 0.6200925870257921, + "flos": 12495190498560.0, + "grad_norm": 2.2782474785676037, + "language_loss": 0.75544965, + "learning_rate": 1.3322516632188047e-06, + "loss": 0.77403295, + "num_input_tokens_seen": 111157845, + "step": 5157, + "time_per_iteration": 2.586402654647827 + }, + { + "auxiliary_loss_clip": 0.01128424, + "auxiliary_loss_mlp": 0.01022657, + "balance_loss_clip": 1.04443479, + "balance_loss_mlp": 1.01495028, + "epoch": 0.6202128299164312, + "flos": 26539350168960.0, + "grad_norm": 3.0587691930165612, + "language_loss": 0.66654587, + "learning_rate": 1.3315174414599045e-06, + "loss": 0.68805671, + "num_input_tokens_seen": 111179165, + "step": 5158, + "time_per_iteration": 2.593744993209839 + }, + { + "auxiliary_loss_clip": 0.01155815, + "auxiliary_loss_mlp": 0.01026703, + "balance_loss_clip": 1.04619741, + "balance_loss_mlp": 1.01881766, + "epoch": 0.6203330728070703, + "flos": 18770508069120.0, + "grad_norm": 3.4862779979897134, + "language_loss": 0.75401807, + "learning_rate": 1.3307833211092768e-06, + "loss": 0.77584326, + "num_input_tokens_seen": 111197830, + "step": 5159, + "time_per_iteration": 2.4566805362701416 + }, + { + "auxiliary_loss_clip": 0.01175205, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.05179071, + "balance_loss_mlp": 1.02334952, + "epoch": 0.6204533156977093, + "flos": 20629782835200.0, + "grad_norm": 1.5968458518524469, + "language_loss": 0.75442815, + "learning_rate": 1.3300493022782873e-06, + "loss": 0.77649009, + "num_input_tokens_seen": 111218400, + "step": 5160, + "time_per_iteration": 2.455104112625122 + }, + { + "auxiliary_loss_clip": 0.0110726, + "auxiliary_loss_mlp": 0.00762699, + "balance_loss_clip": 1.04261184, + "balance_loss_mlp": 1.00110292, + "epoch": 0.6205735585883485, + "flos": 17348050598400.0, + "grad_norm": 2.103079053215316, + "language_loss": 0.72135925, + "learning_rate": 1.3293153850782855e-06, + "loss": 0.74005884, + "num_input_tokens_seen": 111236720, + "step": 5161, + "time_per_iteration": 3.9263412952423096 + }, + { + "auxiliary_loss_clip": 0.01121962, + "auxiliary_loss_mlp": 0.0102695, + "balance_loss_clip": 1.04392374, + "balance_loss_mlp": 1.01792622, + "epoch": 0.6206938014789876, + "flos": 22965017742720.0, + "grad_norm": 2.2931763689600575, + "language_loss": 0.71169907, + "learning_rate": 1.3285815696206069e-06, + "loss": 0.73318821, + "num_input_tokens_seen": 111258265, + "step": 5162, + "time_per_iteration": 2.6379740238189697 + }, + { + "auxiliary_loss_clip": 0.0113161, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.04308486, + "balance_loss_mlp": 1.0214082, + "epoch": 0.6208140443696266, + "flos": 23983192661760.0, + "grad_norm": 1.8116094678173282, + "language_loss": 0.77229488, + "learning_rate": 1.32784785601657e-06, + "loss": 0.79390317, + "num_input_tokens_seen": 111277675, + "step": 5163, + "time_per_iteration": 4.127584934234619 + }, + { + "auxiliary_loss_clip": 0.01147558, + "auxiliary_loss_mlp": 0.01022162, + "balance_loss_clip": 1.04519987, + "balance_loss_mlp": 1.01485205, + "epoch": 0.6209342872602658, + "flos": 35077291303680.0, + "grad_norm": 2.2194237477894636, + "language_loss": 0.73925924, + "learning_rate": 1.3271142443774798e-06, + "loss": 0.76095641, + "num_input_tokens_seen": 111299910, + "step": 5164, + "time_per_iteration": 2.6179938316345215 + }, + { + "auxiliary_loss_clip": 0.0114355, + "auxiliary_loss_mlp": 0.01022417, + "balance_loss_clip": 1.04831338, + "balance_loss_mlp": 1.01492822, + "epoch": 0.6210545301509048, + "flos": 26979327861120.0, + "grad_norm": 1.8481748331209533, + "language_loss": 0.81155956, + "learning_rate": 1.3263807348146228e-06, + "loss": 0.83321917, + "num_input_tokens_seen": 111319765, + "step": 5165, + "time_per_iteration": 2.555447578430176 + }, + { + "auxiliary_loss_clip": 0.01142785, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.04339242, + "balance_loss_mlp": 1.02595878, + "epoch": 0.6211747730415439, + "flos": 33618240852480.0, + "grad_norm": 1.9671271466246987, + "language_loss": 0.73313224, + "learning_rate": 1.3256473274392733e-06, + "loss": 0.75490189, + "num_input_tokens_seen": 111341110, + "step": 5166, + "time_per_iteration": 3.4217660427093506 + }, + { + "auxiliary_loss_clip": 0.01173559, + "auxiliary_loss_mlp": 0.01028445, + "balance_loss_clip": 1.04973316, + "balance_loss_mlp": 1.02090502, + "epoch": 0.6212950159321831, + "flos": 34167099646080.0, + "grad_norm": 1.675793478021476, + "language_loss": 0.69939232, + "learning_rate": 1.3249140223626873e-06, + "loss": 0.72141236, + "num_input_tokens_seen": 111362730, + "step": 5167, + "time_per_iteration": 2.541304588317871 + }, + { + "auxiliary_loss_clip": 0.0115825, + "auxiliary_loss_mlp": 0.01021138, + "balance_loss_clip": 1.04894185, + "balance_loss_mlp": 1.01374686, + "epoch": 0.6214152588228221, + "flos": 27965758135680.0, + "grad_norm": 2.1641178051614083, + "language_loss": 0.75253999, + "learning_rate": 1.3241808196961077e-06, + "loss": 0.77433389, + "num_input_tokens_seen": 111383855, + "step": 5168, + "time_per_iteration": 2.5407180786132812 + }, + { + "auxiliary_loss_clip": 0.01132375, + "auxiliary_loss_mlp": 0.01024554, + "balance_loss_clip": 1.04371834, + "balance_loss_mlp": 1.01701355, + "epoch": 0.6215355017134612, + "flos": 20230204965120.0, + "grad_norm": 1.7220115524134032, + "language_loss": 0.70674074, + "learning_rate": 1.3234477195507608e-06, + "loss": 0.72831005, + "num_input_tokens_seen": 111402685, + "step": 5169, + "time_per_iteration": 2.5497772693634033 + }, + { + "auxiliary_loss_clip": 0.01131752, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.04600322, + "balance_loss_mlp": 1.02010155, + "epoch": 0.6216557446041003, + "flos": 41428129219200.0, + "grad_norm": 2.5873577957245035, + "language_loss": 0.62600338, + "learning_rate": 1.322714722037857e-06, + "loss": 0.64759207, + "num_input_tokens_seen": 111424130, + "step": 5170, + "time_per_iteration": 2.751080274581909 + }, + { + "auxiliary_loss_clip": 0.01140804, + "auxiliary_loss_mlp": 0.01028288, + "balance_loss_clip": 1.04727161, + "balance_loss_mlp": 1.02071834, + "epoch": 0.6217759874947394, + "flos": 27928770105600.0, + "grad_norm": 2.9185502939470505, + "language_loss": 0.77488983, + "learning_rate": 1.321981827268591e-06, + "loss": 0.79658067, + "num_input_tokens_seen": 111444785, + "step": 5171, + "time_per_iteration": 2.6405937671661377 + }, + { + "auxiliary_loss_clip": 0.01148087, + "auxiliary_loss_mlp": 0.01026158, + "balance_loss_clip": 1.04696107, + "balance_loss_mlp": 1.01865399, + "epoch": 0.6218962303853784, + "flos": 21765673601280.0, + "grad_norm": 1.7055591869850886, + "language_loss": 0.81724036, + "learning_rate": 1.3212490353541426e-06, + "loss": 0.83898282, + "num_input_tokens_seen": 111467045, + "step": 5172, + "time_per_iteration": 2.6360280513763428 + }, + { + "auxiliary_loss_clip": 0.01174144, + "auxiliary_loss_mlp": 0.01025209, + "balance_loss_clip": 1.04861367, + "balance_loss_mlp": 1.01731753, + "epoch": 0.6220164732760175, + "flos": 21246260981760.0, + "grad_norm": 1.8877148003885278, + "language_loss": 0.80151141, + "learning_rate": 1.3205163464056762e-06, + "loss": 0.82350504, + "num_input_tokens_seen": 111483650, + "step": 5173, + "time_per_iteration": 2.4705910682678223 + }, + { + "auxiliary_loss_clip": 0.01158402, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.04790473, + "balance_loss_mlp": 1.02156806, + "epoch": 0.6221367161666567, + "flos": 26136360506880.0, + "grad_norm": 1.865239061774146, + "language_loss": 0.72882378, + "learning_rate": 1.319783760534339e-06, + "loss": 0.75069809, + "num_input_tokens_seen": 111502895, + "step": 5174, + "time_per_iteration": 2.574486255645752 + }, + { + "auxiliary_loss_clip": 0.01160611, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.05051351, + "balance_loss_mlp": 1.02165031, + "epoch": 0.6222569590572957, + "flos": 16284196558080.0, + "grad_norm": 2.4924997087597385, + "language_loss": 0.75840986, + "learning_rate": 1.319051277851266e-06, + "loss": 0.78031504, + "num_input_tokens_seen": 111519180, + "step": 5175, + "time_per_iteration": 2.454510450363159 + }, + { + "auxiliary_loss_clip": 0.01162019, + "auxiliary_loss_mlp": 0.01025693, + "balance_loss_clip": 1.04844892, + "balance_loss_mlp": 1.01816535, + "epoch": 0.6223772019479348, + "flos": 18223840005120.0, + "grad_norm": 2.1942991965476044, + "language_loss": 0.83760411, + "learning_rate": 1.3183188984675716e-06, + "loss": 0.85948122, + "num_input_tokens_seen": 111537545, + "step": 5176, + "time_per_iteration": 2.4428935050964355 + }, + { + "auxiliary_loss_clip": 0.01146059, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.04925692, + "balance_loss_mlp": 1.0238719, + "epoch": 0.6224974448385739, + "flos": 27489797994240.0, + "grad_norm": 10.287912751017865, + "language_loss": 0.71032321, + "learning_rate": 1.3175866224943586e-06, + "loss": 0.73209572, + "num_input_tokens_seen": 111556265, + "step": 5177, + "time_per_iteration": 2.5382964611053467 + }, + { + "auxiliary_loss_clip": 0.01150574, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.04912925, + "balance_loss_mlp": 1.021106, + "epoch": 0.622617687729213, + "flos": 19791951125760.0, + "grad_norm": 2.202239911528675, + "language_loss": 0.7340591, + "learning_rate": 1.316854450042712e-06, + "loss": 0.75585544, + "num_input_tokens_seen": 111574205, + "step": 5178, + "time_per_iteration": 2.4864184856414795 + }, + { + "auxiliary_loss_clip": 0.01165482, + "auxiliary_loss_mlp": 0.01022952, + "balance_loss_clip": 1.05000114, + "balance_loss_mlp": 1.01516128, + "epoch": 0.622737930619852, + "flos": 23038886062080.0, + "grad_norm": 1.8754772073970003, + "language_loss": 0.7418319, + "learning_rate": 1.3161223812237024e-06, + "loss": 0.76371622, + "num_input_tokens_seen": 111593560, + "step": 5179, + "time_per_iteration": 2.5113701820373535 + }, + { + "auxiliary_loss_clip": 0.01172312, + "auxiliary_loss_mlp": 0.01027641, + "balance_loss_clip": 1.04750204, + "balance_loss_mlp": 1.01995468, + "epoch": 0.6228581735104912, + "flos": 12634271959680.0, + "grad_norm": 2.833343260352334, + "language_loss": 0.85090661, + "learning_rate": 1.3153904161483842e-06, + "loss": 0.87290609, + "num_input_tokens_seen": 111608860, + "step": 5180, + "time_per_iteration": 2.3850820064544678 + }, + { + "auxiliary_loss_clip": 0.01128149, + "auxiliary_loss_mlp": 0.01024862, + "balance_loss_clip": 1.04388022, + "balance_loss_mlp": 1.01662445, + "epoch": 0.6229784164011303, + "flos": 23802813538560.0, + "grad_norm": 3.979298948557219, + "language_loss": 0.85619509, + "learning_rate": 1.3146585549277953e-06, + "loss": 0.87772524, + "num_input_tokens_seen": 111627500, + "step": 5181, + "time_per_iteration": 2.545823574066162 + }, + { + "auxiliary_loss_clip": 0.01157525, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.05047035, + "balance_loss_mlp": 1.02530575, + "epoch": 0.6230986592917693, + "flos": 22414219614720.0, + "grad_norm": 3.3334693312208934, + "language_loss": 0.78786778, + "learning_rate": 1.3139267976729591e-06, + "loss": 0.80977285, + "num_input_tokens_seen": 111647690, + "step": 5182, + "time_per_iteration": 2.5082826614379883 + }, + { + "auxiliary_loss_clip": 0.01162974, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.05018318, + "balance_loss_mlp": 1.01799345, + "epoch": 0.6232189021824085, + "flos": 34528217028480.0, + "grad_norm": 1.841934898546098, + "language_loss": 0.72152853, + "learning_rate": 1.3131951444948815e-06, + "loss": 0.74341822, + "num_input_tokens_seen": 111667090, + "step": 5183, + "time_per_iteration": 2.568795919418335 + }, + { + "auxiliary_loss_clip": 0.011484, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.04870439, + "balance_loss_mlp": 1.02103758, + "epoch": 0.6233391450730476, + "flos": 22237000888320.0, + "grad_norm": 1.876432437600538, + "language_loss": 0.76547825, + "learning_rate": 1.3124635955045546e-06, + "loss": 0.78724802, + "num_input_tokens_seen": 111686905, + "step": 5184, + "time_per_iteration": 2.5141241550445557 + }, + { + "auxiliary_loss_clip": 0.01107953, + "auxiliary_loss_mlp": 0.00762432, + "balance_loss_clip": 1.04125237, + "balance_loss_mlp": 1.0010463, + "epoch": 0.6234593879636866, + "flos": 20332693445760.0, + "grad_norm": 1.946063964022245, + "language_loss": 0.84350699, + "learning_rate": 1.3117321508129537e-06, + "loss": 0.86221087, + "num_input_tokens_seen": 111704985, + "step": 5185, + "time_per_iteration": 2.5742194652557373 + }, + { + "auxiliary_loss_clip": 0.01148006, + "auxiliary_loss_mlp": 0.01023711, + "balance_loss_clip": 1.04755592, + "balance_loss_mlp": 1.01651716, + "epoch": 0.6235796308543258, + "flos": 20664903358080.0, + "grad_norm": 1.5271372824248046, + "language_loss": 0.75998068, + "learning_rate": 1.3110008105310388e-06, + "loss": 0.78169787, + "num_input_tokens_seen": 111724805, + "step": 5186, + "time_per_iteration": 2.573991298675537 + }, + { + "auxiliary_loss_clip": 0.01174558, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.04829597, + "balance_loss_mlp": 1.02514613, + "epoch": 0.6236998737449648, + "flos": 26618641441920.0, + "grad_norm": 1.9286435056964681, + "language_loss": 0.77654666, + "learning_rate": 1.3102695747697526e-06, + "loss": 0.79862332, + "num_input_tokens_seen": 111747675, + "step": 5187, + "time_per_iteration": 2.508856773376465 + }, + { + "auxiliary_loss_clip": 0.01103391, + "auxiliary_loss_mlp": 0.01026747, + "balance_loss_clip": 1.04506671, + "balance_loss_mlp": 1.0187422, + "epoch": 0.6238201166356039, + "flos": 12674599954560.0, + "grad_norm": 2.51488359243333, + "language_loss": 0.90423149, + "learning_rate": 1.3095384436400237e-06, + "loss": 0.92553288, + "num_input_tokens_seen": 111759205, + "step": 5188, + "time_per_iteration": 3.3414721488952637 + }, + { + "auxiliary_loss_clip": 0.01152102, + "auxiliary_loss_mlp": 0.01025538, + "balance_loss_clip": 1.04654706, + "balance_loss_mlp": 1.01787925, + "epoch": 0.623940359526243, + "flos": 10452160730880.0, + "grad_norm": 2.0325229602182495, + "language_loss": 0.82225668, + "learning_rate": 1.3088074172527633e-06, + "loss": 0.84403312, + "num_input_tokens_seen": 111776335, + "step": 5189, + "time_per_iteration": 3.208725690841675 + }, + { + "auxiliary_loss_clip": 0.01148539, + "auxiliary_loss_mlp": 0.01022223, + "balance_loss_clip": 1.04561067, + "balance_loss_mlp": 1.01411128, + "epoch": 0.6240606024168821, + "flos": 29059525226880.0, + "grad_norm": 2.0921283982110572, + "language_loss": 0.71176094, + "learning_rate": 1.3080764957188684e-06, + "loss": 0.73346853, + "num_input_tokens_seen": 111796580, + "step": 5190, + "time_per_iteration": 3.2725937366485596 + }, + { + "auxiliary_loss_clip": 0.01120329, + "auxiliary_loss_mlp": 0.01024992, + "balance_loss_clip": 1.04298866, + "balance_loss_mlp": 1.01731455, + "epoch": 0.6241808453075212, + "flos": 22018089450240.0, + "grad_norm": 1.9021140557197527, + "language_loss": 0.70701468, + "learning_rate": 1.3073456791492192e-06, + "loss": 0.72846788, + "num_input_tokens_seen": 111816290, + "step": 5191, + "time_per_iteration": 2.6114935874938965 + }, + { + "auxiliary_loss_clip": 0.01146153, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.04426122, + "balance_loss_mlp": 1.01939011, + "epoch": 0.6243010881981603, + "flos": 21138708683520.0, + "grad_norm": 1.7210631009789372, + "language_loss": 0.78253818, + "learning_rate": 1.3066149676546801e-06, + "loss": 0.80426854, + "num_input_tokens_seen": 111834470, + "step": 5192, + "time_per_iteration": 2.5044190883636475 + }, + { + "auxiliary_loss_clip": 0.01144286, + "auxiliary_loss_mlp": 0.01024766, + "balance_loss_clip": 1.0495863, + "balance_loss_mlp": 1.01712728, + "epoch": 0.6244213310887994, + "flos": 22344948236160.0, + "grad_norm": 2.2364316018191683, + "language_loss": 0.65981078, + "learning_rate": 1.3058843613460985e-06, + "loss": 0.68150133, + "num_input_tokens_seen": 111852410, + "step": 5193, + "time_per_iteration": 3.2724435329437256 + }, + { + "auxiliary_loss_clip": 0.01139623, + "auxiliary_loss_mlp": 0.01026084, + "balance_loss_clip": 1.04615855, + "balance_loss_mlp": 1.01851404, + "epoch": 0.6245415739794384, + "flos": 15231978524160.0, + "grad_norm": 1.884291820481267, + "language_loss": 0.74272293, + "learning_rate": 1.3051538603343075e-06, + "loss": 0.76438004, + "num_input_tokens_seen": 111870340, + "step": 5194, + "time_per_iteration": 2.499701976776123 + }, + { + "auxiliary_loss_clip": 0.01160202, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.04968572, + "balance_loss_mlp": 1.02282023, + "epoch": 0.6246618168700776, + "flos": 18879891960960.0, + "grad_norm": 1.8837551290567685, + "language_loss": 0.67375737, + "learning_rate": 1.3044234647301235e-06, + "loss": 0.69566131, + "num_input_tokens_seen": 111888365, + "step": 5195, + "time_per_iteration": 2.461310386657715 + }, + { + "auxiliary_loss_clip": 0.01154153, + "auxiliary_loss_mlp": 0.01023029, + "balance_loss_clip": 1.04745555, + "balance_loss_mlp": 1.01628184, + "epoch": 0.6247820597607167, + "flos": 14319201087360.0, + "grad_norm": 1.9376295424998855, + "language_loss": 0.72505605, + "learning_rate": 1.303693174644347e-06, + "loss": 0.7468279, + "num_input_tokens_seen": 111905840, + "step": 5196, + "time_per_iteration": 2.4252822399139404 + }, + { + "auxiliary_loss_clip": 0.01141467, + "auxiliary_loss_mlp": 0.01025959, + "balance_loss_clip": 1.04491019, + "balance_loss_mlp": 1.01760864, + "epoch": 0.6249023026513557, + "flos": 22637979388800.0, + "grad_norm": 4.240194200103494, + "language_loss": 0.80796462, + "learning_rate": 1.3029629901877625e-06, + "loss": 0.8296389, + "num_input_tokens_seen": 111925215, + "step": 5197, + "time_per_iteration": 2.496213674545288 + }, + { + "auxiliary_loss_clip": 0.01168652, + "auxiliary_loss_mlp": 0.01025842, + "balance_loss_clip": 1.05170655, + "balance_loss_mlp": 1.01765275, + "epoch": 0.6250225455419949, + "flos": 20266690204800.0, + "grad_norm": 2.621712917487729, + "language_loss": 0.7732929, + "learning_rate": 1.3022329114711376e-06, + "loss": 0.79523778, + "num_input_tokens_seen": 111943925, + "step": 5198, + "time_per_iteration": 2.447084426879883 + }, + { + "auxiliary_loss_clip": 0.01143359, + "auxiliary_loss_mlp": 0.01023661, + "balance_loss_clip": 1.04730535, + "balance_loss_mlp": 1.01596558, + "epoch": 0.6251427884326339, + "flos": 23437853400960.0, + "grad_norm": 3.2174350560590304, + "language_loss": 0.69434011, + "learning_rate": 1.3015029386052256e-06, + "loss": 0.71601033, + "num_input_tokens_seen": 111964095, + "step": 5199, + "time_per_iteration": 2.5152454376220703 + }, + { + "auxiliary_loss_clip": 0.01142328, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.04752696, + "balance_loss_mlp": 1.02225149, + "epoch": 0.625263031323273, + "flos": 31723055464320.0, + "grad_norm": 1.8637646802632368, + "language_loss": 0.73114705, + "learning_rate": 1.3007730717007622e-06, + "loss": 0.75286913, + "num_input_tokens_seen": 111984910, + "step": 5200, + "time_per_iteration": 2.612367630004883 + }, + { + "auxiliary_loss_clip": 0.01176361, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.05071425, + "balance_loss_mlp": 1.01945198, + "epoch": 0.6253832742139122, + "flos": 24134341092480.0, + "grad_norm": 2.589378152398318, + "language_loss": 0.75416923, + "learning_rate": 1.3000433108684676e-06, + "loss": 0.77620852, + "num_input_tokens_seen": 112005410, + "step": 5201, + "time_per_iteration": 2.4477224349975586 + }, + { + "auxiliary_loss_clip": 0.01155265, + "auxiliary_loss_mlp": 0.01023534, + "balance_loss_clip": 1.04812241, + "balance_loss_mlp": 1.01582074, + "epoch": 0.6255035171045512, + "flos": 27668812400640.0, + "grad_norm": 2.3235289378387547, + "language_loss": 0.80028737, + "learning_rate": 1.2993136562190467e-06, + "loss": 0.82207537, + "num_input_tokens_seen": 112024530, + "step": 5202, + "time_per_iteration": 2.5034408569335938 + }, + { + "auxiliary_loss_clip": 0.01148771, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.04661131, + "balance_loss_mlp": 1.02003384, + "epoch": 0.6256237599951903, + "flos": 20227798753920.0, + "grad_norm": 1.5503060771469095, + "language_loss": 0.70242417, + "learning_rate": 1.2985841078631871e-06, + "loss": 0.7241925, + "num_input_tokens_seen": 112043850, + "step": 5203, + "time_per_iteration": 2.4798457622528076 + }, + { + "auxiliary_loss_clip": 0.01100003, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.0386163, + "balance_loss_mlp": 1.02219498, + "epoch": 0.6257440028858293, + "flos": 24170574936960.0, + "grad_norm": 1.6627471320062168, + "language_loss": 0.78007549, + "learning_rate": 1.2978546659115608e-06, + "loss": 0.8013798, + "num_input_tokens_seen": 112061930, + "step": 5204, + "time_per_iteration": 2.795313835144043 + }, + { + "auxiliary_loss_clip": 0.01148346, + "auxiliary_loss_mlp": 0.01028849, + "balance_loss_clip": 1.04757404, + "balance_loss_mlp": 1.0210619, + "epoch": 0.6258642457764685, + "flos": 15851940289920.0, + "grad_norm": 2.4052489237778607, + "language_loss": 0.85242677, + "learning_rate": 1.2971253304748228e-06, + "loss": 0.87419868, + "num_input_tokens_seen": 112079645, + "step": 5205, + "time_per_iteration": 2.6498513221740723 + }, + { + "auxiliary_loss_clip": 0.01166096, + "auxiliary_loss_mlp": 0.01027684, + "balance_loss_clip": 1.05128169, + "balance_loss_mlp": 1.01964915, + "epoch": 0.6259844886671075, + "flos": 11911354836480.0, + "grad_norm": 1.5161008494593962, + "language_loss": 0.74682093, + "learning_rate": 1.296396101663614e-06, + "loss": 0.76875871, + "num_input_tokens_seen": 112096205, + "step": 5206, + "time_per_iteration": 2.4371864795684814 + }, + { + "auxiliary_loss_clip": 0.01163164, + "auxiliary_loss_mlp": 0.01027209, + "balance_loss_clip": 1.04921317, + "balance_loss_mlp": 1.01952624, + "epoch": 0.6261047315577466, + "flos": 15887958652800.0, + "grad_norm": 3.4509044909523774, + "language_loss": 0.84275746, + "learning_rate": 1.2956669795885565e-06, + "loss": 0.86466122, + "num_input_tokens_seen": 112112835, + "step": 5207, + "time_per_iteration": 2.4521543979644775 + }, + { + "auxiliary_loss_clip": 0.01126836, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.0459435, + "balance_loss_mlp": 1.02230573, + "epoch": 0.6262249744483858, + "flos": 31248926916480.0, + "grad_norm": 1.721984928067079, + "language_loss": 0.67969608, + "learning_rate": 1.294937964360259e-06, + "loss": 0.70126516, + "num_input_tokens_seen": 112133105, + "step": 5208, + "time_per_iteration": 2.607156276702881 + }, + { + "auxiliary_loss_clip": 0.01152357, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.04674506, + "balance_loss_mlp": 1.02074718, + "epoch": 0.6263452173390248, + "flos": 27198598435200.0, + "grad_norm": 4.14258185732185, + "language_loss": 0.71284425, + "learning_rate": 1.2942090560893108e-06, + "loss": 0.73466641, + "num_input_tokens_seen": 112152510, + "step": 5209, + "time_per_iteration": 2.538771629333496 + }, + { + "auxiliary_loss_clip": 0.01174057, + "auxiliary_loss_mlp": 0.01023501, + "balance_loss_clip": 1.04970622, + "balance_loss_mlp": 1.0163542, + "epoch": 0.6264654602296639, + "flos": 37342069683840.0, + "grad_norm": 1.9113522452782798, + "language_loss": 0.60411942, + "learning_rate": 1.2934802548862882e-06, + "loss": 0.62609506, + "num_input_tokens_seen": 112175295, + "step": 5210, + "time_per_iteration": 2.5602214336395264 + }, + { + "auxiliary_loss_clip": 0.01143038, + "auxiliary_loss_mlp": 0.01026518, + "balance_loss_clip": 1.04552007, + "balance_loss_mlp": 1.01885295, + "epoch": 0.626585703120303, + "flos": 14756952136320.0, + "grad_norm": 1.8817833819819414, + "language_loss": 0.82638073, + "learning_rate": 1.292751560861749e-06, + "loss": 0.84807634, + "num_input_tokens_seen": 112190200, + "step": 5211, + "time_per_iteration": 2.4864447116851807 + }, + { + "auxiliary_loss_clip": 0.01175769, + "auxiliary_loss_mlp": 0.01025897, + "balance_loss_clip": 1.05000961, + "balance_loss_mlp": 1.01770771, + "epoch": 0.6267059460109421, + "flos": 22347318533760.0, + "grad_norm": 1.827834592262429, + "language_loss": 0.79453099, + "learning_rate": 1.2920229741262354e-06, + "loss": 0.81654763, + "num_input_tokens_seen": 112208205, + "step": 5212, + "time_per_iteration": 2.4445769786834717 + }, + { + "auxiliary_loss_clip": 0.01146495, + "auxiliary_loss_mlp": 0.01025404, + "balance_loss_clip": 1.04594135, + "balance_loss_mlp": 1.01801586, + "epoch": 0.6268261889015811, + "flos": 17748813617280.0, + "grad_norm": 2.439107795239129, + "language_loss": 0.75267899, + "learning_rate": 1.2912944947902739e-06, + "loss": 0.77439797, + "num_input_tokens_seen": 112224690, + "step": 5213, + "time_per_iteration": 2.4891347885131836 + }, + { + "auxiliary_loss_clip": 0.01152501, + "auxiliary_loss_mlp": 0.01023483, + "balance_loss_clip": 1.04820085, + "balance_loss_mlp": 1.01515579, + "epoch": 0.6269464317922203, + "flos": 32846484211200.0, + "grad_norm": 2.107389834105623, + "language_loss": 0.71649897, + "learning_rate": 1.2905661229643742e-06, + "loss": 0.73825884, + "num_input_tokens_seen": 112244450, + "step": 5214, + "time_per_iteration": 2.614525556564331 + }, + { + "auxiliary_loss_clip": 0.0117441, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.04854131, + "balance_loss_mlp": 1.02264214, + "epoch": 0.6270666746828594, + "flos": 17929192740480.0, + "grad_norm": 2.8597116334116124, + "language_loss": 0.84112525, + "learning_rate": 1.2898378587590299e-06, + "loss": 0.8631742, + "num_input_tokens_seen": 112261050, + "step": 5215, + "time_per_iteration": 3.34311580657959 + }, + { + "auxiliary_loss_clip": 0.01157619, + "auxiliary_loss_mlp": 0.01026463, + "balance_loss_clip": 1.04947186, + "balance_loss_mlp": 1.01892912, + "epoch": 0.6271869175734984, + "flos": 17457326749440.0, + "grad_norm": 1.8306879987845293, + "language_loss": 0.87559211, + "learning_rate": 1.2891097022847173e-06, + "loss": 0.89743292, + "num_input_tokens_seen": 112278395, + "step": 5216, + "time_per_iteration": 3.942575693130493 + }, + { + "auxiliary_loss_clip": 0.0114794, + "auxiliary_loss_mlp": 0.01027573, + "balance_loss_clip": 1.04758191, + "balance_loss_mlp": 1.01892447, + "epoch": 0.6273071604641376, + "flos": 26868615166080.0, + "grad_norm": 1.7246609630296854, + "language_loss": 0.66532338, + "learning_rate": 1.2883816536518978e-06, + "loss": 0.68707854, + "num_input_tokens_seen": 112299535, + "step": 5217, + "time_per_iteration": 2.542051076889038 + }, + { + "auxiliary_loss_clip": 0.01157451, + "auxiliary_loss_mlp": 0.01026599, + "balance_loss_clip": 1.04643106, + "balance_loss_mlp": 1.01948237, + "epoch": 0.6274274033547766, + "flos": 26062384446720.0, + "grad_norm": 1.8276919354765278, + "language_loss": 0.82189262, + "learning_rate": 1.2876537129710155e-06, + "loss": 0.84373313, + "num_input_tokens_seen": 112317265, + "step": 5218, + "time_per_iteration": 2.500866174697876 + }, + { + "auxiliary_loss_clip": 0.01144029, + "auxiliary_loss_mlp": 0.01026006, + "balance_loss_clip": 1.05049276, + "balance_loss_mlp": 1.01823354, + "epoch": 0.6275476462454157, + "flos": 20266259241600.0, + "grad_norm": 2.1918339004726706, + "language_loss": 0.75263637, + "learning_rate": 1.286925880352499e-06, + "loss": 0.7743367, + "num_input_tokens_seen": 112336125, + "step": 5219, + "time_per_iteration": 2.5179967880249023 + }, + { + "auxiliary_loss_clip": 0.0114335, + "auxiliary_loss_mlp": 0.01019369, + "balance_loss_clip": 1.04625607, + "balance_loss_mlp": 1.01222229, + "epoch": 0.6276678891360549, + "flos": 26320402817280.0, + "grad_norm": 1.7732769905593158, + "language_loss": 0.71097684, + "learning_rate": 1.2861981559067592e-06, + "loss": 0.73260403, + "num_input_tokens_seen": 112356730, + "step": 5220, + "time_per_iteration": 3.328688621520996 + }, + { + "auxiliary_loss_clip": 0.01108439, + "auxiliary_loss_mlp": 0.01024532, + "balance_loss_clip": 1.0419749, + "balance_loss_mlp": 1.01698017, + "epoch": 0.6277881320266939, + "flos": 13912512324480.0, + "grad_norm": 2.1104276420937165, + "language_loss": 0.8034513, + "learning_rate": 1.2854705397441917e-06, + "loss": 0.82478106, + "num_input_tokens_seen": 112372270, + "step": 5221, + "time_per_iteration": 2.5469579696655273 + }, + { + "auxiliary_loss_clip": 0.01124971, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.04161739, + "balance_loss_mlp": 1.0188911, + "epoch": 0.627908374917333, + "flos": 27048922462080.0, + "grad_norm": 2.0196537470385105, + "language_loss": 0.776811, + "learning_rate": 1.2847430319751747e-06, + "loss": 0.79832798, + "num_input_tokens_seen": 112390365, + "step": 5222, + "time_per_iteration": 2.57277250289917 + }, + { + "auxiliary_loss_clip": 0.0115617, + "auxiliary_loss_mlp": 0.01024054, + "balance_loss_clip": 1.04947138, + "balance_loss_mlp": 1.01660943, + "epoch": 0.6280286178079721, + "flos": 23769201386880.0, + "grad_norm": 2.1191891521136053, + "language_loss": 0.67294753, + "learning_rate": 1.2840156327100712e-06, + "loss": 0.69474971, + "num_input_tokens_seen": 112407490, + "step": 5223, + "time_per_iteration": 2.473426580429077 + }, + { + "auxiliary_loss_clip": 0.01172938, + "auxiliary_loss_mlp": 0.01024146, + "balance_loss_clip": 1.04945445, + "balance_loss_mlp": 1.01622438, + "epoch": 0.6281488606986112, + "flos": 26359150613760.0, + "grad_norm": 9.00177061458453, + "language_loss": 0.72718716, + "learning_rate": 1.2832883420592272e-06, + "loss": 0.74915802, + "num_input_tokens_seen": 112426385, + "step": 5224, + "time_per_iteration": 2.4719011783599854 + }, + { + "auxiliary_loss_clip": 0.01142445, + "auxiliary_loss_mlp": 0.01026637, + "balance_loss_clip": 1.04689646, + "balance_loss_mlp": 1.01854897, + "epoch": 0.6282691035892503, + "flos": 36137194848000.0, + "grad_norm": 2.737654983790061, + "language_loss": 0.64470345, + "learning_rate": 1.282561160132972e-06, + "loss": 0.66639423, + "num_input_tokens_seen": 112446905, + "step": 5225, + "time_per_iteration": 2.61255145072937 + }, + { + "auxiliary_loss_clip": 0.01151681, + "auxiliary_loss_mlp": 0.01035562, + "balance_loss_clip": 1.0446794, + "balance_loss_mlp": 1.02770603, + "epoch": 0.6283893464798894, + "flos": 26537231266560.0, + "grad_norm": 1.5732810031396982, + "language_loss": 0.80608922, + "learning_rate": 1.2818340870416186e-06, + "loss": 0.82796168, + "num_input_tokens_seen": 112468040, + "step": 5226, + "time_per_iteration": 2.5544276237487793 + }, + { + "auxiliary_loss_clip": 0.01138162, + "auxiliary_loss_mlp": 0.01026958, + "balance_loss_clip": 1.04392552, + "balance_loss_mlp": 1.01823187, + "epoch": 0.6285095893705285, + "flos": 22237216369920.0, + "grad_norm": 1.8346487226222536, + "language_loss": 0.75865722, + "learning_rate": 1.2811071228954626e-06, + "loss": 0.78030837, + "num_input_tokens_seen": 112486675, + "step": 5227, + "time_per_iteration": 2.549755096435547 + }, + { + "auxiliary_loss_clip": 0.01147368, + "auxiliary_loss_mlp": 0.01024975, + "balance_loss_clip": 1.04967594, + "balance_loss_mlp": 1.01724386, + "epoch": 0.6286298322611675, + "flos": 26542259170560.0, + "grad_norm": 1.849654623242666, + "language_loss": 0.80803919, + "learning_rate": 1.2803802678047846e-06, + "loss": 0.82976258, + "num_input_tokens_seen": 112506825, + "step": 5228, + "time_per_iteration": 2.526991605758667 + }, + { + "auxiliary_loss_clip": 0.01151369, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.0486784, + "balance_loss_mlp": 1.02412915, + "epoch": 0.6287500751518067, + "flos": 21795227516160.0, + "grad_norm": 1.8621384827625662, + "language_loss": 0.7407093, + "learning_rate": 1.279653521879848e-06, + "loss": 0.76255071, + "num_input_tokens_seen": 112526890, + "step": 5229, + "time_per_iteration": 2.5180904865264893 + }, + { + "auxiliary_loss_clip": 0.01080816, + "auxiliary_loss_mlp": 0.01026191, + "balance_loss_clip": 1.03874898, + "balance_loss_mlp": 1.01871657, + "epoch": 0.6288703180424458, + "flos": 20009605587840.0, + "grad_norm": 2.179874532163779, + "language_loss": 0.83847308, + "learning_rate": 1.2789268852308997e-06, + "loss": 0.8595432, + "num_input_tokens_seen": 112542100, + "step": 5230, + "time_per_iteration": 2.7466728687286377 + }, + { + "auxiliary_loss_clip": 0.01153416, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.04706287, + "balance_loss_mlp": 1.02106762, + "epoch": 0.6289905609330848, + "flos": 22124923476480.0, + "grad_norm": 1.9367050038527387, + "language_loss": 0.70836794, + "learning_rate": 1.2782003579681688e-06, + "loss": 0.73019493, + "num_input_tokens_seen": 112561630, + "step": 5231, + "time_per_iteration": 2.6466164588928223 + }, + { + "auxiliary_loss_clip": 0.01176934, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.05091643, + "balance_loss_mlp": 1.02121186, + "epoch": 0.629110803823724, + "flos": 25518481729920.0, + "grad_norm": 1.6091795495975767, + "language_loss": 0.74202812, + "learning_rate": 1.2774739402018701e-06, + "loss": 0.76409197, + "num_input_tokens_seen": 112582465, + "step": 5232, + "time_per_iteration": 2.475940227508545 + }, + { + "auxiliary_loss_clip": 0.01160305, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.05101562, + "balance_loss_mlp": 1.01967812, + "epoch": 0.629231046714363, + "flos": 20886616056960.0, + "grad_norm": 1.6168848595652376, + "language_loss": 0.73068076, + "learning_rate": 1.2767476320422002e-06, + "loss": 0.75256723, + "num_input_tokens_seen": 112602390, + "step": 5233, + "time_per_iteration": 2.474911689758301 + }, + { + "auxiliary_loss_clip": 0.01048182, + "auxiliary_loss_mlp": 0.01001495, + "balance_loss_clip": 1.0152638, + "balance_loss_mlp": 1.00036836, + "epoch": 0.6293512896050021, + "flos": 65050027908480.0, + "grad_norm": 0.6886532675695021, + "language_loss": 0.57230484, + "learning_rate": 1.2760214335993392e-06, + "loss": 0.59280157, + "num_input_tokens_seen": 112669035, + "step": 5234, + "time_per_iteration": 3.202446699142456 + }, + { + "auxiliary_loss_clip": 0.01152923, + "auxiliary_loss_mlp": 0.01024401, + "balance_loss_clip": 1.04587638, + "balance_loss_mlp": 1.01732635, + "epoch": 0.6294715324956413, + "flos": 34677857088000.0, + "grad_norm": 2.0265719909133915, + "language_loss": 0.58602571, + "learning_rate": 1.2752953449834514e-06, + "loss": 0.60779893, + "num_input_tokens_seen": 112691485, + "step": 5235, + "time_per_iteration": 2.5850391387939453 + }, + { + "auxiliary_loss_clip": 0.01175027, + "auxiliary_loss_mlp": 0.01028217, + "balance_loss_clip": 1.05084872, + "balance_loss_mlp": 1.02082002, + "epoch": 0.6295917753862803, + "flos": 22784207656320.0, + "grad_norm": 1.7925764456132687, + "language_loss": 0.80228049, + "learning_rate": 1.2745693663046836e-06, + "loss": 0.82431293, + "num_input_tokens_seen": 112710555, + "step": 5236, + "time_per_iteration": 2.425600528717041 + }, + { + "auxiliary_loss_clip": 0.01155576, + "auxiliary_loss_mlp": 0.0102335, + "balance_loss_clip": 1.04687715, + "balance_loss_mlp": 1.01612353, + "epoch": 0.6297120182769194, + "flos": 20850454039680.0, + "grad_norm": 1.8380806249794899, + "language_loss": 0.80946732, + "learning_rate": 1.2738434976731662e-06, + "loss": 0.83125663, + "num_input_tokens_seen": 112728740, + "step": 5237, + "time_per_iteration": 2.469883918762207 + }, + { + "auxiliary_loss_clip": 0.0114798, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.04877901, + "balance_loss_mlp": 1.02512217, + "epoch": 0.6298322611675584, + "flos": 19497662997120.0, + "grad_norm": 1.6223503842608602, + "language_loss": 0.75334215, + "learning_rate": 1.2731177391990125e-06, + "loss": 0.77516216, + "num_input_tokens_seen": 112748665, + "step": 5238, + "time_per_iteration": 2.49552845954895 + }, + { + "auxiliary_loss_clip": 0.01147133, + "auxiliary_loss_mlp": 0.01019946, + "balance_loss_clip": 1.04552257, + "balance_loss_mlp": 1.01248956, + "epoch": 0.6299525040581976, + "flos": 12604466649600.0, + "grad_norm": 1.9406653691692877, + "language_loss": 0.81869125, + "learning_rate": 1.2723920909923203e-06, + "loss": 0.84036207, + "num_input_tokens_seen": 112764410, + "step": 5239, + "time_per_iteration": 2.4932501316070557 + }, + { + "auxiliary_loss_clip": 0.01071587, + "auxiliary_loss_mlp": 0.01000842, + "balance_loss_clip": 1.01219404, + "balance_loss_mlp": 0.99973375, + "epoch": 0.6300727469488366, + "flos": 57725685636480.0, + "grad_norm": 0.8781207644345415, + "language_loss": 0.60450327, + "learning_rate": 1.2716665531631688e-06, + "loss": 0.62522751, + "num_input_tokens_seen": 112818695, + "step": 5240, + "time_per_iteration": 2.9587345123291016 + }, + { + "auxiliary_loss_clip": 0.01164239, + "auxiliary_loss_mlp": 0.01023778, + "balance_loss_clip": 1.04593492, + "balance_loss_mlp": 1.01538002, + "epoch": 0.6301929898394757, + "flos": 22527302607360.0, + "grad_norm": 2.2710058158601165, + "language_loss": 0.77060497, + "learning_rate": 1.270941125821623e-06, + "loss": 0.79248512, + "num_input_tokens_seen": 112839120, + "step": 5241, + "time_per_iteration": 3.430651903152466 + }, + { + "auxiliary_loss_clip": 0.01152842, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.04341376, + "balance_loss_mlp": 1.02150464, + "epoch": 0.6303132327301149, + "flos": 28293550675200.0, + "grad_norm": 1.6380677333597051, + "language_loss": 0.75391471, + "learning_rate": 1.2702158090777278e-06, + "loss": 0.7757349, + "num_input_tokens_seen": 112860210, + "step": 5242, + "time_per_iteration": 2.5486385822296143 + }, + { + "auxiliary_loss_clip": 0.01129803, + "auxiliary_loss_mlp": 0.0102515, + "balance_loss_clip": 1.04449177, + "balance_loss_mlp": 1.01736224, + "epoch": 0.6304334756207539, + "flos": 25264521596160.0, + "grad_norm": 1.8606048064800949, + "language_loss": 0.745929, + "learning_rate": 1.2694906030415148e-06, + "loss": 0.76747847, + "num_input_tokens_seen": 112877955, + "step": 5243, + "time_per_iteration": 3.9852492809295654 + }, + { + "auxiliary_loss_clip": 0.01154777, + "auxiliary_loss_mlp": 0.01028085, + "balance_loss_clip": 1.0471791, + "balance_loss_mlp": 1.01996684, + "epoch": 0.630553718511393, + "flos": 18033548728320.0, + "grad_norm": 2.574219013900799, + "language_loss": 0.83522689, + "learning_rate": 1.2687655078229958e-06, + "loss": 0.85705543, + "num_input_tokens_seen": 112892285, + "step": 5244, + "time_per_iteration": 2.4578311443328857 + }, + { + "auxiliary_loss_clip": 0.0114345, + "auxiliary_loss_mlp": 0.01025715, + "balance_loss_clip": 1.04722917, + "balance_loss_mlp": 1.01824069, + "epoch": 0.6306739614020321, + "flos": 27304103658240.0, + "grad_norm": 2.6608492686438954, + "language_loss": 0.69290423, + "learning_rate": 1.2680405235321678e-06, + "loss": 0.71459591, + "num_input_tokens_seen": 112913620, + "step": 5245, + "time_per_iteration": 2.5426182746887207 + }, + { + "auxiliary_loss_clip": 0.0114965, + "auxiliary_loss_mlp": 0.00762974, + "balance_loss_clip": 1.05075097, + "balance_loss_mlp": 1.00102723, + "epoch": 0.6307942042926712, + "flos": 15341434243200.0, + "grad_norm": 2.3965828304032186, + "language_loss": 0.78805494, + "learning_rate": 1.267315650279011e-06, + "loss": 0.80718124, + "num_input_tokens_seen": 112932090, + "step": 5246, + "time_per_iteration": 2.496448516845703 + }, + { + "auxiliary_loss_clip": 0.011252, + "auxiliary_loss_mlp": 0.01023596, + "balance_loss_clip": 1.04693818, + "balance_loss_mlp": 1.01591921, + "epoch": 0.6309144471833102, + "flos": 19606400444160.0, + "grad_norm": 3.7993198166336932, + "language_loss": 0.73507476, + "learning_rate": 1.2665908881734874e-06, + "loss": 0.75656271, + "num_input_tokens_seen": 112950925, + "step": 5247, + "time_per_iteration": 3.2769358158111572 + }, + { + "auxiliary_loss_clip": 0.01160927, + "auxiliary_loss_mlp": 0.0102896, + "balance_loss_clip": 1.04930627, + "balance_loss_mlp": 1.02198601, + "epoch": 0.6310346900739494, + "flos": 17493345112320.0, + "grad_norm": 2.351249302205262, + "language_loss": 0.85165334, + "learning_rate": 1.2658662373255432e-06, + "loss": 0.8735522, + "num_input_tokens_seen": 112969315, + "step": 5248, + "time_per_iteration": 2.4449918270111084 + }, + { + "auxiliary_loss_clip": 0.01051314, + "auxiliary_loss_mlp": 0.01001999, + "balance_loss_clip": 1.0124675, + "balance_loss_mlp": 1.00093162, + "epoch": 0.6311549329645885, + "flos": 55070164131840.0, + "grad_norm": 1.1646229116502207, + "language_loss": 0.5225091, + "learning_rate": 1.2651416978451063e-06, + "loss": 0.54304218, + "num_input_tokens_seen": 113034700, + "step": 5249, + "time_per_iteration": 3.15988826751709 + }, + { + "auxiliary_loss_clip": 0.01177498, + "auxiliary_loss_mlp": 0.01023923, + "balance_loss_clip": 1.05042779, + "balance_loss_mlp": 1.01589453, + "epoch": 0.6312751758552275, + "flos": 41902545075840.0, + "grad_norm": 3.3385509450498, + "language_loss": 0.65437472, + "learning_rate": 1.2644172698420903e-06, + "loss": 0.67638886, + "num_input_tokens_seen": 113056805, + "step": 5250, + "time_per_iteration": 2.6146066188812256 + }, + { + "auxiliary_loss_clip": 0.0113245, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.04591739, + "balance_loss_mlp": 1.0223521, + "epoch": 0.6313954187458667, + "flos": 19646800266240.0, + "grad_norm": 1.7716667735575287, + "language_loss": 0.84920752, + "learning_rate": 1.2636929534263892e-06, + "loss": 0.87083882, + "num_input_tokens_seen": 113075790, + "step": 5251, + "time_per_iteration": 2.5388171672821045 + }, + { + "auxiliary_loss_clip": 0.01133443, + "auxiliary_loss_mlp": 0.01026934, + "balance_loss_clip": 1.0417105, + "balance_loss_mlp": 1.01897049, + "epoch": 0.6315156616365057, + "flos": 22894273906560.0, + "grad_norm": 1.7671389210346367, + "language_loss": 0.77615118, + "learning_rate": 1.2629687487078821e-06, + "loss": 0.79775494, + "num_input_tokens_seen": 113094600, + "step": 5252, + "time_per_iteration": 2.5545430183410645 + }, + { + "auxiliary_loss_clip": 0.01164092, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.04680204, + "balance_loss_mlp": 1.02416074, + "epoch": 0.6316359045271448, + "flos": 23726251699200.0, + "grad_norm": 2.0644008250305346, + "language_loss": 0.75981557, + "learning_rate": 1.2622446557964293e-06, + "loss": 0.78178132, + "num_input_tokens_seen": 113112605, + "step": 5253, + "time_per_iteration": 2.5449137687683105 + }, + { + "auxiliary_loss_clip": 0.01143052, + "auxiliary_loss_mlp": 0.01027157, + "balance_loss_clip": 1.04161215, + "balance_loss_mlp": 1.0203321, + "epoch": 0.631756147417784, + "flos": 33108417164160.0, + "grad_norm": 1.8763114048299603, + "language_loss": 0.71305311, + "learning_rate": 1.261520674801876e-06, + "loss": 0.73475522, + "num_input_tokens_seen": 113133200, + "step": 5254, + "time_per_iteration": 2.59892201423645 + }, + { + "auxiliary_loss_clip": 0.01145223, + "auxiliary_loss_mlp": 0.01026056, + "balance_loss_clip": 1.04956436, + "balance_loss_mlp": 1.01789045, + "epoch": 0.631876390308423, + "flos": 31248424126080.0, + "grad_norm": 2.041431651574693, + "language_loss": 0.72007847, + "learning_rate": 1.2607968058340488e-06, + "loss": 0.74179131, + "num_input_tokens_seen": 113152895, + "step": 5255, + "time_per_iteration": 2.575453042984009 + }, + { + "auxiliary_loss_clip": 0.01142733, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.04536319, + "balance_loss_mlp": 1.02166522, + "epoch": 0.6319966331990621, + "flos": 24681152810880.0, + "grad_norm": 1.7330259967948694, + "language_loss": 0.73202556, + "learning_rate": 1.2600730490027583e-06, + "loss": 0.75374621, + "num_input_tokens_seen": 113173135, + "step": 5256, + "time_per_iteration": 2.5393457412719727 + }, + { + "auxiliary_loss_clip": 0.01130819, + "auxiliary_loss_mlp": 0.01025889, + "balance_loss_clip": 1.04430556, + "balance_loss_mlp": 1.01813436, + "epoch": 0.6321168760897012, + "flos": 17491764913920.0, + "grad_norm": 1.7854754682522622, + "language_loss": 0.80251706, + "learning_rate": 1.2593494044177984e-06, + "loss": 0.82408416, + "num_input_tokens_seen": 113191440, + "step": 5257, + "time_per_iteration": 2.49760365486145 + }, + { + "auxiliary_loss_clip": 0.01178446, + "auxiliary_loss_mlp": 0.01025275, + "balance_loss_clip": 1.04938316, + "balance_loss_mlp": 1.01703799, + "epoch": 0.6322371189803403, + "flos": 18295373940480.0, + "grad_norm": 2.4351821853016715, + "language_loss": 0.80815709, + "learning_rate": 1.2586258721889448e-06, + "loss": 0.83019423, + "num_input_tokens_seen": 113208790, + "step": 5258, + "time_per_iteration": 2.4135689735412598 + }, + { + "auxiliary_loss_clip": 0.01112446, + "auxiliary_loss_mlp": 0.0102879, + "balance_loss_clip": 1.04554927, + "balance_loss_mlp": 1.02074957, + "epoch": 0.6323573618709794, + "flos": 20157270399360.0, + "grad_norm": 1.9384890226635991, + "language_loss": 0.82147527, + "learning_rate": 1.2579024524259573e-06, + "loss": 0.84288764, + "num_input_tokens_seen": 113225050, + "step": 5259, + "time_per_iteration": 2.559544563293457 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01024713, + "balance_loss_clip": 1.04416513, + "balance_loss_mlp": 1.01661897, + "epoch": 0.6324776047616185, + "flos": 20042391726720.0, + "grad_norm": 2.5719976516384335, + "language_loss": 0.91045535, + "learning_rate": 1.2571791452385768e-06, + "loss": 0.93212259, + "num_input_tokens_seen": 113242315, + "step": 5260, + "time_per_iteration": 2.503037691116333 + }, + { + "auxiliary_loss_clip": 0.01148391, + "auxiliary_loss_mlp": 0.01028275, + "balance_loss_clip": 1.04747295, + "balance_loss_mlp": 1.02023435, + "epoch": 0.6325978476522576, + "flos": 30848235724800.0, + "grad_norm": 1.8931686272436987, + "language_loss": 0.77184713, + "learning_rate": 1.2564559507365301e-06, + "loss": 0.79361379, + "num_input_tokens_seen": 113264720, + "step": 5261, + "time_per_iteration": 2.592447519302368 + }, + { + "auxiliary_loss_clip": 0.01148793, + "auxiliary_loss_mlp": 0.01026306, + "balance_loss_clip": 1.04754794, + "balance_loss_mlp": 1.01759744, + "epoch": 0.6327180905428966, + "flos": 24535104111360.0, + "grad_norm": 2.6627847701033884, + "language_loss": 0.79104412, + "learning_rate": 1.2557328690295244e-06, + "loss": 0.81279516, + "num_input_tokens_seen": 113282910, + "step": 5262, + "time_per_iteration": 2.5168771743774414 + }, + { + "auxiliary_loss_clip": 0.01138627, + "auxiliary_loss_mlp": 0.01025623, + "balance_loss_clip": 1.04757941, + "balance_loss_mlp": 1.0178566, + "epoch": 0.6328383334335358, + "flos": 21575274583680.0, + "grad_norm": 1.925106559916415, + "language_loss": 0.7629534, + "learning_rate": 1.255009900227251e-06, + "loss": 0.78459591, + "num_input_tokens_seen": 113301935, + "step": 5263, + "time_per_iteration": 2.5369577407836914 + }, + { + "auxiliary_loss_clip": 0.01171033, + "auxiliary_loss_mlp": 0.01025528, + "balance_loss_clip": 1.04934716, + "balance_loss_mlp": 1.01835203, + "epoch": 0.6329585763241748, + "flos": 22929861306240.0, + "grad_norm": 2.24508159787508, + "language_loss": 0.79582405, + "learning_rate": 1.254287044439383e-06, + "loss": 0.81778961, + "num_input_tokens_seen": 113321540, + "step": 5264, + "time_per_iteration": 2.432516098022461 + }, + { + "auxiliary_loss_clip": 0.01071755, + "auxiliary_loss_mlp": 0.0100006, + "balance_loss_clip": 1.01207924, + "balance_loss_mlp": 0.99897528, + "epoch": 0.6330788192148139, + "flos": 70936897847040.0, + "grad_norm": 0.7682165261583604, + "language_loss": 0.54397309, + "learning_rate": 1.2535643017755776e-06, + "loss": 0.56469119, + "num_input_tokens_seen": 113383730, + "step": 5265, + "time_per_iteration": 3.141976833343506 + }, + { + "auxiliary_loss_clip": 0.01132616, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.04456449, + "balance_loss_mlp": 1.02309418, + "epoch": 0.6331990621054531, + "flos": 21244501215360.0, + "grad_norm": 2.457214497208106, + "language_loss": 0.71829146, + "learning_rate": 1.2528416723454737e-06, + "loss": 0.73993134, + "num_input_tokens_seen": 113400400, + "step": 5266, + "time_per_iteration": 2.5542750358581543 + }, + { + "auxiliary_loss_clip": 0.0117347, + "auxiliary_loss_mlp": 0.01024284, + "balance_loss_clip": 1.05125809, + "balance_loss_mlp": 1.01700652, + "epoch": 0.6333193049960921, + "flos": 34459412526720.0, + "grad_norm": 1.5055294965841024, + "language_loss": 0.71087408, + "learning_rate": 1.2521191562586945e-06, + "loss": 0.73285162, + "num_input_tokens_seen": 113424050, + "step": 5267, + "time_per_iteration": 2.536158323287964 + }, + { + "auxiliary_loss_clip": 0.01174847, + "auxiliary_loss_mlp": 0.00762384, + "balance_loss_clip": 1.05013609, + "balance_loss_mlp": 1.00107121, + "epoch": 0.6334395478867312, + "flos": 18329883932160.0, + "grad_norm": 2.095930792513844, + "language_loss": 0.77022839, + "learning_rate": 1.2513967536248445e-06, + "loss": 0.78960067, + "num_input_tokens_seen": 113440370, + "step": 5268, + "time_per_iteration": 3.21943998336792 + }, + { + "auxiliary_loss_clip": 0.01156997, + "auxiliary_loss_mlp": 0.01026361, + "balance_loss_clip": 1.049963, + "balance_loss_mlp": 1.01917255, + "epoch": 0.6335597907773702, + "flos": 23623152687360.0, + "grad_norm": 1.6837014338767764, + "language_loss": 0.81068301, + "learning_rate": 1.2506744645535117e-06, + "loss": 0.83251667, + "num_input_tokens_seen": 113460800, + "step": 5269, + "time_per_iteration": 2.5154640674591064 + }, + { + "auxiliary_loss_clip": 0.01136896, + "auxiliary_loss_mlp": 0.01022053, + "balance_loss_clip": 1.04024291, + "balance_loss_mlp": 1.01420939, + "epoch": 0.6336800336680094, + "flos": 22710913954560.0, + "grad_norm": 1.9534718105701405, + "language_loss": 0.60385668, + "learning_rate": 1.249952289154267e-06, + "loss": 0.6254462, + "num_input_tokens_seen": 113480840, + "step": 5270, + "time_per_iteration": 3.9953458309173584 + }, + { + "auxiliary_loss_clip": 0.01092157, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.04071856, + "balance_loss_mlp": 1.0227896, + "epoch": 0.6338002765586485, + "flos": 23622757637760.0, + "grad_norm": 5.530685265230291, + "language_loss": 0.76436049, + "learning_rate": 1.2492302275366635e-06, + "loss": 0.78558362, + "num_input_tokens_seen": 113500515, + "step": 5271, + "time_per_iteration": 2.6124024391174316 + }, + { + "auxiliary_loss_clip": 0.01154736, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.04698896, + "balance_loss_mlp": 1.01803482, + "epoch": 0.6339205194492875, + "flos": 26505450708480.0, + "grad_norm": 2.3805679717774035, + "language_loss": 0.65346324, + "learning_rate": 1.2485082798102377e-06, + "loss": 0.67526782, + "num_input_tokens_seen": 113520930, + "step": 5272, + "time_per_iteration": 2.505711317062378 + }, + { + "auxiliary_loss_clip": 0.0113535, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.04362583, + "balance_loss_mlp": 1.01725113, + "epoch": 0.6340407623399267, + "flos": 18544306170240.0, + "grad_norm": 2.2485734403115654, + "language_loss": 0.68403399, + "learning_rate": 1.2477864460845084e-06, + "loss": 0.70564699, + "num_input_tokens_seen": 113537330, + "step": 5273, + "time_per_iteration": 2.514005661010742 + }, + { + "auxiliary_loss_clip": 0.01145846, + "auxiliary_loss_mlp": 0.01024647, + "balance_loss_clip": 1.04575706, + "balance_loss_mlp": 1.01673794, + "epoch": 0.6341610052305657, + "flos": 17712579772800.0, + "grad_norm": 2.85592189669793, + "language_loss": 0.73205209, + "learning_rate": 1.2470647264689776e-06, + "loss": 0.753757, + "num_input_tokens_seen": 113555810, + "step": 5274, + "time_per_iteration": 3.192795753479004 + }, + { + "auxiliary_loss_clip": 0.01111396, + "auxiliary_loss_mlp": 0.01026239, + "balance_loss_clip": 1.04111183, + "balance_loss_mlp": 1.01822162, + "epoch": 0.6342812481212048, + "flos": 23587026583680.0, + "grad_norm": 2.637526010530082, + "language_loss": 0.71280271, + "learning_rate": 1.2463431210731282e-06, + "loss": 0.73417902, + "num_input_tokens_seen": 113575395, + "step": 5275, + "time_per_iteration": 2.6772568225860596 + }, + { + "auxiliary_loss_clip": 0.01128524, + "auxiliary_loss_mlp": 0.01023849, + "balance_loss_clip": 1.04259229, + "balance_loss_mlp": 1.01658964, + "epoch": 0.634401491011844, + "flos": 17821927751040.0, + "grad_norm": 2.499631256608622, + "language_loss": 0.76224232, + "learning_rate": 1.2456216300064289e-06, + "loss": 0.78376603, + "num_input_tokens_seen": 113592945, + "step": 5276, + "time_per_iteration": 2.7290878295898438 + }, + { + "auxiliary_loss_clip": 0.01140426, + "auxiliary_loss_mlp": 0.01024623, + "balance_loss_clip": 1.04534793, + "balance_loss_mlp": 1.01639175, + "epoch": 0.634521733902483, + "flos": 21358158825600.0, + "grad_norm": 1.8185835322198298, + "language_loss": 0.78529942, + "learning_rate": 1.244900253378328e-06, + "loss": 0.80694991, + "num_input_tokens_seen": 113613000, + "step": 5277, + "time_per_iteration": 2.571209669113159 + }, + { + "auxiliary_loss_clip": 0.01077756, + "auxiliary_loss_mlp": 0.01026844, + "balance_loss_clip": 1.04193747, + "balance_loss_mlp": 1.01916718, + "epoch": 0.6346419767931221, + "flos": 16545052103040.0, + "grad_norm": 2.7792291828771094, + "language_loss": 0.69552982, + "learning_rate": 1.2441789912982583e-06, + "loss": 0.7165758, + "num_input_tokens_seen": 113630085, + "step": 5278, + "time_per_iteration": 2.714406728744507 + }, + { + "auxiliary_loss_clip": 0.01165474, + "auxiliary_loss_mlp": 0.01025952, + "balance_loss_clip": 1.04980671, + "balance_loss_mlp": 1.0175184, + "epoch": 0.6347622196837612, + "flos": 24350989973760.0, + "grad_norm": 2.315878409455381, + "language_loss": 0.64658952, + "learning_rate": 1.2434578438756346e-06, + "loss": 0.66850382, + "num_input_tokens_seen": 113650515, + "step": 5279, + "time_per_iteration": 3.0841825008392334 + }, + { + "auxiliary_loss_clip": 0.01161047, + "auxiliary_loss_mlp": 0.0102198, + "balance_loss_clip": 1.0470525, + "balance_loss_mlp": 1.01449943, + "epoch": 0.6348824625744003, + "flos": 64523178195840.0, + "grad_norm": 2.0228864170351764, + "language_loss": 0.78103656, + "learning_rate": 1.242736811219855e-06, + "loss": 0.80286682, + "num_input_tokens_seen": 113676475, + "step": 5280, + "time_per_iteration": 2.886754035949707 + }, + { + "auxiliary_loss_clip": 0.01155054, + "auxiliary_loss_mlp": 0.01024766, + "balance_loss_clip": 1.04722703, + "balance_loss_mlp": 1.01689243, + "epoch": 0.6350027054650393, + "flos": 28622133313920.0, + "grad_norm": 1.7950644096565083, + "language_loss": 0.81979823, + "learning_rate": 1.2420158934402988e-06, + "loss": 0.84159648, + "num_input_tokens_seen": 113697090, + "step": 5281, + "time_per_iteration": 2.514810562133789 + }, + { + "auxiliary_loss_clip": 0.01118306, + "auxiliary_loss_mlp": 0.01024504, + "balance_loss_clip": 1.0412631, + "balance_loss_mlp": 1.01645184, + "epoch": 0.6351229483556785, + "flos": 23002544476800.0, + "grad_norm": 2.0121165813701345, + "language_loss": 0.84535837, + "learning_rate": 1.2412950906463286e-06, + "loss": 0.86678648, + "num_input_tokens_seen": 113714395, + "step": 5282, + "time_per_iteration": 2.544157028198242 + }, + { + "auxiliary_loss_clip": 0.01117304, + "auxiliary_loss_mlp": 0.01024163, + "balance_loss_clip": 1.04396534, + "balance_loss_mlp": 1.01669431, + "epoch": 0.6352431912463176, + "flos": 21939300967680.0, + "grad_norm": 1.8370747348661625, + "language_loss": 0.89776111, + "learning_rate": 1.2405744029472902e-06, + "loss": 0.91917574, + "num_input_tokens_seen": 113733880, + "step": 5283, + "time_per_iteration": 2.5829789638519287 + }, + { + "auxiliary_loss_clip": 0.01143671, + "auxiliary_loss_mlp": 0.01022977, + "balance_loss_clip": 1.04522276, + "balance_loss_mlp": 1.01544285, + "epoch": 0.6353634341369566, + "flos": 13735257684480.0, + "grad_norm": 2.0387583862576113, + "language_loss": 0.75929976, + "learning_rate": 1.2398538304525108e-06, + "loss": 0.78096616, + "num_input_tokens_seen": 113752505, + "step": 5284, + "time_per_iteration": 2.4805567264556885 + }, + { + "auxiliary_loss_clip": 0.01128504, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.04530978, + "balance_loss_mlp": 1.02042341, + "epoch": 0.6354836770275958, + "flos": 19316170552320.0, + "grad_norm": 2.1153653842944546, + "language_loss": 0.7563262, + "learning_rate": 1.2391333732713016e-06, + "loss": 0.77789927, + "num_input_tokens_seen": 113770310, + "step": 5285, + "time_per_iteration": 2.502422332763672 + }, + { + "auxiliary_loss_clip": 0.01129917, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.04279697, + "balance_loss_mlp": 1.02637506, + "epoch": 0.6356039199182348, + "flos": 21613375935360.0, + "grad_norm": 2.8160285138246097, + "language_loss": 0.78370613, + "learning_rate": 1.2384130315129543e-06, + "loss": 0.80536008, + "num_input_tokens_seen": 113788635, + "step": 5286, + "time_per_iteration": 2.5249085426330566 + }, + { + "auxiliary_loss_clip": 0.01068828, + "auxiliary_loss_mlp": 0.01023914, + "balance_loss_clip": 1.03730953, + "balance_loss_mlp": 1.01556897, + "epoch": 0.6357241628088739, + "flos": 18111978074880.0, + "grad_norm": 2.2604546389415434, + "language_loss": 0.73180234, + "learning_rate": 1.2376928052867447e-06, + "loss": 0.75272977, + "num_input_tokens_seen": 113807755, + "step": 5287, + "time_per_iteration": 2.7435011863708496 + }, + { + "auxiliary_loss_clip": 0.01146334, + "auxiliary_loss_mlp": 0.01025135, + "balance_loss_clip": 1.04826784, + "balance_loss_mlp": 1.01738334, + "epoch": 0.6358444056995131, + "flos": 24935256599040.0, + "grad_norm": 2.0268402410651305, + "language_loss": 0.77545607, + "learning_rate": 1.2369726947019299e-06, + "loss": 0.79717076, + "num_input_tokens_seen": 113828230, + "step": 5288, + "time_per_iteration": 3.1513195037841797 + }, + { + "auxiliary_loss_clip": 0.01158985, + "auxiliary_loss_mlp": 0.01021946, + "balance_loss_clip": 1.04673362, + "balance_loss_mlp": 1.01414371, + "epoch": 0.6359646485901521, + "flos": 23293348986240.0, + "grad_norm": 2.2369906138598985, + "language_loss": 0.67465752, + "learning_rate": 1.2362526998677511e-06, + "loss": 0.6964668, + "num_input_tokens_seen": 113844595, + "step": 5289, + "time_per_iteration": 2.519963026046753 + }, + { + "auxiliary_loss_clip": 0.01147703, + "auxiliary_loss_mlp": 0.01027038, + "balance_loss_clip": 1.04566956, + "balance_loss_mlp": 1.0201354, + "epoch": 0.6360848914807912, + "flos": 20887442069760.0, + "grad_norm": 2.066050657994633, + "language_loss": 0.84564912, + "learning_rate": 1.2355328208934301e-06, + "loss": 0.86739659, + "num_input_tokens_seen": 113863470, + "step": 5290, + "time_per_iteration": 2.52815580368042 + }, + { + "auxiliary_loss_clip": 0.01158314, + "auxiliary_loss_mlp": 0.0076275, + "balance_loss_clip": 1.04487646, + "balance_loss_mlp": 1.00113535, + "epoch": 0.6362051343714303, + "flos": 18479775386880.0, + "grad_norm": 1.7911508925271233, + "language_loss": 0.72435254, + "learning_rate": 1.2348130578881728e-06, + "loss": 0.74356318, + "num_input_tokens_seen": 113881690, + "step": 5291, + "time_per_iteration": 2.461695909500122 + }, + { + "auxiliary_loss_clip": 0.01176335, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.04984236, + "balance_loss_mlp": 1.02309942, + "epoch": 0.6363253772620694, + "flos": 24389594115840.0, + "grad_norm": 2.5091503551723573, + "language_loss": 0.76057053, + "learning_rate": 1.2340934109611664e-06, + "loss": 0.78264594, + "num_input_tokens_seen": 113902450, + "step": 5292, + "time_per_iteration": 2.454402446746826 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.01027506, + "balance_loss_clip": 1.04698706, + "balance_loss_mlp": 1.01894093, + "epoch": 0.6364456201527084, + "flos": 25958243940480.0, + "grad_norm": 2.5155587728251736, + "language_loss": 0.6925422, + "learning_rate": 1.2333738802215798e-06, + "loss": 0.71431804, + "num_input_tokens_seen": 113922670, + "step": 5293, + "time_per_iteration": 2.536012887954712 + }, + { + "auxiliary_loss_clip": 0.01110617, + "auxiliary_loss_mlp": 0.01026484, + "balance_loss_clip": 1.04059291, + "balance_loss_mlp": 1.01861644, + "epoch": 0.6365658630433476, + "flos": 20740711011840.0, + "grad_norm": 1.8911391600327727, + "language_loss": 0.80693847, + "learning_rate": 1.2326544657785668e-06, + "loss": 0.82830948, + "num_input_tokens_seen": 113942360, + "step": 5294, + "time_per_iteration": 2.5673279762268066 + }, + { + "auxiliary_loss_clip": 0.01123744, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.04383767, + "balance_loss_mlp": 1.02209425, + "epoch": 0.6366861059339867, + "flos": 21434146047360.0, + "grad_norm": 2.190098932320648, + "language_loss": 0.73877084, + "learning_rate": 1.2319351677412608e-06, + "loss": 0.76031607, + "num_input_tokens_seen": 113959405, + "step": 5295, + "time_per_iteration": 3.3182928562164307 + }, + { + "auxiliary_loss_clip": 0.01142942, + "auxiliary_loss_mlp": 0.01025252, + "balance_loss_clip": 1.04929161, + "balance_loss_mlp": 1.01710391, + "epoch": 0.6368063488246257, + "flos": 22267093507200.0, + "grad_norm": 1.965040875406403, + "language_loss": 0.73996609, + "learning_rate": 1.2312159862187796e-06, + "loss": 0.76164806, + "num_input_tokens_seen": 113977815, + "step": 5296, + "time_per_iteration": 3.691617727279663 + }, + { + "auxiliary_loss_clip": 0.01179876, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.0526185, + "balance_loss_mlp": 1.02359104, + "epoch": 0.6369265917152649, + "flos": 22420719976320.0, + "grad_norm": 1.7322273569570206, + "language_loss": 0.76147318, + "learning_rate": 1.2304969213202217e-06, + "loss": 0.78358674, + "num_input_tokens_seen": 113999075, + "step": 5297, + "time_per_iteration": 2.443378448486328 + }, + { + "auxiliary_loss_clip": 0.01141578, + "auxiliary_loss_mlp": 0.0102647, + "balance_loss_clip": 1.04496634, + "balance_loss_mlp": 1.01893044, + "epoch": 0.6370468346059039, + "flos": 24718176754560.0, + "grad_norm": 2.8729619843849363, + "language_loss": 0.79324985, + "learning_rate": 1.2297779731546692e-06, + "loss": 0.81493032, + "num_input_tokens_seen": 114018170, + "step": 5298, + "time_per_iteration": 2.5217177867889404 + }, + { + "auxiliary_loss_clip": 0.01146216, + "auxiliary_loss_mlp": 0.01025852, + "balance_loss_clip": 1.04935074, + "balance_loss_mlp": 1.01788831, + "epoch": 0.637167077496543, + "flos": 25296589463040.0, + "grad_norm": 2.029069928046034, + "language_loss": 0.77952975, + "learning_rate": 1.2290591418311853e-06, + "loss": 0.8012504, + "num_input_tokens_seen": 114035565, + "step": 5299, + "time_per_iteration": 2.5269787311553955 + }, + { + "auxiliary_loss_clip": 0.01158114, + "auxiliary_loss_mlp": 0.01027057, + "balance_loss_clip": 1.04813111, + "balance_loss_mlp": 1.01932073, + "epoch": 0.637287320387182, + "flos": 27671110871040.0, + "grad_norm": 1.6042214426814971, + "language_loss": 0.72318006, + "learning_rate": 1.2283404274588172e-06, + "loss": 0.74503171, + "num_input_tokens_seen": 114054510, + "step": 5300, + "time_per_iteration": 2.523761749267578 + }, + { + "auxiliary_loss_clip": 0.00993317, + "auxiliary_loss_mlp": 0.01002199, + "balance_loss_clip": 1.00841022, + "balance_loss_mlp": 1.00121009, + "epoch": 0.6374075632778212, + "flos": 63173406873600.0, + "grad_norm": 0.7394508609444757, + "language_loss": 0.52865309, + "learning_rate": 1.227621830146592e-06, + "loss": 0.54860824, + "num_input_tokens_seen": 114109875, + "step": 5301, + "time_per_iteration": 4.127566814422607 + }, + { + "auxiliary_loss_clip": 0.0113472, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.04636121, + "balance_loss_mlp": 1.02273345, + "epoch": 0.6375278061684603, + "flos": 25558127366400.0, + "grad_norm": 2.640481991119296, + "language_loss": 0.79231811, + "learning_rate": 1.2269033500035217e-06, + "loss": 0.81396973, + "num_input_tokens_seen": 114130010, + "step": 5302, + "time_per_iteration": 3.0531630516052246 + }, + { + "auxiliary_loss_clip": 0.01132498, + "auxiliary_loss_mlp": 0.01029064, + "balance_loss_clip": 1.04565465, + "balance_loss_mlp": 1.02161026, + "epoch": 0.6376480490590993, + "flos": 25666362023040.0, + "grad_norm": 1.8069160615697681, + "language_loss": 0.736233, + "learning_rate": 1.2261849871385988e-06, + "loss": 0.75784862, + "num_input_tokens_seen": 114151115, + "step": 5303, + "time_per_iteration": 2.561383008956909 + }, + { + "auxiliary_loss_clip": 0.01175907, + "auxiliary_loss_mlp": 0.01023193, + "balance_loss_clip": 1.0491395, + "balance_loss_mlp": 1.01503289, + "epoch": 0.6377682919497385, + "flos": 31537684350720.0, + "grad_norm": 2.245562747615019, + "language_loss": 0.62612778, + "learning_rate": 1.2254667416607972e-06, + "loss": 0.64811879, + "num_input_tokens_seen": 114172715, + "step": 5304, + "time_per_iteration": 2.5024101734161377 + }, + { + "auxiliary_loss_clip": 0.01158643, + "auxiliary_loss_mlp": 0.01025506, + "balance_loss_clip": 1.0491724, + "balance_loss_mlp": 1.01737285, + "epoch": 0.6378885348403776, + "flos": 23039209284480.0, + "grad_norm": 1.7874186346313068, + "language_loss": 0.83131289, + "learning_rate": 1.2247486136790756e-06, + "loss": 0.8531543, + "num_input_tokens_seen": 114192195, + "step": 5305, + "time_per_iteration": 2.483069658279419 + }, + { + "auxiliary_loss_clip": 0.01164756, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.05129898, + "balance_loss_mlp": 1.02504361, + "epoch": 0.6380087777310166, + "flos": 18697070712960.0, + "grad_norm": 2.373968368438008, + "language_loss": 0.80140412, + "learning_rate": 1.2240306033023726e-06, + "loss": 0.82337898, + "num_input_tokens_seen": 114210020, + "step": 5306, + "time_per_iteration": 2.4424147605895996 + }, + { + "auxiliary_loss_clip": 0.01132762, + "auxiliary_loss_mlp": 0.010261, + "balance_loss_clip": 1.0414741, + "balance_loss_mlp": 1.01810122, + "epoch": 0.6381290206216558, + "flos": 23331558078720.0, + "grad_norm": 1.735492746022293, + "language_loss": 0.72000074, + "learning_rate": 1.223312710639611e-06, + "loss": 0.74158931, + "num_input_tokens_seen": 114228740, + "step": 5307, + "time_per_iteration": 2.544201374053955 + }, + { + "auxiliary_loss_clip": 0.01146866, + "auxiliary_loss_mlp": 0.01025889, + "balance_loss_clip": 1.04787111, + "balance_loss_mlp": 1.01797938, + "epoch": 0.6382492635122948, + "flos": 18880466578560.0, + "grad_norm": 2.1837620222449194, + "language_loss": 0.87225378, + "learning_rate": 1.2225949357996928e-06, + "loss": 0.89398128, + "num_input_tokens_seen": 114246865, + "step": 5308, + "time_per_iteration": 2.512765645980835 + }, + { + "auxiliary_loss_clip": 0.01155919, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.04914212, + "balance_loss_mlp": 1.01722598, + "epoch": 0.6383695064029339, + "flos": 27819134818560.0, + "grad_norm": 1.6370625743334029, + "language_loss": 0.80261338, + "learning_rate": 1.221877278891505e-06, + "loss": 0.82441872, + "num_input_tokens_seen": 114266120, + "step": 5309, + "time_per_iteration": 2.5156266689300537 + }, + { + "auxiliary_loss_clip": 0.01169007, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.05052066, + "balance_loss_mlp": 1.02401185, + "epoch": 0.638489749293573, + "flos": 26395635853440.0, + "grad_norm": 2.214344400579386, + "language_loss": 0.71339506, + "learning_rate": 1.221159740023915e-06, + "loss": 0.73540974, + "num_input_tokens_seen": 114285950, + "step": 5310, + "time_per_iteration": 2.502485990524292 + }, + { + "auxiliary_loss_clip": 0.01141514, + "auxiliary_loss_mlp": 0.00762789, + "balance_loss_clip": 1.04728317, + "balance_loss_mlp": 1.00115252, + "epoch": 0.6386099921842121, + "flos": 23988328306560.0, + "grad_norm": 2.592201412566402, + "language_loss": 0.72685528, + "learning_rate": 1.2204423193057735e-06, + "loss": 0.74589831, + "num_input_tokens_seen": 114304780, + "step": 5311, + "time_per_iteration": 2.5615389347076416 + }, + { + "auxiliary_loss_clip": 0.01050739, + "auxiliary_loss_mlp": 0.01001547, + "balance_loss_clip": 1.01084447, + "balance_loss_mlp": 1.00046206, + "epoch": 0.6387302350748512, + "flos": 71731169337600.0, + "grad_norm": 0.8518513782716837, + "language_loss": 0.63438499, + "learning_rate": 1.2197250168459122e-06, + "loss": 0.65490794, + "num_input_tokens_seen": 114361180, + "step": 5312, + "time_per_iteration": 3.1135525703430176 + }, + { + "auxiliary_loss_clip": 0.01163729, + "auxiliary_loss_mlp": 0.01023659, + "balance_loss_clip": 1.04987192, + "balance_loss_mlp": 1.0158031, + "epoch": 0.6388504779654903, + "flos": 14535778141440.0, + "grad_norm": 2.016876890535321, + "language_loss": 0.74496144, + "learning_rate": 1.2190078327531454e-06, + "loss": 0.76683533, + "num_input_tokens_seen": 114377425, + "step": 5313, + "time_per_iteration": 2.4614806175231934 + }, + { + "auxiliary_loss_clip": 0.01161916, + "auxiliary_loss_mlp": 0.01030076, + "balance_loss_clip": 1.04769588, + "balance_loss_mlp": 1.02238679, + "epoch": 0.6389707208561294, + "flos": 22346133384960.0, + "grad_norm": 1.5089126930039618, + "language_loss": 0.72649211, + "learning_rate": 1.2182907671362697e-06, + "loss": 0.74841201, + "num_input_tokens_seen": 114398120, + "step": 5314, + "time_per_iteration": 2.499514579772949 + }, + { + "auxiliary_loss_clip": 0.01161131, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.04961991, + "balance_loss_mlp": 1.01758122, + "epoch": 0.6390909637467684, + "flos": 19426883247360.0, + "grad_norm": 3.061248500749292, + "language_loss": 0.78610611, + "learning_rate": 1.2175738201040626e-06, + "loss": 0.80797577, + "num_input_tokens_seen": 114415160, + "step": 5315, + "time_per_iteration": 2.4613234996795654 + }, + { + "auxiliary_loss_clip": 0.01159931, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.04770994, + "balance_loss_mlp": 1.02382731, + "epoch": 0.6392112066374076, + "flos": 24090852700800.0, + "grad_norm": 2.7596711268945864, + "language_loss": 0.78607249, + "learning_rate": 1.2168569917652855e-06, + "loss": 0.80798775, + "num_input_tokens_seen": 114435015, + "step": 5316, + "time_per_iteration": 2.4966278076171875 + }, + { + "auxiliary_loss_clip": 0.01163857, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.05067158, + "balance_loss_mlp": 1.014498, + "epoch": 0.6393314495280467, + "flos": 26795141896320.0, + "grad_norm": 1.6423243189274204, + "language_loss": 0.63833863, + "learning_rate": 1.2161402822286797e-06, + "loss": 0.66020405, + "num_input_tokens_seen": 114455700, + "step": 5317, + "time_per_iteration": 2.5345253944396973 + }, + { + "auxiliary_loss_clip": 0.01129262, + "auxiliary_loss_mlp": 0.0102414, + "balance_loss_clip": 1.04389143, + "balance_loss_mlp": 1.01649308, + "epoch": 0.6394516924186857, + "flos": 20260692633600.0, + "grad_norm": 2.0311518199605967, + "language_loss": 0.79055721, + "learning_rate": 1.2154236916029703e-06, + "loss": 0.81209123, + "num_input_tokens_seen": 114473675, + "step": 5318, + "time_per_iteration": 2.5249972343444824 + }, + { + "auxiliary_loss_clip": 0.01118018, + "auxiliary_loss_mlp": 0.01025314, + "balance_loss_clip": 1.0400877, + "balance_loss_mlp": 1.01730871, + "epoch": 0.6395719353093249, + "flos": 18368847210240.0, + "grad_norm": 2.665067741143917, + "language_loss": 0.7408886, + "learning_rate": 1.2147072199968627e-06, + "loss": 0.76232189, + "num_input_tokens_seen": 114492310, + "step": 5319, + "time_per_iteration": 2.5487253665924072 + }, + { + "auxiliary_loss_clip": 0.01159513, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.04859781, + "balance_loss_mlp": 1.02055621, + "epoch": 0.6396921781999639, + "flos": 17566315591680.0, + "grad_norm": 1.77529836159715, + "language_loss": 0.71513027, + "learning_rate": 1.2139908675190454e-06, + "loss": 0.73700351, + "num_input_tokens_seen": 114511520, + "step": 5320, + "time_per_iteration": 2.4771831035614014 + }, + { + "auxiliary_loss_clip": 0.01094033, + "auxiliary_loss_mlp": 0.01025238, + "balance_loss_clip": 1.03860831, + "balance_loss_mlp": 1.01751351, + "epoch": 0.639812421090603, + "flos": 21251252972160.0, + "grad_norm": 1.9385114180421417, + "language_loss": 0.7533474, + "learning_rate": 1.2132746342781883e-06, + "loss": 0.77454007, + "num_input_tokens_seen": 114532680, + "step": 5321, + "time_per_iteration": 3.539865255355835 + }, + { + "auxiliary_loss_clip": 0.01176499, + "auxiliary_loss_mlp": 0.0102616, + "balance_loss_clip": 1.05060244, + "balance_loss_mlp": 1.01776171, + "epoch": 0.6399326639812422, + "flos": 11180967684480.0, + "grad_norm": 2.6517444429643704, + "language_loss": 0.80687475, + "learning_rate": 1.2125585203829442e-06, + "loss": 0.82890135, + "num_input_tokens_seen": 114548320, + "step": 5322, + "time_per_iteration": 3.9243600368499756 + }, + { + "auxiliary_loss_clip": 0.01121876, + "auxiliary_loss_mlp": 0.01028948, + "balance_loss_clip": 1.04537606, + "balance_loss_mlp": 1.02087152, + "epoch": 0.6400529068718812, + "flos": 23911048195200.0, + "grad_norm": 1.8830435498594589, + "language_loss": 0.73826033, + "learning_rate": 1.211842525941946e-06, + "loss": 0.75976855, + "num_input_tokens_seen": 114568115, + "step": 5323, + "time_per_iteration": 2.5509586334228516 + }, + { + "auxiliary_loss_clip": 0.01115219, + "auxiliary_loss_mlp": 0.01023249, + "balance_loss_clip": 1.04502344, + "balance_loss_mlp": 1.01551247, + "epoch": 0.6401731497625203, + "flos": 44018724890880.0, + "grad_norm": 1.8430010082357589, + "language_loss": 0.78996658, + "learning_rate": 1.2111266510638105e-06, + "loss": 0.8113513, + "num_input_tokens_seen": 114591040, + "step": 5324, + "time_per_iteration": 2.7757961750030518 + }, + { + "auxiliary_loss_clip": 0.01097977, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.04218197, + "balance_loss_mlp": 1.02170908, + "epoch": 0.6402933926531594, + "flos": 20662209838080.0, + "grad_norm": 2.260102996931899, + "language_loss": 0.8017453, + "learning_rate": 1.2104108958571346e-06, + "loss": 0.82302356, + "num_input_tokens_seen": 114609310, + "step": 5325, + "time_per_iteration": 2.6362643241882324 + }, + { + "auxiliary_loss_clip": 0.01158923, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.04978955, + "balance_loss_mlp": 1.02168751, + "epoch": 0.6404136355437985, + "flos": 24863327614080.0, + "grad_norm": 1.4290348053518107, + "language_loss": 0.75899553, + "learning_rate": 1.2096952604304975e-06, + "loss": 0.7808789, + "num_input_tokens_seen": 114629740, + "step": 5326, + "time_per_iteration": 2.4990477561950684 + }, + { + "auxiliary_loss_clip": 0.01161725, + "auxiliary_loss_mlp": 0.01026147, + "balance_loss_clip": 1.04693937, + "balance_loss_mlp": 1.01804638, + "epoch": 0.6405338784344375, + "flos": 40479548901120.0, + "grad_norm": 2.165598567976121, + "language_loss": 0.70238268, + "learning_rate": 1.2089797448924616e-06, + "loss": 0.72426146, + "num_input_tokens_seen": 114653615, + "step": 5327, + "time_per_iteration": 3.402688980102539 + }, + { + "auxiliary_loss_clip": 0.0112159, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.039994, + "balance_loss_mlp": 1.01950884, + "epoch": 0.6406541213250767, + "flos": 20886041439360.0, + "grad_norm": 3.1634723877099535, + "language_loss": 0.66637039, + "learning_rate": 1.2082643493515692e-06, + "loss": 0.68786168, + "num_input_tokens_seen": 114671935, + "step": 5328, + "time_per_iteration": 2.54941463470459 + }, + { + "auxiliary_loss_clip": 0.01158536, + "auxiliary_loss_mlp": 0.01026823, + "balance_loss_clip": 1.04729164, + "balance_loss_mlp": 1.01880026, + "epoch": 0.6407743642157158, + "flos": 23295970679040.0, + "grad_norm": 1.9237600114485596, + "language_loss": 0.81938052, + "learning_rate": 1.207549073916346e-06, + "loss": 0.84123409, + "num_input_tokens_seen": 114692870, + "step": 5329, + "time_per_iteration": 2.4858579635620117 + }, + { + "auxiliary_loss_clip": 0.01137674, + "auxiliary_loss_mlp": 0.01026261, + "balance_loss_clip": 1.04658365, + "balance_loss_mlp": 1.01869094, + "epoch": 0.6408946071063548, + "flos": 15012636122880.0, + "grad_norm": 2.0371384541355435, + "language_loss": 0.77725083, + "learning_rate": 1.2068339186952976e-06, + "loss": 0.79889017, + "num_input_tokens_seen": 114710410, + "step": 5330, + "time_per_iteration": 2.468622922897339 + }, + { + "auxiliary_loss_clip": 0.0116394, + "auxiliary_loss_mlp": 0.01028092, + "balance_loss_clip": 1.04989934, + "balance_loss_mlp": 1.02020037, + "epoch": 0.6410148499969939, + "flos": 22528595496960.0, + "grad_norm": 1.7918282877896476, + "language_loss": 0.73339689, + "learning_rate": 1.2061188837969136e-06, + "loss": 0.75531727, + "num_input_tokens_seen": 114730020, + "step": 5331, + "time_per_iteration": 2.4668161869049072 + }, + { + "auxiliary_loss_clip": 0.01125557, + "auxiliary_loss_mlp": 0.01023854, + "balance_loss_clip": 1.04182434, + "balance_loss_mlp": 1.01546144, + "epoch": 0.641135092887633, + "flos": 12422004537600.0, + "grad_norm": 2.1979001867320513, + "language_loss": 0.83909464, + "learning_rate": 1.2054039693296631e-06, + "loss": 0.86058879, + "num_input_tokens_seen": 114748015, + "step": 5332, + "time_per_iteration": 2.506239414215088 + }, + { + "auxiliary_loss_clip": 0.01124157, + "auxiliary_loss_mlp": 0.01026244, + "balance_loss_clip": 1.04136896, + "balance_loss_mlp": 1.01873326, + "epoch": 0.6412553357782721, + "flos": 22127329687680.0, + "grad_norm": 1.8081052835486404, + "language_loss": 0.81489933, + "learning_rate": 1.2046891754019992e-06, + "loss": 0.83640337, + "num_input_tokens_seen": 114768625, + "step": 5333, + "time_per_iteration": 2.5506248474121094 + }, + { + "auxiliary_loss_clip": 0.01164117, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.04982102, + "balance_loss_mlp": 1.02372551, + "epoch": 0.6413755786689112, + "flos": 15888605097600.0, + "grad_norm": 1.9069818094170168, + "language_loss": 0.82511544, + "learning_rate": 1.2039745021223548e-06, + "loss": 0.84707606, + "num_input_tokens_seen": 114786045, + "step": 5334, + "time_per_iteration": 2.4339985847473145 + }, + { + "auxiliary_loss_clip": 0.01021595, + "auxiliary_loss_mlp": 0.0100187, + "balance_loss_clip": 1.00996208, + "balance_loss_mlp": 1.00079155, + "epoch": 0.6414958215595503, + "flos": 68039159955840.0, + "grad_norm": 0.7908248630391187, + "language_loss": 0.5707947, + "learning_rate": 1.2032599495991456e-06, + "loss": 0.59102935, + "num_input_tokens_seen": 114850785, + "step": 5335, + "time_per_iteration": 3.217888116836548 + }, + { + "auxiliary_loss_clip": 0.01161291, + "auxiliary_loss_mlp": 0.01023725, + "balance_loss_clip": 1.04932499, + "balance_loss_mlp": 1.01570857, + "epoch": 0.6416160644501894, + "flos": 44091300320640.0, + "grad_norm": 1.7678491516493757, + "language_loss": 0.69550538, + "learning_rate": 1.2025455179407685e-06, + "loss": 0.71735561, + "num_input_tokens_seen": 114871945, + "step": 5336, + "time_per_iteration": 2.667365312576294 + }, + { + "auxiliary_loss_clip": 0.01156788, + "auxiliary_loss_mlp": 0.00762589, + "balance_loss_clip": 1.04826379, + "balance_loss_mlp": 1.00117171, + "epoch": 0.6417363073408284, + "flos": 20959837931520.0, + "grad_norm": 2.331515976449252, + "language_loss": 0.73656452, + "learning_rate": 1.2018312072556022e-06, + "loss": 0.75575829, + "num_input_tokens_seen": 114890445, + "step": 5337, + "time_per_iteration": 2.4621706008911133 + }, + { + "auxiliary_loss_clip": 0.0117077, + "auxiliary_loss_mlp": 0.00762742, + "balance_loss_clip": 1.04764807, + "balance_loss_mlp": 1.00107622, + "epoch": 0.6418565502314676, + "flos": 22455122227200.0, + "grad_norm": 1.7052110617795913, + "language_loss": 0.74149048, + "learning_rate": 1.2011170176520077e-06, + "loss": 0.76082557, + "num_input_tokens_seen": 114911360, + "step": 5338, + "time_per_iteration": 2.4740869998931885 + }, + { + "auxiliary_loss_clip": 0.01087161, + "auxiliary_loss_mlp": 0.01022195, + "balance_loss_clip": 1.04005492, + "balance_loss_mlp": 1.01468778, + "epoch": 0.6419767931221066, + "flos": 25045502417280.0, + "grad_norm": 1.5579817636417803, + "language_loss": 0.8141948, + "learning_rate": 1.2004029492383256e-06, + "loss": 0.83528841, + "num_input_tokens_seen": 114932700, + "step": 5339, + "time_per_iteration": 2.637355089187622 + }, + { + "auxiliary_loss_clip": 0.01159055, + "auxiliary_loss_mlp": 0.01026399, + "balance_loss_clip": 1.04952645, + "balance_loss_mlp": 1.01856709, + "epoch": 0.6420970360127457, + "flos": 19463691709440.0, + "grad_norm": 2.2291964819829637, + "language_loss": 0.7365396, + "learning_rate": 1.1996890021228814e-06, + "loss": 0.75839412, + "num_input_tokens_seen": 114949475, + "step": 5340, + "time_per_iteration": 2.4739768505096436 + }, + { + "auxiliary_loss_clip": 0.01141906, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.04406226, + "balance_loss_mlp": 1.02006269, + "epoch": 0.6422172789033849, + "flos": 40406147458560.0, + "grad_norm": 1.574168179571479, + "language_loss": 0.69901431, + "learning_rate": 1.1989751764139785e-06, + "loss": 0.72071314, + "num_input_tokens_seen": 114973125, + "step": 5341, + "time_per_iteration": 2.675703525543213 + }, + { + "auxiliary_loss_clip": 0.01112278, + "auxiliary_loss_mlp": 0.01024597, + "balance_loss_clip": 1.0374701, + "balance_loss_mlp": 1.01638365, + "epoch": 0.6423375217940239, + "flos": 27672870637440.0, + "grad_norm": 1.5678103639003755, + "language_loss": 0.83081168, + "learning_rate": 1.1982614722199044e-06, + "loss": 0.85218036, + "num_input_tokens_seen": 114994300, + "step": 5342, + "time_per_iteration": 2.640468120574951 + }, + { + "auxiliary_loss_clip": 0.01150764, + "auxiliary_loss_mlp": 0.01028861, + "balance_loss_clip": 1.04566503, + "balance_loss_mlp": 1.02138698, + "epoch": 0.642457764684663, + "flos": 18369242259840.0, + "grad_norm": 2.9187193586294935, + "language_loss": 0.77271259, + "learning_rate": 1.1975478896489276e-06, + "loss": 0.79450881, + "num_input_tokens_seen": 115012135, + "step": 5343, + "time_per_iteration": 2.4741885662078857 + }, + { + "auxiliary_loss_clip": 0.01170781, + "auxiliary_loss_mlp": 0.01026299, + "balance_loss_clip": 1.04725862, + "balance_loss_mlp": 1.01900029, + "epoch": 0.6425780075753021, + "flos": 19750509809280.0, + "grad_norm": 2.2633127546748204, + "language_loss": 0.7661739, + "learning_rate": 1.1968344288092981e-06, + "loss": 0.78814471, + "num_input_tokens_seen": 115028715, + "step": 5344, + "time_per_iteration": 2.4501914978027344 + }, + { + "auxiliary_loss_clip": 0.0116073, + "auxiliary_loss_mlp": 0.00762753, + "balance_loss_clip": 1.04874468, + "balance_loss_mlp": 1.00112963, + "epoch": 0.6426982504659412, + "flos": 20558536208640.0, + "grad_norm": 2.5209174089476685, + "language_loss": 0.65028751, + "learning_rate": 1.1961210898092468e-06, + "loss": 0.66952235, + "num_input_tokens_seen": 115047665, + "step": 5345, + "time_per_iteration": 2.5994789600372314 + }, + { + "auxiliary_loss_clip": 0.01151872, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.04841781, + "balance_loss_mlp": 1.02121282, + "epoch": 0.6428184933565803, + "flos": 17851984456320.0, + "grad_norm": 2.1611260636080516, + "language_loss": 0.79299605, + "learning_rate": 1.1954078727569874e-06, + "loss": 0.81480944, + "num_input_tokens_seen": 115064965, + "step": 5346, + "time_per_iteration": 2.595580816268921 + }, + { + "auxiliary_loss_clip": 0.01134551, + "auxiliary_loss_mlp": 0.00762888, + "balance_loss_clip": 1.04285622, + "balance_loss_mlp": 1.00115836, + "epoch": 0.6429387362472194, + "flos": 22456953820800.0, + "grad_norm": 2.085271825230915, + "language_loss": 0.7790224, + "learning_rate": 1.1946947777607141e-06, + "loss": 0.79799676, + "num_input_tokens_seen": 115086100, + "step": 5347, + "time_per_iteration": 3.3304636478424072 + }, + { + "auxiliary_loss_clip": 0.01110182, + "auxiliary_loss_mlp": 0.01025751, + "balance_loss_clip": 1.04157925, + "balance_loss_mlp": 1.0178591, + "epoch": 0.6430589791378585, + "flos": 24752579005440.0, + "grad_norm": 3.526587160222229, + "language_loss": 0.80286968, + "learning_rate": 1.1939818049286024e-06, + "loss": 0.82422894, + "num_input_tokens_seen": 115104260, + "step": 5348, + "time_per_iteration": 2.588914155960083 + }, + { + "auxiliary_loss_clip": 0.01091239, + "auxiliary_loss_mlp": 0.01025566, + "balance_loss_clip": 1.04094982, + "balance_loss_mlp": 1.01789463, + "epoch": 0.6431792220284975, + "flos": 24901249397760.0, + "grad_norm": 1.6904368172927309, + "language_loss": 0.75635612, + "learning_rate": 1.1932689543688101e-06, + "loss": 0.77752417, + "num_input_tokens_seen": 115125365, + "step": 5349, + "time_per_iteration": 4.177224159240723 + }, + { + "auxiliary_loss_clip": 0.01145781, + "auxiliary_loss_mlp": 0.01028423, + "balance_loss_clip": 1.04803109, + "balance_loss_mlp": 1.02047157, + "epoch": 0.6432994649191367, + "flos": 21032305620480.0, + "grad_norm": 2.935174623005061, + "language_loss": 0.72487378, + "learning_rate": 1.1925562261894756e-06, + "loss": 0.74661577, + "num_input_tokens_seen": 115144445, + "step": 5350, + "time_per_iteration": 2.5301504135131836 + }, + { + "auxiliary_loss_clip": 0.0114148, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.04474664, + "balance_loss_mlp": 1.02260375, + "epoch": 0.6434197078097758, + "flos": 30884433655680.0, + "grad_norm": 1.936171672304102, + "language_loss": 0.77618003, + "learning_rate": 1.1918436204987207e-06, + "loss": 0.79789555, + "num_input_tokens_seen": 115166305, + "step": 5351, + "time_per_iteration": 2.5859694480895996 + }, + { + "auxiliary_loss_clip": 0.01154909, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.04786515, + "balance_loss_mlp": 1.01933575, + "epoch": 0.6435399507004148, + "flos": 15012492468480.0, + "grad_norm": 2.485241265866211, + "language_loss": 0.81395334, + "learning_rate": 1.191131137404645e-06, + "loss": 0.83577359, + "num_input_tokens_seen": 115183045, + "step": 5352, + "time_per_iteration": 2.489269733428955 + }, + { + "auxiliary_loss_clip": 0.01119707, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.04347992, + "balance_loss_mlp": 1.01966262, + "epoch": 0.643660193591054, + "flos": 19901981462400.0, + "grad_norm": 1.8955068996572693, + "language_loss": 0.76926088, + "learning_rate": 1.190418777015333e-06, + "loss": 0.79073048, + "num_input_tokens_seen": 115201955, + "step": 5353, + "time_per_iteration": 2.554658889770508 + }, + { + "auxiliary_loss_clip": 0.01143779, + "auxiliary_loss_mlp": 0.01020462, + "balance_loss_clip": 1.04608202, + "balance_loss_mlp": 1.01325321, + "epoch": 0.643780436481693, + "flos": 24133622820480.0, + "grad_norm": 1.423863053369931, + "language_loss": 0.73432577, + "learning_rate": 1.1897065394388487e-06, + "loss": 0.75596821, + "num_input_tokens_seen": 115222395, + "step": 5354, + "time_per_iteration": 3.3029520511627197 + }, + { + "auxiliary_loss_clip": 0.01146809, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.05085444, + "balance_loss_mlp": 1.02096987, + "epoch": 0.6439006793723321, + "flos": 23148808657920.0, + "grad_norm": 2.1900391483762935, + "language_loss": 0.76281071, + "learning_rate": 1.1889944247832385e-06, + "loss": 0.78456753, + "num_input_tokens_seen": 115242635, + "step": 5355, + "time_per_iteration": 2.50691819190979 + }, + { + "auxiliary_loss_clip": 0.01161329, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.0463655, + "balance_loss_mlp": 1.0233804, + "epoch": 0.6440209222629713, + "flos": 23617909301760.0, + "grad_norm": 2.181230274906088, + "language_loss": 0.71175635, + "learning_rate": 1.1882824331565283e-06, + "loss": 0.73367965, + "num_input_tokens_seen": 115262095, + "step": 5356, + "time_per_iteration": 2.4989001750946045 + }, + { + "auxiliary_loss_clip": 0.01125995, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.0420109, + "balance_loss_mlp": 1.02129579, + "epoch": 0.6441411651536103, + "flos": 16544872535040.0, + "grad_norm": 2.1172347888941934, + "language_loss": 0.89096981, + "learning_rate": 1.1875705646667287e-06, + "loss": 0.91251886, + "num_input_tokens_seen": 115279985, + "step": 5357, + "time_per_iteration": 2.5079872608184814 + }, + { + "auxiliary_loss_clip": 0.01155505, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.04447317, + "balance_loss_mlp": 1.01806378, + "epoch": 0.6442614080442494, + "flos": 25410965345280.0, + "grad_norm": 2.1118695030418984, + "language_loss": 0.75016308, + "learning_rate": 1.1868588194218282e-06, + "loss": 0.77198315, + "num_input_tokens_seen": 115300365, + "step": 5358, + "time_per_iteration": 2.4933533668518066 + }, + { + "auxiliary_loss_clip": 0.0115171, + "auxiliary_loss_mlp": 0.01027703, + "balance_loss_clip": 1.04572248, + "balance_loss_mlp": 1.01948345, + "epoch": 0.6443816509348885, + "flos": 28294017552000.0, + "grad_norm": 1.6796312196428318, + "language_loss": 0.73860848, + "learning_rate": 1.1861471975297979e-06, + "loss": 0.76040268, + "num_input_tokens_seen": 115322060, + "step": 5359, + "time_per_iteration": 2.658783435821533 + }, + { + "auxiliary_loss_clip": 0.01130102, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.0477519, + "balance_loss_mlp": 1.01733053, + "epoch": 0.6445018938255276, + "flos": 36690075964800.0, + "grad_norm": 1.5067072266466313, + "language_loss": 0.7106781, + "learning_rate": 1.185435699098591e-06, + "loss": 0.73223841, + "num_input_tokens_seen": 115348255, + "step": 5360, + "time_per_iteration": 2.7533392906188965 + }, + { + "auxiliary_loss_clip": 0.01149336, + "auxiliary_loss_mlp": 0.01023115, + "balance_loss_clip": 1.04612958, + "balance_loss_mlp": 1.01556623, + "epoch": 0.6446221367161666, + "flos": 14501411804160.0, + "grad_norm": 2.107954919380672, + "language_loss": 0.78152168, + "learning_rate": 1.1847243242361403e-06, + "loss": 0.80324626, + "num_input_tokens_seen": 115366845, + "step": 5361, + "time_per_iteration": 2.4680981636047363 + }, + { + "auxiliary_loss_clip": 0.01145177, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.04500103, + "balance_loss_mlp": 1.02314401, + "epoch": 0.6447423796068057, + "flos": 24609367480320.0, + "grad_norm": 1.5574639869102385, + "language_loss": 0.77925855, + "learning_rate": 1.1840130730503624e-06, + "loss": 0.80101836, + "num_input_tokens_seen": 115388125, + "step": 5362, + "time_per_iteration": 2.559142589569092 + }, + { + "auxiliary_loss_clip": 0.01174099, + "auxiliary_loss_mlp": 0.01026958, + "balance_loss_clip": 1.04929495, + "balance_loss_mlp": 1.01940632, + "epoch": 0.6448626224974449, + "flos": 25047298097280.0, + "grad_norm": 2.10385321375662, + "language_loss": 0.75026298, + "learning_rate": 1.1833019456491518e-06, + "loss": 0.77227354, + "num_input_tokens_seen": 115409655, + "step": 5363, + "time_per_iteration": 2.461615800857544 + }, + { + "auxiliary_loss_clip": 0.01162054, + "auxiliary_loss_mlp": 0.01028531, + "balance_loss_clip": 1.04988337, + "balance_loss_mlp": 1.02072263, + "epoch": 0.6449828653880839, + "flos": 22530355263360.0, + "grad_norm": 1.9511658669439313, + "language_loss": 0.7885955, + "learning_rate": 1.1825909421403871e-06, + "loss": 0.81050134, + "num_input_tokens_seen": 115428750, + "step": 5364, + "time_per_iteration": 2.4799771308898926 + }, + { + "auxiliary_loss_clip": 0.01161097, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.0480715, + "balance_loss_mlp": 1.01982248, + "epoch": 0.645103108278723, + "flos": 25695736369920.0, + "grad_norm": 1.8327106955872798, + "language_loss": 0.76213014, + "learning_rate": 1.181880062631926e-06, + "loss": 0.78401428, + "num_input_tokens_seen": 115448085, + "step": 5365, + "time_per_iteration": 2.5311317443847656 + }, + { + "auxiliary_loss_clip": 0.01140036, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.04731214, + "balance_loss_mlp": 1.02241826, + "epoch": 0.6452233511693621, + "flos": 27450331925760.0, + "grad_norm": 2.4648687521395996, + "language_loss": 0.84592366, + "learning_rate": 1.1811693072316093e-06, + "loss": 0.86763823, + "num_input_tokens_seen": 115465765, + "step": 5366, + "time_per_iteration": 2.5606062412261963 + }, + { + "auxiliary_loss_clip": 0.01172551, + "auxiliary_loss_mlp": 0.00763127, + "balance_loss_clip": 1.04703379, + "balance_loss_mlp": 1.00108457, + "epoch": 0.6453435940600012, + "flos": 19208618254080.0, + "grad_norm": 2.635234596844871, + "language_loss": 0.83747256, + "learning_rate": 1.1804586760472574e-06, + "loss": 0.85682929, + "num_input_tokens_seen": 115482230, + "step": 5367, + "time_per_iteration": 2.43666934967041 + }, + { + "auxiliary_loss_clip": 0.01130513, + "auxiliary_loss_mlp": 0.01024592, + "balance_loss_clip": 1.04531479, + "balance_loss_mlp": 1.01634312, + "epoch": 0.6454638369506402, + "flos": 25737680476800.0, + "grad_norm": 2.1645378097686243, + "language_loss": 0.79811192, + "learning_rate": 1.1797481691866736e-06, + "loss": 0.81966299, + "num_input_tokens_seen": 115499455, + "step": 5368, + "time_per_iteration": 2.552358388900757 + }, + { + "auxiliary_loss_clip": 0.01135226, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.04532146, + "balance_loss_mlp": 1.02522898, + "epoch": 0.6455840798412794, + "flos": 20989176364800.0, + "grad_norm": 2.13194573144124, + "language_loss": 0.83388358, + "learning_rate": 1.1790377867576393e-06, + "loss": 0.85556477, + "num_input_tokens_seen": 115517205, + "step": 5369, + "time_per_iteration": 2.498859167098999 + }, + { + "auxiliary_loss_clip": 0.01150785, + "auxiliary_loss_mlp": 0.01024448, + "balance_loss_clip": 1.04780793, + "balance_loss_mlp": 1.01651502, + "epoch": 0.6457043227319185, + "flos": 26067556005120.0, + "grad_norm": 2.4363934828856255, + "language_loss": 0.76207834, + "learning_rate": 1.1783275288679203e-06, + "loss": 0.78383064, + "num_input_tokens_seen": 115534370, + "step": 5370, + "time_per_iteration": 2.542241334915161 + }, + { + "auxiliary_loss_clip": 0.01060184, + "auxiliary_loss_mlp": 0.0100265, + "balance_loss_clip": 1.01008189, + "balance_loss_mlp": 1.00145757, + "epoch": 0.6458245656225575, + "flos": 60370831088640.0, + "grad_norm": 0.8443403444349042, + "language_loss": 0.57164878, + "learning_rate": 1.177617395625262e-06, + "loss": 0.59227711, + "num_input_tokens_seen": 115592345, + "step": 5371, + "time_per_iteration": 3.0860769748687744 + }, + { + "auxiliary_loss_clip": 0.01159692, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.04906154, + "balance_loss_mlp": 1.02134955, + "epoch": 0.6459448085131967, + "flos": 23076771932160.0, + "grad_norm": 1.6875625295855927, + "language_loss": 0.75190079, + "learning_rate": 1.1769073871373908e-06, + "loss": 0.77378893, + "num_input_tokens_seen": 115612550, + "step": 5372, + "time_per_iteration": 2.4919228553771973 + }, + { + "auxiliary_loss_clip": 0.01129028, + "auxiliary_loss_mlp": 0.01027159, + "balance_loss_clip": 1.0427556, + "balance_loss_mlp": 1.01952338, + "epoch": 0.6460650514038357, + "flos": 22598190097920.0, + "grad_norm": 2.1047502016769037, + "language_loss": 0.83791018, + "learning_rate": 1.176197503512015e-06, + "loss": 0.85947204, + "num_input_tokens_seen": 115632265, + "step": 5373, + "time_per_iteration": 2.566441297531128 + }, + { + "auxiliary_loss_clip": 0.0114449, + "auxiliary_loss_mlp": 0.01023945, + "balance_loss_clip": 1.04788888, + "balance_loss_mlp": 1.01669097, + "epoch": 0.6461852942944748, + "flos": 20266726118400.0, + "grad_norm": 1.9550602362871108, + "language_loss": 0.81768084, + "learning_rate": 1.1754877448568223e-06, + "loss": 0.83936512, + "num_input_tokens_seen": 115651720, + "step": 5374, + "time_per_iteration": 3.2100462913513184 + }, + { + "auxiliary_loss_clip": 0.01146041, + "auxiliary_loss_mlp": 0.01025769, + "balance_loss_clip": 1.046453, + "balance_loss_mlp": 1.01806784, + "epoch": 0.646305537185114, + "flos": 23367109564800.0, + "grad_norm": 7.561648320632024, + "language_loss": 0.90079606, + "learning_rate": 1.1747781112794837e-06, + "loss": 0.9225142, + "num_input_tokens_seen": 115668215, + "step": 5375, + "time_per_iteration": 3.248318910598755 + }, + { + "auxiliary_loss_clip": 0.0112924, + "auxiliary_loss_mlp": 0.01030201, + "balance_loss_clip": 1.04559588, + "balance_loss_mlp": 1.02258325, + "epoch": 0.646425780075753, + "flos": 24277480790400.0, + "grad_norm": 1.6435203753045864, + "language_loss": 0.83202469, + "learning_rate": 1.1740686028876487e-06, + "loss": 0.8536191, + "num_input_tokens_seen": 115687080, + "step": 5376, + "time_per_iteration": 3.341362476348877 + }, + { + "auxiliary_loss_clip": 0.01156062, + "auxiliary_loss_mlp": 0.01021432, + "balance_loss_clip": 1.04815364, + "balance_loss_mlp": 1.01399302, + "epoch": 0.6465460229663921, + "flos": 20813968800000.0, + "grad_norm": 2.2974725379405476, + "language_loss": 0.75077707, + "learning_rate": 1.1733592197889507e-06, + "loss": 0.77255201, + "num_input_tokens_seen": 115703990, + "step": 5377, + "time_per_iteration": 2.4524364471435547 + }, + { + "auxiliary_loss_clip": 0.01153556, + "auxiliary_loss_mlp": 0.01024056, + "balance_loss_clip": 1.04854667, + "balance_loss_mlp": 1.01682353, + "epoch": 0.6466662658570312, + "flos": 22853299466880.0, + "grad_norm": 2.1824986478939588, + "language_loss": 0.7254141, + "learning_rate": 1.1726499620910014e-06, + "loss": 0.74719024, + "num_input_tokens_seen": 115724270, + "step": 5378, + "time_per_iteration": 2.487321615219116 + }, + { + "auxiliary_loss_clip": 0.0115698, + "auxiliary_loss_mlp": 0.01024025, + "balance_loss_clip": 1.047454, + "balance_loss_mlp": 1.01603222, + "epoch": 0.6467865087476703, + "flos": 15304553953920.0, + "grad_norm": 2.3198620798134284, + "language_loss": 0.77462476, + "learning_rate": 1.1719408299013955e-06, + "loss": 0.79643482, + "num_input_tokens_seen": 115742995, + "step": 5379, + "time_per_iteration": 2.4595892429351807 + }, + { + "auxiliary_loss_clip": 0.01173641, + "auxiliary_loss_mlp": 0.01029033, + "balance_loss_clip": 1.05130816, + "balance_loss_mlp": 1.02154708, + "epoch": 0.6469067516383094, + "flos": 19573650218880.0, + "grad_norm": 2.1208960214723795, + "language_loss": 0.75382572, + "learning_rate": 1.1712318233277067e-06, + "loss": 0.77585244, + "num_input_tokens_seen": 115762015, + "step": 5380, + "time_per_iteration": 3.2178030014038086 + }, + { + "auxiliary_loss_clip": 0.01057996, + "auxiliary_loss_mlp": 0.01001588, + "balance_loss_clip": 1.00953174, + "balance_loss_mlp": 1.00054514, + "epoch": 0.6470269945289485, + "flos": 65098002522240.0, + "grad_norm": 0.7600785321943881, + "language_loss": 0.57922781, + "learning_rate": 1.1705229424774916e-06, + "loss": 0.59982371, + "num_input_tokens_seen": 115816285, + "step": 5381, + "time_per_iteration": 2.916477680206299 + }, + { + "auxiliary_loss_clip": 0.01141658, + "auxiliary_loss_mlp": 0.01027697, + "balance_loss_clip": 1.04482818, + "balance_loss_mlp": 1.01999867, + "epoch": 0.6471472374195876, + "flos": 30696943639680.0, + "grad_norm": 2.1388661450726745, + "language_loss": 0.63964498, + "learning_rate": 1.1698141874582867e-06, + "loss": 0.66133857, + "num_input_tokens_seen": 115837330, + "step": 5382, + "time_per_iteration": 2.564896583557129 + }, + { + "auxiliary_loss_clip": 0.01172626, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.04970622, + "balance_loss_mlp": 1.02040529, + "epoch": 0.6472674803102266, + "flos": 20521835487360.0, + "grad_norm": 1.749533963444112, + "language_loss": 0.72108608, + "learning_rate": 1.169105558377609e-06, + "loss": 0.74308777, + "num_input_tokens_seen": 115857420, + "step": 5383, + "time_per_iteration": 2.4252800941467285 + }, + { + "auxiliary_loss_clip": 0.01117026, + "auxiliary_loss_mlp": 0.00762435, + "balance_loss_clip": 1.04751706, + "balance_loss_mlp": 1.00109732, + "epoch": 0.6473877232008658, + "flos": 24715447320960.0, + "grad_norm": 1.7021263587363593, + "language_loss": 0.78557312, + "learning_rate": 1.1683970553429587e-06, + "loss": 0.80436778, + "num_input_tokens_seen": 115878875, + "step": 5384, + "time_per_iteration": 2.5999600887298584 + }, + { + "auxiliary_loss_clip": 0.01132743, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.04469776, + "balance_loss_mlp": 1.02279556, + "epoch": 0.6475079660915048, + "flos": 15885552441600.0, + "grad_norm": 2.1106573895609304, + "language_loss": 0.82086384, + "learning_rate": 1.1676886784618128e-06, + "loss": 0.84249985, + "num_input_tokens_seen": 115895540, + "step": 5385, + "time_per_iteration": 2.5115127563476562 + }, + { + "auxiliary_loss_clip": 0.01160574, + "auxiliary_loss_mlp": 0.01023909, + "balance_loss_clip": 1.05006468, + "balance_loss_mlp": 1.01617885, + "epoch": 0.6476282089821439, + "flos": 17381590922880.0, + "grad_norm": 2.0809763834376476, + "language_loss": 0.83926833, + "learning_rate": 1.1669804278416332e-06, + "loss": 0.86111313, + "num_input_tokens_seen": 115910265, + "step": 5386, + "time_per_iteration": 2.432042121887207 + }, + { + "auxiliary_loss_clip": 0.01150576, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.04828191, + "balance_loss_mlp": 1.02027071, + "epoch": 0.6477484518727831, + "flos": 20194078861440.0, + "grad_norm": 2.0094878888679264, + "language_loss": 0.71565652, + "learning_rate": 1.1662723035898602e-06, + "loss": 0.73744714, + "num_input_tokens_seen": 115930025, + "step": 5387, + "time_per_iteration": 2.548278570175171 + }, + { + "auxiliary_loss_clip": 0.01159612, + "auxiliary_loss_mlp": 0.0102541, + "balance_loss_clip": 1.04954314, + "balance_loss_mlp": 1.01763809, + "epoch": 0.6478686947634221, + "flos": 25410426641280.0, + "grad_norm": 11.675403836057779, + "language_loss": 0.81903809, + "learning_rate": 1.165564305813915e-06, + "loss": 0.84088838, + "num_input_tokens_seen": 115949025, + "step": 5388, + "time_per_iteration": 2.501854181289673 + }, + { + "auxiliary_loss_clip": 0.01158856, + "auxiliary_loss_mlp": 0.01026804, + "balance_loss_clip": 1.0478003, + "balance_loss_mlp": 1.01940107, + "epoch": 0.6479889376540612, + "flos": 20083581648000.0, + "grad_norm": 1.866533760591023, + "language_loss": 0.8148402, + "learning_rate": 1.1648564346212019e-06, + "loss": 0.83669686, + "num_input_tokens_seen": 115968145, + "step": 5389, + "time_per_iteration": 2.4607560634613037 + }, + { + "auxiliary_loss_clip": 0.01154786, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.04990947, + "balance_loss_mlp": 1.02222323, + "epoch": 0.6481091805447003, + "flos": 26758082039040.0, + "grad_norm": 2.07228319528234, + "language_loss": 0.76230508, + "learning_rate": 1.164148690119104e-06, + "loss": 0.78414953, + "num_input_tokens_seen": 115989425, + "step": 5390, + "time_per_iteration": 2.5126049518585205 + }, + { + "auxiliary_loss_clip": 0.01171286, + "auxiliary_loss_mlp": 0.0102646, + "balance_loss_clip": 1.04886103, + "balance_loss_mlp": 1.01884842, + "epoch": 0.6482294234353394, + "flos": 23952094462080.0, + "grad_norm": 1.8036953100089217, + "language_loss": 0.74146193, + "learning_rate": 1.163441072414985e-06, + "loss": 0.76343942, + "num_input_tokens_seen": 116009630, + "step": 5391, + "time_per_iteration": 2.4572296142578125 + }, + { + "auxiliary_loss_clip": 0.01161906, + "auxiliary_loss_mlp": 0.01025938, + "balance_loss_clip": 1.05066371, + "balance_loss_mlp": 1.01841879, + "epoch": 0.6483496663259785, + "flos": 26209833776640.0, + "grad_norm": 1.8688124535007686, + "language_loss": 0.69708931, + "learning_rate": 1.16273358161619e-06, + "loss": 0.7189678, + "num_input_tokens_seen": 116029965, + "step": 5392, + "time_per_iteration": 2.503835678100586 + }, + { + "auxiliary_loss_clip": 0.01157102, + "auxiliary_loss_mlp": 0.01028055, + "balance_loss_clip": 1.05145216, + "balance_loss_mlp": 1.02040803, + "epoch": 0.6484699092166175, + "flos": 20922239370240.0, + "grad_norm": 2.098980592674738, + "language_loss": 0.83250237, + "learning_rate": 1.1620262178300446e-06, + "loss": 0.85435396, + "num_input_tokens_seen": 116048580, + "step": 5393, + "time_per_iteration": 2.5089824199676514 + }, + { + "auxiliary_loss_clip": 0.01130432, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.04300427, + "balance_loss_mlp": 1.01877093, + "epoch": 0.6485901521072567, + "flos": 33072865678080.0, + "grad_norm": 1.8489637910730818, + "language_loss": 0.75890201, + "learning_rate": 1.1613189811638563e-06, + "loss": 0.78047037, + "num_input_tokens_seen": 116070305, + "step": 5394, + "time_per_iteration": 2.6279170513153076 + }, + { + "auxiliary_loss_clip": 0.01162174, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.0505867, + "balance_loss_mlp": 1.01992524, + "epoch": 0.6487103949978957, + "flos": 22274060745600.0, + "grad_norm": 1.9010688927516863, + "language_loss": 0.78083372, + "learning_rate": 1.1606118717249117e-06, + "loss": 0.80272686, + "num_input_tokens_seen": 116090405, + "step": 5395, + "time_per_iteration": 2.4823386669158936 + }, + { + "auxiliary_loss_clip": 0.01176918, + "auxiliary_loss_mlp": 0.01025314, + "balance_loss_clip": 1.04979229, + "balance_loss_mlp": 1.01745808, + "epoch": 0.6488306378885348, + "flos": 22930400010240.0, + "grad_norm": 1.9549298087818014, + "language_loss": 0.6821692, + "learning_rate": 1.1599048896204787e-06, + "loss": 0.70419145, + "num_input_tokens_seen": 116110285, + "step": 5396, + "time_per_iteration": 2.434535026550293 + }, + { + "auxiliary_loss_clip": 0.0113264, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.04475617, + "balance_loss_mlp": 1.01972413, + "epoch": 0.648950880779174, + "flos": 20376110010240.0, + "grad_norm": 1.6600065828379182, + "language_loss": 0.80585301, + "learning_rate": 1.1591980349578061e-06, + "loss": 0.82745194, + "num_input_tokens_seen": 116128955, + "step": 5397, + "time_per_iteration": 2.5245258808135986 + }, + { + "auxiliary_loss_clip": 0.01037514, + "auxiliary_loss_mlp": 0.01001746, + "balance_loss_clip": 1.00961494, + "balance_loss_mlp": 1.00053048, + "epoch": 0.649071123669813, + "flos": 59930889310080.0, + "grad_norm": 0.7356641374361531, + "language_loss": 0.54295826, + "learning_rate": 1.158491307844123e-06, + "loss": 0.5633508, + "num_input_tokens_seen": 116188875, + "step": 5398, + "time_per_iteration": 3.0657761096954346 + }, + { + "auxiliary_loss_clip": 0.01145576, + "auxiliary_loss_mlp": 0.01026387, + "balance_loss_clip": 1.04918337, + "balance_loss_mlp": 1.0187639, + "epoch": 0.6491913665604521, + "flos": 20446566537600.0, + "grad_norm": 1.6580987278327215, + "language_loss": 0.83977586, + "learning_rate": 1.1577847083866387e-06, + "loss": 0.86149549, + "num_input_tokens_seen": 116207910, + "step": 5399, + "time_per_iteration": 2.502359390258789 + }, + { + "auxiliary_loss_clip": 0.01135495, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.04595113, + "balance_loss_mlp": 1.02099204, + "epoch": 0.6493116094510912, + "flos": 16946820702720.0, + "grad_norm": 2.753795239913573, + "language_loss": 0.7196728, + "learning_rate": 1.1570782366925453e-06, + "loss": 0.7413196, + "num_input_tokens_seen": 116226425, + "step": 5400, + "time_per_iteration": 3.3001439571380615 + }, + { + "auxiliary_loss_clip": 0.01145414, + "auxiliary_loss_mlp": 0.01027044, + "balance_loss_clip": 1.04381776, + "balance_loss_mlp": 1.01927805, + "epoch": 0.6494318523417303, + "flos": 18802935072000.0, + "grad_norm": 1.785318806017555, + "language_loss": 0.75401628, + "learning_rate": 1.1563718928690132e-06, + "loss": 0.77574086, + "num_input_tokens_seen": 116243860, + "step": 5401, + "time_per_iteration": 2.4836695194244385 + }, + { + "auxiliary_loss_clip": 0.01130004, + "auxiliary_loss_mlp": 0.01026858, + "balance_loss_clip": 1.04645669, + "balance_loss_mlp": 1.01905632, + "epoch": 0.6495520952323693, + "flos": 18982847318400.0, + "grad_norm": 2.136230468307545, + "language_loss": 0.71024519, + "learning_rate": 1.1556656770231942e-06, + "loss": 0.73181379, + "num_input_tokens_seen": 116260055, + "step": 5402, + "time_per_iteration": 3.9421145915985107 + }, + { + "auxiliary_loss_clip": 0.01160403, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.04794621, + "balance_loss_mlp": 1.02112877, + "epoch": 0.6496723381230085, + "flos": 22745388032640.0, + "grad_norm": 1.5224130948570138, + "language_loss": 0.76210296, + "learning_rate": 1.1549595892622207e-06, + "loss": 0.78398949, + "num_input_tokens_seen": 116278825, + "step": 5403, + "time_per_iteration": 2.4777727127075195 + }, + { + "auxiliary_loss_clip": 0.0102136, + "auxiliary_loss_mlp": 0.01000574, + "balance_loss_clip": 1.01152515, + "balance_loss_mlp": 0.99962634, + "epoch": 0.6497925810136476, + "flos": 62145283887360.0, + "grad_norm": 0.835523651964043, + "language_loss": 0.59039128, + "learning_rate": 1.1542536296932047e-06, + "loss": 0.6106106, + "num_input_tokens_seen": 116342360, + "step": 5404, + "time_per_iteration": 3.199883460998535 + }, + { + "auxiliary_loss_clip": 0.01136679, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.0435605, + "balance_loss_mlp": 1.02314496, + "epoch": 0.6499128239042866, + "flos": 20156731695360.0, + "grad_norm": 1.8555151887086498, + "language_loss": 0.69956875, + "learning_rate": 1.1535477984232414e-06, + "loss": 0.72124964, + "num_input_tokens_seen": 116362235, + "step": 5405, + "time_per_iteration": 2.7888920307159424 + }, + { + "auxiliary_loss_clip": 0.01119948, + "auxiliary_loss_mlp": 0.01028537, + "balance_loss_clip": 1.03971934, + "balance_loss_mlp": 1.02077317, + "epoch": 0.6500330667949258, + "flos": 24462420940800.0, + "grad_norm": 2.2073198068095583, + "language_loss": 0.77199352, + "learning_rate": 1.152842095559404e-06, + "loss": 0.79347837, + "num_input_tokens_seen": 116382895, + "step": 5406, + "time_per_iteration": 2.616406202316284 + }, + { + "auxiliary_loss_clip": 0.01148484, + "auxiliary_loss_mlp": 0.0102836, + "balance_loss_clip": 1.04493117, + "balance_loss_mlp": 1.02086771, + "epoch": 0.6501533096855648, + "flos": 25477399549440.0, + "grad_norm": 1.8077029272887863, + "language_loss": 0.76755077, + "learning_rate": 1.1521365212087474e-06, + "loss": 0.78931922, + "num_input_tokens_seen": 116402880, + "step": 5407, + "time_per_iteration": 3.2845520973205566 + }, + { + "auxiliary_loss_clip": 0.01159365, + "auxiliary_loss_mlp": 0.01025317, + "balance_loss_clip": 1.04729056, + "balance_loss_mlp": 1.01742506, + "epoch": 0.6502735525762039, + "flos": 44819245347840.0, + "grad_norm": 1.5661757634844242, + "language_loss": 0.70742959, + "learning_rate": 1.1514310754783062e-06, + "loss": 0.72927642, + "num_input_tokens_seen": 116425830, + "step": 5408, + "time_per_iteration": 2.6759896278381348 + }, + { + "auxiliary_loss_clip": 0.01148795, + "auxiliary_loss_mlp": 0.01026746, + "balance_loss_clip": 1.04820752, + "balance_loss_mlp": 1.01869965, + "epoch": 0.6503937954668431, + "flos": 28658546726400.0, + "grad_norm": 2.164639354542933, + "language_loss": 0.73298717, + "learning_rate": 1.1507257584750964e-06, + "loss": 0.75474262, + "num_input_tokens_seen": 116446010, + "step": 5409, + "time_per_iteration": 2.5719566345214844 + }, + { + "auxiliary_loss_clip": 0.01173559, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.04949772, + "balance_loss_mlp": 1.0217768, + "epoch": 0.6505140383574821, + "flos": 20922562592640.0, + "grad_norm": 3.3791713220423203, + "language_loss": 0.77512765, + "learning_rate": 1.150020570306113e-06, + "loss": 0.79716021, + "num_input_tokens_seen": 116465150, + "step": 5410, + "time_per_iteration": 2.432922601699829 + }, + { + "auxiliary_loss_clip": 0.01137564, + "auxiliary_loss_mlp": 0.01025841, + "balance_loss_clip": 1.04184842, + "balance_loss_mlp": 1.01789021, + "epoch": 0.6506342812481212, + "flos": 20595236929920.0, + "grad_norm": 1.9166384614974261, + "language_loss": 0.75052446, + "learning_rate": 1.1493155110783338e-06, + "loss": 0.77215856, + "num_input_tokens_seen": 116483675, + "step": 5411, + "time_per_iteration": 2.546314239501953 + }, + { + "auxiliary_loss_clip": 0.01159045, + "auxiliary_loss_mlp": 0.01023961, + "balance_loss_clip": 1.04861021, + "balance_loss_mlp": 1.01602721, + "epoch": 0.6507545241387603, + "flos": 30226478279040.0, + "grad_norm": 2.251325371485196, + "language_loss": 0.70703006, + "learning_rate": 1.1486105808987155e-06, + "loss": 0.72886014, + "num_input_tokens_seen": 116505165, + "step": 5412, + "time_per_iteration": 2.546931505203247 + }, + { + "auxiliary_loss_clip": 0.01163777, + "auxiliary_loss_mlp": 0.0102239, + "balance_loss_clip": 1.0511806, + "balance_loss_mlp": 1.01423609, + "epoch": 0.6508747670293994, + "flos": 17128241320320.0, + "grad_norm": 1.8368916660917984, + "language_loss": 0.81199908, + "learning_rate": 1.1479057798741947e-06, + "loss": 0.83386081, + "num_input_tokens_seen": 116523220, + "step": 5413, + "time_per_iteration": 2.439532518386841 + }, + { + "auxiliary_loss_clip": 0.01050258, + "auxiliary_loss_mlp": 0.01002731, + "balance_loss_clip": 1.01696348, + "balance_loss_mlp": 1.00161076, + "epoch": 0.6509950099200384, + "flos": 68559826573440.0, + "grad_norm": 0.7884206814647335, + "language_loss": 0.53344381, + "learning_rate": 1.14720110811169e-06, + "loss": 0.55397367, + "num_input_tokens_seen": 116580450, + "step": 5414, + "time_per_iteration": 3.102006673812866 + }, + { + "auxiliary_loss_clip": 0.01164913, + "auxiliary_loss_mlp": 0.01025737, + "balance_loss_clip": 1.05002749, + "balance_loss_mlp": 1.01806557, + "epoch": 0.6511152528106776, + "flos": 22347462188160.0, + "grad_norm": 2.123156776769095, + "language_loss": 0.77022564, + "learning_rate": 1.146496565718098e-06, + "loss": 0.79213214, + "num_input_tokens_seen": 116601020, + "step": 5415, + "time_per_iteration": 2.484287977218628 + }, + { + "auxiliary_loss_clip": 0.0114516, + "auxiliary_loss_mlp": 0.01026312, + "balance_loss_clip": 1.04782856, + "balance_loss_mlp": 1.01806295, + "epoch": 0.6512354957013167, + "flos": 20522158709760.0, + "grad_norm": 2.827076642695323, + "language_loss": 0.75479102, + "learning_rate": 1.1457921528002996e-06, + "loss": 0.77650577, + "num_input_tokens_seen": 116619455, + "step": 5416, + "time_per_iteration": 2.5185136795043945 + }, + { + "auxiliary_loss_clip": 0.0117326, + "auxiliary_loss_mlp": 0.00762903, + "balance_loss_clip": 1.04918492, + "balance_loss_mlp": 1.00107646, + "epoch": 0.6513557385919557, + "flos": 32337342881280.0, + "grad_norm": 2.187560347057488, + "language_loss": 0.72344583, + "learning_rate": 1.1450878694651522e-06, + "loss": 0.74280751, + "num_input_tokens_seen": 116640020, + "step": 5417, + "time_per_iteration": 2.5622940063476562 + }, + { + "auxiliary_loss_clip": 0.01115478, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.04081357, + "balance_loss_mlp": 1.01863503, + "epoch": 0.6514759814825949, + "flos": 12093206417280.0, + "grad_norm": 2.181831430856097, + "language_loss": 0.63221824, + "learning_rate": 1.1443837158194954e-06, + "loss": 0.6536386, + "num_input_tokens_seen": 116655165, + "step": 5418, + "time_per_iteration": 2.551109552383423 + }, + { + "auxiliary_loss_clip": 0.01132557, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.05052733, + "balance_loss_mlp": 1.01921773, + "epoch": 0.651596224373234, + "flos": 22526907557760.0, + "grad_norm": 1.6456896498455376, + "language_loss": 0.74223375, + "learning_rate": 1.1436796919701484e-06, + "loss": 0.7638303, + "num_input_tokens_seen": 116673880, + "step": 5419, + "time_per_iteration": 2.5447237491607666 + }, + { + "auxiliary_loss_clip": 0.01144121, + "auxiliary_loss_mlp": 0.01023469, + "balance_loss_clip": 1.047297, + "balance_loss_mlp": 1.01581621, + "epoch": 0.651716467263873, + "flos": 27818955250560.0, + "grad_norm": 2.0719202080328927, + "language_loss": 0.61531901, + "learning_rate": 1.1429757980239115e-06, + "loss": 0.6369949, + "num_input_tokens_seen": 116694305, + "step": 5420, + "time_per_iteration": 2.565415143966675 + }, + { + "auxiliary_loss_clip": 0.01174701, + "auxiliary_loss_mlp": 0.01035719, + "balance_loss_clip": 1.04931331, + "balance_loss_mlp": 1.02737153, + "epoch": 0.6518367101545122, + "flos": 24316300414080.0, + "grad_norm": 2.3846820748067157, + "language_loss": 0.8177256, + "learning_rate": 1.1422720340875636e-06, + "loss": 0.8398298, + "num_input_tokens_seen": 116713055, + "step": 5421, + "time_per_iteration": 2.451666831970215 + }, + { + "auxiliary_loss_clip": 0.01166107, + "auxiliary_loss_mlp": 0.01025719, + "balance_loss_clip": 1.04868436, + "balance_loss_mlp": 1.01832795, + "epoch": 0.6519569530451512, + "flos": 20011939971840.0, + "grad_norm": 1.969264197428429, + "language_loss": 0.78911543, + "learning_rate": 1.1415684002678671e-06, + "loss": 0.81103367, + "num_input_tokens_seen": 116731815, + "step": 5422, + "time_per_iteration": 2.483464241027832 + }, + { + "auxiliary_loss_clip": 0.01148664, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.04547453, + "balance_loss_mlp": 1.0204761, + "epoch": 0.6520771959357903, + "flos": 21576064682880.0, + "grad_norm": 2.311956801370739, + "language_loss": 0.77836841, + "learning_rate": 1.1408648966715617e-06, + "loss": 0.80014211, + "num_input_tokens_seen": 116749335, + "step": 5423, + "time_per_iteration": 2.497129201889038 + }, + { + "auxiliary_loss_clip": 0.01144547, + "auxiliary_loss_mlp": 0.01031332, + "balance_loss_clip": 1.04270601, + "balance_loss_mlp": 1.02324951, + "epoch": 0.6521974388264293, + "flos": 22711021695360.0, + "grad_norm": 2.2944855766256356, + "language_loss": 0.72497404, + "learning_rate": 1.1401615234053683e-06, + "loss": 0.74673277, + "num_input_tokens_seen": 116768155, + "step": 5424, + "time_per_iteration": 2.539151906967163 + }, + { + "auxiliary_loss_clip": 0.01146367, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.04544079, + "balance_loss_mlp": 1.02179241, + "epoch": 0.6523176817170685, + "flos": 23002939526400.0, + "grad_norm": 1.86519857978446, + "language_loss": 0.75999641, + "learning_rate": 1.1394582805759885e-06, + "loss": 0.78176022, + "num_input_tokens_seen": 116787435, + "step": 5425, + "time_per_iteration": 2.503570079803467 + }, + { + "auxiliary_loss_clip": 0.01161536, + "auxiliary_loss_mlp": 0.01029895, + "balance_loss_clip": 1.04981303, + "balance_loss_mlp": 1.02218199, + "epoch": 0.6524379246077076, + "flos": 21688249835520.0, + "grad_norm": 1.5922145559121725, + "language_loss": 0.75872517, + "learning_rate": 1.1387551682901022e-06, + "loss": 0.78063953, + "num_input_tokens_seen": 116808040, + "step": 5426, + "time_per_iteration": 2.478416681289673 + }, + { + "auxiliary_loss_clip": 0.01128767, + "auxiliary_loss_mlp": 0.01023609, + "balance_loss_clip": 1.04453254, + "balance_loss_mlp": 1.01584816, + "epoch": 0.6525581674983466, + "flos": 19390936711680.0, + "grad_norm": 3.0816372168958357, + "language_loss": 0.70324862, + "learning_rate": 1.138052186654373e-06, + "loss": 0.72477233, + "num_input_tokens_seen": 116825510, + "step": 5427, + "time_per_iteration": 3.305699348449707 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01026814, + "balance_loss_clip": 1.04668355, + "balance_loss_mlp": 1.01834404, + "epoch": 0.6526784103889858, + "flos": 17165444832000.0, + "grad_norm": 2.1512541160279066, + "language_loss": 0.88106304, + "learning_rate": 1.1373493357754417e-06, + "loss": 0.90280461, + "num_input_tokens_seen": 116844415, + "step": 5428, + "time_per_iteration": 2.5005953311920166 + }, + { + "auxiliary_loss_clip": 0.01172524, + "auxiliary_loss_mlp": 0.01022652, + "balance_loss_clip": 1.04754806, + "balance_loss_mlp": 1.01543999, + "epoch": 0.6527986532796248, + "flos": 18989168112000.0, + "grad_norm": 1.8072078108423388, + "language_loss": 0.77033925, + "learning_rate": 1.1366466157599303e-06, + "loss": 0.79229093, + "num_input_tokens_seen": 116863690, + "step": 5429, + "time_per_iteration": 3.210881471633911 + }, + { + "auxiliary_loss_clip": 0.01115633, + "auxiliary_loss_mlp": 0.00763232, + "balance_loss_clip": 1.04338002, + "balance_loss_mlp": 1.00106692, + "epoch": 0.6529188961702639, + "flos": 14238581011200.0, + "grad_norm": 1.929021672833137, + "language_loss": 0.75798595, + "learning_rate": 1.1359440267144412e-06, + "loss": 0.77677464, + "num_input_tokens_seen": 116881145, + "step": 5430, + "time_per_iteration": 2.577997922897339 + }, + { + "auxiliary_loss_clip": 0.01161551, + "auxiliary_loss_mlp": 0.01022431, + "balance_loss_clip": 1.04798293, + "balance_loss_mlp": 1.01509094, + "epoch": 0.653039139060903, + "flos": 36682929158400.0, + "grad_norm": 1.9711789009108625, + "language_loss": 0.74547076, + "learning_rate": 1.1352415687455556e-06, + "loss": 0.7673105, + "num_input_tokens_seen": 116902405, + "step": 5431, + "time_per_iteration": 2.6058056354522705 + }, + { + "auxiliary_loss_clip": 0.01160617, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.04916382, + "balance_loss_mlp": 1.02331257, + "epoch": 0.6531593819515421, + "flos": 25376275785600.0, + "grad_norm": 2.508718942450795, + "language_loss": 0.63454962, + "learning_rate": 1.1345392419598362e-06, + "loss": 0.65646672, + "num_input_tokens_seen": 116921285, + "step": 5432, + "time_per_iteration": 2.552647829055786 + }, + { + "auxiliary_loss_clip": 0.01153271, + "auxiliary_loss_mlp": 0.0102157, + "balance_loss_clip": 1.04579377, + "balance_loss_mlp": 1.01327324, + "epoch": 0.6532796248421812, + "flos": 21178533888000.0, + "grad_norm": 2.1492656949290443, + "language_loss": 0.71726906, + "learning_rate": 1.1338370464638263e-06, + "loss": 0.73901749, + "num_input_tokens_seen": 116940685, + "step": 5433, + "time_per_iteration": 2.4677515029907227 + }, + { + "auxiliary_loss_clip": 0.01172475, + "auxiliary_loss_mlp": 0.01023467, + "balance_loss_clip": 1.04747117, + "balance_loss_mlp": 1.01563454, + "epoch": 0.6533998677328203, + "flos": 17675950878720.0, + "grad_norm": 2.305725974997432, + "language_loss": 0.63979554, + "learning_rate": 1.1331349823640474e-06, + "loss": 0.66175497, + "num_input_tokens_seen": 116958115, + "step": 5434, + "time_per_iteration": 3.117790699005127 + }, + { + "auxiliary_loss_clip": 0.01161049, + "auxiliary_loss_mlp": 0.00762185, + "balance_loss_clip": 1.0481137, + "balance_loss_mlp": 1.00120819, + "epoch": 0.6535201106234594, + "flos": 28400384701440.0, + "grad_norm": 2.2258982060761823, + "language_loss": 0.77986777, + "learning_rate": 1.132433049767003e-06, + "loss": 0.79910016, + "num_input_tokens_seen": 116976030, + "step": 5435, + "time_per_iteration": 2.5253384113311768 + }, + { + "auxiliary_loss_clip": 0.01144678, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.04642606, + "balance_loss_mlp": 1.02150965, + "epoch": 0.6536403535140984, + "flos": 23586667447680.0, + "grad_norm": 1.5586546020353713, + "language_loss": 0.8103627, + "learning_rate": 1.1317312487791748e-06, + "loss": 0.8320967, + "num_input_tokens_seen": 116997680, + "step": 5436, + "time_per_iteration": 2.564047336578369 + }, + { + "auxiliary_loss_clip": 0.01153795, + "auxiliary_loss_mlp": 0.01027898, + "balance_loss_clip": 1.04593134, + "balance_loss_mlp": 1.01982439, + "epoch": 0.6537605964047376, + "flos": 21579476474880.0, + "grad_norm": 4.415344479912204, + "language_loss": 0.73064125, + "learning_rate": 1.1310295795070253e-06, + "loss": 0.75245816, + "num_input_tokens_seen": 117017620, + "step": 5437, + "time_per_iteration": 2.4680423736572266 + }, + { + "auxiliary_loss_clip": 0.01122194, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.04334879, + "balance_loss_mlp": 1.02043653, + "epoch": 0.6538808392953767, + "flos": 26833997433600.0, + "grad_norm": 1.7633052579226394, + "language_loss": 0.80982769, + "learning_rate": 1.1303280420569982e-06, + "loss": 0.83133394, + "num_input_tokens_seen": 117039505, + "step": 5438, + "time_per_iteration": 2.5967624187469482 + }, + { + "auxiliary_loss_clip": 0.01155597, + "auxiliary_loss_mlp": 0.01023783, + "balance_loss_clip": 1.04781365, + "balance_loss_mlp": 1.015903, + "epoch": 0.6540010821860157, + "flos": 30738241301760.0, + "grad_norm": 1.9167452145243602, + "language_loss": 0.77149439, + "learning_rate": 1.1296266365355158e-06, + "loss": 0.79328817, + "num_input_tokens_seen": 117062890, + "step": 5439, + "time_per_iteration": 2.5419986248016357 + }, + { + "auxiliary_loss_clip": 0.01136908, + "auxiliary_loss_mlp": 0.01026098, + "balance_loss_clip": 1.04751301, + "balance_loss_mlp": 1.01763463, + "epoch": 0.6541213250766549, + "flos": 26907147480960.0, + "grad_norm": 4.617204007908902, + "language_loss": 0.73525333, + "learning_rate": 1.1289253630489806e-06, + "loss": 0.75688344, + "num_input_tokens_seen": 117083940, + "step": 5440, + "time_per_iteration": 2.6022086143493652 + }, + { + "auxiliary_loss_clip": 0.01163958, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.04696167, + "balance_loss_mlp": 1.02190876, + "epoch": 0.6542415679672939, + "flos": 19172384409600.0, + "grad_norm": 3.990340208749632, + "language_loss": 0.72792816, + "learning_rate": 1.1282242217037753e-06, + "loss": 0.74987042, + "num_input_tokens_seen": 117101440, + "step": 5441, + "time_per_iteration": 2.4585506916046143 + }, + { + "auxiliary_loss_clip": 0.01113586, + "auxiliary_loss_mlp": 0.01025583, + "balance_loss_clip": 1.04035091, + "balance_loss_mlp": 1.01755989, + "epoch": 0.654361810857933, + "flos": 48173517100800.0, + "grad_norm": 2.063288583467447, + "language_loss": 0.61656106, + "learning_rate": 1.127523212606262e-06, + "loss": 0.6379528, + "num_input_tokens_seen": 117124265, + "step": 5442, + "time_per_iteration": 2.8261349201202393 + }, + { + "auxiliary_loss_clip": 0.01158375, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.04808164, + "balance_loss_mlp": 1.0183866, + "epoch": 0.6544820537485722, + "flos": 26943165843840.0, + "grad_norm": 2.110315982976855, + "language_loss": 0.73223138, + "learning_rate": 1.1268223358627835e-06, + "loss": 0.75407839, + "num_input_tokens_seen": 117146755, + "step": 5443, + "time_per_iteration": 2.519963264465332 + }, + { + "auxiliary_loss_clip": 0.01174198, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.04852057, + "balance_loss_mlp": 1.01772618, + "epoch": 0.6546022966392112, + "flos": 20886328748160.0, + "grad_norm": 1.9698038994214426, + "language_loss": 0.72286701, + "learning_rate": 1.126121591579663e-06, + "loss": 0.74486744, + "num_input_tokens_seen": 117165960, + "step": 5444, + "time_per_iteration": 2.4613261222839355 + }, + { + "auxiliary_loss_clip": 0.01156317, + "auxiliary_loss_mlp": 0.01025453, + "balance_loss_clip": 1.04875731, + "balance_loss_mlp": 1.01780593, + "epoch": 0.6547225395298503, + "flos": 24936693143040.0, + "grad_norm": 1.9566833830854677, + "language_loss": 0.687729, + "learning_rate": 1.1254209798632018e-06, + "loss": 0.70954669, + "num_input_tokens_seen": 117186980, + "step": 5445, + "time_per_iteration": 2.509918212890625 + }, + { + "auxiliary_loss_clip": 0.01091565, + "auxiliary_loss_mlp": 0.01023601, + "balance_loss_clip": 1.04024816, + "balance_loss_mlp": 1.015715, + "epoch": 0.6548427824204894, + "flos": 22565942663040.0, + "grad_norm": 1.8485714872091685, + "language_loss": 0.84963393, + "learning_rate": 1.124720500819683e-06, + "loss": 0.87078553, + "num_input_tokens_seen": 117205135, + "step": 5446, + "time_per_iteration": 2.6624562740325928 + }, + { + "auxiliary_loss_clip": 0.01178363, + "auxiliary_loss_mlp": 0.01031349, + "balance_loss_clip": 1.05263615, + "balance_loss_mlp": 1.02324247, + "epoch": 0.6549630253111285, + "flos": 18442500048000.0, + "grad_norm": 2.4420386525312154, + "language_loss": 0.82528132, + "learning_rate": 1.1240201545553682e-06, + "loss": 0.84737843, + "num_input_tokens_seen": 117222935, + "step": 5447, + "time_per_iteration": 2.415020227432251 + }, + { + "auxiliary_loss_clip": 0.01129942, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_clip": 1.04524744, + "balance_loss_mlp": 1.0195756, + "epoch": 0.6550832682017675, + "flos": 25187313312000.0, + "grad_norm": 1.8356436052323213, + "language_loss": 0.73285329, + "learning_rate": 1.1233199411764987e-06, + "loss": 0.75442636, + "num_input_tokens_seen": 117242370, + "step": 5448, + "time_per_iteration": 2.586167335510254 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01026581, + "balance_loss_clip": 1.04175925, + "balance_loss_mlp": 1.01896322, + "epoch": 0.6552035110924067, + "flos": 22748153379840.0, + "grad_norm": 2.3794934098728038, + "language_loss": 0.68715286, + "learning_rate": 1.1226198607892978e-06, + "loss": 0.70860952, + "num_input_tokens_seen": 117262930, + "step": 5449, + "time_per_iteration": 2.5635664463043213 + }, + { + "auxiliary_loss_clip": 0.01121087, + "auxiliary_loss_mlp": 0.01023894, + "balance_loss_clip": 1.04596961, + "balance_loss_mlp": 1.01608253, + "epoch": 0.6553237539830458, + "flos": 21799178012160.0, + "grad_norm": 1.778838071843153, + "language_loss": 0.79980719, + "learning_rate": 1.1219199134999664e-06, + "loss": 0.821257, + "num_input_tokens_seen": 117281430, + "step": 5450, + "time_per_iteration": 2.5955514907836914 + }, + { + "auxiliary_loss_clip": 0.01148721, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.0471735, + "balance_loss_mlp": 1.02099276, + "epoch": 0.6554439968736848, + "flos": 20887226588160.0, + "grad_norm": 2.86446139174491, + "language_loss": 0.78553289, + "learning_rate": 1.1212200994146863e-06, + "loss": 0.80731869, + "num_input_tokens_seen": 117299185, + "step": 5451, + "time_per_iteration": 2.5009078979492188 + }, + { + "auxiliary_loss_clip": 0.01128321, + "auxiliary_loss_mlp": 0.010271, + "balance_loss_clip": 1.0406853, + "balance_loss_mlp": 1.01957142, + "epoch": 0.655564239764324, + "flos": 16139045698560.0, + "grad_norm": 1.9197964806945749, + "language_loss": 0.75426066, + "learning_rate": 1.120520418639618e-06, + "loss": 0.77581489, + "num_input_tokens_seen": 117317720, + "step": 5452, + "time_per_iteration": 2.560969114303589 + }, + { + "auxiliary_loss_clip": 0.01161127, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.05015492, + "balance_loss_mlp": 1.02197325, + "epoch": 0.655684482654963, + "flos": 29570354496000.0, + "grad_norm": 2.0101078255381286, + "language_loss": 0.83424151, + "learning_rate": 1.119820871280903e-06, + "loss": 0.85614461, + "num_input_tokens_seen": 117338795, + "step": 5453, + "time_per_iteration": 2.611057996749878 + }, + { + "auxiliary_loss_clip": 0.01158584, + "auxiliary_loss_mlp": 0.01026098, + "balance_loss_clip": 1.04789615, + "balance_loss_mlp": 1.01843905, + "epoch": 0.6558047255456021, + "flos": 29789409588480.0, + "grad_norm": 2.057473950240316, + "language_loss": 0.7351371, + "learning_rate": 1.1191214574446614e-06, + "loss": 0.75698394, + "num_input_tokens_seen": 117359040, + "step": 5454, + "time_per_iteration": 3.544079065322876 + }, + { + "auxiliary_loss_clip": 0.01139787, + "auxiliary_loss_mlp": 0.01026533, + "balance_loss_clip": 1.04465389, + "balance_loss_mlp": 1.01849234, + "epoch": 0.6559249684362413, + "flos": 29059166090880.0, + "grad_norm": 1.4829852035507158, + "language_loss": 0.80021894, + "learning_rate": 1.118422177236995e-06, + "loss": 0.82188213, + "num_input_tokens_seen": 117380865, + "step": 5455, + "time_per_iteration": 4.1753175258636475 + }, + { + "auxiliary_loss_clip": 0.01149241, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.04699373, + "balance_loss_mlp": 1.0202862, + "epoch": 0.6560452113268803, + "flos": 20225464369920.0, + "grad_norm": 2.5895066664401685, + "language_loss": 0.85566616, + "learning_rate": 1.1177230307639835e-06, + "loss": 0.87744617, + "num_input_tokens_seen": 117398405, + "step": 5456, + "time_per_iteration": 2.5324628353118896 + }, + { + "auxiliary_loss_clip": 0.01127603, + "auxiliary_loss_mlp": 0.01025091, + "balance_loss_clip": 1.04374051, + "balance_loss_mlp": 1.01731849, + "epoch": 0.6561654542175194, + "flos": 25045538330880.0, + "grad_norm": 1.7629800397392763, + "language_loss": 0.78708327, + "learning_rate": 1.1170240181316865e-06, + "loss": 0.8086102, + "num_input_tokens_seen": 117419850, + "step": 5457, + "time_per_iteration": 2.578169822692871 + }, + { + "auxiliary_loss_clip": 0.0112675, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.04116833, + "balance_loss_mlp": 1.0170176, + "epoch": 0.6562856971081584, + "flos": 22856711258880.0, + "grad_norm": 2.2052537868348594, + "language_loss": 0.79210258, + "learning_rate": 1.1163251394461442e-06, + "loss": 0.81361765, + "num_input_tokens_seen": 117438330, + "step": 5458, + "time_per_iteration": 2.588817834854126 + }, + { + "auxiliary_loss_clip": 0.01157035, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.04786491, + "balance_loss_mlp": 1.01891351, + "epoch": 0.6564059399987976, + "flos": 18872565586560.0, + "grad_norm": 2.48886252988734, + "language_loss": 0.82328165, + "learning_rate": 1.1156263948133746e-06, + "loss": 0.84511995, + "num_input_tokens_seen": 117454985, + "step": 5459, + "time_per_iteration": 2.441352367401123 + }, + { + "auxiliary_loss_clip": 0.01110745, + "auxiliary_loss_mlp": 0.00762694, + "balance_loss_clip": 1.04369211, + "balance_loss_mlp": 1.00105429, + "epoch": 0.6565261828894366, + "flos": 25484187219840.0, + "grad_norm": 2.0655004037898643, + "language_loss": 0.77726924, + "learning_rate": 1.1149277843393787e-06, + "loss": 0.79600364, + "num_input_tokens_seen": 117476145, + "step": 5460, + "time_per_iteration": 2.673271417617798 + }, + { + "auxiliary_loss_clip": 0.01098117, + "auxiliary_loss_mlp": 0.0076284, + "balance_loss_clip": 1.03709435, + "balance_loss_mlp": 1.00102043, + "epoch": 0.6566464257800757, + "flos": 19683500987520.0, + "grad_norm": 2.1209624253067, + "language_loss": 0.63608605, + "learning_rate": 1.1142293081301342e-06, + "loss": 0.65469551, + "num_input_tokens_seen": 117494025, + "step": 5461, + "time_per_iteration": 3.3785061836242676 + }, + { + "auxiliary_loss_clip": 0.01141486, + "auxiliary_loss_mlp": 0.01022428, + "balance_loss_clip": 1.04669714, + "balance_loss_mlp": 1.0153712, + "epoch": 0.6567666686707149, + "flos": 23514127931520.0, + "grad_norm": 1.726993808717716, + "language_loss": 0.67938066, + "learning_rate": 1.1135309662915995e-06, + "loss": 0.70101976, + "num_input_tokens_seen": 117514190, + "step": 5462, + "time_per_iteration": 2.593794822692871 + }, + { + "auxiliary_loss_clip": 0.01124051, + "auxiliary_loss_mlp": 0.01024947, + "balance_loss_clip": 1.0429728, + "balance_loss_mlp": 1.01726997, + "epoch": 0.6568869115613539, + "flos": 32781342896640.0, + "grad_norm": 2.195156572695068, + "language_loss": 0.60386992, + "learning_rate": 1.112832758929712e-06, + "loss": 0.62535989, + "num_input_tokens_seen": 117536800, + "step": 5463, + "time_per_iteration": 2.7438831329345703 + }, + { + "auxiliary_loss_clip": 0.01156268, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.04785907, + "balance_loss_mlp": 1.0236764, + "epoch": 0.657007154451993, + "flos": 18442428220800.0, + "grad_norm": 1.7778915100486914, + "language_loss": 0.75197989, + "learning_rate": 1.11213468615039e-06, + "loss": 0.77385831, + "num_input_tokens_seen": 117556230, + "step": 5464, + "time_per_iteration": 2.5201876163482666 + }, + { + "auxiliary_loss_clip": 0.01103844, + "auxiliary_loss_mlp": 0.0102617, + "balance_loss_clip": 1.04305696, + "balance_loss_mlp": 1.01887417, + "epoch": 0.6571273973426321, + "flos": 25156717902720.0, + "grad_norm": 1.6868803985607976, + "language_loss": 0.75243109, + "learning_rate": 1.1114367480595292e-06, + "loss": 0.77373123, + "num_input_tokens_seen": 117577310, + "step": 5465, + "time_per_iteration": 2.639098644256592 + }, + { + "auxiliary_loss_clip": 0.01101751, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.04461694, + "balance_loss_mlp": 1.02010596, + "epoch": 0.6572476402332712, + "flos": 17529830352000.0, + "grad_norm": 2.0940412186115878, + "language_loss": 0.81428528, + "learning_rate": 1.1107389447630086e-06, + "loss": 0.83558518, + "num_input_tokens_seen": 117596010, + "step": 5466, + "time_per_iteration": 2.5805253982543945 + }, + { + "auxiliary_loss_clip": 0.01140125, + "auxiliary_loss_mlp": 0.00761973, + "balance_loss_clip": 1.04345632, + "balance_loss_mlp": 1.00114298, + "epoch": 0.6573678831239103, + "flos": 17014260487680.0, + "grad_norm": 2.1277725823023976, + "language_loss": 0.78288114, + "learning_rate": 1.1100412763666818e-06, + "loss": 0.80190212, + "num_input_tokens_seen": 117611270, + "step": 5467, + "time_per_iteration": 2.4889376163482666 + }, + { + "auxiliary_loss_clip": 0.01145718, + "auxiliary_loss_mlp": 0.01021363, + "balance_loss_clip": 1.04649615, + "balance_loss_mlp": 1.01383495, + "epoch": 0.6574881260145494, + "flos": 23910078528000.0, + "grad_norm": 3.5657039484382644, + "language_loss": 0.7999177, + "learning_rate": 1.1093437429763865e-06, + "loss": 0.82158846, + "num_input_tokens_seen": 117631535, + "step": 5468, + "time_per_iteration": 2.534446954727173 + }, + { + "auxiliary_loss_clip": 0.01159861, + "auxiliary_loss_mlp": 0.01021967, + "balance_loss_clip": 1.0494566, + "balance_loss_mlp": 1.01469541, + "epoch": 0.6576083689051885, + "flos": 11218458504960.0, + "grad_norm": 1.9892930228751582, + "language_loss": 0.73769724, + "learning_rate": 1.1086463446979361e-06, + "loss": 0.75951552, + "num_input_tokens_seen": 117649885, + "step": 5469, + "time_per_iteration": 2.4744954109191895 + }, + { + "auxiliary_loss_clip": 0.01163369, + "auxiliary_loss_mlp": 0.01024429, + "balance_loss_clip": 1.05142212, + "balance_loss_mlp": 1.01718712, + "epoch": 0.6577286117958275, + "flos": 22455553190400.0, + "grad_norm": 2.0186989825188597, + "language_loss": 0.77614915, + "learning_rate": 1.1079490816371277e-06, + "loss": 0.79802716, + "num_input_tokens_seen": 117669650, + "step": 5470, + "time_per_iteration": 2.5268166065216064 + }, + { + "auxiliary_loss_clip": 0.01160736, + "auxiliary_loss_mlp": 0.00762403, + "balance_loss_clip": 1.04756808, + "balance_loss_mlp": 1.00098205, + "epoch": 0.6578488546864667, + "flos": 21872184405120.0, + "grad_norm": 2.0083998533386174, + "language_loss": 0.74426007, + "learning_rate": 1.1072519538997352e-06, + "loss": 0.76349145, + "num_input_tokens_seen": 117688790, + "step": 5471, + "time_per_iteration": 2.4930431842803955 + }, + { + "auxiliary_loss_clip": 0.01146732, + "auxiliary_loss_mlp": 0.01023385, + "balance_loss_clip": 1.04384899, + "balance_loss_mlp": 1.01608372, + "epoch": 0.6579690975771058, + "flos": 23543753673600.0, + "grad_norm": 1.7082017098586062, + "language_loss": 0.8231461, + "learning_rate": 1.1065549615915095e-06, + "loss": 0.8448472, + "num_input_tokens_seen": 117708620, + "step": 5472, + "time_per_iteration": 2.5863561630249023 + }, + { + "auxiliary_loss_clip": 0.01160494, + "auxiliary_loss_mlp": 0.01025682, + "balance_loss_clip": 1.05224562, + "balance_loss_mlp": 1.01767683, + "epoch": 0.6580893404677448, + "flos": 32743995730560.0, + "grad_norm": 3.495212502271767, + "language_loss": 0.78568554, + "learning_rate": 1.105858104818187e-06, + "loss": 0.80754721, + "num_input_tokens_seen": 117729775, + "step": 5473, + "time_per_iteration": 2.6140172481536865 + }, + { + "auxiliary_loss_clip": 0.01162119, + "auxiliary_loss_mlp": 0.01024643, + "balance_loss_clip": 1.04855967, + "balance_loss_mlp": 1.01644135, + "epoch": 0.658209583358384, + "flos": 15888138220800.0, + "grad_norm": 2.8189414528279064, + "language_loss": 0.74874592, + "learning_rate": 1.105161383685478e-06, + "loss": 0.77061355, + "num_input_tokens_seen": 117746160, + "step": 5474, + "time_per_iteration": 2.4362614154815674 + }, + { + "auxiliary_loss_clip": 0.01037472, + "auxiliary_loss_mlp": 0.01000796, + "balance_loss_clip": 1.01180589, + "balance_loss_mlp": 0.99969882, + "epoch": 0.658329826249023, + "flos": 62695902447360.0, + "grad_norm": 0.7309770306137011, + "language_loss": 0.56355834, + "learning_rate": 1.1044647982990771e-06, + "loss": 0.58394104, + "num_input_tokens_seen": 117808045, + "step": 5475, + "time_per_iteration": 3.0608134269714355 + }, + { + "auxiliary_loss_clip": 0.0114782, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.04794931, + "balance_loss_mlp": 1.01782835, + "epoch": 0.6584500691396621, + "flos": 31722624501120.0, + "grad_norm": 2.200530904164096, + "language_loss": 0.64012581, + "learning_rate": 1.1037683487646536e-06, + "loss": 0.66186309, + "num_input_tokens_seen": 117828330, + "step": 5476, + "time_per_iteration": 2.6073076725006104 + }, + { + "auxiliary_loss_clip": 0.01143994, + "auxiliary_loss_mlp": 0.0076262, + "balance_loss_clip": 1.04867852, + "balance_loss_mlp": 1.00101781, + "epoch": 0.6585703120303013, + "flos": 18406086635520.0, + "grad_norm": 1.7116438970877108, + "language_loss": 0.76681453, + "learning_rate": 1.1030720351878583e-06, + "loss": 0.78588068, + "num_input_tokens_seen": 117846450, + "step": 5477, + "time_per_iteration": 2.4999499320983887 + }, + { + "auxiliary_loss_clip": 0.01054563, + "auxiliary_loss_mlp": 0.01001509, + "balance_loss_clip": 1.01437521, + "balance_loss_mlp": 1.00038838, + "epoch": 0.6586905549209403, + "flos": 58309880434560.0, + "grad_norm": 0.806695359385477, + "language_loss": 0.57673049, + "learning_rate": 1.102375857674323e-06, + "loss": 0.59729123, + "num_input_tokens_seen": 117908365, + "step": 5478, + "time_per_iteration": 3.0373473167419434 + }, + { + "auxiliary_loss_clip": 0.01145202, + "auxiliary_loss_mlp": 0.01023226, + "balance_loss_clip": 1.04587674, + "balance_loss_mlp": 1.01580524, + "epoch": 0.6588107978115794, + "flos": 22782627457920.0, + "grad_norm": 1.643065355573079, + "language_loss": 0.90543914, + "learning_rate": 1.1016798163296561e-06, + "loss": 0.92712343, + "num_input_tokens_seen": 117927565, + "step": 5479, + "time_per_iteration": 2.503437042236328 + }, + { + "auxiliary_loss_clip": 0.01161503, + "auxiliary_loss_mlp": 0.0102217, + "balance_loss_clip": 1.04804611, + "balance_loss_mlp": 1.01429605, + "epoch": 0.6589310407022185, + "flos": 20667525050880.0, + "grad_norm": 2.1506661943465555, + "language_loss": 0.66098642, + "learning_rate": 1.1009839112594471e-06, + "loss": 0.68282318, + "num_input_tokens_seen": 117945590, + "step": 5480, + "time_per_iteration": 3.193179130554199 + }, + { + "auxiliary_loss_clip": 0.01162811, + "auxiliary_loss_mlp": 0.01030195, + "balance_loss_clip": 1.04904056, + "balance_loss_mlp": 1.02229118, + "epoch": 0.6590512835928576, + "flos": 25630595055360.0, + "grad_norm": 2.2025340939578215, + "language_loss": 0.71746224, + "learning_rate": 1.1002881425692638e-06, + "loss": 0.73939228, + "num_input_tokens_seen": 117966020, + "step": 5481, + "time_per_iteration": 3.2875547409057617 + }, + { + "auxiliary_loss_clip": 0.01153229, + "auxiliary_loss_mlp": 0.01022526, + "balance_loss_clip": 1.04570329, + "balance_loss_mlp": 1.01489043, + "epoch": 0.6591715264834966, + "flos": 23726108044800.0, + "grad_norm": 1.6540419609355317, + "language_loss": 0.75592911, + "learning_rate": 1.0995925103646532e-06, + "loss": 0.77768672, + "num_input_tokens_seen": 117984620, + "step": 5482, + "time_per_iteration": 3.2361950874328613 + }, + { + "auxiliary_loss_clip": 0.01129096, + "auxiliary_loss_mlp": 0.01024025, + "balance_loss_clip": 1.04743946, + "balance_loss_mlp": 1.01616955, + "epoch": 0.6592917693741358, + "flos": 35773850822400.0, + "grad_norm": 1.8191573864171375, + "language_loss": 0.66854304, + "learning_rate": 1.0988970147511437e-06, + "loss": 0.69007427, + "num_input_tokens_seen": 118006500, + "step": 5483, + "time_per_iteration": 2.6714370250701904 + }, + { + "auxiliary_loss_clip": 0.0114705, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.04896367, + "balance_loss_mlp": 1.0199852, + "epoch": 0.6594120122647749, + "flos": 21396834794880.0, + "grad_norm": 2.028270439289079, + "language_loss": 0.80333781, + "learning_rate": 1.0982016558342405e-06, + "loss": 0.82508928, + "num_input_tokens_seen": 118025470, + "step": 5484, + "time_per_iteration": 2.51316499710083 + }, + { + "auxiliary_loss_clip": 0.01174191, + "auxiliary_loss_mlp": 0.01027821, + "balance_loss_clip": 1.0504967, + "balance_loss_mlp": 1.02043653, + "epoch": 0.6595322551554139, + "flos": 19351829779200.0, + "grad_norm": 3.91019240763876, + "language_loss": 0.71394634, + "learning_rate": 1.0975064337194291e-06, + "loss": 0.7359665, + "num_input_tokens_seen": 118043515, + "step": 5485, + "time_per_iteration": 2.4505248069763184 + }, + { + "auxiliary_loss_clip": 0.01123888, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.0428828, + "balance_loss_mlp": 1.02870619, + "epoch": 0.6596524980460531, + "flos": 16837113588480.0, + "grad_norm": 1.6016922523783514, + "language_loss": 0.70457345, + "learning_rate": 1.0968113485121743e-06, + "loss": 0.72617692, + "num_input_tokens_seen": 118063105, + "step": 5486, + "time_per_iteration": 2.5904934406280518 + }, + { + "auxiliary_loss_clip": 0.01159935, + "auxiliary_loss_mlp": 0.0076267, + "balance_loss_clip": 1.04628158, + "balance_loss_mlp": 1.00090361, + "epoch": 0.6597727409366921, + "flos": 21798567480960.0, + "grad_norm": 1.922837244347142, + "language_loss": 0.80199838, + "learning_rate": 1.0961164003179185e-06, + "loss": 0.82122445, + "num_input_tokens_seen": 118081615, + "step": 5487, + "time_per_iteration": 3.2382595539093018 + }, + { + "auxiliary_loss_clip": 0.01129806, + "auxiliary_loss_mlp": 0.01024461, + "balance_loss_clip": 1.04421413, + "balance_loss_mlp": 1.01644981, + "epoch": 0.6598929838273312, + "flos": 23730704985600.0, + "grad_norm": 2.221997594434983, + "language_loss": 0.84660769, + "learning_rate": 1.0954215892420884e-06, + "loss": 0.86815041, + "num_input_tokens_seen": 118102315, + "step": 5488, + "time_per_iteration": 2.5458362102508545 + }, + { + "auxiliary_loss_clip": 0.01136112, + "auxiliary_loss_mlp": 0.01033373, + "balance_loss_clip": 1.04719865, + "balance_loss_mlp": 1.02479625, + "epoch": 0.6600132267179702, + "flos": 19974520978560.0, + "grad_norm": 1.7474081920349414, + "language_loss": 0.70356679, + "learning_rate": 1.094726915390082e-06, + "loss": 0.72526163, + "num_input_tokens_seen": 118120650, + "step": 5489, + "time_per_iteration": 2.51912260055542 + }, + { + "auxiliary_loss_clip": 0.01161445, + "auxiliary_loss_mlp": 0.01026181, + "balance_loss_clip": 1.04946327, + "balance_loss_mlp": 1.01859903, + "epoch": 0.6601334696086094, + "flos": 22342649765760.0, + "grad_norm": 1.700393390421264, + "language_loss": 0.69545817, + "learning_rate": 1.0940323788672836e-06, + "loss": 0.71733445, + "num_input_tokens_seen": 118139825, + "step": 5490, + "time_per_iteration": 2.464287519454956 + }, + { + "auxiliary_loss_clip": 0.01156076, + "auxiliary_loss_mlp": 0.01024501, + "balance_loss_clip": 1.04826713, + "balance_loss_mlp": 1.01679373, + "epoch": 0.6602537124992485, + "flos": 25703098657920.0, + "grad_norm": 1.6425676815723744, + "language_loss": 0.73799878, + "learning_rate": 1.0933379797790522e-06, + "loss": 0.75980455, + "num_input_tokens_seen": 118159240, + "step": 5491, + "time_per_iteration": 2.513073444366455 + }, + { + "auxiliary_loss_clip": 0.01175076, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.05081022, + "balance_loss_mlp": 1.02032375, + "epoch": 0.6603739553898875, + "flos": 25848572739840.0, + "grad_norm": 2.720755397856245, + "language_loss": 0.71519947, + "learning_rate": 1.0926437182307293e-06, + "loss": 0.7372331, + "num_input_tokens_seen": 118178050, + "step": 5492, + "time_per_iteration": 2.4636995792388916 + }, + { + "auxiliary_loss_clip": 0.01147764, + "auxiliary_loss_mlp": 0.01027716, + "balance_loss_clip": 1.04507661, + "balance_loss_mlp": 1.02027106, + "epoch": 0.6604941982805267, + "flos": 24570296461440.0, + "grad_norm": 2.164469026151461, + "language_loss": 0.77862775, + "learning_rate": 1.0919495943276338e-06, + "loss": 0.80038255, + "num_input_tokens_seen": 118199070, + "step": 5493, + "time_per_iteration": 2.5393922328948975 + }, + { + "auxiliary_loss_clip": 0.01132115, + "auxiliary_loss_mlp": 0.01025348, + "balance_loss_clip": 1.04076338, + "balance_loss_mlp": 1.01674068, + "epoch": 0.6606144411711657, + "flos": 13261775581440.0, + "grad_norm": 2.5263599606303586, + "language_loss": 0.76448774, + "learning_rate": 1.0912556081750611e-06, + "loss": 0.78606236, + "num_input_tokens_seen": 118217000, + "step": 5494, + "time_per_iteration": 2.4930169582366943 + }, + { + "auxiliary_loss_clip": 0.01144125, + "auxiliary_loss_mlp": 0.01027797, + "balance_loss_clip": 1.04895222, + "balance_loss_mlp": 1.02055526, + "epoch": 0.6607346840618048, + "flos": 25155281358720.0, + "grad_norm": 1.8977469051854758, + "language_loss": 0.76343942, + "learning_rate": 1.0905617598782909e-06, + "loss": 0.78515863, + "num_input_tokens_seen": 118237205, + "step": 5495, + "time_per_iteration": 2.5228986740112305 + }, + { + "auxiliary_loss_clip": 0.01110812, + "auxiliary_loss_mlp": 0.01026794, + "balance_loss_clip": 1.04326653, + "balance_loss_mlp": 1.01957631, + "epoch": 0.660854926952444, + "flos": 17638029095040.0, + "grad_norm": 2.172258766892755, + "language_loss": 0.8154791, + "learning_rate": 1.0898680495425775e-06, + "loss": 0.83685517, + "num_input_tokens_seen": 118255495, + "step": 5496, + "time_per_iteration": 2.5399909019470215 + }, + { + "auxiliary_loss_clip": 0.01150339, + "auxiliary_loss_mlp": 0.01026071, + "balance_loss_clip": 1.04861701, + "balance_loss_mlp": 1.01838446, + "epoch": 0.660975169843083, + "flos": 16836000266880.0, + "grad_norm": 13.645054790232495, + "language_loss": 0.80539209, + "learning_rate": 1.0891744772731594e-06, + "loss": 0.82715619, + "num_input_tokens_seen": 118273310, + "step": 5497, + "time_per_iteration": 2.5107312202453613 + }, + { + "auxiliary_loss_clip": 0.01161506, + "auxiliary_loss_mlp": 0.01026988, + "balance_loss_clip": 1.04801774, + "balance_loss_mlp": 1.01951408, + "epoch": 0.6610954127337221, + "flos": 26870410846080.0, + "grad_norm": 1.9180596940267276, + "language_loss": 0.6589061, + "learning_rate": 1.088481043175248e-06, + "loss": 0.68079108, + "num_input_tokens_seen": 118293880, + "step": 5498, + "time_per_iteration": 2.517176389694214 + }, + { + "auxiliary_loss_clip": 0.01132712, + "auxiliary_loss_mlp": 0.01024907, + "balance_loss_clip": 1.04116464, + "balance_loss_mlp": 1.01703954, + "epoch": 0.6612156556243612, + "flos": 26465697331200.0, + "grad_norm": 1.687998040491481, + "language_loss": 0.75701183, + "learning_rate": 1.0877877473540368e-06, + "loss": 0.77858806, + "num_input_tokens_seen": 118314465, + "step": 5499, + "time_per_iteration": 2.5562214851379395 + }, + { + "auxiliary_loss_clip": 0.01174786, + "auxiliary_loss_mlp": 0.01023747, + "balance_loss_clip": 1.04916668, + "balance_loss_mlp": 1.01635647, + "epoch": 0.6613358985150003, + "flos": 19791915212160.0, + "grad_norm": 1.859115423867326, + "language_loss": 0.72344542, + "learning_rate": 1.0870945899147002e-06, + "loss": 0.74543071, + "num_input_tokens_seen": 118331110, + "step": 5500, + "time_per_iteration": 2.4212169647216797 + }, + { + "auxiliary_loss_clip": 0.01158296, + "auxiliary_loss_mlp": 0.01028458, + "balance_loss_clip": 1.04963517, + "balance_loss_mlp": 1.02121878, + "epoch": 0.6614561414056394, + "flos": 26831627136000.0, + "grad_norm": 1.796248743972622, + "language_loss": 0.76270354, + "learning_rate": 1.0864015709623879e-06, + "loss": 0.78457111, + "num_input_tokens_seen": 118351980, + "step": 5501, + "time_per_iteration": 2.509178876876831 + }, + { + "auxiliary_loss_clip": 0.01162243, + "auxiliary_loss_mlp": 0.0102488, + "balance_loss_clip": 1.04757893, + "balance_loss_mlp": 1.01763225, + "epoch": 0.6615763842962785, + "flos": 22894597128960.0, + "grad_norm": 2.2554514713007774, + "language_loss": 0.80521894, + "learning_rate": 1.0857086906022313e-06, + "loss": 0.8270902, + "num_input_tokens_seen": 118370315, + "step": 5502, + "time_per_iteration": 2.4733810424804688 + }, + { + "auxiliary_loss_clip": 0.01091479, + "auxiliary_loss_mlp": 0.01023457, + "balance_loss_clip": 1.04230702, + "balance_loss_mlp": 1.0155772, + "epoch": 0.6616966271869176, + "flos": 24790321221120.0, + "grad_norm": 2.7732087297479437, + "language_loss": 0.73216212, + "learning_rate": 1.0850159489393388e-06, + "loss": 0.75331151, + "num_input_tokens_seen": 118389575, + "step": 5503, + "time_per_iteration": 2.6229701042175293 + }, + { + "auxiliary_loss_clip": 0.01121855, + "auxiliary_loss_mlp": 0.0102591, + "balance_loss_clip": 1.03943026, + "balance_loss_mlp": 1.01793504, + "epoch": 0.6618168700775566, + "flos": 17202109639680.0, + "grad_norm": 1.901436383537696, + "language_loss": 0.82166064, + "learning_rate": 1.0843233460787992e-06, + "loss": 0.84313834, + "num_input_tokens_seen": 118406790, + "step": 5504, + "time_per_iteration": 2.5069234371185303 + }, + { + "auxiliary_loss_clip": 0.01117803, + "auxiliary_loss_mlp": 0.01026456, + "balance_loss_clip": 1.04487944, + "balance_loss_mlp": 1.01885664, + "epoch": 0.6619371129681958, + "flos": 25447091448960.0, + "grad_norm": 2.331993142884519, + "language_loss": 0.78147203, + "learning_rate": 1.0836308821256805e-06, + "loss": 0.80291462, + "num_input_tokens_seen": 118427590, + "step": 5505, + "time_per_iteration": 2.573092460632324 + }, + { + "auxiliary_loss_clip": 0.01160446, + "auxiliary_loss_mlp": 0.01027639, + "balance_loss_clip": 1.04980707, + "balance_loss_mlp": 1.02046824, + "epoch": 0.6620573558588349, + "flos": 18040444139520.0, + "grad_norm": 1.9804038160364377, + "language_loss": 0.77854532, + "learning_rate": 1.0829385571850282e-06, + "loss": 0.80042619, + "num_input_tokens_seen": 118444570, + "step": 5506, + "time_per_iteration": 3.1942214965820312 + }, + { + "auxiliary_loss_clip": 0.01178526, + "auxiliary_loss_mlp": 0.01025755, + "balance_loss_clip": 1.05043519, + "balance_loss_mlp": 1.01765513, + "epoch": 0.6621775987494739, + "flos": 17785586165760.0, + "grad_norm": 2.8288949214189802, + "language_loss": 0.83549726, + "learning_rate": 1.0822463713618679e-06, + "loss": 0.85754007, + "num_input_tokens_seen": 118461425, + "step": 5507, + "time_per_iteration": 2.4194319248199463 + }, + { + "auxiliary_loss_clip": 0.01132338, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.04554784, + "balance_loss_mlp": 1.01956582, + "epoch": 0.6622978416401131, + "flos": 17492590926720.0, + "grad_norm": 2.435544633746708, + "language_loss": 0.84788311, + "learning_rate": 1.0815543247612034e-06, + "loss": 0.86947709, + "num_input_tokens_seen": 118478495, + "step": 5508, + "time_per_iteration": 3.3085079193115234 + }, + { + "auxiliary_loss_clip": 0.01141206, + "auxiliary_loss_mlp": 0.0102059, + "balance_loss_clip": 1.04104185, + "balance_loss_mlp": 1.0128777, + "epoch": 0.6624180845307521, + "flos": 21648352803840.0, + "grad_norm": 1.6976525515613725, + "language_loss": 0.82818174, + "learning_rate": 1.0808624174880168e-06, + "loss": 0.84979975, + "num_input_tokens_seen": 118499145, + "step": 5509, + "time_per_iteration": 3.35526180267334 + }, + { + "auxiliary_loss_clip": 0.01171177, + "auxiliary_loss_mlp": 0.01022375, + "balance_loss_clip": 1.04951024, + "balance_loss_mlp": 1.01532412, + "epoch": 0.6625383274213912, + "flos": 23805902108160.0, + "grad_norm": 1.6283635244556318, + "language_loss": 0.80238885, + "learning_rate": 1.080170649647272e-06, + "loss": 0.82432437, + "num_input_tokens_seen": 118518950, + "step": 5510, + "time_per_iteration": 2.4519896507263184 + }, + { + "auxiliary_loss_clip": 0.01172, + "auxiliary_loss_mlp": 0.01022412, + "balance_loss_clip": 1.04851913, + "balance_loss_mlp": 1.0146215, + "epoch": 0.6626585703120303, + "flos": 33262941473280.0, + "grad_norm": 2.0371016454873185, + "language_loss": 0.67380464, + "learning_rate": 1.0794790213439068e-06, + "loss": 0.69574881, + "num_input_tokens_seen": 118545850, + "step": 5511, + "time_per_iteration": 2.5814523696899414 + }, + { + "auxiliary_loss_clip": 0.01120506, + "auxiliary_loss_mlp": 0.0102584, + "balance_loss_clip": 1.04532516, + "balance_loss_mlp": 1.01781726, + "epoch": 0.6627788132026694, + "flos": 22085780630400.0, + "grad_norm": 1.8828613448277034, + "language_loss": 0.78365213, + "learning_rate": 1.078787532682843e-06, + "loss": 0.80511558, + "num_input_tokens_seen": 118563325, + "step": 5512, + "time_per_iteration": 2.5747158527374268 + }, + { + "auxiliary_loss_clip": 0.01155802, + "auxiliary_loss_mlp": 0.01027336, + "balance_loss_clip": 1.04743314, + "balance_loss_mlp": 1.01990938, + "epoch": 0.6628990560933085, + "flos": 36173608260480.0, + "grad_norm": 3.2468104574609704, + "language_loss": 0.7536912, + "learning_rate": 1.0780961837689773e-06, + "loss": 0.77552259, + "num_input_tokens_seen": 118582835, + "step": 5513, + "time_per_iteration": 2.6203975677490234 + }, + { + "auxiliary_loss_clip": 0.01139416, + "auxiliary_loss_mlp": 0.01026148, + "balance_loss_clip": 1.04738891, + "balance_loss_mlp": 1.0183754, + "epoch": 0.6630192989839476, + "flos": 18513567106560.0, + "grad_norm": 1.5295655727987987, + "language_loss": 0.69797391, + "learning_rate": 1.0774049747071883e-06, + "loss": 0.71962953, + "num_input_tokens_seen": 118600715, + "step": 5514, + "time_per_iteration": 3.2647321224212646 + }, + { + "auxiliary_loss_clip": 0.01113015, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.04433513, + "balance_loss_mlp": 1.02139616, + "epoch": 0.6631395418745867, + "flos": 35809510049280.0, + "grad_norm": 1.7427911859238951, + "language_loss": 0.67815602, + "learning_rate": 1.076713905602332e-06, + "loss": 0.69958138, + "num_input_tokens_seen": 118621290, + "step": 5515, + "time_per_iteration": 2.6708669662475586 + }, + { + "auxiliary_loss_clip": 0.01162996, + "auxiliary_loss_mlp": 0.01024868, + "balance_loss_clip": 1.0500493, + "balance_loss_mlp": 1.01769185, + "epoch": 0.6632597847652257, + "flos": 20047742853120.0, + "grad_norm": 1.7375491261770184, + "language_loss": 0.81171703, + "learning_rate": 1.07602297655924e-06, + "loss": 0.83359569, + "num_input_tokens_seen": 118639610, + "step": 5516, + "time_per_iteration": 2.455979347229004 + }, + { + "auxiliary_loss_clip": 0.01175747, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.05243993, + "balance_loss_mlp": 1.01993942, + "epoch": 0.6633800276558649, + "flos": 21214480423680.0, + "grad_norm": 1.8201131779919177, + "language_loss": 0.80993629, + "learning_rate": 1.0753321876827292e-06, + "loss": 0.83196652, + "num_input_tokens_seen": 118658895, + "step": 5517, + "time_per_iteration": 2.420869827270508 + }, + { + "auxiliary_loss_clip": 0.01172287, + "auxiliary_loss_mlp": 0.0102558, + "balance_loss_clip": 1.04764092, + "balance_loss_mlp": 1.01788521, + "epoch": 0.663500270546504, + "flos": 23987753688960.0, + "grad_norm": 2.4783246927182705, + "language_loss": 0.73953056, + "learning_rate": 1.0746415390775893e-06, + "loss": 0.76150924, + "num_input_tokens_seen": 118677025, + "step": 5518, + "time_per_iteration": 2.4447412490844727 + }, + { + "auxiliary_loss_clip": 0.01172587, + "auxiliary_loss_mlp": 0.01025111, + "balance_loss_clip": 1.05063665, + "balance_loss_mlp": 1.01795292, + "epoch": 0.663620513437143, + "flos": 17932389050880.0, + "grad_norm": 3.055187308389374, + "language_loss": 0.76328069, + "learning_rate": 1.0739510308485939e-06, + "loss": 0.78525764, + "num_input_tokens_seen": 118694240, + "step": 5519, + "time_per_iteration": 2.4016342163085938 + }, + { + "auxiliary_loss_clip": 0.01045113, + "auxiliary_loss_mlp": 0.01000795, + "balance_loss_clip": 1.01355112, + "balance_loss_mlp": 0.99974597, + "epoch": 0.6637407563277821, + "flos": 57840241086720.0, + "grad_norm": 0.8122373987858372, + "language_loss": 0.62526351, + "learning_rate": 1.07326066310049e-06, + "loss": 0.64572257, + "num_input_tokens_seen": 118758365, + "step": 5520, + "time_per_iteration": 3.136658191680908 + }, + { + "auxiliary_loss_clip": 0.0112591, + "auxiliary_loss_mlp": 0.01025505, + "balance_loss_clip": 1.04264092, + "balance_loss_mlp": 1.01733971, + "epoch": 0.6638609992184212, + "flos": 27306007079040.0, + "grad_norm": 2.0980125834022663, + "language_loss": 0.79522419, + "learning_rate": 1.0725704359380059e-06, + "loss": 0.81673825, + "num_input_tokens_seen": 118778220, + "step": 5521, + "time_per_iteration": 2.579162359237671 + }, + { + "auxiliary_loss_clip": 0.01172949, + "auxiliary_loss_mlp": 0.01021468, + "balance_loss_clip": 1.04890656, + "balance_loss_mlp": 1.01445818, + "epoch": 0.6639812421090603, + "flos": 18624854419200.0, + "grad_norm": 2.099856665955451, + "language_loss": 0.71789515, + "learning_rate": 1.0718803494658497e-06, + "loss": 0.73983926, + "num_input_tokens_seen": 118797110, + "step": 5522, + "time_per_iteration": 2.4594368934631348 + }, + { + "auxiliary_loss_clip": 0.01066079, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.03770447, + "balance_loss_mlp": 1.0225122, + "epoch": 0.6641014849996993, + "flos": 15924479806080.0, + "grad_norm": 2.241506339277391, + "language_loss": 0.84019929, + "learning_rate": 1.071190403788707e-06, + "loss": 0.861166, + "num_input_tokens_seen": 118812415, + "step": 5523, + "time_per_iteration": 2.7082931995391846 + }, + { + "auxiliary_loss_clip": 0.01138977, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.04877198, + "balance_loss_mlp": 1.02094746, + "epoch": 0.6642217278903385, + "flos": 26505486622080.0, + "grad_norm": 1.8985985253164368, + "language_loss": 0.75564355, + "learning_rate": 1.0705005990112415e-06, + "loss": 0.77732253, + "num_input_tokens_seen": 118832195, + "step": 5524, + "time_per_iteration": 2.7412595748901367 + }, + { + "auxiliary_loss_clip": 0.01106938, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.04479742, + "balance_loss_mlp": 1.02313471, + "epoch": 0.6643419707809776, + "flos": 15377308951680.0, + "grad_norm": 2.5865468099108773, + "language_loss": 0.74319494, + "learning_rate": 1.0698109352380957e-06, + "loss": 0.76456952, + "num_input_tokens_seen": 118849795, + "step": 5525, + "time_per_iteration": 2.5419392585754395 + }, + { + "auxiliary_loss_clip": 0.01172408, + "auxiliary_loss_mlp": 0.0102747, + "balance_loss_clip": 1.04961085, + "balance_loss_mlp": 1.02056503, + "epoch": 0.6644622136716166, + "flos": 25117610970240.0, + "grad_norm": 1.823267670963283, + "language_loss": 0.7827853, + "learning_rate": 1.0691214125738909e-06, + "loss": 0.80478412, + "num_input_tokens_seen": 118870000, + "step": 5526, + "time_per_iteration": 2.4976797103881836 + }, + { + "auxiliary_loss_clip": 0.01071176, + "auxiliary_loss_mlp": 0.00999621, + "balance_loss_clip": 1.0123955, + "balance_loss_mlp": 0.99859619, + "epoch": 0.6645824565622558, + "flos": 66201717680640.0, + "grad_norm": 0.7888229517803534, + "language_loss": 0.57501411, + "learning_rate": 1.0684320311232287e-06, + "loss": 0.59572208, + "num_input_tokens_seen": 118932905, + "step": 5527, + "time_per_iteration": 3.0789880752563477 + }, + { + "auxiliary_loss_clip": 0.01142352, + "auxiliary_loss_mlp": 0.01024894, + "balance_loss_clip": 1.04558468, + "balance_loss_mlp": 1.0169549, + "epoch": 0.6647026994528948, + "flos": 25082131311360.0, + "grad_norm": 1.785541188654786, + "language_loss": 0.81402034, + "learning_rate": 1.0677427909906865e-06, + "loss": 0.83569276, + "num_input_tokens_seen": 118953355, + "step": 5528, + "time_per_iteration": 2.544168472290039 + }, + { + "auxiliary_loss_clip": 0.01177855, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.05193388, + "balance_loss_mlp": 1.02593458, + "epoch": 0.6648229423435339, + "flos": 18222187979520.0, + "grad_norm": 1.8476324496840064, + "language_loss": 0.7206015, + "learning_rate": 1.0670536922808216e-06, + "loss": 0.74272192, + "num_input_tokens_seen": 118973480, + "step": 5529, + "time_per_iteration": 2.4494309425354004 + }, + { + "auxiliary_loss_clip": 0.01144879, + "auxiliary_loss_mlp": 0.01024771, + "balance_loss_clip": 1.04736364, + "balance_loss_mlp": 1.01764202, + "epoch": 0.6649431852341731, + "flos": 18296882311680.0, + "grad_norm": 2.1209194312787134, + "language_loss": 0.71853036, + "learning_rate": 1.06636473509817e-06, + "loss": 0.74022686, + "num_input_tokens_seen": 118989860, + "step": 5530, + "time_per_iteration": 2.7889883518218994 + }, + { + "auxiliary_loss_clip": 0.01140786, + "auxiliary_loss_mlp": 0.00762876, + "balance_loss_clip": 1.04565871, + "balance_loss_mlp": 1.00088692, + "epoch": 0.6650634281248121, + "flos": 17019575700480.0, + "grad_norm": 6.264058655728009, + "language_loss": 0.80930144, + "learning_rate": 1.0656759195472447e-06, + "loss": 0.82833809, + "num_input_tokens_seen": 119007150, + "step": 5531, + "time_per_iteration": 2.502697467803955 + }, + { + "auxiliary_loss_clip": 0.01048392, + "auxiliary_loss_mlp": 0.00999955, + "balance_loss_clip": 1.0125432, + "balance_loss_mlp": 0.99887049, + "epoch": 0.6651836710154512, + "flos": 69294810666240.0, + "grad_norm": 0.8540762048202751, + "language_loss": 0.59727401, + "learning_rate": 1.0649872457325414e-06, + "loss": 0.6177575, + "num_input_tokens_seen": 119068435, + "step": 5532, + "time_per_iteration": 3.0353527069091797 + }, + { + "auxiliary_loss_clip": 0.01062195, + "auxiliary_loss_mlp": 0.01002141, + "balance_loss_clip": 1.01221943, + "balance_loss_mlp": 1.00107431, + "epoch": 0.6653039139060903, + "flos": 66883444882560.0, + "grad_norm": 0.8576759834546072, + "language_loss": 0.55106413, + "learning_rate": 1.0642987137585278e-06, + "loss": 0.57170749, + "num_input_tokens_seen": 119127960, + "step": 5533, + "time_per_iteration": 3.73885440826416 + }, + { + "auxiliary_loss_clip": 0.0114211, + "auxiliary_loss_mlp": 0.01024878, + "balance_loss_clip": 1.04509592, + "balance_loss_mlp": 1.01765668, + "epoch": 0.6654241567967294, + "flos": 21470056669440.0, + "grad_norm": 2.1204515362572383, + "language_loss": 0.82493699, + "learning_rate": 1.0636103237296561e-06, + "loss": 0.84660691, + "num_input_tokens_seen": 119146885, + "step": 5534, + "time_per_iteration": 2.5437910556793213 + }, + { + "auxiliary_loss_clip": 0.01158802, + "auxiliary_loss_mlp": 0.01028003, + "balance_loss_clip": 1.0510788, + "balance_loss_mlp": 1.02104115, + "epoch": 0.6655443996873684, + "flos": 25119514391040.0, + "grad_norm": 1.8200875728783574, + "language_loss": 0.84033024, + "learning_rate": 1.062922075750353e-06, + "loss": 0.86219823, + "num_input_tokens_seen": 119166900, + "step": 5535, + "time_per_iteration": 4.083764553070068 + }, + { + "auxiliary_loss_clip": 0.01132877, + "auxiliary_loss_mlp": 0.01022689, + "balance_loss_clip": 1.04578519, + "balance_loss_mlp": 1.01543522, + "epoch": 0.6656646425780076, + "flos": 17457326749440.0, + "grad_norm": 2.73379481282168, + "language_loss": 0.7194947, + "learning_rate": 1.0622339699250267e-06, + "loss": 0.7410503, + "num_input_tokens_seen": 119184820, + "step": 5536, + "time_per_iteration": 2.526789665222168 + }, + { + "auxiliary_loss_clip": 0.01130505, + "auxiliary_loss_mlp": 0.01021145, + "balance_loss_clip": 1.04406762, + "balance_loss_mlp": 1.01402831, + "epoch": 0.6657848854686467, + "flos": 23434190213760.0, + "grad_norm": 1.7413331308190416, + "language_loss": 0.79403788, + "learning_rate": 1.0615460063580624e-06, + "loss": 0.81555438, + "num_input_tokens_seen": 119203295, + "step": 5537, + "time_per_iteration": 2.551549196243286 + }, + { + "auxiliary_loss_clip": 0.01147269, + "auxiliary_loss_mlp": 0.0102408, + "balance_loss_clip": 1.04625058, + "balance_loss_mlp": 1.01732349, + "epoch": 0.6659051283592857, + "flos": 11509909459200.0, + "grad_norm": 1.869513283438856, + "language_loss": 0.72983539, + "learning_rate": 1.060858185153821e-06, + "loss": 0.75154883, + "num_input_tokens_seen": 119221395, + "step": 5538, + "time_per_iteration": 2.489100456237793 + }, + { + "auxiliary_loss_clip": 0.01151615, + "auxiliary_loss_mlp": 0.01023773, + "balance_loss_clip": 1.04739738, + "balance_loss_mlp": 1.01607203, + "epoch": 0.6660253712499249, + "flos": 20594554571520.0, + "grad_norm": 2.238597470429803, + "language_loss": 0.76307523, + "learning_rate": 1.0601705064166474e-06, + "loss": 0.78482914, + "num_input_tokens_seen": 119239790, + "step": 5539, + "time_per_iteration": 2.483821392059326 + }, + { + "auxiliary_loss_clip": 0.01138509, + "auxiliary_loss_mlp": 0.01027825, + "balance_loss_clip": 1.04649067, + "balance_loss_mlp": 1.02043366, + "epoch": 0.666145614140564, + "flos": 21251504367360.0, + "grad_norm": 3.3934014801505494, + "language_loss": 0.73939401, + "learning_rate": 1.0594829702508596e-06, + "loss": 0.76105732, + "num_input_tokens_seen": 119257505, + "step": 5540, + "time_per_iteration": 2.4965715408325195 + }, + { + "auxiliary_loss_clip": 0.01132777, + "auxiliary_loss_mlp": 0.01023255, + "balance_loss_clip": 1.04404926, + "balance_loss_mlp": 1.01587272, + "epoch": 0.666265857031203, + "flos": 33726188200320.0, + "grad_norm": 1.7204106994849724, + "language_loss": 0.55043972, + "learning_rate": 1.0587955767607592e-06, + "loss": 0.57200003, + "num_input_tokens_seen": 119279365, + "step": 5541, + "time_per_iteration": 3.384969472885132 + }, + { + "auxiliary_loss_clip": 0.01174023, + "auxiliary_loss_mlp": 0.010257, + "balance_loss_clip": 1.04996467, + "balance_loss_mlp": 1.01804721, + "epoch": 0.6663860999218422, + "flos": 17456644391040.0, + "grad_norm": 2.4047115393363527, + "language_loss": 0.77069163, + "learning_rate": 1.0581083260506206e-06, + "loss": 0.79268891, + "num_input_tokens_seen": 119296150, + "step": 5542, + "time_per_iteration": 2.413875102996826 + }, + { + "auxiliary_loss_clip": 0.01141505, + "auxiliary_loss_mlp": 0.01024842, + "balance_loss_clip": 1.04422402, + "balance_loss_mlp": 1.01772189, + "epoch": 0.6665063428124812, + "flos": 17676740977920.0, + "grad_norm": 2.4430956296061326, + "language_loss": 0.76214671, + "learning_rate": 1.0574212182246993e-06, + "loss": 0.78381014, + "num_input_tokens_seen": 119314845, + "step": 5543, + "time_per_iteration": 2.486684799194336 + }, + { + "auxiliary_loss_clip": 0.01149407, + "auxiliary_loss_mlp": 0.01025726, + "balance_loss_clip": 1.04605734, + "balance_loss_mlp": 1.01722634, + "epoch": 0.6666265857031203, + "flos": 27673265687040.0, + "grad_norm": 2.687494534370148, + "language_loss": 0.75848854, + "learning_rate": 1.0567342533872303e-06, + "loss": 0.78023994, + "num_input_tokens_seen": 119334875, + "step": 5544, + "time_per_iteration": 2.54890775680542 + }, + { + "auxiliary_loss_clip": 0.01147239, + "auxiliary_loss_mlp": 0.01025632, + "balance_loss_clip": 1.04736233, + "balance_loss_mlp": 1.01796067, + "epoch": 0.6667468285937594, + "flos": 25046831220480.0, + "grad_norm": 1.6392399560372748, + "language_loss": 0.81101358, + "learning_rate": 1.0560474316424255e-06, + "loss": 0.83274227, + "num_input_tokens_seen": 119354635, + "step": 5545, + "time_per_iteration": 2.5281784534454346 + }, + { + "auxiliary_loss_clip": 0.01145468, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.04541302, + "balance_loss_mlp": 1.02012181, + "epoch": 0.6668670714843985, + "flos": 22780472641920.0, + "grad_norm": 3.0069036045593434, + "language_loss": 0.74214083, + "learning_rate": 1.0553607530944746e-06, + "loss": 0.76387697, + "num_input_tokens_seen": 119372690, + "step": 5546, + "time_per_iteration": 2.5392487049102783 + }, + { + "auxiliary_loss_clip": 0.01130774, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.04372716, + "balance_loss_mlp": 1.02152681, + "epoch": 0.6669873143750376, + "flos": 22163886754560.0, + "grad_norm": 2.553331246268969, + "language_loss": 0.8969084, + "learning_rate": 1.0546742178475463e-06, + "loss": 0.91850942, + "num_input_tokens_seen": 119391685, + "step": 5547, + "time_per_iteration": 2.5343854427337646 + }, + { + "auxiliary_loss_clip": 0.01122505, + "auxiliary_loss_mlp": 0.01023345, + "balance_loss_clip": 1.045398, + "balance_loss_mlp": 1.01632333, + "epoch": 0.6671075572656767, + "flos": 20514832335360.0, + "grad_norm": 1.874509623453513, + "language_loss": 0.86604512, + "learning_rate": 1.0539878260057868e-06, + "loss": 0.88750362, + "num_input_tokens_seen": 119410725, + "step": 5548, + "time_per_iteration": 2.5722920894622803 + }, + { + "auxiliary_loss_clip": 0.01159799, + "auxiliary_loss_mlp": 0.01024233, + "balance_loss_clip": 1.04977393, + "balance_loss_mlp": 1.01610935, + "epoch": 0.6672278001563158, + "flos": 17931203902080.0, + "grad_norm": 3.484803599752357, + "language_loss": 0.68676692, + "learning_rate": 1.0533015776733226e-06, + "loss": 0.70860732, + "num_input_tokens_seen": 119426875, + "step": 5549, + "time_per_iteration": 2.4619903564453125 + }, + { + "auxiliary_loss_clip": 0.011433, + "auxiliary_loss_mlp": 0.01024233, + "balance_loss_clip": 1.0474087, + "balance_loss_mlp": 1.01613879, + "epoch": 0.6673480430469548, + "flos": 22342146975360.0, + "grad_norm": 2.112385899986325, + "language_loss": 0.785761, + "learning_rate": 1.0526154729542566e-06, + "loss": 0.80743635, + "num_input_tokens_seen": 119446935, + "step": 5550, + "time_per_iteration": 2.520890235900879 + }, + { + "auxiliary_loss_clip": 0.01132035, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.04694557, + "balance_loss_mlp": 1.02193832, + "epoch": 0.6674682859375939, + "flos": 20703830722560.0, + "grad_norm": 2.4990973221190975, + "language_loss": 0.79911578, + "learning_rate": 1.0519295119526699e-06, + "loss": 0.82073873, + "num_input_tokens_seen": 119463240, + "step": 5551, + "time_per_iteration": 2.5392026901245117 + }, + { + "auxiliary_loss_clip": 0.01148562, + "auxiliary_loss_mlp": 0.01023795, + "balance_loss_clip": 1.04705131, + "balance_loss_mlp": 1.01586509, + "epoch": 0.667588528828233, + "flos": 26206673379840.0, + "grad_norm": 1.5722385436932265, + "language_loss": 0.82968032, + "learning_rate": 1.0512436947726227e-06, + "loss": 0.85140395, + "num_input_tokens_seen": 119484655, + "step": 5552, + "time_per_iteration": 2.5521013736724854 + }, + { + "auxiliary_loss_clip": 0.01131083, + "auxiliary_loss_mlp": 0.01021885, + "balance_loss_clip": 1.04261971, + "balance_loss_mlp": 1.01373708, + "epoch": 0.6677087717188721, + "flos": 23071025756160.0, + "grad_norm": 2.5071386278837293, + "language_loss": 0.65738285, + "learning_rate": 1.0505580215181517e-06, + "loss": 0.67891252, + "num_input_tokens_seen": 119502895, + "step": 5553, + "time_per_iteration": 2.5646190643310547 + }, + { + "auxiliary_loss_clip": 0.01026529, + "auxiliary_loss_mlp": 0.01012114, + "balance_loss_clip": 1.0083499, + "balance_loss_mlp": 1.01107717, + "epoch": 0.6678290146095112, + "flos": 70941315219840.0, + "grad_norm": 0.7754038504334857, + "language_loss": 0.56636125, + "learning_rate": 1.0498724922932753e-06, + "loss": 0.58674771, + "num_input_tokens_seen": 119561010, + "step": 5554, + "time_per_iteration": 3.0298731327056885 + }, + { + "auxiliary_loss_clip": 0.01178021, + "auxiliary_loss_mlp": 0.01026366, + "balance_loss_clip": 1.0514586, + "balance_loss_mlp": 1.01811361, + "epoch": 0.6679492575001503, + "flos": 18661088263680.0, + "grad_norm": 1.986428009113567, + "language_loss": 0.8652088, + "learning_rate": 1.0491871072019851e-06, + "loss": 0.88725269, + "num_input_tokens_seen": 119578900, + "step": 5555, + "time_per_iteration": 2.4193286895751953 + }, + { + "auxiliary_loss_clip": 0.01135387, + "auxiliary_loss_mlp": 0.01027343, + "balance_loss_clip": 1.04371643, + "balance_loss_mlp": 1.01974344, + "epoch": 0.6680695003907894, + "flos": 29711985822720.0, + "grad_norm": 1.7247674679341696, + "language_loss": 0.6398387, + "learning_rate": 1.0485018663482555e-06, + "loss": 0.661466, + "num_input_tokens_seen": 119598920, + "step": 5556, + "time_per_iteration": 2.5875003337860107 + }, + { + "auxiliary_loss_clip": 0.01156151, + "auxiliary_loss_mlp": 0.01023193, + "balance_loss_clip": 1.04782367, + "balance_loss_mlp": 1.01504469, + "epoch": 0.6681897432814284, + "flos": 28218964083840.0, + "grad_norm": 2.7414747485979487, + "language_loss": 0.71571225, + "learning_rate": 1.0478167698360354e-06, + "loss": 0.73750561, + "num_input_tokens_seen": 119618220, + "step": 5557, + "time_per_iteration": 2.522944927215576 + }, + { + "auxiliary_loss_clip": 0.01152701, + "auxiliary_loss_mlp": 0.01026381, + "balance_loss_clip": 1.04560125, + "balance_loss_mlp": 1.01837015, + "epoch": 0.6683099861720676, + "flos": 25046543911680.0, + "grad_norm": 2.2336982055188894, + "language_loss": 0.6978808, + "learning_rate": 1.0471318177692556e-06, + "loss": 0.71967161, + "num_input_tokens_seen": 119638520, + "step": 5558, + "time_per_iteration": 2.500509262084961 + }, + { + "auxiliary_loss_clip": 0.01122794, + "auxiliary_loss_mlp": 0.0102727, + "balance_loss_clip": 1.04479289, + "balance_loss_mlp": 1.01977193, + "epoch": 0.6684302290627067, + "flos": 22996977868800.0, + "grad_norm": 2.5626048401695525, + "language_loss": 0.75784266, + "learning_rate": 1.046447010251821e-06, + "loss": 0.77934325, + "num_input_tokens_seen": 119655850, + "step": 5559, + "time_per_iteration": 2.600281000137329 + }, + { + "auxiliary_loss_clip": 0.01147276, + "auxiliary_loss_mlp": 0.01027871, + "balance_loss_clip": 1.04985523, + "balance_loss_mlp": 1.02044487, + "epoch": 0.6685504719533457, + "flos": 26573824247040.0, + "grad_norm": 1.9766256143973162, + "language_loss": 0.75931215, + "learning_rate": 1.0457623473876157e-06, + "loss": 0.78106368, + "num_input_tokens_seen": 119675355, + "step": 5560, + "time_per_iteration": 3.362144708633423 + }, + { + "auxiliary_loss_clip": 0.01172355, + "auxiliary_loss_mlp": 0.01025818, + "balance_loss_clip": 1.04892409, + "balance_loss_mlp": 1.01830149, + "epoch": 0.6686707148439849, + "flos": 28986087870720.0, + "grad_norm": 1.9457559436657685, + "language_loss": 0.71069139, + "learning_rate": 1.0450778292805046e-06, + "loss": 0.73267311, + "num_input_tokens_seen": 119695340, + "step": 5561, + "time_per_iteration": 2.5167369842529297 + }, + { + "auxiliary_loss_clip": 0.01161198, + "auxiliary_loss_mlp": 0.01026436, + "balance_loss_clip": 1.04562736, + "balance_loss_mlp": 1.01887834, + "epoch": 0.6687909577346239, + "flos": 23623152687360.0, + "grad_norm": 1.6052652785110468, + "language_loss": 0.7858786, + "learning_rate": 1.0443934560343267e-06, + "loss": 0.80775499, + "num_input_tokens_seen": 119716750, + "step": 5562, + "time_per_iteration": 4.015549659729004 + }, + { + "auxiliary_loss_clip": 0.01118553, + "auxiliary_loss_mlp": 0.01023216, + "balance_loss_clip": 1.04256511, + "balance_loss_mlp": 1.01572621, + "epoch": 0.668911200625263, + "flos": 23148593176320.0, + "grad_norm": 2.258194514626613, + "language_loss": 0.78381681, + "learning_rate": 1.0437092277529034e-06, + "loss": 0.80523449, + "num_input_tokens_seen": 119736005, + "step": 5563, + "time_per_iteration": 2.5322933197021484 + }, + { + "auxiliary_loss_clip": 0.01142535, + "auxiliary_loss_mlp": 0.01027455, + "balance_loss_clip": 1.04549956, + "balance_loss_mlp": 1.02021265, + "epoch": 0.6690314435159022, + "flos": 18551919853440.0, + "grad_norm": 2.6133227728488673, + "language_loss": 0.73779017, + "learning_rate": 1.0430251445400292e-06, + "loss": 0.75949001, + "num_input_tokens_seen": 119754050, + "step": 5564, + "time_per_iteration": 2.483062505722046 + }, + { + "auxiliary_loss_clip": 0.0107601, + "auxiliary_loss_mlp": 0.0102804, + "balance_loss_clip": 1.04228425, + "balance_loss_mlp": 1.02044034, + "epoch": 0.6691516864065412, + "flos": 31759540704000.0, + "grad_norm": 2.004207129198165, + "language_loss": 0.62312257, + "learning_rate": 1.0423412064994787e-06, + "loss": 0.64416307, + "num_input_tokens_seen": 119774820, + "step": 5565, + "time_per_iteration": 2.9646308422088623 + }, + { + "auxiliary_loss_clip": 0.01133887, + "auxiliary_loss_mlp": 0.01024804, + "balance_loss_clip": 1.04456258, + "balance_loss_mlp": 1.01711547, + "epoch": 0.6692719292971803, + "flos": 34933864296960.0, + "grad_norm": 1.8015995608318822, + "language_loss": 0.73441279, + "learning_rate": 1.0416574137350064e-06, + "loss": 0.75599974, + "num_input_tokens_seen": 119795525, + "step": 5566, + "time_per_iteration": 2.8105757236480713 + }, + { + "auxiliary_loss_clip": 0.01151707, + "auxiliary_loss_mlp": 0.01025267, + "balance_loss_clip": 1.04686165, + "balance_loss_mlp": 1.01728618, + "epoch": 0.6693921721878194, + "flos": 20449188230400.0, + "grad_norm": 2.198314497859648, + "language_loss": 0.80967045, + "learning_rate": 1.0409737663503428e-06, + "loss": 0.83144021, + "num_input_tokens_seen": 119813905, + "step": 5567, + "time_per_iteration": 3.7862863540649414 + }, + { + "auxiliary_loss_clip": 0.01155674, + "auxiliary_loss_mlp": 0.0102605, + "balance_loss_clip": 1.04482496, + "balance_loss_mlp": 1.01799726, + "epoch": 0.6695124150784585, + "flos": 16614538963200.0, + "grad_norm": 1.8038399621765282, + "language_loss": 0.83119613, + "learning_rate": 1.040290264449196e-06, + "loss": 0.8530134, + "num_input_tokens_seen": 119832010, + "step": 5568, + "time_per_iteration": 2.5018646717071533 + }, + { + "auxiliary_loss_clip": 0.01155822, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.04829383, + "balance_loss_mlp": 1.0187906, + "epoch": 0.6696326579690975, + "flos": 26652145852800.0, + "grad_norm": 2.402567283104916, + "language_loss": 0.64204538, + "learning_rate": 1.0396069081352532e-06, + "loss": 0.66386354, + "num_input_tokens_seen": 119851165, + "step": 5569, + "time_per_iteration": 2.5494344234466553 + }, + { + "auxiliary_loss_clip": 0.01070962, + "auxiliary_loss_mlp": 0.01001228, + "balance_loss_clip": 1.01198101, + "balance_loss_mlp": 1.00019705, + "epoch": 0.6697529008597367, + "flos": 66964603662720.0, + "grad_norm": 0.7759153982855914, + "language_loss": 0.56001437, + "learning_rate": 1.0389236975121782e-06, + "loss": 0.58073628, + "num_input_tokens_seen": 119906015, + "step": 5570, + "time_per_iteration": 2.953359365463257 + }, + { + "auxiliary_loss_clip": 0.01176262, + "auxiliary_loss_mlp": 0.01019636, + "balance_loss_clip": 1.05080974, + "balance_loss_mlp": 1.01182175, + "epoch": 0.6698731437503758, + "flos": 20886939279360.0, + "grad_norm": 1.7967380847724934, + "language_loss": 0.71022171, + "learning_rate": 1.0382406326836147e-06, + "loss": 0.73218071, + "num_input_tokens_seen": 119925160, + "step": 5571, + "time_per_iteration": 2.46809983253479 + }, + { + "auxiliary_loss_clip": 0.01166743, + "auxiliary_loss_mlp": 0.01025584, + "balance_loss_clip": 1.05028749, + "balance_loss_mlp": 1.01752591, + "epoch": 0.6699933866410148, + "flos": 20409470766720.0, + "grad_norm": 1.9129144760047603, + "language_loss": 0.76030296, + "learning_rate": 1.0375577137531828e-06, + "loss": 0.7822262, + "num_input_tokens_seen": 119943720, + "step": 5572, + "time_per_iteration": 2.4788970947265625 + }, + { + "auxiliary_loss_clip": 0.01149058, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.04860747, + "balance_loss_mlp": 1.019436, + "epoch": 0.670113629531654, + "flos": 29023075900800.0, + "grad_norm": 1.5538879343947969, + "language_loss": 0.71742672, + "learning_rate": 1.0368749408244802e-06, + "loss": 0.73919392, + "num_input_tokens_seen": 119966640, + "step": 5573, + "time_per_iteration": 2.576552629470825 + }, + { + "auxiliary_loss_clip": 0.01154674, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.04875851, + "balance_loss_mlp": 1.02182865, + "epoch": 0.670233872422293, + "flos": 19791699730560.0, + "grad_norm": 1.7276721204474306, + "language_loss": 0.78903484, + "learning_rate": 1.0361923140010836e-06, + "loss": 0.81087708, + "num_input_tokens_seen": 119985125, + "step": 5574, + "time_per_iteration": 2.4687318801879883 + }, + { + "auxiliary_loss_clip": 0.01162134, + "auxiliary_loss_mlp": 0.01021488, + "balance_loss_clip": 1.04619884, + "balance_loss_mlp": 1.01378679, + "epoch": 0.6703541153129321, + "flos": 24243689070720.0, + "grad_norm": 2.0106026781925137, + "language_loss": 0.63312304, + "learning_rate": 1.0355098333865455e-06, + "loss": 0.65495926, + "num_input_tokens_seen": 120004355, + "step": 5575, + "time_per_iteration": 2.4933969974517822 + }, + { + "auxiliary_loss_clip": 0.01154552, + "auxiliary_loss_mlp": 0.01027532, + "balance_loss_clip": 1.05012727, + "balance_loss_mlp": 1.01993251, + "epoch": 0.6704743582035713, + "flos": 26688523351680.0, + "grad_norm": 1.861816336031843, + "language_loss": 0.69244099, + "learning_rate": 1.0348274990844006e-06, + "loss": 0.71426183, + "num_input_tokens_seen": 120027115, + "step": 5576, + "time_per_iteration": 2.5089874267578125 + }, + { + "auxiliary_loss_clip": 0.01160066, + "auxiliary_loss_mlp": 0.01027876, + "balance_loss_clip": 1.04917908, + "balance_loss_mlp": 1.02016258, + "epoch": 0.6705946010942103, + "flos": 23514379326720.0, + "grad_norm": 1.8363342182536044, + "language_loss": 0.72900355, + "learning_rate": 1.034145311198155e-06, + "loss": 0.75088298, + "num_input_tokens_seen": 120047130, + "step": 5577, + "time_per_iteration": 2.4990627765655518 + }, + { + "auxiliary_loss_clip": 0.01170481, + "auxiliary_loss_mlp": 0.01024985, + "balance_loss_clip": 1.04856753, + "balance_loss_mlp": 1.01757026, + "epoch": 0.6707148439848494, + "flos": 24061011477120.0, + "grad_norm": 1.867608640173522, + "language_loss": 0.64056885, + "learning_rate": 1.0334632698312989e-06, + "loss": 0.66252351, + "num_input_tokens_seen": 120067925, + "step": 5578, + "time_per_iteration": 2.458458662033081 + }, + { + "auxiliary_loss_clip": 0.01137842, + "auxiliary_loss_mlp": 0.01026195, + "balance_loss_clip": 1.0454495, + "balance_loss_mlp": 1.01830316, + "epoch": 0.6708350868754885, + "flos": 22528667324160.0, + "grad_norm": 1.7609099445927725, + "language_loss": 0.75190139, + "learning_rate": 1.032781375087295e-06, + "loss": 0.77354175, + "num_input_tokens_seen": 120087825, + "step": 5579, + "time_per_iteration": 2.5051090717315674 + }, + { + "auxiliary_loss_clip": 0.01148192, + "auxiliary_loss_mlp": 0.01025134, + "balance_loss_clip": 1.04952002, + "balance_loss_mlp": 1.01805878, + "epoch": 0.6709553297661276, + "flos": 25227749047680.0, + "grad_norm": 1.4694774644195923, + "language_loss": 0.67159301, + "learning_rate": 1.0320996270695891e-06, + "loss": 0.69332629, + "num_input_tokens_seen": 120108895, + "step": 5580, + "time_per_iteration": 2.547531843185425 + }, + { + "auxiliary_loss_clip": 0.01130368, + "auxiliary_loss_mlp": 0.0102653, + "balance_loss_clip": 1.04383373, + "balance_loss_mlp": 1.01894867, + "epoch": 0.6710755726567667, + "flos": 20448757267200.0, + "grad_norm": 2.0515461318709973, + "language_loss": 0.73227763, + "learning_rate": 1.0314180258815998e-06, + "loss": 0.75384665, + "num_input_tokens_seen": 120127535, + "step": 5581, + "time_per_iteration": 2.5476183891296387 + }, + { + "auxiliary_loss_clip": 0.01120707, + "auxiliary_loss_mlp": 0.01024144, + "balance_loss_clip": 1.04136634, + "balance_loss_mlp": 1.01685429, + "epoch": 0.6711958155474057, + "flos": 25995411538560.0, + "grad_norm": 1.7886787243219557, + "language_loss": 0.74401712, + "learning_rate": 1.0307365716267247e-06, + "loss": 0.76546556, + "num_input_tokens_seen": 120147980, + "step": 5582, + "time_per_iteration": 2.5725035667419434 + }, + { + "auxiliary_loss_clip": 0.01159143, + "auxiliary_loss_mlp": 0.01023537, + "balance_loss_clip": 1.04859138, + "balance_loss_mlp": 1.01624763, + "epoch": 0.6713160584380449, + "flos": 19937712516480.0, + "grad_norm": 2.0038464408579237, + "language_loss": 0.78099537, + "learning_rate": 1.0300552644083423e-06, + "loss": 0.80282211, + "num_input_tokens_seen": 120166905, + "step": 5583, + "time_per_iteration": 2.5633797645568848 + }, + { + "auxiliary_loss_clip": 0.01135014, + "auxiliary_loss_mlp": 0.01026902, + "balance_loss_clip": 1.04653525, + "balance_loss_mlp": 1.01889086, + "epoch": 0.6714363013286839, + "flos": 18223373128320.0, + "grad_norm": 2.4992941246208606, + "language_loss": 0.72622323, + "learning_rate": 1.0293741043298036e-06, + "loss": 0.74784231, + "num_input_tokens_seen": 120185255, + "step": 5584, + "time_per_iteration": 2.519679546356201 + }, + { + "auxiliary_loss_clip": 0.0113434, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.04909849, + "balance_loss_mlp": 1.02276719, + "epoch": 0.671556544219323, + "flos": 25812374808960.0, + "grad_norm": 2.1793439062492643, + "language_loss": 0.71628737, + "learning_rate": 1.0286930914944436e-06, + "loss": 0.73794186, + "num_input_tokens_seen": 120205070, + "step": 5585, + "time_per_iteration": 2.5843303203582764 + }, + { + "auxiliary_loss_clip": 0.01171522, + "auxiliary_loss_mlp": 0.01026455, + "balance_loss_clip": 1.04581785, + "balance_loss_mlp": 1.01910567, + "epoch": 0.6716767871099621, + "flos": 15850431918720.0, + "grad_norm": 2.4560957814960136, + "language_loss": 0.77260548, + "learning_rate": 1.0280122260055684e-06, + "loss": 0.79458523, + "num_input_tokens_seen": 120220780, + "step": 5586, + "time_per_iteration": 3.1724581718444824 + }, + { + "auxiliary_loss_clip": 0.01174, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.04941106, + "balance_loss_mlp": 1.02172744, + "epoch": 0.6717970300006012, + "flos": 19756112330880.0, + "grad_norm": 1.9615954728197218, + "language_loss": 0.82149506, + "learning_rate": 1.0273315079664652e-06, + "loss": 0.8435359, + "num_input_tokens_seen": 120238735, + "step": 5587, + "time_per_iteration": 2.456770181655884 + }, + { + "auxiliary_loss_clip": 0.01161829, + "auxiliary_loss_mlp": 0.01023281, + "balance_loss_clip": 1.04970479, + "balance_loss_mlp": 1.0158844, + "epoch": 0.6719172728912403, + "flos": 25485049146240.0, + "grad_norm": 2.6775917274239927, + "language_loss": 0.74273318, + "learning_rate": 1.0266509374803992e-06, + "loss": 0.7645843, + "num_input_tokens_seen": 120259895, + "step": 5588, + "time_per_iteration": 3.315183639526367 + }, + { + "auxiliary_loss_clip": 0.01173733, + "auxiliary_loss_mlp": 0.00762653, + "balance_loss_clip": 1.04887676, + "balance_loss_mlp": 1.0008893, + "epoch": 0.6720375157818794, + "flos": 15880344969600.0, + "grad_norm": 2.6043188903070704, + "language_loss": 0.84234619, + "learning_rate": 1.0259705146506123e-06, + "loss": 0.86171007, + "num_input_tokens_seen": 120274790, + "step": 5589, + "time_per_iteration": 3.114011764526367 + }, + { + "auxiliary_loss_clip": 0.0116231, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.04877043, + "balance_loss_mlp": 1.02030623, + "epoch": 0.6721577586725185, + "flos": 32010843231360.0, + "grad_norm": 2.257760511420109, + "language_loss": 0.77768219, + "learning_rate": 1.025290239580324e-06, + "loss": 0.79958189, + "num_input_tokens_seen": 120295460, + "step": 5590, + "time_per_iteration": 2.5612924098968506 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01030625, + "balance_loss_clip": 1.04328346, + "balance_loss_mlp": 1.02266216, + "epoch": 0.6722780015631575, + "flos": 20737873837440.0, + "grad_norm": 1.6318325500596202, + "language_loss": 0.7565304, + "learning_rate": 1.0246101123727313e-06, + "loss": 0.77801704, + "num_input_tokens_seen": 120314440, + "step": 5591, + "time_per_iteration": 2.607795238494873 + }, + { + "auxiliary_loss_clip": 0.0115779, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.04573846, + "balance_loss_mlp": 1.02401114, + "epoch": 0.6723982444537967, + "flos": 16909617191040.0, + "grad_norm": 2.2109401667368305, + "language_loss": 0.7888357, + "learning_rate": 1.0239301331310085e-06, + "loss": 0.81072706, + "num_input_tokens_seen": 120332060, + "step": 5592, + "time_per_iteration": 2.459958553314209 + }, + { + "auxiliary_loss_clip": 0.01155845, + "auxiliary_loss_mlp": 0.01027968, + "balance_loss_clip": 1.04722524, + "balance_loss_mlp": 1.02070761, + "epoch": 0.6725184873444358, + "flos": 20667812359680.0, + "grad_norm": 1.7595528999999999, + "language_loss": 0.88424611, + "learning_rate": 1.0232503019583088e-06, + "loss": 0.90608424, + "num_input_tokens_seen": 120351670, + "step": 5593, + "time_per_iteration": 2.466921329498291 + }, + { + "auxiliary_loss_clip": 0.01154943, + "auxiliary_loss_mlp": 0.0102816, + "balance_loss_clip": 1.04806507, + "balance_loss_mlp": 1.02041447, + "epoch": 0.6726387302350748, + "flos": 23727616416000.0, + "grad_norm": 1.8528793177606246, + "language_loss": 0.69590878, + "learning_rate": 1.0225706189577619e-06, + "loss": 0.71773982, + "num_input_tokens_seen": 120370195, + "step": 5594, + "time_per_iteration": 3.272908926010132 + }, + { + "auxiliary_loss_clip": 0.01162092, + "auxiliary_loss_mlp": 0.01025482, + "balance_loss_clip": 1.0491395, + "balance_loss_mlp": 1.01765013, + "epoch": 0.672758973125714, + "flos": 15188274650880.0, + "grad_norm": 2.0779655056299897, + "language_loss": 0.74925804, + "learning_rate": 1.021891084232475e-06, + "loss": 0.77113378, + "num_input_tokens_seen": 120388130, + "step": 5595, + "time_per_iteration": 2.494729995727539 + }, + { + "auxiliary_loss_clip": 0.01158987, + "auxiliary_loss_mlp": 0.01027054, + "balance_loss_clip": 1.04643178, + "balance_loss_mlp": 1.01882875, + "epoch": 0.672879216016353, + "flos": 18077252601600.0, + "grad_norm": 2.466616240819095, + "language_loss": 0.79920059, + "learning_rate": 1.0212116978855325e-06, + "loss": 0.82106102, + "num_input_tokens_seen": 120406145, + "step": 5596, + "time_per_iteration": 2.4411349296569824 + }, + { + "auxiliary_loss_clip": 0.01129555, + "auxiliary_loss_mlp": 0.01021009, + "balance_loss_clip": 1.04535723, + "balance_loss_mlp": 1.0139761, + "epoch": 0.6729994589069921, + "flos": 23476349802240.0, + "grad_norm": 2.6893905489670593, + "language_loss": 0.78744411, + "learning_rate": 1.020532460019997e-06, + "loss": 0.80894983, + "num_input_tokens_seen": 120425395, + "step": 5597, + "time_per_iteration": 2.547109365463257 + }, + { + "auxiliary_loss_clip": 0.01094015, + "auxiliary_loss_mlp": 0.01024122, + "balance_loss_clip": 1.04173958, + "balance_loss_mlp": 1.01682675, + "epoch": 0.6731197017976313, + "flos": 26322018929280.0, + "grad_norm": 1.6894704743508782, + "language_loss": 0.70810151, + "learning_rate": 1.0198533707389096e-06, + "loss": 0.72928286, + "num_input_tokens_seen": 120446270, + "step": 5598, + "time_per_iteration": 2.762458562850952 + }, + { + "auxiliary_loss_clip": 0.01155246, + "auxiliary_loss_mlp": 0.00762681, + "balance_loss_clip": 1.04828119, + "balance_loss_mlp": 1.00080764, + "epoch": 0.6732399446882703, + "flos": 21616428591360.0, + "grad_norm": 1.925268894637541, + "language_loss": 0.73125672, + "learning_rate": 1.0191744301452853e-06, + "loss": 0.75043607, + "num_input_tokens_seen": 120465570, + "step": 5599, + "time_per_iteration": 2.94667911529541 + }, + { + "auxiliary_loss_clip": 0.01171476, + "auxiliary_loss_mlp": 0.01026847, + "balance_loss_clip": 1.04810953, + "balance_loss_mlp": 1.01924109, + "epoch": 0.6733601875789094, + "flos": 25880173729920.0, + "grad_norm": 2.0478652747177137, + "language_loss": 0.69992429, + "learning_rate": 1.0184956383421208e-06, + "loss": 0.7219075, + "num_input_tokens_seen": 120484220, + "step": 5600, + "time_per_iteration": 2.4708330631256104 + }, + { + "auxiliary_loss_clip": 0.0116287, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.04975152, + "balance_loss_mlp": 1.019207, + "epoch": 0.6734804304695485, + "flos": 22929573997440.0, + "grad_norm": 2.88186311839598, + "language_loss": 0.65441662, + "learning_rate": 1.017816995432387e-06, + "loss": 0.67631495, + "num_input_tokens_seen": 120503320, + "step": 5601, + "time_per_iteration": 2.4807095527648926 + }, + { + "auxiliary_loss_clip": 0.01144458, + "auxiliary_loss_mlp": 0.01024692, + "balance_loss_clip": 1.04628921, + "balance_loss_mlp": 1.01675558, + "epoch": 0.6736006733601876, + "flos": 18697968552960.0, + "grad_norm": 2.0629339984217037, + "language_loss": 0.73976272, + "learning_rate": 1.0171385015190353e-06, + "loss": 0.76145422, + "num_input_tokens_seen": 120523180, + "step": 5602, + "time_per_iteration": 2.564445972442627 + }, + { + "auxiliary_loss_clip": 0.01140177, + "auxiliary_loss_mlp": 0.00762632, + "balance_loss_clip": 1.04848945, + "balance_loss_mlp": 1.00092769, + "epoch": 0.6737209162508266, + "flos": 19427745173760.0, + "grad_norm": 1.9718058880455842, + "language_loss": 0.73463786, + "learning_rate": 1.0164601567049908e-06, + "loss": 0.75366592, + "num_input_tokens_seen": 120541710, + "step": 5603, + "time_per_iteration": 2.492797374725342 + }, + { + "auxiliary_loss_clip": 0.01145685, + "auxiliary_loss_mlp": 0.01027209, + "balance_loss_clip": 1.0478729, + "balance_loss_mlp": 1.01897812, + "epoch": 0.6738411591414658, + "flos": 20158060498560.0, + "grad_norm": 1.607497442008007, + "language_loss": 0.80291218, + "learning_rate": 1.015781961093158e-06, + "loss": 0.82464111, + "num_input_tokens_seen": 120561030, + "step": 5604, + "time_per_iteration": 2.499032497406006 + }, + { + "auxiliary_loss_clip": 0.01146193, + "auxiliary_loss_mlp": 0.01023582, + "balance_loss_clip": 1.04249263, + "balance_loss_mlp": 1.01602435, + "epoch": 0.6739614020321049, + "flos": 21653847584640.0, + "grad_norm": 1.6175833111181845, + "language_loss": 0.77283555, + "learning_rate": 1.0151039147864197e-06, + "loss": 0.79453325, + "num_input_tokens_seen": 120581005, + "step": 5605, + "time_per_iteration": 2.5059971809387207 + }, + { + "auxiliary_loss_clip": 0.01083273, + "auxiliary_loss_mlp": 0.01023979, + "balance_loss_clip": 1.04562068, + "balance_loss_mlp": 1.01575351, + "epoch": 0.6740816449227439, + "flos": 19171702051200.0, + "grad_norm": 2.094527429241963, + "language_loss": 0.66135192, + "learning_rate": 1.0144260178876336e-06, + "loss": 0.68242443, + "num_input_tokens_seen": 120600350, + "step": 5606, + "time_per_iteration": 2.640148401260376 + }, + { + "auxiliary_loss_clip": 0.01151697, + "auxiliary_loss_mlp": 0.01020818, + "balance_loss_clip": 1.04699266, + "balance_loss_mlp": 1.01353133, + "epoch": 0.6742018878133831, + "flos": 21097015971840.0, + "grad_norm": 2.2939602033066975, + "language_loss": 0.67403835, + "learning_rate": 1.0137482704996388e-06, + "loss": 0.69576353, + "num_input_tokens_seen": 120614700, + "step": 5607, + "time_per_iteration": 2.5257835388183594 + }, + { + "auxiliary_loss_clip": 0.01135347, + "auxiliary_loss_mlp": 0.01027874, + "balance_loss_clip": 1.04619002, + "balance_loss_mlp": 1.01988101, + "epoch": 0.6743221307040221, + "flos": 23549966726400.0, + "grad_norm": 2.068293208626403, + "language_loss": 0.79093146, + "learning_rate": 1.0130706727252461e-06, + "loss": 0.81256366, + "num_input_tokens_seen": 120631755, + "step": 5608, + "time_per_iteration": 2.5413362979888916 + }, + { + "auxiliary_loss_clip": 0.01135533, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.04670656, + "balance_loss_mlp": 1.02083135, + "epoch": 0.6744423735946612, + "flos": 16249542912000.0, + "grad_norm": 2.1921537244548635, + "language_loss": 0.68476641, + "learning_rate": 1.0123932246672468e-06, + "loss": 0.70640856, + "num_input_tokens_seen": 120645900, + "step": 5609, + "time_per_iteration": 2.49267840385437 + }, + { + "auxiliary_loss_clip": 0.01026444, + "auxiliary_loss_mlp": 0.00753044, + "balance_loss_clip": 1.010867, + "balance_loss_mlp": 1.0006808, + "epoch": 0.6745626164853004, + "flos": 57843257829120.0, + "grad_norm": 0.7430967636270926, + "language_loss": 0.55820024, + "learning_rate": 1.0117159264284114e-06, + "loss": 0.57599509, + "num_input_tokens_seen": 120709070, + "step": 5610, + "time_per_iteration": 3.149108409881592 + }, + { + "auxiliary_loss_clip": 0.01149608, + "auxiliary_loss_mlp": 0.01024134, + "balance_loss_clip": 1.04884171, + "balance_loss_mlp": 1.0164957, + "epoch": 0.6746828593759394, + "flos": 20485027025280.0, + "grad_norm": 1.7936197190333267, + "language_loss": 0.76889777, + "learning_rate": 1.0110387781114837e-06, + "loss": 0.79063511, + "num_input_tokens_seen": 120727685, + "step": 5611, + "time_per_iteration": 2.5232491493225098 + }, + { + "auxiliary_loss_clip": 0.01172633, + "auxiliary_loss_mlp": 0.01025852, + "balance_loss_clip": 1.04966199, + "balance_loss_mlp": 1.01809752, + "epoch": 0.6748031022665785, + "flos": 19208223204480.0, + "grad_norm": 1.9328580518709293, + "language_loss": 0.76860976, + "learning_rate": 1.0103617798191872e-06, + "loss": 0.79059458, + "num_input_tokens_seen": 120747160, + "step": 5612, + "time_per_iteration": 2.4491124153137207 + }, + { + "auxiliary_loss_clip": 0.01141048, + "auxiliary_loss_mlp": 0.0102269, + "balance_loss_clip": 1.04691672, + "balance_loss_mlp": 1.01494181, + "epoch": 0.6749233451572175, + "flos": 15195026407680.0, + "grad_norm": 2.158776007519646, + "language_loss": 0.82746983, + "learning_rate": 1.0096849316542217e-06, + "loss": 0.84910721, + "num_input_tokens_seen": 120763710, + "step": 5613, + "time_per_iteration": 3.2572972774505615 + }, + { + "auxiliary_loss_clip": 0.01074172, + "auxiliary_loss_mlp": 0.0102091, + "balance_loss_clip": 1.03736043, + "balance_loss_mlp": 1.01325059, + "epoch": 0.6750435880478567, + "flos": 26499489050880.0, + "grad_norm": 3.009828321755657, + "language_loss": 0.74948859, + "learning_rate": 1.0090082337192643e-06, + "loss": 0.77043939, + "num_input_tokens_seen": 120783355, + "step": 5614, + "time_per_iteration": 2.665631055831909 + }, + { + "auxiliary_loss_clip": 0.01094754, + "auxiliary_loss_mlp": 0.01025524, + "balance_loss_clip": 1.03683615, + "balance_loss_mlp": 1.01795411, + "epoch": 0.6751638309384957, + "flos": 23404313076480.0, + "grad_norm": 2.2820015626418155, + "language_loss": 0.78854692, + "learning_rate": 1.0083316861169705e-06, + "loss": 0.80974972, + "num_input_tokens_seen": 120802090, + "step": 5615, + "time_per_iteration": 4.140194654464722 + }, + { + "auxiliary_loss_clip": 0.0113631, + "auxiliary_loss_mlp": 0.0102573, + "balance_loss_clip": 1.04371762, + "balance_loss_mlp": 1.0170989, + "epoch": 0.6752840738291348, + "flos": 23441408847360.0, + "grad_norm": 7.276048708592484, + "language_loss": 0.71608061, + "learning_rate": 1.0076552889499713e-06, + "loss": 0.73770094, + "num_input_tokens_seen": 120822855, + "step": 5616, + "time_per_iteration": 2.583163022994995 + }, + { + "auxiliary_loss_clip": 0.01158215, + "auxiliary_loss_mlp": 0.01026519, + "balance_loss_clip": 1.04932618, + "balance_loss_mlp": 1.01942003, + "epoch": 0.675404316719774, + "flos": 30335826257280.0, + "grad_norm": 2.2143447033203616, + "language_loss": 0.73742616, + "learning_rate": 1.006979042320876e-06, + "loss": 0.75927353, + "num_input_tokens_seen": 120843070, + "step": 5617, + "time_per_iteration": 2.5330092906951904 + }, + { + "auxiliary_loss_clip": 0.01138905, + "auxiliary_loss_mlp": 0.01022574, + "balance_loss_clip": 1.04268432, + "balance_loss_mlp": 1.01485848, + "epoch": 0.675524559610413, + "flos": 23622613983360.0, + "grad_norm": 2.0869174287190804, + "language_loss": 0.62881178, + "learning_rate": 1.0063029463322702e-06, + "loss": 0.65042657, + "num_input_tokens_seen": 120863345, + "step": 5618, + "time_per_iteration": 2.53336501121521 + }, + { + "auxiliary_loss_clip": 0.01108416, + "auxiliary_loss_mlp": 0.00762901, + "balance_loss_clip": 1.04013562, + "balance_loss_mlp": 1.00101018, + "epoch": 0.6756448025010521, + "flos": 21248631279360.0, + "grad_norm": 2.143594030109147, + "language_loss": 0.75347936, + "learning_rate": 1.0056270010867164e-06, + "loss": 0.77219248, + "num_input_tokens_seen": 120880915, + "step": 5619, + "time_per_iteration": 2.575690984725952 + }, + { + "auxiliary_loss_clip": 0.01143843, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.04257476, + "balance_loss_mlp": 1.02337861, + "epoch": 0.6757650453916912, + "flos": 21646521210240.0, + "grad_norm": 2.7762592489339224, + "language_loss": 0.78127849, + "learning_rate": 1.004951206686758e-06, + "loss": 0.80303335, + "num_input_tokens_seen": 120899190, + "step": 5620, + "time_per_iteration": 2.5126075744628906 + }, + { + "auxiliary_loss_clip": 0.01154132, + "auxiliary_loss_mlp": 0.01030666, + "balance_loss_clip": 1.04654086, + "balance_loss_mlp": 1.02272725, + "epoch": 0.6758852882823303, + "flos": 21795658479360.0, + "grad_norm": 1.7970837077595265, + "language_loss": 0.71670747, + "learning_rate": 1.0042755632349087e-06, + "loss": 0.73855543, + "num_input_tokens_seen": 120916080, + "step": 5621, + "time_per_iteration": 3.221890449523926 + }, + { + "auxiliary_loss_clip": 0.01128334, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.04352772, + "balance_loss_mlp": 1.0211519, + "epoch": 0.6760055311729694, + "flos": 27088783580160.0, + "grad_norm": 2.189166953334833, + "language_loss": 0.62636167, + "learning_rate": 1.0036000708336653e-06, + "loss": 0.64793408, + "num_input_tokens_seen": 120935210, + "step": 5622, + "time_per_iteration": 2.5552103519439697 + }, + { + "auxiliary_loss_clip": 0.01148183, + "auxiliary_loss_mlp": 0.0102823, + "balance_loss_clip": 1.04797602, + "balance_loss_mlp": 1.02051163, + "epoch": 0.6761257740636085, + "flos": 17999792922240.0, + "grad_norm": 2.3144702600709843, + "language_loss": 0.79577434, + "learning_rate": 1.0029247295854984e-06, + "loss": 0.8175385, + "num_input_tokens_seen": 120951830, + "step": 5623, + "time_per_iteration": 2.4669108390808105 + }, + { + "auxiliary_loss_clip": 0.01137558, + "auxiliary_loss_mlp": 0.01028222, + "balance_loss_clip": 1.04767609, + "balance_loss_mlp": 1.0212661, + "epoch": 0.6762460169542476, + "flos": 15121912273920.0, + "grad_norm": 2.061633949034935, + "language_loss": 0.71956134, + "learning_rate": 1.0022495395928588e-06, + "loss": 0.74121916, + "num_input_tokens_seen": 120970310, + "step": 5624, + "time_per_iteration": 2.516209363937378 + }, + { + "auxiliary_loss_clip": 0.01070385, + "auxiliary_loss_mlp": 0.01001409, + "balance_loss_clip": 1.01212454, + "balance_loss_mlp": 1.00039577, + "epoch": 0.6763662598448866, + "flos": 67886970030720.0, + "grad_norm": 0.7919151983260391, + "language_loss": 0.6239388, + "learning_rate": 1.0015745009581697e-06, + "loss": 0.64465678, + "num_input_tokens_seen": 121031915, + "step": 5625, + "time_per_iteration": 3.0726146697998047 + }, + { + "auxiliary_loss_clip": 0.01157022, + "auxiliary_loss_mlp": 0.01024931, + "balance_loss_clip": 1.04945493, + "balance_loss_mlp": 1.01760828, + "epoch": 0.6764865027355258, + "flos": 20631829910400.0, + "grad_norm": 2.1828284449229396, + "language_loss": 0.67178977, + "learning_rate": 1.0008996137838343e-06, + "loss": 0.6936093, + "num_input_tokens_seen": 121050890, + "step": 5626, + "time_per_iteration": 2.5190725326538086 + }, + { + "auxiliary_loss_clip": 0.0117839, + "auxiliary_loss_mlp": 0.01026476, + "balance_loss_clip": 1.05097413, + "balance_loss_mlp": 1.01837015, + "epoch": 0.6766067456261649, + "flos": 21215809226880.0, + "grad_norm": 2.200512296086121, + "language_loss": 0.7999329, + "learning_rate": 1.000224878172234e-06, + "loss": 0.82198155, + "num_input_tokens_seen": 121070015, + "step": 5627, + "time_per_iteration": 2.446892023086548 + }, + { + "auxiliary_loss_clip": 0.01161545, + "auxiliary_loss_mlp": 0.01023828, + "balance_loss_clip": 1.04784238, + "balance_loss_mlp": 1.01613927, + "epoch": 0.6767269885168039, + "flos": 19938251220480.0, + "grad_norm": 2.432937563644251, + "language_loss": 0.7267316, + "learning_rate": 9.99550294225724e-07, + "loss": 0.74858534, + "num_input_tokens_seen": 121089170, + "step": 5628, + "time_per_iteration": 2.4725723266601562 + }, + { + "auxiliary_loss_clip": 0.01117498, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.04101646, + "balance_loss_mlp": 1.01743424, + "epoch": 0.6768472314074431, + "flos": 20814076540800.0, + "grad_norm": 1.9295445801353437, + "language_loss": 0.7208873, + "learning_rate": 9.988758620466402e-07, + "loss": 0.74231815, + "num_input_tokens_seen": 121108040, + "step": 5629, + "time_per_iteration": 2.5994701385498047 + }, + { + "auxiliary_loss_clip": 0.01108062, + "auxiliary_loss_mlp": 0.01024888, + "balance_loss_clip": 1.0420723, + "balance_loss_mlp": 1.01731777, + "epoch": 0.6769674742980821, + "flos": 23186012169600.0, + "grad_norm": 1.632138929928471, + "language_loss": 0.76331067, + "learning_rate": 9.982015817372917e-07, + "loss": 0.78464019, + "num_input_tokens_seen": 121128480, + "step": 5630, + "time_per_iteration": 2.617513656616211 + }, + { + "auxiliary_loss_clip": 0.01113179, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.03968132, + "balance_loss_mlp": 1.01931012, + "epoch": 0.6770877171887212, + "flos": 24242934885120.0, + "grad_norm": 1.8796387954435076, + "language_loss": 0.81828296, + "learning_rate": 9.975274533999657e-07, + "loss": 0.83968878, + "num_input_tokens_seen": 121148010, + "step": 5631, + "time_per_iteration": 2.6192374229431152 + }, + { + "auxiliary_loss_clip": 0.01173354, + "auxiliary_loss_mlp": 0.0102951, + "balance_loss_clip": 1.04810703, + "balance_loss_mlp": 1.02115369, + "epoch": 0.6772079600793603, + "flos": 18141567903360.0, + "grad_norm": 2.968694488344421, + "language_loss": 0.84243727, + "learning_rate": 9.96853477136929e-07, + "loss": 0.86446589, + "num_input_tokens_seen": 121162755, + "step": 5632, + "time_per_iteration": 2.399585247039795 + }, + { + "auxiliary_loss_clip": 0.01122664, + "auxiliary_loss_mlp": 0.01022404, + "balance_loss_clip": 1.04189563, + "balance_loss_mlp": 1.0149684, + "epoch": 0.6773282029699994, + "flos": 22452069571200.0, + "grad_norm": 18.335031353435422, + "language_loss": 0.75067198, + "learning_rate": 9.96179653050422e-07, + "loss": 0.77212262, + "num_input_tokens_seen": 121182915, + "step": 5633, + "time_per_iteration": 2.5441670417785645 + }, + { + "auxiliary_loss_clip": 0.01124677, + "auxiliary_loss_mlp": 0.01025099, + "balance_loss_clip": 1.04475808, + "balance_loss_mlp": 1.01711488, + "epoch": 0.6774484458606385, + "flos": 18693730748160.0, + "grad_norm": 2.1462083794394142, + "language_loss": 0.74245667, + "learning_rate": 9.955059812426635e-07, + "loss": 0.7639544, + "num_input_tokens_seen": 121200445, + "step": 5634, + "time_per_iteration": 2.5109446048736572 + }, + { + "auxiliary_loss_clip": 0.01176811, + "auxiliary_loss_mlp": 0.01024794, + "balance_loss_clip": 1.05305004, + "balance_loss_mlp": 1.01631546, + "epoch": 0.6775686887512776, + "flos": 25994046821760.0, + "grad_norm": 3.3979729932177363, + "language_loss": 0.82850397, + "learning_rate": 9.948324618158493e-07, + "loss": 0.85052001, + "num_input_tokens_seen": 121220785, + "step": 5635, + "time_per_iteration": 2.4736454486846924 + }, + { + "auxiliary_loss_clip": 0.01160451, + "auxiliary_loss_mlp": 0.01026744, + "balance_loss_clip": 1.04587293, + "balance_loss_mlp": 1.01886988, + "epoch": 0.6776889316419167, + "flos": 13587987922560.0, + "grad_norm": 2.2268748565408507, + "language_loss": 0.78371882, + "learning_rate": 9.941590948721502e-07, + "loss": 0.80559075, + "num_input_tokens_seen": 121237985, + "step": 5636, + "time_per_iteration": 2.43438720703125 + }, + { + "auxiliary_loss_clip": 0.01141009, + "auxiliary_loss_mlp": 0.01024171, + "balance_loss_clip": 1.04707634, + "balance_loss_mlp": 1.01722383, + "epoch": 0.6778091745325557, + "flos": 27601121220480.0, + "grad_norm": 1.8035627414022066, + "language_loss": 0.76501518, + "learning_rate": 9.934858805137188e-07, + "loss": 0.78666699, + "num_input_tokens_seen": 121258635, + "step": 5637, + "time_per_iteration": 2.560720443725586 + }, + { + "auxiliary_loss_clip": 0.01152965, + "auxiliary_loss_mlp": 0.0102946, + "balance_loss_clip": 1.04639578, + "balance_loss_mlp": 1.0220902, + "epoch": 0.6779294174231949, + "flos": 18734058743040.0, + "grad_norm": 1.6159858340561557, + "language_loss": 0.81157619, + "learning_rate": 9.92812818842677e-07, + "loss": 0.83340049, + "num_input_tokens_seen": 121277810, + "step": 5638, + "time_per_iteration": 2.4456405639648438 + }, + { + "auxiliary_loss_clip": 0.01153972, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.04641867, + "balance_loss_mlp": 1.01941323, + "epoch": 0.678049660313834, + "flos": 45873797765760.0, + "grad_norm": 2.0200977497165984, + "language_loss": 0.63915288, + "learning_rate": 9.921399099611306e-07, + "loss": 0.66096199, + "num_input_tokens_seen": 121298975, + "step": 5639, + "time_per_iteration": 2.6933577060699463 + }, + { + "auxiliary_loss_clip": 0.01145092, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.04581785, + "balance_loss_mlp": 1.02187002, + "epoch": 0.678169903204473, + "flos": 19974556892160.0, + "grad_norm": 1.639296391125891, + "language_loss": 0.68989396, + "learning_rate": 9.914671539711588e-07, + "loss": 0.7116369, + "num_input_tokens_seen": 121318495, + "step": 5640, + "time_per_iteration": 3.2730660438537598 + }, + { + "auxiliary_loss_clip": 0.01076493, + "auxiliary_loss_mlp": 0.00763091, + "balance_loss_clip": 1.04041421, + "balance_loss_mlp": 1.00090468, + "epoch": 0.6782901460951122, + "flos": 21395613732480.0, + "grad_norm": 2.099468441586962, + "language_loss": 0.78319997, + "learning_rate": 9.90794550974817e-07, + "loss": 0.80159581, + "num_input_tokens_seen": 121338890, + "step": 5641, + "time_per_iteration": 2.8831543922424316 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.04448998, + "balance_loss_mlp": 1.02093792, + "epoch": 0.6784103889857512, + "flos": 21434002392960.0, + "grad_norm": 2.0881635631075217, + "language_loss": 0.81581604, + "learning_rate": 9.901221010741407e-07, + "loss": 0.83735877, + "num_input_tokens_seen": 121358210, + "step": 5642, + "time_per_iteration": 4.556462526321411 + }, + { + "auxiliary_loss_clip": 0.01164876, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.04887557, + "balance_loss_mlp": 1.02147114, + "epoch": 0.6785306318763903, + "flos": 32671923091200.0, + "grad_norm": 2.234605865736556, + "language_loss": 0.74787641, + "learning_rate": 9.894498043711375e-07, + "loss": 0.76981425, + "num_input_tokens_seen": 121379955, + "step": 5643, + "time_per_iteration": 2.63100528717041 + }, + { + "auxiliary_loss_clip": 0.01141734, + "auxiliary_loss_mlp": 0.01026034, + "balance_loss_clip": 1.04468632, + "balance_loss_mlp": 1.01792812, + "epoch": 0.6786508747670293, + "flos": 25632139340160.0, + "grad_norm": 2.091099451345781, + "language_loss": 0.68865991, + "learning_rate": 9.887776609677962e-07, + "loss": 0.71033752, + "num_input_tokens_seen": 121401325, + "step": 5644, + "time_per_iteration": 2.5684127807617188 + }, + { + "auxiliary_loss_clip": 0.01119159, + "auxiliary_loss_mlp": 0.01023711, + "balance_loss_clip": 1.03921854, + "balance_loss_mlp": 1.0161289, + "epoch": 0.6787711176576685, + "flos": 19171881619200.0, + "grad_norm": 1.6959457015301937, + "language_loss": 0.72577941, + "learning_rate": 9.88105670966079e-07, + "loss": 0.74720806, + "num_input_tokens_seen": 121419785, + "step": 5645, + "time_per_iteration": 2.536848783493042 + }, + { + "auxiliary_loss_clip": 0.01103282, + "auxiliary_loss_mlp": 0.01021391, + "balance_loss_clip": 1.043136, + "balance_loss_mlp": 1.01417327, + "epoch": 0.6788913605483076, + "flos": 13985159581440.0, + "grad_norm": 1.8862559700338668, + "language_loss": 0.7887364, + "learning_rate": 9.874338344679283e-07, + "loss": 0.80998313, + "num_input_tokens_seen": 121435630, + "step": 5646, + "time_per_iteration": 2.5626204013824463 + }, + { + "auxiliary_loss_clip": 0.01170171, + "auxiliary_loss_mlp": 0.01028187, + "balance_loss_clip": 1.04889941, + "balance_loss_mlp": 1.02097774, + "epoch": 0.6790116034389466, + "flos": 22017586659840.0, + "grad_norm": 1.6076412022122015, + "language_loss": 0.74012434, + "learning_rate": 9.86762151575259e-07, + "loss": 0.76210791, + "num_input_tokens_seen": 121455625, + "step": 5647, + "time_per_iteration": 2.4426565170288086 + }, + { + "auxiliary_loss_clip": 0.01118846, + "auxiliary_loss_mlp": 0.00762293, + "balance_loss_clip": 1.04606581, + "balance_loss_mlp": 1.00109756, + "epoch": 0.6791318463295858, + "flos": 20922454851840.0, + "grad_norm": 1.4313536648802225, + "language_loss": 0.79969949, + "learning_rate": 9.860906223899651e-07, + "loss": 0.81851089, + "num_input_tokens_seen": 121475020, + "step": 5648, + "time_per_iteration": 3.381722927093506 + }, + { + "auxiliary_loss_clip": 0.01150916, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.04679322, + "balance_loss_mlp": 1.0223155, + "epoch": 0.6792520892202248, + "flos": 28512749422080.0, + "grad_norm": 1.9894051352238187, + "language_loss": 0.75755656, + "learning_rate": 9.854192470139184e-07, + "loss": 0.77936465, + "num_input_tokens_seen": 121496500, + "step": 5649, + "time_per_iteration": 2.5667598247528076 + }, + { + "auxiliary_loss_clip": 0.01144476, + "auxiliary_loss_mlp": 0.01030139, + "balance_loss_clip": 1.04923964, + "balance_loss_mlp": 1.02296877, + "epoch": 0.6793723321108639, + "flos": 20011904058240.0, + "grad_norm": 2.080315301629421, + "language_loss": 0.71433759, + "learning_rate": 9.847480255489645e-07, + "loss": 0.73608381, + "num_input_tokens_seen": 121515525, + "step": 5650, + "time_per_iteration": 2.526454448699951 + }, + { + "auxiliary_loss_clip": 0.01149148, + "auxiliary_loss_mlp": 0.01024795, + "balance_loss_clip": 1.04659224, + "balance_loss_mlp": 1.01759481, + "epoch": 0.6794925750015031, + "flos": 26649488246400.0, + "grad_norm": 1.633653235556704, + "language_loss": 0.69095284, + "learning_rate": 9.840769580969295e-07, + "loss": 0.71269226, + "num_input_tokens_seen": 121535965, + "step": 5651, + "time_per_iteration": 2.5504438877105713 + }, + { + "auxiliary_loss_clip": 0.01151368, + "auxiliary_loss_mlp": 0.01023961, + "balance_loss_clip": 1.04684734, + "balance_loss_mlp": 1.01637936, + "epoch": 0.6796128178921421, + "flos": 21580374314880.0, + "grad_norm": 1.8263313080544088, + "language_loss": 0.80420113, + "learning_rate": 9.834060447596114e-07, + "loss": 0.82595438, + "num_input_tokens_seen": 121555235, + "step": 5652, + "time_per_iteration": 2.506047487258911 + }, + { + "auxiliary_loss_clip": 0.01161081, + "auxiliary_loss_mlp": 0.01024237, + "balance_loss_clip": 1.04656255, + "balance_loss_mlp": 1.0160532, + "epoch": 0.6797330607827812, + "flos": 22492002516480.0, + "grad_norm": 1.7970724000029692, + "language_loss": 0.78282559, + "learning_rate": 9.827352856387868e-07, + "loss": 0.8046788, + "num_input_tokens_seen": 121574945, + "step": 5653, + "time_per_iteration": 2.4953386783599854 + }, + { + "auxiliary_loss_clip": 0.01024709, + "auxiliary_loss_mlp": 0.01007696, + "balance_loss_clip": 1.013749, + "balance_loss_mlp": 1.00663507, + "epoch": 0.6798533036734203, + "flos": 66306648286080.0, + "grad_norm": 0.7780848443134281, + "language_loss": 0.6426515, + "learning_rate": 9.820646808362118e-07, + "loss": 0.66297555, + "num_input_tokens_seen": 121641200, + "step": 5654, + "time_per_iteration": 3.214235544204712 + }, + { + "auxiliary_loss_clip": 0.01141563, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.04709613, + "balance_loss_mlp": 1.02276826, + "epoch": 0.6799735465640594, + "flos": 16180163792640.0, + "grad_norm": 2.6314245230213875, + "language_loss": 0.72760141, + "learning_rate": 9.813942304536154e-07, + "loss": 0.74932176, + "num_input_tokens_seen": 121659170, + "step": 5655, + "time_per_iteration": 2.487535238265991 + }, + { + "auxiliary_loss_clip": 0.01145643, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.04627812, + "balance_loss_mlp": 1.02139449, + "epoch": 0.6800937894546984, + "flos": 22125749489280.0, + "grad_norm": 1.9052150085909205, + "language_loss": 0.63922429, + "learning_rate": 9.807239345927043e-07, + "loss": 0.66097009, + "num_input_tokens_seen": 121679180, + "step": 5656, + "time_per_iteration": 2.5550804138183594 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01028341, + "balance_loss_clip": 1.04403639, + "balance_loss_mlp": 1.02035451, + "epoch": 0.6802140323453376, + "flos": 31612953300480.0, + "grad_norm": 2.8931854544380857, + "language_loss": 0.72423857, + "learning_rate": 9.80053793355162e-07, + "loss": 0.74600023, + "num_input_tokens_seen": 121697875, + "step": 5657, + "time_per_iteration": 2.5779755115509033 + }, + { + "auxiliary_loss_clip": 0.0111083, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.04183567, + "balance_loss_mlp": 1.0224061, + "epoch": 0.6803342752359767, + "flos": 17712938908800.0, + "grad_norm": 2.090582208949251, + "language_loss": 0.74976766, + "learning_rate": 9.793838068426472e-07, + "loss": 0.77118284, + "num_input_tokens_seen": 121715570, + "step": 5658, + "time_per_iteration": 2.531064748764038 + }, + { + "auxiliary_loss_clip": 0.01173846, + "auxiliary_loss_mlp": 0.01027331, + "balance_loss_clip": 1.05046344, + "balance_loss_mlp": 1.01930809, + "epoch": 0.6804545181266157, + "flos": 11326800902400.0, + "grad_norm": 2.148416286301834, + "language_loss": 0.60911226, + "learning_rate": 9.78713975156799e-07, + "loss": 0.63112408, + "num_input_tokens_seen": 121731435, + "step": 5659, + "time_per_iteration": 2.4167885780334473 + }, + { + "auxiliary_loss_clip": 0.01132511, + "auxiliary_loss_mlp": 0.01028816, + "balance_loss_clip": 1.04880083, + "balance_loss_mlp": 1.02054858, + "epoch": 0.6805747610172549, + "flos": 29350976181120.0, + "grad_norm": 1.93452553154524, + "language_loss": 0.71679509, + "learning_rate": 9.780442983992273e-07, + "loss": 0.73840833, + "num_input_tokens_seen": 121749950, + "step": 5660, + "time_per_iteration": 2.591662645339966 + }, + { + "auxiliary_loss_clip": 0.01139958, + "auxiliary_loss_mlp": 0.01024867, + "balance_loss_clip": 1.04602277, + "balance_loss_mlp": 1.01693344, + "epoch": 0.680695003907894, + "flos": 37631868612480.0, + "grad_norm": 1.7928104854709657, + "language_loss": 0.71573615, + "learning_rate": 9.773747766715238e-07, + "loss": 0.73738432, + "num_input_tokens_seen": 121770770, + "step": 5661, + "time_per_iteration": 2.6315879821777344 + }, + { + "auxiliary_loss_clip": 0.01147469, + "auxiliary_loss_mlp": 0.010269, + "balance_loss_clip": 1.04500031, + "balance_loss_mlp": 1.0188241, + "epoch": 0.680815246798533, + "flos": 22127365601280.0, + "grad_norm": 3.7538931733364587, + "language_loss": 0.80165237, + "learning_rate": 9.767054100752536e-07, + "loss": 0.82339609, + "num_input_tokens_seen": 121790720, + "step": 5662, + "time_per_iteration": 2.502042770385742 + }, + { + "auxiliary_loss_clip": 0.0113447, + "auxiliary_loss_mlp": 0.0102808, + "balance_loss_clip": 1.04627287, + "balance_loss_mlp": 1.0203371, + "epoch": 0.6809354896891722, + "flos": 17201822330880.0, + "grad_norm": 2.2826915065439155, + "language_loss": 0.81373143, + "learning_rate": 9.760361987119584e-07, + "loss": 0.83535695, + "num_input_tokens_seen": 121808455, + "step": 5663, + "time_per_iteration": 2.4904911518096924 + }, + { + "auxiliary_loss_clip": 0.01145321, + "auxiliary_loss_mlp": 0.01024156, + "balance_loss_clip": 1.04616404, + "balance_loss_mlp": 1.01575232, + "epoch": 0.6810557325798112, + "flos": 12458166554880.0, + "grad_norm": 2.1063800961472374, + "language_loss": 0.67708361, + "learning_rate": 9.753671426831592e-07, + "loss": 0.69877839, + "num_input_tokens_seen": 121824470, + "step": 5664, + "time_per_iteration": 2.473663330078125 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.01027721, + "balance_loss_clip": 1.0445652, + "balance_loss_mlp": 1.02006221, + "epoch": 0.6811759754704503, + "flos": 22156165330560.0, + "grad_norm": 1.8652908774250498, + "language_loss": 0.79589969, + "learning_rate": 9.746982420903483e-07, + "loss": 0.817693, + "num_input_tokens_seen": 121842665, + "step": 5665, + "time_per_iteration": 2.4916770458221436 + }, + { + "auxiliary_loss_clip": 0.01155561, + "auxiliary_loss_mlp": 0.01023576, + "balance_loss_clip": 1.04956925, + "balance_loss_mlp": 1.01648927, + "epoch": 0.6812962183610894, + "flos": 17525377065600.0, + "grad_norm": 1.480928727371707, + "language_loss": 0.7476781, + "learning_rate": 9.740294970349993e-07, + "loss": 0.7694695, + "num_input_tokens_seen": 121859080, + "step": 5666, + "time_per_iteration": 2.440502882003784 + }, + { + "auxiliary_loss_clip": 0.01050835, + "auxiliary_loss_mlp": 0.01000279, + "balance_loss_clip": 1.01268852, + "balance_loss_mlp": 0.99919432, + "epoch": 0.6814164612517285, + "flos": 60274480855680.0, + "grad_norm": 0.8913382797930618, + "language_loss": 0.60933703, + "learning_rate": 9.733609076185594e-07, + "loss": 0.62984818, + "num_input_tokens_seen": 121915485, + "step": 5667, + "time_per_iteration": 3.732267141342163 + }, + { + "auxiliary_loss_clip": 0.01159754, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.05031443, + "balance_loss_mlp": 1.02493715, + "epoch": 0.6815367041423676, + "flos": 19317750750720.0, + "grad_norm": 2.4171425326836347, + "language_loss": 0.83751202, + "learning_rate": 9.72692473942455e-07, + "loss": 0.85944259, + "num_input_tokens_seen": 121932710, + "step": 5668, + "time_per_iteration": 3.2704052925109863 + }, + { + "auxiliary_loss_clip": 0.01120191, + "auxiliary_loss_mlp": 0.01026095, + "balance_loss_clip": 1.04693699, + "balance_loss_mlp": 1.01800036, + "epoch": 0.6816569470330067, + "flos": 22161696024960.0, + "grad_norm": 1.5263442464957113, + "language_loss": 0.77272183, + "learning_rate": 9.720241961080849e-07, + "loss": 0.79418468, + "num_input_tokens_seen": 121952025, + "step": 5669, + "time_per_iteration": 2.5764989852905273 + }, + { + "auxiliary_loss_clip": 0.01171916, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.04798532, + "balance_loss_mlp": 1.01966274, + "epoch": 0.6817771899236458, + "flos": 41463501137280.0, + "grad_norm": 1.9294726734214835, + "language_loss": 0.73059869, + "learning_rate": 9.713560742168259e-07, + "loss": 0.75258917, + "num_input_tokens_seen": 121974650, + "step": 5670, + "time_per_iteration": 2.6181094646453857 + }, + { + "auxiliary_loss_clip": 0.0113033, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.04503477, + "balance_loss_mlp": 1.02168322, + "epoch": 0.6818974328142848, + "flos": 21106138026240.0, + "grad_norm": 1.9945987204204, + "language_loss": 0.71271223, + "learning_rate": 9.706881083700333e-07, + "loss": 0.73430765, + "num_input_tokens_seen": 121994335, + "step": 5671, + "time_per_iteration": 2.5503053665161133 + }, + { + "auxiliary_loss_clip": 0.01101322, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.04505801, + "balance_loss_mlp": 1.01949322, + "epoch": 0.682017675704924, + "flos": 20441897769600.0, + "grad_norm": 1.8920729384219115, + "language_loss": 0.82732606, + "learning_rate": 9.700202986690357e-07, + "loss": 0.8486169, + "num_input_tokens_seen": 122012635, + "step": 5672, + "time_per_iteration": 2.597592353820801 + }, + { + "auxiliary_loss_clip": 0.01159177, + "auxiliary_loss_mlp": 0.0076296, + "balance_loss_clip": 1.04833055, + "balance_loss_mlp": 1.0008781, + "epoch": 0.682137918595563, + "flos": 20044438801920.0, + "grad_norm": 2.4039853846676724, + "language_loss": 0.66659582, + "learning_rate": 9.693526452151413e-07, + "loss": 0.68581712, + "num_input_tokens_seen": 122031685, + "step": 5673, + "time_per_iteration": 2.4781830310821533 + }, + { + "auxiliary_loss_clip": 0.01135111, + "auxiliary_loss_mlp": 0.01021507, + "balance_loss_clip": 1.04439402, + "balance_loss_mlp": 1.01328731, + "epoch": 0.6822581614862021, + "flos": 31684559063040.0, + "grad_norm": 1.7352656236263893, + "language_loss": 0.75877869, + "learning_rate": 9.686851481096305e-07, + "loss": 0.78034484, + "num_input_tokens_seen": 122052995, + "step": 5674, + "time_per_iteration": 3.374742269515991 + }, + { + "auxiliary_loss_clip": 0.01101132, + "auxiliary_loss_mlp": 0.01025804, + "balance_loss_clip": 1.04221821, + "balance_loss_mlp": 1.01761997, + "epoch": 0.6823784043768413, + "flos": 23477570864640.0, + "grad_norm": 2.0728208922688647, + "language_loss": 0.7196939, + "learning_rate": 9.68017807453762e-07, + "loss": 0.74096334, + "num_input_tokens_seen": 122071740, + "step": 5675, + "time_per_iteration": 2.6131675243377686 + }, + { + "auxiliary_loss_clip": 0.01148202, + "auxiliary_loss_mlp": 0.00762789, + "balance_loss_clip": 1.04923904, + "balance_loss_mlp": 1.00099206, + "epoch": 0.6824986472674803, + "flos": 14137134024960.0, + "grad_norm": 1.8104933592123418, + "language_loss": 0.73122221, + "learning_rate": 9.673506233487721e-07, + "loss": 0.75033212, + "num_input_tokens_seen": 122089705, + "step": 5676, + "time_per_iteration": 2.5305116176605225 + }, + { + "auxiliary_loss_clip": 0.01145939, + "auxiliary_loss_mlp": 0.00762231, + "balance_loss_clip": 1.04590726, + "balance_loss_mlp": 1.00094175, + "epoch": 0.6826188901581194, + "flos": 21504997624320.0, + "grad_norm": 1.6810310583898773, + "language_loss": 0.85868967, + "learning_rate": 9.666835958958717e-07, + "loss": 0.87777132, + "num_input_tokens_seen": 122109025, + "step": 5677, + "time_per_iteration": 2.514716148376465 + }, + { + "auxiliary_loss_clip": 0.01172482, + "auxiliary_loss_mlp": 0.0102193, + "balance_loss_clip": 1.04997039, + "balance_loss_mlp": 1.01464045, + "epoch": 0.6827391330487584, + "flos": 20810126044800.0, + "grad_norm": 2.1209508969094237, + "language_loss": 0.80271286, + "learning_rate": 9.660167251962484e-07, + "loss": 0.82465702, + "num_input_tokens_seen": 122127385, + "step": 5678, + "time_per_iteration": 2.441812038421631 + }, + { + "auxiliary_loss_clip": 0.01132422, + "auxiliary_loss_mlp": 0.01025646, + "balance_loss_clip": 1.04355216, + "balance_loss_mlp": 1.01856208, + "epoch": 0.6828593759393976, + "flos": 21688788539520.0, + "grad_norm": 1.5174320233732455, + "language_loss": 0.77344996, + "learning_rate": 9.653500113510654e-07, + "loss": 0.79503059, + "num_input_tokens_seen": 122146500, + "step": 5679, + "time_per_iteration": 2.525930643081665 + }, + { + "auxiliary_loss_clip": 0.01138073, + "auxiliary_loss_mlp": 0.0102902, + "balance_loss_clip": 1.04337621, + "balance_loss_mlp": 1.02092326, + "epoch": 0.6829796188300367, + "flos": 25337707557120.0, + "grad_norm": 2.421201332991618, + "language_loss": 0.67201817, + "learning_rate": 9.646834544614627e-07, + "loss": 0.69368911, + "num_input_tokens_seen": 122167000, + "step": 5680, + "time_per_iteration": 2.576749324798584 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.01024486, + "balance_loss_clip": 1.04701376, + "balance_loss_mlp": 1.01708937, + "epoch": 0.6830998617206757, + "flos": 20704800389760.0, + "grad_norm": 1.9611983147260394, + "language_loss": 0.76485002, + "learning_rate": 9.64017054628558e-07, + "loss": 0.78646886, + "num_input_tokens_seen": 122185825, + "step": 5681, + "time_per_iteration": 2.4968700408935547 + }, + { + "auxiliary_loss_clip": 0.01118921, + "auxiliary_loss_mlp": 0.01025005, + "balance_loss_clip": 1.04332554, + "balance_loss_mlp": 1.017519, + "epoch": 0.6832201046113149, + "flos": 21726638496000.0, + "grad_norm": 1.871200653986464, + "language_loss": 0.78609681, + "learning_rate": 9.63350811953441e-07, + "loss": 0.80753601, + "num_input_tokens_seen": 122206200, + "step": 5682, + "time_per_iteration": 2.6005728244781494 + }, + { + "auxiliary_loss_clip": 0.01132583, + "auxiliary_loss_mlp": 0.01023022, + "balance_loss_clip": 1.04520726, + "balance_loss_mlp": 1.01548231, + "epoch": 0.6833403475019539, + "flos": 19536554448000.0, + "grad_norm": 2.0820436199218935, + "language_loss": 0.70222163, + "learning_rate": 9.626847265371826e-07, + "loss": 0.72377765, + "num_input_tokens_seen": 122225520, + "step": 5683, + "time_per_iteration": 2.5325567722320557 + }, + { + "auxiliary_loss_clip": 0.01134541, + "auxiliary_loss_mlp": 0.01026093, + "balance_loss_clip": 1.04381847, + "balance_loss_mlp": 1.01864851, + "epoch": 0.683460590392593, + "flos": 19352153001600.0, + "grad_norm": 2.1133993854942315, + "language_loss": 0.78754175, + "learning_rate": 9.620187984808262e-07, + "loss": 0.80914807, + "num_input_tokens_seen": 122244320, + "step": 5684, + "time_per_iteration": 2.497929573059082 + }, + { + "auxiliary_loss_clip": 0.01143757, + "auxiliary_loss_mlp": 0.00761856, + "balance_loss_clip": 1.04770792, + "balance_loss_mlp": 1.00082803, + "epoch": 0.6835808332832322, + "flos": 23288500650240.0, + "grad_norm": 1.9007817680637455, + "language_loss": 0.85612547, + "learning_rate": 9.613530278853919e-07, + "loss": 0.87518167, + "num_input_tokens_seen": 122264295, + "step": 5685, + "time_per_iteration": 2.5247418880462646 + }, + { + "auxiliary_loss_clip": 0.01156954, + "auxiliary_loss_mlp": 0.01023199, + "balance_loss_clip": 1.04906213, + "balance_loss_mlp": 1.01607609, + "epoch": 0.6837010761738712, + "flos": 21653416621440.0, + "grad_norm": 1.762363007462299, + "language_loss": 0.74578947, + "learning_rate": 9.60687414851879e-07, + "loss": 0.76759106, + "num_input_tokens_seen": 122285300, + "step": 5686, + "time_per_iteration": 2.4764297008514404 + }, + { + "auxiliary_loss_clip": 0.01147771, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.04899979, + "balance_loss_mlp": 1.02117825, + "epoch": 0.6838213190645103, + "flos": 17566387418880.0, + "grad_norm": 2.194689789695966, + "language_loss": 0.76960158, + "learning_rate": 9.600219594812575e-07, + "loss": 0.79136956, + "num_input_tokens_seen": 122303240, + "step": 5687, + "time_per_iteration": 2.4856069087982178 + }, + { + "auxiliary_loss_clip": 0.01170556, + "auxiliary_loss_mlp": 0.01023997, + "balance_loss_clip": 1.0488174, + "balance_loss_mlp": 1.01704717, + "epoch": 0.6839415619551494, + "flos": 23112538899840.0, + "grad_norm": 1.6590318547677079, + "language_loss": 0.72358167, + "learning_rate": 9.593566618744786e-07, + "loss": 0.74552727, + "num_input_tokens_seen": 122323390, + "step": 5688, + "time_per_iteration": 2.4581220149993896 + }, + { + "auxiliary_loss_clip": 0.01171212, + "auxiliary_loss_mlp": 0.0102733, + "balance_loss_clip": 1.04757679, + "balance_loss_mlp": 1.01995087, + "epoch": 0.6840618048457885, + "flos": 22127868391680.0, + "grad_norm": 1.653063104243488, + "language_loss": 0.73807651, + "learning_rate": 9.58691522132466e-07, + "loss": 0.76006186, + "num_input_tokens_seen": 122342200, + "step": 5689, + "time_per_iteration": 2.4757871627807617 + }, + { + "auxiliary_loss_clip": 0.01151829, + "auxiliary_loss_mlp": 0.01023598, + "balance_loss_clip": 1.04958034, + "balance_loss_mlp": 1.01577187, + "epoch": 0.6841820477364275, + "flos": 22015898720640.0, + "grad_norm": 2.149139636065389, + "language_loss": 0.84834719, + "learning_rate": 9.58026540356123e-07, + "loss": 0.87010145, + "num_input_tokens_seen": 122360465, + "step": 5690, + "time_per_iteration": 2.5166265964508057 + }, + { + "auxiliary_loss_clip": 0.01159112, + "auxiliary_loss_mlp": 0.01026341, + "balance_loss_clip": 1.04579401, + "balance_loss_mlp": 1.01881874, + "epoch": 0.6843022906270667, + "flos": 24900531125760.0, + "grad_norm": 1.7098318166820488, + "language_loss": 0.86898106, + "learning_rate": 9.573617166463246e-07, + "loss": 0.89083558, + "num_input_tokens_seen": 122381680, + "step": 5691, + "time_per_iteration": 2.523521900177002 + }, + { + "auxiliary_loss_clip": 0.01146179, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.04473722, + "balance_loss_mlp": 1.01972485, + "epoch": 0.6844225335177058, + "flos": 19969924037760.0, + "grad_norm": 3.9262118155481014, + "language_loss": 0.59725535, + "learning_rate": 9.56697051103924e-07, + "loss": 0.6189841, + "num_input_tokens_seen": 122399120, + "step": 5692, + "time_per_iteration": 2.5203139781951904 + }, + { + "auxiliary_loss_clip": 0.01139917, + "auxiliary_loss_mlp": 0.0102368, + "balance_loss_clip": 1.04489422, + "balance_loss_mlp": 1.01599693, + "epoch": 0.6845427764083448, + "flos": 25883334126720.0, + "grad_norm": 2.768655711768754, + "language_loss": 0.81149197, + "learning_rate": 9.560325438297522e-07, + "loss": 0.83312786, + "num_input_tokens_seen": 122417430, + "step": 5693, + "time_per_iteration": 3.3502328395843506 + }, + { + "auxiliary_loss_clip": 0.01144905, + "auxiliary_loss_mlp": 0.01022941, + "balance_loss_clip": 1.05002058, + "balance_loss_mlp": 1.0161525, + "epoch": 0.684663019298984, + "flos": 18880143356160.0, + "grad_norm": 2.149564926213334, + "language_loss": 0.86738098, + "learning_rate": 9.553681949246127e-07, + "loss": 0.88905942, + "num_input_tokens_seen": 122435055, + "step": 5694, + "time_per_iteration": 2.51438045501709 + }, + { + "auxiliary_loss_clip": 0.01135617, + "auxiliary_loss_mlp": 0.01025623, + "balance_loss_clip": 1.04591942, + "balance_loss_mlp": 1.017272, + "epoch": 0.684783262189623, + "flos": 54193725302400.0, + "grad_norm": 1.848497492735595, + "language_loss": 0.75299251, + "learning_rate": 9.547040044892886e-07, + "loss": 0.77460492, + "num_input_tokens_seen": 122462570, + "step": 5695, + "time_per_iteration": 4.223644256591797 + }, + { + "auxiliary_loss_clip": 0.01062918, + "auxiliary_loss_mlp": 0.01002032, + "balance_loss_clip": 1.01284719, + "balance_loss_mlp": 1.00089347, + "epoch": 0.6849035050802621, + "flos": 63970264143360.0, + "grad_norm": 0.8624990773738548, + "language_loss": 0.6011982, + "learning_rate": 9.540399726245354e-07, + "loss": 0.62184775, + "num_input_tokens_seen": 122519275, + "step": 5696, + "time_per_iteration": 2.9185853004455566 + }, + { + "auxiliary_loss_clip": 0.01139044, + "auxiliary_loss_mlp": 0.01027206, + "balance_loss_clip": 1.04516196, + "balance_loss_mlp": 1.01912999, + "epoch": 0.6850237479709013, + "flos": 25224121774080.0, + "grad_norm": 1.8694479185286363, + "language_loss": 0.69230765, + "learning_rate": 9.533760994310859e-07, + "loss": 0.71397018, + "num_input_tokens_seen": 122539675, + "step": 5697, + "time_per_iteration": 2.52778697013855 + }, + { + "auxiliary_loss_clip": 0.01171687, + "auxiliary_loss_mlp": 0.01024274, + "balance_loss_clip": 1.04862046, + "balance_loss_mlp": 1.01690674, + "epoch": 0.6851439908615403, + "flos": 19354128249600.0, + "grad_norm": 2.1101732650334433, + "language_loss": 0.74729377, + "learning_rate": 9.527123850096508e-07, + "loss": 0.76925337, + "num_input_tokens_seen": 122558035, + "step": 5698, + "time_per_iteration": 2.433532476425171 + }, + { + "auxiliary_loss_clip": 0.01161375, + "auxiliary_loss_mlp": 0.0102187, + "balance_loss_clip": 1.04634595, + "balance_loss_mlp": 1.0147177, + "epoch": 0.6852642337521794, + "flos": 23182133500800.0, + "grad_norm": 2.502469484446254, + "language_loss": 0.7167505, + "learning_rate": 9.520488294609142e-07, + "loss": 0.73858297, + "num_input_tokens_seen": 122576815, + "step": 5699, + "time_per_iteration": 2.4811112880706787 + }, + { + "auxiliary_loss_clip": 0.01028845, + "auxiliary_loss_mlp": 0.00999772, + "balance_loss_clip": 1.01277304, + "balance_loss_mlp": 0.99862206, + "epoch": 0.6853844766428185, + "flos": 62647206583680.0, + "grad_norm": 0.7367273873459584, + "language_loss": 0.53821909, + "learning_rate": 9.513854328855368e-07, + "loss": 0.5585053, + "num_input_tokens_seen": 122634690, + "step": 5700, + "time_per_iteration": 3.0876095294952393 + }, + { + "auxiliary_loss_clip": 0.0116956, + "auxiliary_loss_mlp": 0.01026144, + "balance_loss_clip": 1.04943037, + "balance_loss_mlp": 1.01922429, + "epoch": 0.6855047195334576, + "flos": 23437242869760.0, + "grad_norm": 2.7796838594717532, + "language_loss": 0.81675994, + "learning_rate": 9.507221953841558e-07, + "loss": 0.83871686, + "num_input_tokens_seen": 122652320, + "step": 5701, + "time_per_iteration": 3.1966114044189453 + }, + { + "auxiliary_loss_clip": 0.0116152, + "auxiliary_loss_mlp": 0.01024833, + "balance_loss_clip": 1.05105567, + "balance_loss_mlp": 1.01705492, + "epoch": 0.6856249624240967, + "flos": 20664831530880.0, + "grad_norm": 1.657437610914012, + "language_loss": 0.78240705, + "learning_rate": 9.500591170573824e-07, + "loss": 0.80427057, + "num_input_tokens_seen": 122672340, + "step": 5702, + "time_per_iteration": 2.476608991622925 + }, + { + "auxiliary_loss_clip": 0.01112799, + "auxiliary_loss_mlp": 0.01025254, + "balance_loss_clip": 1.04323351, + "balance_loss_mlp": 1.01769018, + "epoch": 0.6857452053147358, + "flos": 17087302794240.0, + "grad_norm": 6.851076804920379, + "language_loss": 0.74182469, + "learning_rate": 9.493961980058078e-07, + "loss": 0.76320523, + "num_input_tokens_seen": 122689935, + "step": 5703, + "time_per_iteration": 2.5372278690338135 + }, + { + "auxiliary_loss_clip": 0.01084078, + "auxiliary_loss_mlp": 0.01023348, + "balance_loss_clip": 1.03873396, + "balance_loss_mlp": 1.01596868, + "epoch": 0.6858654482053749, + "flos": 30847266057600.0, + "grad_norm": 2.159185893950755, + "language_loss": 0.67413759, + "learning_rate": 9.48733438329993e-07, + "loss": 0.69521183, + "num_input_tokens_seen": 122710200, + "step": 5704, + "time_per_iteration": 2.665571451187134 + }, + { + "auxiliary_loss_clip": 0.01170063, + "auxiliary_loss_mlp": 0.00762041, + "balance_loss_clip": 1.04920053, + "balance_loss_mlp": 1.00092936, + "epoch": 0.6859856910960139, + "flos": 28877314510080.0, + "grad_norm": 1.698537104536368, + "language_loss": 0.74518454, + "learning_rate": 9.480708381304807e-07, + "loss": 0.76450562, + "num_input_tokens_seen": 122731495, + "step": 5705, + "time_per_iteration": 2.531934976577759 + }, + { + "auxiliary_loss_clip": 0.011139, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.04606903, + "balance_loss_mlp": 1.01877034, + "epoch": 0.6861059339866531, + "flos": 19354523299200.0, + "grad_norm": 2.087602808444165, + "language_loss": 0.83645487, + "learning_rate": 9.474083975077858e-07, + "loss": 0.85786003, + "num_input_tokens_seen": 122748620, + "step": 5706, + "time_per_iteration": 2.538837194442749 + }, + { + "auxiliary_loss_clip": 0.0115066, + "auxiliary_loss_mlp": 0.0102033, + "balance_loss_clip": 1.04592979, + "balance_loss_mlp": 1.01315343, + "epoch": 0.6862261768772921, + "flos": 22199976944640.0, + "grad_norm": 2.298778661763617, + "language_loss": 0.8024081, + "learning_rate": 9.467461165623994e-07, + "loss": 0.82411796, + "num_input_tokens_seen": 122767670, + "step": 5707, + "time_per_iteration": 2.4671688079833984 + }, + { + "auxiliary_loss_clip": 0.01159608, + "auxiliary_loss_mlp": 0.01022811, + "balance_loss_clip": 1.04605269, + "balance_loss_mlp": 1.01545572, + "epoch": 0.6863464197679312, + "flos": 26285677344000.0, + "grad_norm": 1.9113880756197472, + "language_loss": 0.79185915, + "learning_rate": 9.46083995394791e-07, + "loss": 0.81368327, + "num_input_tokens_seen": 122785480, + "step": 5708, + "time_per_iteration": 2.4974400997161865 + }, + { + "auxiliary_loss_clip": 0.0115768, + "auxiliary_loss_mlp": 0.00761653, + "balance_loss_clip": 1.04648876, + "balance_loss_mlp": 1.00083327, + "epoch": 0.6864666626585703, + "flos": 37815228564480.0, + "grad_norm": 2.3159679021677535, + "language_loss": 0.63360012, + "learning_rate": 9.454220341054012e-07, + "loss": 0.65279347, + "num_input_tokens_seen": 122810265, + "step": 5709, + "time_per_iteration": 2.609485626220703 + }, + { + "auxiliary_loss_clip": 0.01126239, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.04356277, + "balance_loss_mlp": 1.02113914, + "epoch": 0.6865869055492094, + "flos": 19391152193280.0, + "grad_norm": 2.009350506528741, + "language_loss": 0.80425674, + "learning_rate": 9.447602327946512e-07, + "loss": 0.82580769, + "num_input_tokens_seen": 122828905, + "step": 5710, + "time_per_iteration": 2.5521316528320312 + }, + { + "auxiliary_loss_clip": 0.01140228, + "auxiliary_loss_mlp": 0.01026982, + "balance_loss_clip": 1.04376578, + "balance_loss_mlp": 1.01934659, + "epoch": 0.6867071484398485, + "flos": 20375966355840.0, + "grad_norm": 2.372184731783101, + "language_loss": 0.76685387, + "learning_rate": 9.440985915629338e-07, + "loss": 0.78852594, + "num_input_tokens_seen": 122846235, + "step": 5711, + "time_per_iteration": 2.5121843814849854 + }, + { + "auxiliary_loss_clip": 0.01171654, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.05053556, + "balance_loss_mlp": 1.01997209, + "epoch": 0.6868273913304875, + "flos": 15889143801600.0, + "grad_norm": 1.9920138165452292, + "language_loss": 0.73016214, + "learning_rate": 9.434371105106223e-07, + "loss": 0.75215071, + "num_input_tokens_seen": 122863835, + "step": 5712, + "time_per_iteration": 2.417870044708252 + }, + { + "auxiliary_loss_clip": 0.01126358, + "auxiliary_loss_mlp": 0.0102683, + "balance_loss_clip": 1.0436722, + "balance_loss_mlp": 1.01896238, + "epoch": 0.6869476342211267, + "flos": 24462492768000.0, + "grad_norm": 6.6021902588505235, + "language_loss": 0.70480686, + "learning_rate": 9.427757897380602e-07, + "loss": 0.72633874, + "num_input_tokens_seen": 122883235, + "step": 5713, + "time_per_iteration": 2.555516242980957 + }, + { + "auxiliary_loss_clip": 0.01125002, + "auxiliary_loss_mlp": 0.0102195, + "balance_loss_clip": 1.04445767, + "balance_loss_mlp": 1.01410604, + "epoch": 0.6870678771117658, + "flos": 18442571875200.0, + "grad_norm": 2.064986890766056, + "language_loss": 0.85014713, + "learning_rate": 9.421146293455695e-07, + "loss": 0.87161666, + "num_input_tokens_seen": 122898975, + "step": 5714, + "time_per_iteration": 2.52820086479187 + }, + { + "auxiliary_loss_clip": 0.01139177, + "auxiliary_loss_mlp": 0.01023458, + "balance_loss_clip": 1.04268122, + "balance_loss_mlp": 1.01609123, + "epoch": 0.6871881200024048, + "flos": 22200371994240.0, + "grad_norm": 2.1571854398698176, + "language_loss": 0.68482268, + "learning_rate": 9.414536294334489e-07, + "loss": 0.70644903, + "num_input_tokens_seen": 122918995, + "step": 5715, + "time_per_iteration": 2.61907958984375 + }, + { + "auxiliary_loss_clip": 0.01143277, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.04196894, + "balance_loss_mlp": 1.02003431, + "epoch": 0.687308362893044, + "flos": 22127724737280.0, + "grad_norm": 2.400823438379412, + "language_loss": 0.69577205, + "learning_rate": 9.407927901019708e-07, + "loss": 0.71748382, + "num_input_tokens_seen": 122938125, + "step": 5716, + "time_per_iteration": 2.5109684467315674 + }, + { + "auxiliary_loss_clip": 0.01156699, + "auxiliary_loss_mlp": 0.01022033, + "balance_loss_clip": 1.04677343, + "balance_loss_mlp": 1.01491022, + "epoch": 0.687428605783683, + "flos": 25040546340480.0, + "grad_norm": 2.127118056653622, + "language_loss": 0.76901436, + "learning_rate": 9.401321114513854e-07, + "loss": 0.79080176, + "num_input_tokens_seen": 122957020, + "step": 5717, + "time_per_iteration": 2.518042802810669 + }, + { + "auxiliary_loss_clip": 0.0117407, + "auxiliary_loss_mlp": 0.01024791, + "balance_loss_clip": 1.05030334, + "balance_loss_mlp": 1.01720917, + "epoch": 0.6875488486743221, + "flos": 23770063313280.0, + "grad_norm": 2.2550165567013765, + "language_loss": 0.7532149, + "learning_rate": 9.394715935819155e-07, + "loss": 0.77520347, + "num_input_tokens_seen": 122977410, + "step": 5718, + "time_per_iteration": 2.5238454341888428 + }, + { + "auxiliary_loss_clip": 0.01159849, + "auxiliary_loss_mlp": 0.01028371, + "balance_loss_clip": 1.04674435, + "balance_loss_mlp": 1.02099776, + "epoch": 0.6876690915649613, + "flos": 25516937445120.0, + "grad_norm": 2.5319777934349905, + "language_loss": 0.621602, + "learning_rate": 9.388112365937608e-07, + "loss": 0.64348418, + "num_input_tokens_seen": 122996875, + "step": 5719, + "time_per_iteration": 2.5308547019958496 + }, + { + "auxiliary_loss_clip": 0.0112962, + "auxiliary_loss_mlp": 0.0102505, + "balance_loss_clip": 1.04463124, + "balance_loss_mlp": 1.01719439, + "epoch": 0.6877893344556003, + "flos": 19427996568960.0, + "grad_norm": 2.1052016826178153, + "language_loss": 0.82750857, + "learning_rate": 9.381510405870985e-07, + "loss": 0.84905529, + "num_input_tokens_seen": 123015890, + "step": 5720, + "time_per_iteration": 3.2603671550750732 + }, + { + "auxiliary_loss_clip": 0.01156471, + "auxiliary_loss_mlp": 0.01026079, + "balance_loss_clip": 1.0470295, + "balance_loss_mlp": 1.01880133, + "epoch": 0.6879095773462394, + "flos": 18661303745280.0, + "grad_norm": 2.3877729873460094, + "language_loss": 0.77263921, + "learning_rate": 9.374910056620791e-07, + "loss": 0.79446471, + "num_input_tokens_seen": 123034955, + "step": 5721, + "time_per_iteration": 3.1820597648620605 + }, + { + "auxiliary_loss_clip": 0.0115958, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.0487411, + "balance_loss_mlp": 1.02038682, + "epoch": 0.6880298202368785, + "flos": 20883132437760.0, + "grad_norm": 1.838283054924558, + "language_loss": 0.81192809, + "learning_rate": 9.368311319188293e-07, + "loss": 0.83380628, + "num_input_tokens_seen": 123052770, + "step": 5722, + "time_per_iteration": 3.2807209491729736 + }, + { + "auxiliary_loss_clip": 0.01128548, + "auxiliary_loss_mlp": 0.0102519, + "balance_loss_clip": 1.04369164, + "balance_loss_mlp": 1.01769507, + "epoch": 0.6881500631275176, + "flos": 30153292318080.0, + "grad_norm": 1.827058894491359, + "language_loss": 0.79366839, + "learning_rate": 9.361714194574515e-07, + "loss": 0.81520569, + "num_input_tokens_seen": 123075105, + "step": 5723, + "time_per_iteration": 2.6423795223236084 + }, + { + "auxiliary_loss_clip": 0.01071114, + "auxiliary_loss_mlp": 0.01001341, + "balance_loss_clip": 1.01267052, + "balance_loss_mlp": 1.00020301, + "epoch": 0.6882703060181566, + "flos": 66181537215360.0, + "grad_norm": 0.7284399679563082, + "language_loss": 0.58266521, + "learning_rate": 9.355118683780228e-07, + "loss": 0.60338974, + "num_input_tokens_seen": 123145175, + "step": 5724, + "time_per_iteration": 3.1171092987060547 + }, + { + "auxiliary_loss_clip": 0.01170033, + "auxiliary_loss_mlp": 0.0102239, + "balance_loss_clip": 1.04756713, + "balance_loss_mlp": 1.01505876, + "epoch": 0.6883905489087958, + "flos": 18214646123520.0, + "grad_norm": 2.28483133131449, + "language_loss": 0.793589, + "learning_rate": 9.348524787805987e-07, + "loss": 0.81551313, + "num_input_tokens_seen": 123160365, + "step": 5725, + "time_per_iteration": 2.403801679611206 + }, + { + "auxiliary_loss_clip": 0.01131006, + "auxiliary_loss_mlp": 0.01022092, + "balance_loss_clip": 1.0410893, + "balance_loss_mlp": 1.01485634, + "epoch": 0.6885107917994349, + "flos": 14056262553600.0, + "grad_norm": 2.825896690311587, + "language_loss": 0.85737979, + "learning_rate": 9.341932507652053e-07, + "loss": 0.87891078, + "num_input_tokens_seen": 123174855, + "step": 5726, + "time_per_iteration": 2.4965052604675293 + }, + { + "auxiliary_loss_clip": 0.01140651, + "auxiliary_loss_mlp": 0.01026914, + "balance_loss_clip": 1.04114509, + "balance_loss_mlp": 1.018659, + "epoch": 0.6886310346900739, + "flos": 28690722334080.0, + "grad_norm": 1.8425598840431692, + "language_loss": 0.78317356, + "learning_rate": 9.335341844318489e-07, + "loss": 0.80484921, + "num_input_tokens_seen": 123194995, + "step": 5727, + "time_per_iteration": 3.2946035861968994 + }, + { + "auxiliary_loss_clip": 0.01143683, + "auxiliary_loss_mlp": 0.01026892, + "balance_loss_clip": 1.04582381, + "balance_loss_mlp": 1.01941776, + "epoch": 0.6887512775807131, + "flos": 24535319592960.0, + "grad_norm": 1.893882380042971, + "language_loss": 0.7346223, + "learning_rate": 9.328752798805091e-07, + "loss": 0.75632805, + "num_input_tokens_seen": 123213465, + "step": 5728, + "time_per_iteration": 2.528421640396118 + }, + { + "auxiliary_loss_clip": 0.01156862, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.0463028, + "balance_loss_mlp": 1.02107859, + "epoch": 0.6888715204713521, + "flos": 22414363269120.0, + "grad_norm": 2.5152178358750423, + "language_loss": 0.76180375, + "learning_rate": 9.322165372111399e-07, + "loss": 0.7836597, + "num_input_tokens_seen": 123231610, + "step": 5729, + "time_per_iteration": 2.53360652923584 + }, + { + "auxiliary_loss_clip": 0.01127296, + "auxiliary_loss_mlp": 0.01025689, + "balance_loss_clip": 1.04659295, + "balance_loss_mlp": 1.01829815, + "epoch": 0.6889917633619912, + "flos": 22054323294720.0, + "grad_norm": 1.9334566925796945, + "language_loss": 0.75210387, + "learning_rate": 9.315579565236747e-07, + "loss": 0.77363372, + "num_input_tokens_seen": 123250715, + "step": 5730, + "time_per_iteration": 2.5719101428985596 + }, + { + "auxiliary_loss_clip": 0.01139304, + "auxiliary_loss_mlp": 0.01025538, + "balance_loss_clip": 1.04852176, + "balance_loss_mlp": 1.01725912, + "epoch": 0.6891120062526304, + "flos": 23949724164480.0, + "grad_norm": 1.7058472978215482, + "language_loss": 0.74014235, + "learning_rate": 9.308995379180162e-07, + "loss": 0.76179069, + "num_input_tokens_seen": 123270270, + "step": 5731, + "time_per_iteration": 2.546293258666992 + }, + { + "auxiliary_loss_clip": 0.0106248, + "auxiliary_loss_mlp": 0.01001914, + "balance_loss_clip": 1.01266074, + "balance_loss_mlp": 1.0007937, + "epoch": 0.6892322491432694, + "flos": 64117354337280.0, + "grad_norm": 0.7411235512937367, + "language_loss": 0.59542745, + "learning_rate": 9.302412814940488e-07, + "loss": 0.6160714, + "num_input_tokens_seen": 123333045, + "step": 5732, + "time_per_iteration": 3.1017515659332275 + }, + { + "auxiliary_loss_clip": 0.01140422, + "auxiliary_loss_mlp": 0.01024036, + "balance_loss_clip": 1.04267788, + "balance_loss_mlp": 1.01615059, + "epoch": 0.6893524920339085, + "flos": 23002436736000.0, + "grad_norm": 2.285598473435249, + "language_loss": 0.71070302, + "learning_rate": 9.295831873516276e-07, + "loss": 0.73234761, + "num_input_tokens_seen": 123352320, + "step": 5733, + "time_per_iteration": 2.5097975730895996 + }, + { + "auxiliary_loss_clip": 0.01170674, + "auxiliary_loss_mlp": 0.01024384, + "balance_loss_clip": 1.04912949, + "balance_loss_mlp": 1.01687992, + "epoch": 0.6894727349245476, + "flos": 21396260177280.0, + "grad_norm": 1.610961851707114, + "language_loss": 0.76067436, + "learning_rate": 9.289252555905873e-07, + "loss": 0.78262496, + "num_input_tokens_seen": 123372400, + "step": 5734, + "time_per_iteration": 2.4727277755737305 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.01026207, + "balance_loss_clip": 1.04930639, + "balance_loss_mlp": 1.01834249, + "epoch": 0.6895929778151867, + "flos": 19865316654720.0, + "grad_norm": 1.924477003217911, + "language_loss": 0.75654137, + "learning_rate": 9.282674863107334e-07, + "loss": 0.77838326, + "num_input_tokens_seen": 123390215, + "step": 5735, + "time_per_iteration": 2.463134527206421 + }, + { + "auxiliary_loss_clip": 0.01152312, + "auxiliary_loss_mlp": 0.01023676, + "balance_loss_clip": 1.04650474, + "balance_loss_mlp": 1.01633906, + "epoch": 0.6897132207058257, + "flos": 18179166464640.0, + "grad_norm": 2.3353072940244557, + "language_loss": 0.75630754, + "learning_rate": 9.276098796118488e-07, + "loss": 0.77806741, + "num_input_tokens_seen": 123406700, + "step": 5736, + "time_per_iteration": 2.448234796524048 + }, + { + "auxiliary_loss_clip": 0.01144039, + "auxiliary_loss_mlp": 0.01024925, + "balance_loss_clip": 1.04781675, + "balance_loss_mlp": 1.017349, + "epoch": 0.6898334635964649, + "flos": 32561641359360.0, + "grad_norm": 1.769590171426828, + "language_loss": 0.66148669, + "learning_rate": 9.269524355936938e-07, + "loss": 0.68317628, + "num_input_tokens_seen": 123429880, + "step": 5737, + "time_per_iteration": 2.5915751457214355 + }, + { + "auxiliary_loss_clip": 0.01135194, + "auxiliary_loss_mlp": 0.01021864, + "balance_loss_clip": 1.0418272, + "balance_loss_mlp": 1.01480699, + "epoch": 0.689953706487104, + "flos": 22819004956800.0, + "grad_norm": 1.7188487995036126, + "language_loss": 0.847763, + "learning_rate": 9.262951543560002e-07, + "loss": 0.86933362, + "num_input_tokens_seen": 123449105, + "step": 5738, + "time_per_iteration": 2.5244197845458984 + }, + { + "auxiliary_loss_clip": 0.01141444, + "auxiliary_loss_mlp": 0.01029537, + "balance_loss_clip": 1.04769886, + "balance_loss_mlp": 1.02200317, + "epoch": 0.690073949377743, + "flos": 18515362786560.0, + "grad_norm": 2.220682129769571, + "language_loss": 0.86265051, + "learning_rate": 9.256380359984795e-07, + "loss": 0.88436037, + "num_input_tokens_seen": 123466215, + "step": 5739, + "time_per_iteration": 2.4653286933898926 + }, + { + "auxiliary_loss_clip": 0.01118906, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.03849149, + "balance_loss_mlp": 1.02147055, + "epoch": 0.6901941922683821, + "flos": 34857194716800.0, + "grad_norm": 2.0965939478132176, + "language_loss": 0.74878109, + "learning_rate": 9.249810806208139e-07, + "loss": 0.77026212, + "num_input_tokens_seen": 123485480, + "step": 5740, + "time_per_iteration": 2.68192982673645 + }, + { + "auxiliary_loss_clip": 0.01111054, + "auxiliary_loss_mlp": 0.00761914, + "balance_loss_clip": 1.03826332, + "balance_loss_mlp": 1.00093353, + "epoch": 0.6903144351590212, + "flos": 16253672976000.0, + "grad_norm": 2.0389693987169037, + "language_loss": 0.79852331, + "learning_rate": 9.243242883226627e-07, + "loss": 0.81725299, + "num_input_tokens_seen": 123504575, + "step": 5741, + "time_per_iteration": 2.54594087600708 + }, + { + "auxiliary_loss_clip": 0.01159687, + "auxiliary_loss_mlp": 0.01027049, + "balance_loss_clip": 1.04432321, + "balance_loss_mlp": 1.01860285, + "epoch": 0.6904346780496603, + "flos": 28035137255040.0, + "grad_norm": 1.8027061647577711, + "language_loss": 0.70127678, + "learning_rate": 9.236676592036628e-07, + "loss": 0.72314417, + "num_input_tokens_seen": 123524250, + "step": 5742, + "time_per_iteration": 2.5784449577331543 + }, + { + "auxiliary_loss_clip": 0.01139915, + "auxiliary_loss_mlp": 0.01024618, + "balance_loss_clip": 1.04782557, + "balance_loss_mlp": 1.01691675, + "epoch": 0.6905549209402994, + "flos": 23624266008960.0, + "grad_norm": 2.2755430061402304, + "language_loss": 0.7361992, + "learning_rate": 9.230111933634228e-07, + "loss": 0.75784457, + "num_input_tokens_seen": 123545845, + "step": 5743, + "time_per_iteration": 2.5404911041259766 + }, + { + "auxiliary_loss_clip": 0.01159787, + "auxiliary_loss_mlp": 0.01022615, + "balance_loss_clip": 1.04859185, + "balance_loss_mlp": 1.01542091, + "epoch": 0.6906751638309385, + "flos": 23114945111040.0, + "grad_norm": 1.4897495242449494, + "language_loss": 0.80745208, + "learning_rate": 9.223548909015288e-07, + "loss": 0.82927614, + "num_input_tokens_seen": 123567535, + "step": 5744, + "time_per_iteration": 2.5260589122772217 + }, + { + "auxiliary_loss_clip": 0.01106268, + "auxiliary_loss_mlp": 0.0102537, + "balance_loss_clip": 1.03952646, + "balance_loss_mlp": 1.01806545, + "epoch": 0.6907954067215776, + "flos": 27305468375040.0, + "grad_norm": 2.2478285791029506, + "language_loss": 0.72150111, + "learning_rate": 9.216987519175407e-07, + "loss": 0.74281746, + "num_input_tokens_seen": 123587710, + "step": 5745, + "time_per_iteration": 2.6323628425598145 + }, + { + "auxiliary_loss_clip": 0.01150417, + "auxiliary_loss_mlp": 0.01022777, + "balance_loss_clip": 1.04586554, + "balance_loss_mlp": 1.01560676, + "epoch": 0.6909156496122166, + "flos": 21689399070720.0, + "grad_norm": 1.6948674887480255, + "language_loss": 0.68214017, + "learning_rate": 9.210427765109942e-07, + "loss": 0.70387214, + "num_input_tokens_seen": 123607385, + "step": 5746, + "time_per_iteration": 2.47977876663208 + }, + { + "auxiliary_loss_clip": 0.01141267, + "auxiliary_loss_mlp": 0.01024227, + "balance_loss_clip": 1.04230142, + "balance_loss_mlp": 1.01600754, + "epoch": 0.6910358925028558, + "flos": 22561453463040.0, + "grad_norm": 2.800188839479589, + "language_loss": 0.8163383, + "learning_rate": 9.20386964781402e-07, + "loss": 0.83799326, + "num_input_tokens_seen": 123625405, + "step": 5747, + "time_per_iteration": 3.258970022201538 + }, + { + "auxiliary_loss_clip": 0.01138837, + "auxiliary_loss_mlp": 0.01025562, + "balance_loss_clip": 1.04428971, + "balance_loss_mlp": 1.01815343, + "epoch": 0.6911561353934949, + "flos": 22054107813120.0, + "grad_norm": 1.9938172307065052, + "language_loss": 0.84055138, + "learning_rate": 9.197313168282472e-07, + "loss": 0.86219537, + "num_input_tokens_seen": 123642850, + "step": 5748, + "time_per_iteration": 4.04276180267334 + }, + { + "auxiliary_loss_clip": 0.01151096, + "auxiliary_loss_mlp": 0.01025602, + "balance_loss_clip": 1.04304266, + "balance_loss_mlp": 1.01772237, + "epoch": 0.6912763782841339, + "flos": 24206557386240.0, + "grad_norm": 2.233486216783324, + "language_loss": 0.72093546, + "learning_rate": 9.190758327509935e-07, + "loss": 0.74270242, + "num_input_tokens_seen": 123661595, + "step": 5749, + "time_per_iteration": 2.499250888824463 + }, + { + "auxiliary_loss_clip": 0.01033704, + "auxiliary_loss_mlp": 0.00753129, + "balance_loss_clip": 1.0131278, + "balance_loss_mlp": 1.00062394, + "epoch": 0.6913966211747731, + "flos": 52329641091840.0, + "grad_norm": 0.9314392059870309, + "language_loss": 0.64444804, + "learning_rate": 9.184205126490767e-07, + "loss": 0.66231644, + "num_input_tokens_seen": 123710490, + "step": 5750, + "time_per_iteration": 2.9570109844207764 + }, + { + "auxiliary_loss_clip": 0.01041193, + "auxiliary_loss_mlp": 0.0075314, + "balance_loss_clip": 1.01305103, + "balance_loss_mlp": 1.00074732, + "epoch": 0.6915168640654121, + "flos": 66741274851840.0, + "grad_norm": 1.0869716815230053, + "language_loss": 0.59627187, + "learning_rate": 9.177653566219075e-07, + "loss": 0.61421525, + "num_input_tokens_seen": 123765215, + "step": 5751, + "time_per_iteration": 3.012197256088257 + }, + { + "auxiliary_loss_clip": 0.01130673, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.04207551, + "balance_loss_mlp": 1.02010059, + "epoch": 0.6916371069560512, + "flos": 18296523175680.0, + "grad_norm": 2.1926028150227803, + "language_loss": 0.76118183, + "learning_rate": 9.171103647688744e-07, + "loss": 0.78276312, + "num_input_tokens_seen": 123783955, + "step": 5752, + "time_per_iteration": 2.556135892868042 + }, + { + "auxiliary_loss_clip": 0.0107705, + "auxiliary_loss_mlp": 0.01025443, + "balance_loss_clip": 1.03885579, + "balance_loss_mlp": 1.01849866, + "epoch": 0.6917573498466904, + "flos": 19645794685440.0, + "grad_norm": 1.909020820584617, + "language_loss": 0.69272172, + "learning_rate": 9.164555371893367e-07, + "loss": 0.71374661, + "num_input_tokens_seen": 123803885, + "step": 5753, + "time_per_iteration": 2.6719701290130615 + }, + { + "auxiliary_loss_clip": 0.01156999, + "auxiliary_loss_mlp": 0.00761979, + "balance_loss_clip": 1.04755747, + "balance_loss_mlp": 1.00091648, + "epoch": 0.6918775927373294, + "flos": 14210319985920.0, + "grad_norm": 2.3021923293695155, + "language_loss": 0.74978745, + "learning_rate": 9.158008739826333e-07, + "loss": 0.76897722, + "num_input_tokens_seen": 123821485, + "step": 5754, + "time_per_iteration": 3.2770628929138184 + }, + { + "auxiliary_loss_clip": 0.0113997, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.04696751, + "balance_loss_mlp": 1.02041852, + "epoch": 0.6919978356279685, + "flos": 23985455218560.0, + "grad_norm": 1.6545412949701346, + "language_loss": 0.86418152, + "learning_rate": 9.151463752480744e-07, + "loss": 0.88585973, + "num_input_tokens_seen": 123840215, + "step": 5755, + "time_per_iteration": 2.5352976322174072 + }, + { + "auxiliary_loss_clip": 0.01117626, + "auxiliary_loss_mlp": 0.0102971, + "balance_loss_clip": 1.04106832, + "balance_loss_mlp": 1.02211595, + "epoch": 0.6921180785186076, + "flos": 23622937205760.0, + "grad_norm": 1.474719267525081, + "language_loss": 0.800825, + "learning_rate": 9.144920410849493e-07, + "loss": 0.82229835, + "num_input_tokens_seen": 123861450, + "step": 5756, + "time_per_iteration": 2.6057043075561523 + }, + { + "auxiliary_loss_clip": 0.01148388, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.04598582, + "balance_loss_mlp": 1.01930904, + "epoch": 0.6922383214092467, + "flos": 21142623265920.0, + "grad_norm": 1.8042047751981247, + "language_loss": 0.80382419, + "learning_rate": 9.138378715925176e-07, + "loss": 0.82557678, + "num_input_tokens_seen": 123880545, + "step": 5757, + "time_per_iteration": 2.5126140117645264 + }, + { + "auxiliary_loss_clip": 0.01136022, + "auxiliary_loss_mlp": 0.01025289, + "balance_loss_clip": 1.0431242, + "balance_loss_mlp": 1.01776648, + "epoch": 0.6923585642998857, + "flos": 21470667200640.0, + "grad_norm": 1.9022796508654605, + "language_loss": 0.80963159, + "learning_rate": 9.131838668700167e-07, + "loss": 0.83124465, + "num_input_tokens_seen": 123900615, + "step": 5758, + "time_per_iteration": 2.5022029876708984 + }, + { + "auxiliary_loss_clip": 0.01128109, + "auxiliary_loss_mlp": 0.01023732, + "balance_loss_clip": 1.04245043, + "balance_loss_mlp": 1.01650834, + "epoch": 0.6924788071905249, + "flos": 21105204272640.0, + "grad_norm": 1.9059594314614436, + "language_loss": 0.86342955, + "learning_rate": 9.125300270166598e-07, + "loss": 0.88494802, + "num_input_tokens_seen": 123921220, + "step": 5759, + "time_per_iteration": 2.5675547122955322 + }, + { + "auxiliary_loss_clip": 0.01136556, + "auxiliary_loss_mlp": 0.01022076, + "balance_loss_clip": 1.04385006, + "balance_loss_mlp": 1.01411557, + "epoch": 0.692599050081164, + "flos": 26250018117120.0, + "grad_norm": 4.0285879423398185, + "language_loss": 0.85730135, + "learning_rate": 9.118763521316324e-07, + "loss": 0.87888765, + "num_input_tokens_seen": 123941795, + "step": 5760, + "time_per_iteration": 2.586599111557007 + }, + { + "auxiliary_loss_clip": 0.01170501, + "auxiliary_loss_mlp": 0.00762427, + "balance_loss_clip": 1.04748154, + "balance_loss_mlp": 1.00086951, + "epoch": 0.692719292971803, + "flos": 20885215426560.0, + "grad_norm": 1.777708838827004, + "language_loss": 0.75957811, + "learning_rate": 9.112228423140987e-07, + "loss": 0.77890736, + "num_input_tokens_seen": 123960715, + "step": 5761, + "time_per_iteration": 2.4593451023101807 + }, + { + "auxiliary_loss_clip": 0.01147557, + "auxiliary_loss_mlp": 0.01029719, + "balance_loss_clip": 1.04620135, + "balance_loss_mlp": 1.02165425, + "epoch": 0.6928395358624422, + "flos": 25921938268800.0, + "grad_norm": 2.272517151104845, + "language_loss": 0.86706114, + "learning_rate": 9.105694976631932e-07, + "loss": 0.88883388, + "num_input_tokens_seen": 123978625, + "step": 5762, + "time_per_iteration": 2.5441226959228516 + }, + { + "auxiliary_loss_clip": 0.01157496, + "auxiliary_loss_mlp": 0.01027805, + "balance_loss_clip": 1.04935265, + "balance_loss_mlp": 1.01993132, + "epoch": 0.6929597787530812, + "flos": 23586559706880.0, + "grad_norm": 2.8340021384802276, + "language_loss": 0.724684, + "learning_rate": 9.099163182780283e-07, + "loss": 0.74653703, + "num_input_tokens_seen": 123996780, + "step": 5763, + "time_per_iteration": 2.477940320968628 + }, + { + "auxiliary_loss_clip": 0.01138216, + "auxiliary_loss_mlp": 0.01029646, + "balance_loss_clip": 1.04469967, + "balance_loss_mlp": 1.0216651, + "epoch": 0.6930800216437203, + "flos": 18255656476800.0, + "grad_norm": 3.4034463958941865, + "language_loss": 0.49096459, + "learning_rate": 9.092633042576916e-07, + "loss": 0.51264322, + "num_input_tokens_seen": 124014045, + "step": 5764, + "time_per_iteration": 2.4746313095092773 + }, + { + "auxiliary_loss_clip": 0.01140515, + "auxiliary_loss_mlp": 0.01026735, + "balance_loss_clip": 1.04747844, + "balance_loss_mlp": 1.01924825, + "epoch": 0.6932002645343595, + "flos": 29168621809920.0, + "grad_norm": 1.9966640175031642, + "language_loss": 0.56446588, + "learning_rate": 9.086104557012446e-07, + "loss": 0.58613837, + "num_input_tokens_seen": 124034615, + "step": 5765, + "time_per_iteration": 2.5535354614257812 + }, + { + "auxiliary_loss_clip": 0.01148584, + "auxiliary_loss_mlp": 0.01021707, + "balance_loss_clip": 1.04647017, + "balance_loss_mlp": 1.01435733, + "epoch": 0.6933205074249985, + "flos": 23842746483840.0, + "grad_norm": 2.441693576966053, + "language_loss": 0.65567839, + "learning_rate": 9.079577727077239e-07, + "loss": 0.67738128, + "num_input_tokens_seen": 124053445, + "step": 5766, + "time_per_iteration": 2.54347562789917 + }, + { + "auxiliary_loss_clip": 0.01158989, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.0492034, + "balance_loss_mlp": 1.02200651, + "epoch": 0.6934407503156376, + "flos": 24166696268160.0, + "grad_norm": 2.1156224429229877, + "language_loss": 0.72045279, + "learning_rate": 9.073052553761404e-07, + "loss": 0.74233943, + "num_input_tokens_seen": 124072810, + "step": 5767, + "time_per_iteration": 2.4864728450775146 + }, + { + "auxiliary_loss_clip": 0.01116207, + "auxiliary_loss_mlp": 0.01027522, + "balance_loss_clip": 1.04299712, + "balance_loss_mlp": 1.01949954, + "epoch": 0.6935609932062767, + "flos": 20631327120000.0, + "grad_norm": 1.8352541706231231, + "language_loss": 0.78265858, + "learning_rate": 9.066529038054805e-07, + "loss": 0.80409586, + "num_input_tokens_seen": 124092875, + "step": 5768, + "time_per_iteration": 2.587529420852661 + }, + { + "auxiliary_loss_clip": 0.01140041, + "auxiliary_loss_mlp": 0.0102608, + "balance_loss_clip": 1.04499769, + "balance_loss_mlp": 1.01907635, + "epoch": 0.6936812360969158, + "flos": 18254184019200.0, + "grad_norm": 2.123756577119488, + "language_loss": 0.74103439, + "learning_rate": 9.060007180947071e-07, + "loss": 0.76269555, + "num_input_tokens_seen": 124110930, + "step": 5769, + "time_per_iteration": 2.483200788497925 + }, + { + "auxiliary_loss_clip": 0.01113113, + "auxiliary_loss_mlp": 0.01025385, + "balance_loss_clip": 1.03836799, + "balance_loss_mlp": 1.01753509, + "epoch": 0.6938014789875548, + "flos": 31317336368640.0, + "grad_norm": 1.7985957194443714, + "language_loss": 0.73191017, + "learning_rate": 9.053486983427534e-07, + "loss": 0.75329512, + "num_input_tokens_seen": 124132180, + "step": 5770, + "time_per_iteration": 2.6344313621520996 + }, + { + "auxiliary_loss_clip": 0.01143964, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.04330063, + "balance_loss_mlp": 1.01879454, + "epoch": 0.6939217218781939, + "flos": 17528429721600.0, + "grad_norm": 2.127463843966922, + "language_loss": 0.70547539, + "learning_rate": 9.046968446485326e-07, + "loss": 0.72717774, + "num_input_tokens_seen": 124150585, + "step": 5771, + "time_per_iteration": 2.5043578147888184 + }, + { + "auxiliary_loss_clip": 0.01161651, + "auxiliary_loss_mlp": 0.01028805, + "balance_loss_clip": 1.05000305, + "balance_loss_mlp": 1.02040696, + "epoch": 0.6940419647688331, + "flos": 18551776199040.0, + "grad_norm": 2.144531270681782, + "language_loss": 0.70642424, + "learning_rate": 9.040451571109295e-07, + "loss": 0.72832882, + "num_input_tokens_seen": 124166205, + "step": 5772, + "time_per_iteration": 2.45487380027771 + }, + { + "auxiliary_loss_clip": 0.01044219, + "auxiliary_loss_mlp": 0.01006923, + "balance_loss_clip": 1.02129078, + "balance_loss_mlp": 1.00570071, + "epoch": 0.6941622076594721, + "flos": 66926286829440.0, + "grad_norm": 0.8567194533967086, + "language_loss": 0.60387361, + "learning_rate": 9.033936358288042e-07, + "loss": 0.62438512, + "num_input_tokens_seen": 124219940, + "step": 5773, + "time_per_iteration": 3.7772185802459717 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.01019786, + "balance_loss_clip": 1.04973841, + "balance_loss_mlp": 1.01241863, + "epoch": 0.6942824505501112, + "flos": 26578062051840.0, + "grad_norm": 1.6319007090460114, + "language_loss": 0.82528526, + "learning_rate": 9.027422809009937e-07, + "loss": 0.84721285, + "num_input_tokens_seen": 124239885, + "step": 5774, + "time_per_iteration": 3.285829544067383 + }, + { + "auxiliary_loss_clip": 0.01157756, + "auxiliary_loss_mlp": 0.01022924, + "balance_loss_clip": 1.04526281, + "balance_loss_mlp": 1.0151968, + "epoch": 0.6944026934407503, + "flos": 21248308056960.0, + "grad_norm": 2.2916353174227546, + "language_loss": 0.83447003, + "learning_rate": 9.020910924263054e-07, + "loss": 0.85627687, + "num_input_tokens_seen": 124258410, + "step": 5775, + "time_per_iteration": 3.2663369178771973 + }, + { + "auxiliary_loss_clip": 0.01041135, + "auxiliary_loss_mlp": 0.01003103, + "balance_loss_clip": 1.01935136, + "balance_loss_mlp": 1.00177944, + "epoch": 0.6945229363313894, + "flos": 70677191537280.0, + "grad_norm": 0.814509682875596, + "language_loss": 0.58151364, + "learning_rate": 9.014400705035261e-07, + "loss": 0.60195601, + "num_input_tokens_seen": 124315315, + "step": 5776, + "time_per_iteration": 3.1413872241973877 + }, + { + "auxiliary_loss_clip": 0.01168767, + "auxiliary_loss_mlp": 0.01022316, + "balance_loss_clip": 1.0490191, + "balance_loss_mlp": 1.01505876, + "epoch": 0.6946431792220285, + "flos": 18952934267520.0, + "grad_norm": 2.153334616481474, + "language_loss": 0.76720333, + "learning_rate": 9.00789215231414e-07, + "loss": 0.78911418, + "num_input_tokens_seen": 124333710, + "step": 5777, + "time_per_iteration": 2.4179766178131104 + }, + { + "auxiliary_loss_clip": 0.01127669, + "auxiliary_loss_mlp": 0.00762715, + "balance_loss_clip": 1.04101658, + "balance_loss_mlp": 1.00096464, + "epoch": 0.6947634221126676, + "flos": 20338834671360.0, + "grad_norm": 1.9511408673333868, + "language_loss": 0.81998861, + "learning_rate": 9.001385267087056e-07, + "loss": 0.83889246, + "num_input_tokens_seen": 124352855, + "step": 5778, + "time_per_iteration": 2.5765485763549805 + }, + { + "auxiliary_loss_clip": 0.01159653, + "auxiliary_loss_mlp": 0.01023974, + "balance_loss_clip": 1.04837692, + "balance_loss_mlp": 1.0165329, + "epoch": 0.6948836650033067, + "flos": 21833723917440.0, + "grad_norm": 1.5781292819691437, + "language_loss": 0.70281094, + "learning_rate": 8.994880050341072e-07, + "loss": 0.72464722, + "num_input_tokens_seen": 124372960, + "step": 5779, + "time_per_iteration": 2.5017287731170654 + }, + { + "auxiliary_loss_clip": 0.01138192, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.04693222, + "balance_loss_mlp": 1.02335, + "epoch": 0.6950039078939457, + "flos": 23657519024640.0, + "grad_norm": 2.223994769401111, + "language_loss": 0.77570754, + "learning_rate": 8.988376503063026e-07, + "loss": 0.79739797, + "num_input_tokens_seen": 124394220, + "step": 5780, + "time_per_iteration": 3.3847079277038574 + }, + { + "auxiliary_loss_clip": 0.01123455, + "auxiliary_loss_mlp": 0.01026186, + "balance_loss_clip": 1.0440433, + "balance_loss_mlp": 1.01839519, + "epoch": 0.6951241507845849, + "flos": 21792462168960.0, + "grad_norm": 1.8729292410937048, + "language_loss": 0.8149842, + "learning_rate": 8.981874626239521e-07, + "loss": 0.83648062, + "num_input_tokens_seen": 124412795, + "step": 5781, + "time_per_iteration": 2.5814573764801025 + }, + { + "auxiliary_loss_clip": 0.01158915, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.05017614, + "balance_loss_mlp": 1.02101827, + "epoch": 0.695244393675224, + "flos": 14647568244480.0, + "grad_norm": 2.303817584262593, + "language_loss": 0.88254905, + "learning_rate": 8.975374420856872e-07, + "loss": 0.90442467, + "num_input_tokens_seen": 124429690, + "step": 5782, + "time_per_iteration": 2.4492545127868652 + }, + { + "auxiliary_loss_clip": 0.01116457, + "auxiliary_loss_mlp": 0.01020988, + "balance_loss_clip": 1.0391103, + "balance_loss_mlp": 1.01385355, + "epoch": 0.695364636565863, + "flos": 16873203778560.0, + "grad_norm": 2.33378515312097, + "language_loss": 0.72823048, + "learning_rate": 8.968875887901157e-07, + "loss": 0.74960482, + "num_input_tokens_seen": 124447070, + "step": 5783, + "time_per_iteration": 2.5050878524780273 + }, + { + "auxiliary_loss_clip": 0.01141947, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.04300082, + "balance_loss_mlp": 1.01993656, + "epoch": 0.6954848794565022, + "flos": 19354523299200.0, + "grad_norm": 1.9239816772774114, + "language_loss": 0.62512809, + "learning_rate": 8.9623790283582e-07, + "loss": 0.64682674, + "num_input_tokens_seen": 124464950, + "step": 5784, + "time_per_iteration": 2.487135887145996 + }, + { + "auxiliary_loss_clip": 0.01133168, + "auxiliary_loss_mlp": 0.0102614, + "balance_loss_clip": 1.04700291, + "balance_loss_mlp": 1.01837397, + "epoch": 0.6956051223471412, + "flos": 18990209606400.0, + "grad_norm": 2.713930509072413, + "language_loss": 0.76222795, + "learning_rate": 8.955883843213561e-07, + "loss": 0.78382105, + "num_input_tokens_seen": 124483965, + "step": 5785, + "time_per_iteration": 2.529085397720337 + }, + { + "auxiliary_loss_clip": 0.01163375, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.04794598, + "balance_loss_mlp": 1.02257836, + "epoch": 0.6957253652377803, + "flos": 16107229226880.0, + "grad_norm": 1.8091184974239243, + "language_loss": 0.86503458, + "learning_rate": 8.949390333452569e-07, + "loss": 0.88697517, + "num_input_tokens_seen": 124501910, + "step": 5786, + "time_per_iteration": 2.466111421585083 + }, + { + "auxiliary_loss_clip": 0.01170156, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.0489068, + "balance_loss_mlp": 1.0206244, + "epoch": 0.6958456081284194, + "flos": 29388646569600.0, + "grad_norm": 1.8568487738214088, + "language_loss": 0.67975342, + "learning_rate": 8.942898500060279e-07, + "loss": 0.7017377, + "num_input_tokens_seen": 124521625, + "step": 5787, + "time_per_iteration": 2.4903724193573 + }, + { + "auxiliary_loss_clip": 0.01121528, + "auxiliary_loss_mlp": 0.01026538, + "balance_loss_clip": 1.04420662, + "balance_loss_mlp": 1.01839018, + "epoch": 0.6959658510190585, + "flos": 25154850395520.0, + "grad_norm": 3.582734226777699, + "language_loss": 0.71141827, + "learning_rate": 8.936408344021493e-07, + "loss": 0.73289895, + "num_input_tokens_seen": 124538540, + "step": 5788, + "time_per_iteration": 2.599661350250244 + }, + { + "auxiliary_loss_clip": 0.01153446, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.04888701, + "balance_loss_mlp": 1.02235949, + "epoch": 0.6960860939096976, + "flos": 42814388759040.0, + "grad_norm": 2.1712103829250937, + "language_loss": 0.71103382, + "learning_rate": 8.929919866320765e-07, + "loss": 0.73288143, + "num_input_tokens_seen": 124559355, + "step": 5789, + "time_per_iteration": 2.67883563041687 + }, + { + "auxiliary_loss_clip": 0.01133656, + "auxiliary_loss_mlp": 0.00762655, + "balance_loss_clip": 1.04284668, + "balance_loss_mlp": 1.00071168, + "epoch": 0.6962063368003367, + "flos": 17566566986880.0, + "grad_norm": 1.8983921607356529, + "language_loss": 0.81731522, + "learning_rate": 8.923433067942385e-07, + "loss": 0.83627838, + "num_input_tokens_seen": 124577920, + "step": 5790, + "time_per_iteration": 2.575538158416748 + }, + { + "auxiliary_loss_clip": 0.01136639, + "auxiliary_loss_mlp": 0.01028044, + "balance_loss_clip": 1.04515505, + "balance_loss_mlp": 1.02049768, + "epoch": 0.6963265796909758, + "flos": 21251648021760.0, + "grad_norm": 1.8386160108721425, + "language_loss": 0.6869688, + "learning_rate": 8.916947949870417e-07, + "loss": 0.7086156, + "num_input_tokens_seen": 124597585, + "step": 5791, + "time_per_iteration": 2.5685558319091797 + }, + { + "auxiliary_loss_clip": 0.01061344, + "auxiliary_loss_mlp": 0.01000353, + "balance_loss_clip": 1.01215482, + "balance_loss_mlp": 0.9993214, + "epoch": 0.6964468225816148, + "flos": 68828295801600.0, + "grad_norm": 0.7407083321040345, + "language_loss": 0.58152992, + "learning_rate": 8.910464513088615e-07, + "loss": 0.60214686, + "num_input_tokens_seen": 124661625, + "step": 5792, + "time_per_iteration": 3.149731159210205 + }, + { + "auxiliary_loss_clip": 0.01136413, + "auxiliary_loss_mlp": 0.0102541, + "balance_loss_clip": 1.04443729, + "balance_loss_mlp": 1.01694632, + "epoch": 0.696567065472254, + "flos": 18950887192320.0, + "grad_norm": 2.408688415343576, + "language_loss": 0.78622675, + "learning_rate": 8.903982758580542e-07, + "loss": 0.80784494, + "num_input_tokens_seen": 124680565, + "step": 5793, + "time_per_iteration": 2.5511281490325928 + }, + { + "auxiliary_loss_clip": 0.01138661, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.04511929, + "balance_loss_mlp": 1.02425051, + "epoch": 0.696687308362893, + "flos": 22856675345280.0, + "grad_norm": 2.142558021599008, + "language_loss": 0.8024115, + "learning_rate": 8.897502687329457e-07, + "loss": 0.82411557, + "num_input_tokens_seen": 124700365, + "step": 5794, + "time_per_iteration": 2.5382840633392334 + }, + { + "auxiliary_loss_clip": 0.01124587, + "auxiliary_loss_mlp": 0.0102689, + "balance_loss_clip": 1.04284477, + "balance_loss_mlp": 1.0196569, + "epoch": 0.6968075512535321, + "flos": 24972926987520.0, + "grad_norm": 1.917346592634715, + "language_loss": 0.79736614, + "learning_rate": 8.891024300318382e-07, + "loss": 0.81888092, + "num_input_tokens_seen": 124718935, + "step": 5795, + "time_per_iteration": 2.5739076137542725 + }, + { + "auxiliary_loss_clip": 0.01118641, + "auxiliary_loss_mlp": 0.01024769, + "balance_loss_clip": 1.04084671, + "balance_loss_mlp": 1.0177896, + "epoch": 0.6969277941441713, + "flos": 21030438113280.0, + "grad_norm": 1.781443236743228, + "language_loss": 0.75803876, + "learning_rate": 8.884547598530103e-07, + "loss": 0.77947289, + "num_input_tokens_seen": 124739505, + "step": 5796, + "time_per_iteration": 2.5664479732513428 + }, + { + "auxiliary_loss_clip": 0.01074162, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.03736615, + "balance_loss_mlp": 1.02207291, + "epoch": 0.6970480370348103, + "flos": 21579404647680.0, + "grad_norm": 1.9189693062556208, + "language_loss": 0.75181365, + "learning_rate": 8.8780725829471e-07, + "loss": 0.77285433, + "num_input_tokens_seen": 124757410, + "step": 5797, + "time_per_iteration": 2.6359987258911133 + }, + { + "auxiliary_loss_clip": 0.0117076, + "auxiliary_loss_mlp": 0.01027686, + "balance_loss_clip": 1.04745817, + "balance_loss_mlp": 1.0197587, + "epoch": 0.6971682799254494, + "flos": 22419175691520.0, + "grad_norm": 2.1547051759435623, + "language_loss": 0.77871734, + "learning_rate": 8.87159925455165e-07, + "loss": 0.80070186, + "num_input_tokens_seen": 124777240, + "step": 5798, + "time_per_iteration": 2.4457144737243652 + }, + { + "auxiliary_loss_clip": 0.01125045, + "auxiliary_loss_mlp": 0.01026742, + "balance_loss_clip": 1.04395771, + "balance_loss_mlp": 1.01949716, + "epoch": 0.6972885228160886, + "flos": 20005834659840.0, + "grad_norm": 1.8496512446664004, + "language_loss": 0.73598135, + "learning_rate": 8.865127614325738e-07, + "loss": 0.75749922, + "num_input_tokens_seen": 124795670, + "step": 5799, + "time_per_iteration": 3.3012166023254395 + }, + { + "auxiliary_loss_clip": 0.01136227, + "auxiliary_loss_mlp": 0.01030845, + "balance_loss_clip": 1.04413366, + "balance_loss_mlp": 1.02254176, + "epoch": 0.6974087657067276, + "flos": 37853437656960.0, + "grad_norm": 1.8021244989868457, + "language_loss": 0.66790116, + "learning_rate": 8.85865766325113e-07, + "loss": 0.68957192, + "num_input_tokens_seen": 124819600, + "step": 5800, + "time_per_iteration": 2.654128313064575 + }, + { + "auxiliary_loss_clip": 0.01139659, + "auxiliary_loss_mlp": 0.01026862, + "balance_loss_clip": 1.04472661, + "balance_loss_mlp": 1.01931, + "epoch": 0.6975290085973667, + "flos": 29489267543040.0, + "grad_norm": 2.3303494178735624, + "language_loss": 0.72198105, + "learning_rate": 8.852189402309287e-07, + "loss": 0.74364626, + "num_input_tokens_seen": 124838785, + "step": 5801, + "time_per_iteration": 3.3319430351257324 + }, + { + "auxiliary_loss_clip": 0.0115672, + "auxiliary_loss_mlp": 0.01027274, + "balance_loss_clip": 1.04762459, + "balance_loss_mlp": 1.02008557, + "epoch": 0.6976492514880057, + "flos": 12895630295040.0, + "grad_norm": 2.3907541469788143, + "language_loss": 0.74371088, + "learning_rate": 8.845722832481441e-07, + "loss": 0.76555085, + "num_input_tokens_seen": 124854215, + "step": 5802, + "time_per_iteration": 3.272151231765747 + }, + { + "auxiliary_loss_clip": 0.01155193, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.04692054, + "balance_loss_mlp": 1.02137375, + "epoch": 0.6977694943786449, + "flos": 24352929308160.0, + "grad_norm": 1.73164584158707, + "language_loss": 0.77193713, + "learning_rate": 8.83925795474858e-07, + "loss": 0.79377848, + "num_input_tokens_seen": 124874340, + "step": 5803, + "time_per_iteration": 2.514629364013672 + }, + { + "auxiliary_loss_clip": 0.01125098, + "auxiliary_loss_mlp": 0.01025479, + "balance_loss_clip": 1.04563975, + "balance_loss_mlp": 1.01742041, + "epoch": 0.6978897372692839, + "flos": 29898470257920.0, + "grad_norm": 2.524735110454635, + "language_loss": 0.59486538, + "learning_rate": 8.832794770091414e-07, + "loss": 0.61637115, + "num_input_tokens_seen": 124895175, + "step": 5804, + "time_per_iteration": 2.588115930557251 + }, + { + "auxiliary_loss_clip": 0.01147109, + "auxiliary_loss_mlp": 0.01026684, + "balance_loss_clip": 1.04532623, + "balance_loss_mlp": 1.01900101, + "epoch": 0.698009980159923, + "flos": 21761579450880.0, + "grad_norm": 2.9084223164717558, + "language_loss": 0.8284986, + "learning_rate": 8.826333279490401e-07, + "loss": 0.85023648, + "num_input_tokens_seen": 124915810, + "step": 5805, + "time_per_iteration": 2.513921022415161 + }, + { + "auxiliary_loss_clip": 0.01145631, + "auxiliary_loss_mlp": 0.01026173, + "balance_loss_clip": 1.0471648, + "balance_loss_mlp": 1.01914561, + "epoch": 0.6981302230505622, + "flos": 19857164267520.0, + "grad_norm": 2.154611988802036, + "language_loss": 0.68246531, + "learning_rate": 8.819873483925748e-07, + "loss": 0.70418334, + "num_input_tokens_seen": 124932930, + "step": 5806, + "time_per_iteration": 2.4990031719207764 + }, + { + "auxiliary_loss_clip": 0.01134315, + "auxiliary_loss_mlp": 0.00762369, + "balance_loss_clip": 1.04751813, + "balance_loss_mlp": 1.00079155, + "epoch": 0.6982504659412012, + "flos": 22198648141440.0, + "grad_norm": 2.986245037250485, + "language_loss": 0.74460667, + "learning_rate": 8.81341538437739e-07, + "loss": 0.76357353, + "num_input_tokens_seen": 124951220, + "step": 5807, + "time_per_iteration": 3.3478171825408936 + }, + { + "auxiliary_loss_clip": 0.01143748, + "auxiliary_loss_mlp": 0.01021593, + "balance_loss_clip": 1.04204118, + "balance_loss_mlp": 1.01399374, + "epoch": 0.6983707088318403, + "flos": 35588479708800.0, + "grad_norm": 1.6209394869554627, + "language_loss": 0.68064862, + "learning_rate": 8.80695898182503e-07, + "loss": 0.70230198, + "num_input_tokens_seen": 124972200, + "step": 5808, + "time_per_iteration": 2.6199772357940674 + }, + { + "auxiliary_loss_clip": 0.01058462, + "auxiliary_loss_mlp": 0.01005986, + "balance_loss_clip": 1.01726437, + "balance_loss_mlp": 1.00475776, + "epoch": 0.6984909517224794, + "flos": 65440052760960.0, + "grad_norm": 0.8279485744592221, + "language_loss": 0.65084863, + "learning_rate": 8.800504277248093e-07, + "loss": 0.67149305, + "num_input_tokens_seen": 125036950, + "step": 5809, + "time_per_iteration": 3.070845603942871 + }, + { + "auxiliary_loss_clip": 0.01124712, + "auxiliary_loss_mlp": 0.00761869, + "balance_loss_clip": 1.04814231, + "balance_loss_mlp": 1.00086069, + "epoch": 0.6986111946131185, + "flos": 18546927863040.0, + "grad_norm": 1.8403391984587307, + "language_loss": 0.75111175, + "learning_rate": 8.794051271625753e-07, + "loss": 0.76997757, + "num_input_tokens_seen": 125054585, + "step": 5810, + "time_per_iteration": 2.5085320472717285 + }, + { + "auxiliary_loss_clip": 0.01142188, + "auxiliary_loss_mlp": 0.01024433, + "balance_loss_clip": 1.04643977, + "balance_loss_mlp": 1.01722407, + "epoch": 0.6987314375037575, + "flos": 23039173370880.0, + "grad_norm": 1.643426703105273, + "language_loss": 0.83174366, + "learning_rate": 8.787599965936925e-07, + "loss": 0.85340989, + "num_input_tokens_seen": 125075515, + "step": 5811, + "time_per_iteration": 2.517460346221924 + }, + { + "auxiliary_loss_clip": 0.01123547, + "auxiliary_loss_mlp": 0.01023711, + "balance_loss_clip": 1.04500937, + "balance_loss_mlp": 1.0166831, + "epoch": 0.6988516803943967, + "flos": 38400393029760.0, + "grad_norm": 1.8008929179536144, + "language_loss": 0.71996403, + "learning_rate": 8.781150361160261e-07, + "loss": 0.7414366, + "num_input_tokens_seen": 125097425, + "step": 5812, + "time_per_iteration": 2.7068285942077637 + }, + { + "auxiliary_loss_clip": 0.0113275, + "auxiliary_loss_mlp": 0.01024781, + "balance_loss_clip": 1.04490256, + "balance_loss_mlp": 1.0169282, + "epoch": 0.6989719232850358, + "flos": 24096993926400.0, + "grad_norm": 1.6103830561800507, + "language_loss": 0.73854303, + "learning_rate": 8.774702458274181e-07, + "loss": 0.76011837, + "num_input_tokens_seen": 125117830, + "step": 5813, + "time_per_iteration": 2.5886244773864746 + }, + { + "auxiliary_loss_clip": 0.01156178, + "auxiliary_loss_mlp": 0.01026706, + "balance_loss_clip": 1.0476017, + "balance_loss_mlp": 1.0187192, + "epoch": 0.6990921661756748, + "flos": 14866838818560.0, + "grad_norm": 2.419478327005229, + "language_loss": 0.70517492, + "learning_rate": 8.768256258256799e-07, + "loss": 0.72700375, + "num_input_tokens_seen": 125134455, + "step": 5814, + "time_per_iteration": 2.4636263847351074 + }, + { + "auxiliary_loss_clip": 0.01159894, + "auxiliary_loss_mlp": 0.01026613, + "balance_loss_clip": 1.04784858, + "balance_loss_mlp": 1.0189538, + "epoch": 0.699212409066314, + "flos": 20193719725440.0, + "grad_norm": 1.9971837282601628, + "language_loss": 0.73730516, + "learning_rate": 8.76181176208602e-07, + "loss": 0.75917023, + "num_input_tokens_seen": 125152555, + "step": 5815, + "time_per_iteration": 2.469125986099243 + }, + { + "auxiliary_loss_clip": 0.01102382, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.03835523, + "balance_loss_mlp": 1.02253258, + "epoch": 0.699332651956953, + "flos": 19427888828160.0, + "grad_norm": 1.7749243290862682, + "language_loss": 0.73783368, + "learning_rate": 8.755368970739461e-07, + "loss": 0.75916499, + "num_input_tokens_seen": 125171915, + "step": 5816, + "time_per_iteration": 2.5621914863586426 + }, + { + "auxiliary_loss_clip": 0.01132916, + "auxiliary_loss_mlp": 0.01026019, + "balance_loss_clip": 1.04294395, + "balance_loss_mlp": 1.01769209, + "epoch": 0.6994528948475921, + "flos": 16143714466560.0, + "grad_norm": 2.3741971266258712, + "language_loss": 0.6154539, + "learning_rate": 8.748927885194479e-07, + "loss": 0.63704324, + "num_input_tokens_seen": 125190220, + "step": 5817, + "time_per_iteration": 2.5181617736816406 + }, + { + "auxiliary_loss_clip": 0.01027034, + "auxiliary_loss_mlp": 0.01003485, + "balance_loss_clip": 1.00934267, + "balance_loss_mlp": 1.00245428, + "epoch": 0.6995731377382313, + "flos": 64952420699520.0, + "grad_norm": 0.7920124531983008, + "language_loss": 0.57398164, + "learning_rate": 8.742488506428209e-07, + "loss": 0.5942868, + "num_input_tokens_seen": 125249310, + "step": 5818, + "time_per_iteration": 3.05526065826416 + }, + { + "auxiliary_loss_clip": 0.01144295, + "auxiliary_loss_mlp": 0.00762323, + "balance_loss_clip": 1.04432845, + "balance_loss_mlp": 1.00074089, + "epoch": 0.6996933806288703, + "flos": 24900136076160.0, + "grad_norm": 1.8556191629871888, + "language_loss": 0.78241491, + "learning_rate": 8.736050835417466e-07, + "loss": 0.80148113, + "num_input_tokens_seen": 125269350, + "step": 5819, + "time_per_iteration": 2.6105899810791016 + }, + { + "auxiliary_loss_clip": 0.01161003, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.04764593, + "balance_loss_mlp": 1.02174735, + "epoch": 0.6998136235195094, + "flos": 20777806782720.0, + "grad_norm": 2.335277726675155, + "language_loss": 0.61200351, + "learning_rate": 8.729614873138862e-07, + "loss": 0.63391066, + "num_input_tokens_seen": 125286985, + "step": 5820, + "time_per_iteration": 2.477769374847412 + }, + { + "auxiliary_loss_clip": 0.01123957, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.04751921, + "balance_loss_mlp": 1.02125072, + "epoch": 0.6999338664101485, + "flos": 23733470332800.0, + "grad_norm": 2.138467753487016, + "language_loss": 0.77428126, + "learning_rate": 8.723180620568716e-07, + "loss": 0.79581547, + "num_input_tokens_seen": 125306240, + "step": 5821, + "time_per_iteration": 2.596944570541382 + }, + { + "auxiliary_loss_clip": 0.01145953, + "auxiliary_loss_mlp": 0.01024964, + "balance_loss_clip": 1.0449264, + "balance_loss_mlp": 1.01737642, + "epoch": 0.7000541093007876, + "flos": 19864598382720.0, + "grad_norm": 1.9435210471889302, + "language_loss": 0.84993625, + "learning_rate": 8.716748078683116e-07, + "loss": 0.87164539, + "num_input_tokens_seen": 125323015, + "step": 5822, + "time_per_iteration": 2.4902496337890625 + }, + { + "auxiliary_loss_clip": 0.01075045, + "auxiliary_loss_mlp": 0.01029655, + "balance_loss_clip": 1.03730845, + "balance_loss_mlp": 1.02072072, + "epoch": 0.7001743521914267, + "flos": 29679056029440.0, + "grad_norm": 2.1068790649343594, + "language_loss": 0.68641114, + "learning_rate": 8.710317248457855e-07, + "loss": 0.70745814, + "num_input_tokens_seen": 125342630, + "step": 5823, + "time_per_iteration": 2.752711057662964 + }, + { + "auxiliary_loss_clip": 0.0114081, + "auxiliary_loss_mlp": 0.01024702, + "balance_loss_clip": 1.04694867, + "balance_loss_mlp": 1.01699519, + "epoch": 0.7002945950820658, + "flos": 27489762080640.0, + "grad_norm": 6.314689392412598, + "language_loss": 0.72088969, + "learning_rate": 8.703888130868482e-07, + "loss": 0.74254489, + "num_input_tokens_seen": 125364480, + "step": 5824, + "time_per_iteration": 2.994812250137329 + }, + { + "auxiliary_loss_clip": 0.01128242, + "auxiliary_loss_mlp": 0.010248, + "balance_loss_clip": 1.0443902, + "balance_loss_mlp": 1.01779056, + "epoch": 0.7004148379727049, + "flos": 22158463800960.0, + "grad_norm": 1.994126337902295, + "language_loss": 0.82040584, + "learning_rate": 8.697460726890307e-07, + "loss": 0.84193623, + "num_input_tokens_seen": 125381625, + "step": 5825, + "time_per_iteration": 2.5657927989959717 + }, + { + "auxiliary_loss_clip": 0.01126412, + "auxiliary_loss_mlp": 0.00762486, + "balance_loss_clip": 1.04051495, + "balance_loss_mlp": 1.00070822, + "epoch": 0.7005350808633439, + "flos": 19423758764160.0, + "grad_norm": 2.1103622392386154, + "language_loss": 0.9020102, + "learning_rate": 8.691035037498354e-07, + "loss": 0.92089915, + "num_input_tokens_seen": 125397615, + "step": 5826, + "time_per_iteration": 3.2855372428894043 + }, + { + "auxiliary_loss_clip": 0.0113912, + "auxiliary_loss_mlp": 0.01026154, + "balance_loss_clip": 1.04294884, + "balance_loss_mlp": 1.01846516, + "epoch": 0.7006553237539831, + "flos": 23476708938240.0, + "grad_norm": 1.6781584942159407, + "language_loss": 0.72585011, + "learning_rate": 8.684611063667391e-07, + "loss": 0.74750286, + "num_input_tokens_seen": 125418080, + "step": 5827, + "time_per_iteration": 2.574211359024048 + }, + { + "auxiliary_loss_clip": 0.01155034, + "auxiliary_loss_mlp": 0.0102323, + "balance_loss_clip": 1.04474998, + "balance_loss_mlp": 1.01612568, + "epoch": 0.7007755666446221, + "flos": 31212872640000.0, + "grad_norm": 1.7841796530816159, + "language_loss": 0.77272689, + "learning_rate": 8.678188806371935e-07, + "loss": 0.79450959, + "num_input_tokens_seen": 125440115, + "step": 5828, + "time_per_iteration": 4.14251708984375 + }, + { + "auxiliary_loss_clip": 0.01155346, + "auxiliary_loss_mlp": 0.01022477, + "balance_loss_clip": 1.04503477, + "balance_loss_mlp": 1.01610184, + "epoch": 0.7008958095352612, + "flos": 18149899858560.0, + "grad_norm": 1.7298485076057042, + "language_loss": 0.85361195, + "learning_rate": 8.671768266586228e-07, + "loss": 0.87539023, + "num_input_tokens_seen": 125458240, + "step": 5829, + "time_per_iteration": 2.4446144104003906 + }, + { + "auxiliary_loss_clip": 0.01125472, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.04238105, + "balance_loss_mlp": 1.02088785, + "epoch": 0.7010160524259004, + "flos": 27452307173760.0, + "grad_norm": 1.7839515234841306, + "language_loss": 0.78305697, + "learning_rate": 8.665349445284275e-07, + "loss": 0.8045944, + "num_input_tokens_seen": 125477980, + "step": 5830, + "time_per_iteration": 2.5928051471710205 + }, + { + "auxiliary_loss_clip": 0.0112555, + "auxiliary_loss_mlp": 0.0102127, + "balance_loss_clip": 1.04392517, + "balance_loss_mlp": 1.01367366, + "epoch": 0.7011362953165394, + "flos": 23842064125440.0, + "grad_norm": 1.5394305142509346, + "language_loss": 0.80969107, + "learning_rate": 8.658932343439799e-07, + "loss": 0.83115923, + "num_input_tokens_seen": 125497765, + "step": 5831, + "time_per_iteration": 2.603816032409668 + }, + { + "auxiliary_loss_clip": 0.01171707, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.04826248, + "balance_loss_mlp": 1.02064383, + "epoch": 0.7012565382071785, + "flos": 24823430582400.0, + "grad_norm": 2.244008130668379, + "language_loss": 0.77675533, + "learning_rate": 8.65251696202627e-07, + "loss": 0.79876101, + "num_input_tokens_seen": 125514145, + "step": 5832, + "time_per_iteration": 2.525167465209961 + }, + { + "auxiliary_loss_clip": 0.01130569, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.04581439, + "balance_loss_mlp": 1.01779246, + "epoch": 0.7013767810978175, + "flos": 21397445326080.0, + "grad_norm": 1.94637270015614, + "language_loss": 0.87585527, + "learning_rate": 8.646103302016896e-07, + "loss": 0.89741588, + "num_input_tokens_seen": 125533115, + "step": 5833, + "time_per_iteration": 3.334350824356079 + }, + { + "auxiliary_loss_clip": 0.01125567, + "auxiliary_loss_mlp": 0.010256, + "balance_loss_clip": 1.04288304, + "balance_loss_mlp": 1.01764929, + "epoch": 0.7014970239884567, + "flos": 16687150306560.0, + "grad_norm": 2.9839322819253677, + "language_loss": 0.88563555, + "learning_rate": 8.639691364384614e-07, + "loss": 0.90714723, + "num_input_tokens_seen": 125550740, + "step": 5834, + "time_per_iteration": 2.546884059906006 + }, + { + "auxiliary_loss_clip": 0.01141562, + "auxiliary_loss_mlp": 0.01026343, + "balance_loss_clip": 1.04432595, + "balance_loss_mlp": 1.01867199, + "epoch": 0.7016172668790958, + "flos": 12568268718720.0, + "grad_norm": 1.8335874528151737, + "language_loss": 0.72775251, + "learning_rate": 8.633281150102136e-07, + "loss": 0.74943155, + "num_input_tokens_seen": 125567590, + "step": 5835, + "time_per_iteration": 2.474947929382324 + }, + { + "auxiliary_loss_clip": 0.01141678, + "auxiliary_loss_mlp": 0.01020302, + "balance_loss_clip": 1.04623783, + "balance_loss_mlp": 1.01304781, + "epoch": 0.7017375097697348, + "flos": 17452729808640.0, + "grad_norm": 3.3847401913282607, + "language_loss": 0.67892802, + "learning_rate": 8.626872660141855e-07, + "loss": 0.70054781, + "num_input_tokens_seen": 125585500, + "step": 5836, + "time_per_iteration": 2.504289150238037 + }, + { + "auxiliary_loss_clip": 0.01114549, + "auxiliary_loss_mlp": 0.01025846, + "balance_loss_clip": 1.04365087, + "balance_loss_mlp": 1.0183177, + "epoch": 0.701857752660374, + "flos": 18513028402560.0, + "grad_norm": 1.6746861837980764, + "language_loss": 0.74554241, + "learning_rate": 8.620465895475957e-07, + "loss": 0.76694638, + "num_input_tokens_seen": 125603720, + "step": 5837, + "time_per_iteration": 2.5421841144561768 + }, + { + "auxiliary_loss_clip": 0.01109104, + "auxiliary_loss_mlp": 0.01024799, + "balance_loss_clip": 1.04257858, + "balance_loss_mlp": 1.01725936, + "epoch": 0.701977995551013, + "flos": 24425971614720.0, + "grad_norm": 2.0315965942336045, + "language_loss": 0.75187039, + "learning_rate": 8.614060857076333e-07, + "loss": 0.77320939, + "num_input_tokens_seen": 125624390, + "step": 5838, + "time_per_iteration": 2.611539602279663 + }, + { + "auxiliary_loss_clip": 0.01135292, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.04252601, + "balance_loss_mlp": 1.0211544, + "epoch": 0.7020982384416521, + "flos": 23002759958400.0, + "grad_norm": 1.7624523778356203, + "language_loss": 0.74665874, + "learning_rate": 8.60765754591462e-07, + "loss": 0.76830173, + "num_input_tokens_seen": 125644085, + "step": 5839, + "time_per_iteration": 2.536172389984131 + }, + { + "auxiliary_loss_clip": 0.01167777, + "auxiliary_loss_mlp": 0.01023012, + "balance_loss_clip": 1.04638016, + "balance_loss_mlp": 1.01553738, + "epoch": 0.7022184813322913, + "flos": 20449080489600.0, + "grad_norm": 1.8635915019298113, + "language_loss": 0.72793198, + "learning_rate": 8.601255962962211e-07, + "loss": 0.74983978, + "num_input_tokens_seen": 125663095, + "step": 5840, + "time_per_iteration": 2.4382526874542236 + }, + { + "auxiliary_loss_clip": 0.01168001, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.0505507, + "balance_loss_mlp": 1.0230813, + "epoch": 0.7023387242229303, + "flos": 19790514581760.0, + "grad_norm": 2.7230724289694432, + "language_loss": 0.72423166, + "learning_rate": 8.594856109190194e-07, + "loss": 0.74622911, + "num_input_tokens_seen": 125680125, + "step": 5841, + "time_per_iteration": 2.4448909759521484 + }, + { + "auxiliary_loss_clip": 0.01170538, + "auxiliary_loss_mlp": 0.0102436, + "balance_loss_clip": 1.04792666, + "balance_loss_mlp": 1.0166415, + "epoch": 0.7024589671135694, + "flos": 33259278286080.0, + "grad_norm": 1.681828365799491, + "language_loss": 0.69415557, + "learning_rate": 8.588457985569446e-07, + "loss": 0.71610451, + "num_input_tokens_seen": 125703035, + "step": 5842, + "time_per_iteration": 2.555363178253174 + }, + { + "auxiliary_loss_clip": 0.01174143, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.0490818, + "balance_loss_mlp": 1.02308059, + "epoch": 0.7025792100042085, + "flos": 19098982967040.0, + "grad_norm": 2.0881576610332844, + "language_loss": 0.71731907, + "learning_rate": 8.582061593070542e-07, + "loss": 0.73936677, + "num_input_tokens_seen": 125723765, + "step": 5843, + "time_per_iteration": 2.438732862472534 + }, + { + "auxiliary_loss_clip": 0.01171772, + "auxiliary_loss_mlp": 0.00762534, + "balance_loss_clip": 1.04866564, + "balance_loss_mlp": 1.00070775, + "epoch": 0.7026994528948476, + "flos": 18952611045120.0, + "grad_norm": 2.0767567261088065, + "language_loss": 0.76716697, + "learning_rate": 8.57566693266383e-07, + "loss": 0.78650999, + "num_input_tokens_seen": 125741455, + "step": 5844, + "time_per_iteration": 2.423555612564087 + }, + { + "auxiliary_loss_clip": 0.01146216, + "auxiliary_loss_mlp": 0.00762795, + "balance_loss_clip": 1.04437327, + "balance_loss_mlp": 1.00070047, + "epoch": 0.7028196957854866, + "flos": 19536662188800.0, + "grad_norm": 2.2664780082099356, + "language_loss": 0.6930933, + "learning_rate": 8.569274005319354e-07, + "loss": 0.71218336, + "num_input_tokens_seen": 125759855, + "step": 5845, + "time_per_iteration": 2.4844822883605957 + }, + { + "auxiliary_loss_clip": 0.01151764, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.04456735, + "balance_loss_mlp": 1.01902008, + "epoch": 0.7029399386761258, + "flos": 20845318394880.0, + "grad_norm": 1.6582961871230977, + "language_loss": 0.79404074, + "learning_rate": 8.562882812006913e-07, + "loss": 0.81582737, + "num_input_tokens_seen": 125777345, + "step": 5846, + "time_per_iteration": 2.4773690700531006 + }, + { + "auxiliary_loss_clip": 0.01168922, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.0475173, + "balance_loss_mlp": 1.01943457, + "epoch": 0.7030601815667649, + "flos": 22055005653120.0, + "grad_norm": 1.83633803627709, + "language_loss": 0.77652228, + "learning_rate": 8.556493353696066e-07, + "loss": 0.79848242, + "num_input_tokens_seen": 125796345, + "step": 5847, + "time_per_iteration": 2.4281046390533447 + }, + { + "auxiliary_loss_clip": 0.01161695, + "auxiliary_loss_mlp": 0.00762765, + "balance_loss_clip": 1.05030251, + "balance_loss_mlp": 1.00061965, + "epoch": 0.7031804244574039, + "flos": 27198742089600.0, + "grad_norm": 2.203388625843661, + "language_loss": 0.67827237, + "learning_rate": 8.550105631356077e-07, + "loss": 0.69751692, + "num_input_tokens_seen": 125816070, + "step": 5848, + "time_per_iteration": 2.5395829677581787 + }, + { + "auxiliary_loss_clip": 0.01123534, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.04133582, + "balance_loss_mlp": 1.02257204, + "epoch": 0.7033006673480431, + "flos": 22379853277440.0, + "grad_norm": 2.1530762074937893, + "language_loss": 0.77128023, + "learning_rate": 8.543719645955961e-07, + "loss": 0.79282373, + "num_input_tokens_seen": 125834400, + "step": 5849, + "time_per_iteration": 2.536809206008911 + }, + { + "auxiliary_loss_clip": 0.01143884, + "auxiliary_loss_mlp": 0.01024602, + "balance_loss_clip": 1.0452708, + "balance_loss_mlp": 1.01708031, + "epoch": 0.7034209102386821, + "flos": 24715986024960.0, + "grad_norm": 1.6659799724025413, + "language_loss": 0.74748909, + "learning_rate": 8.537335398464467e-07, + "loss": 0.76917392, + "num_input_tokens_seen": 125854720, + "step": 5850, + "time_per_iteration": 2.535217761993408 + }, + { + "auxiliary_loss_clip": 0.01140559, + "auxiliary_loss_mlp": 0.0102997, + "balance_loss_clip": 1.04148757, + "balance_loss_mlp": 1.02212596, + "epoch": 0.7035411531293212, + "flos": 22556174163840.0, + "grad_norm": 2.9340451890098955, + "language_loss": 0.85302377, + "learning_rate": 8.53095288985007e-07, + "loss": 0.87472904, + "num_input_tokens_seen": 125868455, + "step": 5851, + "time_per_iteration": 2.479238510131836 + }, + { + "auxiliary_loss_clip": 0.01168231, + "auxiliary_loss_mlp": 0.01021739, + "balance_loss_clip": 1.04775, + "balance_loss_mlp": 1.01443124, + "epoch": 0.7036613960199604, + "flos": 22674967418880.0, + "grad_norm": 1.6845936774764605, + "language_loss": 0.81974816, + "learning_rate": 8.524572121081009e-07, + "loss": 0.84164786, + "num_input_tokens_seen": 125888555, + "step": 5852, + "time_per_iteration": 2.503995895385742 + }, + { + "auxiliary_loss_clip": 0.01160205, + "auxiliary_loss_mlp": 0.01025537, + "balance_loss_clip": 1.04568577, + "balance_loss_mlp": 1.01800871, + "epoch": 0.7037816389105994, + "flos": 22492146170880.0, + "grad_norm": 2.19356857815059, + "language_loss": 0.62635916, + "learning_rate": 8.518193093125232e-07, + "loss": 0.64821661, + "num_input_tokens_seen": 125907610, + "step": 5853, + "time_per_iteration": 3.1998581886291504 + }, + { + "auxiliary_loss_clip": 0.01147434, + "auxiliary_loss_mlp": 0.01026898, + "balance_loss_clip": 1.04625452, + "balance_loss_mlp": 1.01981151, + "epoch": 0.7039018818012385, + "flos": 27087490690560.0, + "grad_norm": 2.00873270657036, + "language_loss": 0.81410146, + "learning_rate": 8.511815806950436e-07, + "loss": 0.83584481, + "num_input_tokens_seen": 125928640, + "step": 5854, + "time_per_iteration": 2.5467886924743652 + }, + { + "auxiliary_loss_clip": 0.01154596, + "auxiliary_loss_mlp": 0.0102418, + "balance_loss_clip": 1.04340947, + "balance_loss_mlp": 1.01664591, + "epoch": 0.7040221246918776, + "flos": 17749819198080.0, + "grad_norm": 1.7916228438880308, + "language_loss": 0.78372288, + "learning_rate": 8.505440263524044e-07, + "loss": 0.80551064, + "num_input_tokens_seen": 125947485, + "step": 5855, + "time_per_iteration": 4.00084924697876 + }, + { + "auxiliary_loss_clip": 0.01157411, + "auxiliary_loss_mlp": 0.01024745, + "balance_loss_clip": 1.04433894, + "balance_loss_mlp": 1.01651406, + "epoch": 0.7041423675825167, + "flos": 16279851012480.0, + "grad_norm": 2.8193469178895105, + "language_loss": 0.87965637, + "learning_rate": 8.49906646381322e-07, + "loss": 0.90147793, + "num_input_tokens_seen": 125960320, + "step": 5856, + "time_per_iteration": 2.4179186820983887 + }, + { + "auxiliary_loss_clip": 0.01129511, + "auxiliary_loss_mlp": 0.01023822, + "balance_loss_clip": 1.04460835, + "balance_loss_mlp": 1.01678884, + "epoch": 0.7042626104731557, + "flos": 25483181639040.0, + "grad_norm": 1.745029941591506, + "language_loss": 0.72087008, + "learning_rate": 8.492694408784884e-07, + "loss": 0.74240339, + "num_input_tokens_seen": 125980575, + "step": 5857, + "time_per_iteration": 2.5728137493133545 + }, + { + "auxiliary_loss_clip": 0.01160192, + "auxiliary_loss_mlp": 0.01025141, + "balance_loss_clip": 1.04743242, + "balance_loss_mlp": 1.01756525, + "epoch": 0.7043828533637949, + "flos": 17857622891520.0, + "grad_norm": 2.4724698897639925, + "language_loss": 0.62268066, + "learning_rate": 8.486324099405642e-07, + "loss": 0.64453399, + "num_input_tokens_seen": 125997420, + "step": 5858, + "time_per_iteration": 2.431140422821045 + }, + { + "auxiliary_loss_clip": 0.01152795, + "auxiliary_loss_mlp": 0.01025303, + "balance_loss_clip": 1.04348052, + "balance_loss_mlp": 1.01825142, + "epoch": 0.704503096254434, + "flos": 29494259533440.0, + "grad_norm": 1.6070552245483154, + "language_loss": 0.74812412, + "learning_rate": 8.479955536641887e-07, + "loss": 0.76990509, + "num_input_tokens_seen": 126018915, + "step": 5859, + "time_per_iteration": 3.292357921600342 + }, + { + "auxiliary_loss_clip": 0.01133282, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.03918469, + "balance_loss_mlp": 1.0203073, + "epoch": 0.704623339145073, + "flos": 30920739327360.0, + "grad_norm": 2.303698617866176, + "language_loss": 0.66346455, + "learning_rate": 8.473588721459716e-07, + "loss": 0.68507493, + "num_input_tokens_seen": 126038825, + "step": 5860, + "time_per_iteration": 2.5638792514801025 + }, + { + "auxiliary_loss_clip": 0.01160516, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.05090666, + "balance_loss_mlp": 1.02553892, + "epoch": 0.7047435820357122, + "flos": 23914747296000.0, + "grad_norm": 2.026668239920602, + "language_loss": 0.7075426, + "learning_rate": 8.467223654824967e-07, + "loss": 0.72948802, + "num_input_tokens_seen": 126058280, + "step": 5861, + "time_per_iteration": 2.484870433807373 + }, + { + "auxiliary_loss_clip": 0.01150154, + "auxiliary_loss_mlp": 0.01024805, + "balance_loss_clip": 1.04526865, + "balance_loss_mlp": 1.01719975, + "epoch": 0.7048638249263512, + "flos": 46494010926720.0, + "grad_norm": 2.4069993875739932, + "language_loss": 0.62345612, + "learning_rate": 8.460860337703233e-07, + "loss": 0.6452058, + "num_input_tokens_seen": 126078885, + "step": 5862, + "time_per_iteration": 2.6606500148773193 + }, + { + "auxiliary_loss_clip": 0.01113431, + "auxiliary_loss_mlp": 0.01029555, + "balance_loss_clip": 1.0391562, + "balance_loss_mlp": 1.02100742, + "epoch": 0.7049840678169903, + "flos": 21689219502720.0, + "grad_norm": 1.801429457479863, + "language_loss": 0.7045964, + "learning_rate": 8.454498771059797e-07, + "loss": 0.7260263, + "num_input_tokens_seen": 126098260, + "step": 5863, + "time_per_iteration": 2.689892292022705 + }, + { + "auxiliary_loss_clip": 0.01108278, + "auxiliary_loss_mlp": 0.01023977, + "balance_loss_clip": 1.04101193, + "balance_loss_mlp": 1.01554322, + "epoch": 0.7051043107076294, + "flos": 18405081054720.0, + "grad_norm": 2.3019998779068724, + "language_loss": 0.83539176, + "learning_rate": 8.448138955859725e-07, + "loss": 0.85671437, + "num_input_tokens_seen": 126114845, + "step": 5864, + "time_per_iteration": 2.5458991527557373 + }, + { + "auxiliary_loss_clip": 0.01142554, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.0451467, + "balance_loss_mlp": 1.01920938, + "epoch": 0.7052245535982685, + "flos": 19319043640320.0, + "grad_norm": 2.2257636752327774, + "language_loss": 0.90148711, + "learning_rate": 8.44178089306778e-07, + "loss": 0.92318147, + "num_input_tokens_seen": 126132780, + "step": 5865, + "time_per_iteration": 2.491848945617676 + }, + { + "auxiliary_loss_clip": 0.01170216, + "auxiliary_loss_mlp": 0.01024864, + "balance_loss_clip": 1.04840612, + "balance_loss_mlp": 1.0177834, + "epoch": 0.7053447964889076, + "flos": 19062138591360.0, + "grad_norm": 2.1397001815357513, + "language_loss": 0.77207023, + "learning_rate": 8.4354245836485e-07, + "loss": 0.79402107, + "num_input_tokens_seen": 126151225, + "step": 5866, + "time_per_iteration": 2.416663885116577 + }, + { + "auxiliary_loss_clip": 0.01128972, + "auxiliary_loss_mlp": 0.01024581, + "balance_loss_clip": 1.04392004, + "balance_loss_mlp": 1.01583159, + "epoch": 0.7054650393795466, + "flos": 27379228953600.0, + "grad_norm": 1.649279873678331, + "language_loss": 0.72869444, + "learning_rate": 8.429070028566108e-07, + "loss": 0.75022995, + "num_input_tokens_seen": 126172535, + "step": 5867, + "time_per_iteration": 2.62290620803833 + }, + { + "auxiliary_loss_clip": 0.01154946, + "auxiliary_loss_mlp": 0.01026739, + "balance_loss_clip": 1.04680419, + "balance_loss_mlp": 1.01889503, + "epoch": 0.7055852822701858, + "flos": 16102201322880.0, + "grad_norm": 1.9611309661268637, + "language_loss": 0.74812335, + "learning_rate": 8.422717228784586e-07, + "loss": 0.76994026, + "num_input_tokens_seen": 126189410, + "step": 5868, + "time_per_iteration": 2.4384267330169678 + }, + { + "auxiliary_loss_clip": 0.01113655, + "auxiliary_loss_mlp": 0.0102818, + "balance_loss_clip": 1.0451299, + "balance_loss_mlp": 1.02028811, + "epoch": 0.7057055251608249, + "flos": 11692299744000.0, + "grad_norm": 1.9164848124035871, + "language_loss": 0.69439709, + "learning_rate": 8.416366185267663e-07, + "loss": 0.71581542, + "num_input_tokens_seen": 126206910, + "step": 5869, + "time_per_iteration": 2.547628879547119 + }, + { + "auxiliary_loss_clip": 0.01155054, + "auxiliary_loss_mlp": 0.01022035, + "balance_loss_clip": 1.04390693, + "balance_loss_mlp": 1.01467931, + "epoch": 0.7058257680514639, + "flos": 22711560399360.0, + "grad_norm": 1.6109998178060325, + "language_loss": 0.77786231, + "learning_rate": 8.410016898978778e-07, + "loss": 0.79963315, + "num_input_tokens_seen": 126224385, + "step": 5870, + "time_per_iteration": 2.477860450744629 + }, + { + "auxiliary_loss_clip": 0.01113572, + "auxiliary_loss_mlp": 0.01023787, + "balance_loss_clip": 1.04369199, + "balance_loss_mlp": 1.01670289, + "epoch": 0.7059460109421031, + "flos": 17529543043200.0, + "grad_norm": 1.8807070321760773, + "language_loss": 0.78681779, + "learning_rate": 8.403669370881115e-07, + "loss": 0.80819142, + "num_input_tokens_seen": 126243120, + "step": 5871, + "time_per_iteration": 2.5686094760894775 + }, + { + "auxiliary_loss_clip": 0.01171073, + "auxiliary_loss_mlp": 0.01027768, + "balance_loss_clip": 1.04881728, + "balance_loss_mlp": 1.02061546, + "epoch": 0.7060662538327421, + "flos": 23544687427200.0, + "grad_norm": 1.834786048811903, + "language_loss": 0.78316891, + "learning_rate": 8.397323601937587e-07, + "loss": 0.8051573, + "num_input_tokens_seen": 126263020, + "step": 5872, + "time_per_iteration": 2.4964704513549805 + }, + { + "auxiliary_loss_clip": 0.01122434, + "auxiliary_loss_mlp": 0.01027997, + "balance_loss_clip": 1.04281652, + "balance_loss_mlp": 1.02076077, + "epoch": 0.7061864967233812, + "flos": 30260736875520.0, + "grad_norm": 2.2736669221895025, + "language_loss": 0.77036929, + "learning_rate": 8.390979593110838e-07, + "loss": 0.79187363, + "num_input_tokens_seen": 126285150, + "step": 5873, + "time_per_iteration": 2.6393537521362305 + }, + { + "auxiliary_loss_clip": 0.01147923, + "auxiliary_loss_mlp": 0.01027956, + "balance_loss_clip": 1.04769945, + "balance_loss_mlp": 1.01977849, + "epoch": 0.7063067396140204, + "flos": 20701460424960.0, + "grad_norm": 2.651741001358414, + "language_loss": 0.81819785, + "learning_rate": 8.384637345363262e-07, + "loss": 0.8399567, + "num_input_tokens_seen": 126304340, + "step": 5874, + "time_per_iteration": 2.5060410499572754 + }, + { + "auxiliary_loss_clip": 0.01134429, + "auxiliary_loss_mlp": 0.01023193, + "balance_loss_clip": 1.04119325, + "balance_loss_mlp": 1.01536691, + "epoch": 0.7064269825046594, + "flos": 32266168081920.0, + "grad_norm": 1.7027014360429982, + "language_loss": 0.7660079, + "learning_rate": 8.378296859656964e-07, + "loss": 0.78758413, + "num_input_tokens_seen": 126325495, + "step": 5875, + "time_per_iteration": 2.5762481689453125 + }, + { + "auxiliary_loss_clip": 0.0114327, + "auxiliary_loss_mlp": 0.0102746, + "balance_loss_clip": 1.04504943, + "balance_loss_mlp": 1.01996744, + "epoch": 0.7065472253952985, + "flos": 30227124723840.0, + "grad_norm": 2.0129196247743466, + "language_loss": 0.68448126, + "learning_rate": 8.371958136953792e-07, + "loss": 0.70618856, + "num_input_tokens_seen": 126345525, + "step": 5876, + "time_per_iteration": 2.614086151123047 + }, + { + "auxiliary_loss_clip": 0.01132105, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.04183471, + "balance_loss_mlp": 1.02038682, + "epoch": 0.7066674682859376, + "flos": 16216720859520.0, + "grad_norm": 2.3455321671875695, + "language_loss": 0.66198158, + "learning_rate": 8.365621178215326e-07, + "loss": 0.6835897, + "num_input_tokens_seen": 126361995, + "step": 5877, + "time_per_iteration": 2.5090131759643555 + }, + { + "auxiliary_loss_clip": 0.01150958, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.04466259, + "balance_loss_mlp": 1.02063751, + "epoch": 0.7067877111765767, + "flos": 14830461319680.0, + "grad_norm": 2.8126569507204864, + "language_loss": 0.75098372, + "learning_rate": 8.359285984402871e-07, + "loss": 0.77276945, + "num_input_tokens_seen": 126379260, + "step": 5878, + "time_per_iteration": 2.5350582599639893 + }, + { + "auxiliary_loss_clip": 0.01135588, + "auxiliary_loss_mlp": 0.01024373, + "balance_loss_clip": 1.0437696, + "balance_loss_mlp": 1.01720583, + "epoch": 0.7069079540672157, + "flos": 25440196037760.0, + "grad_norm": 2.0222816162149955, + "language_loss": 0.73980623, + "learning_rate": 8.352952556477489e-07, + "loss": 0.76140583, + "num_input_tokens_seen": 126397170, + "step": 5879, + "time_per_iteration": 2.5012359619140625 + }, + { + "auxiliary_loss_clip": 0.01155555, + "auxiliary_loss_mlp": 0.01023823, + "balance_loss_clip": 1.0468123, + "balance_loss_mlp": 1.01610982, + "epoch": 0.7070281969578549, + "flos": 24607751368320.0, + "grad_norm": 1.9958709262736434, + "language_loss": 0.76663536, + "learning_rate": 8.34662089539993e-07, + "loss": 0.78842914, + "num_input_tokens_seen": 126416680, + "step": 5880, + "time_per_iteration": 3.2567172050476074 + }, + { + "auxiliary_loss_clip": 0.01167965, + "auxiliary_loss_mlp": 0.01020218, + "balance_loss_clip": 1.04786062, + "balance_loss_mlp": 1.01288378, + "epoch": 0.707148439848494, + "flos": 26724469887360.0, + "grad_norm": 2.1857821770381, + "language_loss": 0.78739727, + "learning_rate": 8.340291002130722e-07, + "loss": 0.8092792, + "num_input_tokens_seen": 126435870, + "step": 5881, + "time_per_iteration": 3.250929355621338 + }, + { + "auxiliary_loss_clip": 0.01172145, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.04839277, + "balance_loss_mlp": 1.01998866, + "epoch": 0.707268682739133, + "flos": 15085750256640.0, + "grad_norm": 2.2810740879387676, + "language_loss": 0.79169297, + "learning_rate": 8.3339628776301e-07, + "loss": 0.81369495, + "num_input_tokens_seen": 126454010, + "step": 5882, + "time_per_iteration": 3.156297445297241 + }, + { + "auxiliary_loss_clip": 0.01168218, + "auxiliary_loss_mlp": 0.01025897, + "balance_loss_clip": 1.04623413, + "balance_loss_mlp": 1.01849365, + "epoch": 0.7073889256297722, + "flos": 34313148345600.0, + "grad_norm": 19.025319343737912, + "language_loss": 0.56832695, + "learning_rate": 8.327636522858033e-07, + "loss": 0.59026808, + "num_input_tokens_seen": 126473615, + "step": 5883, + "time_per_iteration": 2.532832384109497 + }, + { + "auxiliary_loss_clip": 0.01114399, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.04420662, + "balance_loss_mlp": 1.01991105, + "epoch": 0.7075091685204112, + "flos": 20083940784000.0, + "grad_norm": 1.8941199187749305, + "language_loss": 0.77130872, + "learning_rate": 8.321311938774225e-07, + "loss": 0.79273051, + "num_input_tokens_seen": 126492705, + "step": 5884, + "time_per_iteration": 2.5650076866149902 + }, + { + "auxiliary_loss_clip": 0.01172673, + "auxiliary_loss_mlp": 0.01025649, + "balance_loss_clip": 1.04776216, + "balance_loss_mlp": 1.01783502, + "epoch": 0.7076294114110503, + "flos": 20777124424320.0, + "grad_norm": 2.002775813109256, + "language_loss": 0.79060125, + "learning_rate": 8.314989126338104e-07, + "loss": 0.81258452, + "num_input_tokens_seen": 126512715, + "step": 5885, + "time_per_iteration": 2.485186815261841 + }, + { + "auxiliary_loss_clip": 0.0116012, + "auxiliary_loss_mlp": 0.01026106, + "balance_loss_clip": 1.04671347, + "balance_loss_mlp": 1.01829517, + "epoch": 0.7077496543016895, + "flos": 17967689141760.0, + "grad_norm": 1.8641236449403944, + "language_loss": 0.84220386, + "learning_rate": 8.308668086508847e-07, + "loss": 0.86406612, + "num_input_tokens_seen": 126530795, + "step": 5886, + "time_per_iteration": 3.1645922660827637 + }, + { + "auxiliary_loss_clip": 0.01131632, + "auxiliary_loss_mlp": 0.01021088, + "balance_loss_clip": 1.04154408, + "balance_loss_mlp": 1.01305294, + "epoch": 0.7078698971923285, + "flos": 45478098564480.0, + "grad_norm": 1.9847594964925435, + "language_loss": 0.73568553, + "learning_rate": 8.302348820245342e-07, + "loss": 0.75721276, + "num_input_tokens_seen": 126553360, + "step": 5887, + "time_per_iteration": 2.749131441116333 + }, + { + "auxiliary_loss_clip": 0.01128035, + "auxiliary_loss_mlp": 0.01027601, + "balance_loss_clip": 1.04190838, + "balance_loss_mlp": 1.01920879, + "epoch": 0.7079901400829676, + "flos": 26943704547840.0, + "grad_norm": 2.4646511518191176, + "language_loss": 0.70019484, + "learning_rate": 8.296031328506232e-07, + "loss": 0.72175121, + "num_input_tokens_seen": 126573110, + "step": 5888, + "time_per_iteration": 2.596531867980957 + }, + { + "auxiliary_loss_clip": 0.0114329, + "auxiliary_loss_mlp": 0.01024899, + "balance_loss_clip": 1.04571533, + "balance_loss_mlp": 1.01718569, + "epoch": 0.7081103829736067, + "flos": 24423206267520.0, + "grad_norm": 1.9143890601724383, + "language_loss": 0.75541031, + "learning_rate": 8.289715612249857e-07, + "loss": 0.77709222, + "num_input_tokens_seen": 126593725, + "step": 5889, + "time_per_iteration": 2.5339250564575195 + }, + { + "auxiliary_loss_clip": 0.01139969, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.0461359, + "balance_loss_mlp": 1.02067637, + "epoch": 0.7082306258642458, + "flos": 18543300589440.0, + "grad_norm": 3.97983684306486, + "language_loss": 0.77541268, + "learning_rate": 8.283401672434305e-07, + "loss": 0.79709852, + "num_input_tokens_seen": 126608950, + "step": 5890, + "time_per_iteration": 2.5290026664733887 + }, + { + "auxiliary_loss_clip": 0.01140177, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.04664171, + "balance_loss_mlp": 1.02005649, + "epoch": 0.7083508687548848, + "flos": 23477534951040.0, + "grad_norm": 1.9163301779522357, + "language_loss": 0.70423877, + "learning_rate": 8.277089510017412e-07, + "loss": 0.72591376, + "num_input_tokens_seen": 126629755, + "step": 5891, + "time_per_iteration": 2.6195037364959717 + }, + { + "auxiliary_loss_clip": 0.01141405, + "auxiliary_loss_mlp": 0.010277, + "balance_loss_clip": 1.04784441, + "balance_loss_mlp": 1.02010703, + "epoch": 0.708471111645524, + "flos": 22419463000320.0, + "grad_norm": 1.6566722882593305, + "language_loss": 0.81937969, + "learning_rate": 8.270779125956719e-07, + "loss": 0.84107071, + "num_input_tokens_seen": 126650135, + "step": 5892, + "time_per_iteration": 2.5739729404449463 + }, + { + "auxiliary_loss_clip": 0.01109245, + "auxiliary_loss_mlp": 0.01024592, + "balance_loss_clip": 1.04160154, + "balance_loss_mlp": 1.01693261, + "epoch": 0.7085913545361631, + "flos": 20922885815040.0, + "grad_norm": 2.61459656593713, + "language_loss": 0.80094546, + "learning_rate": 8.264470521209505e-07, + "loss": 0.82228386, + "num_input_tokens_seen": 126668500, + "step": 5893, + "time_per_iteration": 2.671903371810913 + }, + { + "auxiliary_loss_clip": 0.01147569, + "auxiliary_loss_mlp": 0.01026199, + "balance_loss_clip": 1.04426789, + "balance_loss_mlp": 1.01864076, + "epoch": 0.7087115974268021, + "flos": 15012384727680.0, + "grad_norm": 2.1582186712982754, + "language_loss": 0.76683348, + "learning_rate": 8.258163696732785e-07, + "loss": 0.78857112, + "num_input_tokens_seen": 126686090, + "step": 5894, + "time_per_iteration": 2.557326555252075 + }, + { + "auxiliary_loss_clip": 0.01150031, + "auxiliary_loss_mlp": 0.01024864, + "balance_loss_clip": 1.04456067, + "balance_loss_mlp": 1.01759815, + "epoch": 0.7088318403174413, + "flos": 21539040739200.0, + "grad_norm": 3.9302892363417348, + "language_loss": 0.76798201, + "learning_rate": 8.251858653483288e-07, + "loss": 0.78973103, + "num_input_tokens_seen": 126704255, + "step": 5895, + "time_per_iteration": 2.508002758026123 + }, + { + "auxiliary_loss_clip": 0.01157545, + "auxiliary_loss_mlp": 0.01026309, + "balance_loss_clip": 1.04843307, + "balance_loss_mlp": 1.01873946, + "epoch": 0.7089520832080803, + "flos": 15516785462400.0, + "grad_norm": 2.164960028210307, + "language_loss": 0.85624456, + "learning_rate": 8.245555392417501e-07, + "loss": 0.87808311, + "num_input_tokens_seen": 126718910, + "step": 5896, + "time_per_iteration": 2.520287036895752 + }, + { + "auxiliary_loss_clip": 0.0110007, + "auxiliary_loss_mlp": 0.01022297, + "balance_loss_clip": 1.03868771, + "balance_loss_mlp": 1.01435173, + "epoch": 0.7090723260987194, + "flos": 20412667077120.0, + "grad_norm": 1.8059141110452703, + "language_loss": 0.79010808, + "learning_rate": 8.239253914491613e-07, + "loss": 0.81133175, + "num_input_tokens_seen": 126737235, + "step": 5897, + "time_per_iteration": 2.654571771621704 + }, + { + "auxiliary_loss_clip": 0.01124965, + "auxiliary_loss_mlp": 0.0102315, + "balance_loss_clip": 1.04465938, + "balance_loss_mlp": 1.01590157, + "epoch": 0.7091925689893585, + "flos": 25668337271040.0, + "grad_norm": 3.547550074209898, + "language_loss": 0.75022304, + "learning_rate": 8.232954220661556e-07, + "loss": 0.7717042, + "num_input_tokens_seen": 126759970, + "step": 5898, + "time_per_iteration": 2.6113202571868896 + }, + { + "auxiliary_loss_clip": 0.01171294, + "auxiliary_loss_mlp": 0.01027566, + "balance_loss_clip": 1.0506891, + "balance_loss_mlp": 1.020473, + "epoch": 0.7093128118799976, + "flos": 24206629213440.0, + "grad_norm": 3.3013684027349397, + "language_loss": 0.70136523, + "learning_rate": 8.226656311882989e-07, + "loss": 0.7233538, + "num_input_tokens_seen": 126779280, + "step": 5899, + "time_per_iteration": 2.442021369934082 + }, + { + "auxiliary_loss_clip": 0.01152632, + "auxiliary_loss_mlp": 0.01028481, + "balance_loss_clip": 1.04685068, + "balance_loss_mlp": 1.02127743, + "epoch": 0.7094330547706367, + "flos": 16646786398080.0, + "grad_norm": 2.217405262556955, + "language_loss": 0.76825625, + "learning_rate": 8.22036018911129e-07, + "loss": 0.79006743, + "num_input_tokens_seen": 126797310, + "step": 5900, + "time_per_iteration": 2.431309223175049 + }, + { + "auxiliary_loss_clip": 0.01173673, + "auxiliary_loss_mlp": 0.01026765, + "balance_loss_clip": 1.0479157, + "balance_loss_mlp": 1.01873016, + "epoch": 0.7095532976612757, + "flos": 16283370545280.0, + "grad_norm": 2.5727737667935417, + "language_loss": 0.80237138, + "learning_rate": 8.214065853301599e-07, + "loss": 0.82437575, + "num_input_tokens_seen": 126812840, + "step": 5901, + "time_per_iteration": 2.4341351985931396 + }, + { + "auxiliary_loss_clip": 0.01057061, + "auxiliary_loss_mlp": 0.01000969, + "balance_loss_clip": 1.00812817, + "balance_loss_mlp": 0.99997914, + "epoch": 0.7096735405519149, + "flos": 70722080559360.0, + "grad_norm": 0.8075746308611993, + "language_loss": 0.58264762, + "learning_rate": 8.207773305408734e-07, + "loss": 0.60322791, + "num_input_tokens_seen": 126880060, + "step": 5902, + "time_per_iteration": 3.159853458404541 + }, + { + "auxiliary_loss_clip": 0.01123035, + "auxiliary_loss_mlp": 0.01027625, + "balance_loss_clip": 1.04273939, + "balance_loss_mlp": 1.0193994, + "epoch": 0.709793783442554, + "flos": 23621500661760.0, + "grad_norm": 2.224002613981061, + "language_loss": 0.80002964, + "learning_rate": 8.201482546387288e-07, + "loss": 0.82153624, + "num_input_tokens_seen": 126899535, + "step": 5903, + "time_per_iteration": 2.582578659057617 + }, + { + "auxiliary_loss_clip": 0.01156517, + "auxiliary_loss_mlp": 0.01025283, + "balance_loss_clip": 1.04774451, + "balance_loss_mlp": 1.0180347, + "epoch": 0.709914026333193, + "flos": 25993472204160.0, + "grad_norm": 1.641818965124189, + "language_loss": 0.91746294, + "learning_rate": 8.195193577191553e-07, + "loss": 0.93928087, + "num_input_tokens_seen": 126921365, + "step": 5904, + "time_per_iteration": 2.5068845748901367 + }, + { + "auxiliary_loss_clip": 0.01149002, + "auxiliary_loss_mlp": 0.00762558, + "balance_loss_clip": 1.04486096, + "balance_loss_mlp": 1.00064707, + "epoch": 0.7100342692238322, + "flos": 24861531934080.0, + "grad_norm": 1.716544238683549, + "language_loss": 0.84503448, + "learning_rate": 8.188906398775579e-07, + "loss": 0.86415011, + "num_input_tokens_seen": 126941910, + "step": 5905, + "time_per_iteration": 2.5299620628356934 + }, + { + "auxiliary_loss_clip": 0.01169211, + "auxiliary_loss_mlp": 0.00762396, + "balance_loss_clip": 1.04638243, + "balance_loss_mlp": 1.00074542, + "epoch": 0.7101545121144712, + "flos": 24932203943040.0, + "grad_norm": 2.0196064886183236, + "language_loss": 0.68557417, + "learning_rate": 8.18262101209311e-07, + "loss": 0.70489031, + "num_input_tokens_seen": 126961120, + "step": 5906, + "time_per_iteration": 2.5054891109466553 + }, + { + "auxiliary_loss_clip": 0.01159264, + "auxiliary_loss_mlp": 0.01025645, + "balance_loss_clip": 1.0457921, + "balance_loss_mlp": 1.01840925, + "epoch": 0.7102747550051103, + "flos": 23768842250880.0, + "grad_norm": 1.9435285486346248, + "language_loss": 0.7005347, + "learning_rate": 8.176337418097626e-07, + "loss": 0.72238374, + "num_input_tokens_seen": 126981590, + "step": 5907, + "time_per_iteration": 3.269451141357422 + }, + { + "auxiliary_loss_clip": 0.01154687, + "auxiliary_loss_mlp": 0.00762082, + "balance_loss_clip": 1.04725027, + "balance_loss_mlp": 1.00060952, + "epoch": 0.7103949978957494, + "flos": 15303907509120.0, + "grad_norm": 2.137176547868832, + "language_loss": 0.79847801, + "learning_rate": 8.170055617742364e-07, + "loss": 0.81764567, + "num_input_tokens_seen": 126998870, + "step": 5908, + "time_per_iteration": 3.91745662689209 + }, + { + "auxiliary_loss_clip": 0.01135246, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.04274631, + "balance_loss_mlp": 1.02023256, + "epoch": 0.7105152407863885, + "flos": 22638805401600.0, + "grad_norm": 1.8924779020358429, + "language_loss": 0.70727587, + "learning_rate": 8.163775611980252e-07, + "loss": 0.72891104, + "num_input_tokens_seen": 127017980, + "step": 5909, + "time_per_iteration": 2.5111029148101807 + }, + { + "auxiliary_loss_clip": 0.01143292, + "auxiliary_loss_mlp": 0.01026495, + "balance_loss_clip": 1.04745388, + "balance_loss_mlp": 1.01938975, + "epoch": 0.7106354836770276, + "flos": 17238594879360.0, + "grad_norm": 1.7111497364644723, + "language_loss": 0.78761923, + "learning_rate": 8.157497401763982e-07, + "loss": 0.80931711, + "num_input_tokens_seen": 127035645, + "step": 5910, + "time_per_iteration": 2.4745922088623047 + }, + { + "auxiliary_loss_clip": 0.0115509, + "auxiliary_loss_mlp": 0.01022523, + "balance_loss_clip": 1.04665411, + "balance_loss_mlp": 1.01484585, + "epoch": 0.7107557265676667, + "flos": 20193647898240.0, + "grad_norm": 1.6757259628954477, + "language_loss": 0.77482802, + "learning_rate": 8.151220988045935e-07, + "loss": 0.79660416, + "num_input_tokens_seen": 127054900, + "step": 5911, + "time_per_iteration": 2.4503278732299805 + }, + { + "auxiliary_loss_clip": 0.01154057, + "auxiliary_loss_mlp": 0.01026197, + "balance_loss_clip": 1.04559648, + "balance_loss_mlp": 1.01905918, + "epoch": 0.7108759694583058, + "flos": 21507080613120.0, + "grad_norm": 1.8062964507769854, + "language_loss": 0.82659197, + "learning_rate": 8.144946371778234e-07, + "loss": 0.84839457, + "num_input_tokens_seen": 127075010, + "step": 5912, + "time_per_iteration": 2.457789182662964 + }, + { + "auxiliary_loss_clip": 0.01142184, + "auxiliary_loss_mlp": 0.00763036, + "balance_loss_clip": 1.04672551, + "balance_loss_mlp": 1.00074744, + "epoch": 0.7109962123489448, + "flos": 24061909317120.0, + "grad_norm": 2.206670824052414, + "language_loss": 0.78080475, + "learning_rate": 8.138673553912751e-07, + "loss": 0.7998569, + "num_input_tokens_seen": 127095570, + "step": 5913, + "time_per_iteration": 3.3119020462036133 + }, + { + "auxiliary_loss_clip": 0.01112105, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.04140043, + "balance_loss_mlp": 1.01783061, + "epoch": 0.711116455239584, + "flos": 30480474326400.0, + "grad_norm": 3.091500132023002, + "language_loss": 0.56865346, + "learning_rate": 8.132402535401059e-07, + "loss": 0.59003162, + "num_input_tokens_seen": 127116825, + "step": 5914, + "time_per_iteration": 2.6110432147979736 + }, + { + "auxiliary_loss_clip": 0.01152295, + "auxiliary_loss_mlp": 0.01025416, + "balance_loss_clip": 1.04638541, + "balance_loss_mlp": 1.01773882, + "epoch": 0.711236698130223, + "flos": 25045610158080.0, + "grad_norm": 1.7744269011475122, + "language_loss": 0.74196172, + "learning_rate": 8.126133317194465e-07, + "loss": 0.76373881, + "num_input_tokens_seen": 127137015, + "step": 5915, + "time_per_iteration": 2.4866206645965576 + }, + { + "auxiliary_loss_clip": 0.01106453, + "auxiliary_loss_mlp": 0.0102695, + "balance_loss_clip": 1.03824055, + "balance_loss_mlp": 1.01908815, + "epoch": 0.7113569410208621, + "flos": 24206701040640.0, + "grad_norm": 1.8455923065950774, + "language_loss": 0.74380791, + "learning_rate": 8.11986590024401e-07, + "loss": 0.76514196, + "num_input_tokens_seen": 127156755, + "step": 5916, + "time_per_iteration": 2.5975985527038574 + }, + { + "auxiliary_loss_clip": 0.01145689, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.04773569, + "balance_loss_mlp": 1.02306592, + "epoch": 0.7114771839115013, + "flos": 35439306526080.0, + "grad_norm": 1.7687781026757028, + "language_loss": 0.68486047, + "learning_rate": 8.113600285500442e-07, + "loss": 0.70662737, + "num_input_tokens_seen": 127176965, + "step": 5917, + "time_per_iteration": 2.6313188076019287 + }, + { + "auxiliary_loss_clip": 0.01170863, + "auxiliary_loss_mlp": 0.01022107, + "balance_loss_clip": 1.04728246, + "balance_loss_mlp": 1.01494277, + "epoch": 0.7115974268021403, + "flos": 21099458096640.0, + "grad_norm": 1.8201729548077241, + "language_loss": 0.7457636, + "learning_rate": 8.107336473914268e-07, + "loss": 0.76769328, + "num_input_tokens_seen": 127195595, + "step": 5918, + "time_per_iteration": 2.422978639602661 + }, + { + "auxiliary_loss_clip": 0.01043371, + "auxiliary_loss_mlp": 0.01001452, + "balance_loss_clip": 1.00789309, + "balance_loss_mlp": 1.00054586, + "epoch": 0.7117176696927794, + "flos": 56752866616320.0, + "grad_norm": 0.7709782456094203, + "language_loss": 0.55656004, + "learning_rate": 8.101074466435694e-07, + "loss": 0.57700837, + "num_input_tokens_seen": 127255070, + "step": 5919, + "time_per_iteration": 3.011667251586914 + }, + { + "auxiliary_loss_clip": 0.01149036, + "auxiliary_loss_mlp": 0.01026438, + "balance_loss_clip": 1.04438102, + "balance_loss_mlp": 1.01899934, + "epoch": 0.7118379125834186, + "flos": 15925269905280.0, + "grad_norm": 1.8069275273401768, + "language_loss": 0.68064845, + "learning_rate": 8.094814264014662e-07, + "loss": 0.70240319, + "num_input_tokens_seen": 127273825, + "step": 5920, + "time_per_iteration": 2.4324839115142822 + }, + { + "auxiliary_loss_clip": 0.01171882, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.04714656, + "balance_loss_mlp": 1.01979065, + "epoch": 0.7119581554740576, + "flos": 20193360589440.0, + "grad_norm": 2.317258331832086, + "language_loss": 0.81247163, + "learning_rate": 8.088555867600844e-07, + "loss": 0.83447075, + "num_input_tokens_seen": 127289990, + "step": 5921, + "time_per_iteration": 2.4303762912750244 + }, + { + "auxiliary_loss_clip": 0.01126421, + "auxiliary_loss_mlp": 0.01025636, + "balance_loss_clip": 1.0435015, + "balance_loss_mlp": 1.01835215, + "epoch": 0.7120783983646967, + "flos": 34715383822080.0, + "grad_norm": 1.8183218533023975, + "language_loss": 0.60311842, + "learning_rate": 8.08229927814362e-07, + "loss": 0.62463903, + "num_input_tokens_seen": 127312880, + "step": 5922, + "time_per_iteration": 2.649549961090088 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01023454, + "balance_loss_clip": 1.04196739, + "balance_loss_mlp": 1.0159018, + "epoch": 0.7121986412553358, + "flos": 26359114700160.0, + "grad_norm": 1.6293090925971272, + "language_loss": 0.65011322, + "learning_rate": 8.076044496592134e-07, + "loss": 0.67159796, + "num_input_tokens_seen": 127334730, + "step": 5923, + "time_per_iteration": 2.5826590061187744 + }, + { + "auxiliary_loss_clip": 0.01142373, + "auxiliary_loss_mlp": 0.01025117, + "balance_loss_clip": 1.04628646, + "balance_loss_mlp": 1.01797378, + "epoch": 0.7123188841459749, + "flos": 11145344371200.0, + "grad_norm": 2.1960041318214087, + "language_loss": 0.77971351, + "learning_rate": 8.069791523895204e-07, + "loss": 0.80138844, + "num_input_tokens_seen": 127351180, + "step": 5924, + "time_per_iteration": 2.4907145500183105 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.01027818, + "balance_loss_clip": 1.03976834, + "balance_loss_mlp": 1.02018881, + "epoch": 0.7124391270366139, + "flos": 20811670329600.0, + "grad_norm": 2.054413541493429, + "language_loss": 0.77321136, + "learning_rate": 8.063540361001422e-07, + "loss": 0.79465163, + "num_input_tokens_seen": 127369750, + "step": 5925, + "time_per_iteration": 2.574115753173828 + }, + { + "auxiliary_loss_clip": 0.01121572, + "auxiliary_loss_mlp": 0.01024347, + "balance_loss_clip": 1.04235482, + "balance_loss_mlp": 1.01628208, + "epoch": 0.7125593699272531, + "flos": 17603734584960.0, + "grad_norm": 7.8671901951661765, + "language_loss": 0.79278684, + "learning_rate": 8.057291008859069e-07, + "loss": 0.81424606, + "num_input_tokens_seen": 127387910, + "step": 5926, + "time_per_iteration": 2.514880895614624 + }, + { + "auxiliary_loss_clip": 0.01152301, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.04451203, + "balance_loss_mlp": 1.0217526, + "epoch": 0.7126796128178922, + "flos": 28654057526400.0, + "grad_norm": 2.389221155056856, + "language_loss": 0.68414611, + "learning_rate": 8.051043468416187e-07, + "loss": 0.70595992, + "num_input_tokens_seen": 127409160, + "step": 5927, + "time_per_iteration": 2.591562271118164 + }, + { + "auxiliary_loss_clip": 0.01169493, + "auxiliary_loss_mlp": 0.01024237, + "balance_loss_clip": 1.04957163, + "balance_loss_mlp": 1.01683378, + "epoch": 0.7127998557085312, + "flos": 16034438315520.0, + "grad_norm": 29.22179186197181, + "language_loss": 0.82005441, + "learning_rate": 8.044797740620506e-07, + "loss": 0.84199166, + "num_input_tokens_seen": 127427765, + "step": 5928, + "time_per_iteration": 2.4193899631500244 + }, + { + "auxiliary_loss_clip": 0.01107629, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.04267335, + "balance_loss_mlp": 1.01755941, + "epoch": 0.7129200985991703, + "flos": 23403271582080.0, + "grad_norm": 2.054273264277698, + "language_loss": 0.78691351, + "learning_rate": 8.038553826419494e-07, + "loss": 0.808236, + "num_input_tokens_seen": 127446475, + "step": 5929, + "time_per_iteration": 2.5730087757110596 + }, + { + "auxiliary_loss_clip": 0.01166748, + "auxiliary_loss_mlp": 0.01022709, + "balance_loss_clip": 1.04509497, + "balance_loss_mlp": 1.01508594, + "epoch": 0.7130403414898094, + "flos": 21397445326080.0, + "grad_norm": 1.6494337139383322, + "language_loss": 0.81239319, + "learning_rate": 8.032311726760364e-07, + "loss": 0.83428776, + "num_input_tokens_seen": 127467695, + "step": 5930, + "time_per_iteration": 2.4481558799743652 + }, + { + "auxiliary_loss_clip": 0.01118458, + "auxiliary_loss_mlp": 0.01023143, + "balance_loss_clip": 1.04265547, + "balance_loss_mlp": 1.01481628, + "epoch": 0.7131605843804485, + "flos": 74739045306240.0, + "grad_norm": 1.9696308928756767, + "language_loss": 0.68968284, + "learning_rate": 8.026071442590022e-07, + "loss": 0.71109891, + "num_input_tokens_seen": 127494590, + "step": 5931, + "time_per_iteration": 2.922729015350342 + }, + { + "auxiliary_loss_clip": 0.01157379, + "auxiliary_loss_mlp": 0.01024676, + "balance_loss_clip": 1.04997396, + "balance_loss_mlp": 1.01760149, + "epoch": 0.7132808272710875, + "flos": 18368739469440.0, + "grad_norm": 2.0094552690649947, + "language_loss": 0.8045572, + "learning_rate": 8.019832974855134e-07, + "loss": 0.82637775, + "num_input_tokens_seen": 127512550, + "step": 5932, + "time_per_iteration": 2.4470982551574707 + }, + { + "auxiliary_loss_clip": 0.01125039, + "auxiliary_loss_mlp": 0.01024812, + "balance_loss_clip": 1.04393566, + "balance_loss_mlp": 1.01724195, + "epoch": 0.7134010701617267, + "flos": 23253380127360.0, + "grad_norm": 2.3632376141462634, + "language_loss": 0.82092023, + "learning_rate": 8.013596324502052e-07, + "loss": 0.84241879, + "num_input_tokens_seen": 127531015, + "step": 5933, + "time_per_iteration": 3.3200581073760986 + }, + { + "auxiliary_loss_clip": 0.01149387, + "auxiliary_loss_mlp": 0.01023919, + "balance_loss_clip": 1.04690862, + "balance_loss_mlp": 1.01677871, + "epoch": 0.7135213130523658, + "flos": 23653137565440.0, + "grad_norm": 1.7114475925573087, + "language_loss": 0.78767395, + "learning_rate": 8.007361492476872e-07, + "loss": 0.80940706, + "num_input_tokens_seen": 127550340, + "step": 5934, + "time_per_iteration": 3.252021074295044 + }, + { + "auxiliary_loss_clip": 0.0113416, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_clip": 1.04238272, + "balance_loss_mlp": 1.01897955, + "epoch": 0.7136415559430048, + "flos": 24790644443520.0, + "grad_norm": 1.5325292348058392, + "language_loss": 0.78890622, + "learning_rate": 8.001128479725426e-07, + "loss": 0.81051451, + "num_input_tokens_seen": 127572245, + "step": 5935, + "time_per_iteration": 3.313612937927246 + }, + { + "auxiliary_loss_clip": 0.01105993, + "auxiliary_loss_mlp": 0.01021893, + "balance_loss_clip": 1.03873062, + "balance_loss_mlp": 1.01414454, + "epoch": 0.713761798833644, + "flos": 18296954138880.0, + "grad_norm": 1.7388550466257662, + "language_loss": 0.8103615, + "learning_rate": 7.994897287193248e-07, + "loss": 0.83164036, + "num_input_tokens_seen": 127591625, + "step": 5936, + "time_per_iteration": 2.562657117843628 + }, + { + "auxiliary_loss_clip": 0.01158657, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.04612947, + "balance_loss_mlp": 1.02264845, + "epoch": 0.713882041724283, + "flos": 15558262692480.0, + "grad_norm": 3.4467769871632523, + "language_loss": 0.83410293, + "learning_rate": 7.988667915825605e-07, + "loss": 0.85599732, + "num_input_tokens_seen": 127608690, + "step": 5937, + "time_per_iteration": 2.467510223388672 + }, + { + "auxiliary_loss_clip": 0.01141758, + "auxiliary_loss_mlp": 0.01025294, + "balance_loss_clip": 1.04478157, + "balance_loss_mlp": 1.01766467, + "epoch": 0.7140022846149221, + "flos": 24061011477120.0, + "grad_norm": 2.7499182518149254, + "language_loss": 0.75472707, + "learning_rate": 7.982440366567491e-07, + "loss": 0.77639759, + "num_input_tokens_seen": 127627180, + "step": 5938, + "time_per_iteration": 2.499363899230957 + }, + { + "auxiliary_loss_clip": 0.01148794, + "auxiliary_loss_mlp": 0.01024856, + "balance_loss_clip": 1.04416418, + "balance_loss_mlp": 1.01750052, + "epoch": 0.7141225275055613, + "flos": 27891710248320.0, + "grad_norm": 1.7176147628517044, + "language_loss": 0.74995977, + "learning_rate": 7.97621464036361e-07, + "loss": 0.77169627, + "num_input_tokens_seen": 127648940, + "step": 5939, + "time_per_iteration": 3.3097219467163086 + }, + { + "auxiliary_loss_clip": 0.0115783, + "auxiliary_loss_mlp": 0.01024148, + "balance_loss_clip": 1.04595649, + "balance_loss_mlp": 1.01608968, + "epoch": 0.7142427703962003, + "flos": 19682603147520.0, + "grad_norm": 1.63656365994906, + "language_loss": 0.68138653, + "learning_rate": 7.969990738158417e-07, + "loss": 0.7032063, + "num_input_tokens_seen": 127667350, + "step": 5940, + "time_per_iteration": 2.4476089477539062 + }, + { + "auxiliary_loss_clip": 0.01158064, + "auxiliary_loss_mlp": 0.01024165, + "balance_loss_clip": 1.04797971, + "balance_loss_mlp": 1.01645803, + "epoch": 0.7143630132868394, + "flos": 21032377447680.0, + "grad_norm": 1.9391806162545215, + "language_loss": 0.85010904, + "learning_rate": 7.963768660896062e-07, + "loss": 0.87193131, + "num_input_tokens_seen": 127685760, + "step": 5941, + "time_per_iteration": 2.4576826095581055 + }, + { + "auxiliary_loss_clip": 0.01157891, + "auxiliary_loss_mlp": 0.01025512, + "balance_loss_clip": 1.04658484, + "balance_loss_mlp": 1.01738811, + "epoch": 0.7144832561774785, + "flos": 24129923719680.0, + "grad_norm": 1.9384677557412302, + "language_loss": 0.82394361, + "learning_rate": 7.957548409520432e-07, + "loss": 0.84577775, + "num_input_tokens_seen": 127704985, + "step": 5942, + "time_per_iteration": 2.473517656326294 + }, + { + "auxiliary_loss_clip": 0.01128763, + "auxiliary_loss_mlp": 0.0102325, + "balance_loss_clip": 1.04250813, + "balance_loss_mlp": 1.01613629, + "epoch": 0.7146034990681176, + "flos": 16325817442560.0, + "grad_norm": 1.8916204890280204, + "language_loss": 0.84024823, + "learning_rate": 7.951329984975135e-07, + "loss": 0.86176831, + "num_input_tokens_seen": 127721925, + "step": 5943, + "time_per_iteration": 2.492116928100586 + }, + { + "auxiliary_loss_clip": 0.0103858, + "auxiliary_loss_mlp": 0.0100237, + "balance_loss_clip": 1.00778937, + "balance_loss_mlp": 1.00135064, + "epoch": 0.7147237419587567, + "flos": 69627164232960.0, + "grad_norm": 0.70945588327878, + "language_loss": 0.54293764, + "learning_rate": 7.94511338820349e-07, + "loss": 0.5633471, + "num_input_tokens_seen": 127784230, + "step": 5944, + "time_per_iteration": 3.108997106552124 + }, + { + "auxiliary_loss_clip": 0.01142634, + "auxiliary_loss_mlp": 0.00762781, + "balance_loss_clip": 1.04466259, + "balance_loss_mlp": 1.00055516, + "epoch": 0.7148439848493958, + "flos": 22266806198400.0, + "grad_norm": 1.956307750639128, + "language_loss": 0.78281814, + "learning_rate": 7.938898620148575e-07, + "loss": 0.80187225, + "num_input_tokens_seen": 127801990, + "step": 5945, + "time_per_iteration": 2.510942220687866 + }, + { + "auxiliary_loss_clip": 0.01141432, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.0453999, + "balance_loss_mlp": 1.0204432, + "epoch": 0.7149642277400349, + "flos": 17931383470080.0, + "grad_norm": 2.2290074639180193, + "language_loss": 0.71130323, + "learning_rate": 7.932685681753135e-07, + "loss": 0.73299623, + "num_input_tokens_seen": 127819270, + "step": 5946, + "time_per_iteration": 2.46671462059021 + }, + { + "auxiliary_loss_clip": 0.01166458, + "auxiliary_loss_mlp": 0.01021359, + "balance_loss_clip": 1.04771137, + "balance_loss_mlp": 1.01437676, + "epoch": 0.7150844706306739, + "flos": 31681937370240.0, + "grad_norm": 2.107099896661249, + "language_loss": 0.62352562, + "learning_rate": 7.92647457395969e-07, + "loss": 0.6454038, + "num_input_tokens_seen": 127841095, + "step": 5947, + "time_per_iteration": 2.5095865726470947 + }, + { + "auxiliary_loss_clip": 0.01107576, + "auxiliary_loss_mlp": 0.0102683, + "balance_loss_clip": 1.03955364, + "balance_loss_mlp": 1.01888812, + "epoch": 0.7152047135213131, + "flos": 10926217451520.0, + "grad_norm": 3.1028095044926762, + "language_loss": 0.74344969, + "learning_rate": 7.920265297710444e-07, + "loss": 0.76479375, + "num_input_tokens_seen": 127858485, + "step": 5948, + "time_per_iteration": 2.6060333251953125 + }, + { + "auxiliary_loss_clip": 0.01157777, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.04808056, + "balance_loss_mlp": 1.02253413, + "epoch": 0.7153249564119522, + "flos": 20995640812800.0, + "grad_norm": 4.043541528834933, + "language_loss": 0.7284019, + "learning_rate": 7.914057853947363e-07, + "loss": 0.75028098, + "num_input_tokens_seen": 127877665, + "step": 5949, + "time_per_iteration": 2.4672536849975586 + }, + { + "auxiliary_loss_clip": 0.01125092, + "auxiliary_loss_mlp": 0.01029953, + "balance_loss_clip": 1.04269958, + "balance_loss_mlp": 1.0221684, + "epoch": 0.7154451993025912, + "flos": 24243114453120.0, + "grad_norm": 1.7568264335357224, + "language_loss": 0.62817305, + "learning_rate": 7.907852243612089e-07, + "loss": 0.64972353, + "num_input_tokens_seen": 127898070, + "step": 5950, + "time_per_iteration": 2.621185541152954 + }, + { + "auxiliary_loss_clip": 0.01140075, + "auxiliary_loss_mlp": 0.01023348, + "balance_loss_clip": 1.04421306, + "balance_loss_mlp": 1.01610041, + "epoch": 0.7155654421932304, + "flos": 23330947547520.0, + "grad_norm": 1.857833131584005, + "language_loss": 0.72462815, + "learning_rate": 7.901648467646009e-07, + "loss": 0.74626237, + "num_input_tokens_seen": 127917010, + "step": 5951, + "time_per_iteration": 2.5171241760253906 + }, + { + "auxiliary_loss_clip": 0.01174381, + "auxiliary_loss_mlp": 0.01026055, + "balance_loss_clip": 1.0507865, + "balance_loss_mlp": 1.01865768, + "epoch": 0.7156856850838694, + "flos": 22711883621760.0, + "grad_norm": 1.496802555098499, + "language_loss": 0.72745234, + "learning_rate": 7.895446526990244e-07, + "loss": 0.7494567, + "num_input_tokens_seen": 127937025, + "step": 5952, + "time_per_iteration": 2.4608805179595947 + }, + { + "auxiliary_loss_clip": 0.01122038, + "auxiliary_loss_mlp": 0.01027201, + "balance_loss_clip": 1.04312539, + "balance_loss_mlp": 1.01948237, + "epoch": 0.7158059279745085, + "flos": 19865424395520.0, + "grad_norm": 1.750287875483891, + "language_loss": 0.75610816, + "learning_rate": 7.889246422585609e-07, + "loss": 0.77760053, + "num_input_tokens_seen": 127956410, + "step": 5953, + "time_per_iteration": 2.59254789352417 + }, + { + "auxiliary_loss_clip": 0.01171201, + "auxiliary_loss_mlp": 0.0102611, + "balance_loss_clip": 1.04926252, + "balance_loss_mlp": 1.0190407, + "epoch": 0.7159261708651476, + "flos": 24134772055680.0, + "grad_norm": 1.8511217253350685, + "language_loss": 0.73449194, + "learning_rate": 7.883048155372675e-07, + "loss": 0.75646508, + "num_input_tokens_seen": 127974925, + "step": 5954, + "time_per_iteration": 2.447781562805176 + }, + { + "auxiliary_loss_clip": 0.01146418, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.04574728, + "balance_loss_mlp": 1.01700592, + "epoch": 0.7160464137557867, + "flos": 16983198201600.0, + "grad_norm": 2.2861993161625285, + "language_loss": 0.71593535, + "learning_rate": 7.876851726291698e-07, + "loss": 0.73764479, + "num_input_tokens_seen": 127993225, + "step": 5955, + "time_per_iteration": 2.476712465286255 + }, + { + "auxiliary_loss_clip": 0.01132882, + "auxiliary_loss_mlp": 0.0102374, + "balance_loss_clip": 1.04412127, + "balance_loss_mlp": 1.01665902, + "epoch": 0.7161666566464258, + "flos": 25228251838080.0, + "grad_norm": 1.9568323684484528, + "language_loss": 0.78353792, + "learning_rate": 7.870657136282666e-07, + "loss": 0.8051042, + "num_input_tokens_seen": 128012085, + "step": 5956, + "time_per_iteration": 2.5426745414733887 + }, + { + "auxiliary_loss_clip": 0.01150649, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.0453285, + "balance_loss_mlp": 1.02123761, + "epoch": 0.7162868995370649, + "flos": 26468390851200.0, + "grad_norm": 1.7624803306016288, + "language_loss": 0.82100368, + "learning_rate": 7.86446438628531e-07, + "loss": 0.84279794, + "num_input_tokens_seen": 128033155, + "step": 5957, + "time_per_iteration": 2.4997947216033936 + }, + { + "auxiliary_loss_clip": 0.01065175, + "auxiliary_loss_mlp": 0.00999787, + "balance_loss_clip": 1.00792146, + "balance_loss_mlp": 0.99878532, + "epoch": 0.716407142427704, + "flos": 69998912040960.0, + "grad_norm": 0.7638232488090885, + "language_loss": 0.56904417, + "learning_rate": 7.858273477239059e-07, + "loss": 0.58969378, + "num_input_tokens_seen": 128101575, + "step": 5958, + "time_per_iteration": 3.0345041751861572 + }, + { + "auxiliary_loss_clip": 0.01100558, + "auxiliary_loss_mlp": 0.01026186, + "balance_loss_clip": 1.04153669, + "balance_loss_mlp": 1.01806784, + "epoch": 0.716527385318343, + "flos": 20740459616640.0, + "grad_norm": 1.8892316989818971, + "language_loss": 0.713709, + "learning_rate": 7.852084410083067e-07, + "loss": 0.73497641, + "num_input_tokens_seen": 128120395, + "step": 5959, + "time_per_iteration": 2.5592589378356934 + }, + { + "auxiliary_loss_clip": 0.01137374, + "auxiliary_loss_mlp": 0.01024627, + "balance_loss_clip": 1.04560256, + "balance_loss_mlp": 1.01747203, + "epoch": 0.7166476282089821, + "flos": 25371966153600.0, + "grad_norm": 1.5470379179690146, + "language_loss": 0.63756615, + "learning_rate": 7.84589718575621e-07, + "loss": 0.65918618, + "num_input_tokens_seen": 128140840, + "step": 5960, + "time_per_iteration": 3.283158540725708 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01024108, + "balance_loss_clip": 1.04077792, + "balance_loss_mlp": 1.01625156, + "epoch": 0.7167678710996213, + "flos": 24133730561280.0, + "grad_norm": 1.9622856376836475, + "language_loss": 0.69255495, + "learning_rate": 7.83971180519708e-07, + "loss": 0.71420991, + "num_input_tokens_seen": 128159695, + "step": 5961, + "time_per_iteration": 3.277916431427002 + }, + { + "auxiliary_loss_clip": 0.01173222, + "auxiliary_loss_mlp": 0.01021904, + "balance_loss_clip": 1.0500021, + "balance_loss_mlp": 1.01405954, + "epoch": 0.7168881139902603, + "flos": 30226586019840.0, + "grad_norm": 2.4823528097534413, + "language_loss": 0.75970507, + "learning_rate": 7.833528269344008e-07, + "loss": 0.78165638, + "num_input_tokens_seen": 128179600, + "step": 5962, + "time_per_iteration": 3.286471366882324 + }, + { + "auxiliary_loss_clip": 0.01126622, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.04500055, + "balance_loss_mlp": 1.02058411, + "epoch": 0.7170083568808994, + "flos": 14606414236800.0, + "grad_norm": 2.1337089287597886, + "language_loss": 0.77707338, + "learning_rate": 7.827346579135023e-07, + "loss": 0.7986235, + "num_input_tokens_seen": 128196940, + "step": 5963, + "time_per_iteration": 2.4981191158294678 + }, + { + "auxiliary_loss_clip": 0.01137098, + "auxiliary_loss_mlp": 0.01022976, + "balance_loss_clip": 1.04225659, + "balance_loss_mlp": 1.01504898, + "epoch": 0.7171285997715385, + "flos": 23331091201920.0, + "grad_norm": 2.0504594793488575, + "language_loss": 0.83242399, + "learning_rate": 7.821166735507885e-07, + "loss": 0.85402477, + "num_input_tokens_seen": 128215970, + "step": 5964, + "time_per_iteration": 2.4996585845947266 + }, + { + "auxiliary_loss_clip": 0.01169761, + "auxiliary_loss_mlp": 0.01023576, + "balance_loss_clip": 1.04862821, + "balance_loss_mlp": 1.01592278, + "epoch": 0.7172488426621776, + "flos": 16543543731840.0, + "grad_norm": 2.023140622658755, + "language_loss": 0.68679845, + "learning_rate": 7.81498873940007e-07, + "loss": 0.70873177, + "num_input_tokens_seen": 128233185, + "step": 5965, + "time_per_iteration": 2.4027233123779297 + }, + { + "auxiliary_loss_clip": 0.01159926, + "auxiliary_loss_mlp": 0.01022635, + "balance_loss_clip": 1.0442189, + "balance_loss_mlp": 1.01480341, + "epoch": 0.7173690855528166, + "flos": 26541612725760.0, + "grad_norm": 2.370142601975637, + "language_loss": 0.77347296, + "learning_rate": 7.808812591748768e-07, + "loss": 0.79529858, + "num_input_tokens_seen": 128253565, + "step": 5966, + "time_per_iteration": 3.2583134174346924 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01022023, + "balance_loss_clip": 1.04300177, + "balance_loss_mlp": 1.01417911, + "epoch": 0.7174893284434558, + "flos": 22784099915520.0, + "grad_norm": 3.854051106565457, + "language_loss": 0.65053308, + "learning_rate": 7.802638293490915e-07, + "loss": 0.6719864, + "num_input_tokens_seen": 128273210, + "step": 5967, + "time_per_iteration": 2.530930519104004 + }, + { + "auxiliary_loss_clip": 0.01145025, + "auxiliary_loss_mlp": 0.01024764, + "balance_loss_clip": 1.04438639, + "balance_loss_mlp": 1.01740849, + "epoch": 0.7176095713340949, + "flos": 23293564467840.0, + "grad_norm": 1.7106793147001742, + "language_loss": 0.76746368, + "learning_rate": 7.796465845563123e-07, + "loss": 0.78916156, + "num_input_tokens_seen": 128292085, + "step": 5968, + "time_per_iteration": 2.50687575340271 + }, + { + "auxiliary_loss_clip": 0.01136142, + "auxiliary_loss_mlp": 0.00762716, + "balance_loss_clip": 1.0437125, + "balance_loss_mlp": 1.00076878, + "epoch": 0.7177298142247339, + "flos": 25591631777280.0, + "grad_norm": 2.34431166414454, + "language_loss": 0.79373038, + "learning_rate": 7.790295248901766e-07, + "loss": 0.81271893, + "num_input_tokens_seen": 128313215, + "step": 5969, + "time_per_iteration": 2.5350852012634277 + }, + { + "auxiliary_loss_clip": 0.0115668, + "auxiliary_loss_mlp": 0.01022006, + "balance_loss_clip": 1.04740465, + "balance_loss_mlp": 1.01426387, + "epoch": 0.7178500571153731, + "flos": 31652778504960.0, + "grad_norm": 2.076686199703236, + "language_loss": 0.62255359, + "learning_rate": 7.784126504442902e-07, + "loss": 0.64434046, + "num_input_tokens_seen": 128336445, + "step": 5970, + "time_per_iteration": 2.5445210933685303 + }, + { + "auxiliary_loss_clip": 0.0111893, + "auxiliary_loss_mlp": 0.01023242, + "balance_loss_clip": 1.04348314, + "balance_loss_mlp": 1.01538587, + "epoch": 0.7179703000060121, + "flos": 19427242383360.0, + "grad_norm": 1.4291849578806834, + "language_loss": 0.68089432, + "learning_rate": 7.777959613122351e-07, + "loss": 0.70231605, + "num_input_tokens_seen": 128356270, + "step": 5971, + "time_per_iteration": 2.509718179702759 + }, + { + "auxiliary_loss_clip": 0.01134451, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.04446173, + "balance_loss_mlp": 1.02159977, + "epoch": 0.7180905428966512, + "flos": 28839249072000.0, + "grad_norm": 2.1143545794282694, + "language_loss": 0.77435935, + "learning_rate": 7.771794575875604e-07, + "loss": 0.7959938, + "num_input_tokens_seen": 128378140, + "step": 5972, + "time_per_iteration": 2.5498836040496826 + }, + { + "auxiliary_loss_clip": 0.0115676, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.04838026, + "balance_loss_mlp": 1.02320576, + "epoch": 0.7182107857872904, + "flos": 20047563285120.0, + "grad_norm": 2.489330217097126, + "language_loss": 0.77314985, + "learning_rate": 7.765631393637888e-07, + "loss": 0.79503107, + "num_input_tokens_seen": 128396335, + "step": 5973, + "time_per_iteration": 2.452133893966675 + }, + { + "auxiliary_loss_clip": 0.01150897, + "auxiliary_loss_mlp": 0.01022439, + "balance_loss_clip": 1.0430733, + "balance_loss_mlp": 1.01463699, + "epoch": 0.7183310286779294, + "flos": 22747686503040.0, + "grad_norm": 2.7973418161753263, + "language_loss": 0.48257142, + "learning_rate": 7.75947006734417e-07, + "loss": 0.50430477, + "num_input_tokens_seen": 128414115, + "step": 5974, + "time_per_iteration": 2.460461139678955 + }, + { + "auxiliary_loss_clip": 0.01167751, + "auxiliary_loss_mlp": 0.01025812, + "balance_loss_clip": 1.04583395, + "balance_loss_mlp": 1.01817071, + "epoch": 0.7184512715685685, + "flos": 17158262112000.0, + "grad_norm": 2.4379049762846234, + "language_loss": 0.82742852, + "learning_rate": 7.753310597929101e-07, + "loss": 0.84936416, + "num_input_tokens_seen": 128430755, + "step": 5975, + "time_per_iteration": 2.412544012069702 + }, + { + "auxiliary_loss_clip": 0.01064453, + "auxiliary_loss_mlp": 0.01000835, + "balance_loss_clip": 1.00733495, + "balance_loss_mlp": 0.99992353, + "epoch": 0.7185715144592076, + "flos": 65509611448320.0, + "grad_norm": 0.7640268066550094, + "language_loss": 0.55461776, + "learning_rate": 7.747152986327095e-07, + "loss": 0.57527065, + "num_input_tokens_seen": 128491300, + "step": 5976, + "time_per_iteration": 2.949226140975952 + }, + { + "auxiliary_loss_clip": 0.01117397, + "auxiliary_loss_mlp": 0.01025191, + "balance_loss_clip": 1.0425216, + "balance_loss_mlp": 1.0179162, + "epoch": 0.7186917573498467, + "flos": 16180522928640.0, + "grad_norm": 1.7788623423521648, + "language_loss": 0.68123996, + "learning_rate": 7.740997233472228e-07, + "loss": 0.70266581, + "num_input_tokens_seen": 128508920, + "step": 5977, + "time_per_iteration": 2.5196948051452637 + }, + { + "auxiliary_loss_clip": 0.01141248, + "auxiliary_loss_mlp": 0.01020415, + "balance_loss_clip": 1.04457009, + "balance_loss_mlp": 1.01359892, + "epoch": 0.7188120002404857, + "flos": 29242274647680.0, + "grad_norm": 2.6227641964960013, + "language_loss": 0.7081582, + "learning_rate": 7.734843340298329e-07, + "loss": 0.72977483, + "num_input_tokens_seen": 128528745, + "step": 5978, + "time_per_iteration": 2.546232223510742 + }, + { + "auxiliary_loss_clip": 0.01145433, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.04396749, + "balance_loss_mlp": 1.02079439, + "epoch": 0.7189322431311249, + "flos": 33401161008000.0, + "grad_norm": 1.9424643990713917, + "language_loss": 0.74782318, + "learning_rate": 7.72869130773895e-07, + "loss": 0.76956409, + "num_input_tokens_seen": 128549345, + "step": 5979, + "time_per_iteration": 2.576899766921997 + }, + { + "auxiliary_loss_clip": 0.01056549, + "auxiliary_loss_mlp": 0.01000482, + "balance_loss_clip": 1.00793231, + "balance_loss_mlp": 0.99952191, + "epoch": 0.719052486021764, + "flos": 61351263792000.0, + "grad_norm": 0.7862734167740836, + "language_loss": 0.59348571, + "learning_rate": 7.722541136727343e-07, + "loss": 0.61405611, + "num_input_tokens_seen": 128605360, + "step": 5980, + "time_per_iteration": 2.9185492992401123 + }, + { + "auxiliary_loss_clip": 0.01156901, + "auxiliary_loss_mlp": 0.01021579, + "balance_loss_clip": 1.04837787, + "balance_loss_mlp": 1.01389027, + "epoch": 0.719172728912403, + "flos": 15596795007360.0, + "grad_norm": 1.8564384331473207, + "language_loss": 0.80785692, + "learning_rate": 7.716392828196483e-07, + "loss": 0.8296417, + "num_input_tokens_seen": 128623160, + "step": 5981, + "time_per_iteration": 2.4284722805023193 + }, + { + "auxiliary_loss_clip": 0.01156157, + "auxiliary_loss_mlp": 0.01026257, + "balance_loss_clip": 1.04758215, + "balance_loss_mlp": 1.01871419, + "epoch": 0.7192929718030422, + "flos": 15553162961280.0, + "grad_norm": 2.4601724193959384, + "language_loss": 0.77319169, + "learning_rate": 7.710246383079064e-07, + "loss": 0.79501587, + "num_input_tokens_seen": 128638545, + "step": 5982, + "time_per_iteration": 2.420584201812744 + }, + { + "auxiliary_loss_clip": 0.01144329, + "auxiliary_loss_mlp": 0.01023807, + "balance_loss_clip": 1.04352462, + "balance_loss_mlp": 1.01611161, + "epoch": 0.7194132146936812, + "flos": 21862487733120.0, + "grad_norm": 2.3272788806267846, + "language_loss": 0.92301536, + "learning_rate": 7.704101802307492e-07, + "loss": 0.94469666, + "num_input_tokens_seen": 128650845, + "step": 5983, + "time_per_iteration": 2.476076602935791 + }, + { + "auxiliary_loss_clip": 0.01119732, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.04267752, + "balance_loss_mlp": 1.02260494, + "epoch": 0.7195334575843203, + "flos": 27338900958720.0, + "grad_norm": 2.0776669245687884, + "language_loss": 0.87063086, + "learning_rate": 7.697959086813912e-07, + "loss": 0.89213681, + "num_input_tokens_seen": 128667010, + "step": 5984, + "time_per_iteration": 2.543992519378662 + }, + { + "auxiliary_loss_clip": 0.01119562, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.04133534, + "balance_loss_mlp": 1.02013636, + "epoch": 0.7196537004749595, + "flos": 18770615809920.0, + "grad_norm": 1.672130821916179, + "language_loss": 0.80398893, + "learning_rate": 7.691818237530145e-07, + "loss": 0.82545912, + "num_input_tokens_seen": 128685870, + "step": 5985, + "time_per_iteration": 2.5190980434417725 + }, + { + "auxiliary_loss_clip": 0.01124654, + "auxiliary_loss_mlp": 0.01023537, + "balance_loss_clip": 1.04197168, + "balance_loss_mlp": 1.01646233, + "epoch": 0.7197739433655985, + "flos": 24531009960960.0, + "grad_norm": 1.9269450327579438, + "language_loss": 0.77284253, + "learning_rate": 7.685679255387774e-07, + "loss": 0.7943244, + "num_input_tokens_seen": 128704185, + "step": 5986, + "time_per_iteration": 2.6082041263580322 + }, + { + "auxiliary_loss_clip": 0.01139222, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.04482174, + "balance_loss_mlp": 1.01924491, + "epoch": 0.7198941862562376, + "flos": 18040587793920.0, + "grad_norm": 1.9107071343768711, + "language_loss": 0.76846981, + "learning_rate": 7.679542141318065e-07, + "loss": 0.7901293, + "num_input_tokens_seen": 128721290, + "step": 5987, + "time_per_iteration": 3.2328498363494873 + }, + { + "auxiliary_loss_clip": 0.01128143, + "auxiliary_loss_mlp": 0.01023715, + "balance_loss_clip": 1.04095697, + "balance_loss_mlp": 1.01639807, + "epoch": 0.7200144291468767, + "flos": 29022393542400.0, + "grad_norm": 2.228109591620719, + "language_loss": 0.75870693, + "learning_rate": 7.673406896252013e-07, + "loss": 0.78022552, + "num_input_tokens_seen": 128742665, + "step": 5988, + "time_per_iteration": 3.336941719055176 + }, + { + "auxiliary_loss_clip": 0.01125581, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.04122603, + "balance_loss_mlp": 1.02076316, + "epoch": 0.7201346720375158, + "flos": 25374264624000.0, + "grad_norm": 3.4918180060096367, + "language_loss": 0.78811353, + "learning_rate": 7.667273521120347e-07, + "loss": 0.80966198, + "num_input_tokens_seen": 128762225, + "step": 5989, + "time_per_iteration": 3.3004961013793945 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.04663169, + "balance_loss_mlp": 1.02440524, + "epoch": 0.7202549149281549, + "flos": 14355614499840.0, + "grad_norm": 1.922447523325306, + "language_loss": 0.79401839, + "learning_rate": 7.661142016853468e-07, + "loss": 0.81568235, + "num_input_tokens_seen": 128779585, + "step": 5990, + "time_per_iteration": 2.5076968669891357 + }, + { + "auxiliary_loss_clip": 0.01111573, + "auxiliary_loss_mlp": 0.01027651, + "balance_loss_clip": 1.04184186, + "balance_loss_mlp": 1.02023888, + "epoch": 0.7203751578187939, + "flos": 23001682550400.0, + "grad_norm": 1.653823133625968, + "language_loss": 0.74640018, + "learning_rate": 7.655012384381543e-07, + "loss": 0.7677924, + "num_input_tokens_seen": 128799070, + "step": 5991, + "time_per_iteration": 2.573552370071411 + }, + { + "auxiliary_loss_clip": 0.011402, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.04909062, + "balance_loss_mlp": 1.02057076, + "epoch": 0.7204954007094331, + "flos": 23692424065920.0, + "grad_norm": 2.0125593472365386, + "language_loss": 0.81794608, + "learning_rate": 7.648884624634415e-07, + "loss": 0.83962858, + "num_input_tokens_seen": 128817620, + "step": 5992, + "time_per_iteration": 3.319131374359131 + }, + { + "auxiliary_loss_clip": 0.01151808, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.04602098, + "balance_loss_mlp": 1.02057147, + "epoch": 0.7206156436000721, + "flos": 16253026531200.0, + "grad_norm": 2.17031355589184, + "language_loss": 0.89002782, + "learning_rate": 7.642758738541683e-07, + "loss": 0.91182697, + "num_input_tokens_seen": 128834200, + "step": 5993, + "time_per_iteration": 2.4501876831054688 + }, + { + "auxiliary_loss_clip": 0.01054806, + "auxiliary_loss_mlp": 0.01001756, + "balance_loss_clip": 1.0076313, + "balance_loss_mlp": 1.00078487, + "epoch": 0.7207358864907112, + "flos": 54377806504320.0, + "grad_norm": 0.7563037313012633, + "language_loss": 0.60729766, + "learning_rate": 7.636634727032621e-07, + "loss": 0.62786329, + "num_input_tokens_seen": 128891305, + "step": 5994, + "time_per_iteration": 2.8998985290527344 + }, + { + "auxiliary_loss_clip": 0.01129498, + "auxiliary_loss_mlp": 0.01025218, + "balance_loss_clip": 1.04013431, + "balance_loss_mlp": 1.0171628, + "epoch": 0.7208561293813504, + "flos": 19135540033920.0, + "grad_norm": 2.017561175594734, + "language_loss": 0.78811383, + "learning_rate": 7.630512591036231e-07, + "loss": 0.80966097, + "num_input_tokens_seen": 128910615, + "step": 5995, + "time_per_iteration": 2.5127363204956055 + }, + { + "auxiliary_loss_clip": 0.01159093, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.04808927, + "balance_loss_mlp": 1.01966357, + "epoch": 0.7209763722719894, + "flos": 17748526308480.0, + "grad_norm": 2.1816466147000786, + "language_loss": 0.64346045, + "learning_rate": 7.624392331481255e-07, + "loss": 0.66532797, + "num_input_tokens_seen": 128928270, + "step": 5996, + "time_per_iteration": 2.4229915142059326 + }, + { + "auxiliary_loss_clip": 0.01054973, + "auxiliary_loss_mlp": 0.00999781, + "balance_loss_clip": 1.00819898, + "balance_loss_mlp": 0.99876779, + "epoch": 0.7210966151626285, + "flos": 66819488716800.0, + "grad_norm": 0.7503658307446012, + "language_loss": 0.51795745, + "learning_rate": 7.618273949296115e-07, + "loss": 0.53850502, + "num_input_tokens_seen": 128987780, + "step": 5997, + "time_per_iteration": 2.9411673545837402 + }, + { + "auxiliary_loss_clip": 0.0113484, + "auxiliary_loss_mlp": 0.01025054, + "balance_loss_clip": 1.04183698, + "balance_loss_mlp": 1.01651299, + "epoch": 0.7212168580532676, + "flos": 21141869080320.0, + "grad_norm": 1.8922160422785768, + "language_loss": 0.68771732, + "learning_rate": 7.612157445408987e-07, + "loss": 0.70931625, + "num_input_tokens_seen": 129005590, + "step": 5998, + "time_per_iteration": 2.4903364181518555 + }, + { + "auxiliary_loss_clip": 0.0114552, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.04703784, + "balance_loss_mlp": 1.02166069, + "epoch": 0.7213371009439067, + "flos": 22345738335360.0, + "grad_norm": 2.0549555198683245, + "language_loss": 0.74206388, + "learning_rate": 7.606042820747716e-07, + "loss": 0.76381809, + "num_input_tokens_seen": 129021995, + "step": 5999, + "time_per_iteration": 2.5103423595428467 + }, + { + "auxiliary_loss_clip": 0.01148603, + "auxiliary_loss_mlp": 0.01021579, + "balance_loss_clip": 1.04904103, + "balance_loss_mlp": 1.01417875, + "epoch": 0.7214573438345457, + "flos": 18515901490560.0, + "grad_norm": 2.0931152446364494, + "language_loss": 0.85296214, + "learning_rate": 7.599930076239889e-07, + "loss": 0.87466395, + "num_input_tokens_seen": 129039280, + "step": 6000, + "time_per_iteration": 2.4759368896484375 + }, + { + "auxiliary_loss_clip": 0.01116629, + "auxiliary_loss_mlp": 0.00762251, + "balance_loss_clip": 1.04446995, + "balance_loss_mlp": 1.00066829, + "epoch": 0.7215775867251849, + "flos": 35736108606720.0, + "grad_norm": 1.9997676300756617, + "language_loss": 0.70565599, + "learning_rate": 7.593819212812818e-07, + "loss": 0.72444475, + "num_input_tokens_seen": 129060860, + "step": 6001, + "time_per_iteration": 2.67655348777771 + }, + { + "auxiliary_loss_clip": 0.0115732, + "auxiliary_loss_mlp": 0.0102444, + "balance_loss_clip": 1.04798961, + "balance_loss_mlp": 1.01743329, + "epoch": 0.721697829615824, + "flos": 20372410909440.0, + "grad_norm": 1.844123925507218, + "language_loss": 0.71424198, + "learning_rate": 7.587710231393508e-07, + "loss": 0.73605955, + "num_input_tokens_seen": 129079215, + "step": 6002, + "time_per_iteration": 2.4432690143585205 + }, + { + "auxiliary_loss_clip": 0.01073109, + "auxiliary_loss_mlp": 0.01022072, + "balance_loss_clip": 1.03709078, + "balance_loss_mlp": 1.01489305, + "epoch": 0.721818072506463, + "flos": 20229809915520.0, + "grad_norm": 2.0475979919284444, + "language_loss": 0.84002793, + "learning_rate": 7.581603132908685e-07, + "loss": 0.86097974, + "num_input_tokens_seen": 129097185, + "step": 6003, + "time_per_iteration": 2.6039481163024902 + }, + { + "auxiliary_loss_clip": 0.01123561, + "auxiliary_loss_mlp": 0.01024724, + "balance_loss_clip": 1.04350662, + "balance_loss_mlp": 1.01685059, + "epoch": 0.7219383153971022, + "flos": 18186887888640.0, + "grad_norm": 1.8101966070039692, + "language_loss": 0.78606617, + "learning_rate": 7.575497918284795e-07, + "loss": 0.807549, + "num_input_tokens_seen": 129114730, + "step": 6004, + "time_per_iteration": 2.4917502403259277 + }, + { + "auxiliary_loss_clip": 0.0117441, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.04880857, + "balance_loss_mlp": 1.02310634, + "epoch": 0.7220585582877412, + "flos": 17342124854400.0, + "grad_norm": 2.4631469451506027, + "language_loss": 0.74469543, + "learning_rate": 7.569394588447984e-07, + "loss": 0.76674855, + "num_input_tokens_seen": 129131745, + "step": 6005, + "time_per_iteration": 2.4067647457122803 + }, + { + "auxiliary_loss_clip": 0.01148053, + "auxiliary_loss_mlp": 0.01026403, + "balance_loss_clip": 1.04426658, + "balance_loss_mlp": 1.01894045, + "epoch": 0.7221788011783803, + "flos": 16976338704000.0, + "grad_norm": 2.493575099551229, + "language_loss": 0.78396934, + "learning_rate": 7.563293144324146e-07, + "loss": 0.80571395, + "num_input_tokens_seen": 129147295, + "step": 6006, + "time_per_iteration": 2.418809652328491 + }, + { + "auxiliary_loss_clip": 0.01169784, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.04936123, + "balance_loss_mlp": 1.02304029, + "epoch": 0.7222990440690195, + "flos": 26286359702400.0, + "grad_norm": 1.8705325420845442, + "language_loss": 0.80325389, + "learning_rate": 7.557193586838834e-07, + "loss": 0.82525468, + "num_input_tokens_seen": 129162660, + "step": 6007, + "time_per_iteration": 2.473767042160034 + }, + { + "auxiliary_loss_clip": 0.01146776, + "auxiliary_loss_mlp": 0.01022805, + "balance_loss_clip": 1.04464436, + "balance_loss_mlp": 1.01551843, + "epoch": 0.7224192869596585, + "flos": 17601687509760.0, + "grad_norm": 2.471817329842101, + "language_loss": 0.70840561, + "learning_rate": 7.551095916917371e-07, + "loss": 0.73010141, + "num_input_tokens_seen": 129179990, + "step": 6008, + "time_per_iteration": 2.4575278759002686 + }, + { + "auxiliary_loss_clip": 0.01139366, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.04426205, + "balance_loss_mlp": 1.01956081, + "epoch": 0.7225395298502976, + "flos": 12932331016320.0, + "grad_norm": 2.7164195622901874, + "language_loss": 0.66769993, + "learning_rate": 7.545000135484758e-07, + "loss": 0.68936932, + "num_input_tokens_seen": 129197425, + "step": 6009, + "time_per_iteration": 2.4902448654174805 + }, + { + "auxiliary_loss_clip": 0.01170744, + "auxiliary_loss_mlp": 0.00762527, + "balance_loss_clip": 1.04854286, + "balance_loss_mlp": 1.00063705, + "epoch": 0.7226597727409367, + "flos": 29643899592960.0, + "grad_norm": 1.9547402002451093, + "language_loss": 0.62808561, + "learning_rate": 7.538906243465714e-07, + "loss": 0.64741832, + "num_input_tokens_seen": 129217560, + "step": 6010, + "time_per_iteration": 2.48227858543396 + }, + { + "auxiliary_loss_clip": 0.01171805, + "auxiliary_loss_mlp": 0.01025774, + "balance_loss_clip": 1.04895282, + "balance_loss_mlp": 1.01813233, + "epoch": 0.7227800156315758, + "flos": 13771635183360.0, + "grad_norm": 2.0073200668537003, + "language_loss": 0.78924942, + "learning_rate": 7.5328142417847e-07, + "loss": 0.81122518, + "num_input_tokens_seen": 129234325, + "step": 6011, + "time_per_iteration": 2.401768445968628 + }, + { + "auxiliary_loss_clip": 0.01153474, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.04434872, + "balance_loss_mlp": 1.02593243, + "epoch": 0.7229002585222148, + "flos": 20301882554880.0, + "grad_norm": 1.8867070493861038, + "language_loss": 0.69251859, + "learning_rate": 7.526724131365838e-07, + "loss": 0.71438348, + "num_input_tokens_seen": 129255280, + "step": 6012, + "time_per_iteration": 2.4970955848693848 + }, + { + "auxiliary_loss_clip": 0.01142549, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.04898548, + "balance_loss_mlp": 1.02197874, + "epoch": 0.723020501412854, + "flos": 16581250033920.0, + "grad_norm": 1.9328322038397139, + "language_loss": 0.70417809, + "learning_rate": 7.520635913133017e-07, + "loss": 0.72590441, + "num_input_tokens_seen": 129273910, + "step": 6013, + "time_per_iteration": 3.261050224304199 + }, + { + "auxiliary_loss_clip": 0.01162993, + "auxiliary_loss_mlp": 0.01027887, + "balance_loss_clip": 1.04809403, + "balance_loss_mlp": 1.01932144, + "epoch": 0.7231407443034931, + "flos": 28548300908160.0, + "grad_norm": 5.306913536517546, + "language_loss": 0.82572877, + "learning_rate": 7.514549588009798e-07, + "loss": 0.84763759, + "num_input_tokens_seen": 129294785, + "step": 6014, + "time_per_iteration": 2.5323827266693115 + }, + { + "auxiliary_loss_clip": 0.01143548, + "auxiliary_loss_mlp": 0.01026684, + "balance_loss_clip": 1.04570174, + "balance_loss_mlp": 1.01957905, + "epoch": 0.7232609871941321, + "flos": 30008536508160.0, + "grad_norm": 2.0797768507875194, + "language_loss": 0.70347244, + "learning_rate": 7.508465156919492e-07, + "loss": 0.72517478, + "num_input_tokens_seen": 129318295, + "step": 6015, + "time_per_iteration": 3.321009874343872 + }, + { + "auxiliary_loss_clip": 0.01143418, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.04473913, + "balance_loss_mlp": 1.01938272, + "epoch": 0.7233812300847713, + "flos": 16654005031680.0, + "grad_norm": 2.9629282974305804, + "language_loss": 0.61532462, + "learning_rate": 7.502382620785083e-07, + "loss": 0.63703197, + "num_input_tokens_seen": 129334845, + "step": 6016, + "time_per_iteration": 3.181567668914795 + }, + { + "auxiliary_loss_clip": 0.01025604, + "auxiliary_loss_mlp": 0.0100666, + "balance_loss_clip": 1.00742686, + "balance_loss_mlp": 1.00553989, + "epoch": 0.7235014729754103, + "flos": 67258784050560.0, + "grad_norm": 0.8325725659099343, + "language_loss": 0.6250816, + "learning_rate": 7.496301980529289e-07, + "loss": 0.64540428, + "num_input_tokens_seen": 129398055, + "step": 6017, + "time_per_iteration": 3.118438720703125 + }, + { + "auxiliary_loss_clip": 0.0117135, + "auxiliary_loss_mlp": 0.01027133, + "balance_loss_clip": 1.0486033, + "balance_loss_mlp": 1.01969469, + "epoch": 0.7236217158660494, + "flos": 26943237671040.0, + "grad_norm": 2.3953971360056796, + "language_loss": 0.74485385, + "learning_rate": 7.490223237074547e-07, + "loss": 0.76683879, + "num_input_tokens_seen": 129417765, + "step": 6018, + "time_per_iteration": 2.4966444969177246 + }, + { + "auxiliary_loss_clip": 0.01126232, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.04122424, + "balance_loss_mlp": 1.02183259, + "epoch": 0.7237419587566886, + "flos": 29423372042880.0, + "grad_norm": 1.8413032700030023, + "language_loss": 0.65910047, + "learning_rate": 7.484146391342989e-07, + "loss": 0.68066096, + "num_input_tokens_seen": 129437560, + "step": 6019, + "time_per_iteration": 3.319014310836792 + }, + { + "auxiliary_loss_clip": 0.01135856, + "auxiliary_loss_mlp": 0.01024384, + "balance_loss_clip": 1.04335284, + "balance_loss_mlp": 1.01678991, + "epoch": 0.7238622016473276, + "flos": 17821496787840.0, + "grad_norm": 2.4176930745831093, + "language_loss": 0.5641883, + "learning_rate": 7.478071444256484e-07, + "loss": 0.58579063, + "num_input_tokens_seen": 129455320, + "step": 6020, + "time_per_iteration": 2.4505796432495117 + }, + { + "auxiliary_loss_clip": 0.01137717, + "auxiliary_loss_mlp": 0.0102535, + "balance_loss_clip": 1.04408216, + "balance_loss_mlp": 1.01781583, + "epoch": 0.7239824445379667, + "flos": 25739117020800.0, + "grad_norm": 1.8415736567276948, + "language_loss": 0.79474902, + "learning_rate": 7.471998396736579e-07, + "loss": 0.81637967, + "num_input_tokens_seen": 129475700, + "step": 6021, + "time_per_iteration": 2.547088623046875 + }, + { + "auxiliary_loss_clip": 0.01131521, + "auxiliary_loss_mlp": 0.01019501, + "balance_loss_clip": 1.04613161, + "balance_loss_mlp": 1.01219606, + "epoch": 0.7241026874286057, + "flos": 23148916398720.0, + "grad_norm": 1.779966501161682, + "language_loss": 0.758497, + "learning_rate": 7.465927249704549e-07, + "loss": 0.78000724, + "num_input_tokens_seen": 129493585, + "step": 6022, + "time_per_iteration": 2.5464892387390137 + }, + { + "auxiliary_loss_clip": 0.0115478, + "auxiliary_loss_mlp": 0.01024578, + "balance_loss_clip": 1.04594469, + "balance_loss_mlp": 1.01695704, + "epoch": 0.7242229303192449, + "flos": 20266905686400.0, + "grad_norm": 2.2984347260444267, + "language_loss": 0.77396321, + "learning_rate": 7.459858004081398e-07, + "loss": 0.79575682, + "num_input_tokens_seen": 129511555, + "step": 6023, + "time_per_iteration": 2.519322633743286 + }, + { + "auxiliary_loss_clip": 0.01025127, + "auxiliary_loss_mlp": 0.01002087, + "balance_loss_clip": 1.00664127, + "balance_loss_mlp": 1.00099039, + "epoch": 0.724343173209884, + "flos": 62311659684480.0, + "grad_norm": 0.6625103962333327, + "language_loss": 0.58025944, + "learning_rate": 7.453790660787815e-07, + "loss": 0.60053158, + "num_input_tokens_seen": 129579650, + "step": 6024, + "time_per_iteration": 3.2030444145202637 + }, + { + "auxiliary_loss_clip": 0.01144908, + "auxiliary_loss_mlp": 0.01024979, + "balance_loss_clip": 1.04589391, + "balance_loss_mlp": 1.01671195, + "epoch": 0.724463416100523, + "flos": 35006403813120.0, + "grad_norm": 2.057419404046907, + "language_loss": 0.63448638, + "learning_rate": 7.447725220744214e-07, + "loss": 0.65618527, + "num_input_tokens_seen": 129601895, + "step": 6025, + "time_per_iteration": 2.6507232189178467 + }, + { + "auxiliary_loss_clip": 0.01169943, + "auxiliary_loss_mlp": 0.01028011, + "balance_loss_clip": 1.04641962, + "balance_loss_mlp": 1.02044392, + "epoch": 0.7245836589911622, + "flos": 21871968923520.0, + "grad_norm": 2.2814020487464775, + "language_loss": 0.7728411, + "learning_rate": 7.441661684870717e-07, + "loss": 0.79482067, + "num_input_tokens_seen": 129622150, + "step": 6026, + "time_per_iteration": 2.4541237354278564 + }, + { + "auxiliary_loss_clip": 0.01170643, + "auxiliary_loss_mlp": 0.01023427, + "balance_loss_clip": 1.0490135, + "balance_loss_mlp": 1.01617599, + "epoch": 0.7247039018818012, + "flos": 23006494972800.0, + "grad_norm": 1.7365207852448599, + "language_loss": 0.81530344, + "learning_rate": 7.435600054087152e-07, + "loss": 0.83724415, + "num_input_tokens_seen": 129644315, + "step": 6027, + "time_per_iteration": 2.48465895652771 + }, + { + "auxiliary_loss_clip": 0.011723, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.05008388, + "balance_loss_mlp": 1.02338111, + "epoch": 0.7248241447724403, + "flos": 31722588587520.0, + "grad_norm": 1.935988049515235, + "language_loss": 0.7404654, + "learning_rate": 7.42954032931308e-07, + "loss": 0.76249766, + "num_input_tokens_seen": 129665355, + "step": 6028, + "time_per_iteration": 2.4925217628479004 + }, + { + "auxiliary_loss_clip": 0.01143072, + "auxiliary_loss_mlp": 0.01025905, + "balance_loss_clip": 1.04421067, + "balance_loss_mlp": 1.01881838, + "epoch": 0.7249443876630794, + "flos": 34896984007680.0, + "grad_norm": 1.7806124184005243, + "language_loss": 0.74455208, + "learning_rate": 7.423482511467733e-07, + "loss": 0.76624191, + "num_input_tokens_seen": 129686125, + "step": 6029, + "time_per_iteration": 2.625861167907715 + }, + { + "auxiliary_loss_clip": 0.01089053, + "auxiliary_loss_mlp": 0.01028491, + "balance_loss_clip": 1.03950691, + "balance_loss_mlp": 1.02112341, + "epoch": 0.7250646305537185, + "flos": 26359294268160.0, + "grad_norm": 2.3130224258215026, + "language_loss": 0.64334667, + "learning_rate": 7.417426601470099e-07, + "loss": 0.66452211, + "num_input_tokens_seen": 129706485, + "step": 6030, + "time_per_iteration": 2.620666742324829 + }, + { + "auxiliary_loss_clip": 0.01158017, + "auxiliary_loss_mlp": 0.01024733, + "balance_loss_clip": 1.04722118, + "balance_loss_mlp": 1.01672792, + "epoch": 0.7251848734443576, + "flos": 30081614728320.0, + "grad_norm": 3.2622140955057817, + "language_loss": 0.7836327, + "learning_rate": 7.411372600238841e-07, + "loss": 0.80546016, + "num_input_tokens_seen": 129727100, + "step": 6031, + "time_per_iteration": 2.5310516357421875 + }, + { + "auxiliary_loss_clip": 0.01169726, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.04728055, + "balance_loss_mlp": 1.02125788, + "epoch": 0.7253051163349967, + "flos": 17785262943360.0, + "grad_norm": 2.1531273562587274, + "language_loss": 0.73766214, + "learning_rate": 7.405320508692346e-07, + "loss": 0.75965154, + "num_input_tokens_seen": 129745840, + "step": 6032, + "time_per_iteration": 2.387960433959961 + }, + { + "auxiliary_loss_clip": 0.01167039, + "auxiliary_loss_mlp": 0.0102267, + "balance_loss_clip": 1.0478251, + "balance_loss_mlp": 1.01523757, + "epoch": 0.7254253592256358, + "flos": 12641346938880.0, + "grad_norm": 1.881967228606484, + "language_loss": 0.75577259, + "learning_rate": 7.399270327748727e-07, + "loss": 0.77766967, + "num_input_tokens_seen": 129763500, + "step": 6033, + "time_per_iteration": 2.4322421550750732 + }, + { + "auxiliary_loss_clip": 0.01127519, + "auxiliary_loss_mlp": 0.00761058, + "balance_loss_clip": 1.04167295, + "balance_loss_mlp": 1.00057125, + "epoch": 0.7255456021162748, + "flos": 27199208966400.0, + "grad_norm": 1.8840300208626193, + "language_loss": 0.74493486, + "learning_rate": 7.39322205832577e-07, + "loss": 0.76382065, + "num_input_tokens_seen": 129784390, + "step": 6034, + "time_per_iteration": 2.571841239929199 + }, + { + "auxiliary_loss_clip": 0.01136538, + "auxiliary_loss_mlp": 0.01022631, + "balance_loss_clip": 1.04311728, + "balance_loss_mlp": 1.01511526, + "epoch": 0.725665845006914, + "flos": 21288205088640.0, + "grad_norm": 2.238848775287063, + "language_loss": 0.80968398, + "learning_rate": 7.387175701341009e-07, + "loss": 0.8312757, + "num_input_tokens_seen": 129803060, + "step": 6035, + "time_per_iteration": 2.4979777336120605 + }, + { + "auxiliary_loss_clip": 0.01155885, + "auxiliary_loss_mlp": 0.01022739, + "balance_loss_clip": 1.0455848, + "balance_loss_mlp": 1.01516354, + "epoch": 0.7257860878975531, + "flos": 16033684129920.0, + "grad_norm": 3.4090022499495682, + "language_loss": 0.71863961, + "learning_rate": 7.381131257711659e-07, + "loss": 0.74042594, + "num_input_tokens_seen": 129820165, + "step": 6036, + "time_per_iteration": 2.468266487121582 + }, + { + "auxiliary_loss_clip": 0.01140748, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.05055416, + "balance_loss_mlp": 1.02197766, + "epoch": 0.7259063307881921, + "flos": 12129943052160.0, + "grad_norm": 1.8270706076415586, + "language_loss": 0.84014612, + "learning_rate": 7.375088728354677e-07, + "loss": 0.86184943, + "num_input_tokens_seen": 129835195, + "step": 6037, + "time_per_iteration": 2.470543384552002 + }, + { + "auxiliary_loss_clip": 0.01131395, + "auxiliary_loss_mlp": 0.01025001, + "balance_loss_clip": 1.04370749, + "balance_loss_mlp": 1.01713347, + "epoch": 0.7260265736788313, + "flos": 30443845432320.0, + "grad_norm": 8.283864783500947, + "language_loss": 0.67043257, + "learning_rate": 7.369048114186691e-07, + "loss": 0.69199646, + "num_input_tokens_seen": 129856240, + "step": 6038, + "time_per_iteration": 2.6003732681274414 + }, + { + "auxiliary_loss_clip": 0.01137353, + "auxiliary_loss_mlp": 0.00761928, + "balance_loss_clip": 1.04565287, + "balance_loss_mlp": 1.00067139, + "epoch": 0.7261468165694703, + "flos": 21142264129920.0, + "grad_norm": 2.0677540376653716, + "language_loss": 0.83062339, + "learning_rate": 7.363009416124055e-07, + "loss": 0.84961623, + "num_input_tokens_seen": 129875565, + "step": 6039, + "time_per_iteration": 2.53202223777771 + }, + { + "auxiliary_loss_clip": 0.01130976, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.04575562, + "balance_loss_mlp": 1.01932299, + "epoch": 0.7262670594601094, + "flos": 22306308180480.0, + "grad_norm": 3.214154240979727, + "language_loss": 0.63126624, + "learning_rate": 7.356972635082852e-07, + "loss": 0.65284371, + "num_input_tokens_seen": 129894420, + "step": 6040, + "time_per_iteration": 3.33056378364563 + }, + { + "auxiliary_loss_clip": 0.01111006, + "auxiliary_loss_mlp": 0.01027768, + "balance_loss_clip": 1.04455996, + "balance_loss_mlp": 1.0202632, + "epoch": 0.7263873023507486, + "flos": 25335049950720.0, + "grad_norm": 1.713183281915521, + "language_loss": 0.74988532, + "learning_rate": 7.35093777197884e-07, + "loss": 0.77127302, + "num_input_tokens_seen": 129914490, + "step": 6041, + "time_per_iteration": 2.6072163581848145 + }, + { + "auxiliary_loss_clip": 0.01140041, + "auxiliary_loss_mlp": 0.01021568, + "balance_loss_clip": 1.04624391, + "balance_loss_mlp": 1.01424527, + "epoch": 0.7265075452413876, + "flos": 23878621192320.0, + "grad_norm": 2.534527562158057, + "language_loss": 0.85781091, + "learning_rate": 7.344904827727525e-07, + "loss": 0.87942702, + "num_input_tokens_seen": 129931670, + "step": 6042, + "time_per_iteration": 3.9733269214630127 + }, + { + "auxiliary_loss_clip": 0.01129047, + "auxiliary_loss_mlp": 0.01023787, + "balance_loss_clip": 1.04193199, + "balance_loss_mlp": 1.01603842, + "epoch": 0.7266277881320267, + "flos": 28724549967360.0, + "grad_norm": 2.12241536388991, + "language_loss": 0.73697507, + "learning_rate": 7.338873803244076e-07, + "loss": 0.75850344, + "num_input_tokens_seen": 129946905, + "step": 6043, + "time_per_iteration": 2.576366424560547 + }, + { + "auxiliary_loss_clip": 0.0113726, + "auxiliary_loss_mlp": 0.01023728, + "balance_loss_clip": 1.04539752, + "balance_loss_mlp": 1.01680756, + "epoch": 0.7267480310226658, + "flos": 24863507182080.0, + "grad_norm": 1.7535775022120872, + "language_loss": 0.80726939, + "learning_rate": 7.332844699443401e-07, + "loss": 0.82887924, + "num_input_tokens_seen": 129965505, + "step": 6044, + "time_per_iteration": 2.5326473712921143 + }, + { + "auxiliary_loss_clip": 0.01104794, + "auxiliary_loss_mlp": 0.01025271, + "balance_loss_clip": 1.03984928, + "balance_loss_mlp": 1.01854432, + "epoch": 0.7268682739133049, + "flos": 27198490694400.0, + "grad_norm": 4.05398078678409, + "language_loss": 0.75446594, + "learning_rate": 7.326817517240121e-07, + "loss": 0.77576661, + "num_input_tokens_seen": 129987210, + "step": 6045, + "time_per_iteration": 2.645451307296753 + }, + { + "auxiliary_loss_clip": 0.01156462, + "auxiliary_loss_mlp": 0.00761206, + "balance_loss_clip": 1.04655719, + "balance_loss_mlp": 1.00057817, + "epoch": 0.7269885168039439, + "flos": 33508138688640.0, + "grad_norm": 1.796734142588718, + "language_loss": 0.83575797, + "learning_rate": 7.320792257548545e-07, + "loss": 0.85493469, + "num_input_tokens_seen": 130008385, + "step": 6046, + "time_per_iteration": 3.2775890827178955 + }, + { + "auxiliary_loss_clip": 0.01147043, + "auxiliary_loss_mlp": 0.01025108, + "balance_loss_clip": 1.04635739, + "balance_loss_mlp": 1.01751423, + "epoch": 0.7271087596945831, + "flos": 24313750548480.0, + "grad_norm": 17.291652043169048, + "language_loss": 0.76255691, + "learning_rate": 7.314768921282704e-07, + "loss": 0.78427845, + "num_input_tokens_seen": 130029040, + "step": 6047, + "time_per_iteration": 2.523672580718994 + }, + { + "auxiliary_loss_clip": 0.01157222, + "auxiliary_loss_mlp": 0.01024364, + "balance_loss_clip": 1.04587054, + "balance_loss_mlp": 1.01709795, + "epoch": 0.7272290025852222, + "flos": 23805147922560.0, + "grad_norm": 3.0900046327804462, + "language_loss": 0.71646619, + "learning_rate": 7.30874750935633e-07, + "loss": 0.73828197, + "num_input_tokens_seen": 130048725, + "step": 6048, + "time_per_iteration": 2.5394554138183594 + }, + { + "auxiliary_loss_clip": 0.01127701, + "auxiliary_loss_mlp": 0.01025852, + "balance_loss_clip": 1.04518127, + "balance_loss_mlp": 1.0183028, + "epoch": 0.7273492454758612, + "flos": 16720367408640.0, + "grad_norm": 1.9256094620867956, + "language_loss": 0.78980285, + "learning_rate": 7.30272802268286e-07, + "loss": 0.81133842, + "num_input_tokens_seen": 130065720, + "step": 6049, + "time_per_iteration": 2.4986226558685303 + }, + { + "auxiliary_loss_clip": 0.01071597, + "auxiliary_loss_mlp": 0.01019759, + "balance_loss_clip": 1.03521919, + "balance_loss_mlp": 1.01267171, + "epoch": 0.7274694883665004, + "flos": 28031330413440.0, + "grad_norm": 1.66710487330793, + "language_loss": 0.76321763, + "learning_rate": 7.29671046217547e-07, + "loss": 0.78413117, + "num_input_tokens_seen": 130084830, + "step": 6050, + "time_per_iteration": 2.6221542358398438 + }, + { + "auxiliary_loss_clip": 0.0112827, + "auxiliary_loss_mlp": 0.01027041, + "balance_loss_clip": 1.04490423, + "balance_loss_mlp": 1.01993883, + "epoch": 0.7275897312571394, + "flos": 30372706546560.0, + "grad_norm": 2.0829522444169366, + "language_loss": 0.81647694, + "learning_rate": 7.290694828746988e-07, + "loss": 0.83802998, + "num_input_tokens_seen": 130104495, + "step": 6051, + "time_per_iteration": 2.591071844100952 + }, + { + "auxiliary_loss_clip": 0.01131597, + "auxiliary_loss_mlp": 0.01024934, + "balance_loss_clip": 1.0426048, + "balance_loss_mlp": 1.01714396, + "epoch": 0.7277099741477785, + "flos": 19204775498880.0, + "grad_norm": 1.85058729159132, + "language_loss": 0.8559984, + "learning_rate": 7.284681123310004e-07, + "loss": 0.87756371, + "num_input_tokens_seen": 130123210, + "step": 6052, + "time_per_iteration": 2.547013759613037 + }, + { + "auxiliary_loss_clip": 0.01154986, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.04649186, + "balance_loss_mlp": 1.02097106, + "epoch": 0.7278302170384175, + "flos": 20667884186880.0, + "grad_norm": 1.961748580164719, + "language_loss": 0.79871404, + "learning_rate": 7.27866934677678e-07, + "loss": 0.82055509, + "num_input_tokens_seen": 130142880, + "step": 6053, + "time_per_iteration": 2.4833528995513916 + }, + { + "auxiliary_loss_clip": 0.01108922, + "auxiliary_loss_mlp": 0.01025673, + "balance_loss_clip": 1.04229712, + "balance_loss_mlp": 1.01819277, + "epoch": 0.7279504599290567, + "flos": 19093200877440.0, + "grad_norm": 1.7666655507698095, + "language_loss": 0.78134596, + "learning_rate": 7.272659500059297e-07, + "loss": 0.80269194, + "num_input_tokens_seen": 130160220, + "step": 6054, + "time_per_iteration": 2.5351901054382324 + }, + { + "auxiliary_loss_clip": 0.01149221, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.04560328, + "balance_loss_mlp": 1.02632701, + "epoch": 0.7280707028196958, + "flos": 19062174504960.0, + "grad_norm": 2.2811758113903324, + "language_loss": 0.80572057, + "learning_rate": 7.266651584069264e-07, + "loss": 0.82754958, + "num_input_tokens_seen": 130177885, + "step": 6055, + "time_per_iteration": 2.444190502166748 + }, + { + "auxiliary_loss_clip": 0.01160188, + "auxiliary_loss_mlp": 0.01021625, + "balance_loss_clip": 1.04932427, + "balance_loss_mlp": 1.01423395, + "epoch": 0.7281909457103348, + "flos": 37196308293120.0, + "grad_norm": 1.6543890031650956, + "language_loss": 0.56764275, + "learning_rate": 7.260645599718045e-07, + "loss": 0.58946085, + "num_input_tokens_seen": 130204240, + "step": 6056, + "time_per_iteration": 2.7059309482574463 + }, + { + "auxiliary_loss_clip": 0.01141812, + "auxiliary_loss_mlp": 0.01028147, + "balance_loss_clip": 1.04490554, + "balance_loss_mlp": 1.01991558, + "epoch": 0.728311188600974, + "flos": 20667094087680.0, + "grad_norm": 2.2321195803384026, + "language_loss": 0.67089045, + "learning_rate": 7.254641547916767e-07, + "loss": 0.69259, + "num_input_tokens_seen": 130221735, + "step": 6057, + "time_per_iteration": 2.61248517036438 + }, + { + "auxiliary_loss_clip": 0.01170045, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.04901052, + "balance_loss_mlp": 1.02100921, + "epoch": 0.728431431491613, + "flos": 28840685616000.0, + "grad_norm": 2.2332234210655475, + "language_loss": 0.69276154, + "learning_rate": 7.248639429576226e-07, + "loss": 0.71474612, + "num_input_tokens_seen": 130241190, + "step": 6058, + "time_per_iteration": 2.6021432876586914 + }, + { + "auxiliary_loss_clip": 0.01156275, + "auxiliary_loss_mlp": 0.01026357, + "balance_loss_clip": 1.04548132, + "balance_loss_mlp": 1.01873934, + "epoch": 0.7285516743822521, + "flos": 25991856092160.0, + "grad_norm": 1.627141464288172, + "language_loss": 0.7187413, + "learning_rate": 7.242639245606959e-07, + "loss": 0.74056768, + "num_input_tokens_seen": 130260980, + "step": 6059, + "time_per_iteration": 2.6046581268310547 + }, + { + "auxiliary_loss_clip": 0.01148061, + "auxiliary_loss_mlp": 0.01024806, + "balance_loss_clip": 1.0443573, + "balance_loss_mlp": 1.01689017, + "epoch": 0.7286719172728913, + "flos": 16399721675520.0, + "grad_norm": 1.6181454062829403, + "language_loss": 0.82175255, + "learning_rate": 7.236640996919168e-07, + "loss": 0.84348124, + "num_input_tokens_seen": 130280025, + "step": 6060, + "time_per_iteration": 2.604135513305664 + }, + { + "auxiliary_loss_clip": 0.01157931, + "auxiliary_loss_mlp": 0.01024322, + "balance_loss_clip": 1.04595327, + "balance_loss_mlp": 1.01709771, + "epoch": 0.7287921601635303, + "flos": 22018161277440.0, + "grad_norm": 1.6425646483985847, + "language_loss": 0.70669776, + "learning_rate": 7.230644684422782e-07, + "loss": 0.72852027, + "num_input_tokens_seen": 130300255, + "step": 6061, + "time_per_iteration": 2.607149839401245 + }, + { + "auxiliary_loss_clip": 0.0112941, + "auxiliary_loss_mlp": 0.01027236, + "balance_loss_clip": 1.0450666, + "balance_loss_mlp": 1.01992249, + "epoch": 0.7289124030541694, + "flos": 24600927784320.0, + "grad_norm": 1.9773301767758806, + "language_loss": 0.8136183, + "learning_rate": 7.224650309027451e-07, + "loss": 0.83518475, + "num_input_tokens_seen": 130320005, + "step": 6062, + "time_per_iteration": 2.539685010910034 + }, + { + "auxiliary_loss_clip": 0.0115989, + "auxiliary_loss_mlp": 0.0102483, + "balance_loss_clip": 1.0492382, + "balance_loss_mlp": 1.01771617, + "epoch": 0.7290326459448085, + "flos": 21393638484480.0, + "grad_norm": 2.0956020487797167, + "language_loss": 0.68692672, + "learning_rate": 7.218657871642506e-07, + "loss": 0.70877397, + "num_input_tokens_seen": 130338810, + "step": 6063, + "time_per_iteration": 2.457029104232788 + }, + { + "auxiliary_loss_clip": 0.01173399, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.04912806, + "balance_loss_mlp": 1.01961374, + "epoch": 0.7291528888354476, + "flos": 18587686821120.0, + "grad_norm": 2.07231687034867, + "language_loss": 0.62441474, + "learning_rate": 7.212667373177012e-07, + "loss": 0.64642066, + "num_input_tokens_seen": 130353805, + "step": 6064, + "time_per_iteration": 2.3997695446014404 + }, + { + "auxiliary_loss_clip": 0.01128303, + "auxiliary_loss_mlp": 0.01023845, + "balance_loss_clip": 1.04387975, + "balance_loss_mlp": 1.01616776, + "epoch": 0.7292731317260867, + "flos": 18951066760320.0, + "grad_norm": 1.7460499344183673, + "language_loss": 0.75436294, + "learning_rate": 7.206678814539704e-07, + "loss": 0.77588445, + "num_input_tokens_seen": 130372105, + "step": 6065, + "time_per_iteration": 2.499920606613159 + }, + { + "auxiliary_loss_clip": 0.01122929, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.0438447, + "balance_loss_mlp": 1.01915407, + "epoch": 0.7293933746167258, + "flos": 21067569797760.0, + "grad_norm": 1.6047815240690368, + "language_loss": 0.72653913, + "learning_rate": 7.20069219663904e-07, + "loss": 0.7480284, + "num_input_tokens_seen": 130391990, + "step": 6066, + "time_per_iteration": 2.5711958408355713 + }, + { + "auxiliary_loss_clip": 0.01157491, + "auxiliary_loss_mlp": 0.01022671, + "balance_loss_clip": 1.04447579, + "balance_loss_mlp": 1.01533413, + "epoch": 0.7295136175073649, + "flos": 22453326547200.0, + "grad_norm": 1.669284254729695, + "language_loss": 0.79146457, + "learning_rate": 7.1947075203832e-07, + "loss": 0.81326628, + "num_input_tokens_seen": 130411970, + "step": 6067, + "time_per_iteration": 3.5593109130859375 + }, + { + "auxiliary_loss_clip": 0.0106505, + "auxiliary_loss_mlp": 0.01000877, + "balance_loss_clip": 1.0079093, + "balance_loss_mlp": 0.99992895, + "epoch": 0.7296338603980039, + "flos": 56125506648960.0, + "grad_norm": 0.8612660783548666, + "language_loss": 0.60143495, + "learning_rate": 7.188724786680049e-07, + "loss": 0.62209421, + "num_input_tokens_seen": 130472440, + "step": 6068, + "time_per_iteration": 3.825794219970703 + }, + { + "auxiliary_loss_clip": 0.01139478, + "auxiliary_loss_mlp": 0.0102452, + "balance_loss_clip": 1.04247642, + "balance_loss_mlp": 1.01689625, + "epoch": 0.7297541032886431, + "flos": 25228287751680.0, + "grad_norm": 1.6224506835866768, + "language_loss": 0.75559109, + "learning_rate": 7.182743996437162e-07, + "loss": 0.7772311, + "num_input_tokens_seen": 130491975, + "step": 6069, + "time_per_iteration": 3.2705817222595215 + }, + { + "auxiliary_loss_clip": 0.01132835, + "auxiliary_loss_mlp": 0.01024191, + "balance_loss_clip": 1.04062748, + "balance_loss_mlp": 1.0162282, + "epoch": 0.7298743461792822, + "flos": 26467600752000.0, + "grad_norm": 1.9279082364864488, + "language_loss": 0.68261051, + "learning_rate": 7.176765150561819e-07, + "loss": 0.70418078, + "num_input_tokens_seen": 130510580, + "step": 6070, + "time_per_iteration": 2.5538153648376465 + }, + { + "auxiliary_loss_clip": 0.01170644, + "auxiliary_loss_mlp": 0.01023076, + "balance_loss_clip": 1.0461359, + "balance_loss_mlp": 1.0156312, + "epoch": 0.7299945890699212, + "flos": 19569053278080.0, + "grad_norm": 2.3619843276489307, + "language_loss": 0.79443681, + "learning_rate": 7.170788249961002e-07, + "loss": 0.816374, + "num_input_tokens_seen": 130529090, + "step": 6071, + "time_per_iteration": 2.414219856262207 + }, + { + "auxiliary_loss_clip": 0.01166117, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.04541516, + "balance_loss_mlp": 1.01927495, + "epoch": 0.7301148319605604, + "flos": 22928963466240.0, + "grad_norm": 1.858680840108943, + "language_loss": 0.88128018, + "learning_rate": 7.164813295541418e-07, + "loss": 0.90320861, + "num_input_tokens_seen": 130548655, + "step": 6072, + "time_per_iteration": 3.236715793609619 + }, + { + "auxiliary_loss_clip": 0.01144238, + "auxiliary_loss_mlp": 0.01027961, + "balance_loss_clip": 1.04537177, + "balance_loss_mlp": 1.02063525, + "epoch": 0.7302350748511994, + "flos": 25369703596800.0, + "grad_norm": 1.708875412753505, + "language_loss": 0.70472622, + "learning_rate": 7.15884028820944e-07, + "loss": 0.72644818, + "num_input_tokens_seen": 130567710, + "step": 6073, + "time_per_iteration": 2.5365281105041504 + }, + { + "auxiliary_loss_clip": 0.01121415, + "auxiliary_loss_mlp": 0.01022615, + "balance_loss_clip": 1.04014349, + "balance_loss_mlp": 1.0154326, + "epoch": 0.7303553177418385, + "flos": 27819170732160.0, + "grad_norm": 3.2764483313435306, + "language_loss": 0.60398841, + "learning_rate": 7.152869228871185e-07, + "loss": 0.62542868, + "num_input_tokens_seen": 130590195, + "step": 6074, + "time_per_iteration": 2.6354188919067383 + }, + { + "auxiliary_loss_clip": 0.01136103, + "auxiliary_loss_mlp": 0.01029831, + "balance_loss_clip": 1.04363036, + "balance_loss_mlp": 1.02216005, + "epoch": 0.7304755606324776, + "flos": 24426510318720.0, + "grad_norm": 1.70770068631523, + "language_loss": 0.72054261, + "learning_rate": 7.146900118432457e-07, + "loss": 0.74220192, + "num_input_tokens_seen": 130609940, + "step": 6075, + "time_per_iteration": 2.518756866455078 + }, + { + "auxiliary_loss_clip": 0.01083843, + "auxiliary_loss_mlp": 0.01028554, + "balance_loss_clip": 1.03467703, + "balance_loss_mlp": 1.02145195, + "epoch": 0.7305958035231167, + "flos": 23840483927040.0, + "grad_norm": 1.6941066871583785, + "language_loss": 0.85721266, + "learning_rate": 7.140932957798753e-07, + "loss": 0.87833661, + "num_input_tokens_seen": 130628380, + "step": 6076, + "time_per_iteration": 2.7266621589660645 + }, + { + "auxiliary_loss_clip": 0.01143013, + "auxiliary_loss_mlp": 0.01025988, + "balance_loss_clip": 1.0420866, + "balance_loss_mlp": 1.01832855, + "epoch": 0.7307160464137558, + "flos": 16726939597440.0, + "grad_norm": 2.176122973035219, + "language_loss": 0.71050882, + "learning_rate": 7.134967747875309e-07, + "loss": 0.73219883, + "num_input_tokens_seen": 130646590, + "step": 6077, + "time_per_iteration": 2.7817282676696777 + }, + { + "auxiliary_loss_clip": 0.01150518, + "auxiliary_loss_mlp": 0.01026384, + "balance_loss_clip": 1.0434382, + "balance_loss_mlp": 1.01910019, + "epoch": 0.7308362893043949, + "flos": 21798280172160.0, + "grad_norm": 1.8240682407321795, + "language_loss": 0.81721961, + "learning_rate": 7.129004489567014e-07, + "loss": 0.83898854, + "num_input_tokens_seen": 130664070, + "step": 6078, + "time_per_iteration": 2.470581293106079 + }, + { + "auxiliary_loss_clip": 0.01131242, + "auxiliary_loss_mlp": 0.01024966, + "balance_loss_clip": 1.04263389, + "balance_loss_mlp": 1.0175426, + "epoch": 0.730956532195034, + "flos": 10707377840640.0, + "grad_norm": 2.353628406435186, + "language_loss": 0.77782869, + "learning_rate": 7.123043183778512e-07, + "loss": 0.79939079, + "num_input_tokens_seen": 130681400, + "step": 6079, + "time_per_iteration": 2.4861011505126953 + }, + { + "auxiliary_loss_clip": 0.01135709, + "auxiliary_loss_mlp": 0.01031138, + "balance_loss_clip": 1.046139, + "balance_loss_mlp": 1.0236907, + "epoch": 0.731076775085673, + "flos": 19791987039360.0, + "grad_norm": 1.5994334911015164, + "language_loss": 0.65243566, + "learning_rate": 7.117083831414114e-07, + "loss": 0.67410415, + "num_input_tokens_seen": 130700675, + "step": 6080, + "time_per_iteration": 2.5100924968719482 + }, + { + "auxiliary_loss_clip": 0.01168134, + "auxiliary_loss_mlp": 0.01023818, + "balance_loss_clip": 1.04823399, + "balance_loss_mlp": 1.01648998, + "epoch": 0.7311970179763122, + "flos": 20447033414400.0, + "grad_norm": 1.8004113822123833, + "language_loss": 0.69720042, + "learning_rate": 7.11112643337787e-07, + "loss": 0.71911997, + "num_input_tokens_seen": 130719720, + "step": 6081, + "time_per_iteration": 2.4132373332977295 + }, + { + "auxiliary_loss_clip": 0.01143881, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.04746115, + "balance_loss_mlp": 1.02169204, + "epoch": 0.7313172608669513, + "flos": 18513818501760.0, + "grad_norm": 2.650270889812563, + "language_loss": 0.76595819, + "learning_rate": 7.10517099057349e-07, + "loss": 0.78769332, + "num_input_tokens_seen": 130736670, + "step": 6082, + "time_per_iteration": 2.4693167209625244 + }, + { + "auxiliary_loss_clip": 0.01140073, + "auxiliary_loss_mlp": 0.01024461, + "balance_loss_clip": 1.04302239, + "balance_loss_mlp": 1.01655126, + "epoch": 0.7314375037575903, + "flos": 16180738410240.0, + "grad_norm": 2.7304771121752154, + "language_loss": 0.61444914, + "learning_rate": 7.099217503904411e-07, + "loss": 0.63609457, + "num_input_tokens_seen": 130754525, + "step": 6083, + "time_per_iteration": 2.4563560485839844 + }, + { + "auxiliary_loss_clip": 0.01144807, + "auxiliary_loss_mlp": 0.01023153, + "balance_loss_clip": 1.04546475, + "balance_loss_mlp": 1.01594067, + "epoch": 0.7315577466482295, + "flos": 17967940536960.0, + "grad_norm": 1.9555273350400046, + "language_loss": 0.89957345, + "learning_rate": 7.093265974273788e-07, + "loss": 0.92125309, + "num_input_tokens_seen": 130772420, + "step": 6084, + "time_per_iteration": 2.4615063667297363 + }, + { + "auxiliary_loss_clip": 0.01155605, + "auxiliary_loss_mlp": 0.01024415, + "balance_loss_clip": 1.04456115, + "balance_loss_mlp": 1.01719737, + "epoch": 0.7316779895388685, + "flos": 18405440190720.0, + "grad_norm": 1.787608833139521, + "language_loss": 0.72173131, + "learning_rate": 7.087316402584447e-07, + "loss": 0.74353147, + "num_input_tokens_seen": 130791245, + "step": 6085, + "time_per_iteration": 2.4670615196228027 + }, + { + "auxiliary_loss_clip": 0.01169043, + "auxiliary_loss_mlp": 0.01020938, + "balance_loss_clip": 1.04739797, + "balance_loss_mlp": 1.01342821, + "epoch": 0.7317982324295076, + "flos": 17928294900480.0, + "grad_norm": 2.337311271151604, + "language_loss": 0.8637175, + "learning_rate": 7.081368789738953e-07, + "loss": 0.88561726, + "num_input_tokens_seen": 130808445, + "step": 6086, + "time_per_iteration": 2.390587568283081 + }, + { + "auxiliary_loss_clip": 0.01134462, + "auxiliary_loss_mlp": 0.01025791, + "balance_loss_clip": 1.03970885, + "balance_loss_mlp": 1.01784265, + "epoch": 0.7319184753201466, + "flos": 27229840289280.0, + "grad_norm": 1.92557510677315, + "language_loss": 0.77721524, + "learning_rate": 7.075423136639537e-07, + "loss": 0.79881781, + "num_input_tokens_seen": 130827700, + "step": 6087, + "time_per_iteration": 2.542180061340332 + }, + { + "auxiliary_loss_clip": 0.01119807, + "auxiliary_loss_mlp": 0.01024379, + "balance_loss_clip": 1.039891, + "balance_loss_mlp": 1.01651382, + "epoch": 0.7320387182107858, + "flos": 37448544574080.0, + "grad_norm": 2.0029872229162486, + "language_loss": 0.74394506, + "learning_rate": 7.069479444188149e-07, + "loss": 0.76538694, + "num_input_tokens_seen": 130848290, + "step": 6088, + "time_per_iteration": 2.6464426517486572 + }, + { + "auxiliary_loss_clip": 0.01134284, + "auxiliary_loss_mlp": 0.01024252, + "balance_loss_clip": 1.04465723, + "balance_loss_mlp": 1.01633382, + "epoch": 0.7321589611014249, + "flos": 17859023521920.0, + "grad_norm": 1.6269345214071458, + "language_loss": 0.81731057, + "learning_rate": 7.063537713286453e-07, + "loss": 0.83889592, + "num_input_tokens_seen": 130865970, + "step": 6089, + "time_per_iteration": 2.4612972736358643 + }, + { + "auxiliary_loss_clip": 0.01147427, + "auxiliary_loss_mlp": 0.010249, + "balance_loss_clip": 1.04480457, + "balance_loss_mlp": 1.017187, + "epoch": 0.7322792039920639, + "flos": 26100593539200.0, + "grad_norm": 1.850856482947034, + "language_loss": 0.81005716, + "learning_rate": 7.057597944835803e-07, + "loss": 0.83178043, + "num_input_tokens_seen": 130885245, + "step": 6090, + "time_per_iteration": 2.515717029571533 + }, + { + "auxiliary_loss_clip": 0.01134657, + "auxiliary_loss_mlp": 0.01019432, + "balance_loss_clip": 1.04347873, + "balance_loss_mlp": 1.01247621, + "epoch": 0.7323994468827031, + "flos": 25369093065600.0, + "grad_norm": 2.8811336195919663, + "language_loss": 0.74868715, + "learning_rate": 7.051660139737253e-07, + "loss": 0.77022809, + "num_input_tokens_seen": 130903465, + "step": 6091, + "time_per_iteration": 2.560029983520508 + }, + { + "auxiliary_loss_clip": 0.01154327, + "auxiliary_loss_mlp": 0.0076209, + "balance_loss_clip": 1.0484041, + "balance_loss_mlp": 1.00056696, + "epoch": 0.7325196897733421, + "flos": 26907075653760.0, + "grad_norm": 2.0295164314589136, + "language_loss": 0.76765519, + "learning_rate": 7.045724298891565e-07, + "loss": 0.78681934, + "num_input_tokens_seen": 130922935, + "step": 6092, + "time_per_iteration": 2.5474772453308105 + }, + { + "auxiliary_loss_clip": 0.0115501, + "auxiliary_loss_mlp": 0.01024702, + "balance_loss_clip": 1.04783678, + "balance_loss_mlp": 1.01744223, + "epoch": 0.7326399326639812, + "flos": 25775781828480.0, + "grad_norm": 1.9237240833623923, + "language_loss": 0.69025904, + "learning_rate": 7.039790423199192e-07, + "loss": 0.71205616, + "num_input_tokens_seen": 130942575, + "step": 6093, + "time_per_iteration": 2.4940567016601562 + }, + { + "auxiliary_loss_clip": 0.01146838, + "auxiliary_loss_mlp": 0.01024854, + "balance_loss_clip": 1.04705715, + "balance_loss_mlp": 1.01745701, + "epoch": 0.7327601755546204, + "flos": 21032269706880.0, + "grad_norm": 1.915263844072405, + "language_loss": 0.77648795, + "learning_rate": 7.033858513560322e-07, + "loss": 0.7982049, + "num_input_tokens_seen": 130958870, + "step": 6094, + "time_per_iteration": 3.212934732437134 + }, + { + "auxiliary_loss_clip": 0.0115765, + "auxiliary_loss_mlp": 0.01025935, + "balance_loss_clip": 1.04854679, + "balance_loss_mlp": 1.01849604, + "epoch": 0.7328804184452594, + "flos": 16289224462080.0, + "grad_norm": 2.402886420373747, + "language_loss": 0.76585376, + "learning_rate": 7.027928570874794e-07, + "loss": 0.78768957, + "num_input_tokens_seen": 130977060, + "step": 6095, + "time_per_iteration": 3.177750587463379 + }, + { + "auxiliary_loss_clip": 0.01168191, + "auxiliary_loss_mlp": 0.01026163, + "balance_loss_clip": 1.04729128, + "balance_loss_mlp": 1.01867962, + "epoch": 0.7330006613358985, + "flos": 17858233422720.0, + "grad_norm": 1.8044907253537976, + "language_loss": 0.85450494, + "learning_rate": 7.022000596042194e-07, + "loss": 0.87644845, + "num_input_tokens_seen": 130994160, + "step": 6096, + "time_per_iteration": 3.394298791885376 + }, + { + "auxiliary_loss_clip": 0.01127528, + "auxiliary_loss_mlp": 0.01024053, + "balance_loss_clip": 1.03968418, + "balance_loss_mlp": 1.01694226, + "epoch": 0.7331209042265376, + "flos": 22492074343680.0, + "grad_norm": 3.8437462115919923, + "language_loss": 0.81451058, + "learning_rate": 7.016074589961784e-07, + "loss": 0.83602643, + "num_input_tokens_seen": 131012725, + "step": 6097, + "time_per_iteration": 2.5813801288604736 + }, + { + "auxiliary_loss_clip": 0.01136422, + "auxiliary_loss_mlp": 0.01023686, + "balance_loss_clip": 1.04373217, + "balance_loss_mlp": 1.01581824, + "epoch": 0.7332411471171767, + "flos": 33072757937280.0, + "grad_norm": 1.7556733808265998, + "language_loss": 0.67164934, + "learning_rate": 7.01015055353253e-07, + "loss": 0.69325042, + "num_input_tokens_seen": 131035150, + "step": 6098, + "time_per_iteration": 2.5884666442871094 + }, + { + "auxiliary_loss_clip": 0.01102627, + "auxiliary_loss_mlp": 0.01025425, + "balance_loss_clip": 1.04033411, + "balance_loss_mlp": 1.01732504, + "epoch": 0.7333613900078157, + "flos": 22743017735040.0, + "grad_norm": 2.8201897272949785, + "language_loss": 0.77954054, + "learning_rate": 7.004228487653123e-07, + "loss": 0.80082101, + "num_input_tokens_seen": 131055955, + "step": 6099, + "time_per_iteration": 3.350940227508545 + }, + { + "auxiliary_loss_clip": 0.01122176, + "auxiliary_loss_mlp": 0.01023687, + "balance_loss_clip": 1.03837228, + "balance_loss_mlp": 1.01624846, + "epoch": 0.7334816328984549, + "flos": 22346133384960.0, + "grad_norm": 4.264657147815541, + "language_loss": 0.7798388, + "learning_rate": 6.998308393221906e-07, + "loss": 0.80129743, + "num_input_tokens_seen": 131074360, + "step": 6100, + "time_per_iteration": 2.5426535606384277 + }, + { + "auxiliary_loss_clip": 0.01129583, + "auxiliary_loss_mlp": 0.01024758, + "balance_loss_clip": 1.04338384, + "balance_loss_mlp": 1.01736689, + "epoch": 0.733601875789094, + "flos": 20736149984640.0, + "grad_norm": 2.2775949682986285, + "language_loss": 0.71157598, + "learning_rate": 6.992390271136977e-07, + "loss": 0.73311949, + "num_input_tokens_seen": 131090070, + "step": 6101, + "time_per_iteration": 2.5066845417022705 + }, + { + "auxiliary_loss_clip": 0.01149563, + "auxiliary_loss_mlp": 0.01024153, + "balance_loss_clip": 1.04401708, + "balance_loss_mlp": 1.01696813, + "epoch": 0.733722118679733, + "flos": 22564362464640.0, + "grad_norm": 2.4047495042647964, + "language_loss": 0.8540926, + "learning_rate": 6.986474122296094e-07, + "loss": 0.87582982, + "num_input_tokens_seen": 131109185, + "step": 6102, + "time_per_iteration": 2.4644622802734375 + }, + { + "auxiliary_loss_clip": 0.01174135, + "auxiliary_loss_mlp": 0.01022647, + "balance_loss_clip": 1.04997599, + "balance_loss_mlp": 1.01530325, + "epoch": 0.7338423615703722, + "flos": 20084192179200.0, + "grad_norm": 3.776987985775528, + "language_loss": 0.72193015, + "learning_rate": 6.980559947596751e-07, + "loss": 0.74389803, + "num_input_tokens_seen": 131127725, + "step": 6103, + "time_per_iteration": 2.4256820678710938 + }, + { + "auxiliary_loss_clip": 0.01111285, + "auxiliary_loss_mlp": 0.01022919, + "balance_loss_clip": 1.04070354, + "balance_loss_mlp": 1.01515269, + "epoch": 0.7339626044610112, + "flos": 21687675217920.0, + "grad_norm": 1.950574404285697, + "language_loss": 0.7574085, + "learning_rate": 6.974647747936109e-07, + "loss": 0.77875054, + "num_input_tokens_seen": 131146110, + "step": 6104, + "time_per_iteration": 2.556182384490967 + }, + { + "auxiliary_loss_clip": 0.0117081, + "auxiliary_loss_mlp": 0.00762223, + "balance_loss_clip": 1.04876733, + "balance_loss_mlp": 1.00056469, + "epoch": 0.7340828473516503, + "flos": 15268248282240.0, + "grad_norm": 2.1604940253596685, + "language_loss": 0.8232094, + "learning_rate": 6.968737524211039e-07, + "loss": 0.84253967, + "num_input_tokens_seen": 131162920, + "step": 6105, + "time_per_iteration": 2.4035167694091797 + }, + { + "auxiliary_loss_clip": 0.01154209, + "auxiliary_loss_mlp": 0.01025518, + "balance_loss_clip": 1.04713559, + "balance_loss_mlp": 1.01772809, + "epoch": 0.7342030902422895, + "flos": 22930112701440.0, + "grad_norm": 2.1526484942434885, + "language_loss": 0.80079627, + "learning_rate": 6.962829277318132e-07, + "loss": 0.82259357, + "num_input_tokens_seen": 131182515, + "step": 6106, + "time_per_iteration": 2.4672634601593018 + }, + { + "auxiliary_loss_clip": 0.01159087, + "auxiliary_loss_mlp": 0.01025028, + "balance_loss_clip": 1.04984975, + "balance_loss_mlp": 1.01770258, + "epoch": 0.7343233331329285, + "flos": 25847890381440.0, + "grad_norm": 1.9867410469401268, + "language_loss": 0.83674777, + "learning_rate": 6.956923008153652e-07, + "loss": 0.85858893, + "num_input_tokens_seen": 131202280, + "step": 6107, + "time_per_iteration": 2.4958925247192383 + }, + { + "auxiliary_loss_clip": 0.01153783, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.0434289, + "balance_loss_mlp": 1.02413332, + "epoch": 0.7344435760235676, + "flos": 18478985287680.0, + "grad_norm": 2.4548406127157016, + "language_loss": 0.84271336, + "learning_rate": 6.951018717613593e-07, + "loss": 0.86455989, + "num_input_tokens_seen": 131221295, + "step": 6108, + "time_per_iteration": 2.442636251449585 + }, + { + "auxiliary_loss_clip": 0.01154556, + "auxiliary_loss_mlp": 0.01024387, + "balance_loss_clip": 1.04756367, + "balance_loss_mlp": 1.01700509, + "epoch": 0.7345638189142067, + "flos": 17640040256640.0, + "grad_norm": 1.906234479885713, + "language_loss": 0.78398871, + "learning_rate": 6.945116406593614e-07, + "loss": 0.80577821, + "num_input_tokens_seen": 131240150, + "step": 6109, + "time_per_iteration": 2.4331483840942383 + }, + { + "auxiliary_loss_clip": 0.01117326, + "auxiliary_loss_mlp": 0.01025254, + "balance_loss_clip": 1.04455709, + "balance_loss_mlp": 1.01771414, + "epoch": 0.7346840618048458, + "flos": 20260225756800.0, + "grad_norm": 1.9858885437742808, + "language_loss": 0.74227154, + "learning_rate": 6.939216075989089e-07, + "loss": 0.76369739, + "num_input_tokens_seen": 131258080, + "step": 6110, + "time_per_iteration": 2.5617129802703857 + }, + { + "auxiliary_loss_clip": 0.01138659, + "auxiliary_loss_mlp": 0.01020207, + "balance_loss_clip": 1.04340529, + "balance_loss_mlp": 1.01280451, + "epoch": 0.7348043046954849, + "flos": 29023183641600.0, + "grad_norm": 3.269223346426037, + "language_loss": 0.65726566, + "learning_rate": 6.933317726695109e-07, + "loss": 0.67885435, + "num_input_tokens_seen": 131279310, + "step": 6111, + "time_per_iteration": 2.541555643081665 + }, + { + "auxiliary_loss_clip": 0.01124797, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.04505706, + "balance_loss_mlp": 1.01758862, + "epoch": 0.734924547586124, + "flos": 17931203902080.0, + "grad_norm": 5.104750494096401, + "language_loss": 0.80026805, + "learning_rate": 6.92742135960644e-07, + "loss": 0.82176709, + "num_input_tokens_seen": 131297010, + "step": 6112, + "time_per_iteration": 2.488445281982422 + }, + { + "auxiliary_loss_clip": 0.01057814, + "auxiliary_loss_mlp": 0.0100185, + "balance_loss_clip": 1.01041651, + "balance_loss_mlp": 1.00084877, + "epoch": 0.7350447904767631, + "flos": 63588319850880.0, + "grad_norm": 0.8209691767143322, + "language_loss": 0.55674237, + "learning_rate": 6.921526975617556e-07, + "loss": 0.57733905, + "num_input_tokens_seen": 131356470, + "step": 6113, + "time_per_iteration": 3.077716588973999 + }, + { + "auxiliary_loss_clip": 0.01143634, + "auxiliary_loss_mlp": 0.01027845, + "balance_loss_clip": 1.04349709, + "balance_loss_mlp": 1.02019775, + "epoch": 0.7351650333674021, + "flos": 21580015178880.0, + "grad_norm": 1.9852235336411972, + "language_loss": 0.75854915, + "learning_rate": 6.915634575622631e-07, + "loss": 0.7802639, + "num_input_tokens_seen": 131374985, + "step": 6114, + "time_per_iteration": 2.521780014038086 + }, + { + "auxiliary_loss_clip": 0.01168897, + "auxiliary_loss_mlp": 0.01020608, + "balance_loss_clip": 1.04834712, + "balance_loss_mlp": 1.01335382, + "epoch": 0.7352852762580413, + "flos": 18186349184640.0, + "grad_norm": 2.44207499547173, + "language_loss": 0.70741838, + "learning_rate": 6.909744160515532e-07, + "loss": 0.72931337, + "num_input_tokens_seen": 131393125, + "step": 6115, + "time_per_iteration": 2.4022932052612305 + }, + { + "auxiliary_loss_clip": 0.01140402, + "auxiliary_loss_mlp": 0.01025035, + "balance_loss_clip": 1.04406834, + "balance_loss_mlp": 1.01747727, + "epoch": 0.7354055191486804, + "flos": 38910073063680.0, + "grad_norm": 1.9335818619432754, + "language_loss": 0.69433653, + "learning_rate": 6.903855731189849e-07, + "loss": 0.7159909, + "num_input_tokens_seen": 131415760, + "step": 6116, + "time_per_iteration": 2.6252264976501465 + }, + { + "auxiliary_loss_clip": 0.01149574, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.04705381, + "balance_loss_mlp": 1.02197087, + "epoch": 0.7355257620393194, + "flos": 16289978647680.0, + "grad_norm": 2.19244461687445, + "language_loss": 0.81773257, + "learning_rate": 6.897969288538825e-07, + "loss": 0.83952808, + "num_input_tokens_seen": 131433705, + "step": 6117, + "time_per_iteration": 2.4655346870422363 + }, + { + "auxiliary_loss_clip": 0.01139677, + "auxiliary_loss_mlp": 0.01025574, + "balance_loss_clip": 1.04489553, + "balance_loss_mlp": 1.01838851, + "epoch": 0.7356460049299585, + "flos": 18114240631680.0, + "grad_norm": 2.2929466029418277, + "language_loss": 0.81299001, + "learning_rate": 6.892084833455452e-07, + "loss": 0.83464253, + "num_input_tokens_seen": 131453275, + "step": 6118, + "time_per_iteration": 2.4718375205993652 + }, + { + "auxiliary_loss_clip": 0.01154145, + "auxiliary_loss_mlp": 0.01023288, + "balance_loss_clip": 1.04771519, + "balance_loss_mlp": 1.01634693, + "epoch": 0.7357662478205976, + "flos": 21325193118720.0, + "grad_norm": 2.847024850060923, + "language_loss": 0.8396455, + "learning_rate": 6.886202366832384e-07, + "loss": 0.8614198, + "num_input_tokens_seen": 131474960, + "step": 6119, + "time_per_iteration": 2.474106550216675 + }, + { + "auxiliary_loss_clip": 0.01111545, + "auxiliary_loss_mlp": 0.01024003, + "balance_loss_clip": 1.0428915, + "balance_loss_mlp": 1.01637936, + "epoch": 0.7358864907112367, + "flos": 14246841139200.0, + "grad_norm": 1.9446174925129436, + "language_loss": 0.73680741, + "learning_rate": 6.880321889561987e-07, + "loss": 0.75816292, + "num_input_tokens_seen": 131492935, + "step": 6120, + "time_per_iteration": 3.4979567527770996 + }, + { + "auxiliary_loss_clip": 0.01122209, + "auxiliary_loss_mlp": 0.01027701, + "balance_loss_clip": 1.04327464, + "balance_loss_mlp": 1.01930857, + "epoch": 0.7360067336018757, + "flos": 22309684058880.0, + "grad_norm": 2.018065063965961, + "language_loss": 0.65527892, + "learning_rate": 6.874443402536338e-07, + "loss": 0.67677808, + "num_input_tokens_seen": 131512025, + "step": 6121, + "time_per_iteration": 2.5371713638305664 + }, + { + "auxiliary_loss_clip": 0.01144622, + "auxiliary_loss_mlp": 0.01025812, + "balance_loss_clip": 1.04610276, + "balance_loss_mlp": 1.01800406, + "epoch": 0.7361269764925149, + "flos": 25554607833600.0, + "grad_norm": 1.680400795710964, + "language_loss": 0.80258226, + "learning_rate": 6.868566906647177e-07, + "loss": 0.82428658, + "num_input_tokens_seen": 131532975, + "step": 6122, + "time_per_iteration": 4.055913925170898 + }, + { + "auxiliary_loss_clip": 0.01155593, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.04564214, + "balance_loss_mlp": 1.02204525, + "epoch": 0.736247219383154, + "flos": 20376505059840.0, + "grad_norm": 1.7294756862750225, + "language_loss": 0.83426917, + "learning_rate": 6.862692402785984e-07, + "loss": 0.85612392, + "num_input_tokens_seen": 131553225, + "step": 6123, + "time_per_iteration": 2.4753684997558594 + }, + { + "auxiliary_loss_clip": 0.01035688, + "auxiliary_loss_mlp": 0.01001219, + "balance_loss_clip": 1.0153358, + "balance_loss_mlp": 0.99994355, + "epoch": 0.736367462273793, + "flos": 70339525735680.0, + "grad_norm": 0.6829145732472923, + "language_loss": 0.49586594, + "learning_rate": 6.856819891843899e-07, + "loss": 0.51623511, + "num_input_tokens_seen": 131617930, + "step": 6124, + "time_per_iteration": 3.203608274459839 + }, + { + "auxiliary_loss_clip": 0.01099919, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.04128659, + "balance_loss_mlp": 1.02170372, + "epoch": 0.7364877051644322, + "flos": 22412711243520.0, + "grad_norm": 2.2395269228949055, + "language_loss": 0.71687025, + "learning_rate": 6.8509493747118e-07, + "loss": 0.7381624, + "num_input_tokens_seen": 131636740, + "step": 6125, + "time_per_iteration": 2.582695722579956 + }, + { + "auxiliary_loss_clip": 0.01170547, + "auxiliary_loss_mlp": 0.01023923, + "balance_loss_clip": 1.04856372, + "balance_loss_mlp": 1.01661003, + "epoch": 0.7366079480550712, + "flos": 12130266274560.0, + "grad_norm": 2.0684329640912806, + "language_loss": 0.88161379, + "learning_rate": 6.845080852280221e-07, + "loss": 0.90355849, + "num_input_tokens_seen": 131653810, + "step": 6126, + "time_per_iteration": 3.2053749561309814 + }, + { + "auxiliary_loss_clip": 0.01127159, + "auxiliary_loss_mlp": 0.01024124, + "balance_loss_clip": 1.04234219, + "balance_loss_mlp": 1.01735854, + "epoch": 0.7367281909457103, + "flos": 15049336844160.0, + "grad_norm": 2.9274723003318996, + "language_loss": 0.74022257, + "learning_rate": 6.839214325439409e-07, + "loss": 0.76173544, + "num_input_tokens_seen": 131671505, + "step": 6127, + "time_per_iteration": 2.5065178871154785 + }, + { + "auxiliary_loss_clip": 0.01135295, + "auxiliary_loss_mlp": 0.01023516, + "balance_loss_clip": 1.04544377, + "balance_loss_mlp": 1.01653945, + "epoch": 0.7368484338363495, + "flos": 23510752053120.0, + "grad_norm": 1.5833844031590991, + "language_loss": 0.71452832, + "learning_rate": 6.833349795079327e-07, + "loss": 0.73611641, + "num_input_tokens_seen": 131690615, + "step": 6128, + "time_per_iteration": 2.5005698204040527 + }, + { + "auxiliary_loss_clip": 0.01128848, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.04544246, + "balance_loss_mlp": 1.02203453, + "epoch": 0.7369686767269885, + "flos": 27417833095680.0, + "grad_norm": 2.4644478318058036, + "language_loss": 0.68455291, + "learning_rate": 6.827487262089613e-07, + "loss": 0.70613694, + "num_input_tokens_seen": 131711120, + "step": 6129, + "time_per_iteration": 2.574481964111328 + }, + { + "auxiliary_loss_clip": 0.01038492, + "auxiliary_loss_mlp": 0.00999746, + "balance_loss_clip": 1.00750589, + "balance_loss_mlp": 0.99868459, + "epoch": 0.7370889196176276, + "flos": 70293343824000.0, + "grad_norm": 0.8786456068386154, + "language_loss": 0.56786805, + "learning_rate": 6.821626727359606e-07, + "loss": 0.5882504, + "num_input_tokens_seen": 131776680, + "step": 6130, + "time_per_iteration": 3.1352479457855225 + }, + { + "auxiliary_loss_clip": 0.0114321, + "auxiliary_loss_mlp": 0.01024789, + "balance_loss_clip": 1.04886258, + "balance_loss_mlp": 1.01701081, + "epoch": 0.7372091625082667, + "flos": 18040839189120.0, + "grad_norm": 2.1332472659829316, + "language_loss": 0.77263439, + "learning_rate": 6.815768191778348e-07, + "loss": 0.79431438, + "num_input_tokens_seen": 131794760, + "step": 6131, + "time_per_iteration": 2.4890122413635254 + }, + { + "auxiliary_loss_clip": 0.01152368, + "auxiliary_loss_mlp": 0.01026288, + "balance_loss_clip": 1.04512298, + "balance_loss_mlp": 1.01856971, + "epoch": 0.7373294053989058, + "flos": 33726331854720.0, + "grad_norm": 2.2594910566313393, + "language_loss": 0.72657299, + "learning_rate": 6.809911656234569e-07, + "loss": 0.74835956, + "num_input_tokens_seen": 131816735, + "step": 6132, + "time_per_iteration": 2.5623509883880615 + }, + { + "auxiliary_loss_clip": 0.01129031, + "auxiliary_loss_mlp": 0.01024956, + "balance_loss_clip": 1.04155946, + "balance_loss_mlp": 1.01756263, + "epoch": 0.7374496482895448, + "flos": 21506326427520.0, + "grad_norm": 2.204674928368816, + "language_loss": 0.78249514, + "learning_rate": 6.804057121616707e-07, + "loss": 0.80403501, + "num_input_tokens_seen": 131834940, + "step": 6133, + "time_per_iteration": 2.5407285690307617 + }, + { + "auxiliary_loss_clip": 0.01157264, + "auxiliary_loss_mlp": 0.01026557, + "balance_loss_clip": 1.04695404, + "balance_loss_mlp": 1.01862371, + "epoch": 0.737569891180184, + "flos": 24936908624640.0, + "grad_norm": 1.9053268228305162, + "language_loss": 0.7224648, + "learning_rate": 6.798204588812888e-07, + "loss": 0.74430305, + "num_input_tokens_seen": 131854355, + "step": 6134, + "time_per_iteration": 2.5241572856903076 + }, + { + "auxiliary_loss_clip": 0.01087474, + "auxiliary_loss_mlp": 0.00762141, + "balance_loss_clip": 1.03760028, + "balance_loss_mlp": 1.00054502, + "epoch": 0.7376901340708231, + "flos": 20664544222080.0, + "grad_norm": 1.6515537547415144, + "language_loss": 0.75665021, + "learning_rate": 6.792354058710937e-07, + "loss": 0.77514637, + "num_input_tokens_seen": 131871825, + "step": 6135, + "time_per_iteration": 2.5902364253997803 + }, + { + "auxiliary_loss_clip": 0.01162933, + "auxiliary_loss_mlp": 0.01020978, + "balance_loss_clip": 1.0453366, + "balance_loss_mlp": 1.01353335, + "epoch": 0.7378103769614621, + "flos": 23805794367360.0, + "grad_norm": 1.933345079112897, + "language_loss": 0.65068781, + "learning_rate": 6.786505532198374e-07, + "loss": 0.67252684, + "num_input_tokens_seen": 131890770, + "step": 6136, + "time_per_iteration": 2.4423885345458984 + }, + { + "auxiliary_loss_clip": 0.01170211, + "auxiliary_loss_mlp": 0.01025305, + "balance_loss_clip": 1.048002, + "balance_loss_mlp": 1.01736271, + "epoch": 0.7379306198521013, + "flos": 22237216369920.0, + "grad_norm": 1.7783174476895238, + "language_loss": 0.85481918, + "learning_rate": 6.780659010162411e-07, + "loss": 0.87677443, + "num_input_tokens_seen": 131909720, + "step": 6137, + "time_per_iteration": 2.4227912425994873 + }, + { + "auxiliary_loss_clip": 0.01132779, + "auxiliary_loss_mlp": 0.01021341, + "balance_loss_clip": 1.04573166, + "balance_loss_mlp": 1.0143013, + "epoch": 0.7380508627427403, + "flos": 14903108576640.0, + "grad_norm": 1.5879274089793713, + "language_loss": 0.83143383, + "learning_rate": 6.774814493489975e-07, + "loss": 0.85297501, + "num_input_tokens_seen": 131927395, + "step": 6138, + "time_per_iteration": 2.492523193359375 + }, + { + "auxiliary_loss_clip": 0.01152536, + "auxiliary_loss_mlp": 0.01023226, + "balance_loss_clip": 1.04599059, + "balance_loss_mlp": 1.01594543, + "epoch": 0.7381711056333794, + "flos": 21685843624320.0, + "grad_norm": 1.8179663516287445, + "language_loss": 0.66149247, + "learning_rate": 6.768971983067655e-07, + "loss": 0.68325007, + "num_input_tokens_seen": 131947725, + "step": 6139, + "time_per_iteration": 2.4578073024749756 + }, + { + "auxiliary_loss_clip": 0.010658, + "auxiliary_loss_mlp": 0.01001409, + "balance_loss_clip": 1.00835848, + "balance_loss_mlp": 1.00045514, + "epoch": 0.7382913485240186, + "flos": 52404263596800.0, + "grad_norm": 1.0042218826092746, + "language_loss": 0.67784894, + "learning_rate": 6.763131479781772e-07, + "loss": 0.69852108, + "num_input_tokens_seen": 131997485, + "step": 6140, + "time_per_iteration": 2.8277299404144287 + }, + { + "auxiliary_loss_clip": 0.01133098, + "auxiliary_loss_mlp": 0.01022274, + "balance_loss_clip": 1.04598165, + "balance_loss_mlp": 1.01512456, + "epoch": 0.7384115914146576, + "flos": 21798818876160.0, + "grad_norm": 2.1180801772119797, + "language_loss": 0.76114231, + "learning_rate": 6.757292984518316e-07, + "loss": 0.78269601, + "num_input_tokens_seen": 132016885, + "step": 6141, + "time_per_iteration": 2.5010931491851807 + }, + { + "auxiliary_loss_clip": 0.01056008, + "auxiliary_loss_mlp": 0.01001584, + "balance_loss_clip": 1.00828636, + "balance_loss_mlp": 1.00061882, + "epoch": 0.7385318343052967, + "flos": 61494331662720.0, + "grad_norm": 0.7382473631681932, + "language_loss": 0.56395638, + "learning_rate": 6.751456498162981e-07, + "loss": 0.58453226, + "num_input_tokens_seen": 132075920, + "step": 6142, + "time_per_iteration": 2.9384026527404785 + }, + { + "auxiliary_loss_clip": 0.01152697, + "auxiliary_loss_mlp": 0.01021312, + "balance_loss_clip": 1.04243433, + "balance_loss_mlp": 1.01505661, + "epoch": 0.7386520771959358, + "flos": 17013757697280.0, + "grad_norm": 1.839704688019097, + "language_loss": 0.8516987, + "learning_rate": 6.745622021601174e-07, + "loss": 0.87343878, + "num_input_tokens_seen": 132092945, + "step": 6143, + "time_per_iteration": 2.4315474033355713 + }, + { + "auxiliary_loss_clip": 0.01129904, + "auxiliary_loss_mlp": 0.01021431, + "balance_loss_clip": 1.04338562, + "balance_loss_mlp": 1.01407838, + "epoch": 0.7387723200865749, + "flos": 18770759464320.0, + "grad_norm": 3.000232087191948, + "language_loss": 0.69537294, + "learning_rate": 6.739789555717954e-07, + "loss": 0.71688628, + "num_input_tokens_seen": 132109920, + "step": 6144, + "time_per_iteration": 2.49597430229187 + }, + { + "auxiliary_loss_clip": 0.01167369, + "auxiliary_loss_mlp": 0.01025302, + "balance_loss_clip": 1.04637527, + "balance_loss_mlp": 1.01812911, + "epoch": 0.738892562977214, + "flos": 22525542840960.0, + "grad_norm": 1.9907109709669184, + "language_loss": 0.77114654, + "learning_rate": 6.733959101398124e-07, + "loss": 0.79307324, + "num_input_tokens_seen": 132128050, + "step": 6145, + "time_per_iteration": 2.437748670578003 + }, + { + "auxiliary_loss_clip": 0.01139522, + "auxiliary_loss_mlp": 0.0102371, + "balance_loss_clip": 1.04355454, + "balance_loss_mlp": 1.01572347, + "epoch": 0.7390128058678531, + "flos": 21501478091520.0, + "grad_norm": 2.3213561803509495, + "language_loss": 0.81424582, + "learning_rate": 6.728130659526143e-07, + "loss": 0.83587813, + "num_input_tokens_seen": 132145860, + "step": 6146, + "time_per_iteration": 2.473242998123169 + }, + { + "auxiliary_loss_clip": 0.01143764, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.04663885, + "balance_loss_mlp": 1.02519894, + "epoch": 0.7391330487584922, + "flos": 25776176878080.0, + "grad_norm": 2.1610682966228505, + "language_loss": 0.70897186, + "learning_rate": 6.7223042309862e-07, + "loss": 0.73073769, + "num_input_tokens_seen": 132166060, + "step": 6147, + "time_per_iteration": 3.3434648513793945 + }, + { + "auxiliary_loss_clip": 0.01151852, + "auxiliary_loss_mlp": 0.0102592, + "balance_loss_clip": 1.04427981, + "balance_loss_mlp": 1.0190177, + "epoch": 0.7392532916491312, + "flos": 28366736636160.0, + "grad_norm": 3.205660947885496, + "language_loss": 0.73630953, + "learning_rate": 6.716479816662144e-07, + "loss": 0.75808728, + "num_input_tokens_seen": 132187790, + "step": 6148, + "time_per_iteration": 3.266505718231201 + }, + { + "auxiliary_loss_clip": 0.01143799, + "auxiliary_loss_mlp": 0.01024598, + "balance_loss_clip": 1.04363811, + "balance_loss_mlp": 1.01732326, + "epoch": 0.7393735345397703, + "flos": 23585877348480.0, + "grad_norm": 2.0468657672510533, + "language_loss": 0.72983384, + "learning_rate": 6.710657417437531e-07, + "loss": 0.75151789, + "num_input_tokens_seen": 132207495, + "step": 6149, + "time_per_iteration": 3.270458221435547 + }, + { + "auxiliary_loss_clip": 0.01139521, + "auxiliary_loss_mlp": 0.01024875, + "balance_loss_clip": 1.04417026, + "balance_loss_mlp": 1.01771677, + "epoch": 0.7394937774304094, + "flos": 19974772373760.0, + "grad_norm": 2.333271098965654, + "language_loss": 0.79877734, + "learning_rate": 6.704837034195628e-07, + "loss": 0.82042128, + "num_input_tokens_seen": 132225960, + "step": 6150, + "time_per_iteration": 2.47348690032959 + }, + { + "auxiliary_loss_clip": 0.01148455, + "auxiliary_loss_mlp": 0.01031764, + "balance_loss_clip": 1.04504633, + "balance_loss_mlp": 1.02415264, + "epoch": 0.7396140203210485, + "flos": 23478037741440.0, + "grad_norm": 1.9293822146601145, + "language_loss": 0.84861565, + "learning_rate": 6.699018667819376e-07, + "loss": 0.87041783, + "num_input_tokens_seen": 132245360, + "step": 6151, + "time_per_iteration": 2.4787039756774902 + }, + { + "auxiliary_loss_clip": 0.01151514, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.04373622, + "balance_loss_mlp": 1.02336645, + "epoch": 0.7397342632116876, + "flos": 25555433846400.0, + "grad_norm": 1.6391067542314048, + "language_loss": 0.72945392, + "learning_rate": 6.693202319191415e-07, + "loss": 0.75127971, + "num_input_tokens_seen": 132267095, + "step": 6152, + "time_per_iteration": 2.4993715286254883 + }, + { + "auxiliary_loss_clip": 0.01169606, + "auxiliary_loss_mlp": 0.01026655, + "balance_loss_clip": 1.0509088, + "balance_loss_mlp": 1.01889408, + "epoch": 0.7398545061023267, + "flos": 24755021130240.0, + "grad_norm": 1.802595833897674, + "language_loss": 0.74868047, + "learning_rate": 6.687387989194084e-07, + "loss": 0.77064306, + "num_input_tokens_seen": 132286610, + "step": 6153, + "time_per_iteration": 3.5179812908172607 + }, + { + "auxiliary_loss_clip": 0.01134817, + "auxiliary_loss_mlp": 0.01026155, + "balance_loss_clip": 1.04515755, + "balance_loss_mlp": 1.01887083, + "epoch": 0.7399747489929658, + "flos": 16508602776960.0, + "grad_norm": 1.9937119427730883, + "language_loss": 0.79424316, + "learning_rate": 6.681575678709404e-07, + "loss": 0.81585282, + "num_input_tokens_seen": 132305300, + "step": 6154, + "time_per_iteration": 2.4803059101104736 + }, + { + "auxiliary_loss_clip": 0.01154506, + "auxiliary_loss_mlp": 0.01023536, + "balance_loss_clip": 1.0465126, + "balance_loss_mlp": 1.01630008, + "epoch": 0.7400949918836048, + "flos": 24097065753600.0, + "grad_norm": 1.847641426630635, + "language_loss": 0.70577103, + "learning_rate": 6.67576538861911e-07, + "loss": 0.72755146, + "num_input_tokens_seen": 132323875, + "step": 6155, + "time_per_iteration": 2.46907114982605 + }, + { + "auxiliary_loss_clip": 0.01135802, + "auxiliary_loss_mlp": 0.01023481, + "balance_loss_clip": 1.04259849, + "balance_loss_mlp": 1.01654947, + "epoch": 0.740215234774244, + "flos": 21802517976960.0, + "grad_norm": 1.7049573177743822, + "language_loss": 0.81836665, + "learning_rate": 6.669957119804612e-07, + "loss": 0.8399595, + "num_input_tokens_seen": 132345510, + "step": 6156, + "time_per_iteration": 2.5143027305603027 + }, + { + "auxiliary_loss_clip": 0.01148027, + "auxiliary_loss_mlp": 0.01024259, + "balance_loss_clip": 1.04581952, + "balance_loss_mlp": 1.01732945, + "epoch": 0.7403354776648831, + "flos": 18733196816640.0, + "grad_norm": 2.736770335748318, + "language_loss": 0.72509134, + "learning_rate": 6.66415087314702e-07, + "loss": 0.74681419, + "num_input_tokens_seen": 132360465, + "step": 6157, + "time_per_iteration": 2.446489095687866 + }, + { + "auxiliary_loss_clip": 0.01141089, + "auxiliary_loss_mlp": 0.01018639, + "balance_loss_clip": 1.04302251, + "balance_loss_mlp": 1.01115847, + "epoch": 0.7404557205555221, + "flos": 16909581277440.0, + "grad_norm": 2.391863191946483, + "language_loss": 0.72807014, + "learning_rate": 6.65834664952714e-07, + "loss": 0.74966741, + "num_input_tokens_seen": 132377915, + "step": 6158, + "time_per_iteration": 2.4525160789489746 + }, + { + "auxiliary_loss_clip": 0.01127567, + "auxiliary_loss_mlp": 0.01021777, + "balance_loss_clip": 1.0429492, + "balance_loss_mlp": 1.0148747, + "epoch": 0.7405759634461613, + "flos": 21214408596480.0, + "grad_norm": 1.9278438679748917, + "language_loss": 0.75980258, + "learning_rate": 6.652544449825457e-07, + "loss": 0.78129596, + "num_input_tokens_seen": 132398170, + "step": 6159, + "time_per_iteration": 2.554436206817627 + }, + { + "auxiliary_loss_clip": 0.01149199, + "auxiliary_loss_mlp": 0.01027769, + "balance_loss_clip": 1.04466319, + "balance_loss_mlp": 1.02016973, + "epoch": 0.7406962063368003, + "flos": 20480106862080.0, + "grad_norm": 1.6612458762805058, + "language_loss": 0.76459718, + "learning_rate": 6.646744274922182e-07, + "loss": 0.78636682, + "num_input_tokens_seen": 132416615, + "step": 6160, + "time_per_iteration": 2.4687955379486084 + }, + { + "auxiliary_loss_clip": 0.01141959, + "auxiliary_loss_mlp": 0.01022626, + "balance_loss_clip": 1.04367042, + "balance_loss_mlp": 1.01516342, + "epoch": 0.7408164492274394, + "flos": 19791915212160.0, + "grad_norm": 2.6067359055718744, + "language_loss": 0.74546814, + "learning_rate": 6.640946125697171e-07, + "loss": 0.76711392, + "num_input_tokens_seen": 132434145, + "step": 6161, + "time_per_iteration": 2.485675811767578 + }, + { + "auxiliary_loss_clip": 0.01154563, + "auxiliary_loss_mlp": 0.01020924, + "balance_loss_clip": 1.04449511, + "balance_loss_mlp": 1.01328826, + "epoch": 0.7409366921180786, + "flos": 29204855654400.0, + "grad_norm": 1.9105687396778688, + "language_loss": 0.75377035, + "learning_rate": 6.635150003030017e-07, + "loss": 0.77552521, + "num_input_tokens_seen": 132452670, + "step": 6162, + "time_per_iteration": 2.504122734069824 + }, + { + "auxiliary_loss_clip": 0.01111889, + "auxiliary_loss_mlp": 0.01022662, + "balance_loss_clip": 1.03901291, + "balance_loss_mlp": 1.015697, + "epoch": 0.7410569350087176, + "flos": 22930004960640.0, + "grad_norm": 2.4379440471357907, + "language_loss": 0.85958397, + "learning_rate": 6.629355907799981e-07, + "loss": 0.88092947, + "num_input_tokens_seen": 132472475, + "step": 6163, + "time_per_iteration": 2.5654373168945312 + }, + { + "auxiliary_loss_clip": 0.01154419, + "auxiliary_loss_mlp": 0.01025916, + "balance_loss_clip": 1.04366899, + "balance_loss_mlp": 1.01865625, + "epoch": 0.7411771778993567, + "flos": 30440397726720.0, + "grad_norm": 1.640332982890047, + "language_loss": 0.69038928, + "learning_rate": 6.623563840886015e-07, + "loss": 0.71219265, + "num_input_tokens_seen": 132493400, + "step": 6164, + "time_per_iteration": 2.523892879486084 + }, + { + "auxiliary_loss_clip": 0.01149368, + "auxiliary_loss_mlp": 0.01021442, + "balance_loss_clip": 1.04358935, + "balance_loss_mlp": 1.01433718, + "epoch": 0.7412974207899958, + "flos": 20522050968960.0, + "grad_norm": 1.6744621243300035, + "language_loss": 0.69789636, + "learning_rate": 6.617773803166795e-07, + "loss": 0.71960449, + "num_input_tokens_seen": 132511725, + "step": 6165, + "time_per_iteration": 2.4543373584747314 + }, + { + "auxiliary_loss_clip": 0.01145468, + "auxiliary_loss_mlp": 0.00762218, + "balance_loss_clip": 1.04514861, + "balance_loss_mlp": 1.00064039, + "epoch": 0.7414176636806349, + "flos": 22090700793600.0, + "grad_norm": 3.005554991307894, + "language_loss": 0.81806767, + "learning_rate": 6.611985795520634e-07, + "loss": 0.83714455, + "num_input_tokens_seen": 132530270, + "step": 6166, + "time_per_iteration": 2.5031936168670654 + }, + { + "auxiliary_loss_clip": 0.01138201, + "auxiliary_loss_mlp": 0.01025703, + "balance_loss_clip": 1.04654026, + "balance_loss_mlp": 1.01788592, + "epoch": 0.7415379065712739, + "flos": 25155245445120.0, + "grad_norm": 5.705864423860447, + "language_loss": 0.77412498, + "learning_rate": 6.606199818825588e-07, + "loss": 0.79576397, + "num_input_tokens_seen": 132550725, + "step": 6167, + "time_per_iteration": 2.5573110580444336 + }, + { + "auxiliary_loss_clip": 0.01143748, + "auxiliary_loss_mlp": 0.01021179, + "balance_loss_clip": 1.0420084, + "balance_loss_mlp": 1.01394343, + "epoch": 0.7416581494619131, + "flos": 16871731320960.0, + "grad_norm": 1.9521958793738012, + "language_loss": 0.81499684, + "learning_rate": 6.600415873959377e-07, + "loss": 0.83664614, + "num_input_tokens_seen": 132568600, + "step": 6168, + "time_per_iteration": 2.480156421661377 + }, + { + "auxiliary_loss_clip": 0.01094918, + "auxiliary_loss_mlp": 0.00761575, + "balance_loss_clip": 1.03722286, + "balance_loss_mlp": 1.00054955, + "epoch": 0.7417783923525522, + "flos": 28438881102720.0, + "grad_norm": 2.0555182127818483, + "language_loss": 0.64817655, + "learning_rate": 6.594633961799437e-07, + "loss": 0.66674149, + "num_input_tokens_seen": 132587640, + "step": 6169, + "time_per_iteration": 2.7069382667541504 + }, + { + "auxiliary_loss_clip": 0.01134823, + "auxiliary_loss_mlp": 0.01023428, + "balance_loss_clip": 1.04416728, + "balance_loss_mlp": 1.01615298, + "epoch": 0.7418986352431912, + "flos": 20084299920000.0, + "grad_norm": 2.032400459972932, + "language_loss": 0.81928301, + "learning_rate": 6.588854083222857e-07, + "loss": 0.84086555, + "num_input_tokens_seen": 132607075, + "step": 6170, + "time_per_iteration": 2.5588767528533936 + }, + { + "auxiliary_loss_clip": 0.01144402, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.04569972, + "balance_loss_mlp": 1.01946354, + "epoch": 0.7420188781338304, + "flos": 18259571059200.0, + "grad_norm": 2.267618535604961, + "language_loss": 0.80575109, + "learning_rate": 6.583076239106444e-07, + "loss": 0.82747263, + "num_input_tokens_seen": 132625580, + "step": 6171, + "time_per_iteration": 2.5532052516937256 + }, + { + "auxiliary_loss_clip": 0.01145123, + "auxiliary_loss_mlp": 0.01023148, + "balance_loss_clip": 1.04493165, + "balance_loss_mlp": 1.01549459, + "epoch": 0.7421391210244694, + "flos": 13771994319360.0, + "grad_norm": 2.149377269349128, + "language_loss": 0.75271469, + "learning_rate": 6.577300430326707e-07, + "loss": 0.77439737, + "num_input_tokens_seen": 132640525, + "step": 6172, + "time_per_iteration": 2.515233278274536 + }, + { + "auxiliary_loss_clip": 0.01123951, + "auxiliary_loss_mlp": 0.01023958, + "balance_loss_clip": 1.04478502, + "balance_loss_mlp": 1.01681781, + "epoch": 0.7422593639151085, + "flos": 15961683317760.0, + "grad_norm": 2.2600127869174527, + "language_loss": 0.72212338, + "learning_rate": 6.571526657759821e-07, + "loss": 0.74360245, + "num_input_tokens_seen": 132656265, + "step": 6173, + "time_per_iteration": 3.369070529937744 + }, + { + "auxiliary_loss_clip": 0.01147561, + "auxiliary_loss_mlp": 0.01020889, + "balance_loss_clip": 1.04245508, + "balance_loss_mlp": 1.01376307, + "epoch": 0.7423796068057477, + "flos": 30114400867200.0, + "grad_norm": 18.027885747990094, + "language_loss": 0.71085083, + "learning_rate": 6.565754922281663e-07, + "loss": 0.73253524, + "num_input_tokens_seen": 132678510, + "step": 6174, + "time_per_iteration": 2.5378401279449463 + }, + { + "auxiliary_loss_clip": 0.0113979, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.04381824, + "balance_loss_mlp": 1.0203793, + "epoch": 0.7424998496963867, + "flos": 20521907314560.0, + "grad_norm": 1.8676659591167615, + "language_loss": 0.78408229, + "learning_rate": 6.559985224767801e-07, + "loss": 0.80575782, + "num_input_tokens_seen": 132696385, + "step": 6175, + "time_per_iteration": 3.271153450012207 + }, + { + "auxiliary_loss_clip": 0.01132059, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.0430243, + "balance_loss_mlp": 1.02165651, + "epoch": 0.7426200925870258, + "flos": 21871573873920.0, + "grad_norm": 2.8012776848475847, + "language_loss": 0.75314653, + "learning_rate": 6.55421756609349e-07, + "loss": 0.77475893, + "num_input_tokens_seen": 132714640, + "step": 6176, + "time_per_iteration": 3.2511332035064697 + }, + { + "auxiliary_loss_clip": 0.01151294, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.04720902, + "balance_loss_mlp": 1.01960027, + "epoch": 0.7427403354776649, + "flos": 26432049265920.0, + "grad_norm": 2.1081652400497752, + "language_loss": 0.789608, + "learning_rate": 6.54845194713369e-07, + "loss": 0.81139547, + "num_input_tokens_seen": 132735590, + "step": 6177, + "time_per_iteration": 2.490739345550537 + }, + { + "auxiliary_loss_clip": 0.0115012, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.04556274, + "balance_loss_mlp": 1.02201223, + "epoch": 0.742860578368304, + "flos": 19898390102400.0, + "grad_norm": 1.9969922482400364, + "language_loss": 0.79637825, + "learning_rate": 6.542688368763034e-07, + "loss": 0.81816959, + "num_input_tokens_seen": 132753995, + "step": 6178, + "time_per_iteration": 2.444950819015503 + }, + { + "auxiliary_loss_clip": 0.01154099, + "auxiliary_loss_mlp": 0.01024183, + "balance_loss_clip": 1.04741025, + "balance_loss_mlp": 1.01720583, + "epoch": 0.742980821258943, + "flos": 24827201510400.0, + "grad_norm": 1.5660677452198077, + "language_loss": 0.76812702, + "learning_rate": 6.536926831855854e-07, + "loss": 0.78990984, + "num_input_tokens_seen": 132773160, + "step": 6179, + "time_per_iteration": 3.228813409805298 + }, + { + "auxiliary_loss_clip": 0.01138636, + "auxiliary_loss_mlp": 0.01023332, + "balance_loss_clip": 1.04503608, + "balance_loss_mlp": 1.01610804, + "epoch": 0.7431010641495821, + "flos": 25228646887680.0, + "grad_norm": 2.0617689953801293, + "language_loss": 0.72851372, + "learning_rate": 6.531167337286165e-07, + "loss": 0.75013345, + "num_input_tokens_seen": 132793180, + "step": 6180, + "time_per_iteration": 2.505481719970703 + }, + { + "auxiliary_loss_clip": 0.01140948, + "auxiliary_loss_mlp": 0.01022632, + "balance_loss_clip": 1.0465306, + "balance_loss_mlp": 1.01555705, + "epoch": 0.7432213070402213, + "flos": 21762369550080.0, + "grad_norm": 1.668112340967331, + "language_loss": 0.79740429, + "learning_rate": 6.52540988592768e-07, + "loss": 0.81904006, + "num_input_tokens_seen": 132814200, + "step": 6181, + "time_per_iteration": 2.511209011077881 + }, + { + "auxiliary_loss_clip": 0.0114197, + "auxiliary_loss_mlp": 0.01023359, + "balance_loss_clip": 1.04432285, + "balance_loss_mlp": 1.01613832, + "epoch": 0.7433415499308603, + "flos": 14793832425600.0, + "grad_norm": 2.3221056791086236, + "language_loss": 0.83214456, + "learning_rate": 6.519654478653814e-07, + "loss": 0.85379785, + "num_input_tokens_seen": 132832565, + "step": 6182, + "time_per_iteration": 2.475759744644165 + }, + { + "auxiliary_loss_clip": 0.01049165, + "auxiliary_loss_mlp": 0.01000195, + "balance_loss_clip": 1.01003802, + "balance_loss_mlp": 0.99924153, + "epoch": 0.7434617928214994, + "flos": 67155577297920.0, + "grad_norm": 0.7526400675407713, + "language_loss": 0.5610947, + "learning_rate": 6.51390111633763e-07, + "loss": 0.58158833, + "num_input_tokens_seen": 132897840, + "step": 6183, + "time_per_iteration": 3.1323916912078857 + }, + { + "auxiliary_loss_clip": 0.01097874, + "auxiliary_loss_mlp": 0.01021599, + "balance_loss_clip": 1.03866935, + "balance_loss_mlp": 1.01455128, + "epoch": 0.7435820357121385, + "flos": 27377576928000.0, + "grad_norm": 1.7087059609435682, + "language_loss": 0.76085699, + "learning_rate": 6.508149799851932e-07, + "loss": 0.7820518, + "num_input_tokens_seen": 132919505, + "step": 6184, + "time_per_iteration": 2.6548895835876465 + }, + { + "auxiliary_loss_clip": 0.01135884, + "auxiliary_loss_mlp": 0.0102168, + "balance_loss_clip": 1.04391682, + "balance_loss_mlp": 1.01479876, + "epoch": 0.7437022786027776, + "flos": 23987645948160.0, + "grad_norm": 1.8427897465605203, + "language_loss": 0.61140382, + "learning_rate": 6.502400530069183e-07, + "loss": 0.63297951, + "num_input_tokens_seen": 132939390, + "step": 6185, + "time_per_iteration": 2.5040862560272217 + }, + { + "auxiliary_loss_clip": 0.0112891, + "auxiliary_loss_mlp": 0.01029269, + "balance_loss_clip": 1.04448271, + "balance_loss_mlp": 1.02151179, + "epoch": 0.7438225214934167, + "flos": 21866761451520.0, + "grad_norm": 1.700831098236158, + "language_loss": 0.68310654, + "learning_rate": 6.496653307861535e-07, + "loss": 0.70468831, + "num_input_tokens_seen": 132960060, + "step": 6186, + "time_per_iteration": 2.5425798892974854 + }, + { + "auxiliary_loss_clip": 0.01160067, + "auxiliary_loss_mlp": 0.01026177, + "balance_loss_clip": 1.04750574, + "balance_loss_mlp": 1.01880419, + "epoch": 0.7439427643840558, + "flos": 20230097224320.0, + "grad_norm": 1.8083927514401519, + "language_loss": 0.65750122, + "learning_rate": 6.490908134100857e-07, + "loss": 0.67936373, + "num_input_tokens_seen": 132978525, + "step": 6187, + "time_per_iteration": 2.46268630027771 + }, + { + "auxiliary_loss_clip": 0.01159836, + "auxiliary_loss_mlp": 0.01025991, + "balance_loss_clip": 1.04708576, + "balance_loss_mlp": 1.01837599, + "epoch": 0.7440630072746949, + "flos": 20849915335680.0, + "grad_norm": 2.2024034622274926, + "language_loss": 0.69336563, + "learning_rate": 6.48516500965866e-07, + "loss": 0.71522391, + "num_input_tokens_seen": 132998460, + "step": 6188, + "time_per_iteration": 2.459895372390747 + }, + { + "auxiliary_loss_clip": 0.01156011, + "auxiliary_loss_mlp": 0.01022466, + "balance_loss_clip": 1.04329014, + "balance_loss_mlp": 1.01497948, + "epoch": 0.7441832501653339, + "flos": 26503762769280.0, + "grad_norm": 1.7170271675026443, + "language_loss": 0.81642121, + "learning_rate": 6.479423935406192e-07, + "loss": 0.83820599, + "num_input_tokens_seen": 133018445, + "step": 6189, + "time_per_iteration": 2.5103156566619873 + }, + { + "auxiliary_loss_clip": 0.01039848, + "auxiliary_loss_mlp": 0.01008831, + "balance_loss_clip": 1.01092911, + "balance_loss_mlp": 1.00788307, + "epoch": 0.7443034930559731, + "flos": 68602848088320.0, + "grad_norm": 0.8082894153202407, + "language_loss": 0.61976862, + "learning_rate": 6.473684912214357e-07, + "loss": 0.64025545, + "num_input_tokens_seen": 133082005, + "step": 6190, + "time_per_iteration": 3.2033698558807373 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.01021784, + "balance_loss_clip": 1.04800069, + "balance_loss_mlp": 1.01452458, + "epoch": 0.7444237359466122, + "flos": 18654982951680.0, + "grad_norm": 2.024862957824277, + "language_loss": 0.69992888, + "learning_rate": 6.467947940953778e-07, + "loss": 0.7217021, + "num_input_tokens_seen": 133100530, + "step": 6191, + "time_per_iteration": 2.4580438137054443 + }, + { + "auxiliary_loss_clip": 0.01141458, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.04403889, + "balance_loss_mlp": 1.02182925, + "epoch": 0.7445439788372512, + "flos": 22817604326400.0, + "grad_norm": 1.7310967024674206, + "language_loss": 0.72561753, + "learning_rate": 6.462213022494732e-07, + "loss": 0.74732178, + "num_input_tokens_seen": 133119775, + "step": 6192, + "time_per_iteration": 2.49180269241333 + }, + { + "auxiliary_loss_clip": 0.01056768, + "auxiliary_loss_mlp": 0.010015, + "balance_loss_clip": 1.00843251, + "balance_loss_mlp": 1.00054598, + "epoch": 0.7446642217278904, + "flos": 67045690615680.0, + "grad_norm": 0.7727911869765186, + "language_loss": 0.61043864, + "learning_rate": 6.456480157707201e-07, + "loss": 0.63102132, + "num_input_tokens_seen": 133184550, + "step": 6193, + "time_per_iteration": 2.9959917068481445 + }, + { + "auxiliary_loss_clip": 0.0111946, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.04166543, + "balance_loss_mlp": 1.01985908, + "epoch": 0.7447844646185294, + "flos": 17417465631360.0, + "grad_norm": 2.1561354135992556, + "language_loss": 0.85044694, + "learning_rate": 6.450749347460866e-07, + "loss": 0.87192225, + "num_input_tokens_seen": 133201525, + "step": 6194, + "time_per_iteration": 2.555060625076294 + }, + { + "auxiliary_loss_clip": 0.01169257, + "auxiliary_loss_mlp": 0.01026552, + "balance_loss_clip": 1.04643476, + "balance_loss_mlp": 1.0194118, + "epoch": 0.7449047075091685, + "flos": 26615876094720.0, + "grad_norm": 1.800283515608775, + "language_loss": 0.78838241, + "learning_rate": 6.445020592625083e-07, + "loss": 0.81034046, + "num_input_tokens_seen": 133222175, + "step": 6195, + "time_per_iteration": 2.482860565185547 + }, + { + "auxiliary_loss_clip": 0.01167975, + "auxiliary_loss_mlp": 0.01026513, + "balance_loss_clip": 1.04567206, + "balance_loss_mlp": 1.01892209, + "epoch": 0.7450249503998077, + "flos": 14170458867840.0, + "grad_norm": 9.319356129239948, + "language_loss": 0.80276322, + "learning_rate": 6.4392938940689e-07, + "loss": 0.8247081, + "num_input_tokens_seen": 133237590, + "step": 6196, + "time_per_iteration": 2.428792953491211 + }, + { + "auxiliary_loss_clip": 0.0111076, + "auxiliary_loss_mlp": 0.00762235, + "balance_loss_clip": 1.0428077, + "balance_loss_mlp": 1.0005734, + "epoch": 0.7451451932904467, + "flos": 19606687752960.0, + "grad_norm": 2.1212138624393977, + "language_loss": 0.70983148, + "learning_rate": 6.433569252661049e-07, + "loss": 0.72856146, + "num_input_tokens_seen": 133255590, + "step": 6197, + "time_per_iteration": 2.551971435546875 + }, + { + "auxiliary_loss_clip": 0.01120741, + "auxiliary_loss_mlp": 0.01023453, + "balance_loss_clip": 1.04203546, + "balance_loss_mlp": 1.01640177, + "epoch": 0.7452654361810858, + "flos": 12495405980160.0, + "grad_norm": 2.0480541412061526, + "language_loss": 0.7155695, + "learning_rate": 6.427846669269952e-07, + "loss": 0.73701143, + "num_input_tokens_seen": 133273210, + "step": 6198, + "time_per_iteration": 2.5511393547058105 + }, + { + "auxiliary_loss_clip": 0.01171101, + "auxiliary_loss_mlp": 0.01029537, + "balance_loss_clip": 1.05098152, + "balance_loss_mlp": 1.02259874, + "epoch": 0.7453856790717249, + "flos": 22127329687680.0, + "grad_norm": 1.9950222328931428, + "language_loss": 0.82693338, + "learning_rate": 6.422126144763729e-07, + "loss": 0.84893978, + "num_input_tokens_seen": 133292600, + "step": 6199, + "time_per_iteration": 2.4129726886749268 + }, + { + "auxiliary_loss_clip": 0.01125113, + "auxiliary_loss_mlp": 0.00762234, + "balance_loss_clip": 1.04000914, + "balance_loss_mlp": 1.00052655, + "epoch": 0.745505921962364, + "flos": 20010682995840.0, + "grad_norm": 3.051465889519726, + "language_loss": 0.76279563, + "learning_rate": 6.416407680010174e-07, + "loss": 0.78166914, + "num_input_tokens_seen": 133306960, + "step": 6200, + "time_per_iteration": 3.261103868484497 + }, + { + "auxiliary_loss_clip": 0.0112726, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.04567206, + "balance_loss_mlp": 1.02048576, + "epoch": 0.745626164853003, + "flos": 24677884673280.0, + "grad_norm": 2.901202734752205, + "language_loss": 0.81057507, + "learning_rate": 6.410691275876774e-07, + "loss": 0.83213365, + "num_input_tokens_seen": 133326380, + "step": 6201, + "time_per_iteration": 3.35756778717041 + }, + { + "auxiliary_loss_clip": 0.01148293, + "auxiliary_loss_mlp": 0.01024424, + "balance_loss_clip": 1.04651606, + "balance_loss_mlp": 1.01700354, + "epoch": 0.7457464077436422, + "flos": 14538830797440.0, + "grad_norm": 2.73272719801431, + "language_loss": 0.7716105, + "learning_rate": 6.404976933230704e-07, + "loss": 0.7933377, + "num_input_tokens_seen": 133342900, + "step": 6202, + "time_per_iteration": 2.4758174419403076 + }, + { + "auxiliary_loss_clip": 0.01146267, + "auxiliary_loss_mlp": 0.01026361, + "balance_loss_clip": 1.04579306, + "balance_loss_mlp": 1.01873159, + "epoch": 0.7458666506342813, + "flos": 34021194600960.0, + "grad_norm": 2.5322437460209457, + "language_loss": 0.72848499, + "learning_rate": 6.399264652938813e-07, + "loss": 0.75021124, + "num_input_tokens_seen": 133363805, + "step": 6203, + "time_per_iteration": 3.2850921154022217 + }, + { + "auxiliary_loss_clip": 0.01138831, + "auxiliary_loss_mlp": 0.01022244, + "balance_loss_clip": 1.04378605, + "balance_loss_mlp": 1.01484156, + "epoch": 0.7459868935249203, + "flos": 24279025075200.0, + "grad_norm": 1.9040202707786222, + "language_loss": 0.74614334, + "learning_rate": 6.393554435867679e-07, + "loss": 0.76775408, + "num_input_tokens_seen": 133384655, + "step": 6204, + "time_per_iteration": 2.5178303718566895 + }, + { + "auxiliary_loss_clip": 0.01123385, + "auxiliary_loss_mlp": 0.01024795, + "balance_loss_clip": 1.04195487, + "balance_loss_mlp": 1.01672459, + "epoch": 0.7461071364155595, + "flos": 21908777385600.0, + "grad_norm": 2.1675177962499825, + "language_loss": 0.83302867, + "learning_rate": 6.387846282883502e-07, + "loss": 0.85451043, + "num_input_tokens_seen": 133401185, + "step": 6205, + "time_per_iteration": 2.539149045944214 + }, + { + "auxiliary_loss_clip": 0.01167388, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.0470227, + "balance_loss_mlp": 1.0187062, + "epoch": 0.7462273793061985, + "flos": 22889712879360.0, + "grad_norm": 1.9599604284238445, + "language_loss": 0.77172559, + "learning_rate": 6.38214019485223e-07, + "loss": 0.79365999, + "num_input_tokens_seen": 133420010, + "step": 6206, + "time_per_iteration": 3.1595358848571777 + }, + { + "auxiliary_loss_clip": 0.01094109, + "auxiliary_loss_mlp": 0.01025243, + "balance_loss_clip": 1.03806591, + "balance_loss_mlp": 1.01771522, + "epoch": 0.7463476221968376, + "flos": 19968451580160.0, + "grad_norm": 1.9419220181879653, + "language_loss": 0.71351391, + "learning_rate": 6.376436172639461e-07, + "loss": 0.73470747, + "num_input_tokens_seen": 133437855, + "step": 6207, + "time_per_iteration": 2.5912249088287354 + }, + { + "auxiliary_loss_clip": 0.010861, + "auxiliary_loss_mlp": 0.01026936, + "balance_loss_clip": 1.03814101, + "balance_loss_mlp": 1.01903224, + "epoch": 0.7464678650874768, + "flos": 16836610798080.0, + "grad_norm": 2.3000870792498147, + "language_loss": 0.64785331, + "learning_rate": 6.370734217110487e-07, + "loss": 0.6689837, + "num_input_tokens_seen": 133456600, + "step": 6208, + "time_per_iteration": 2.734255313873291 + }, + { + "auxiliary_loss_clip": 0.01143992, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.04753208, + "balance_loss_mlp": 1.0218302, + "epoch": 0.7465881079781158, + "flos": 48100869843840.0, + "grad_norm": 1.6086953710667122, + "language_loss": 0.64250714, + "learning_rate": 6.36503432913031e-07, + "loss": 0.66424358, + "num_input_tokens_seen": 133479745, + "step": 6209, + "time_per_iteration": 2.887728691101074 + }, + { + "auxiliary_loss_clip": 0.0115336, + "auxiliary_loss_mlp": 0.01026901, + "balance_loss_clip": 1.04646301, + "balance_loss_mlp": 1.01921225, + "epoch": 0.7467083508687549, + "flos": 19677359761920.0, + "grad_norm": 2.037401346982849, + "language_loss": 0.68887788, + "learning_rate": 6.359336509563569e-07, + "loss": 0.71068048, + "num_input_tokens_seen": 133495765, + "step": 6210, + "time_per_iteration": 2.4512202739715576 + }, + { + "auxiliary_loss_clip": 0.01115327, + "auxiliary_loss_mlp": 0.01031512, + "balance_loss_clip": 1.04255116, + "balance_loss_mlp": 1.02403116, + "epoch": 0.7468285937593939, + "flos": 17895436934400.0, + "grad_norm": 1.983450996640855, + "language_loss": 0.80382401, + "learning_rate": 6.353640759274641e-07, + "loss": 0.82529241, + "num_input_tokens_seen": 133514655, + "step": 6211, + "time_per_iteration": 2.4982385635375977 + }, + { + "auxiliary_loss_clip": 0.01151412, + "auxiliary_loss_mlp": 0.01023035, + "balance_loss_clip": 1.04362071, + "balance_loss_mlp": 1.01524484, + "epoch": 0.7469488366500331, + "flos": 23141446369920.0, + "grad_norm": 2.92090824593919, + "language_loss": 0.74836409, + "learning_rate": 6.347947079127556e-07, + "loss": 0.77010858, + "num_input_tokens_seen": 133532555, + "step": 6212, + "time_per_iteration": 2.474867582321167 + }, + { + "auxiliary_loss_clip": 0.01137084, + "auxiliary_loss_mlp": 0.01024047, + "balance_loss_clip": 1.04448426, + "balance_loss_mlp": 1.01632774, + "epoch": 0.7470690795406721, + "flos": 16690849407360.0, + "grad_norm": 2.3956381416553687, + "language_loss": 0.76398957, + "learning_rate": 6.342255469986053e-07, + "loss": 0.7856009, + "num_input_tokens_seen": 133551300, + "step": 6213, + "time_per_iteration": 2.47198748588562 + }, + { + "auxiliary_loss_clip": 0.01166421, + "auxiliary_loss_mlp": 0.01024303, + "balance_loss_clip": 1.04644346, + "balance_loss_mlp": 1.01673293, + "epoch": 0.7471893224313112, + "flos": 25192700352000.0, + "grad_norm": 2.0148534171554537, + "language_loss": 0.7663402, + "learning_rate": 6.336565932713533e-07, + "loss": 0.78824747, + "num_input_tokens_seen": 133570725, + "step": 6214, + "time_per_iteration": 2.4666154384613037 + }, + { + "auxiliary_loss_clip": 0.011378, + "auxiliary_loss_mlp": 0.01026103, + "balance_loss_clip": 1.04663289, + "balance_loss_mlp": 1.01841998, + "epoch": 0.7473095653219504, + "flos": 22526225199360.0, + "grad_norm": 2.0039801645830426, + "language_loss": 0.77660328, + "learning_rate": 6.330878468173088e-07, + "loss": 0.79824227, + "num_input_tokens_seen": 133590790, + "step": 6215, + "time_per_iteration": 2.507344961166382 + }, + { + "auxiliary_loss_clip": 0.01146661, + "auxiliary_loss_mlp": 0.01022785, + "balance_loss_clip": 1.04366148, + "balance_loss_mlp": 1.01539421, + "epoch": 0.7474298082125894, + "flos": 18113989236480.0, + "grad_norm": 1.7590635454407715, + "language_loss": 0.73164737, + "learning_rate": 6.32519307722752e-07, + "loss": 0.75334179, + "num_input_tokens_seen": 133608685, + "step": 6216, + "time_per_iteration": 2.4325711727142334 + }, + { + "auxiliary_loss_clip": 0.01037087, + "auxiliary_loss_mlp": 0.01001291, + "balance_loss_clip": 1.01625681, + "balance_loss_mlp": 1.00002134, + "epoch": 0.7475500511032285, + "flos": 62086535193600.0, + "grad_norm": 0.9962838269710528, + "language_loss": 0.55000901, + "learning_rate": 6.31950976073929e-07, + "loss": 0.57039273, + "num_input_tokens_seen": 133662775, + "step": 6217, + "time_per_iteration": 3.075521230697632 + }, + { + "auxiliary_loss_clip": 0.01107, + "auxiliary_loss_mlp": 0.01027721, + "balance_loss_clip": 1.04165971, + "balance_loss_mlp": 1.02044904, + "epoch": 0.7476702939938676, + "flos": 17785586165760.0, + "grad_norm": 2.6059204109505374, + "language_loss": 0.80816567, + "learning_rate": 6.31382851957055e-07, + "loss": 0.82951295, + "num_input_tokens_seen": 133679595, + "step": 6218, + "time_per_iteration": 2.5229997634887695 + }, + { + "auxiliary_loss_clip": 0.01123063, + "auxiliary_loss_mlp": 0.00762006, + "balance_loss_clip": 1.0440526, + "balance_loss_mlp": 1.00066745, + "epoch": 0.7477905368845067, + "flos": 27927944092800.0, + "grad_norm": 2.069305447819605, + "language_loss": 0.7172215, + "learning_rate": 6.308149354583143e-07, + "loss": 0.73607218, + "num_input_tokens_seen": 133699000, + "step": 6219, + "time_per_iteration": 2.581616163253784 + }, + { + "auxiliary_loss_clip": 0.0115898, + "auxiliary_loss_mlp": 0.01025156, + "balance_loss_clip": 1.04774821, + "balance_loss_mlp": 1.01741362, + "epoch": 0.7479107797751458, + "flos": 26870374932480.0, + "grad_norm": 1.7643779488181042, + "language_loss": 0.81653804, + "learning_rate": 6.302472266638586e-07, + "loss": 0.83837944, + "num_input_tokens_seen": 133719540, + "step": 6220, + "time_per_iteration": 2.5438644886016846 + }, + { + "auxiliary_loss_clip": 0.01175079, + "auxiliary_loss_mlp": 0.01026418, + "balance_loss_clip": 1.04879963, + "balance_loss_mlp": 1.01877117, + "epoch": 0.7480310226657849, + "flos": 33943375785600.0, + "grad_norm": 2.5966241079203267, + "language_loss": 0.69740832, + "learning_rate": 6.296797256598101e-07, + "loss": 0.71942329, + "num_input_tokens_seen": 133741020, + "step": 6221, + "time_per_iteration": 2.5346083641052246 + }, + { + "auxiliary_loss_clip": 0.0111799, + "auxiliary_loss_mlp": 0.01024494, + "balance_loss_clip": 1.04257691, + "balance_loss_mlp": 1.01721346, + "epoch": 0.748151265556424, + "flos": 24826555065600.0, + "grad_norm": 1.93075445493598, + "language_loss": 0.81376898, + "learning_rate": 6.291124325322576e-07, + "loss": 0.83519381, + "num_input_tokens_seen": 133761145, + "step": 6222, + "time_per_iteration": 2.5958242416381836 + }, + { + "auxiliary_loss_clip": 0.01145074, + "auxiliary_loss_mlp": 0.01024341, + "balance_loss_clip": 1.04489577, + "balance_loss_mlp": 1.01675963, + "epoch": 0.748271508447063, + "flos": 38399351535360.0, + "grad_norm": 1.5084683693115255, + "language_loss": 0.62270892, + "learning_rate": 6.285453473672595e-07, + "loss": 0.6444031, + "num_input_tokens_seen": 133783715, + "step": 6223, + "time_per_iteration": 2.6479413509368896 + }, + { + "auxiliary_loss_clip": 0.01165764, + "auxiliary_loss_mlp": 0.0102524, + "balance_loss_clip": 1.04569697, + "balance_loss_mlp": 1.01798332, + "epoch": 0.7483917513377022, + "flos": 21541842000000.0, + "grad_norm": 2.437674442884871, + "language_loss": 0.75512993, + "learning_rate": 6.279784702508415e-07, + "loss": 0.77704, + "num_input_tokens_seen": 133804465, + "step": 6224, + "time_per_iteration": 2.4247922897338867 + }, + { + "auxiliary_loss_clip": 0.01037773, + "auxiliary_loss_mlp": 0.01001053, + "balance_loss_clip": 1.00789464, + "balance_loss_mlp": 1.00003946, + "epoch": 0.7485119942283412, + "flos": 62314532772480.0, + "grad_norm": 0.8026185379986683, + "language_loss": 0.58628702, + "learning_rate": 6.274118012689979e-07, + "loss": 0.60667527, + "num_input_tokens_seen": 133866365, + "step": 6225, + "time_per_iteration": 3.207003116607666 + }, + { + "auxiliary_loss_clip": 0.01132459, + "auxiliary_loss_mlp": 0.01020357, + "balance_loss_clip": 1.04309177, + "balance_loss_mlp": 1.01319265, + "epoch": 0.7486322371189803, + "flos": 29937613104000.0, + "grad_norm": 1.4923896085906483, + "language_loss": 0.6837827, + "learning_rate": 6.268453405076943e-07, + "loss": 0.70531088, + "num_input_tokens_seen": 133888760, + "step": 6226, + "time_per_iteration": 2.581244468688965 + }, + { + "auxiliary_loss_clip": 0.0114034, + "auxiliary_loss_mlp": 0.0102453, + "balance_loss_clip": 1.0438931, + "balance_loss_mlp": 1.01779783, + "epoch": 0.7487524800096195, + "flos": 18949414734720.0, + "grad_norm": 2.0138312082251355, + "language_loss": 0.82397771, + "learning_rate": 6.262790880528592e-07, + "loss": 0.84562641, + "num_input_tokens_seen": 133906380, + "step": 6227, + "time_per_iteration": 3.2171056270599365 + }, + { + "auxiliary_loss_clip": 0.01136705, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.04130697, + "balance_loss_mlp": 1.01913309, + "epoch": 0.7488727229002585, + "flos": 18697393935360.0, + "grad_norm": 3.146534127811033, + "language_loss": 0.79575741, + "learning_rate": 6.257130439903951e-07, + "loss": 0.81739497, + "num_input_tokens_seen": 133922875, + "step": 6228, + "time_per_iteration": 3.2803120613098145 + }, + { + "auxiliary_loss_clip": 0.01171585, + "auxiliary_loss_mlp": 0.01025237, + "balance_loss_clip": 1.04946995, + "balance_loss_mlp": 1.0177027, + "epoch": 0.7489929657908976, + "flos": 23623368168960.0, + "grad_norm": 2.8903667535410262, + "language_loss": 0.80947113, + "learning_rate": 6.251472084061695e-07, + "loss": 0.83143938, + "num_input_tokens_seen": 133941795, + "step": 6229, + "time_per_iteration": 3.2094674110412598 + }, + { + "auxiliary_loss_clip": 0.011551, + "auxiliary_loss_mlp": 0.01025713, + "balance_loss_clip": 1.04839277, + "balance_loss_mlp": 1.01858759, + "epoch": 0.7491132086815367, + "flos": 20551533056640.0, + "grad_norm": 2.049361433897669, + "language_loss": 0.88905609, + "learning_rate": 6.245815813860191e-07, + "loss": 0.91086423, + "num_input_tokens_seen": 133957305, + "step": 6230, + "time_per_iteration": 2.445674180984497 + }, + { + "auxiliary_loss_clip": 0.01170571, + "auxiliary_loss_mlp": 0.01024103, + "balance_loss_clip": 1.04598629, + "balance_loss_mlp": 1.01665521, + "epoch": 0.7492334515721758, + "flos": 23003011353600.0, + "grad_norm": 2.3080108224754374, + "language_loss": 0.70222902, + "learning_rate": 6.240161630157495e-07, + "loss": 0.72417569, + "num_input_tokens_seen": 133976660, + "step": 6231, + "time_per_iteration": 2.444093942642212 + }, + { + "auxiliary_loss_clip": 0.01172589, + "auxiliary_loss_mlp": 0.01023269, + "balance_loss_clip": 1.04859805, + "balance_loss_mlp": 1.01572037, + "epoch": 0.7493536944628149, + "flos": 16398823835520.0, + "grad_norm": 2.9619521918767178, + "language_loss": 0.70019102, + "learning_rate": 6.23450953381133e-07, + "loss": 0.72214967, + "num_input_tokens_seen": 133994750, + "step": 6232, + "time_per_iteration": 3.219876527786255 + }, + { + "auxiliary_loss_clip": 0.01134271, + "auxiliary_loss_mlp": 0.0102303, + "balance_loss_clip": 1.04387081, + "balance_loss_mlp": 1.01585984, + "epoch": 0.749473937353454, + "flos": 15338561155200.0, + "grad_norm": 1.9683613369747013, + "language_loss": 0.67729664, + "learning_rate": 6.228859525679131e-07, + "loss": 0.69886971, + "num_input_tokens_seen": 134009165, + "step": 6233, + "time_per_iteration": 2.5174014568328857 + }, + { + "auxiliary_loss_clip": 0.01154062, + "auxiliary_loss_mlp": 0.01025008, + "balance_loss_clip": 1.04579008, + "balance_loss_mlp": 1.01757526, + "epoch": 0.7495941802440931, + "flos": 18951138587520.0, + "grad_norm": 3.3859150459894654, + "language_loss": 0.7996906, + "learning_rate": 6.223211606617986e-07, + "loss": 0.82148129, + "num_input_tokens_seen": 134027585, + "step": 6234, + "time_per_iteration": 2.450185537338257 + }, + { + "auxiliary_loss_clip": 0.01153559, + "auxiliary_loss_mlp": 0.01023692, + "balance_loss_clip": 1.04931986, + "balance_loss_mlp": 1.01725793, + "epoch": 0.7497144231347321, + "flos": 22492469393280.0, + "grad_norm": 1.802198471096273, + "language_loss": 0.84229624, + "learning_rate": 6.217565777484701e-07, + "loss": 0.86406869, + "num_input_tokens_seen": 134046680, + "step": 6235, + "time_per_iteration": 2.4562768936157227 + }, + { + "auxiliary_loss_clip": 0.01135307, + "auxiliary_loss_mlp": 0.00761973, + "balance_loss_clip": 1.04305696, + "balance_loss_mlp": 1.00054395, + "epoch": 0.7498346660253713, + "flos": 24243509502720.0, + "grad_norm": 1.8134172355057325, + "language_loss": 0.80326581, + "learning_rate": 6.211922039135722e-07, + "loss": 0.82223862, + "num_input_tokens_seen": 134066825, + "step": 6236, + "time_per_iteration": 2.5094094276428223 + }, + { + "auxiliary_loss_clip": 0.01169719, + "auxiliary_loss_mlp": 0.01025808, + "balance_loss_clip": 1.04846048, + "balance_loss_mlp": 1.01844978, + "epoch": 0.7499549089160104, + "flos": 24387080163840.0, + "grad_norm": 3.9149932811955197, + "language_loss": 0.81091022, + "learning_rate": 6.206280392427201e-07, + "loss": 0.83286548, + "num_input_tokens_seen": 134086410, + "step": 6237, + "time_per_iteration": 2.4481959342956543 + }, + { + "auxiliary_loss_clip": 0.01148029, + "auxiliary_loss_mlp": 0.0102378, + "balance_loss_clip": 1.04328394, + "balance_loss_mlp": 1.01625156, + "epoch": 0.7500751518066494, + "flos": 34057320704640.0, + "grad_norm": 1.5577432783198906, + "language_loss": 0.73492271, + "learning_rate": 6.200640838214983e-07, + "loss": 0.75664079, + "num_input_tokens_seen": 134109185, + "step": 6238, + "time_per_iteration": 2.565852403640747 + }, + { + "auxiliary_loss_clip": 0.01167743, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.04700065, + "balance_loss_mlp": 1.02011013, + "epoch": 0.7501953946972886, + "flos": 18843586289280.0, + "grad_norm": 1.868417767279132, + "language_loss": 0.66750288, + "learning_rate": 6.195003377354578e-07, + "loss": 0.68945646, + "num_input_tokens_seen": 134128455, + "step": 6239, + "time_per_iteration": 2.423114538192749 + }, + { + "auxiliary_loss_clip": 0.01151753, + "auxiliary_loss_mlp": 0.01025428, + "balance_loss_clip": 1.04430926, + "balance_loss_mlp": 1.01763451, + "epoch": 0.7503156375879276, + "flos": 20257675891200.0, + "grad_norm": 3.1267254992210103, + "language_loss": 0.73722208, + "learning_rate": 6.189368010701183e-07, + "loss": 0.75899386, + "num_input_tokens_seen": 134145515, + "step": 6240, + "time_per_iteration": 2.4272027015686035 + }, + { + "auxiliary_loss_clip": 0.01158318, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.04418898, + "balance_loss_mlp": 1.01976442, + "epoch": 0.7504358804785667, + "flos": 13480040574720.0, + "grad_norm": 2.069993436399623, + "language_loss": 0.76498717, + "learning_rate": 6.183734739109683e-07, + "loss": 0.78684258, + "num_input_tokens_seen": 134163335, + "step": 6241, + "time_per_iteration": 2.435854911804199 + }, + { + "auxiliary_loss_clip": 0.01163268, + "auxiliary_loss_mlp": 0.01024094, + "balance_loss_clip": 1.04810166, + "balance_loss_mlp": 1.01628542, + "epoch": 0.7505561233692057, + "flos": 29461042431360.0, + "grad_norm": 2.123562738576487, + "language_loss": 0.68715012, + "learning_rate": 6.178103563434629e-07, + "loss": 0.70902377, + "num_input_tokens_seen": 134182335, + "step": 6242, + "time_per_iteration": 2.5127718448638916 + }, + { + "auxiliary_loss_clip": 0.01169076, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.04752314, + "balance_loss_mlp": 1.02197957, + "epoch": 0.7506763662598449, + "flos": 20302457172480.0, + "grad_norm": 1.6544114648881867, + "language_loss": 0.84154224, + "learning_rate": 6.172474484530283e-07, + "loss": 0.86352623, + "num_input_tokens_seen": 134201070, + "step": 6243, + "time_per_iteration": 2.410017967224121 + }, + { + "auxiliary_loss_clip": 0.01130887, + "auxiliary_loss_mlp": 0.01025603, + "balance_loss_clip": 1.04041409, + "balance_loss_mlp": 1.01801562, + "epoch": 0.750796609150484, + "flos": 37230961939200.0, + "grad_norm": 2.6013862561754184, + "language_loss": 0.75681186, + "learning_rate": 6.166847503250563e-07, + "loss": 0.7783767, + "num_input_tokens_seen": 134223310, + "step": 6244, + "time_per_iteration": 2.6201353073120117 + }, + { + "auxiliary_loss_clip": 0.01143035, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.04509306, + "balance_loss_mlp": 1.0195415, + "epoch": 0.750916852041123, + "flos": 19609417186560.0, + "grad_norm": 2.2785729723231283, + "language_loss": 0.7900508, + "learning_rate": 6.161222620449078e-07, + "loss": 0.81175059, + "num_input_tokens_seen": 134242085, + "step": 6245, + "time_per_iteration": 2.515126943588257 + }, + { + "auxiliary_loss_clip": 0.01130452, + "auxiliary_loss_mlp": 0.01029066, + "balance_loss_clip": 1.04406643, + "balance_loss_mlp": 1.02164245, + "epoch": 0.7510370949317622, + "flos": 25112690807040.0, + "grad_norm": 2.617640742423248, + "language_loss": 0.80176902, + "learning_rate": 6.155599836979117e-07, + "loss": 0.82336426, + "num_input_tokens_seen": 134260770, + "step": 6246, + "time_per_iteration": 2.5639989376068115 + }, + { + "auxiliary_loss_clip": 0.01113541, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.04079318, + "balance_loss_mlp": 1.02223182, + "epoch": 0.7511573378224012, + "flos": 19062282245760.0, + "grad_norm": 2.2261725725570907, + "language_loss": 0.81669372, + "learning_rate": 6.149979153693649e-07, + "loss": 0.8381319, + "num_input_tokens_seen": 134278025, + "step": 6247, + "time_per_iteration": 2.542158603668213 + }, + { + "auxiliary_loss_clip": 0.01153112, + "auxiliary_loss_mlp": 0.01023074, + "balance_loss_clip": 1.045295, + "balance_loss_mlp": 1.01560259, + "epoch": 0.7512775807130403, + "flos": 19937676602880.0, + "grad_norm": 2.599438416887886, + "language_loss": 0.76592493, + "learning_rate": 6.144360571445343e-07, + "loss": 0.78768682, + "num_input_tokens_seen": 134297170, + "step": 6248, + "time_per_iteration": 2.455000877380371 + }, + { + "auxiliary_loss_clip": 0.01151128, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.04686236, + "balance_loss_mlp": 1.01798844, + "epoch": 0.7513978236036795, + "flos": 20739920912640.0, + "grad_norm": 1.7796050890284478, + "language_loss": 0.80088115, + "learning_rate": 6.138744091086509e-07, + "loss": 0.82264948, + "num_input_tokens_seen": 134316755, + "step": 6249, + "time_per_iteration": 2.4788007736206055 + }, + { + "auxiliary_loss_clip": 0.01132646, + "auxiliary_loss_mlp": 0.01023754, + "balance_loss_clip": 1.04617381, + "balance_loss_mlp": 1.01657772, + "epoch": 0.7515180664943185, + "flos": 27563163523200.0, + "grad_norm": 2.529848954907498, + "language_loss": 0.72665471, + "learning_rate": 6.133129713469183e-07, + "loss": 0.74821866, + "num_input_tokens_seen": 134335960, + "step": 6250, + "time_per_iteration": 2.56345534324646 + }, + { + "auxiliary_loss_clip": 0.01135465, + "auxiliary_loss_mlp": 0.01024357, + "balance_loss_clip": 1.04224229, + "balance_loss_mlp": 1.01625705, + "epoch": 0.7516383093849576, + "flos": 33803181002880.0, + "grad_norm": 1.7798953130109685, + "language_loss": 0.63691491, + "learning_rate": 6.127517439445053e-07, + "loss": 0.65851313, + "num_input_tokens_seen": 134356805, + "step": 6251, + "time_per_iteration": 2.6387720108032227 + }, + { + "auxiliary_loss_clip": 0.01105116, + "auxiliary_loss_mlp": 0.01025301, + "balance_loss_clip": 1.04153597, + "balance_loss_mlp": 1.01826477, + "epoch": 0.7517585522755967, + "flos": 29746172592000.0, + "grad_norm": 1.859845056183207, + "language_loss": 0.81769121, + "learning_rate": 6.121907269865498e-07, + "loss": 0.83899534, + "num_input_tokens_seen": 134376295, + "step": 6252, + "time_per_iteration": 2.611422538757324 + }, + { + "auxiliary_loss_clip": 0.01030072, + "auxiliary_loss_mlp": 0.01001731, + "balance_loss_clip": 1.00832033, + "balance_loss_mlp": 1.00069416, + "epoch": 0.7518787951662358, + "flos": 69807974319360.0, + "grad_norm": 0.9276140960468029, + "language_loss": 0.67212021, + "learning_rate": 6.116299205581577e-07, + "loss": 0.69243824, + "num_input_tokens_seen": 134431125, + "step": 6253, + "time_per_iteration": 3.067988872528076 + }, + { + "auxiliary_loss_clip": 0.01174663, + "auxiliary_loss_mlp": 0.01027081, + "balance_loss_clip": 1.04988325, + "balance_loss_mlp": 1.01903093, + "epoch": 0.7519990380568748, + "flos": 34203225749760.0, + "grad_norm": 1.9415691857638333, + "language_loss": 0.68623221, + "learning_rate": 6.110693247444018e-07, + "loss": 0.70824963, + "num_input_tokens_seen": 134452960, + "step": 6254, + "time_per_iteration": 3.3812899589538574 + }, + { + "auxiliary_loss_clip": 0.01115467, + "auxiliary_loss_mlp": 0.01023132, + "balance_loss_clip": 1.04167998, + "balance_loss_mlp": 1.0163846, + "epoch": 0.752119280947514, + "flos": 21725704742400.0, + "grad_norm": 2.0965028802087162, + "language_loss": 0.82158184, + "learning_rate": 6.105089396303258e-07, + "loss": 0.84296781, + "num_input_tokens_seen": 134471350, + "step": 6255, + "time_per_iteration": 3.261983633041382 + }, + { + "auxiliary_loss_clip": 0.01141063, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.04456747, + "balance_loss_mlp": 1.0231843, + "epoch": 0.7522395238381531, + "flos": 32742774668160.0, + "grad_norm": 1.8920524345741638, + "language_loss": 0.75939226, + "learning_rate": 6.099487653009383e-07, + "loss": 0.78111422, + "num_input_tokens_seen": 134490695, + "step": 6256, + "time_per_iteration": 3.330019235610962 + }, + { + "auxiliary_loss_clip": 0.01152426, + "auxiliary_loss_mlp": 0.01023689, + "balance_loss_clip": 1.04399729, + "balance_loss_mlp": 1.01725173, + "epoch": 0.7523597667287921, + "flos": 23476026579840.0, + "grad_norm": 1.9112194551487325, + "language_loss": 0.83104968, + "learning_rate": 6.093888018412192e-07, + "loss": 0.8528108, + "num_input_tokens_seen": 134506885, + "step": 6257, + "time_per_iteration": 2.4978344440460205 + }, + { + "auxiliary_loss_clip": 0.01055505, + "auxiliary_loss_mlp": 0.01000776, + "balance_loss_clip": 1.00742793, + "balance_loss_mlp": 0.99967974, + "epoch": 0.7524800096194313, + "flos": 67346730501120.0, + "grad_norm": 0.7057200570429524, + "language_loss": 0.5464412, + "learning_rate": 6.088290493361125e-07, + "loss": 0.56700397, + "num_input_tokens_seen": 134571770, + "step": 6258, + "time_per_iteration": 3.1836116313934326 + }, + { + "auxiliary_loss_clip": 0.01103558, + "auxiliary_loss_mlp": 0.01023818, + "balance_loss_clip": 1.0404191, + "balance_loss_mlp": 1.01645672, + "epoch": 0.7526002525100703, + "flos": 13006055681280.0, + "grad_norm": 1.9572556191309842, + "language_loss": 0.71316463, + "learning_rate": 6.082695078705322e-07, + "loss": 0.73443842, + "num_input_tokens_seen": 134589250, + "step": 6259, + "time_per_iteration": 3.4924230575561523 + }, + { + "auxiliary_loss_clip": 0.01149061, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.04651535, + "balance_loss_mlp": 1.01784456, + "epoch": 0.7527204954007094, + "flos": 21397229844480.0, + "grad_norm": 2.149943450406662, + "language_loss": 0.68895257, + "learning_rate": 6.077101775293618e-07, + "loss": 0.71070039, + "num_input_tokens_seen": 134608075, + "step": 6260, + "time_per_iteration": 2.5150959491729736 + }, + { + "auxiliary_loss_clip": 0.01156311, + "auxiliary_loss_mlp": 0.01025518, + "balance_loss_clip": 1.04537177, + "balance_loss_mlp": 1.0170182, + "epoch": 0.7528407382913486, + "flos": 18947188091520.0, + "grad_norm": 4.8958279171365335, + "language_loss": 0.82440341, + "learning_rate": 6.071510583974504e-07, + "loss": 0.84622169, + "num_input_tokens_seen": 134623260, + "step": 6261, + "time_per_iteration": 2.424548864364624 + }, + { + "auxiliary_loss_clip": 0.01171868, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.048509, + "balance_loss_mlp": 1.02285671, + "epoch": 0.7529609811819876, + "flos": 15231798956160.0, + "grad_norm": 2.0055849805379506, + "language_loss": 0.72135866, + "learning_rate": 6.065921505596161e-07, + "loss": 0.74338198, + "num_input_tokens_seen": 134641540, + "step": 6262, + "time_per_iteration": 2.407299280166626 + }, + { + "auxiliary_loss_clip": 0.01127365, + "auxiliary_loss_mlp": 0.01025769, + "balance_loss_clip": 1.04410088, + "balance_loss_mlp": 1.01834822, + "epoch": 0.7530812240726267, + "flos": 19354487385600.0, + "grad_norm": 1.6644118081989647, + "language_loss": 0.76886529, + "learning_rate": 6.060334541006445e-07, + "loss": 0.79039663, + "num_input_tokens_seen": 134660035, + "step": 6263, + "time_per_iteration": 2.511859655380249 + }, + { + "auxiliary_loss_clip": 0.01128556, + "auxiliary_loss_mlp": 0.01027668, + "balance_loss_clip": 1.04110193, + "balance_loss_mlp": 1.02027154, + "epoch": 0.7532014669632658, + "flos": 27748247328000.0, + "grad_norm": 1.5478035042045162, + "language_loss": 0.68902194, + "learning_rate": 6.05474969105289e-07, + "loss": 0.7105841, + "num_input_tokens_seen": 134683025, + "step": 6264, + "time_per_iteration": 2.599367380142212 + }, + { + "auxiliary_loss_clip": 0.01157357, + "auxiliary_loss_mlp": 0.01021824, + "balance_loss_clip": 1.04766059, + "balance_loss_mlp": 1.01392603, + "epoch": 0.7533217098539049, + "flos": 14137421333760.0, + "grad_norm": 2.0821704623535413, + "language_loss": 0.73680735, + "learning_rate": 6.049166956582725e-07, + "loss": 0.75859916, + "num_input_tokens_seen": 134701290, + "step": 6265, + "time_per_iteration": 2.442431926727295 + }, + { + "auxiliary_loss_clip": 0.01149969, + "auxiliary_loss_mlp": 0.01021474, + "balance_loss_clip": 1.04413199, + "balance_loss_mlp": 1.01445889, + "epoch": 0.753441952744544, + "flos": 26429068437120.0, + "grad_norm": 1.8729514147754511, + "language_loss": 0.87850332, + "learning_rate": 6.043586338442841e-07, + "loss": 0.90021777, + "num_input_tokens_seen": 134720345, + "step": 6266, + "time_per_iteration": 2.4937522411346436 + }, + { + "auxiliary_loss_clip": 0.01165906, + "auxiliary_loss_mlp": 0.01022545, + "balance_loss_clip": 1.04785454, + "balance_loss_mlp": 1.01622677, + "epoch": 0.7535621956351831, + "flos": 23878621192320.0, + "grad_norm": 2.4024852102906973, + "language_loss": 0.73084199, + "learning_rate": 6.038007837479815e-07, + "loss": 0.75272655, + "num_input_tokens_seen": 134741450, + "step": 6267, + "time_per_iteration": 2.450429916381836 + }, + { + "auxiliary_loss_clip": 0.01152056, + "auxiliary_loss_mlp": 0.01025615, + "balance_loss_clip": 1.04593968, + "balance_loss_mlp": 1.01801205, + "epoch": 0.7536824385258222, + "flos": 21795873960960.0, + "grad_norm": 2.777974967469771, + "language_loss": 0.63988769, + "learning_rate": 6.032431454539897e-07, + "loss": 0.66166437, + "num_input_tokens_seen": 134760295, + "step": 6268, + "time_per_iteration": 2.4668467044830322 + }, + { + "auxiliary_loss_clip": 0.01128841, + "auxiliary_loss_mlp": 0.01025039, + "balance_loss_clip": 1.04362667, + "balance_loss_mlp": 1.01808333, + "epoch": 0.7538026814164612, + "flos": 28911644933760.0, + "grad_norm": 1.7176283626392377, + "language_loss": 0.81950635, + "learning_rate": 6.026857190469014e-07, + "loss": 0.84104514, + "num_input_tokens_seen": 134782050, + "step": 6269, + "time_per_iteration": 2.6269593238830566 + }, + { + "auxiliary_loss_clip": 0.01142737, + "auxiliary_loss_mlp": 0.01020097, + "balance_loss_clip": 1.04410982, + "balance_loss_mlp": 1.01261353, + "epoch": 0.7539229243071004, + "flos": 21104701482240.0, + "grad_norm": 2.5231685752788358, + "language_loss": 0.74321419, + "learning_rate": 6.0212850461128e-07, + "loss": 0.76484257, + "num_input_tokens_seen": 134801170, + "step": 6270, + "time_per_iteration": 2.491837501525879 + }, + { + "auxiliary_loss_clip": 0.01143136, + "auxiliary_loss_mlp": 0.01025597, + "balance_loss_clip": 1.04357469, + "balance_loss_mlp": 1.01769948, + "epoch": 0.7540431671977395, + "flos": 15158469340800.0, + "grad_norm": 2.068254034183314, + "language_loss": 0.74906039, + "learning_rate": 6.015715022316516e-07, + "loss": 0.77074772, + "num_input_tokens_seen": 134819150, + "step": 6271, + "time_per_iteration": 2.5116705894470215 + }, + { + "auxiliary_loss_clip": 0.01113715, + "auxiliary_loss_mlp": 0.01021158, + "balance_loss_clip": 1.0399611, + "balance_loss_mlp": 1.01306355, + "epoch": 0.7541634100883785, + "flos": 18770579896320.0, + "grad_norm": 2.6171874579422014, + "language_loss": 0.77926838, + "learning_rate": 6.010147119925154e-07, + "loss": 0.80061722, + "num_input_tokens_seen": 134836905, + "step": 6272, + "time_per_iteration": 2.5537827014923096 + }, + { + "auxiliary_loss_clip": 0.01120062, + "auxiliary_loss_mlp": 0.01027317, + "balance_loss_clip": 1.04341292, + "balance_loss_mlp": 1.01948142, + "epoch": 0.7542836529790176, + "flos": 20594770053120.0, + "grad_norm": 1.963561097880035, + "language_loss": 0.66296607, + "learning_rate": 6.004581339783348e-07, + "loss": 0.68443984, + "num_input_tokens_seen": 134855225, + "step": 6273, + "time_per_iteration": 2.531003475189209 + }, + { + "auxiliary_loss_clip": 0.01161373, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.04845548, + "balance_loss_mlp": 1.0249784, + "epoch": 0.7544038958696567, + "flos": 19095104298240.0, + "grad_norm": 5.360091956054326, + "language_loss": 0.68806106, + "learning_rate": 5.999017682735425e-07, + "loss": 0.71000689, + "num_input_tokens_seen": 134871615, + "step": 6274, + "time_per_iteration": 2.4589171409606934 + }, + { + "auxiliary_loss_clip": 0.01104773, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.04045999, + "balance_loss_mlp": 1.01974738, + "epoch": 0.7545241387602958, + "flos": 31723306859520.0, + "grad_norm": 1.9462329184402523, + "language_loss": 0.66456223, + "learning_rate": 5.993456149625387e-07, + "loss": 0.68588448, + "num_input_tokens_seen": 134892765, + "step": 6275, + "time_per_iteration": 2.694934844970703 + }, + { + "auxiliary_loss_clip": 0.01117471, + "auxiliary_loss_mlp": 0.01023756, + "balance_loss_clip": 1.04161286, + "balance_loss_mlp": 1.01682961, + "epoch": 0.7546443816509348, + "flos": 20296495514880.0, + "grad_norm": 1.8334126048800938, + "language_loss": 0.82293868, + "learning_rate": 5.987896741296909e-07, + "loss": 0.84435093, + "num_input_tokens_seen": 134910505, + "step": 6276, + "time_per_iteration": 2.521209716796875 + }, + { + "auxiliary_loss_clip": 0.01141951, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.04682302, + "balance_loss_mlp": 1.01918113, + "epoch": 0.754764624541574, + "flos": 23696159080320.0, + "grad_norm": 2.0928170394662695, + "language_loss": 0.78230721, + "learning_rate": 5.982339458593361e-07, + "loss": 0.80399406, + "num_input_tokens_seen": 134930445, + "step": 6277, + "time_per_iteration": 2.5253167152404785 + }, + { + "auxiliary_loss_clip": 0.01151006, + "auxiliary_loss_mlp": 0.0076202, + "balance_loss_clip": 1.04653466, + "balance_loss_mlp": 1.00054288, + "epoch": 0.7548848674322131, + "flos": 25337204766720.0, + "grad_norm": 1.6914972138068698, + "language_loss": 0.84245437, + "learning_rate": 5.976784302357767e-07, + "loss": 0.8615846, + "num_input_tokens_seen": 134951010, + "step": 6278, + "time_per_iteration": 2.498873710632324 + }, + { + "auxiliary_loss_clip": 0.01159009, + "auxiliary_loss_mlp": 0.01028007, + "balance_loss_clip": 1.04794407, + "balance_loss_mlp": 1.02075255, + "epoch": 0.7550051103228521, + "flos": 19573147428480.0, + "grad_norm": 1.8210677753427895, + "language_loss": 0.73342252, + "learning_rate": 5.971231273432855e-07, + "loss": 0.75529265, + "num_input_tokens_seen": 134970495, + "step": 6279, + "time_per_iteration": 2.4720773696899414 + }, + { + "auxiliary_loss_clip": 0.01056459, + "auxiliary_loss_mlp": 0.01001654, + "balance_loss_clip": 1.00907362, + "balance_loss_mlp": 1.00065315, + "epoch": 0.7551253532134913, + "flos": 64150068648960.0, + "grad_norm": 0.8096463739302285, + "language_loss": 0.54612291, + "learning_rate": 5.965680372661e-07, + "loss": 0.56670403, + "num_input_tokens_seen": 135028060, + "step": 6280, + "time_per_iteration": 3.7199387550354004 + }, + { + "auxiliary_loss_clip": 0.01142716, + "auxiliary_loss_mlp": 0.01026002, + "balance_loss_clip": 1.04727376, + "balance_loss_mlp": 1.01934433, + "epoch": 0.7552455961041303, + "flos": 26067986968320.0, + "grad_norm": 1.8384111665372946, + "language_loss": 0.56324482, + "learning_rate": 5.960131600884266e-07, + "loss": 0.58493203, + "num_input_tokens_seen": 135047330, + "step": 6281, + "time_per_iteration": 3.3315725326538086 + }, + { + "auxiliary_loss_clip": 0.01130292, + "auxiliary_loss_mlp": 0.0102331, + "balance_loss_clip": 1.04429889, + "balance_loss_mlp": 1.01640201, + "epoch": 0.7553658389947694, + "flos": 24498223822080.0, + "grad_norm": 3.117340710703474, + "language_loss": 0.76066494, + "learning_rate": 5.954584958944413e-07, + "loss": 0.78220093, + "num_input_tokens_seen": 135065995, + "step": 6282, + "time_per_iteration": 2.5451295375823975 + }, + { + "auxiliary_loss_clip": 0.0112856, + "auxiliary_loss_mlp": 0.00762084, + "balance_loss_clip": 1.04133677, + "balance_loss_mlp": 1.00052118, + "epoch": 0.7554860818854086, + "flos": 21799465320960.0, + "grad_norm": 1.935828674439747, + "language_loss": 0.8138839, + "learning_rate": 5.949040447682854e-07, + "loss": 0.83279032, + "num_input_tokens_seen": 135085820, + "step": 6283, + "time_per_iteration": 3.2941019535064697 + }, + { + "auxiliary_loss_clip": 0.01147078, + "auxiliary_loss_mlp": 0.01023745, + "balance_loss_clip": 1.04527223, + "balance_loss_mlp": 1.01631832, + "epoch": 0.7556063247760476, + "flos": 16362123114240.0, + "grad_norm": 2.1801622910012837, + "language_loss": 0.68550569, + "learning_rate": 5.943498067940686e-07, + "loss": 0.707214, + "num_input_tokens_seen": 135102845, + "step": 6284, + "time_per_iteration": 2.4634134769439697 + }, + { + "auxiliary_loss_clip": 0.01134727, + "auxiliary_loss_mlp": 0.01029264, + "balance_loss_clip": 1.04799449, + "balance_loss_mlp": 1.0216223, + "epoch": 0.7557265676666867, + "flos": 27235155502080.0, + "grad_norm": 1.9404580166258598, + "language_loss": 0.81387937, + "learning_rate": 5.937957820558686e-07, + "loss": 0.83551919, + "num_input_tokens_seen": 135122190, + "step": 6285, + "time_per_iteration": 2.5396463871002197 + }, + { + "auxiliary_loss_clip": 0.01046655, + "auxiliary_loss_mlp": 0.01001734, + "balance_loss_clip": 1.00758648, + "balance_loss_mlp": 1.0006789, + "epoch": 0.7558468105573258, + "flos": 62189131415040.0, + "grad_norm": 0.8499998988610321, + "language_loss": 0.65346485, + "learning_rate": 5.932419706377296e-07, + "loss": 0.67394876, + "num_input_tokens_seen": 135180495, + "step": 6286, + "time_per_iteration": 3.7938132286071777 + }, + { + "auxiliary_loss_clip": 0.01123904, + "auxiliary_loss_mlp": 0.01024752, + "balance_loss_clip": 1.04605472, + "balance_loss_mlp": 1.01731324, + "epoch": 0.7559670534479649, + "flos": 33249078823680.0, + "grad_norm": 2.2178128656164353, + "language_loss": 0.74591935, + "learning_rate": 5.92688372623666e-07, + "loss": 0.76740593, + "num_input_tokens_seen": 135199200, + "step": 6287, + "time_per_iteration": 2.6450650691986084 + }, + { + "auxiliary_loss_clip": 0.01155301, + "auxiliary_loss_mlp": 0.01023945, + "balance_loss_clip": 1.04409337, + "balance_loss_mlp": 1.01645899, + "epoch": 0.7560872963386039, + "flos": 14064379027200.0, + "grad_norm": 3.33574568077882, + "language_loss": 0.74086505, + "learning_rate": 5.921349880976574e-07, + "loss": 0.76265752, + "num_input_tokens_seen": 135217035, + "step": 6288, + "time_per_iteration": 2.5210769176483154 + }, + { + "auxiliary_loss_clip": 0.0114306, + "auxiliary_loss_mlp": 0.0076264, + "balance_loss_clip": 1.04282546, + "balance_loss_mlp": 1.00058627, + "epoch": 0.7562075392292431, + "flos": 20412307941120.0, + "grad_norm": 1.5416043294483213, + "language_loss": 0.81583273, + "learning_rate": 5.915818171436515e-07, + "loss": 0.83488977, + "num_input_tokens_seen": 135236370, + "step": 6289, + "time_per_iteration": 2.495403289794922 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.04059434, + "balance_loss_mlp": 1.01987481, + "epoch": 0.7563277821198822, + "flos": 20376792368640.0, + "grad_norm": 1.5861205483324963, + "language_loss": 0.74637425, + "learning_rate": 5.910288598455642e-07, + "loss": 0.76803637, + "num_input_tokens_seen": 135255720, + "step": 6290, + "time_per_iteration": 2.5029208660125732 + }, + { + "auxiliary_loss_clip": 0.0116031, + "auxiliary_loss_mlp": 0.01028879, + "balance_loss_clip": 1.04658735, + "balance_loss_mlp": 1.02117527, + "epoch": 0.7564480250105212, + "flos": 18588261438720.0, + "grad_norm": 2.0839807590812094, + "language_loss": 0.74114895, + "learning_rate": 5.90476116287278e-07, + "loss": 0.76304084, + "num_input_tokens_seen": 135273320, + "step": 6291, + "time_per_iteration": 2.4457147121429443 + }, + { + "auxiliary_loss_clip": 0.01142777, + "auxiliary_loss_mlp": 0.01026932, + "balance_loss_clip": 1.04839969, + "balance_loss_mlp": 1.01971388, + "epoch": 0.7565682679011604, + "flos": 21215521918080.0, + "grad_norm": 1.885405717399406, + "language_loss": 0.68089378, + "learning_rate": 5.899235865526456e-07, + "loss": 0.70259094, + "num_input_tokens_seen": 135292615, + "step": 6292, + "time_per_iteration": 2.5098037719726562 + }, + { + "auxiliary_loss_clip": 0.01118906, + "auxiliary_loss_mlp": 0.01024572, + "balance_loss_clip": 1.04205251, + "balance_loss_mlp": 1.01791668, + "epoch": 0.7566885107917994, + "flos": 20449008662400.0, + "grad_norm": 1.6858943577023016, + "language_loss": 0.82023436, + "learning_rate": 5.893712707254825e-07, + "loss": 0.84166908, + "num_input_tokens_seen": 135310075, + "step": 6293, + "time_per_iteration": 2.512664318084717 + }, + { + "auxiliary_loss_clip": 0.01107937, + "auxiliary_loss_mlp": 0.01021129, + "balance_loss_clip": 1.03950679, + "balance_loss_mlp": 1.01315355, + "epoch": 0.7568087536824385, + "flos": 19025832919680.0, + "grad_norm": 2.463499473864732, + "language_loss": 0.65890312, + "learning_rate": 5.888191688895769e-07, + "loss": 0.68019378, + "num_input_tokens_seen": 135327335, + "step": 6294, + "time_per_iteration": 2.589416027069092 + }, + { + "auxiliary_loss_clip": 0.01169773, + "auxiliary_loss_mlp": 0.01028596, + "balance_loss_clip": 1.04610693, + "balance_loss_mlp": 1.02047181, + "epoch": 0.7569289965730777, + "flos": 15225442248960.0, + "grad_norm": 2.2538741517382133, + "language_loss": 0.62305826, + "learning_rate": 5.882672811286813e-07, + "loss": 0.64504194, + "num_input_tokens_seen": 135343615, + "step": 6295, + "time_per_iteration": 2.3943371772766113 + }, + { + "auxiliary_loss_clip": 0.01170618, + "auxiliary_loss_mlp": 0.01026407, + "balance_loss_clip": 1.04712653, + "balance_loss_mlp": 1.01891422, + "epoch": 0.7570492394637167, + "flos": 20769367086720.0, + "grad_norm": 2.0659363841012555, + "language_loss": 0.69808871, + "learning_rate": 5.877156075265166e-07, + "loss": 0.72005892, + "num_input_tokens_seen": 135359880, + "step": 6296, + "time_per_iteration": 2.418064594268799 + }, + { + "auxiliary_loss_clip": 0.0113932, + "auxiliary_loss_mlp": 0.01021737, + "balance_loss_clip": 1.04273987, + "balance_loss_mlp": 1.01401782, + "epoch": 0.7571694823543558, + "flos": 15664091137920.0, + "grad_norm": 2.9152037913952324, + "language_loss": 0.68935966, + "learning_rate": 5.871641481667715e-07, + "loss": 0.71097022, + "num_input_tokens_seen": 135374325, + "step": 6297, + "time_per_iteration": 2.4547274112701416 + }, + { + "auxiliary_loss_clip": 0.0111527, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.04250121, + "balance_loss_mlp": 1.02222371, + "epoch": 0.7572897252449949, + "flos": 25409241492480.0, + "grad_norm": 1.9152070967078865, + "language_loss": 0.84243155, + "learning_rate": 5.866129031331011e-07, + "loss": 0.86388469, + "num_input_tokens_seen": 135393980, + "step": 6298, + "time_per_iteration": 2.601121664047241 + }, + { + "auxiliary_loss_clip": 0.01142338, + "auxiliary_loss_mlp": 0.0102285, + "balance_loss_clip": 1.04365897, + "balance_loss_mlp": 1.0153935, + "epoch": 0.757409968135634, + "flos": 24279348297600.0, + "grad_norm": 2.320615453610574, + "language_loss": 0.83269393, + "learning_rate": 5.8606187250913e-07, + "loss": 0.8543458, + "num_input_tokens_seen": 135412030, + "step": 6299, + "time_per_iteration": 2.5320518016815186 + }, + { + "auxiliary_loss_clip": 0.01154884, + "auxiliary_loss_mlp": 0.0076215, + "balance_loss_clip": 1.04841471, + "balance_loss_mlp": 1.00059211, + "epoch": 0.757530211026273, + "flos": 24133766474880.0, + "grad_norm": 1.848613128607865, + "language_loss": 0.83983147, + "learning_rate": 5.855110563784482e-07, + "loss": 0.85900182, + "num_input_tokens_seen": 135430565, + "step": 6300, + "time_per_iteration": 2.485365390777588 + }, + { + "auxiliary_loss_clip": 0.01149828, + "auxiliary_loss_mlp": 0.00761923, + "balance_loss_clip": 1.04436111, + "balance_loss_mlp": 1.00057971, + "epoch": 0.7576504539169122, + "flos": 23951807153280.0, + "grad_norm": 1.870662107925895, + "language_loss": 0.63992369, + "learning_rate": 5.849604548246156e-07, + "loss": 0.65904111, + "num_input_tokens_seen": 135451675, + "step": 6301, + "time_per_iteration": 2.494382858276367 + }, + { + "auxiliary_loss_clip": 0.01147796, + "auxiliary_loss_mlp": 0.007621, + "balance_loss_clip": 1.04754436, + "balance_loss_mlp": 1.00054955, + "epoch": 0.7577706968075513, + "flos": 21251360712960.0, + "grad_norm": 9.147263464952937, + "language_loss": 0.80260181, + "learning_rate": 5.844100679311565e-07, + "loss": 0.82170081, + "num_input_tokens_seen": 135470635, + "step": 6302, + "time_per_iteration": 2.505312442779541 + }, + { + "auxiliary_loss_clip": 0.01143956, + "auxiliary_loss_mlp": 0.01023113, + "balance_loss_clip": 1.04785752, + "balance_loss_mlp": 1.01513171, + "epoch": 0.7578909396981903, + "flos": 18296595002880.0, + "grad_norm": 1.989015299949718, + "language_loss": 0.76351285, + "learning_rate": 5.838598957815637e-07, + "loss": 0.78518355, + "num_input_tokens_seen": 135487865, + "step": 6303, + "time_per_iteration": 2.4620368480682373 + }, + { + "auxiliary_loss_clip": 0.01135226, + "auxiliary_loss_mlp": 0.01021571, + "balance_loss_clip": 1.04309106, + "balance_loss_mlp": 1.01456749, + "epoch": 0.7580111825888295, + "flos": 25373869574400.0, + "grad_norm": 2.236389830285023, + "language_loss": 0.85356951, + "learning_rate": 5.833099384592996e-07, + "loss": 0.87513745, + "num_input_tokens_seen": 135508440, + "step": 6304, + "time_per_iteration": 2.5292460918426514 + }, + { + "auxiliary_loss_clip": 0.01137133, + "auxiliary_loss_mlp": 0.01026744, + "balance_loss_clip": 1.04443705, + "balance_loss_mlp": 1.01903152, + "epoch": 0.7581314254794685, + "flos": 23768662682880.0, + "grad_norm": 1.9767833196326294, + "language_loss": 0.71485174, + "learning_rate": 5.827601960477913e-07, + "loss": 0.73649055, + "num_input_tokens_seen": 135526365, + "step": 6305, + "time_per_iteration": 2.4888572692871094 + }, + { + "auxiliary_loss_clip": 0.01151554, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_clip": 1.04416871, + "balance_loss_mlp": 1.02083588, + "epoch": 0.7582516683701076, + "flos": 22054610603520.0, + "grad_norm": 2.58406951419987, + "language_loss": 0.70272404, + "learning_rate": 5.822106686304344e-07, + "loss": 0.72451317, + "num_input_tokens_seen": 135545655, + "step": 6306, + "time_per_iteration": 2.4547324180603027 + }, + { + "auxiliary_loss_clip": 0.01134193, + "auxiliary_loss_mlp": 0.01027216, + "balance_loss_clip": 1.044011, + "balance_loss_mlp": 1.01989663, + "epoch": 0.7583719112607467, + "flos": 31649725848960.0, + "grad_norm": 1.724943167289728, + "language_loss": 0.58066249, + "learning_rate": 5.816613562905919e-07, + "loss": 0.60227656, + "num_input_tokens_seen": 135566840, + "step": 6307, + "time_per_iteration": 3.4117913246154785 + }, + { + "auxiliary_loss_clip": 0.01122986, + "auxiliary_loss_mlp": 0.01026784, + "balance_loss_clip": 1.04601038, + "balance_loss_mlp": 1.01927352, + "epoch": 0.7584921541513858, + "flos": 33068376478080.0, + "grad_norm": 1.6130052197821638, + "language_loss": 0.69744813, + "learning_rate": 5.811122591115933e-07, + "loss": 0.7189458, + "num_input_tokens_seen": 135587825, + "step": 6308, + "time_per_iteration": 3.369980573654175 + }, + { + "auxiliary_loss_clip": 0.01126879, + "auxiliary_loss_mlp": 0.010282, + "balance_loss_clip": 1.04709113, + "balance_loss_mlp": 1.0208776, + "epoch": 0.7586123970420249, + "flos": 23326350606720.0, + "grad_norm": 4.339822383519919, + "language_loss": 0.71260762, + "learning_rate": 5.805633771767376e-07, + "loss": 0.7341584, + "num_input_tokens_seen": 135605220, + "step": 6309, + "time_per_iteration": 3.2872767448425293 + }, + { + "auxiliary_loss_clip": 0.01139003, + "auxiliary_loss_mlp": 0.01026086, + "balance_loss_clip": 1.04620707, + "balance_loss_mlp": 1.01790237, + "epoch": 0.7587326399326639, + "flos": 18334229477760.0, + "grad_norm": 1.847398567426598, + "language_loss": 0.7756831, + "learning_rate": 5.800147105692888e-07, + "loss": 0.79733396, + "num_input_tokens_seen": 135624795, + "step": 6310, + "time_per_iteration": 2.5124106407165527 + }, + { + "auxiliary_loss_clip": 0.01154178, + "auxiliary_loss_mlp": 0.01023956, + "balance_loss_clip": 1.04330897, + "balance_loss_mlp": 1.01673806, + "epoch": 0.7588528828233031, + "flos": 17275080119040.0, + "grad_norm": 1.6970580748022395, + "language_loss": 0.79120946, + "learning_rate": 5.794662593724795e-07, + "loss": 0.81299078, + "num_input_tokens_seen": 135643800, + "step": 6311, + "time_per_iteration": 2.482511520385742 + }, + { + "auxiliary_loss_clip": 0.0117252, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.05030334, + "balance_loss_mlp": 1.02447593, + "epoch": 0.7589731257139422, + "flos": 17713621267200.0, + "grad_norm": 1.9077484718751967, + "language_loss": 0.75022948, + "learning_rate": 5.789180236695091e-07, + "loss": 0.77227449, + "num_input_tokens_seen": 135660655, + "step": 6312, + "time_per_iteration": 3.1923696994781494 + }, + { + "auxiliary_loss_clip": 0.01149428, + "auxiliary_loss_mlp": 0.01027666, + "balance_loss_clip": 1.04586101, + "balance_loss_mlp": 1.02101374, + "epoch": 0.7590933686045812, + "flos": 15961072786560.0, + "grad_norm": 1.940281505416938, + "language_loss": 0.85054743, + "learning_rate": 5.78370003543544e-07, + "loss": 0.87231839, + "num_input_tokens_seen": 135679410, + "step": 6313, + "time_per_iteration": 2.447624444961548 + }, + { + "auxiliary_loss_clip": 0.0115526, + "auxiliary_loss_mlp": 0.00762172, + "balance_loss_clip": 1.04658747, + "balance_loss_mlp": 1.00059414, + "epoch": 0.7592136114952204, + "flos": 21068072588160.0, + "grad_norm": 2.255498125189362, + "language_loss": 0.83700252, + "learning_rate": 5.778221990777203e-07, + "loss": 0.85617685, + "num_input_tokens_seen": 135697150, + "step": 6314, + "time_per_iteration": 2.4623754024505615 + }, + { + "auxiliary_loss_clip": 0.01146095, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.04846084, + "balance_loss_mlp": 1.02194667, + "epoch": 0.7593338543858594, + "flos": 25297666871040.0, + "grad_norm": 2.5583358462413015, + "language_loss": 0.83030117, + "learning_rate": 5.772746103551372e-07, + "loss": 0.85205841, + "num_input_tokens_seen": 135712545, + "step": 6315, + "time_per_iteration": 2.502530336380005 + }, + { + "auxiliary_loss_clip": 0.01137835, + "auxiliary_loss_mlp": 0.01023293, + "balance_loss_clip": 1.04533434, + "balance_loss_mlp": 1.01566935, + "epoch": 0.7594540972764985, + "flos": 31832367528960.0, + "grad_norm": 2.34017184615063, + "language_loss": 0.71572715, + "learning_rate": 5.767272374588648e-07, + "loss": 0.73733842, + "num_input_tokens_seen": 135733950, + "step": 6316, + "time_per_iteration": 2.5764920711517334 + }, + { + "auxiliary_loss_clip": 0.0115465, + "auxiliary_loss_mlp": 0.01024358, + "balance_loss_clip": 1.04880214, + "balance_loss_mlp": 1.01683617, + "epoch": 0.7595743401671377, + "flos": 37597250880000.0, + "grad_norm": 1.7223050653269396, + "language_loss": 0.7791183, + "learning_rate": 5.76180080471939e-07, + "loss": 0.80090839, + "num_input_tokens_seen": 135757120, + "step": 6317, + "time_per_iteration": 2.6037003993988037 + }, + { + "auxiliary_loss_clip": 0.01174792, + "auxiliary_loss_mlp": 0.01025913, + "balance_loss_clip": 1.04908228, + "balance_loss_mlp": 1.01803088, + "epoch": 0.7596945830577767, + "flos": 18287724343680.0, + "grad_norm": 3.666441027209607, + "language_loss": 0.71983683, + "learning_rate": 5.756331394773631e-07, + "loss": 0.74184382, + "num_input_tokens_seen": 135773335, + "step": 6318, + "time_per_iteration": 2.395989179611206 + }, + { + "auxiliary_loss_clip": 0.01096613, + "auxiliary_loss_mlp": 0.00762398, + "balance_loss_clip": 1.03832686, + "balance_loss_mlp": 1.00046885, + "epoch": 0.7598148259484158, + "flos": 22233122219520.0, + "grad_norm": 2.0495725489878014, + "language_loss": 0.75873524, + "learning_rate": 5.750864145581071e-07, + "loss": 0.77732539, + "num_input_tokens_seen": 135792555, + "step": 6319, + "time_per_iteration": 2.615859270095825 + }, + { + "auxiliary_loss_clip": 0.01169321, + "auxiliary_loss_mlp": 0.01025259, + "balance_loss_clip": 1.04950237, + "balance_loss_mlp": 1.01810634, + "epoch": 0.7599350688390549, + "flos": 27161718145920.0, + "grad_norm": 1.9686419332690674, + "language_loss": 0.86262321, + "learning_rate": 5.745399057971085e-07, + "loss": 0.88456899, + "num_input_tokens_seen": 135813690, + "step": 6320, + "time_per_iteration": 2.5424633026123047 + }, + { + "auxiliary_loss_clip": 0.0115847, + "auxiliary_loss_mlp": 0.01025831, + "balance_loss_clip": 1.0463419, + "balance_loss_mlp": 1.01866031, + "epoch": 0.760055311729694, + "flos": 15560704817280.0, + "grad_norm": 2.173434925161073, + "language_loss": 0.74934995, + "learning_rate": 5.739936132772738e-07, + "loss": 0.77119303, + "num_input_tokens_seen": 135832255, + "step": 6321, + "time_per_iteration": 2.4456357955932617 + }, + { + "auxiliary_loss_clip": 0.01167355, + "auxiliary_loss_mlp": 0.01025783, + "balance_loss_clip": 1.04581356, + "balance_loss_mlp": 1.01808536, + "epoch": 0.760175554620333, + "flos": 25155496840320.0, + "grad_norm": 4.102588492780538, + "language_loss": 0.73982507, + "learning_rate": 5.734475370814733e-07, + "loss": 0.76175648, + "num_input_tokens_seen": 135851935, + "step": 6322, + "time_per_iteration": 2.454249620437622 + }, + { + "auxiliary_loss_clip": 0.01155789, + "auxiliary_loss_mlp": 0.01020537, + "balance_loss_clip": 1.04421782, + "balance_loss_mlp": 1.01343787, + "epoch": 0.7602957975109722, + "flos": 24353791234560.0, + "grad_norm": 1.591009491429818, + "language_loss": 0.78497624, + "learning_rate": 5.729016772925483e-07, + "loss": 0.80673945, + "num_input_tokens_seen": 135873510, + "step": 6323, + "time_per_iteration": 2.508070945739746 + }, + { + "auxiliary_loss_clip": 0.01107697, + "auxiliary_loss_mlp": 0.01022454, + "balance_loss_clip": 1.04256368, + "balance_loss_mlp": 1.01511121, + "epoch": 0.7604160404016113, + "flos": 25192664438400.0, + "grad_norm": 1.803000573417526, + "language_loss": 0.70999444, + "learning_rate": 5.723560339933038e-07, + "loss": 0.731296, + "num_input_tokens_seen": 135893845, + "step": 6324, + "time_per_iteration": 2.6169333457946777 + }, + { + "auxiliary_loss_clip": 0.01151808, + "auxiliary_loss_mlp": 0.0076196, + "balance_loss_clip": 1.04448676, + "balance_loss_mlp": 1.00059712, + "epoch": 0.7605362832922503, + "flos": 29861841363840.0, + "grad_norm": 2.1463295342428013, + "language_loss": 0.65043336, + "learning_rate": 5.71810607266513e-07, + "loss": 0.66957104, + "num_input_tokens_seen": 135912430, + "step": 6325, + "time_per_iteration": 2.5198965072631836 + }, + { + "auxiliary_loss_clip": 0.01155107, + "auxiliary_loss_mlp": 0.01023013, + "balance_loss_clip": 1.0449307, + "balance_loss_mlp": 1.01578903, + "epoch": 0.7606565261828895, + "flos": 13917935278080.0, + "grad_norm": 1.9296476404486467, + "language_loss": 0.60404843, + "learning_rate": 5.712653971949184e-07, + "loss": 0.62582964, + "num_input_tokens_seen": 135930550, + "step": 6326, + "time_per_iteration": 2.439866781234741 + }, + { + "auxiliary_loss_clip": 0.01148552, + "auxiliary_loss_mlp": 0.01022183, + "balance_loss_clip": 1.04376864, + "balance_loss_mlp": 1.01461887, + "epoch": 0.7607767690735285, + "flos": 18551273408640.0, + "grad_norm": 2.9861720854091987, + "language_loss": 0.74991786, + "learning_rate": 5.707204038612268e-07, + "loss": 0.77162522, + "num_input_tokens_seen": 135947980, + "step": 6327, + "time_per_iteration": 2.430582284927368 + }, + { + "auxiliary_loss_clip": 0.01151499, + "auxiliary_loss_mlp": 0.01027088, + "balance_loss_clip": 1.05117095, + "balance_loss_mlp": 1.0191431, + "epoch": 0.7608970119641676, + "flos": 20922993555840.0, + "grad_norm": 2.31294870599605, + "language_loss": 0.73360682, + "learning_rate": 5.701756273481138e-07, + "loss": 0.75539267, + "num_input_tokens_seen": 135965400, + "step": 6328, + "time_per_iteration": 2.5004220008850098 + }, + { + "auxiliary_loss_clip": 0.01144615, + "auxiliary_loss_mlp": 0.01022887, + "balance_loss_clip": 1.04416275, + "balance_loss_mlp": 1.0157702, + "epoch": 0.7610172548548068, + "flos": 23807302738560.0, + "grad_norm": 1.897653673215076, + "language_loss": 0.73906457, + "learning_rate": 5.696310677382212e-07, + "loss": 0.76073962, + "num_input_tokens_seen": 135986795, + "step": 6329, + "time_per_iteration": 2.524103879928589 + }, + { + "auxiliary_loss_clip": 0.01032273, + "auxiliary_loss_mlp": 0.01002494, + "balance_loss_clip": 1.00987768, + "balance_loss_mlp": 1.00154066, + "epoch": 0.7611374977454458, + "flos": 66496580426880.0, + "grad_norm": 0.8694970022158176, + "language_loss": 0.61805046, + "learning_rate": 5.690867251141576e-07, + "loss": 0.63839817, + "num_input_tokens_seen": 136053450, + "step": 6330, + "time_per_iteration": 3.2302463054656982 + }, + { + "auxiliary_loss_clip": 0.01161368, + "auxiliary_loss_mlp": 0.01023927, + "balance_loss_clip": 1.04594278, + "balance_loss_mlp": 1.01659536, + "epoch": 0.7612577406360849, + "flos": 15633136592640.0, + "grad_norm": 2.6216009036851724, + "language_loss": 0.91587877, + "learning_rate": 5.685425995585013e-07, + "loss": 0.93773174, + "num_input_tokens_seen": 136071375, + "step": 6331, + "time_per_iteration": 2.478990316390991 + }, + { + "auxiliary_loss_clip": 0.01048073, + "auxiliary_loss_mlp": 0.01001441, + "balance_loss_clip": 1.00909221, + "balance_loss_mlp": 1.00040364, + "epoch": 0.761377983526724, + "flos": 60526253237760.0, + "grad_norm": 1.2375494915441498, + "language_loss": 0.59074855, + "learning_rate": 5.679986911537935e-07, + "loss": 0.61124372, + "num_input_tokens_seen": 136138905, + "step": 6332, + "time_per_iteration": 3.2022109031677246 + }, + { + "auxiliary_loss_clip": 0.01099865, + "auxiliary_loss_mlp": 0.01020709, + "balance_loss_clip": 1.04103446, + "balance_loss_mlp": 1.01355386, + "epoch": 0.7614982264173631, + "flos": 35772522019200.0, + "grad_norm": 1.8667656342329448, + "language_loss": 0.67326987, + "learning_rate": 5.674549999825462e-07, + "loss": 0.69447559, + "num_input_tokens_seen": 136161720, + "step": 6333, + "time_per_iteration": 3.398524045944214 + }, + { + "auxiliary_loss_clip": 0.01056691, + "auxiliary_loss_mlp": 0.01001855, + "balance_loss_clip": 1.00826645, + "balance_loss_mlp": 1.00090694, + "epoch": 0.7616184693080021, + "flos": 67925502345600.0, + "grad_norm": 0.9211682355644588, + "language_loss": 0.71415091, + "learning_rate": 5.669115261272363e-07, + "loss": 0.73473632, + "num_input_tokens_seen": 136222040, + "step": 6334, + "time_per_iteration": 3.8353819847106934 + }, + { + "auxiliary_loss_clip": 0.01155375, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.04628992, + "balance_loss_mlp": 1.0227406, + "epoch": 0.7617387121986413, + "flos": 20521979141760.0, + "grad_norm": 2.181461894291309, + "language_loss": 0.72820938, + "learning_rate": 5.663682696703081e-07, + "loss": 0.75006545, + "num_input_tokens_seen": 136240305, + "step": 6335, + "time_per_iteration": 3.216170072555542 + }, + { + "auxiliary_loss_clip": 0.01167209, + "auxiliary_loss_mlp": 0.01022233, + "balance_loss_clip": 1.04806221, + "balance_loss_mlp": 1.01505089, + "epoch": 0.7618589550892804, + "flos": 18624495283200.0, + "grad_norm": 1.8891046103649543, + "language_loss": 0.82048464, + "learning_rate": 5.658252306941746e-07, + "loss": 0.84237897, + "num_input_tokens_seen": 136259625, + "step": 6336, + "time_per_iteration": 2.4115939140319824 + }, + { + "auxiliary_loss_clip": 0.01114918, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.04335618, + "balance_loss_mlp": 1.02129006, + "epoch": 0.7619791979799194, + "flos": 17453735389440.0, + "grad_norm": 2.053138464776332, + "language_loss": 0.75406629, + "learning_rate": 5.65282409281212e-07, + "loss": 0.77550972, + "num_input_tokens_seen": 136277090, + "step": 6337, + "time_per_iteration": 2.550105333328247 + }, + { + "auxiliary_loss_clip": 0.0113868, + "auxiliary_loss_mlp": 0.0102526, + "balance_loss_clip": 1.04399335, + "balance_loss_mlp": 1.01807213, + "epoch": 0.7620994408705585, + "flos": 14137421333760.0, + "grad_norm": 2.046838026228264, + "language_loss": 0.70083398, + "learning_rate": 5.64739805513768e-07, + "loss": 0.72247338, + "num_input_tokens_seen": 136294635, + "step": 6338, + "time_per_iteration": 2.4688751697540283 + }, + { + "auxiliary_loss_clip": 0.0105126, + "auxiliary_loss_mlp": 0.00752846, + "balance_loss_clip": 1.00811601, + "balance_loss_mlp": 1.00059891, + "epoch": 0.7622196837611976, + "flos": 70708792527360.0, + "grad_norm": 0.786857805059053, + "language_loss": 0.55660808, + "learning_rate": 5.641974194741541e-07, + "loss": 0.5746491, + "num_input_tokens_seen": 136350320, + "step": 6339, + "time_per_iteration": 3.6590216159820557 + }, + { + "auxiliary_loss_clip": 0.01041946, + "auxiliary_loss_mlp": 0.01005043, + "balance_loss_clip": 1.01841426, + "balance_loss_mlp": 1.00388718, + "epoch": 0.7623399266518367, + "flos": 60684150447360.0, + "grad_norm": 0.7763285417705972, + "language_loss": 0.63783216, + "learning_rate": 5.636552512446502e-07, + "loss": 0.65830207, + "num_input_tokens_seen": 136411375, + "step": 6340, + "time_per_iteration": 2.9965641498565674 + }, + { + "auxiliary_loss_clip": 0.01149746, + "auxiliary_loss_mlp": 0.01024471, + "balance_loss_clip": 1.04487669, + "balance_loss_mlp": 1.01714575, + "epoch": 0.7624601695424758, + "flos": 26468893641600.0, + "grad_norm": 1.8421563548929643, + "language_loss": 0.7784456, + "learning_rate": 5.631133009075027e-07, + "loss": 0.80018783, + "num_input_tokens_seen": 136430560, + "step": 6341, + "time_per_iteration": 2.4966907501220703 + }, + { + "auxiliary_loss_clip": 0.01155211, + "auxiliary_loss_mlp": 0.00761792, + "balance_loss_clip": 1.04654002, + "balance_loss_mlp": 1.0006268, + "epoch": 0.7625804124331149, + "flos": 19135755515520.0, + "grad_norm": 2.09335453865575, + "language_loss": 0.68449318, + "learning_rate": 5.625715685449242e-07, + "loss": 0.70366323, + "num_input_tokens_seen": 136448665, + "step": 6342, + "time_per_iteration": 2.500884771347046 + }, + { + "auxiliary_loss_clip": 0.01125499, + "auxiliary_loss_mlp": 0.01024911, + "balance_loss_clip": 1.04750884, + "balance_loss_mlp": 1.01794899, + "epoch": 0.762700655323754, + "flos": 26213101914240.0, + "grad_norm": 4.835845594372766, + "language_loss": 0.71488738, + "learning_rate": 5.620300542390966e-07, + "loss": 0.73639154, + "num_input_tokens_seen": 136469710, + "step": 6343, + "time_per_iteration": 2.5622620582580566 + }, + { + "auxiliary_loss_clip": 0.01135847, + "auxiliary_loss_mlp": 0.01024327, + "balance_loss_clip": 1.04167593, + "balance_loss_mlp": 1.01773453, + "epoch": 0.762820898214393, + "flos": 22382582711040.0, + "grad_norm": 1.930044796128457, + "language_loss": 0.85304439, + "learning_rate": 5.614887580721659e-07, + "loss": 0.87464613, + "num_input_tokens_seen": 136489855, + "step": 6344, + "time_per_iteration": 2.4953505992889404 + }, + { + "auxiliary_loss_clip": 0.01119752, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.04561591, + "balance_loss_mlp": 1.02061701, + "epoch": 0.7629411411050322, + "flos": 15700504550400.0, + "grad_norm": 1.998404879519403, + "language_loss": 0.74165404, + "learning_rate": 5.609476801262481e-07, + "loss": 0.76313841, + "num_input_tokens_seen": 136504715, + "step": 6345, + "time_per_iteration": 2.48455548286438 + }, + { + "auxiliary_loss_clip": 0.01125276, + "auxiliary_loss_mlp": 0.01027378, + "balance_loss_clip": 1.04634643, + "balance_loss_mlp": 1.02002835, + "epoch": 0.7630613839956712, + "flos": 13770342293760.0, + "grad_norm": 5.894231982273964, + "language_loss": 0.63826036, + "learning_rate": 5.604068204834223e-07, + "loss": 0.65978688, + "num_input_tokens_seen": 136521610, + "step": 6346, + "time_per_iteration": 2.4944279193878174 + }, + { + "auxiliary_loss_clip": 0.01110379, + "auxiliary_loss_mlp": 0.00762459, + "balance_loss_clip": 1.04284716, + "balance_loss_mlp": 1.00052667, + "epoch": 0.7631816268863103, + "flos": 14569569861120.0, + "grad_norm": 4.741480496331495, + "language_loss": 0.76590765, + "learning_rate": 5.598661792257367e-07, + "loss": 0.78463602, + "num_input_tokens_seen": 136538655, + "step": 6347, + "time_per_iteration": 2.5436508655548096 + }, + { + "auxiliary_loss_clip": 0.01152443, + "auxiliary_loss_mlp": 0.01023118, + "balance_loss_clip": 1.04474461, + "balance_loss_mlp": 1.01607299, + "epoch": 0.7633018697769495, + "flos": 19062210418560.0, + "grad_norm": 1.9631967184579988, + "language_loss": 0.75796473, + "learning_rate": 5.593257564352071e-07, + "loss": 0.77972031, + "num_input_tokens_seen": 136557095, + "step": 6348, + "time_per_iteration": 2.456549882888794 + }, + { + "auxiliary_loss_clip": 0.01151949, + "auxiliary_loss_mlp": 0.01019398, + "balance_loss_clip": 1.04540241, + "balance_loss_mlp": 1.01246047, + "epoch": 0.7634221126675885, + "flos": 22052958577920.0, + "grad_norm": 1.4551165533173533, + "language_loss": 0.75429296, + "learning_rate": 5.58785552193815e-07, + "loss": 0.7760064, + "num_input_tokens_seen": 136577340, + "step": 6349, + "time_per_iteration": 2.4659714698791504 + }, + { + "auxiliary_loss_clip": 0.01169088, + "auxiliary_loss_mlp": 0.01022112, + "balance_loss_clip": 1.04759395, + "balance_loss_mlp": 1.014956, + "epoch": 0.7635423555582276, + "flos": 29382720825600.0, + "grad_norm": 1.9594767186056827, + "language_loss": 0.75725579, + "learning_rate": 5.582455665835086e-07, + "loss": 0.77916777, + "num_input_tokens_seen": 136597635, + "step": 6350, + "time_per_iteration": 2.4982845783233643 + }, + { + "auxiliary_loss_clip": 0.01149732, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.04342914, + "balance_loss_mlp": 1.02462554, + "epoch": 0.7636625984488667, + "flos": 17784903807360.0, + "grad_norm": 2.8871986818944646, + "language_loss": 0.72955096, + "learning_rate": 5.577057996862036e-07, + "loss": 0.75137919, + "num_input_tokens_seen": 136615260, + "step": 6351, + "time_per_iteration": 2.470168113708496 + }, + { + "auxiliary_loss_clip": 0.01165255, + "auxiliary_loss_mlp": 0.01025383, + "balance_loss_clip": 1.04686201, + "balance_loss_mlp": 1.01846862, + "epoch": 0.7637828413395058, + "flos": 23734583654400.0, + "grad_norm": 1.8264672695433208, + "language_loss": 0.76179707, + "learning_rate": 5.571662515837814e-07, + "loss": 0.78370345, + "num_input_tokens_seen": 136637220, + "step": 6352, + "time_per_iteration": 2.462733745574951 + }, + { + "auxiliary_loss_clip": 0.01139685, + "auxiliary_loss_mlp": 0.01024409, + "balance_loss_clip": 1.04488325, + "balance_loss_mlp": 1.01716137, + "epoch": 0.7639030842301449, + "flos": 36283279461120.0, + "grad_norm": 1.6586205668701606, + "language_loss": 0.8367337, + "learning_rate": 5.566269223580926e-07, + "loss": 0.85837471, + "num_input_tokens_seen": 136658930, + "step": 6353, + "time_per_iteration": 2.609872341156006 + }, + { + "auxiliary_loss_clip": 0.01157896, + "auxiliary_loss_mlp": 0.0102316, + "balance_loss_clip": 1.047261, + "balance_loss_mlp": 1.01604581, + "epoch": 0.764023327120784, + "flos": 28878104609280.0, + "grad_norm": 1.723827088578598, + "language_loss": 0.75236511, + "learning_rate": 5.560878120909511e-07, + "loss": 0.77417564, + "num_input_tokens_seen": 136681530, + "step": 6354, + "time_per_iteration": 2.517516851425171 + }, + { + "auxiliary_loss_clip": 0.01057872, + "auxiliary_loss_mlp": 0.01000173, + "balance_loss_clip": 1.00945044, + "balance_loss_mlp": 0.99923736, + "epoch": 0.7641435700114231, + "flos": 64789711067520.0, + "grad_norm": 0.8467066650089945, + "language_loss": 0.58589637, + "learning_rate": 5.55548920864141e-07, + "loss": 0.60647684, + "num_input_tokens_seen": 136742185, + "step": 6355, + "time_per_iteration": 3.066352128982544 + }, + { + "auxiliary_loss_clip": 0.01156768, + "auxiliary_loss_mlp": 0.01022376, + "balance_loss_clip": 1.0505805, + "balance_loss_mlp": 1.01580787, + "epoch": 0.7642638129020621, + "flos": 16835784785280.0, + "grad_norm": 1.9690621117607898, + "language_loss": 0.77900028, + "learning_rate": 5.550102487594113e-07, + "loss": 0.80079174, + "num_input_tokens_seen": 136760855, + "step": 6356, + "time_per_iteration": 2.4794204235076904 + }, + { + "auxiliary_loss_clip": 0.01116639, + "auxiliary_loss_mlp": 0.00761906, + "balance_loss_clip": 1.04074466, + "balance_loss_mlp": 1.00054359, + "epoch": 0.7643840557927013, + "flos": 30408940391040.0, + "grad_norm": 1.4811944013932326, + "language_loss": 0.71425873, + "learning_rate": 5.54471795858477e-07, + "loss": 0.73304415, + "num_input_tokens_seen": 136780925, + "step": 6357, + "time_per_iteration": 2.647082805633545 + }, + { + "auxiliary_loss_clip": 0.01125165, + "auxiliary_loss_mlp": 0.01025748, + "balance_loss_clip": 1.040061, + "balance_loss_mlp": 1.01841021, + "epoch": 0.7645042986833404, + "flos": 16983234115200.0, + "grad_norm": 1.8238707635301954, + "language_loss": 0.82975042, + "learning_rate": 5.539335622430235e-07, + "loss": 0.85125959, + "num_input_tokens_seen": 136799545, + "step": 6358, + "time_per_iteration": 2.524460792541504 + }, + { + "auxiliary_loss_clip": 0.01147474, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.04292655, + "balance_loss_mlp": 1.02005827, + "epoch": 0.7646245415739794, + "flos": 17311493531520.0, + "grad_norm": 1.9199227077053431, + "language_loss": 0.74436104, + "learning_rate": 5.533955479946975e-07, + "loss": 0.76611698, + "num_input_tokens_seen": 136818325, + "step": 6359, + "time_per_iteration": 2.430335521697998 + }, + { + "auxiliary_loss_clip": 0.01030981, + "auxiliary_loss_mlp": 0.00753773, + "balance_loss_clip": 1.01665246, + "balance_loss_mlp": 1.00045729, + "epoch": 0.7647447844646186, + "flos": 70402332666240.0, + "grad_norm": 0.8536379052605284, + "language_loss": 0.65813965, + "learning_rate": 5.528577531951173e-07, + "loss": 0.67598718, + "num_input_tokens_seen": 136878730, + "step": 6360, + "time_per_iteration": 3.8412981033325195 + }, + { + "auxiliary_loss_clip": 0.01146671, + "auxiliary_loss_mlp": 0.01025186, + "balance_loss_clip": 1.0464251, + "balance_loss_mlp": 1.01838851, + "epoch": 0.7648650273552576, + "flos": 17675914965120.0, + "grad_norm": 1.9430305813267124, + "language_loss": 0.73826402, + "learning_rate": 5.523201779258653e-07, + "loss": 0.75998259, + "num_input_tokens_seen": 136897705, + "step": 6361, + "time_per_iteration": 3.2424869537353516 + }, + { + "auxiliary_loss_clip": 0.01166401, + "auxiliary_loss_mlp": 0.01022264, + "balance_loss_clip": 1.04543078, + "balance_loss_mlp": 1.01468229, + "epoch": 0.7649852702458967, + "flos": 22162019247360.0, + "grad_norm": 1.8292006188019807, + "language_loss": 0.84192359, + "learning_rate": 5.517828222684912e-07, + "loss": 0.86381018, + "num_input_tokens_seen": 136918360, + "step": 6362, + "time_per_iteration": 3.1618411540985107 + }, + { + "auxiliary_loss_clip": 0.01043189, + "auxiliary_loss_mlp": 0.01001012, + "balance_loss_clip": 1.00942564, + "balance_loss_mlp": 0.99993938, + "epoch": 0.7651055131365359, + "flos": 69848338227840.0, + "grad_norm": 0.7669884555300981, + "language_loss": 0.59060478, + "learning_rate": 5.512456863045117e-07, + "loss": 0.61104679, + "num_input_tokens_seen": 136979050, + "step": 6363, + "time_per_iteration": 3.0695927143096924 + }, + { + "auxiliary_loss_clip": 0.01168368, + "auxiliary_loss_mlp": 0.0102474, + "balance_loss_clip": 1.04582429, + "balance_loss_mlp": 1.01731014, + "epoch": 0.7652257560271749, + "flos": 19464014931840.0, + "grad_norm": 2.746295817848069, + "language_loss": 0.73880935, + "learning_rate": 5.507087701154089e-07, + "loss": 0.7607404, + "num_input_tokens_seen": 136998970, + "step": 6364, + "time_per_iteration": 2.4305579662323 + }, + { + "auxiliary_loss_clip": 0.01114624, + "auxiliary_loss_mlp": 0.01023563, + "balance_loss_clip": 1.04164386, + "balance_loss_mlp": 1.01644933, + "epoch": 0.765345998917814, + "flos": 15961108700160.0, + "grad_norm": 2.180495000087238, + "language_loss": 0.75449073, + "learning_rate": 5.50172073782634e-07, + "loss": 0.77587265, + "num_input_tokens_seen": 137016950, + "step": 6365, + "time_per_iteration": 3.248990774154663 + }, + { + "auxiliary_loss_clip": 0.0112469, + "auxiliary_loss_mlp": 0.01027369, + "balance_loss_clip": 1.04507565, + "balance_loss_mlp": 1.02010632, + "epoch": 0.7654662418084531, + "flos": 23659853408640.0, + "grad_norm": 2.0874269829567433, + "language_loss": 0.87883461, + "learning_rate": 5.496355973876023e-07, + "loss": 0.90035522, + "num_input_tokens_seen": 137036205, + "step": 6366, + "time_per_iteration": 2.557605743408203 + }, + { + "auxiliary_loss_clip": 0.01122779, + "auxiliary_loss_mlp": 0.00762637, + "balance_loss_clip": 1.0418725, + "balance_loss_mlp": 1.00046647, + "epoch": 0.7655864846990922, + "flos": 41463608878080.0, + "grad_norm": 1.6076840399390309, + "language_loss": 0.70698464, + "learning_rate": 5.490993410116984e-07, + "loss": 0.72583884, + "num_input_tokens_seen": 137059195, + "step": 6367, + "time_per_iteration": 2.740568161010742 + }, + { + "auxiliary_loss_clip": 0.01119884, + "auxiliary_loss_mlp": 0.01027038, + "balance_loss_clip": 1.04252172, + "balance_loss_mlp": 1.01992977, + "epoch": 0.7657067275897312, + "flos": 43142684088960.0, + "grad_norm": 1.7547763329345372, + "language_loss": 0.69794786, + "learning_rate": 5.485633047362704e-07, + "loss": 0.7194171, + "num_input_tokens_seen": 137081200, + "step": 6368, + "time_per_iteration": 2.7278542518615723 + }, + { + "auxiliary_loss_clip": 0.01176197, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.05252528, + "balance_loss_mlp": 1.02227569, + "epoch": 0.7658269704803703, + "flos": 17311780840320.0, + "grad_norm": 2.2006483116191773, + "language_loss": 0.78720725, + "learning_rate": 5.480274886426341e-07, + "loss": 0.80926985, + "num_input_tokens_seen": 137097840, + "step": 6369, + "time_per_iteration": 2.4142067432403564 + }, + { + "auxiliary_loss_clip": 0.01148929, + "auxiliary_loss_mlp": 0.01022385, + "balance_loss_clip": 1.04643655, + "balance_loss_mlp": 1.0157721, + "epoch": 0.7659472133710095, + "flos": 12568160977920.0, + "grad_norm": 2.1428057546011394, + "language_loss": 0.77921456, + "learning_rate": 5.474918928120744e-07, + "loss": 0.80092776, + "num_input_tokens_seen": 137114335, + "step": 6370, + "time_per_iteration": 2.433645725250244 + }, + { + "auxiliary_loss_clip": 0.01151246, + "auxiliary_loss_mlp": 0.01020413, + "balance_loss_clip": 1.04450274, + "balance_loss_mlp": 1.01364136, + "epoch": 0.7660674562616485, + "flos": 22707430335360.0, + "grad_norm": 1.7547041490899096, + "language_loss": 0.87556696, + "learning_rate": 5.469565173258392e-07, + "loss": 0.89728355, + "num_input_tokens_seen": 137132850, + "step": 6371, + "time_per_iteration": 2.4773831367492676 + }, + { + "auxiliary_loss_clip": 0.01171191, + "auxiliary_loss_mlp": 0.01025218, + "balance_loss_clip": 1.04713535, + "balance_loss_mlp": 1.01743317, + "epoch": 0.7661876991522876, + "flos": 17056455989760.0, + "grad_norm": 1.697321908505598, + "language_loss": 0.63691163, + "learning_rate": 5.464213622651454e-07, + "loss": 0.6588757, + "num_input_tokens_seen": 137150665, + "step": 6372, + "time_per_iteration": 2.402017831802368 + }, + { + "auxiliary_loss_clip": 0.01132575, + "auxiliary_loss_mlp": 0.01024008, + "balance_loss_clip": 1.04472053, + "balance_loss_mlp": 1.01643848, + "epoch": 0.7663079420429267, + "flos": 20084228092800.0, + "grad_norm": 2.3888586256217557, + "language_loss": 0.84149647, + "learning_rate": 5.458864277111753e-07, + "loss": 0.86306226, + "num_input_tokens_seen": 137168500, + "step": 6373, + "time_per_iteration": 2.520429849624634 + }, + { + "auxiliary_loss_clip": 0.01133395, + "auxiliary_loss_mlp": 0.00761265, + "balance_loss_clip": 1.04257178, + "balance_loss_mlp": 1.00045192, + "epoch": 0.7664281849335658, + "flos": 12677473042560.0, + "grad_norm": 2.378835712163824, + "language_loss": 0.69623601, + "learning_rate": 5.453517137450769e-07, + "loss": 0.71518266, + "num_input_tokens_seen": 137185075, + "step": 6374, + "time_per_iteration": 2.4736506938934326 + }, + { + "auxiliary_loss_clip": 0.01154076, + "auxiliary_loss_mlp": 0.01025473, + "balance_loss_clip": 1.0482254, + "balance_loss_mlp": 1.01796889, + "epoch": 0.7665484278242048, + "flos": 22345271458560.0, + "grad_norm": 4.264322032776026, + "language_loss": 0.7604171, + "learning_rate": 5.448172204479684e-07, + "loss": 0.78221262, + "num_input_tokens_seen": 137204355, + "step": 6375, + "time_per_iteration": 2.5074098110198975 + }, + { + "auxiliary_loss_clip": 0.01164679, + "auxiliary_loss_mlp": 0.01022613, + "balance_loss_clip": 1.04495692, + "balance_loss_mlp": 1.01532388, + "epoch": 0.766668670714844, + "flos": 23617909301760.0, + "grad_norm": 1.7760032622599717, + "language_loss": 0.74367929, + "learning_rate": 5.442829479009294e-07, + "loss": 0.76555222, + "num_input_tokens_seen": 137223135, + "step": 6376, + "time_per_iteration": 2.4513204097747803 + }, + { + "auxiliary_loss_clip": 0.01161645, + "auxiliary_loss_mlp": 0.01026198, + "balance_loss_clip": 1.04633057, + "balance_loss_mlp": 1.01845586, + "epoch": 0.7667889136054831, + "flos": 19427134642560.0, + "grad_norm": 1.6468193302562837, + "language_loss": 0.71248776, + "learning_rate": 5.437488961850103e-07, + "loss": 0.73436618, + "num_input_tokens_seen": 137242935, + "step": 6377, + "time_per_iteration": 2.4530441761016846 + }, + { + "auxiliary_loss_clip": 0.01108761, + "auxiliary_loss_mlp": 0.01024628, + "balance_loss_clip": 1.04147124, + "balance_loss_mlp": 1.01747274, + "epoch": 0.7669091564961221, + "flos": 26866352609280.0, + "grad_norm": 1.842139545708323, + "language_loss": 0.75448215, + "learning_rate": 5.432150653812258e-07, + "loss": 0.77581608, + "num_input_tokens_seen": 137262970, + "step": 6378, + "time_per_iteration": 2.5956032276153564 + }, + { + "auxiliary_loss_clip": 0.01149694, + "auxiliary_loss_mlp": 0.01023401, + "balance_loss_clip": 1.04583311, + "balance_loss_mlp": 1.01612949, + "epoch": 0.7670293993867613, + "flos": 12385303816320.0, + "grad_norm": 1.9933926912512352, + "language_loss": 0.82339883, + "learning_rate": 5.42681455570557e-07, + "loss": 0.84512985, + "num_input_tokens_seen": 137279500, + "step": 6379, + "time_per_iteration": 2.4347031116485596 + }, + { + "auxiliary_loss_clip": 0.01163209, + "auxiliary_loss_mlp": 0.01021487, + "balance_loss_clip": 1.04426634, + "balance_loss_mlp": 1.01437902, + "epoch": 0.7671496422774003, + "flos": 21762944167680.0, + "grad_norm": 1.8294726369780547, + "language_loss": 0.65004206, + "learning_rate": 5.42148066833954e-07, + "loss": 0.67188907, + "num_input_tokens_seen": 137298745, + "step": 6380, + "time_per_iteration": 2.4573676586151123 + }, + { + "auxiliary_loss_clip": 0.01165823, + "auxiliary_loss_mlp": 0.01023851, + "balance_loss_clip": 1.04654217, + "balance_loss_mlp": 1.01680851, + "epoch": 0.7672698851680394, + "flos": 21069221823360.0, + "grad_norm": 2.0448647878445376, + "language_loss": 0.75221813, + "learning_rate": 5.416148992523289e-07, + "loss": 0.77411485, + "num_input_tokens_seen": 137317320, + "step": 6381, + "time_per_iteration": 2.418001890182495 + }, + { + "auxiliary_loss_clip": 0.01083307, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.04012358, + "balance_loss_mlp": 1.02279687, + "epoch": 0.7673901280586786, + "flos": 16976697840000.0, + "grad_norm": 1.9753653914866216, + "language_loss": 0.79000723, + "learning_rate": 5.410819529065644e-07, + "loss": 0.81113815, + "num_input_tokens_seen": 137335275, + "step": 6382, + "time_per_iteration": 2.592508316040039 + }, + { + "auxiliary_loss_clip": 0.0111392, + "auxiliary_loss_mlp": 0.01027184, + "balance_loss_clip": 1.041888, + "balance_loss_mlp": 1.02006161, + "epoch": 0.7675103709493176, + "flos": 29242669697280.0, + "grad_norm": 6.771106320370801, + "language_loss": 0.65384257, + "learning_rate": 5.405492278775079e-07, + "loss": 0.67525351, + "num_input_tokens_seen": 137355055, + "step": 6383, + "time_per_iteration": 2.6037278175354004 + }, + { + "auxiliary_loss_clip": 0.01139931, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.04263067, + "balance_loss_mlp": 1.02004278, + "epoch": 0.7676306138399567, + "flos": 29023004073600.0, + "grad_norm": 2.1734596964816957, + "language_loss": 0.80254346, + "learning_rate": 5.400167242459732e-07, + "loss": 0.82421803, + "num_input_tokens_seen": 137374015, + "step": 6384, + "time_per_iteration": 2.537813663482666 + }, + { + "auxiliary_loss_clip": 0.0115173, + "auxiliary_loss_mlp": 0.01026387, + "balance_loss_clip": 1.04533839, + "balance_loss_mlp": 1.01931214, + "epoch": 0.7677508567305958, + "flos": 22565116650240.0, + "grad_norm": 2.6621250022027727, + "language_loss": 0.80584979, + "learning_rate": 5.394844420927405e-07, + "loss": 0.827631, + "num_input_tokens_seen": 137393625, + "step": 6385, + "time_per_iteration": 2.457882881164551 + }, + { + "auxiliary_loss_clip": 0.01167837, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.04722953, + "balance_loss_mlp": 1.02242911, + "epoch": 0.7678710996212349, + "flos": 25411432222080.0, + "grad_norm": 2.0146867190792235, + "language_loss": 0.73118067, + "learning_rate": 5.389523814985562e-07, + "loss": 0.75315535, + "num_input_tokens_seen": 137413045, + "step": 6386, + "time_per_iteration": 2.4599742889404297 + }, + { + "auxiliary_loss_clip": 0.01110718, + "auxiliary_loss_mlp": 0.01023447, + "balance_loss_clip": 1.04161119, + "balance_loss_mlp": 1.01579666, + "epoch": 0.767991342511874, + "flos": 26756825063040.0, + "grad_norm": 1.7505781211633173, + "language_loss": 0.76060528, + "learning_rate": 5.384205425441344e-07, + "loss": 0.7819469, + "num_input_tokens_seen": 137433955, + "step": 6387, + "time_per_iteration": 2.6088690757751465 + }, + { + "auxiliary_loss_clip": 0.01141036, + "auxiliary_loss_mlp": 0.01023001, + "balance_loss_clip": 1.04281616, + "balance_loss_mlp": 1.01589024, + "epoch": 0.7681115854025131, + "flos": 26359509749760.0, + "grad_norm": 1.8855101745060068, + "language_loss": 0.845707, + "learning_rate": 5.378889253101537e-07, + "loss": 0.86734736, + "num_input_tokens_seen": 137454510, + "step": 6388, + "time_per_iteration": 3.3406355381011963 + }, + { + "auxiliary_loss_clip": 0.01153579, + "auxiliary_loss_mlp": 0.01023136, + "balance_loss_clip": 1.04426968, + "balance_loss_mlp": 1.01603079, + "epoch": 0.7682318282931522, + "flos": 23257043314560.0, + "grad_norm": 1.6307636218301729, + "language_loss": 0.80538774, + "learning_rate": 5.373575298772617e-07, + "loss": 0.82715487, + "num_input_tokens_seen": 137473630, + "step": 6389, + "time_per_iteration": 3.9865477085113525 + }, + { + "auxiliary_loss_clip": 0.01056438, + "auxiliary_loss_mlp": 0.0100235, + "balance_loss_clip": 1.00750673, + "balance_loss_mlp": 1.00148523, + "epoch": 0.7683520711837912, + "flos": 70072457137920.0, + "grad_norm": 0.7695029085400649, + "language_loss": 0.61293983, + "learning_rate": 5.368263563260689e-07, + "loss": 0.63352776, + "num_input_tokens_seen": 137538765, + "step": 6390, + "time_per_iteration": 3.120307445526123 + }, + { + "auxiliary_loss_clip": 0.01153525, + "auxiliary_loss_mlp": 0.01020524, + "balance_loss_clip": 1.04446411, + "balance_loss_mlp": 1.01330256, + "epoch": 0.7684723140744304, + "flos": 18624890332800.0, + "grad_norm": 1.6223568533390302, + "language_loss": 0.64454675, + "learning_rate": 5.362954047371537e-07, + "loss": 0.66628718, + "num_input_tokens_seen": 137557875, + "step": 6391, + "time_per_iteration": 2.464961051940918 + }, + { + "auxiliary_loss_clip": 0.01127308, + "auxiliary_loss_mlp": 0.01030154, + "balance_loss_clip": 1.04787445, + "balance_loss_mlp": 1.02252162, + "epoch": 0.7685925569650695, + "flos": 27452989532160.0, + "grad_norm": 1.8813300539150346, + "language_loss": 0.71988374, + "learning_rate": 5.357646751910627e-07, + "loss": 0.74145842, + "num_input_tokens_seen": 137579055, + "step": 6392, + "time_per_iteration": 3.3690733909606934 + }, + { + "auxiliary_loss_clip": 0.01137687, + "auxiliary_loss_mlp": 0.01028736, + "balance_loss_clip": 1.04278183, + "balance_loss_mlp": 1.02098167, + "epoch": 0.7687127998557085, + "flos": 24535714642560.0, + "grad_norm": 2.3105815538055867, + "language_loss": 0.79743254, + "learning_rate": 5.352341677683061e-07, + "loss": 0.81909674, + "num_input_tokens_seen": 137600355, + "step": 6393, + "time_per_iteration": 2.519652843475342 + }, + { + "auxiliary_loss_clip": 0.01134917, + "auxiliary_loss_mlp": 0.01026731, + "balance_loss_clip": 1.04496884, + "balance_loss_mlp": 1.01918507, + "epoch": 0.7688330427463477, + "flos": 25155963717120.0, + "grad_norm": 2.202886304497734, + "language_loss": 0.78997171, + "learning_rate": 5.347038825493617e-07, + "loss": 0.81158817, + "num_input_tokens_seen": 137621885, + "step": 6394, + "time_per_iteration": 2.5626468658447266 + }, + { + "auxiliary_loss_clip": 0.011378, + "auxiliary_loss_mlp": 0.01027123, + "balance_loss_clip": 1.0471468, + "balance_loss_mlp": 1.02022696, + "epoch": 0.7689532856369867, + "flos": 21211284113280.0, + "grad_norm": 2.0620133259648643, + "language_loss": 0.68668956, + "learning_rate": 5.341738196146732e-07, + "loss": 0.70833874, + "num_input_tokens_seen": 137640230, + "step": 6395, + "time_per_iteration": 2.487943410873413 + }, + { + "auxiliary_loss_clip": 0.01149087, + "auxiliary_loss_mlp": 0.01020791, + "balance_loss_clip": 1.04252827, + "balance_loss_mlp": 1.01328933, + "epoch": 0.7690735285276258, + "flos": 25119083427840.0, + "grad_norm": 9.065201660005622, + "language_loss": 0.73567593, + "learning_rate": 5.336439790446503e-07, + "loss": 0.75737464, + "num_input_tokens_seen": 137659330, + "step": 6396, + "time_per_iteration": 2.48336124420166 + }, + { + "auxiliary_loss_clip": 0.01120904, + "auxiliary_loss_mlp": 0.0102826, + "balance_loss_clip": 1.03943908, + "balance_loss_mlp": 1.02000463, + "epoch": 0.769193771418265, + "flos": 54744020640000.0, + "grad_norm": 1.6139728745513855, + "language_loss": 0.62801021, + "learning_rate": 5.331143609196711e-07, + "loss": 0.6495018, + "num_input_tokens_seen": 137683145, + "step": 6397, + "time_per_iteration": 2.8054990768432617 + }, + { + "auxiliary_loss_clip": 0.01152697, + "auxiliary_loss_mlp": 0.0102487, + "balance_loss_clip": 1.04659057, + "balance_loss_mlp": 1.01750302, + "epoch": 0.769314014308904, + "flos": 37341890115840.0, + "grad_norm": 1.7147420095518333, + "language_loss": 0.76904809, + "learning_rate": 5.325849653200758e-07, + "loss": 0.7908237, + "num_input_tokens_seen": 137707095, + "step": 6398, + "time_per_iteration": 2.578864097595215 + }, + { + "auxiliary_loss_clip": 0.01169181, + "auxiliary_loss_mlp": 0.01024386, + "balance_loss_clip": 1.04824662, + "balance_loss_mlp": 1.01691794, + "epoch": 0.7694342571995431, + "flos": 20631686256000.0, + "grad_norm": 1.939054914240124, + "language_loss": 0.76311457, + "learning_rate": 5.32055792326175e-07, + "loss": 0.78505027, + "num_input_tokens_seen": 137725520, + "step": 6399, + "time_per_iteration": 2.40311861038208 + }, + { + "auxiliary_loss_clip": 0.01142981, + "auxiliary_loss_mlp": 0.01022353, + "balance_loss_clip": 1.04739714, + "balance_loss_mlp": 1.01516187, + "epoch": 0.7695545000901821, + "flos": 24207706621440.0, + "grad_norm": 1.980182410251135, + "language_loss": 0.72638988, + "learning_rate": 5.315268420182437e-07, + "loss": 0.7480433, + "num_input_tokens_seen": 137744195, + "step": 6400, + "time_per_iteration": 2.513068675994873 + }, + { + "auxiliary_loss_clip": 0.01130411, + "auxiliary_loss_mlp": 0.00761857, + "balance_loss_clip": 1.04319143, + "balance_loss_mlp": 1.00058258, + "epoch": 0.7696747429808213, + "flos": 28001273708160.0, + "grad_norm": 1.7800350635248587, + "language_loss": 0.76700658, + "learning_rate": 5.309981144765221e-07, + "loss": 0.78592926, + "num_input_tokens_seen": 137764340, + "step": 6401, + "time_per_iteration": 2.603911876678467 + }, + { + "auxiliary_loss_clip": 0.01117069, + "auxiliary_loss_mlp": 0.0102405, + "balance_loss_clip": 1.04043543, + "balance_loss_mlp": 1.01726663, + "epoch": 0.7697949858714603, + "flos": 11509550323200.0, + "grad_norm": 3.0477854685569588, + "language_loss": 0.75488049, + "learning_rate": 5.304696097812196e-07, + "loss": 0.77629167, + "num_input_tokens_seen": 137780940, + "step": 6402, + "time_per_iteration": 2.5308871269226074 + }, + { + "auxiliary_loss_clip": 0.0113802, + "auxiliary_loss_mlp": 0.01026611, + "balance_loss_clip": 1.04301381, + "balance_loss_mlp": 1.01876378, + "epoch": 0.7699152287620994, + "flos": 26688271956480.0, + "grad_norm": 4.975172912081611, + "language_loss": 0.60211015, + "learning_rate": 5.299413280125078e-07, + "loss": 0.62375641, + "num_input_tokens_seen": 137799250, + "step": 6403, + "time_per_iteration": 2.533581018447876 + }, + { + "auxiliary_loss_clip": 0.01140194, + "auxiliary_loss_mlp": 0.01029142, + "balance_loss_clip": 1.04397821, + "balance_loss_mlp": 1.02177525, + "epoch": 0.7700354716527386, + "flos": 16544944362240.0, + "grad_norm": 1.9549624358580022, + "language_loss": 0.72608525, + "learning_rate": 5.294132692505284e-07, + "loss": 0.74777865, + "num_input_tokens_seen": 137817660, + "step": 6404, + "time_per_iteration": 2.51015567779541 + }, + { + "auxiliary_loss_clip": 0.01101288, + "auxiliary_loss_mlp": 0.01024085, + "balance_loss_clip": 1.03932571, + "balance_loss_mlp": 1.01640224, + "epoch": 0.7701557145433776, + "flos": 19242733196160.0, + "grad_norm": 2.5623099836045817, + "language_loss": 0.79355842, + "learning_rate": 5.288854335753861e-07, + "loss": 0.81481212, + "num_input_tokens_seen": 137835920, + "step": 6405, + "time_per_iteration": 2.545071601867676 + }, + { + "auxiliary_loss_clip": 0.0115508, + "auxiliary_loss_mlp": 0.01023632, + "balance_loss_clip": 1.04466176, + "balance_loss_mlp": 1.01638699, + "epoch": 0.7702759574340167, + "flos": 31685744211840.0, + "grad_norm": 1.536698290623967, + "language_loss": 0.75243735, + "learning_rate": 5.283578210671551e-07, + "loss": 0.77422452, + "num_input_tokens_seen": 137858160, + "step": 6406, + "time_per_iteration": 2.5445072650909424 + }, + { + "auxiliary_loss_clip": 0.0114417, + "auxiliary_loss_mlp": 0.01023662, + "balance_loss_clip": 1.0453527, + "balance_loss_mlp": 1.01628256, + "epoch": 0.7703962003246558, + "flos": 16800089644800.0, + "grad_norm": 2.101808575419471, + "language_loss": 0.76620436, + "learning_rate": 5.278304318058719e-07, + "loss": 0.78788269, + "num_input_tokens_seen": 137876015, + "step": 6407, + "time_per_iteration": 2.472489595413208 + }, + { + "auxiliary_loss_clip": 0.01095587, + "auxiliary_loss_mlp": 0.01026595, + "balance_loss_clip": 1.03931582, + "balance_loss_mlp": 1.01911497, + "epoch": 0.7705164432152949, + "flos": 35736072693120.0, + "grad_norm": 2.7445022559300885, + "language_loss": 0.79162937, + "learning_rate": 5.273032658715411e-07, + "loss": 0.81285119, + "num_input_tokens_seen": 137898825, + "step": 6408, + "time_per_iteration": 2.704343795776367 + }, + { + "auxiliary_loss_clip": 0.01106917, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.03957319, + "balance_loss_mlp": 1.01712275, + "epoch": 0.7706366861059339, + "flos": 23365960329600.0, + "grad_norm": 1.9288501105952853, + "language_loss": 0.76558292, + "learning_rate": 5.267763233441347e-07, + "loss": 0.78689802, + "num_input_tokens_seen": 137919455, + "step": 6409, + "time_per_iteration": 2.5731661319732666 + }, + { + "auxiliary_loss_clip": 0.01156721, + "auxiliary_loss_mlp": 0.01023087, + "balance_loss_clip": 1.04680634, + "balance_loss_mlp": 1.01551747, + "epoch": 0.7707569289965731, + "flos": 22929897219840.0, + "grad_norm": 2.370863413080902, + "language_loss": 0.69519103, + "learning_rate": 5.26249604303588e-07, + "loss": 0.7169891, + "num_input_tokens_seen": 137937960, + "step": 6410, + "time_per_iteration": 2.472429037094116 + }, + { + "auxiliary_loss_clip": 0.01168247, + "auxiliary_loss_mlp": 0.01024035, + "balance_loss_clip": 1.04784155, + "balance_loss_mlp": 1.01665604, + "epoch": 0.7708771718872122, + "flos": 17420661941760.0, + "grad_norm": 2.1661801446504327, + "language_loss": 0.78542781, + "learning_rate": 5.257231088298057e-07, + "loss": 0.80735064, + "num_input_tokens_seen": 137956370, + "step": 6411, + "time_per_iteration": 2.3980062007904053 + }, + { + "auxiliary_loss_clip": 0.01030019, + "auxiliary_loss_mlp": 0.01001621, + "balance_loss_clip": 1.00642991, + "balance_loss_mlp": 1.00066721, + "epoch": 0.7709974147778512, + "flos": 72241316248320.0, + "grad_norm": 0.7981227970837961, + "language_loss": 0.53956348, + "learning_rate": 5.25196837002655e-07, + "loss": 0.55987984, + "num_input_tokens_seen": 138016080, + "step": 6412, + "time_per_iteration": 3.1325135231018066 + }, + { + "auxiliary_loss_clip": 0.01137562, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.04391277, + "balance_loss_mlp": 1.02506578, + "epoch": 0.7711176576684904, + "flos": 39859694876160.0, + "grad_norm": 2.3140931182983233, + "language_loss": 0.6850453, + "learning_rate": 5.24670788901971e-07, + "loss": 0.70674831, + "num_input_tokens_seen": 138039170, + "step": 6413, + "time_per_iteration": 2.639094114303589 + }, + { + "auxiliary_loss_clip": 0.01139757, + "auxiliary_loss_mlp": 0.01025387, + "balance_loss_clip": 1.04583263, + "balance_loss_mlp": 1.01680446, + "epoch": 0.7712379005591294, + "flos": 36976391274240.0, + "grad_norm": 2.1103188685050185, + "language_loss": 0.68424815, + "learning_rate": 5.241449646075557e-07, + "loss": 0.7058996, + "num_input_tokens_seen": 138062395, + "step": 6414, + "time_per_iteration": 4.186159372329712 + }, + { + "auxiliary_loss_clip": 0.01164252, + "auxiliary_loss_mlp": 0.01025578, + "balance_loss_clip": 1.04859996, + "balance_loss_mlp": 1.01820207, + "epoch": 0.7713581434497685, + "flos": 22776773541120.0, + "grad_norm": 2.168952988609096, + "language_loss": 0.72233057, + "learning_rate": 5.236193641991762e-07, + "loss": 0.74422884, + "num_input_tokens_seen": 138080325, + "step": 6415, + "time_per_iteration": 2.4831011295318604 + }, + { + "auxiliary_loss_clip": 0.01139817, + "auxiliary_loss_mlp": 0.01023694, + "balance_loss_clip": 1.04535222, + "balance_loss_mlp": 1.0165174, + "epoch": 0.7714783863404077, + "flos": 24097460803200.0, + "grad_norm": 2.054297274562023, + "language_loss": 0.7007814, + "learning_rate": 5.23093987756565e-07, + "loss": 0.72241652, + "num_input_tokens_seen": 138099020, + "step": 6416, + "time_per_iteration": 3.2842328548431396 + }, + { + "auxiliary_loss_clip": 0.01130143, + "auxiliary_loss_mlp": 0.01026584, + "balance_loss_clip": 1.04035401, + "balance_loss_mlp": 1.01884699, + "epoch": 0.7715986292310467, + "flos": 21063655215360.0, + "grad_norm": 1.866884107411473, + "language_loss": 0.75485599, + "learning_rate": 5.225688353594217e-07, + "loss": 0.77642328, + "num_input_tokens_seen": 138118650, + "step": 6417, + "time_per_iteration": 2.548492908477783 + }, + { + "auxiliary_loss_clip": 0.01144413, + "auxiliary_loss_mlp": 0.00761779, + "balance_loss_clip": 1.0458312, + "balance_loss_mlp": 1.00060439, + "epoch": 0.7717188721216858, + "flos": 20594877793920.0, + "grad_norm": 3.6126183274180956, + "language_loss": 0.77352798, + "learning_rate": 5.220439070874108e-07, + "loss": 0.79258984, + "num_input_tokens_seen": 138137890, + "step": 6418, + "time_per_iteration": 2.4752397537231445 + }, + { + "auxiliary_loss_clip": 0.01155614, + "auxiliary_loss_mlp": 0.01026093, + "balance_loss_clip": 1.04814434, + "balance_loss_mlp": 1.01888704, + "epoch": 0.7718391150123249, + "flos": 26250951870720.0, + "grad_norm": 1.7449842691034168, + "language_loss": 0.71064818, + "learning_rate": 5.215192030201652e-07, + "loss": 0.73246527, + "num_input_tokens_seen": 138158880, + "step": 6419, + "time_per_iteration": 3.262087106704712 + }, + { + "auxiliary_loss_clip": 0.01112322, + "auxiliary_loss_mlp": 0.01026106, + "balance_loss_clip": 1.03826046, + "balance_loss_mlp": 1.01883399, + "epoch": 0.771959357902964, + "flos": 22049762267520.0, + "grad_norm": 1.8603742145152067, + "language_loss": 0.8623758, + "learning_rate": 5.209947232372798e-07, + "loss": 0.88376009, + "num_input_tokens_seen": 138176370, + "step": 6420, + "time_per_iteration": 2.5066111087799072 + }, + { + "auxiliary_loss_clip": 0.01157458, + "auxiliary_loss_mlp": 0.00761961, + "balance_loss_clip": 1.04469132, + "balance_loss_mlp": 1.00058162, + "epoch": 0.772079600793603, + "flos": 30446000248320.0, + "grad_norm": 1.800714045167674, + "language_loss": 0.80994928, + "learning_rate": 5.204704678183196e-07, + "loss": 0.8291434, + "num_input_tokens_seen": 138195105, + "step": 6421, + "time_per_iteration": 2.5356979370117188 + }, + { + "auxiliary_loss_clip": 0.01169402, + "auxiliary_loss_mlp": 0.01022373, + "balance_loss_clip": 1.04794645, + "balance_loss_mlp": 1.01454699, + "epoch": 0.7721998436842422, + "flos": 12969857750400.0, + "grad_norm": 2.018312334336048, + "language_loss": 0.85225415, + "learning_rate": 5.19946436842813e-07, + "loss": 0.87417185, + "num_input_tokens_seen": 138212235, + "step": 6422, + "time_per_iteration": 2.3939242362976074 + }, + { + "auxiliary_loss_clip": 0.01125243, + "auxiliary_loss_mlp": 0.01023063, + "balance_loss_clip": 1.04433894, + "balance_loss_mlp": 1.01604497, + "epoch": 0.7723200865748813, + "flos": 32635509678720.0, + "grad_norm": 1.6788510371750853, + "language_loss": 0.68119705, + "learning_rate": 5.194226303902546e-07, + "loss": 0.70268011, + "num_input_tokens_seen": 138231970, + "step": 6423, + "time_per_iteration": 2.6108970642089844 + }, + { + "auxiliary_loss_clip": 0.01138447, + "auxiliary_loss_mlp": 0.01025969, + "balance_loss_clip": 1.04390168, + "balance_loss_mlp": 1.018736, + "epoch": 0.7724403294655203, + "flos": 21105707063040.0, + "grad_norm": 1.9694810847598787, + "language_loss": 0.70622516, + "learning_rate": 5.188990485401072e-07, + "loss": 0.72786927, + "num_input_tokens_seen": 138251175, + "step": 6424, + "time_per_iteration": 2.477288007736206 + }, + { + "auxiliary_loss_clip": 0.01155236, + "auxiliary_loss_mlp": 0.01021909, + "balance_loss_clip": 1.04660082, + "balance_loss_mlp": 1.01503015, + "epoch": 0.7725605723561595, + "flos": 22090736707200.0, + "grad_norm": 1.8666940096396702, + "language_loss": 0.86299407, + "learning_rate": 5.183756913717954e-07, + "loss": 0.88476557, + "num_input_tokens_seen": 138270950, + "step": 6425, + "time_per_iteration": 2.469475507736206 + }, + { + "auxiliary_loss_clip": 0.01133714, + "auxiliary_loss_mlp": 0.0102896, + "balance_loss_clip": 1.04354405, + "balance_loss_mlp": 1.0216943, + "epoch": 0.7726808152467985, + "flos": 34495610457600.0, + "grad_norm": 1.7661788433004935, + "language_loss": 0.73035347, + "learning_rate": 5.178525589647136e-07, + "loss": 0.75198013, + "num_input_tokens_seen": 138292590, + "step": 6426, + "time_per_iteration": 2.5862114429473877 + }, + { + "auxiliary_loss_clip": 0.01144027, + "auxiliary_loss_mlp": 0.01023916, + "balance_loss_clip": 1.04372323, + "balance_loss_mlp": 1.01709414, + "epoch": 0.7728010581374376, + "flos": 22306344094080.0, + "grad_norm": 2.0445296387577083, + "language_loss": 0.78984678, + "learning_rate": 5.173296513982197e-07, + "loss": 0.8115263, + "num_input_tokens_seen": 138311115, + "step": 6427, + "time_per_iteration": 2.480914354324341 + }, + { + "auxiliary_loss_clip": 0.01137044, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.04612851, + "balance_loss_mlp": 1.01871026, + "epoch": 0.7729213010280768, + "flos": 27126453968640.0, + "grad_norm": 1.981474813148968, + "language_loss": 0.64677286, + "learning_rate": 5.168069687516398e-07, + "loss": 0.66841674, + "num_input_tokens_seen": 138330885, + "step": 6428, + "time_per_iteration": 2.5682642459869385 + }, + { + "auxiliary_loss_clip": 0.01141006, + "auxiliary_loss_mlp": 0.01020743, + "balance_loss_clip": 1.04681098, + "balance_loss_mlp": 1.01312852, + "epoch": 0.7730415439187158, + "flos": 18150223080960.0, + "grad_norm": 2.1646422308166007, + "language_loss": 0.71604478, + "learning_rate": 5.16284511104263e-07, + "loss": 0.7376622, + "num_input_tokens_seen": 138350020, + "step": 6429, + "time_per_iteration": 2.5066845417022705 + }, + { + "auxiliary_loss_clip": 0.01140119, + "auxiliary_loss_mlp": 0.01026964, + "balance_loss_clip": 1.04546189, + "balance_loss_mlp": 1.01947212, + "epoch": 0.7731617868093549, + "flos": 11947480940160.0, + "grad_norm": 2.596970942912798, + "language_loss": 0.80558395, + "learning_rate": 5.157622785353457e-07, + "loss": 0.82725483, + "num_input_tokens_seen": 138368135, + "step": 6430, + "time_per_iteration": 2.4569520950317383 + }, + { + "auxiliary_loss_clip": 0.01056005, + "auxiliary_loss_mlp": 0.01000379, + "balance_loss_clip": 1.0083431, + "balance_loss_mlp": 0.9993121, + "epoch": 0.7732820296999939, + "flos": 64201027069440.0, + "grad_norm": 0.644383603259178, + "language_loss": 0.60397804, + "learning_rate": 5.152402711241113e-07, + "loss": 0.62454188, + "num_input_tokens_seen": 138436040, + "step": 6431, + "time_per_iteration": 3.1136438846588135 + }, + { + "auxiliary_loss_clip": 0.01121696, + "auxiliary_loss_mlp": 0.01023217, + "balance_loss_clip": 1.03997707, + "balance_loss_mlp": 1.01594496, + "epoch": 0.7734022725906331, + "flos": 25302191984640.0, + "grad_norm": 1.7178594255300008, + "language_loss": 0.82834071, + "learning_rate": 5.147184889497465e-07, + "loss": 0.84978986, + "num_input_tokens_seen": 138455510, + "step": 6432, + "time_per_iteration": 2.559795618057251 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01024405, + "balance_loss_clip": 1.04225492, + "balance_loss_mlp": 1.01672745, + "epoch": 0.7735225154812722, + "flos": 17347440067200.0, + "grad_norm": 2.447287417469229, + "language_loss": 0.80254805, + "learning_rate": 5.141969320914072e-07, + "loss": 0.82397938, + "num_input_tokens_seen": 138473015, + "step": 6433, + "time_per_iteration": 2.484694004058838 + }, + { + "auxiliary_loss_clip": 0.01171504, + "auxiliary_loss_mlp": 0.01026167, + "balance_loss_clip": 1.04729795, + "balance_loss_mlp": 1.01862442, + "epoch": 0.7736427583719112, + "flos": 32630086725120.0, + "grad_norm": 3.4149618608154295, + "language_loss": 0.62624526, + "learning_rate": 5.136756006282113e-07, + "loss": 0.64822203, + "num_input_tokens_seen": 138491680, + "step": 6434, + "time_per_iteration": 2.50626802444458 + }, + { + "auxiliary_loss_clip": 0.01170124, + "auxiliary_loss_mlp": 0.01025279, + "balance_loss_clip": 1.04876637, + "balance_loss_mlp": 1.01795387, + "epoch": 0.7737630012625504, + "flos": 19860073269120.0, + "grad_norm": 2.4542191199124272, + "language_loss": 0.8506192, + "learning_rate": 5.131544946392446e-07, + "loss": 0.87257326, + "num_input_tokens_seen": 138506960, + "step": 6435, + "time_per_iteration": 2.411463499069214 + }, + { + "auxiliary_loss_clip": 0.0114155, + "auxiliary_loss_mlp": 0.01025895, + "balance_loss_clip": 1.04844928, + "balance_loss_mlp": 1.01848304, + "epoch": 0.7738832441531894, + "flos": 36022639397760.0, + "grad_norm": 2.520366769204538, + "language_loss": 0.64195061, + "learning_rate": 5.126336142035592e-07, + "loss": 0.66362512, + "num_input_tokens_seen": 138526995, + "step": 6436, + "time_per_iteration": 2.600829601287842 + }, + { + "auxiliary_loss_clip": 0.01141522, + "auxiliary_loss_mlp": 0.0102335, + "balance_loss_clip": 1.04445589, + "balance_loss_mlp": 1.01559281, + "epoch": 0.7740034870438285, + "flos": 13405274415360.0, + "grad_norm": 2.672947123499331, + "language_loss": 0.71844935, + "learning_rate": 5.121129594001721e-07, + "loss": 0.74009806, + "num_input_tokens_seen": 138541260, + "step": 6437, + "time_per_iteration": 2.431009292602539 + }, + { + "auxiliary_loss_clip": 0.01154874, + "auxiliary_loss_mlp": 0.01024705, + "balance_loss_clip": 1.04747319, + "balance_loss_mlp": 1.01687908, + "epoch": 0.7741237299344677, + "flos": 22086714384000.0, + "grad_norm": 1.642611240515045, + "language_loss": 0.81093514, + "learning_rate": 5.115925303080661e-07, + "loss": 0.83273089, + "num_input_tokens_seen": 138560970, + "step": 6438, + "time_per_iteration": 2.471721649169922 + }, + { + "auxiliary_loss_clip": 0.01139982, + "auxiliary_loss_mlp": 0.01024721, + "balance_loss_clip": 1.04432142, + "balance_loss_mlp": 1.01771116, + "epoch": 0.7742439728251067, + "flos": 19864777950720.0, + "grad_norm": 3.6719449114524605, + "language_loss": 0.79283273, + "learning_rate": 5.110723270061899e-07, + "loss": 0.81447971, + "num_input_tokens_seen": 138577460, + "step": 6439, + "time_per_iteration": 2.459984302520752 + }, + { + "auxiliary_loss_clip": 0.01166005, + "auxiliary_loss_mlp": 0.01024926, + "balance_loss_clip": 1.04637194, + "balance_loss_mlp": 1.01819706, + "epoch": 0.7743642157157458, + "flos": 16690167048960.0, + "grad_norm": 1.820104785101105, + "language_loss": 0.79510236, + "learning_rate": 5.105523495734572e-07, + "loss": 0.81701171, + "num_input_tokens_seen": 138594860, + "step": 6440, + "time_per_iteration": 3.1517493724823 + }, + { + "auxiliary_loss_clip": 0.01168102, + "auxiliary_loss_mlp": 0.01024878, + "balance_loss_clip": 1.04584265, + "balance_loss_mlp": 1.01698923, + "epoch": 0.7744844586063849, + "flos": 20304360593280.0, + "grad_norm": 1.5674613149206775, + "language_loss": 0.75308239, + "learning_rate": 5.100325980887499e-07, + "loss": 0.77501214, + "num_input_tokens_seen": 138614785, + "step": 6441, + "time_per_iteration": 3.182567834854126 + }, + { + "auxiliary_loss_clip": 0.01149956, + "auxiliary_loss_mlp": 0.01024268, + "balance_loss_clip": 1.04625106, + "balance_loss_mlp": 1.0169723, + "epoch": 0.774604701497024, + "flos": 22966705681920.0, + "grad_norm": 1.9070641011633056, + "language_loss": 0.83396381, + "learning_rate": 5.095130726309116e-07, + "loss": 0.8557061, + "num_input_tokens_seen": 138634960, + "step": 6442, + "time_per_iteration": 2.500337839126587 + }, + { + "auxiliary_loss_clip": 0.01064471, + "auxiliary_loss_mlp": 0.01001255, + "balance_loss_clip": 1.00751662, + "balance_loss_mlp": 1.00018847, + "epoch": 0.774724944387663, + "flos": 60288523073280.0, + "grad_norm": 0.7946783275518771, + "language_loss": 0.58989024, + "learning_rate": 5.089937732787559e-07, + "loss": 0.61054754, + "num_input_tokens_seen": 138699520, + "step": 6443, + "time_per_iteration": 3.8191800117492676 + }, + { + "auxiliary_loss_clip": 0.0112596, + "auxiliary_loss_mlp": 0.01025074, + "balance_loss_clip": 1.0427146, + "balance_loss_mlp": 1.01743293, + "epoch": 0.7748451872783022, + "flos": 26761026954240.0, + "grad_norm": 2.3484435463483355, + "language_loss": 0.66644573, + "learning_rate": 5.084747001110592e-07, + "loss": 0.68795609, + "num_input_tokens_seen": 138719145, + "step": 6444, + "time_per_iteration": 2.552452325820923 + }, + { + "auxiliary_loss_clip": 0.01151141, + "auxiliary_loss_mlp": 0.00761682, + "balance_loss_clip": 1.04867768, + "balance_loss_mlp": 1.00054216, + "epoch": 0.7749654301689413, + "flos": 30338627518080.0, + "grad_norm": 1.8731025003932038, + "language_loss": 0.69878471, + "learning_rate": 5.07955853206564e-07, + "loss": 0.71791291, + "num_input_tokens_seen": 138743850, + "step": 6445, + "time_per_iteration": 3.291645050048828 + }, + { + "auxiliary_loss_clip": 0.01158876, + "auxiliary_loss_mlp": 0.01027066, + "balance_loss_clip": 1.04666686, + "balance_loss_mlp": 1.01958013, + "epoch": 0.7750856730595803, + "flos": 43179851687040.0, + "grad_norm": 1.5088828081609484, + "language_loss": 0.7084372, + "learning_rate": 5.074372326439807e-07, + "loss": 0.73029661, + "num_input_tokens_seen": 138766860, + "step": 6446, + "time_per_iteration": 2.6401734352111816 + }, + { + "auxiliary_loss_clip": 0.01128848, + "auxiliary_loss_mlp": 0.0102513, + "balance_loss_clip": 1.0439595, + "balance_loss_mlp": 1.01776862, + "epoch": 0.7752059159502195, + "flos": 17640040256640.0, + "grad_norm": 2.4120105643590724, + "language_loss": 0.7353791, + "learning_rate": 5.069188385019814e-07, + "loss": 0.75691891, + "num_input_tokens_seen": 138784560, + "step": 6447, + "time_per_iteration": 2.491739511489868 + }, + { + "auxiliary_loss_clip": 0.01119732, + "auxiliary_loss_mlp": 0.01023609, + "balance_loss_clip": 1.04133511, + "balance_loss_mlp": 1.01605082, + "epoch": 0.7753261588408585, + "flos": 12677688524160.0, + "grad_norm": 2.713623686024014, + "language_loss": 0.61363822, + "learning_rate": 5.064006708592077e-07, + "loss": 0.63507164, + "num_input_tokens_seen": 138800805, + "step": 6448, + "time_per_iteration": 2.5128839015960693 + }, + { + "auxiliary_loss_clip": 0.01135629, + "auxiliary_loss_mlp": 0.0102177, + "balance_loss_clip": 1.04573119, + "balance_loss_mlp": 1.01473069, + "epoch": 0.7754464017314976, + "flos": 16690741666560.0, + "grad_norm": 2.2487753769798613, + "language_loss": 0.75147307, + "learning_rate": 5.058827297942641e-07, + "loss": 0.77304697, + "num_input_tokens_seen": 138815910, + "step": 6449, + "time_per_iteration": 2.485912561416626 + }, + { + "auxiliary_loss_clip": 0.01147009, + "auxiliary_loss_mlp": 0.01023332, + "balance_loss_clip": 1.04561162, + "balance_loss_mlp": 1.01609612, + "epoch": 0.7755666446221368, + "flos": 19718944732800.0, + "grad_norm": 1.849397317088518, + "language_loss": 0.75021195, + "learning_rate": 5.053650153857237e-07, + "loss": 0.77191538, + "num_input_tokens_seen": 138834920, + "step": 6450, + "time_per_iteration": 2.483072519302368 + }, + { + "auxiliary_loss_clip": 0.01154214, + "auxiliary_loss_mlp": 0.01024716, + "balance_loss_clip": 1.04703259, + "balance_loss_mlp": 1.01746166, + "epoch": 0.7756868875127758, + "flos": 18693623007360.0, + "grad_norm": 1.588913355812507, + "language_loss": 0.69968629, + "learning_rate": 5.048475277121214e-07, + "loss": 0.72147554, + "num_input_tokens_seen": 138852135, + "step": 6451, + "time_per_iteration": 2.440720796585083 + }, + { + "auxiliary_loss_clip": 0.01153409, + "auxiliary_loss_mlp": 0.01021438, + "balance_loss_clip": 1.04429674, + "balance_loss_mlp": 1.01416075, + "epoch": 0.7758071304034149, + "flos": 28404191543040.0, + "grad_norm": 1.7261089940159262, + "language_loss": 0.77137321, + "learning_rate": 5.043302668519598e-07, + "loss": 0.7931217, + "num_input_tokens_seen": 138871470, + "step": 6452, + "time_per_iteration": 2.5080342292785645 + }, + { + "auxiliary_loss_clip": 0.01156037, + "auxiliary_loss_mlp": 0.01022391, + "balance_loss_clip": 1.04408431, + "balance_loss_mlp": 1.01532519, + "epoch": 0.775927373294054, + "flos": 20595344670720.0, + "grad_norm": 5.338227014886899, + "language_loss": 0.71806109, + "learning_rate": 5.038132328837079e-07, + "loss": 0.7398454, + "num_input_tokens_seen": 138889860, + "step": 6453, + "time_per_iteration": 2.487948417663574 + }, + { + "auxiliary_loss_clip": 0.01155498, + "auxiliary_loss_mlp": 0.01023402, + "balance_loss_clip": 1.04569364, + "balance_loss_mlp": 1.01577258, + "epoch": 0.7760476161846931, + "flos": 22526368853760.0, + "grad_norm": 2.745510876536476, + "language_loss": 0.7402963, + "learning_rate": 5.032964258857993e-07, + "loss": 0.76208532, + "num_input_tokens_seen": 138909955, + "step": 6454, + "time_per_iteration": 2.4425342082977295 + }, + { + "auxiliary_loss_clip": 0.01150742, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.04133558, + "balance_loss_mlp": 1.01824379, + "epoch": 0.7761678590753321, + "flos": 48651488403840.0, + "grad_norm": 2.375644921535922, + "language_loss": 0.68480051, + "learning_rate": 5.027798459366329e-07, + "loss": 0.70656514, + "num_input_tokens_seen": 138935320, + "step": 6455, + "time_per_iteration": 2.6924359798431396 + }, + { + "auxiliary_loss_clip": 0.01159846, + "auxiliary_loss_mlp": 0.01027531, + "balance_loss_clip": 1.04658079, + "balance_loss_mlp": 1.02005064, + "epoch": 0.7762881019659713, + "flos": 26177047637760.0, + "grad_norm": 1.4511521207338791, + "language_loss": 0.63428664, + "learning_rate": 5.02263493114573e-07, + "loss": 0.65616035, + "num_input_tokens_seen": 138957115, + "step": 6456, + "time_per_iteration": 2.493183135986328 + }, + { + "auxiliary_loss_clip": 0.01165986, + "auxiliary_loss_mlp": 0.01026317, + "balance_loss_clip": 1.04636455, + "balance_loss_mlp": 1.01848221, + "epoch": 0.7764083448566104, + "flos": 20588341518720.0, + "grad_norm": 2.601009892577988, + "language_loss": 0.77482563, + "learning_rate": 5.017473674979502e-07, + "loss": 0.7967487, + "num_input_tokens_seen": 138973140, + "step": 6457, + "time_per_iteration": 2.384098768234253 + }, + { + "auxiliary_loss_clip": 0.01028648, + "auxiliary_loss_mlp": 0.01008896, + "balance_loss_clip": 1.01356375, + "balance_loss_mlp": 1.00799584, + "epoch": 0.7765285877472494, + "flos": 67293078560640.0, + "grad_norm": 0.7473618854006284, + "language_loss": 0.58380502, + "learning_rate": 5.01231469165061e-07, + "loss": 0.6041804, + "num_input_tokens_seen": 139028965, + "step": 6458, + "time_per_iteration": 2.970550537109375 + }, + { + "auxiliary_loss_clip": 0.01054437, + "auxiliary_loss_mlp": 0.01001158, + "balance_loss_clip": 1.00721526, + "balance_loss_mlp": 1.00015056, + "epoch": 0.7766488306378886, + "flos": 61344476121600.0, + "grad_norm": 0.8292680541376127, + "language_loss": 0.569525, + "learning_rate": 5.007157981941663e-07, + "loss": 0.59008098, + "num_input_tokens_seen": 139094325, + "step": 6459, + "time_per_iteration": 3.1352126598358154 + }, + { + "auxiliary_loss_clip": 0.01044581, + "auxiliary_loss_mlp": 0.0100039, + "balance_loss_clip": 1.0075171, + "balance_loss_mlp": 0.99944228, + "epoch": 0.7767690735285276, + "flos": 62946199393920.0, + "grad_norm": 0.8665739061359047, + "language_loss": 0.67453641, + "learning_rate": 5.002003546634928e-07, + "loss": 0.69498616, + "num_input_tokens_seen": 139150425, + "step": 6460, + "time_per_iteration": 2.9972968101501465 + }, + { + "auxiliary_loss_clip": 0.01111922, + "auxiliary_loss_mlp": 0.01023211, + "balance_loss_clip": 1.04593492, + "balance_loss_mlp": 1.01617813, + "epoch": 0.7768893164191667, + "flos": 20886400575360.0, + "grad_norm": 1.6702496897624084, + "language_loss": 0.76068795, + "learning_rate": 4.996851386512331e-07, + "loss": 0.78203928, + "num_input_tokens_seen": 139169130, + "step": 6461, + "time_per_iteration": 2.5703203678131104 + }, + { + "auxiliary_loss_clip": 0.01139296, + "auxiliary_loss_mlp": 0.01025924, + "balance_loss_clip": 1.04466677, + "balance_loss_mlp": 1.01787746, + "epoch": 0.7770095593098058, + "flos": 20704584908160.0, + "grad_norm": 2.6933642548715704, + "language_loss": 0.82950425, + "learning_rate": 4.991701502355444e-07, + "loss": 0.85115647, + "num_input_tokens_seen": 139189595, + "step": 6462, + "time_per_iteration": 2.4803173542022705 + }, + { + "auxiliary_loss_clip": 0.01157564, + "auxiliary_loss_mlp": 0.01024374, + "balance_loss_clip": 1.04613817, + "balance_loss_mlp": 1.01778138, + "epoch": 0.7771298022004449, + "flos": 24717709877760.0, + "grad_norm": 1.5600492856009427, + "language_loss": 0.75915909, + "learning_rate": 4.986553894945518e-07, + "loss": 0.78097844, + "num_input_tokens_seen": 139210805, + "step": 6463, + "time_per_iteration": 2.4882142543792725 + }, + { + "auxiliary_loss_clip": 0.01114967, + "auxiliary_loss_mlp": 0.01027332, + "balance_loss_clip": 1.04087901, + "balance_loss_mlp": 1.02092195, + "epoch": 0.777250045091084, + "flos": 25009232659200.0, + "grad_norm": 2.073629863820284, + "language_loss": 0.85899401, + "learning_rate": 4.981408565063416e-07, + "loss": 0.88041705, + "num_input_tokens_seen": 139230750, + "step": 6464, + "time_per_iteration": 2.581284999847412 + }, + { + "auxiliary_loss_clip": 0.01168671, + "auxiliary_loss_mlp": 0.01023368, + "balance_loss_clip": 1.04695427, + "balance_loss_mlp": 1.01619744, + "epoch": 0.777370287981723, + "flos": 20119887319680.0, + "grad_norm": 1.8934668261725114, + "language_loss": 0.76207435, + "learning_rate": 4.976265513489701e-07, + "loss": 0.78399467, + "num_input_tokens_seen": 139250720, + "step": 6465, + "time_per_iteration": 2.429532527923584 + }, + { + "auxiliary_loss_clip": 0.01152782, + "auxiliary_loss_mlp": 0.01025056, + "balance_loss_clip": 1.04338121, + "balance_loss_mlp": 1.01783788, + "epoch": 0.7774905308723622, + "flos": 21718809331200.0, + "grad_norm": 1.99974499054222, + "language_loss": 0.80552113, + "learning_rate": 4.971124741004562e-07, + "loss": 0.82729948, + "num_input_tokens_seen": 139269720, + "step": 6466, + "time_per_iteration": 2.4732580184936523 + }, + { + "auxiliary_loss_clip": 0.01151409, + "auxiliary_loss_mlp": 0.01024122, + "balance_loss_clip": 1.04364777, + "balance_loss_mlp": 1.01693034, + "epoch": 0.7776107737630013, + "flos": 16034115093120.0, + "grad_norm": 1.7097872885097132, + "language_loss": 0.76375067, + "learning_rate": 4.965986248387846e-07, + "loss": 0.78550589, + "num_input_tokens_seen": 139288035, + "step": 6467, + "time_per_iteration": 3.162691831588745 + }, + { + "auxiliary_loss_clip": 0.01141832, + "auxiliary_loss_mlp": 0.01026265, + "balance_loss_clip": 1.04289043, + "balance_loss_mlp": 1.01923144, + "epoch": 0.7777310166536403, + "flos": 24790895838720.0, + "grad_norm": 1.6502068356564437, + "language_loss": 0.77183294, + "learning_rate": 4.960850036419073e-07, + "loss": 0.79351401, + "num_input_tokens_seen": 139307135, + "step": 6468, + "time_per_iteration": 3.2723405361175537 + }, + { + "auxiliary_loss_clip": 0.01134497, + "auxiliary_loss_mlp": 0.0102381, + "balance_loss_clip": 1.04266763, + "balance_loss_mlp": 1.01621079, + "epoch": 0.7778512595442795, + "flos": 17272530253440.0, + "grad_norm": 2.9775932480927274, + "language_loss": 0.7862258, + "learning_rate": 4.955716105877378e-07, + "loss": 0.80780888, + "num_input_tokens_seen": 139325905, + "step": 6469, + "time_per_iteration": 3.2113912105560303 + }, + { + "auxiliary_loss_clip": 0.01156891, + "auxiliary_loss_mlp": 0.00761958, + "balance_loss_clip": 1.04514074, + "balance_loss_mlp": 1.00056696, + "epoch": 0.7779715024349185, + "flos": 17748418567680.0, + "grad_norm": 1.73091788776466, + "language_loss": 0.83062065, + "learning_rate": 4.950584457541598e-07, + "loss": 0.84980917, + "num_input_tokens_seen": 139344370, + "step": 6470, + "time_per_iteration": 2.434086799621582 + }, + { + "auxiliary_loss_clip": 0.01155333, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.04534006, + "balance_loss_mlp": 1.02052844, + "epoch": 0.7780917453255576, + "flos": 24316875031680.0, + "grad_norm": 1.4088318238977846, + "language_loss": 0.81895661, + "learning_rate": 4.945455092190183e-07, + "loss": 0.84078383, + "num_input_tokens_seen": 139365625, + "step": 6471, + "time_per_iteration": 2.490785598754883 + }, + { + "auxiliary_loss_clip": 0.01063778, + "auxiliary_loss_mlp": 0.01001811, + "balance_loss_clip": 1.00692427, + "balance_loss_mlp": 1.00074375, + "epoch": 0.7782119882161967, + "flos": 56364601530240.0, + "grad_norm": 0.6764504662027041, + "language_loss": 0.56016165, + "learning_rate": 4.940328010601271e-07, + "loss": 0.58081758, + "num_input_tokens_seen": 139430540, + "step": 6472, + "time_per_iteration": 3.789824962615967 + }, + { + "auxiliary_loss_clip": 0.01153751, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.0513339, + "balance_loss_mlp": 1.023283, + "epoch": 0.7783322311068358, + "flos": 46789986994560.0, + "grad_norm": 1.737583735185946, + "language_loss": 0.76654077, + "learning_rate": 4.935203213552621e-07, + "loss": 0.78839302, + "num_input_tokens_seen": 139454280, + "step": 6473, + "time_per_iteration": 2.7093582153320312 + }, + { + "auxiliary_loss_clip": 0.01140595, + "auxiliary_loss_mlp": 0.01022974, + "balance_loss_clip": 1.0442822, + "balance_loss_mlp": 1.01506186, + "epoch": 0.7784524739974749, + "flos": 19057864872960.0, + "grad_norm": 1.969772661376738, + "language_loss": 0.6695568, + "learning_rate": 4.930080701821662e-07, + "loss": 0.69119251, + "num_input_tokens_seen": 139471745, + "step": 6474, + "time_per_iteration": 2.4762816429138184 + }, + { + "auxiliary_loss_clip": 0.01137886, + "auxiliary_loss_mlp": 0.01025231, + "balance_loss_clip": 1.0424931, + "balance_loss_mlp": 1.01808143, + "epoch": 0.778572716888114, + "flos": 24791111320320.0, + "grad_norm": 1.9161498820411906, + "language_loss": 0.77151269, + "learning_rate": 4.92496047618548e-07, + "loss": 0.79314387, + "num_input_tokens_seen": 139491505, + "step": 6475, + "time_per_iteration": 2.5464606285095215 + }, + { + "auxiliary_loss_clip": 0.0115901, + "auxiliary_loss_mlp": 0.01024659, + "balance_loss_clip": 1.04929638, + "balance_loss_mlp": 1.01745832, + "epoch": 0.7786929597787531, + "flos": 20078086867200.0, + "grad_norm": 2.2163599228382322, + "language_loss": 0.77623248, + "learning_rate": 4.919842537420811e-07, + "loss": 0.79806912, + "num_input_tokens_seen": 139508620, + "step": 6476, + "time_per_iteration": 2.4457318782806396 + }, + { + "auxiliary_loss_clip": 0.01142438, + "auxiliary_loss_mlp": 0.01028373, + "balance_loss_clip": 1.04837358, + "balance_loss_mlp": 1.02163148, + "epoch": 0.7788132026693921, + "flos": 21872220318720.0, + "grad_norm": 1.7950660203008102, + "language_loss": 0.79393023, + "learning_rate": 4.91472688630404e-07, + "loss": 0.8156383, + "num_input_tokens_seen": 139529360, + "step": 6477, + "time_per_iteration": 2.539797067642212 + }, + { + "auxiliary_loss_clip": 0.01166686, + "auxiliary_loss_mlp": 0.01021621, + "balance_loss_clip": 1.04703987, + "balance_loss_mlp": 1.01477885, + "epoch": 0.7789334455600313, + "flos": 11181937351680.0, + "grad_norm": 2.3703401821580807, + "language_loss": 0.74104834, + "learning_rate": 4.909613523611202e-07, + "loss": 0.76293141, + "num_input_tokens_seen": 139546240, + "step": 6478, + "time_per_iteration": 2.3992464542388916 + }, + { + "auxiliary_loss_clip": 0.01108718, + "auxiliary_loss_mlp": 0.00762166, + "balance_loss_clip": 1.03975654, + "balance_loss_mlp": 1.00062466, + "epoch": 0.7790536884506704, + "flos": 28695427015680.0, + "grad_norm": 1.8361757730682817, + "language_loss": 0.74547613, + "learning_rate": 4.904502450117991e-07, + "loss": 0.76418495, + "num_input_tokens_seen": 139567200, + "step": 6479, + "time_per_iteration": 2.6184113025665283 + }, + { + "auxiliary_loss_clip": 0.01137286, + "auxiliary_loss_mlp": 0.01025928, + "balance_loss_clip": 1.04688096, + "balance_loss_mlp": 1.01904678, + "epoch": 0.7791739313413094, + "flos": 11072302064640.0, + "grad_norm": 2.289965671886951, + "language_loss": 0.72479129, + "learning_rate": 4.899393666599762e-07, + "loss": 0.74642348, + "num_input_tokens_seen": 139583775, + "step": 6480, + "time_per_iteration": 2.4585535526275635 + }, + { + "auxiliary_loss_clip": 0.01165226, + "auxiliary_loss_mlp": 0.01019201, + "balance_loss_clip": 1.04417992, + "balance_loss_mlp": 1.01217914, + "epoch": 0.7792941742319486, + "flos": 14679276975360.0, + "grad_norm": 2.101567902994485, + "language_loss": 0.72548187, + "learning_rate": 4.894287173831506e-07, + "loss": 0.74732614, + "num_input_tokens_seen": 139599735, + "step": 6481, + "time_per_iteration": 2.3805530071258545 + }, + { + "auxiliary_loss_clip": 0.01141109, + "auxiliary_loss_mlp": 0.0102334, + "balance_loss_clip": 1.04286969, + "balance_loss_mlp": 1.01519227, + "epoch": 0.7794144171225876, + "flos": 23258874908160.0, + "grad_norm": 2.616386494863926, + "language_loss": 0.84325874, + "learning_rate": 4.889182972587877e-07, + "loss": 0.86490315, + "num_input_tokens_seen": 139619030, + "step": 6482, + "time_per_iteration": 2.4959867000579834 + }, + { + "auxiliary_loss_clip": 0.01134354, + "auxiliary_loss_mlp": 0.01025194, + "balance_loss_clip": 1.04426241, + "balance_loss_mlp": 1.01797032, + "epoch": 0.7795346600132267, + "flos": 21507080613120.0, + "grad_norm": 1.8025847853680415, + "language_loss": 0.66272974, + "learning_rate": 4.884081063643177e-07, + "loss": 0.68432522, + "num_input_tokens_seen": 139637690, + "step": 6483, + "time_per_iteration": 2.519819974899292 + }, + { + "auxiliary_loss_clip": 0.0103844, + "auxiliary_loss_mlp": 0.01002341, + "balance_loss_clip": 1.01057708, + "balance_loss_mlp": 1.0013932, + "epoch": 0.7796549029038659, + "flos": 70052273694720.0, + "grad_norm": 0.8636857386155322, + "language_loss": 0.52537054, + "learning_rate": 4.878981447771353e-07, + "loss": 0.54577833, + "num_input_tokens_seen": 139692070, + "step": 6484, + "time_per_iteration": 3.0751986503601074 + }, + { + "auxiliary_loss_clip": 0.01120481, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.04268885, + "balance_loss_mlp": 1.01618099, + "epoch": 0.7797751457945049, + "flos": 23989405714560.0, + "grad_norm": 1.5962640759167388, + "language_loss": 0.72957408, + "learning_rate": 4.873884125746035e-07, + "loss": 0.75102425, + "num_input_tokens_seen": 139713745, + "step": 6485, + "time_per_iteration": 2.552006483078003 + }, + { + "auxiliary_loss_clip": 0.01136265, + "auxiliary_loss_mlp": 0.01018947, + "balance_loss_clip": 1.0430789, + "balance_loss_mlp": 1.01153827, + "epoch": 0.779895388685144, + "flos": 22674751937280.0, + "grad_norm": 2.212191038361635, + "language_loss": 0.72008461, + "learning_rate": 4.868789098340456e-07, + "loss": 0.74163675, + "num_input_tokens_seen": 139731650, + "step": 6486, + "time_per_iteration": 2.488110065460205 + }, + { + "auxiliary_loss_clip": 0.01126431, + "auxiliary_loss_mlp": 0.01023447, + "balance_loss_clip": 1.04267228, + "balance_loss_mlp": 1.01644921, + "epoch": 0.7800156315757831, + "flos": 23768698596480.0, + "grad_norm": 2.778382099950242, + "language_loss": 0.73260778, + "learning_rate": 4.863696366327543e-07, + "loss": 0.75410658, + "num_input_tokens_seen": 139750820, + "step": 6487, + "time_per_iteration": 2.5381319522857666 + }, + { + "auxiliary_loss_clip": 0.01155669, + "auxiliary_loss_mlp": 0.01028044, + "balance_loss_clip": 1.043782, + "balance_loss_mlp": 1.02058148, + "epoch": 0.7801358744664222, + "flos": 26429714881920.0, + "grad_norm": 1.7200915315591512, + "language_loss": 0.78094709, + "learning_rate": 4.85860593047986e-07, + "loss": 0.80278426, + "num_input_tokens_seen": 139770885, + "step": 6488, + "time_per_iteration": 2.488224744796753 + }, + { + "auxiliary_loss_clip": 0.01118359, + "auxiliary_loss_mlp": 0.01024473, + "balance_loss_clip": 1.03833461, + "balance_loss_mlp": 1.01741552, + "epoch": 0.7802561173570612, + "flos": 26322162583680.0, + "grad_norm": 2.6915270016306883, + "language_loss": 0.74586201, + "learning_rate": 4.853517791569613e-07, + "loss": 0.76729035, + "num_input_tokens_seen": 139793065, + "step": 6489, + "time_per_iteration": 2.5715394020080566 + }, + { + "auxiliary_loss_clip": 0.01144656, + "auxiliary_loss_mlp": 0.0076218, + "balance_loss_clip": 1.04346621, + "balance_loss_mlp": 1.00052106, + "epoch": 0.7803763602477004, + "flos": 40333751596800.0, + "grad_norm": 1.7684767189977963, + "language_loss": 0.66201174, + "learning_rate": 4.848431950368684e-07, + "loss": 0.6810801, + "num_input_tokens_seen": 139815625, + "step": 6490, + "time_per_iteration": 2.7655155658721924 + }, + { + "auxiliary_loss_clip": 0.01063556, + "auxiliary_loss_mlp": 0.00752858, + "balance_loss_clip": 1.00694346, + "balance_loss_mlp": 1.00050437, + "epoch": 0.7804966031383395, + "flos": 67001448038400.0, + "grad_norm": 0.7024724596864153, + "language_loss": 0.55765891, + "learning_rate": 4.843348407648569e-07, + "loss": 0.57582307, + "num_input_tokens_seen": 139876905, + "step": 6491, + "time_per_iteration": 2.9732508659362793 + }, + { + "auxiliary_loss_clip": 0.01156257, + "auxiliary_loss_mlp": 0.01022986, + "balance_loss_clip": 1.04285872, + "balance_loss_mlp": 1.01494837, + "epoch": 0.7806168460289785, + "flos": 17740733057280.0, + "grad_norm": 2.3218481810861804, + "language_loss": 0.83255845, + "learning_rate": 4.838267164180457e-07, + "loss": 0.85435086, + "num_input_tokens_seen": 139892575, + "step": 6492, + "time_per_iteration": 2.418240547180176 + }, + { + "auxiliary_loss_clip": 0.0117019, + "auxiliary_loss_mlp": 0.01025348, + "balance_loss_clip": 1.04742384, + "balance_loss_mlp": 1.01721811, + "epoch": 0.7807370889196176, + "flos": 23946240545280.0, + "grad_norm": 1.9864428823735414, + "language_loss": 0.83827376, + "learning_rate": 4.833188220735156e-07, + "loss": 0.86022913, + "num_input_tokens_seen": 139912245, + "step": 6493, + "time_per_iteration": 3.215789794921875 + }, + { + "auxiliary_loss_clip": 0.01153532, + "auxiliary_loss_mlp": 0.01022997, + "balance_loss_clip": 1.04531431, + "balance_loss_mlp": 1.01585329, + "epoch": 0.7808573318102567, + "flos": 18989024457600.0, + "grad_norm": 2.173465564590116, + "language_loss": 0.74554074, + "learning_rate": 4.828111578083152e-07, + "loss": 0.76730597, + "num_input_tokens_seen": 139929150, + "step": 6494, + "time_per_iteration": 3.2077035903930664 + }, + { + "auxiliary_loss_clip": 0.01136468, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.04519033, + "balance_loss_mlp": 1.01853275, + "epoch": 0.7809775747008958, + "flos": 23980750536960.0, + "grad_norm": 2.0729309816107353, + "language_loss": 0.81348509, + "learning_rate": 4.823037236994556e-07, + "loss": 0.83511126, + "num_input_tokens_seen": 139947315, + "step": 6495, + "time_per_iteration": 2.5240280628204346 + }, + { + "auxiliary_loss_clip": 0.01054925, + "auxiliary_loss_mlp": 0.01001376, + "balance_loss_clip": 1.00700331, + "balance_loss_mlp": 1.00041628, + "epoch": 0.7810978175915348, + "flos": 68535875180160.0, + "grad_norm": 0.7179038631293561, + "language_loss": 0.56323373, + "learning_rate": 4.817965198239136e-07, + "loss": 0.58379674, + "num_input_tokens_seen": 140013775, + "step": 6496, + "time_per_iteration": 3.771411657333374 + }, + { + "auxiliary_loss_clip": 0.01125093, + "auxiliary_loss_mlp": 0.01025662, + "balance_loss_clip": 1.04080176, + "balance_loss_mlp": 1.0176456, + "epoch": 0.781218060482174, + "flos": 19642131498240.0, + "grad_norm": 2.581268407964621, + "language_loss": 0.7428025, + "learning_rate": 4.812895462586331e-07, + "loss": 0.76431012, + "num_input_tokens_seen": 140031600, + "step": 6497, + "time_per_iteration": 2.5075085163116455 + }, + { + "auxiliary_loss_clip": 0.01129625, + "auxiliary_loss_mlp": 0.0102393, + "balance_loss_clip": 1.04457784, + "balance_loss_mlp": 1.01704013, + "epoch": 0.7813383033728131, + "flos": 25627865621760.0, + "grad_norm": 1.6449793929468552, + "language_loss": 0.81844985, + "learning_rate": 4.807828030805207e-07, + "loss": 0.83998537, + "num_input_tokens_seen": 140050590, + "step": 6498, + "time_per_iteration": 3.2696897983551025 + }, + { + "auxiliary_loss_clip": 0.01149513, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.0448215, + "balance_loss_mlp": 1.02349734, + "epoch": 0.7814585462634521, + "flos": 20485924865280.0, + "grad_norm": 1.9396609011658374, + "language_loss": 0.67437828, + "learning_rate": 4.802762903664495e-07, + "loss": 0.69618493, + "num_input_tokens_seen": 140069770, + "step": 6499, + "time_per_iteration": 2.4508097171783447 + }, + { + "auxiliary_loss_clip": 0.0114891, + "auxiliary_loss_mlp": 0.01025603, + "balance_loss_clip": 1.04843867, + "balance_loss_mlp": 1.01754737, + "epoch": 0.7815787891540913, + "flos": 22304297018880.0, + "grad_norm": 3.4089366017673695, + "language_loss": 0.74049485, + "learning_rate": 4.797700081932565e-07, + "loss": 0.76223993, + "num_input_tokens_seen": 140087635, + "step": 6500, + "time_per_iteration": 2.481616735458374 + }, + { + "auxiliary_loss_clip": 0.0109111, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.03729129, + "balance_loss_mlp": 1.0215373, + "epoch": 0.7816990320447303, + "flos": 22600668136320.0, + "grad_norm": 2.219951054940637, + "language_loss": 0.81627697, + "learning_rate": 4.792639566377442e-07, + "loss": 0.83747458, + "num_input_tokens_seen": 140105045, + "step": 6501, + "time_per_iteration": 2.6180543899536133 + }, + { + "auxiliary_loss_clip": 0.01147669, + "auxiliary_loss_mlp": 0.01022954, + "balance_loss_clip": 1.04203916, + "balance_loss_mlp": 1.01544404, + "epoch": 0.7818192749353694, + "flos": 24935974871040.0, + "grad_norm": 1.8637565967802763, + "language_loss": 0.77541649, + "learning_rate": 4.78758135776681e-07, + "loss": 0.79712272, + "num_input_tokens_seen": 140124900, + "step": 6502, + "time_per_iteration": 2.4949791431427 + }, + { + "auxiliary_loss_clip": 0.01141604, + "auxiliary_loss_mlp": 0.01025866, + "balance_loss_clip": 1.0451901, + "balance_loss_mlp": 1.01887417, + "epoch": 0.7819395178260086, + "flos": 23733039369600.0, + "grad_norm": 1.9871547120512678, + "language_loss": 0.78929764, + "learning_rate": 4.782525456867989e-07, + "loss": 0.81097233, + "num_input_tokens_seen": 140143755, + "step": 6503, + "time_per_iteration": 2.5154826641082764 + }, + { + "auxiliary_loss_clip": 0.01128902, + "auxiliary_loss_mlp": 0.01025191, + "balance_loss_clip": 1.0446527, + "balance_loss_mlp": 1.01744795, + "epoch": 0.7820597607166476, + "flos": 23221671396480.0, + "grad_norm": 1.9529111448995238, + "language_loss": 0.83131325, + "learning_rate": 4.777471864447959e-07, + "loss": 0.85285413, + "num_input_tokens_seen": 140164495, + "step": 6504, + "time_per_iteration": 2.5425004959106445 + }, + { + "auxiliary_loss_clip": 0.01139833, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.04184115, + "balance_loss_mlp": 1.02306426, + "epoch": 0.7821800036072867, + "flos": 22309540404480.0, + "grad_norm": 1.985093598341568, + "language_loss": 0.80908436, + "learning_rate": 4.772420581273344e-07, + "loss": 0.83078623, + "num_input_tokens_seen": 140181980, + "step": 6505, + "time_per_iteration": 2.4924476146698 + }, + { + "auxiliary_loss_clip": 0.01149982, + "auxiliary_loss_mlp": 0.01021747, + "balance_loss_clip": 1.04555178, + "balance_loss_mlp": 1.01429677, + "epoch": 0.7823002464979258, + "flos": 21544176384000.0, + "grad_norm": 2.092423338646906, + "language_loss": 0.76476848, + "learning_rate": 4.7673716081104134e-07, + "loss": 0.78648573, + "num_input_tokens_seen": 140202155, + "step": 6506, + "time_per_iteration": 2.4859354496002197 + }, + { + "auxiliary_loss_clip": 0.01153394, + "auxiliary_loss_mlp": 0.01023232, + "balance_loss_clip": 1.04748213, + "balance_loss_mlp": 1.01601696, + "epoch": 0.7824204893885649, + "flos": 24535642815360.0, + "grad_norm": 1.8570335938533054, + "language_loss": 0.84367692, + "learning_rate": 4.762324945725109e-07, + "loss": 0.86544323, + "num_input_tokens_seen": 140221600, + "step": 6507, + "time_per_iteration": 2.50003981590271 + }, + { + "auxiliary_loss_clip": 0.01136347, + "auxiliary_loss_mlp": 0.01026483, + "balance_loss_clip": 1.04714179, + "balance_loss_mlp": 1.01932144, + "epoch": 0.782540732279204, + "flos": 27415211402880.0, + "grad_norm": 1.697684765702161, + "language_loss": 0.76099592, + "learning_rate": 4.7572805948829844e-07, + "loss": 0.78262424, + "num_input_tokens_seen": 140241860, + "step": 6508, + "time_per_iteration": 2.5362980365753174 + }, + { + "auxiliary_loss_clip": 0.01116127, + "auxiliary_loss_mlp": 0.01021226, + "balance_loss_clip": 1.04231548, + "balance_loss_mlp": 1.01427031, + "epoch": 0.7826609751698431, + "flos": 24353216616960.0, + "grad_norm": 2.4526686121143584, + "language_loss": 0.71187508, + "learning_rate": 4.7522385563492795e-07, + "loss": 0.73324859, + "num_input_tokens_seen": 140262160, + "step": 6509, + "time_per_iteration": 2.5776922702789307 + }, + { + "auxiliary_loss_clip": 0.01129442, + "auxiliary_loss_mlp": 0.01024917, + "balance_loss_clip": 1.0450418, + "balance_loss_mlp": 1.01769876, + "epoch": 0.7827812180604822, + "flos": 23988543788160.0, + "grad_norm": 2.3085818149852697, + "language_loss": 0.70279998, + "learning_rate": 4.747198830888863e-07, + "loss": 0.7243436, + "num_input_tokens_seen": 140282030, + "step": 6510, + "time_per_iteration": 2.542556047439575 + }, + { + "auxiliary_loss_clip": 0.0113431, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.04332089, + "balance_loss_mlp": 1.01867104, + "epoch": 0.7829014609511212, + "flos": 27454318335360.0, + "grad_norm": 2.0786260567602413, + "language_loss": 0.68448889, + "learning_rate": 4.742161419266251e-07, + "loss": 0.70608914, + "num_input_tokens_seen": 140301190, + "step": 6511, + "time_per_iteration": 2.533812999725342 + }, + { + "auxiliary_loss_clip": 0.0115938, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.04653192, + "balance_loss_mlp": 1.02127314, + "epoch": 0.7830217038417604, + "flos": 29204532432000.0, + "grad_norm": 2.5179695794849195, + "language_loss": 0.64948958, + "learning_rate": 4.7371263222456304e-07, + "loss": 0.67137873, + "num_input_tokens_seen": 140318510, + "step": 6512, + "time_per_iteration": 2.5093700885772705 + }, + { + "auxiliary_loss_clip": 0.0105104, + "auxiliary_loss_mlp": 0.01000941, + "balance_loss_clip": 1.00775051, + "balance_loss_mlp": 0.9999817, + "epoch": 0.7831419467323995, + "flos": 60950895822720.0, + "grad_norm": 0.7935131463940933, + "language_loss": 0.61357617, + "learning_rate": 4.7320935405908004e-07, + "loss": 0.63409597, + "num_input_tokens_seen": 140379380, + "step": 6513, + "time_per_iteration": 3.013005495071411 + }, + { + "auxiliary_loss_clip": 0.01170584, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.04703617, + "balance_loss_mlp": 1.01954973, + "epoch": 0.7832621896230385, + "flos": 19682531320320.0, + "grad_norm": 2.171754446130169, + "language_loss": 0.83812535, + "learning_rate": 4.7270630750652475e-07, + "loss": 0.86010683, + "num_input_tokens_seen": 140395335, + "step": 6514, + "time_per_iteration": 2.4467267990112305 + }, + { + "auxiliary_loss_clip": 0.01151416, + "auxiliary_loss_mlp": 0.01023228, + "balance_loss_clip": 1.04324412, + "balance_loss_mlp": 1.01628089, + "epoch": 0.7833824325136777, + "flos": 25009232659200.0, + "grad_norm": 1.6775227822685403, + "language_loss": 0.80599546, + "learning_rate": 4.7220349264320746e-07, + "loss": 0.82774186, + "num_input_tokens_seen": 140414420, + "step": 6515, + "time_per_iteration": 2.4748499393463135 + }, + { + "auxiliary_loss_clip": 0.01053784, + "auxiliary_loss_mlp": 0.01001458, + "balance_loss_clip": 1.00820065, + "balance_loss_mlp": 1.00048053, + "epoch": 0.7835026754043167, + "flos": 68800142517120.0, + "grad_norm": 0.7374686853232519, + "language_loss": 0.54941583, + "learning_rate": 4.71700909545407e-07, + "loss": 0.56996822, + "num_input_tokens_seen": 140477365, + "step": 6516, + "time_per_iteration": 3.032658576965332 + }, + { + "auxiliary_loss_clip": 0.01156351, + "auxiliary_loss_mlp": 0.01022332, + "balance_loss_clip": 1.04581094, + "balance_loss_mlp": 1.01513445, + "epoch": 0.7836229182949558, + "flos": 19864598382720.0, + "grad_norm": 2.0185495183221867, + "language_loss": 0.76978922, + "learning_rate": 4.711985582893627e-07, + "loss": 0.79157603, + "num_input_tokens_seen": 140495885, + "step": 6517, + "time_per_iteration": 2.4440805912017822 + }, + { + "auxiliary_loss_clip": 0.01114046, + "auxiliary_loss_mlp": 0.01024745, + "balance_loss_clip": 1.04005313, + "balance_loss_mlp": 1.01729476, + "epoch": 0.783743161185595, + "flos": 22965843755520.0, + "grad_norm": 1.9467017351983467, + "language_loss": 0.71543038, + "learning_rate": 4.706964389512811e-07, + "loss": 0.73681831, + "num_input_tokens_seen": 140515920, + "step": 6518, + "time_per_iteration": 2.5756888389587402 + }, + { + "auxiliary_loss_clip": 0.01167991, + "auxiliary_loss_mlp": 0.01022292, + "balance_loss_clip": 1.04848695, + "balance_loss_mlp": 1.01507092, + "epoch": 0.783863404076234, + "flos": 12458489777280.0, + "grad_norm": 1.9208230854065618, + "language_loss": 0.8748405, + "learning_rate": 4.701945516073345e-07, + "loss": 0.8967433, + "num_input_tokens_seen": 140533395, + "step": 6519, + "time_per_iteration": 2.3934237957000732 + }, + { + "auxiliary_loss_clip": 0.01123191, + "auxiliary_loss_mlp": 0.01021391, + "balance_loss_clip": 1.04345262, + "balance_loss_mlp": 1.01456058, + "epoch": 0.7839836469668731, + "flos": 24243940465920.0, + "grad_norm": 1.8101148074220776, + "language_loss": 0.74848801, + "learning_rate": 4.696928963336577e-07, + "loss": 0.76993382, + "num_input_tokens_seen": 140552825, + "step": 6520, + "time_per_iteration": 3.3401432037353516 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01000434, + "balance_loss_clip": 1.00736141, + "balance_loss_mlp": 0.99946803, + "epoch": 0.7841038898575122, + "flos": 62121978938880.0, + "grad_norm": 0.8561641324495509, + "language_loss": 0.61026061, + "learning_rate": 4.6919147320635224e-07, + "loss": 0.63076985, + "num_input_tokens_seen": 140615535, + "step": 6521, + "time_per_iteration": 3.737170457839966 + }, + { + "auxiliary_loss_clip": 0.01156055, + "auxiliary_loss_mlp": 0.01024254, + "balance_loss_clip": 1.04472005, + "balance_loss_mlp": 1.01747394, + "epoch": 0.7842241327481513, + "flos": 20193899293440.0, + "grad_norm": 2.5165819268931813, + "language_loss": 0.72982079, + "learning_rate": 4.6869028230148286e-07, + "loss": 0.75162387, + "num_input_tokens_seen": 140633330, + "step": 6522, + "time_per_iteration": 2.4456734657287598 + }, + { + "auxiliary_loss_clip": 0.01119514, + "auxiliary_loss_mlp": 0.0102394, + "balance_loss_clip": 1.03926623, + "balance_loss_mlp": 1.01613748, + "epoch": 0.7843443756387903, + "flos": 28074531496320.0, + "grad_norm": 2.2456714217907385, + "language_loss": 0.5961287, + "learning_rate": 4.6818932369507957e-07, + "loss": 0.61756325, + "num_input_tokens_seen": 140652830, + "step": 6523, + "time_per_iteration": 3.237525224685669 + }, + { + "auxiliary_loss_clip": 0.01155099, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.04799139, + "balance_loss_mlp": 1.01993823, + "epoch": 0.7844646185294295, + "flos": 21323397438720.0, + "grad_norm": 2.4661319809158417, + "language_loss": 0.89160883, + "learning_rate": 4.676885974631386e-07, + "loss": 0.91343164, + "num_input_tokens_seen": 140671190, + "step": 6524, + "time_per_iteration": 2.4528040885925293 + }, + { + "auxiliary_loss_clip": 0.01155061, + "auxiliary_loss_mlp": 0.01023409, + "balance_loss_clip": 1.04589665, + "balance_loss_mlp": 1.01630688, + "epoch": 0.7845848614200686, + "flos": 23656585271040.0, + "grad_norm": 2.181560213740823, + "language_loss": 0.81009936, + "learning_rate": 4.67188103681619e-07, + "loss": 0.83188403, + "num_input_tokens_seen": 140690975, + "step": 6525, + "time_per_iteration": 3.195817470550537 + }, + { + "auxiliary_loss_clip": 0.01151356, + "auxiliary_loss_mlp": 0.00761766, + "balance_loss_clip": 1.04776192, + "balance_loss_mlp": 1.00059783, + "epoch": 0.7847051043107076, + "flos": 23402194174080.0, + "grad_norm": 2.050917824261166, + "language_loss": 0.68993479, + "learning_rate": 4.666878424264453e-07, + "loss": 0.70906603, + "num_input_tokens_seen": 140710930, + "step": 6526, + "time_per_iteration": 2.5048131942749023 + }, + { + "auxiliary_loss_clip": 0.01133157, + "auxiliary_loss_mlp": 0.01017939, + "balance_loss_clip": 1.04479361, + "balance_loss_mlp": 1.01155233, + "epoch": 0.7848253472013467, + "flos": 19022277473280.0, + "grad_norm": 1.697863156926212, + "language_loss": 0.7362386, + "learning_rate": 4.661878137735069e-07, + "loss": 0.75774956, + "num_input_tokens_seen": 140729120, + "step": 6527, + "time_per_iteration": 2.4772751331329346 + }, + { + "auxiliary_loss_clip": 0.01139667, + "auxiliary_loss_mlp": 0.01022795, + "balance_loss_clip": 1.04431534, + "balance_loss_mlp": 1.01575899, + "epoch": 0.7849455900919858, + "flos": 21179180332800.0, + "grad_norm": 4.06657296430176, + "language_loss": 0.74866068, + "learning_rate": 4.656880177986571e-07, + "loss": 0.77028537, + "num_input_tokens_seen": 140747665, + "step": 6528, + "time_per_iteration": 2.4848623275756836 + }, + { + "auxiliary_loss_clip": 0.01141919, + "auxiliary_loss_mlp": 0.01023022, + "balance_loss_clip": 1.04242444, + "balance_loss_mlp": 1.01551235, + "epoch": 0.7850658329826249, + "flos": 19536482620800.0, + "grad_norm": 2.3142759365633547, + "language_loss": 0.81645358, + "learning_rate": 4.6518845457771607e-07, + "loss": 0.83810294, + "num_input_tokens_seen": 140766525, + "step": 6529, + "time_per_iteration": 2.5127639770507812 + }, + { + "auxiliary_loss_clip": 0.01147431, + "auxiliary_loss_mlp": 0.00761856, + "balance_loss_clip": 1.04466343, + "balance_loss_mlp": 1.00065386, + "epoch": 0.7851860758732639, + "flos": 12495334152960.0, + "grad_norm": 3.0653034474262464, + "language_loss": 0.79253471, + "learning_rate": 4.646891241864652e-07, + "loss": 0.81162763, + "num_input_tokens_seen": 140785090, + "step": 6530, + "time_per_iteration": 2.438866376876831 + }, + { + "auxiliary_loss_clip": 0.01152637, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.04421735, + "balance_loss_mlp": 1.01931691, + "epoch": 0.7853063187639031, + "flos": 22960959505920.0, + "grad_norm": 1.793546502058425, + "language_loss": 0.73328233, + "learning_rate": 4.6419002670065397e-07, + "loss": 0.75508153, + "num_input_tokens_seen": 140804670, + "step": 6531, + "time_per_iteration": 2.462233543395996 + }, + { + "auxiliary_loss_clip": 0.01130351, + "auxiliary_loss_mlp": 0.0102606, + "balance_loss_clip": 1.04471827, + "balance_loss_mlp": 1.01831138, + "epoch": 0.7854265616545422, + "flos": 17347260499200.0, + "grad_norm": 4.007475392499036, + "language_loss": 0.8635428, + "learning_rate": 4.6369116219599445e-07, + "loss": 0.88510686, + "num_input_tokens_seen": 140820655, + "step": 6532, + "time_per_iteration": 2.4766199588775635 + }, + { + "auxiliary_loss_clip": 0.01122624, + "auxiliary_loss_mlp": 0.0102116, + "balance_loss_clip": 1.04184318, + "balance_loss_mlp": 1.01431441, + "epoch": 0.7855468045451812, + "flos": 23838293197440.0, + "grad_norm": 1.9043528493818493, + "language_loss": 0.79242909, + "learning_rate": 4.631925307481637e-07, + "loss": 0.81386691, + "num_input_tokens_seen": 140840470, + "step": 6533, + "time_per_iteration": 2.54259991645813 + }, + { + "auxiliary_loss_clip": 0.0113851, + "auxiliary_loss_mlp": 0.01023973, + "balance_loss_clip": 1.0458411, + "balance_loss_mlp": 1.01702571, + "epoch": 0.7856670474358204, + "flos": 25666792986240.0, + "grad_norm": 2.064215665197447, + "language_loss": 0.75823832, + "learning_rate": 4.6269413243280533e-07, + "loss": 0.77986324, + "num_input_tokens_seen": 140859890, + "step": 6534, + "time_per_iteration": 2.539626121520996 + }, + { + "auxiliary_loss_clip": 0.01144276, + "auxiliary_loss_mlp": 0.0102328, + "balance_loss_clip": 1.04691076, + "balance_loss_mlp": 1.01520944, + "epoch": 0.7857872903264594, + "flos": 18144656472960.0, + "grad_norm": 2.6337780111628804, + "language_loss": 0.73777151, + "learning_rate": 4.621959673255236e-07, + "loss": 0.7594471, + "num_input_tokens_seen": 140876190, + "step": 6535, + "time_per_iteration": 2.4725847244262695 + }, + { + "auxiliary_loss_clip": 0.01110429, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.04097307, + "balance_loss_mlp": 1.02068508, + "epoch": 0.7859075332170985, + "flos": 14386138081920.0, + "grad_norm": 2.3966597057492547, + "language_loss": 0.90371412, + "learning_rate": 4.6169803550189135e-07, + "loss": 0.92509788, + "num_input_tokens_seen": 140891885, + "step": 6536, + "time_per_iteration": 2.5499589443206787 + }, + { + "auxiliary_loss_clip": 0.01104164, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.04184484, + "balance_loss_mlp": 1.02095759, + "epoch": 0.7860277761077377, + "flos": 19864059678720.0, + "grad_norm": 2.019413673245359, + "language_loss": 0.77369922, + "learning_rate": 4.6120033703744355e-07, + "loss": 0.79502964, + "num_input_tokens_seen": 140910780, + "step": 6537, + "time_per_iteration": 2.537594795227051 + }, + { + "auxiliary_loss_clip": 0.01129075, + "auxiliary_loss_mlp": 0.01021749, + "balance_loss_clip": 1.04220092, + "balance_loss_mlp": 1.01464415, + "epoch": 0.7861480189983767, + "flos": 26396174557440.0, + "grad_norm": 2.4911475828671987, + "language_loss": 0.78232157, + "learning_rate": 4.607028720076822e-07, + "loss": 0.80382979, + "num_input_tokens_seen": 140927460, + "step": 6538, + "time_per_iteration": 2.530602216720581 + }, + { + "auxiliary_loss_clip": 0.01154529, + "auxiliary_loss_mlp": 0.01024225, + "balance_loss_clip": 1.04591179, + "balance_loss_mlp": 1.01705194, + "epoch": 0.7862682618890158, + "flos": 24236578177920.0, + "grad_norm": 2.803066535644366, + "language_loss": 0.73565102, + "learning_rate": 4.6020564048807074e-07, + "loss": 0.7574386, + "num_input_tokens_seen": 140945135, + "step": 6539, + "time_per_iteration": 2.557053327560425 + }, + { + "auxiliary_loss_clip": 0.0115738, + "auxiliary_loss_mlp": 0.01024607, + "balance_loss_clip": 1.04643631, + "balance_loss_mlp": 1.01703107, + "epoch": 0.7863885047796549, + "flos": 47551508259840.0, + "grad_norm": 3.110233705729916, + "language_loss": 0.7162807, + "learning_rate": 4.5970864255403883e-07, + "loss": 0.73810053, + "num_input_tokens_seen": 140966660, + "step": 6540, + "time_per_iteration": 2.667836904525757 + }, + { + "auxiliary_loss_clip": 0.01143497, + "auxiliary_loss_mlp": 0.0102151, + "balance_loss_clip": 1.04467058, + "balance_loss_mlp": 1.0147717, + "epoch": 0.786508747670294, + "flos": 24389234979840.0, + "grad_norm": 4.44449131956024, + "language_loss": 0.81756663, + "learning_rate": 4.59211878280982e-07, + "loss": 0.83921671, + "num_input_tokens_seen": 140986175, + "step": 6541, + "time_per_iteration": 2.4989147186279297 + }, + { + "auxiliary_loss_clip": 0.01140315, + "auxiliary_loss_mlp": 0.01021478, + "balance_loss_clip": 1.04448628, + "balance_loss_mlp": 1.01443279, + "epoch": 0.786628990560933, + "flos": 18041234238720.0, + "grad_norm": 2.3781939562900614, + "language_loss": 0.69685292, + "learning_rate": 4.587153477442578e-07, + "loss": 0.71847081, + "num_input_tokens_seen": 141002490, + "step": 6542, + "time_per_iteration": 2.500081777572632 + }, + { + "auxiliary_loss_clip": 0.01170735, + "auxiliary_loss_mlp": 0.01026929, + "balance_loss_clip": 1.04787755, + "balance_loss_mlp": 1.01904321, + "epoch": 0.7867492334515722, + "flos": 25848860048640.0, + "grad_norm": 3.42296378443156, + "language_loss": 0.81125379, + "learning_rate": 4.582190510191899e-07, + "loss": 0.83323038, + "num_input_tokens_seen": 141021150, + "step": 6543, + "time_per_iteration": 2.4447834491729736 + }, + { + "auxiliary_loss_clip": 0.01123157, + "auxiliary_loss_mlp": 0.01023691, + "balance_loss_clip": 1.04455125, + "balance_loss_mlp": 1.01667833, + "epoch": 0.7868694763422113, + "flos": 16580819070720.0, + "grad_norm": 2.4898646512338445, + "language_loss": 0.87235892, + "learning_rate": 4.5772298818106625e-07, + "loss": 0.89382744, + "num_input_tokens_seen": 141036940, + "step": 6544, + "time_per_iteration": 2.481004238128662 + }, + { + "auxiliary_loss_clip": 0.01131855, + "auxiliary_loss_mlp": 0.01028807, + "balance_loss_clip": 1.04501319, + "balance_loss_mlp": 1.02099299, + "epoch": 0.7869897192328503, + "flos": 29386276272000.0, + "grad_norm": 2.779293343694073, + "language_loss": 0.72266698, + "learning_rate": 4.572271593051384e-07, + "loss": 0.74427354, + "num_input_tokens_seen": 141054295, + "step": 6545, + "time_per_iteration": 2.5727338790893555 + }, + { + "auxiliary_loss_clip": 0.0110475, + "auxiliary_loss_mlp": 0.01023758, + "balance_loss_clip": 1.04231977, + "balance_loss_mlp": 1.01674891, + "epoch": 0.7871099621234895, + "flos": 17128923678720.0, + "grad_norm": 1.931838890106836, + "language_loss": 0.77950191, + "learning_rate": 4.567315644666245e-07, + "loss": 0.80078697, + "num_input_tokens_seen": 141073090, + "step": 6546, + "time_per_iteration": 2.525733709335327 + }, + { + "auxiliary_loss_clip": 0.01119102, + "auxiliary_loss_mlp": 0.01022947, + "balance_loss_clip": 1.04363954, + "balance_loss_mlp": 1.01586294, + "epoch": 0.7872302050141285, + "flos": 23440187784960.0, + "grad_norm": 1.9848675843764803, + "language_loss": 0.84561861, + "learning_rate": 4.5623620374070507e-07, + "loss": 0.86703908, + "num_input_tokens_seen": 141092405, + "step": 6547, + "time_per_iteration": 3.3150291442871094 + }, + { + "auxiliary_loss_clip": 0.01030332, + "auxiliary_loss_mlp": 0.01000676, + "balance_loss_clip": 1.00674021, + "balance_loss_mlp": 0.99951982, + "epoch": 0.7873504479047676, + "flos": 65959752689280.0, + "grad_norm": 0.7603942400150472, + "language_loss": 0.58329546, + "learning_rate": 4.557410772025263e-07, + "loss": 0.60360551, + "num_input_tokens_seen": 141154355, + "step": 6548, + "time_per_iteration": 3.9526214599609375 + }, + { + "auxiliary_loss_clip": 0.01135844, + "auxiliary_loss_mlp": 0.01026856, + "balance_loss_clip": 1.04265356, + "balance_loss_mlp": 1.01945019, + "epoch": 0.7874706907954068, + "flos": 23258336204160.0, + "grad_norm": 1.8231658288246468, + "language_loss": 0.6619457, + "learning_rate": 4.5524618492719803e-07, + "loss": 0.68357277, + "num_input_tokens_seen": 141173575, + "step": 6549, + "time_per_iteration": 2.514491081237793 + }, + { + "auxiliary_loss_clip": 0.01153441, + "auxiliary_loss_mlp": 0.01022784, + "balance_loss_clip": 1.04455161, + "balance_loss_mlp": 1.01566076, + "epoch": 0.7875909336860458, + "flos": 28767786963840.0, + "grad_norm": 1.6824737063457895, + "language_loss": 0.79155552, + "learning_rate": 4.54751526989795e-07, + "loss": 0.81331778, + "num_input_tokens_seen": 141195415, + "step": 6550, + "time_per_iteration": 3.2601840496063232 + }, + { + "auxiliary_loss_clip": 0.01157682, + "auxiliary_loss_mlp": 0.01026188, + "balance_loss_clip": 1.0471158, + "balance_loss_mlp": 1.01882076, + "epoch": 0.7877111765766849, + "flos": 18697286194560.0, + "grad_norm": 2.0127734211375015, + "language_loss": 0.79153407, + "learning_rate": 4.5425710346535775e-07, + "loss": 0.81337273, + "num_input_tokens_seen": 141213360, + "step": 6551, + "time_per_iteration": 3.228074073791504 + }, + { + "auxiliary_loss_clip": 0.01156457, + "auxiliary_loss_mlp": 0.01026407, + "balance_loss_clip": 1.04586101, + "balance_loss_mlp": 1.01826537, + "epoch": 0.787831419467324, + "flos": 27592968833280.0, + "grad_norm": 1.9968169700813414, + "language_loss": 0.81482816, + "learning_rate": 4.537629144288877e-07, + "loss": 0.83665675, + "num_input_tokens_seen": 141230815, + "step": 6552, + "time_per_iteration": 2.5073390007019043 + }, + { + "auxiliary_loss_clip": 0.01118307, + "auxiliary_loss_mlp": 0.01025056, + "balance_loss_clip": 1.04008138, + "balance_loss_mlp": 1.01776946, + "epoch": 0.7879516623579631, + "flos": 18150187167360.0, + "grad_norm": 3.856506081174398, + "language_loss": 0.74670124, + "learning_rate": 4.5326895995535477e-07, + "loss": 0.76813495, + "num_input_tokens_seen": 141249715, + "step": 6553, + "time_per_iteration": 2.558450937271118 + }, + { + "auxiliary_loss_clip": 0.01150456, + "auxiliary_loss_mlp": 0.01027082, + "balance_loss_clip": 1.04529965, + "balance_loss_mlp": 1.0196141, + "epoch": 0.7880719052486022, + "flos": 20339193807360.0, + "grad_norm": 2.9538901055236586, + "language_loss": 0.83999276, + "learning_rate": 4.527752401196907e-07, + "loss": 0.86176813, + "num_input_tokens_seen": 141267730, + "step": 6554, + "time_per_iteration": 2.4468724727630615 + }, + { + "auxiliary_loss_clip": 0.01132639, + "auxiliary_loss_mlp": 0.01026619, + "balance_loss_clip": 1.04252052, + "balance_loss_mlp": 1.01885223, + "epoch": 0.7881921481392413, + "flos": 21653237053440.0, + "grad_norm": 1.766061447054303, + "language_loss": 0.67096859, + "learning_rate": 4.5228175499679254e-07, + "loss": 0.69256121, + "num_input_tokens_seen": 141287315, + "step": 6555, + "time_per_iteration": 2.48754620552063 + }, + { + "auxiliary_loss_clip": 0.01053806, + "auxiliary_loss_mlp": 0.01001658, + "balance_loss_clip": 1.00747085, + "balance_loss_mlp": 1.00068688, + "epoch": 0.7883123910298804, + "flos": 68565860058240.0, + "grad_norm": 0.8286054280456369, + "language_loss": 0.54504192, + "learning_rate": 4.5178850466152174e-07, + "loss": 0.56559652, + "num_input_tokens_seen": 141346145, + "step": 6556, + "time_per_iteration": 3.093440532684326 + }, + { + "auxiliary_loss_clip": 0.01133253, + "auxiliary_loss_mlp": 0.01021974, + "balance_loss_clip": 1.04054356, + "balance_loss_mlp": 1.01500034, + "epoch": 0.7884326339205194, + "flos": 19318217627520.0, + "grad_norm": 2.500025124497533, + "language_loss": 0.8199982, + "learning_rate": 4.512954891887031e-07, + "loss": 0.84155047, + "num_input_tokens_seen": 141364445, + "step": 6557, + "time_per_iteration": 2.5008420944213867 + }, + { + "auxiliary_loss_clip": 0.01134146, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.04428816, + "balance_loss_mlp": 1.0201149, + "epoch": 0.7885528768111585, + "flos": 17784903807360.0, + "grad_norm": 2.5554483942821284, + "language_loss": 0.83216089, + "learning_rate": 4.5080270865312806e-07, + "loss": 0.85378039, + "num_input_tokens_seen": 141381640, + "step": 6558, + "time_per_iteration": 2.454123020172119 + }, + { + "auxiliary_loss_clip": 0.01152243, + "auxiliary_loss_mlp": 0.01023502, + "balance_loss_clip": 1.04442191, + "balance_loss_mlp": 1.01654625, + "epoch": 0.7886731197017977, + "flos": 18807639753600.0, + "grad_norm": 2.0915759808728773, + "language_loss": 0.71285355, + "learning_rate": 4.5031016312954985e-07, + "loss": 0.73461103, + "num_input_tokens_seen": 141399955, + "step": 6559, + "time_per_iteration": 2.440627098083496 + }, + { + "auxiliary_loss_clip": 0.01164388, + "auxiliary_loss_mlp": 0.01023796, + "balance_loss_clip": 1.04954326, + "balance_loss_mlp": 1.01630998, + "epoch": 0.7887933625924367, + "flos": 33365358126720.0, + "grad_norm": 2.13015146394723, + "language_loss": 0.74311113, + "learning_rate": 4.498178526926886e-07, + "loss": 0.76499295, + "num_input_tokens_seen": 141420820, + "step": 6560, + "time_per_iteration": 2.5458850860595703 + }, + { + "auxiliary_loss_clip": 0.01168122, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.04830694, + "balance_loss_mlp": 1.0231595, + "epoch": 0.7889136054830758, + "flos": 17019360218880.0, + "grad_norm": 2.095279278638362, + "language_loss": 0.72210521, + "learning_rate": 4.4932577741722635e-07, + "loss": 0.74408787, + "num_input_tokens_seen": 141439350, + "step": 6561, + "time_per_iteration": 2.3998992443084717 + }, + { + "auxiliary_loss_clip": 0.01137929, + "auxiliary_loss_mlp": 0.01027944, + "balance_loss_clip": 1.04414761, + "balance_loss_mlp": 1.02012348, + "epoch": 0.7890338483737149, + "flos": 29424629018880.0, + "grad_norm": 1.6599028420957787, + "language_loss": 0.74080682, + "learning_rate": 4.4883393737780985e-07, + "loss": 0.76246554, + "num_input_tokens_seen": 141460300, + "step": 6562, + "time_per_iteration": 2.540412425994873 + }, + { + "auxiliary_loss_clip": 0.01148303, + "auxiliary_loss_mlp": 0.01023342, + "balance_loss_clip": 1.04372203, + "balance_loss_mlp": 1.0159297, + "epoch": 0.789154091264354, + "flos": 19971576063360.0, + "grad_norm": 1.8374867576945821, + "language_loss": 0.78511012, + "learning_rate": 4.4834233264905254e-07, + "loss": 0.80682659, + "num_input_tokens_seen": 141477315, + "step": 6563, + "time_per_iteration": 2.4731028079986572 + }, + { + "auxiliary_loss_clip": 0.01118416, + "auxiliary_loss_mlp": 0.01028697, + "balance_loss_clip": 1.04086745, + "balance_loss_mlp": 1.02121937, + "epoch": 0.789274334154993, + "flos": 14537825216640.0, + "grad_norm": 2.776802443908304, + "language_loss": 0.71241444, + "learning_rate": 4.478509633055294e-07, + "loss": 0.73388565, + "num_input_tokens_seen": 141495025, + "step": 6564, + "time_per_iteration": 2.4903950691223145 + }, + { + "auxiliary_loss_clip": 0.01145027, + "auxiliary_loss_mlp": 0.01027415, + "balance_loss_clip": 1.04587698, + "balance_loss_mlp": 1.0195291, + "epoch": 0.7893945770456322, + "flos": 21827403123840.0, + "grad_norm": 2.7334539672735807, + "language_loss": 0.80485952, + "learning_rate": 4.473598294217813e-07, + "loss": 0.82658398, + "num_input_tokens_seen": 141510450, + "step": 6565, + "time_per_iteration": 2.481294631958008 + }, + { + "auxiliary_loss_clip": 0.01151612, + "auxiliary_loss_mlp": 0.0102342, + "balance_loss_clip": 1.04618466, + "balance_loss_mlp": 1.01645184, + "epoch": 0.7895148199362713, + "flos": 20740639184640.0, + "grad_norm": 2.8489956896143376, + "language_loss": 0.71510702, + "learning_rate": 4.468689310723124e-07, + "loss": 0.73685735, + "num_input_tokens_seen": 141528265, + "step": 6566, + "time_per_iteration": 2.4411847591400146 + }, + { + "auxiliary_loss_clip": 0.01129741, + "auxiliary_loss_mlp": 0.01026255, + "balance_loss_clip": 1.04323733, + "balance_loss_mlp": 1.01911497, + "epoch": 0.7896350628269103, + "flos": 16690669839360.0, + "grad_norm": 2.3647506627634334, + "language_loss": 0.7847932, + "learning_rate": 4.463782683315913e-07, + "loss": 0.80635321, + "num_input_tokens_seen": 141547270, + "step": 6567, + "time_per_iteration": 2.5033419132232666 + }, + { + "auxiliary_loss_clip": 0.01166508, + "auxiliary_loss_mlp": 0.01024322, + "balance_loss_clip": 1.0471127, + "balance_loss_mlp": 1.01748538, + "epoch": 0.7897553057175495, + "flos": 22638374438400.0, + "grad_norm": 1.7514050056128, + "language_loss": 0.73141271, + "learning_rate": 4.458878412740523e-07, + "loss": 0.75332093, + "num_input_tokens_seen": 141566050, + "step": 6568, + "time_per_iteration": 2.4250056743621826 + }, + { + "auxiliary_loss_clip": 0.01149951, + "auxiliary_loss_mlp": 0.0102323, + "balance_loss_clip": 1.04492581, + "balance_loss_mlp": 1.01589298, + "epoch": 0.7898755486081885, + "flos": 14537573821440.0, + "grad_norm": 3.264450981805543, + "language_loss": 0.77860934, + "learning_rate": 4.453976499740919e-07, + "loss": 0.80034113, + "num_input_tokens_seen": 141583695, + "step": 6569, + "time_per_iteration": 2.446577310562134 + }, + { + "auxiliary_loss_clip": 0.01150193, + "auxiliary_loss_mlp": 0.01026364, + "balance_loss_clip": 1.04591775, + "balance_loss_mlp": 1.01915157, + "epoch": 0.7899957914988276, + "flos": 17238487138560.0, + "grad_norm": 1.7578185557216295, + "language_loss": 0.77680451, + "learning_rate": 4.4490769450607215e-07, + "loss": 0.79857004, + "num_input_tokens_seen": 141601320, + "step": 6570, + "time_per_iteration": 2.430979013442993 + }, + { + "auxiliary_loss_clip": 0.01121951, + "auxiliary_loss_mlp": 0.01020412, + "balance_loss_clip": 1.03833783, + "balance_loss_mlp": 1.01341188, + "epoch": 0.7901160343894668, + "flos": 41279351086080.0, + "grad_norm": 7.1347801039338234, + "language_loss": 0.7262857, + "learning_rate": 4.4441797494431845e-07, + "loss": 0.74770933, + "num_input_tokens_seen": 141623125, + "step": 6571, + "time_per_iteration": 2.678771495819092 + }, + { + "auxiliary_loss_clip": 0.01151506, + "auxiliary_loss_mlp": 0.01024827, + "balance_loss_clip": 1.04594481, + "balance_loss_mlp": 1.01722395, + "epoch": 0.7902362772801058, + "flos": 16837005847680.0, + "grad_norm": 2.071619302967261, + "language_loss": 0.77968991, + "learning_rate": 4.439284913631207e-07, + "loss": 0.80145323, + "num_input_tokens_seen": 141640335, + "step": 6572, + "time_per_iteration": 2.423659086227417 + }, + { + "auxiliary_loss_clip": 0.0112993, + "auxiliary_loss_mlp": 0.01026463, + "balance_loss_clip": 1.04605556, + "balance_loss_mlp": 1.01889312, + "epoch": 0.7903565201707449, + "flos": 27125987091840.0, + "grad_norm": 1.997777118255376, + "language_loss": 0.84143198, + "learning_rate": 4.434392438367347e-07, + "loss": 0.86299592, + "num_input_tokens_seen": 141659760, + "step": 6573, + "time_per_iteration": 3.262484312057495 + }, + { + "auxiliary_loss_clip": 0.01158352, + "auxiliary_loss_mlp": 0.01020763, + "balance_loss_clip": 1.0448103, + "balance_loss_mlp": 1.01335418, + "epoch": 0.790476763061384, + "flos": 31025167142400.0, + "grad_norm": 1.9119792889811482, + "language_loss": 0.74028373, + "learning_rate": 4.4295023243937677e-07, + "loss": 0.76207483, + "num_input_tokens_seen": 141679965, + "step": 6574, + "time_per_iteration": 3.2932960987091064 + }, + { + "auxiliary_loss_clip": 0.01156555, + "auxiliary_loss_mlp": 0.01025924, + "balance_loss_clip": 1.04838145, + "balance_loss_mlp": 1.01834786, + "epoch": 0.7905970059520231, + "flos": 22089084681600.0, + "grad_norm": 1.6438104127563355, + "language_loss": 0.80247998, + "learning_rate": 4.4246145724523123e-07, + "loss": 0.82430482, + "num_input_tokens_seen": 141697710, + "step": 6575, + "time_per_iteration": 2.454025983810425 + }, + { + "auxiliary_loss_clip": 0.01126742, + "auxiliary_loss_mlp": 0.01024085, + "balance_loss_clip": 1.04533553, + "balance_loss_mlp": 1.01652741, + "epoch": 0.7907172488426621, + "flos": 20558141159040.0, + "grad_norm": 3.1076615527913947, + "language_loss": 0.77513731, + "learning_rate": 4.41972918328444e-07, + "loss": 0.79664564, + "num_input_tokens_seen": 141715145, + "step": 6576, + "time_per_iteration": 3.287065029144287 + }, + { + "auxiliary_loss_clip": 0.01153014, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.0472424, + "balance_loss_mlp": 1.02347684, + "epoch": 0.7908374917333013, + "flos": 30081542901120.0, + "grad_norm": 2.1927261481553564, + "language_loss": 0.77552426, + "learning_rate": 4.4148461576312646e-07, + "loss": 0.79736769, + "num_input_tokens_seen": 141734810, + "step": 6577, + "time_per_iteration": 2.5236284732818604 + }, + { + "auxiliary_loss_clip": 0.01156476, + "auxiliary_loss_mlp": 0.01021981, + "balance_loss_clip": 1.04989076, + "balance_loss_mlp": 1.01506424, + "epoch": 0.7909577346239404, + "flos": 20996359084800.0, + "grad_norm": 1.6651134843034943, + "language_loss": 0.74540961, + "learning_rate": 4.4099654962335343e-07, + "loss": 0.76719415, + "num_input_tokens_seen": 141755260, + "step": 6578, + "time_per_iteration": 3.269705057144165 + }, + { + "auxiliary_loss_clip": 0.01147763, + "auxiliary_loss_mlp": 0.01023227, + "balance_loss_clip": 1.04712808, + "balance_loss_mlp": 1.01572585, + "epoch": 0.7910779775145794, + "flos": 26247935128320.0, + "grad_norm": 1.7692471351286907, + "language_loss": 0.74872923, + "learning_rate": 4.405087199831636e-07, + "loss": 0.77043915, + "num_input_tokens_seen": 141775500, + "step": 6579, + "time_per_iteration": 2.5351362228393555 + }, + { + "auxiliary_loss_clip": 0.01142825, + "auxiliary_loss_mlp": 0.00761794, + "balance_loss_clip": 1.04386199, + "balance_loss_mlp": 1.00063825, + "epoch": 0.7911982204052186, + "flos": 22564434291840.0, + "grad_norm": 2.0525835964524655, + "language_loss": 0.67422581, + "learning_rate": 4.400211269165619e-07, + "loss": 0.69327199, + "num_input_tokens_seen": 141791955, + "step": 6580, + "time_per_iteration": 2.508024215698242 + }, + { + "auxiliary_loss_clip": 0.01174605, + "auxiliary_loss_mlp": 0.01024002, + "balance_loss_clip": 1.05280542, + "balance_loss_mlp": 1.01743031, + "epoch": 0.7913184632958576, + "flos": 23112538899840.0, + "grad_norm": 1.498661200544066, + "language_loss": 0.76823175, + "learning_rate": 4.3953377049751416e-07, + "loss": 0.79021776, + "num_input_tokens_seen": 141812380, + "step": 6581, + "time_per_iteration": 2.4202349185943604 + }, + { + "auxiliary_loss_clip": 0.01144266, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.04593003, + "balance_loss_mlp": 1.02123737, + "epoch": 0.7914387061864967, + "flos": 12311758719360.0, + "grad_norm": 2.44888908573289, + "language_loss": 0.77940154, + "learning_rate": 4.390466507999537e-07, + "loss": 0.80112922, + "num_input_tokens_seen": 141828130, + "step": 6582, + "time_per_iteration": 2.4520626068115234 + }, + { + "auxiliary_loss_clip": 0.01123453, + "auxiliary_loss_mlp": 0.01027843, + "balance_loss_clip": 1.04185247, + "balance_loss_mlp": 1.02052999, + "epoch": 0.7915589490771359, + "flos": 17603267708160.0, + "grad_norm": 2.1668951930721656, + "language_loss": 0.7562393, + "learning_rate": 4.385597678977748e-07, + "loss": 0.77775228, + "num_input_tokens_seen": 141846965, + "step": 6583, + "time_per_iteration": 2.514603614807129 + }, + { + "auxiliary_loss_clip": 0.01137442, + "auxiliary_loss_mlp": 0.01023684, + "balance_loss_clip": 1.04258394, + "balance_loss_mlp": 1.01630521, + "epoch": 0.7916791919677749, + "flos": 25591272641280.0, + "grad_norm": 4.41017025827925, + "language_loss": 0.75333858, + "learning_rate": 4.3807312186483726e-07, + "loss": 0.77494991, + "num_input_tokens_seen": 141867685, + "step": 6584, + "time_per_iteration": 2.5121421813964844 + }, + { + "auxiliary_loss_clip": 0.01151438, + "auxiliary_loss_mlp": 0.01022868, + "balance_loss_clip": 1.04780555, + "balance_loss_mlp": 1.01523304, + "epoch": 0.791799434858414, + "flos": 18844340474880.0, + "grad_norm": 1.9449846171256069, + "language_loss": 0.78323066, + "learning_rate": 4.375867127749655e-07, + "loss": 0.80497372, + "num_input_tokens_seen": 141885960, + "step": 6585, + "time_per_iteration": 2.4290876388549805 + }, + { + "auxiliary_loss_clip": 0.01128157, + "auxiliary_loss_mlp": 0.01024861, + "balance_loss_clip": 1.04684985, + "balance_loss_mlp": 1.01733291, + "epoch": 0.7919196777490531, + "flos": 25812015672960.0, + "grad_norm": 2.1081232201223243, + "language_loss": 0.67374563, + "learning_rate": 4.3710054070194744e-07, + "loss": 0.69527578, + "num_input_tokens_seen": 141905655, + "step": 6586, + "time_per_iteration": 2.5383758544921875 + }, + { + "auxiliary_loss_clip": 0.01170164, + "auxiliary_loss_mlp": 0.0076218, + "balance_loss_clip": 1.04777765, + "balance_loss_mlp": 1.00060499, + "epoch": 0.7920399206396922, + "flos": 11947624594560.0, + "grad_norm": 2.9442630540761487, + "language_loss": 0.65740228, + "learning_rate": 4.3661460571953455e-07, + "loss": 0.67672575, + "num_input_tokens_seen": 141922390, + "step": 6587, + "time_per_iteration": 2.376394510269165 + }, + { + "auxiliary_loss_clip": 0.01152609, + "auxiliary_loss_mlp": 0.01021216, + "balance_loss_clip": 1.04218936, + "balance_loss_mlp": 1.01432276, + "epoch": 0.7921601635303313, + "flos": 21579907438080.0, + "grad_norm": 1.64897238099986, + "language_loss": 0.68798828, + "learning_rate": 4.36128907901443e-07, + "loss": 0.70972651, + "num_input_tokens_seen": 141941985, + "step": 6588, + "time_per_iteration": 2.473308801651001 + }, + { + "auxiliary_loss_clip": 0.01128163, + "auxiliary_loss_mlp": 0.01023263, + "balance_loss_clip": 1.04256725, + "balance_loss_mlp": 1.01574063, + "epoch": 0.7922804064209703, + "flos": 18113989236480.0, + "grad_norm": 3.9072298451491805, + "language_loss": 0.72699225, + "learning_rate": 4.356434473213519e-07, + "loss": 0.74850649, + "num_input_tokens_seen": 141959435, + "step": 6589, + "time_per_iteration": 2.525400161743164 + }, + { + "auxiliary_loss_clip": 0.01139146, + "auxiliary_loss_mlp": 0.01024696, + "balance_loss_clip": 1.04624319, + "balance_loss_mlp": 1.01781476, + "epoch": 0.7924006493116095, + "flos": 21652806090240.0, + "grad_norm": 2.206187357825709, + "language_loss": 0.79568511, + "learning_rate": 4.351582240529068e-07, + "loss": 0.81732357, + "num_input_tokens_seen": 141980265, + "step": 6590, + "time_per_iteration": 2.504404067993164 + }, + { + "auxiliary_loss_clip": 0.01047745, + "auxiliary_loss_mlp": 0.01002042, + "balance_loss_clip": 1.00838089, + "balance_loss_mlp": 1.00095737, + "epoch": 0.7925208922022485, + "flos": 64242755694720.0, + "grad_norm": 0.6802671931484287, + "language_loss": 0.58175772, + "learning_rate": 4.346732381697149e-07, + "loss": 0.60225564, + "num_input_tokens_seen": 142044395, + "step": 6591, + "time_per_iteration": 3.114205837249756 + }, + { + "auxiliary_loss_clip": 0.01132235, + "auxiliary_loss_mlp": 0.01026246, + "balance_loss_clip": 1.04349124, + "balance_loss_mlp": 1.01888227, + "epoch": 0.7926411350928876, + "flos": 16941541403520.0, + "grad_norm": 2.0229515011970407, + "language_loss": 0.81592429, + "learning_rate": 4.3418848974534825e-07, + "loss": 0.83750916, + "num_input_tokens_seen": 142061335, + "step": 6592, + "time_per_iteration": 2.4586021900177 + }, + { + "auxiliary_loss_clip": 0.01131505, + "auxiliary_loss_mlp": 0.01023978, + "balance_loss_clip": 1.04544425, + "balance_loss_mlp": 1.01717687, + "epoch": 0.7927613779835267, + "flos": 34459987144320.0, + "grad_norm": 1.500743647397675, + "language_loss": 0.68870831, + "learning_rate": 4.3370397885334276e-07, + "loss": 0.71026313, + "num_input_tokens_seen": 142081965, + "step": 6593, + "time_per_iteration": 2.62770414352417 + }, + { + "auxiliary_loss_clip": 0.0114926, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.04725432, + "balance_loss_mlp": 1.02451086, + "epoch": 0.7928816208741658, + "flos": 18951174501120.0, + "grad_norm": 2.105213711718343, + "language_loss": 0.75619119, + "learning_rate": 4.3321970556719777e-07, + "loss": 0.77800298, + "num_input_tokens_seen": 142100260, + "step": 6594, + "time_per_iteration": 2.4351515769958496 + }, + { + "auxiliary_loss_clip": 0.01169206, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.04843378, + "balance_loss_mlp": 1.02138996, + "epoch": 0.7930018637648049, + "flos": 18623022825600.0, + "grad_norm": 2.340085917285134, + "language_loss": 0.72067809, + "learning_rate": 4.3273566996037856e-07, + "loss": 0.742661, + "num_input_tokens_seen": 142116955, + "step": 6595, + "time_per_iteration": 2.3904173374176025 + }, + { + "auxiliary_loss_clip": 0.01140258, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.04568458, + "balance_loss_mlp": 1.02109921, + "epoch": 0.793122106655444, + "flos": 24530650824960.0, + "grad_norm": 3.74041034010443, + "language_loss": 0.80583334, + "learning_rate": 4.322518721063113e-07, + "loss": 0.8275187, + "num_input_tokens_seen": 142135505, + "step": 6596, + "time_per_iteration": 2.5410854816436768 + }, + { + "auxiliary_loss_clip": 0.01155076, + "auxiliary_loss_mlp": 0.0102489, + "balance_loss_clip": 1.04851246, + "balance_loss_mlp": 1.01762712, + "epoch": 0.7932423495460831, + "flos": 34421203434240.0, + "grad_norm": 2.718063555427447, + "language_loss": 0.70283705, + "learning_rate": 4.3176831207838906e-07, + "loss": 0.72463673, + "num_input_tokens_seen": 142158915, + "step": 6597, + "time_per_iteration": 2.584857940673828 + }, + { + "auxiliary_loss_clip": 0.01154156, + "auxiliary_loss_mlp": 0.01022018, + "balance_loss_clip": 1.04968798, + "balance_loss_mlp": 1.01483226, + "epoch": 0.7933625924367221, + "flos": 26980333441920.0, + "grad_norm": 1.8422519095557273, + "language_loss": 0.74832177, + "learning_rate": 4.3128498994996685e-07, + "loss": 0.77008355, + "num_input_tokens_seen": 142178390, + "step": 6598, + "time_per_iteration": 2.5123751163482666 + }, + { + "auxiliary_loss_clip": 0.0115865, + "auxiliary_loss_mlp": 0.01024894, + "balance_loss_clip": 1.04699826, + "balance_loss_mlp": 1.01695466, + "epoch": 0.7934828353273613, + "flos": 29568630643200.0, + "grad_norm": 2.2693863121438067, + "language_loss": 0.71328443, + "learning_rate": 4.308019057943646e-07, + "loss": 0.73511988, + "num_input_tokens_seen": 142200115, + "step": 6599, + "time_per_iteration": 2.507535219192505 + }, + { + "auxiliary_loss_clip": 0.01121559, + "auxiliary_loss_mlp": 0.01025095, + "balance_loss_clip": 1.04385567, + "balance_loss_mlp": 1.01827025, + "epoch": 0.7936030782180004, + "flos": 28615381557120.0, + "grad_norm": 1.5852572813274406, + "language_loss": 0.74305499, + "learning_rate": 4.3031905968486535e-07, + "loss": 0.76452154, + "num_input_tokens_seen": 142220945, + "step": 6600, + "time_per_iteration": 3.3619632720947266 + }, + { + "auxiliary_loss_clip": 0.01111732, + "auxiliary_loss_mlp": 0.01022516, + "balance_loss_clip": 1.04544592, + "balance_loss_mlp": 1.01522017, + "epoch": 0.7937233211086394, + "flos": 16392574869120.0, + "grad_norm": 2.9349885191489116, + "language_loss": 0.68417239, + "learning_rate": 4.298364516947162e-07, + "loss": 0.70551491, + "num_input_tokens_seen": 142238175, + "step": 6601, + "time_per_iteration": 3.2815868854522705 + }, + { + "auxiliary_loss_clip": 0.01108128, + "auxiliary_loss_mlp": 0.01021862, + "balance_loss_clip": 1.04039049, + "balance_loss_mlp": 1.01489711, + "epoch": 0.7938435639992786, + "flos": 22013420682240.0, + "grad_norm": 2.2819844760783243, + "language_loss": 0.65602738, + "learning_rate": 4.293540818971295e-07, + "loss": 0.67732728, + "num_input_tokens_seen": 142255980, + "step": 6602, + "time_per_iteration": 2.5534253120422363 + }, + { + "auxiliary_loss_clip": 0.01161144, + "auxiliary_loss_mlp": 0.01018027, + "balance_loss_clip": 1.04783249, + "balance_loss_mlp": 1.01059484, + "epoch": 0.7939638068899176, + "flos": 22197032029440.0, + "grad_norm": 2.1836501386889156, + "language_loss": 0.77074838, + "learning_rate": 4.2887195036527934e-07, + "loss": 0.79254007, + "num_input_tokens_seen": 142274785, + "step": 6603, + "time_per_iteration": 3.1936261653900146 + }, + { + "auxiliary_loss_clip": 0.0114297, + "auxiliary_loss_mlp": 0.01020949, + "balance_loss_clip": 1.04090679, + "balance_loss_mlp": 1.01364768, + "epoch": 0.7940840497805567, + "flos": 17745186343680.0, + "grad_norm": 3.0118622250143288, + "language_loss": 0.73612159, + "learning_rate": 4.28390057172306e-07, + "loss": 0.75776082, + "num_input_tokens_seen": 142291290, + "step": 6604, + "time_per_iteration": 2.410322427749634 + }, + { + "auxiliary_loss_clip": 0.01120534, + "auxiliary_loss_mlp": 0.01027101, + "balance_loss_clip": 1.03973401, + "balance_loss_mlp": 1.01882744, + "epoch": 0.7942042926711959, + "flos": 23805435231360.0, + "grad_norm": 2.075504967613023, + "language_loss": 0.72446102, + "learning_rate": 4.279084023913111e-07, + "loss": 0.74593741, + "num_input_tokens_seen": 142309165, + "step": 6605, + "time_per_iteration": 3.3080780506134033 + }, + { + "auxiliary_loss_clip": 0.01154666, + "auxiliary_loss_mlp": 0.01025154, + "balance_loss_clip": 1.04795313, + "balance_loss_mlp": 1.01817155, + "epoch": 0.7943245355618349, + "flos": 19244959839360.0, + "grad_norm": 1.862536324948511, + "language_loss": 0.69247276, + "learning_rate": 4.2742698609536096e-07, + "loss": 0.71427095, + "num_input_tokens_seen": 142327475, + "step": 6606, + "time_per_iteration": 2.4554014205932617 + }, + { + "auxiliary_loss_clip": 0.01143216, + "auxiliary_loss_mlp": 0.01023843, + "balance_loss_clip": 1.04480147, + "balance_loss_mlp": 1.0167737, + "epoch": 0.794444778452474, + "flos": 25007616547200.0, + "grad_norm": 1.8268809518562734, + "language_loss": 0.78565514, + "learning_rate": 4.2694580835748706e-07, + "loss": 0.80732572, + "num_input_tokens_seen": 142347335, + "step": 6607, + "time_per_iteration": 2.51481032371521 + }, + { + "auxiliary_loss_clip": 0.01139181, + "auxiliary_loss_mlp": 0.01022508, + "balance_loss_clip": 1.04381919, + "balance_loss_mlp": 1.01484263, + "epoch": 0.7945650213431131, + "flos": 23221491828480.0, + "grad_norm": 2.213123676214686, + "language_loss": 0.74057353, + "learning_rate": 4.264648692506836e-07, + "loss": 0.76219034, + "num_input_tokens_seen": 142366125, + "step": 6608, + "time_per_iteration": 2.4931373596191406 + }, + { + "auxiliary_loss_clip": 0.01134644, + "auxiliary_loss_mlp": 0.01025495, + "balance_loss_clip": 1.04239571, + "balance_loss_mlp": 1.01740122, + "epoch": 0.7946852642337522, + "flos": 26062887237120.0, + "grad_norm": 1.8855544812785716, + "language_loss": 0.72124088, + "learning_rate": 4.2598416884790824e-07, + "loss": 0.7428422, + "num_input_tokens_seen": 142385175, + "step": 6609, + "time_per_iteration": 2.5148520469665527 + }, + { + "auxiliary_loss_clip": 0.01149117, + "auxiliary_loss_mlp": 0.01022869, + "balance_loss_clip": 1.04375386, + "balance_loss_mlp": 1.01488233, + "epoch": 0.7948055071243912, + "flos": 23769704177280.0, + "grad_norm": 1.9857908001530415, + "language_loss": 0.80991256, + "learning_rate": 4.255037072220828e-07, + "loss": 0.83163238, + "num_input_tokens_seen": 142406545, + "step": 6610, + "time_per_iteration": 2.5209410190582275 + }, + { + "auxiliary_loss_clip": 0.01165673, + "auxiliary_loss_mlp": 0.01020166, + "balance_loss_clip": 1.04658175, + "balance_loss_mlp": 1.01343048, + "epoch": 0.7949257500150304, + "flos": 21980814111360.0, + "grad_norm": 2.2595299751236286, + "language_loss": 0.71759462, + "learning_rate": 4.2502348444609293e-07, + "loss": 0.73945308, + "num_input_tokens_seen": 142426165, + "step": 6611, + "time_per_iteration": 2.4418771266937256 + }, + { + "auxiliary_loss_clip": 0.01109548, + "auxiliary_loss_mlp": 0.01023806, + "balance_loss_clip": 1.03919339, + "balance_loss_mlp": 1.01715732, + "epoch": 0.7950459929056695, + "flos": 25774129802880.0, + "grad_norm": 1.8457101232485928, + "language_loss": 0.69483709, + "learning_rate": 4.2454350059278844e-07, + "loss": 0.71617061, + "num_input_tokens_seen": 142447225, + "step": 6612, + "time_per_iteration": 2.6182141304016113 + }, + { + "auxiliary_loss_clip": 0.01132743, + "auxiliary_loss_mlp": 0.01026098, + "balance_loss_clip": 1.03959608, + "balance_loss_mlp": 1.01912415, + "epoch": 0.7951662357963085, + "flos": 22158068751360.0, + "grad_norm": 1.7540714463591975, + "language_loss": 0.84236193, + "learning_rate": 4.240637557349824e-07, + "loss": 0.86395031, + "num_input_tokens_seen": 142464440, + "step": 6613, + "time_per_iteration": 2.5243947505950928 + }, + { + "auxiliary_loss_clip": 0.01128831, + "auxiliary_loss_mlp": 0.01023644, + "balance_loss_clip": 1.04436779, + "balance_loss_mlp": 1.01610994, + "epoch": 0.7952864786869477, + "flos": 24641938137600.0, + "grad_norm": 2.583622451731839, + "language_loss": 0.66402155, + "learning_rate": 4.235842499454516e-07, + "loss": 0.68554628, + "num_input_tokens_seen": 142484355, + "step": 6614, + "time_per_iteration": 2.5253608226776123 + }, + { + "auxiliary_loss_clip": 0.01143434, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.04590821, + "balance_loss_mlp": 1.02025497, + "epoch": 0.7954067215775867, + "flos": 21830922656640.0, + "grad_norm": 1.8634312603378007, + "language_loss": 0.82511824, + "learning_rate": 4.2310498329693687e-07, + "loss": 0.84683031, + "num_input_tokens_seen": 142505255, + "step": 6615, + "time_per_iteration": 2.518226385116577 + }, + { + "auxiliary_loss_clip": 0.01157213, + "auxiliary_loss_mlp": 0.01027057, + "balance_loss_clip": 1.04619181, + "balance_loss_mlp": 1.01889706, + "epoch": 0.7955269644682258, + "flos": 24060652341120.0, + "grad_norm": 1.6180771461183463, + "language_loss": 0.80765176, + "learning_rate": 4.2262595586214164e-07, + "loss": 0.82949448, + "num_input_tokens_seen": 142526350, + "step": 6616, + "time_per_iteration": 2.493069648742676 + }, + { + "auxiliary_loss_clip": 0.01157264, + "auxiliary_loss_mlp": 0.01025301, + "balance_loss_clip": 1.04612279, + "balance_loss_mlp": 1.01739144, + "epoch": 0.795647207358865, + "flos": 25010741030400.0, + "grad_norm": 1.725482783480117, + "language_loss": 0.76549804, + "learning_rate": 4.221471677137358e-07, + "loss": 0.78732377, + "num_input_tokens_seen": 142547165, + "step": 6617, + "time_per_iteration": 2.4914355278015137 + }, + { + "auxiliary_loss_clip": 0.01128932, + "auxiliary_loss_mlp": 0.01024195, + "balance_loss_clip": 1.04369187, + "balance_loss_mlp": 1.017591, + "epoch": 0.795767450249504, + "flos": 14648358343680.0, + "grad_norm": 1.817315958423535, + "language_loss": 0.70105672, + "learning_rate": 4.216686189243492e-07, + "loss": 0.722588, + "num_input_tokens_seen": 142565955, + "step": 6618, + "time_per_iteration": 2.4845197200775146 + }, + { + "auxiliary_loss_clip": 0.01122034, + "auxiliary_loss_mlp": 0.01022236, + "balance_loss_clip": 1.04394507, + "balance_loss_mlp": 1.01452327, + "epoch": 0.7958876931401431, + "flos": 18547897530240.0, + "grad_norm": 1.6712161955346507, + "language_loss": 0.72848392, + "learning_rate": 4.211903095665785e-07, + "loss": 0.74992657, + "num_input_tokens_seen": 142585340, + "step": 6619, + "time_per_iteration": 2.496694326400757 + }, + { + "auxiliary_loss_clip": 0.01147324, + "auxiliary_loss_mlp": 0.01028485, + "balance_loss_clip": 1.04355407, + "balance_loss_mlp": 1.02118063, + "epoch": 0.7960079360307821, + "flos": 21543960902400.0, + "grad_norm": 1.689267186037295, + "language_loss": 0.75124127, + "learning_rate": 4.2071223971298277e-07, + "loss": 0.77299941, + "num_input_tokens_seen": 142602525, + "step": 6620, + "time_per_iteration": 2.4499478340148926 + }, + { + "auxiliary_loss_clip": 0.01154802, + "auxiliary_loss_mlp": 0.01022996, + "balance_loss_clip": 1.04537547, + "balance_loss_mlp": 1.01473486, + "epoch": 0.7961281789214213, + "flos": 25481745095040.0, + "grad_norm": 1.954545571667054, + "language_loss": 0.61142027, + "learning_rate": 4.2023440943608433e-07, + "loss": 0.63319826, + "num_input_tokens_seen": 142622490, + "step": 6621, + "time_per_iteration": 2.473780632019043 + }, + { + "auxiliary_loss_clip": 0.01151536, + "auxiliary_loss_mlp": 0.01019683, + "balance_loss_clip": 1.04365468, + "balance_loss_mlp": 1.01272738, + "epoch": 0.7962484218120603, + "flos": 21944436612480.0, + "grad_norm": 1.692800797952807, + "language_loss": 0.78156102, + "learning_rate": 4.1975681880837023e-07, + "loss": 0.8032732, + "num_input_tokens_seen": 142642495, + "step": 6622, + "time_per_iteration": 2.465829849243164 + }, + { + "auxiliary_loss_clip": 0.01120318, + "auxiliary_loss_mlp": 0.01026158, + "balance_loss_clip": 1.03971159, + "balance_loss_mlp": 1.01884413, + "epoch": 0.7963686647026994, + "flos": 18876264687360.0, + "grad_norm": 1.658784734189198, + "language_loss": 0.82100618, + "learning_rate": 4.192794679022895e-07, + "loss": 0.84247088, + "num_input_tokens_seen": 142660820, + "step": 6623, + "time_per_iteration": 2.5076417922973633 + }, + { + "auxiliary_loss_clip": 0.01154852, + "auxiliary_loss_mlp": 0.01027485, + "balance_loss_clip": 1.04468274, + "balance_loss_mlp": 1.02057099, + "epoch": 0.7964889075933386, + "flos": 29716582763520.0, + "grad_norm": 2.0735074841266643, + "language_loss": 0.72060847, + "learning_rate": 4.1880235679025743e-07, + "loss": 0.74243188, + "num_input_tokens_seen": 142680915, + "step": 6624, + "time_per_iteration": 2.5176732540130615 + }, + { + "auxiliary_loss_clip": 0.01099467, + "auxiliary_loss_mlp": 0.01037946, + "balance_loss_clip": 1.03982401, + "balance_loss_mlp": 1.03053093, + "epoch": 0.7966091504839776, + "flos": 29491458272640.0, + "grad_norm": 1.9202140598968989, + "language_loss": 0.63734794, + "learning_rate": 4.1832548554464986e-07, + "loss": 0.65872204, + "num_input_tokens_seen": 142699210, + "step": 6625, + "time_per_iteration": 2.6340231895446777 + }, + { + "auxiliary_loss_clip": 0.01048728, + "auxiliary_loss_mlp": 0.01000712, + "balance_loss_clip": 1.00713277, + "balance_loss_mlp": 0.99965113, + "epoch": 0.7967293933746167, + "flos": 67288697101440.0, + "grad_norm": 0.7405843100946993, + "language_loss": 0.58705187, + "learning_rate": 4.178488542378098e-07, + "loss": 0.60754621, + "num_input_tokens_seen": 142756790, + "step": 6626, + "time_per_iteration": 3.7493979930877686 + }, + { + "auxiliary_loss_clip": 0.01172462, + "auxiliary_loss_mlp": 0.01026274, + "balance_loss_clip": 1.04900587, + "balance_loss_mlp": 1.01867485, + "epoch": 0.7968496362652558, + "flos": 25554679660800.0, + "grad_norm": 1.861179822951879, + "language_loss": 0.88914156, + "learning_rate": 4.173724629420401e-07, + "loss": 0.91112894, + "num_input_tokens_seen": 142778150, + "step": 6627, + "time_per_iteration": 2.466695547103882 + }, + { + "auxiliary_loss_clip": 0.01148731, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.04748464, + "balance_loss_mlp": 1.01983964, + "epoch": 0.7969698791558949, + "flos": 14501088581760.0, + "grad_norm": 2.6449960363801424, + "language_loss": 0.68276894, + "learning_rate": 4.168963117296087e-07, + "loss": 0.70453382, + "num_input_tokens_seen": 142795485, + "step": 6628, + "time_per_iteration": 3.3917689323425293 + }, + { + "auxiliary_loss_clip": 0.01168167, + "auxiliary_loss_mlp": 0.01021452, + "balance_loss_clip": 1.04790521, + "balance_loss_mlp": 1.01437736, + "epoch": 0.797090122046534, + "flos": 22127545169280.0, + "grad_norm": 2.2787841802961344, + "language_loss": 0.75926507, + "learning_rate": 4.1642040067274876e-07, + "loss": 0.78116131, + "num_input_tokens_seen": 142815155, + "step": 6629, + "time_per_iteration": 3.1622347831726074 + }, + { + "auxiliary_loss_clip": 0.01144424, + "auxiliary_loss_mlp": 0.01019586, + "balance_loss_clip": 1.04524589, + "balance_loss_mlp": 1.01224589, + "epoch": 0.7972103649371731, + "flos": 19897671830400.0, + "grad_norm": 1.8290416575004187, + "language_loss": 0.72727686, + "learning_rate": 4.1594472984365493e-07, + "loss": 0.74891698, + "num_input_tokens_seen": 142833840, + "step": 6630, + "time_per_iteration": 2.489682197570801 + }, + { + "auxiliary_loss_clip": 0.01148674, + "auxiliary_loss_mlp": 0.01027668, + "balance_loss_clip": 1.04481196, + "balance_loss_mlp": 1.0207901, + "epoch": 0.7973306078278122, + "flos": 36058621847040.0, + "grad_norm": 1.8882156105751804, + "language_loss": 0.77976549, + "learning_rate": 4.154692993144862e-07, + "loss": 0.80152893, + "num_input_tokens_seen": 142853610, + "step": 6631, + "time_per_iteration": 3.328517436981201 + }, + { + "auxiliary_loss_clip": 0.01167757, + "auxiliary_loss_mlp": 0.00762148, + "balance_loss_clip": 1.04766202, + "balance_loss_mlp": 1.00066459, + "epoch": 0.7974508507184512, + "flos": 21360600950400.0, + "grad_norm": 6.7170243747186, + "language_loss": 0.71272516, + "learning_rate": 4.1499410915736476e-07, + "loss": 0.73202431, + "num_input_tokens_seen": 142872540, + "step": 6632, + "time_per_iteration": 2.4151177406311035 + }, + { + "auxiliary_loss_clip": 0.01055182, + "auxiliary_loss_mlp": 0.01000811, + "balance_loss_clip": 1.00812423, + "balance_loss_mlp": 0.99976748, + "epoch": 0.7975710936090904, + "flos": 68253115317120.0, + "grad_norm": 0.7717015945629074, + "language_loss": 0.64266759, + "learning_rate": 4.145191594443762e-07, + "loss": 0.6632275, + "num_input_tokens_seen": 142936895, + "step": 6633, + "time_per_iteration": 3.19063663482666 + }, + { + "auxiliary_loss_clip": 0.0112077, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.0425148, + "balance_loss_mlp": 1.02057672, + "epoch": 0.7976913364997295, + "flos": 22492433479680.0, + "grad_norm": 2.280143691717563, + "language_loss": 0.70860028, + "learning_rate": 4.140444502475713e-07, + "loss": 0.73008752, + "num_input_tokens_seen": 142956445, + "step": 6634, + "time_per_iteration": 2.5465409755706787 + }, + { + "auxiliary_loss_clip": 0.011471, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.0413909, + "balance_loss_mlp": 1.02311873, + "epoch": 0.7978115793903685, + "flos": 15263220378240.0, + "grad_norm": 1.9644361993375536, + "language_loss": 0.70023966, + "learning_rate": 4.1356998163896216e-07, + "loss": 0.72201371, + "num_input_tokens_seen": 142973495, + "step": 6635, + "time_per_iteration": 2.4358675479888916 + }, + { + "auxiliary_loss_clip": 0.01129882, + "auxiliary_loss_mlp": 0.01023346, + "balance_loss_clip": 1.04422116, + "balance_loss_mlp": 1.01626515, + "epoch": 0.7979318222810077, + "flos": 19719232041600.0, + "grad_norm": 2.2325975137241376, + "language_loss": 0.74558848, + "learning_rate": 4.130957536905255e-07, + "loss": 0.76712078, + "num_input_tokens_seen": 142991510, + "step": 6636, + "time_per_iteration": 2.5037429332733154 + }, + { + "auxiliary_loss_clip": 0.01149169, + "auxiliary_loss_mlp": 0.01029188, + "balance_loss_clip": 1.04534149, + "balance_loss_mlp": 1.02124882, + "epoch": 0.7980520651716467, + "flos": 15560273854080.0, + "grad_norm": 3.05026161260029, + "language_loss": 0.71267706, + "learning_rate": 4.1262176647420134e-07, + "loss": 0.73446065, + "num_input_tokens_seen": 143009675, + "step": 6637, + "time_per_iteration": 2.5079615116119385 + }, + { + "auxiliary_loss_clip": 0.01147257, + "auxiliary_loss_mlp": 0.0102784, + "balance_loss_clip": 1.04655361, + "balance_loss_mlp": 1.02065158, + "epoch": 0.7981723080622858, + "flos": 22309432663680.0, + "grad_norm": 1.6307640552118028, + "language_loss": 0.79907572, + "learning_rate": 4.121480200618923e-07, + "loss": 0.82082665, + "num_input_tokens_seen": 143029330, + "step": 6638, + "time_per_iteration": 2.5058043003082275 + }, + { + "auxiliary_loss_clip": 0.01136193, + "auxiliary_loss_mlp": 0.01023567, + "balance_loss_clip": 1.04488075, + "balance_loss_mlp": 1.01648045, + "epoch": 0.798292550952925, + "flos": 22929573997440.0, + "grad_norm": 1.8331058690398634, + "language_loss": 0.80033797, + "learning_rate": 4.116745145254674e-07, + "loss": 0.82193553, + "num_input_tokens_seen": 143048865, + "step": 6639, + "time_per_iteration": 2.4933295249938965 + }, + { + "auxiliary_loss_clip": 0.0103798, + "auxiliary_loss_mlp": 0.01001284, + "balance_loss_clip": 1.00717998, + "balance_loss_mlp": 1.00028849, + "epoch": 0.798412793843564, + "flos": 64497936890880.0, + "grad_norm": 0.7812284448872275, + "language_loss": 0.58085757, + "learning_rate": 4.1120124993675476e-07, + "loss": 0.60125017, + "num_input_tokens_seen": 143113295, + "step": 6640, + "time_per_iteration": 3.0835366249084473 + }, + { + "auxiliary_loss_clip": 0.01148378, + "auxiliary_loss_mlp": 0.01023131, + "balance_loss_clip": 1.04490948, + "balance_loss_mlp": 1.01515269, + "epoch": 0.7985330367342031, + "flos": 13586910514560.0, + "grad_norm": 23.85411480440408, + "language_loss": 0.62015277, + "learning_rate": 4.107282263675498e-07, + "loss": 0.64186788, + "num_input_tokens_seen": 143130965, + "step": 6641, + "time_per_iteration": 2.4565463066101074 + }, + { + "auxiliary_loss_clip": 0.01041104, + "auxiliary_loss_mlp": 0.00752978, + "balance_loss_clip": 1.0100385, + "balance_loss_mlp": 1.00045156, + "epoch": 0.7986532796248422, + "flos": 67698797656320.0, + "grad_norm": 0.8116419362053018, + "language_loss": 0.52503091, + "learning_rate": 4.1025544388960907e-07, + "loss": 0.54297173, + "num_input_tokens_seen": 143192005, + "step": 6642, + "time_per_iteration": 3.0553178787231445 + }, + { + "auxiliary_loss_clip": 0.01154534, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.04816365, + "balance_loss_mlp": 1.02012527, + "epoch": 0.7987735225154813, + "flos": 22455373622400.0, + "grad_norm": 1.8809952175368885, + "language_loss": 0.71574152, + "learning_rate": 4.097829025746538e-07, + "loss": 0.73756248, + "num_input_tokens_seen": 143213550, + "step": 6643, + "time_per_iteration": 2.4904770851135254 + }, + { + "auxiliary_loss_clip": 0.010529, + "auxiliary_loss_mlp": 0.01001502, + "balance_loss_clip": 1.00730824, + "balance_loss_mlp": 1.00051248, + "epoch": 0.7988937654061203, + "flos": 68864098682880.0, + "grad_norm": 0.6587712559111585, + "language_loss": 0.61002946, + "learning_rate": 4.0931060249436757e-07, + "loss": 0.63057351, + "num_input_tokens_seen": 143277390, + "step": 6644, + "time_per_iteration": 3.058016538619995 + }, + { + "auxiliary_loss_clip": 0.01153786, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.04801297, + "balance_loss_mlp": 1.01879966, + "epoch": 0.7990140082967595, + "flos": 20806893820800.0, + "grad_norm": 2.216444655712168, + "language_loss": 0.69445062, + "learning_rate": 4.088385437203978e-07, + "loss": 0.71625602, + "num_input_tokens_seen": 143294400, + "step": 6645, + "time_per_iteration": 2.436831474304199 + }, + { + "auxiliary_loss_clip": 0.01168132, + "auxiliary_loss_mlp": 0.01024646, + "balance_loss_clip": 1.04617643, + "balance_loss_mlp": 1.01709127, + "epoch": 0.7991342511873986, + "flos": 18985289443200.0, + "grad_norm": 2.0617360057224294, + "language_loss": 0.77622211, + "learning_rate": 4.083667263243564e-07, + "loss": 0.79814982, + "num_input_tokens_seen": 143312745, + "step": 6646, + "time_per_iteration": 2.393120050430298 + }, + { + "auxiliary_loss_clip": 0.01150831, + "auxiliary_loss_mlp": 0.01027276, + "balance_loss_clip": 1.04671752, + "balance_loss_mlp": 1.02006412, + "epoch": 0.7992544940780376, + "flos": 20816805974400.0, + "grad_norm": 2.2242184179450857, + "language_loss": 0.71780843, + "learning_rate": 4.0789515037781653e-07, + "loss": 0.73958945, + "num_input_tokens_seen": 143333470, + "step": 6647, + "time_per_iteration": 2.4791648387908936 + }, + { + "auxiliary_loss_clip": 0.01156983, + "auxiliary_loss_mlp": 0.01022655, + "balance_loss_clip": 1.04633641, + "balance_loss_mlp": 1.01566935, + "epoch": 0.7993747369686768, + "flos": 12640772321280.0, + "grad_norm": 2.091635630092632, + "language_loss": 0.82689112, + "learning_rate": 4.0742381595231755e-07, + "loss": 0.84868747, + "num_input_tokens_seen": 143350195, + "step": 6648, + "time_per_iteration": 2.439025402069092 + }, + { + "auxiliary_loss_clip": 0.01131584, + "auxiliary_loss_mlp": 0.0102513, + "balance_loss_clip": 1.04504502, + "balance_loss_mlp": 1.01820397, + "epoch": 0.7994949798593158, + "flos": 20078769225600.0, + "grad_norm": 1.8628385096687412, + "language_loss": 0.78308558, + "learning_rate": 4.06952723119359e-07, + "loss": 0.80465281, + "num_input_tokens_seen": 143370070, + "step": 6649, + "time_per_iteration": 2.5194709300994873 + }, + { + "auxiliary_loss_clip": 0.01132432, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.04353547, + "balance_loss_mlp": 1.0190289, + "epoch": 0.7996152227499549, + "flos": 38654209509120.0, + "grad_norm": 1.8722781669765227, + "language_loss": 0.67289013, + "learning_rate": 4.0648187195040504e-07, + "loss": 0.69448084, + "num_input_tokens_seen": 143392275, + "step": 6650, + "time_per_iteration": 2.6679625511169434 + }, + { + "auxiliary_loss_clip": 0.01048284, + "auxiliary_loss_mlp": 0.01000466, + "balance_loss_clip": 1.0067625, + "balance_loss_mlp": 0.99938077, + "epoch": 0.799735465640594, + "flos": 70243821947520.0, + "grad_norm": 0.8172814382549617, + "language_loss": 0.67639184, + "learning_rate": 4.060112625168848e-07, + "loss": 0.69687939, + "num_input_tokens_seen": 143457385, + "step": 6651, + "time_per_iteration": 3.128058433532715 + }, + { + "auxiliary_loss_clip": 0.01170528, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.04954088, + "balance_loss_mlp": 1.02047586, + "epoch": 0.7998557085312331, + "flos": 24240995550720.0, + "grad_norm": 1.8012296417786406, + "language_loss": 0.73599124, + "learning_rate": 4.055408948901886e-07, + "loss": 0.75797904, + "num_input_tokens_seen": 143478785, + "step": 6652, + "time_per_iteration": 2.464498281478882 + }, + { + "auxiliary_loss_clip": 0.01158327, + "auxiliary_loss_mlp": 0.01023985, + "balance_loss_clip": 1.04693818, + "balance_loss_mlp": 1.01581955, + "epoch": 0.7999759514218722, + "flos": 27564025449600.0, + "grad_norm": 1.8419980176823063, + "language_loss": 0.71221387, + "learning_rate": 4.050707691416708e-07, + "loss": 0.73403698, + "num_input_tokens_seen": 143500095, + "step": 6653, + "time_per_iteration": 3.2885162830352783 + }, + { + "auxiliary_loss_clip": 0.01048197, + "auxiliary_loss_mlp": 0.01001123, + "balance_loss_clip": 1.00679731, + "balance_loss_mlp": 1.00009751, + "epoch": 0.8000961943125112, + "flos": 67337428878720.0, + "grad_norm": 0.668559334985628, + "language_loss": 0.59764713, + "learning_rate": 4.046008853426495e-07, + "loss": 0.61814034, + "num_input_tokens_seen": 143563410, + "step": 6654, + "time_per_iteration": 3.8061461448669434 + }, + { + "auxiliary_loss_clip": 0.01121809, + "auxiliary_loss_mlp": 0.01025337, + "balance_loss_clip": 1.04282045, + "balance_loss_mlp": 1.01768088, + "epoch": 0.8002164372031504, + "flos": 28733815676160.0, + "grad_norm": 2.1508080669000678, + "language_loss": 0.63020784, + "learning_rate": 4.0413124356440464e-07, + "loss": 0.65167928, + "num_input_tokens_seen": 143587455, + "step": 6655, + "time_per_iteration": 2.601128339767456 + }, + { + "auxiliary_loss_clip": 0.01114622, + "auxiliary_loss_mlp": 0.01025861, + "balance_loss_clip": 1.04082835, + "balance_loss_mlp": 1.01861894, + "epoch": 0.8003366800937894, + "flos": 17639429725440.0, + "grad_norm": 2.1898320509394833, + "language_loss": 0.82269204, + "learning_rate": 4.0366184387818223e-07, + "loss": 0.8440969, + "num_input_tokens_seen": 143605915, + "step": 6656, + "time_per_iteration": 3.2356395721435547 + }, + { + "auxiliary_loss_clip": 0.01174949, + "auxiliary_loss_mlp": 0.01022396, + "balance_loss_clip": 1.04924035, + "balance_loss_mlp": 1.01430809, + "epoch": 0.8004569229844285, + "flos": 25995303797760.0, + "grad_norm": 2.1399388530587578, + "language_loss": 0.85679638, + "learning_rate": 4.0319268635518797e-07, + "loss": 0.87876987, + "num_input_tokens_seen": 143626490, + "step": 6657, + "time_per_iteration": 2.496980905532837 + }, + { + "auxiliary_loss_clip": 0.01153747, + "auxiliary_loss_mlp": 0.01022188, + "balance_loss_clip": 1.04544044, + "balance_loss_mlp": 1.0151341, + "epoch": 0.8005771658750677, + "flos": 20812352688000.0, + "grad_norm": 1.5620790901618937, + "language_loss": 0.74756908, + "learning_rate": 4.027237710665943e-07, + "loss": 0.76932836, + "num_input_tokens_seen": 143644955, + "step": 6658, + "time_per_iteration": 3.2612149715423584 + }, + { + "auxiliary_loss_clip": 0.01128543, + "auxiliary_loss_mlp": 0.01026057, + "balance_loss_clip": 1.0407896, + "balance_loss_mlp": 1.01820683, + "epoch": 0.8006974087657067, + "flos": 25812626204160.0, + "grad_norm": 2.022194325085176, + "language_loss": 0.69640553, + "learning_rate": 4.022550980835344e-07, + "loss": 0.71795154, + "num_input_tokens_seen": 143667200, + "step": 6659, + "time_per_iteration": 2.5934898853302 + }, + { + "auxiliary_loss_clip": 0.01123677, + "auxiliary_loss_mlp": 0.01024868, + "balance_loss_clip": 1.03985238, + "balance_loss_mlp": 1.01749492, + "epoch": 0.8008176516563458, + "flos": 17164690646400.0, + "grad_norm": 2.2883595009065574, + "language_loss": 0.79406261, + "learning_rate": 4.017866674771051e-07, + "loss": 0.81554812, + "num_input_tokens_seen": 143684685, + "step": 6660, + "time_per_iteration": 2.4980857372283936 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.01028839, + "balance_loss_clip": 1.03860807, + "balance_loss_mlp": 1.02106082, + "epoch": 0.8009378945469849, + "flos": 24207311571840.0, + "grad_norm": 1.802330078237029, + "language_loss": 0.74475092, + "learning_rate": 4.013184793183688e-07, + "loss": 0.76607728, + "num_input_tokens_seen": 143706780, + "step": 6661, + "time_per_iteration": 2.604975700378418 + }, + { + "auxiliary_loss_clip": 0.01151999, + "auxiliary_loss_mlp": 0.01025867, + "balance_loss_clip": 1.04367065, + "balance_loss_mlp": 1.01872325, + "epoch": 0.801058137437624, + "flos": 19787318271360.0, + "grad_norm": 1.9539963463925631, + "language_loss": 0.72647339, + "learning_rate": 4.008505336783472e-07, + "loss": 0.74825203, + "num_input_tokens_seen": 143724505, + "step": 6662, + "time_per_iteration": 2.4590280055999756 + }, + { + "auxiliary_loss_clip": 0.01145346, + "auxiliary_loss_mlp": 0.0102761, + "balance_loss_clip": 1.04377234, + "balance_loss_mlp": 1.02047801, + "epoch": 0.801178380328263, + "flos": 18659400324480.0, + "grad_norm": 1.8851949032283188, + "language_loss": 0.80953509, + "learning_rate": 4.003828306280284e-07, + "loss": 0.83126462, + "num_input_tokens_seen": 143742180, + "step": 6663, + "time_per_iteration": 2.4309215545654297 + }, + { + "auxiliary_loss_clip": 0.01155964, + "auxiliary_loss_mlp": 0.01024081, + "balance_loss_clip": 1.04674721, + "balance_loss_mlp": 1.0172236, + "epoch": 0.8012986232189022, + "flos": 15706573948800.0, + "grad_norm": 1.8021328614269814, + "language_loss": 0.78189933, + "learning_rate": 3.999153702383626e-07, + "loss": 0.80369979, + "num_input_tokens_seen": 143760070, + "step": 6664, + "time_per_iteration": 2.4311859607696533 + }, + { + "auxiliary_loss_clip": 0.01158094, + "auxiliary_loss_mlp": 0.01022585, + "balance_loss_clip": 1.0465529, + "balance_loss_mlp": 1.01467538, + "epoch": 0.8014188661095413, + "flos": 28584139703040.0, + "grad_norm": 1.6756321803571, + "language_loss": 0.73874861, + "learning_rate": 3.9944815258026263e-07, + "loss": 0.76055545, + "num_input_tokens_seen": 143781890, + "step": 6665, + "time_per_iteration": 2.5545756816864014 + }, + { + "auxiliary_loss_clip": 0.01158221, + "auxiliary_loss_mlp": 0.01025842, + "balance_loss_clip": 1.04703879, + "balance_loss_mlp": 1.0181824, + "epoch": 0.8015391090001803, + "flos": 29310360877440.0, + "grad_norm": 2.326287930513354, + "language_loss": 0.83023059, + "learning_rate": 3.989811777246057e-07, + "loss": 0.85207117, + "num_input_tokens_seen": 143802060, + "step": 6666, + "time_per_iteration": 2.502647638320923 + }, + { + "auxiliary_loss_clip": 0.0106386, + "auxiliary_loss_mlp": 0.01000503, + "balance_loss_clip": 1.00743032, + "balance_loss_mlp": 0.99941778, + "epoch": 0.8016593518908195, + "flos": 70397340675840.0, + "grad_norm": 0.8473370994404873, + "language_loss": 0.66204387, + "learning_rate": 3.985144457422305e-07, + "loss": 0.68268746, + "num_input_tokens_seen": 143856345, + "step": 6667, + "time_per_iteration": 2.9483044147491455 + }, + { + "auxiliary_loss_clip": 0.01169154, + "auxiliary_loss_mlp": 0.01024239, + "balance_loss_clip": 1.04789948, + "balance_loss_mlp": 1.01672006, + "epoch": 0.8017795947814585, + "flos": 26026114688640.0, + "grad_norm": 2.232803841954557, + "language_loss": 0.7696799, + "learning_rate": 3.9804795670394096e-07, + "loss": 0.79161388, + "num_input_tokens_seen": 143876470, + "step": 6668, + "time_per_iteration": 2.442685127258301 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01026291, + "balance_loss_clip": 1.04347253, + "balance_loss_mlp": 1.01898932, + "epoch": 0.8018998376720976, + "flos": 22087181260800.0, + "grad_norm": 1.7958519212828434, + "language_loss": 0.70536184, + "learning_rate": 3.975817106805022e-07, + "loss": 0.72695714, + "num_input_tokens_seen": 143895170, + "step": 6669, + "time_per_iteration": 2.493236541748047 + }, + { + "auxiliary_loss_clip": 0.01127122, + "auxiliary_loss_mlp": 0.01026017, + "balance_loss_clip": 1.04366493, + "balance_loss_mlp": 1.0179944, + "epoch": 0.8020200805627368, + "flos": 34568545023360.0, + "grad_norm": 1.7581517195840375, + "language_loss": 0.65063351, + "learning_rate": 3.97115707742645e-07, + "loss": 0.67216498, + "num_input_tokens_seen": 143915845, + "step": 6670, + "time_per_iteration": 2.65908145904541 + }, + { + "auxiliary_loss_clip": 0.01143835, + "auxiliary_loss_mlp": 0.01022439, + "balance_loss_clip": 1.04705083, + "balance_loss_mlp": 1.01506615, + "epoch": 0.8021403234533758, + "flos": 20120354196480.0, + "grad_norm": 2.038901633330455, + "language_loss": 0.6516825, + "learning_rate": 3.966499479610599e-07, + "loss": 0.67334521, + "num_input_tokens_seen": 143933940, + "step": 6671, + "time_per_iteration": 2.490997552871704 + }, + { + "auxiliary_loss_clip": 0.01125307, + "auxiliary_loss_mlp": 0.01025545, + "balance_loss_clip": 1.04564273, + "balance_loss_mlp": 1.01855016, + "epoch": 0.8022605663440149, + "flos": 27746200252800.0, + "grad_norm": 1.785382358419006, + "language_loss": 0.64952362, + "learning_rate": 3.9618443140640225e-07, + "loss": 0.67103213, + "num_input_tokens_seen": 143952850, + "step": 6672, + "time_per_iteration": 2.584472179412842 + }, + { + "auxiliary_loss_clip": 0.01019666, + "auxiliary_loss_mlp": 0.01000606, + "balance_loss_clip": 1.00687122, + "balance_loss_mlp": 0.99966413, + "epoch": 0.802380809234654, + "flos": 60244998768000.0, + "grad_norm": 0.6835795822480406, + "language_loss": 0.5134182, + "learning_rate": 3.957191581492918e-07, + "loss": 0.53362095, + "num_input_tokens_seen": 144013610, + "step": 6673, + "time_per_iteration": 3.1603152751922607 + }, + { + "auxiliary_loss_clip": 0.01136657, + "auxiliary_loss_mlp": 0.01022412, + "balance_loss_clip": 1.04371166, + "balance_loss_mlp": 1.01489925, + "epoch": 0.8025010521252931, + "flos": 15080722352640.0, + "grad_norm": 3.010372342995825, + "language_loss": 0.71394753, + "learning_rate": 3.952541282603097e-07, + "loss": 0.73553818, + "num_input_tokens_seen": 144028715, + "step": 6674, + "time_per_iteration": 2.4856362342834473 + }, + { + "auxiliary_loss_clip": 0.011511, + "auxiliary_loss_mlp": 0.01024078, + "balance_loss_clip": 1.04503536, + "balance_loss_mlp": 1.01675892, + "epoch": 0.8026212950159322, + "flos": 22163527618560.0, + "grad_norm": 5.653718714763743, + "language_loss": 0.83651781, + "learning_rate": 3.9478934181000013e-07, + "loss": 0.85826963, + "num_input_tokens_seen": 144048740, + "step": 6675, + "time_per_iteration": 2.4742438793182373 + }, + { + "auxiliary_loss_clip": 0.0117294, + "auxiliary_loss_mlp": 0.01023331, + "balance_loss_clip": 1.0485127, + "balance_loss_mlp": 1.01592875, + "epoch": 0.8027415379065713, + "flos": 17675986792320.0, + "grad_norm": 2.256306858818679, + "language_loss": 0.84467387, + "learning_rate": 3.943247988688714e-07, + "loss": 0.86663651, + "num_input_tokens_seen": 144067435, + "step": 6676, + "time_per_iteration": 2.424525499343872 + }, + { + "auxiliary_loss_clip": 0.01155044, + "auxiliary_loss_mlp": 0.01022465, + "balance_loss_clip": 1.04580498, + "balance_loss_mlp": 1.01576495, + "epoch": 0.8028617807972104, + "flos": 21979593048960.0, + "grad_norm": 1.815576255562185, + "language_loss": 0.72208732, + "learning_rate": 3.938604995073933e-07, + "loss": 0.74386239, + "num_input_tokens_seen": 144085905, + "step": 6677, + "time_per_iteration": 2.468031883239746 + }, + { + "auxiliary_loss_clip": 0.01143015, + "auxiliary_loss_mlp": 0.01025232, + "balance_loss_clip": 1.04384184, + "balance_loss_mlp": 1.01808596, + "epoch": 0.8029820236878494, + "flos": 26428457905920.0, + "grad_norm": 1.6930411685163151, + "language_loss": 0.65450251, + "learning_rate": 3.9339644379600157e-07, + "loss": 0.67618501, + "num_input_tokens_seen": 144105735, + "step": 6678, + "time_per_iteration": 2.6075901985168457 + }, + { + "auxiliary_loss_clip": 0.01160383, + "auxiliary_loss_mlp": 0.01023838, + "balance_loss_clip": 1.04832888, + "balance_loss_mlp": 1.01663446, + "epoch": 0.8031022665784886, + "flos": 17676489582720.0, + "grad_norm": 1.7088288964601337, + "language_loss": 0.71659869, + "learning_rate": 3.929326318050907e-07, + "loss": 0.73844093, + "num_input_tokens_seen": 144123405, + "step": 6679, + "time_per_iteration": 3.2083945274353027 + }, + { + "auxiliary_loss_clip": 0.01165066, + "auxiliary_loss_mlp": 0.01024874, + "balance_loss_clip": 1.04476166, + "balance_loss_mlp": 1.0175848, + "epoch": 0.8032225094691277, + "flos": 15450279431040.0, + "grad_norm": 1.8740151307103947, + "language_loss": 0.78882414, + "learning_rate": 3.924690636050225e-07, + "loss": 0.8107236, + "num_input_tokens_seen": 144140815, + "step": 6680, + "time_per_iteration": 3.17471981048584 + }, + { + "auxiliary_loss_clip": 0.01157191, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.04747438, + "balance_loss_mlp": 1.02022588, + "epoch": 0.8033427523597667, + "flos": 26179202453760.0, + "grad_norm": 1.9883504082792367, + "language_loss": 0.72521126, + "learning_rate": 3.9200573926611915e-07, + "loss": 0.74707341, + "num_input_tokens_seen": 144162230, + "step": 6681, + "time_per_iteration": 2.496060371398926 + }, + { + "auxiliary_loss_clip": 0.01152342, + "auxiliary_loss_mlp": 0.01024958, + "balance_loss_clip": 1.04716206, + "balance_loss_mlp": 1.01749587, + "epoch": 0.8034629952504058, + "flos": 21324905809920.0, + "grad_norm": 1.8973963420383149, + "language_loss": 0.72343791, + "learning_rate": 3.9154265885866613e-07, + "loss": 0.74521089, + "num_input_tokens_seen": 144181540, + "step": 6682, + "time_per_iteration": 3.2287490367889404 + }, + { + "auxiliary_loss_clip": 0.01153454, + "auxiliary_loss_mlp": 0.01025198, + "balance_loss_clip": 1.04685855, + "balance_loss_mlp": 1.0170207, + "epoch": 0.8035832381410449, + "flos": 21651585027840.0, + "grad_norm": 4.887404350711058, + "language_loss": 0.74616265, + "learning_rate": 3.9107982245291394e-07, + "loss": 0.76794916, + "num_input_tokens_seen": 144199665, + "step": 6683, + "time_per_iteration": 2.451277732849121 + }, + { + "auxiliary_loss_clip": 0.01127237, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.04492652, + "balance_loss_mlp": 1.020262, + "epoch": 0.803703481031684, + "flos": 20518818744960.0, + "grad_norm": 2.101836228917135, + "language_loss": 0.77116334, + "learning_rate": 3.9061723011907245e-07, + "loss": 0.79271746, + "num_input_tokens_seen": 144219020, + "step": 6684, + "time_per_iteration": 3.302645444869995 + }, + { + "auxiliary_loss_clip": 0.01140121, + "auxiliary_loss_mlp": 0.01024667, + "balance_loss_clip": 1.04443908, + "balance_loss_mlp": 1.01715088, + "epoch": 0.803823723922323, + "flos": 22854807838080.0, + "grad_norm": 1.711116567173154, + "language_loss": 0.79137206, + "learning_rate": 3.901548819273179e-07, + "loss": 0.81301999, + "num_input_tokens_seen": 144239035, + "step": 6685, + "time_per_iteration": 2.494050979614258 + }, + { + "auxiliary_loss_clip": 0.01158482, + "auxiliary_loss_mlp": 0.01024847, + "balance_loss_clip": 1.04903173, + "balance_loss_mlp": 1.01712811, + "epoch": 0.8039439668129622, + "flos": 21362145235200.0, + "grad_norm": 2.858562376981526, + "language_loss": 0.69157696, + "learning_rate": 3.896927779477881e-07, + "loss": 0.71341026, + "num_input_tokens_seen": 144258295, + "step": 6686, + "time_per_iteration": 2.4704811573028564 + }, + { + "auxiliary_loss_clip": 0.01128072, + "auxiliary_loss_mlp": 0.01022185, + "balance_loss_clip": 1.04389238, + "balance_loss_mlp": 1.01452303, + "epoch": 0.8040642097036013, + "flos": 23802382575360.0, + "grad_norm": 2.056247392857799, + "language_loss": 0.6701805, + "learning_rate": 3.892309182505833e-07, + "loss": 0.69168305, + "num_input_tokens_seen": 144276110, + "step": 6687, + "time_per_iteration": 2.527461290359497 + }, + { + "auxiliary_loss_clip": 0.01166562, + "auxiliary_loss_mlp": 0.01024399, + "balance_loss_clip": 1.04524505, + "balance_loss_mlp": 1.01726103, + "epoch": 0.8041844525942403, + "flos": 25922046009600.0, + "grad_norm": 2.269246828282638, + "language_loss": 0.85703719, + "learning_rate": 3.887693029057675e-07, + "loss": 0.87894678, + "num_input_tokens_seen": 144295620, + "step": 6688, + "time_per_iteration": 2.4754068851470947 + }, + { + "auxiliary_loss_clip": 0.0114076, + "auxiliary_loss_mlp": 0.01025563, + "balance_loss_clip": 1.04499662, + "balance_loss_mlp": 1.01829457, + "epoch": 0.8043046954848795, + "flos": 25191120153600.0, + "grad_norm": 1.6796384460641647, + "language_loss": 0.81188834, + "learning_rate": 3.8830793198336684e-07, + "loss": 0.83355153, + "num_input_tokens_seen": 144315210, + "step": 6689, + "time_per_iteration": 2.515183448791504 + }, + { + "auxiliary_loss_clip": 0.0116015, + "auxiliary_loss_mlp": 0.01028816, + "balance_loss_clip": 1.04646266, + "balance_loss_mlp": 1.02172303, + "epoch": 0.8044249383755185, + "flos": 41719185123840.0, + "grad_norm": 2.353541515118501, + "language_loss": 0.70352197, + "learning_rate": 3.878468055533721e-07, + "loss": 0.72541165, + "num_input_tokens_seen": 144337750, + "step": 6690, + "time_per_iteration": 2.6281516551971436 + }, + { + "auxiliary_loss_clip": 0.01134095, + "auxiliary_loss_mlp": 0.0102782, + "balance_loss_clip": 1.04666519, + "balance_loss_mlp": 1.01980865, + "epoch": 0.8045451812661576, + "flos": 20631434860800.0, + "grad_norm": 2.289739578753066, + "language_loss": 0.84343153, + "learning_rate": 3.8738592368573464e-07, + "loss": 0.86505067, + "num_input_tokens_seen": 144355305, + "step": 6691, + "time_per_iteration": 2.510437250137329 + }, + { + "auxiliary_loss_clip": 0.01117823, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.04394364, + "balance_loss_mlp": 1.01917052, + "epoch": 0.8046654241567968, + "flos": 29711806254720.0, + "grad_norm": 2.022748912720235, + "language_loss": 0.87927175, + "learning_rate": 3.8692528645037137e-07, + "loss": 0.90071636, + "num_input_tokens_seen": 144374485, + "step": 6692, + "time_per_iteration": 2.6092047691345215 + }, + { + "auxiliary_loss_clip": 0.01169459, + "auxiliary_loss_mlp": 0.01029381, + "balance_loss_clip": 1.04882598, + "balance_loss_mlp": 1.02221394, + "epoch": 0.8047856670474358, + "flos": 17671389851520.0, + "grad_norm": 2.4741427698461975, + "language_loss": 0.77762711, + "learning_rate": 3.8646489391715907e-07, + "loss": 0.7996155, + "num_input_tokens_seen": 144388780, + "step": 6693, + "time_per_iteration": 2.439185380935669 + }, + { + "auxiliary_loss_clip": 0.01140018, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.04463959, + "balance_loss_mlp": 1.02117121, + "epoch": 0.8049059099380749, + "flos": 17120699464320.0, + "grad_norm": 3.2642548555690665, + "language_loss": 0.88262832, + "learning_rate": 3.8600474615593903e-07, + "loss": 0.90431708, + "num_input_tokens_seen": 144403395, + "step": 6694, + "time_per_iteration": 2.474686622619629 + }, + { + "auxiliary_loss_clip": 0.0103362, + "auxiliary_loss_mlp": 0.01000823, + "balance_loss_clip": 1.00717545, + "balance_loss_mlp": 0.99973851, + "epoch": 0.805026152828714, + "flos": 62212903240320.0, + "grad_norm": 0.7820758806448971, + "language_loss": 0.59693885, + "learning_rate": 3.8554484323651605e-07, + "loss": 0.61728334, + "num_input_tokens_seen": 144465265, + "step": 6695, + "time_per_iteration": 3.153766393661499 + }, + { + "auxiliary_loss_clip": 0.01152805, + "auxiliary_loss_mlp": 0.00762048, + "balance_loss_clip": 1.04652965, + "balance_loss_mlp": 1.00062418, + "epoch": 0.8051463957193531, + "flos": 21688608971520.0, + "grad_norm": 1.5602879158557164, + "language_loss": 0.7942546, + "learning_rate": 3.85085185228657e-07, + "loss": 0.81340313, + "num_input_tokens_seen": 144484235, + "step": 6696, + "time_per_iteration": 2.4706826210021973 + }, + { + "auxiliary_loss_clip": 0.01136661, + "auxiliary_loss_mlp": 0.01027378, + "balance_loss_clip": 1.04438853, + "balance_loss_mlp": 1.01979613, + "epoch": 0.8052666386099921, + "flos": 32051458535040.0, + "grad_norm": 1.9213974732978862, + "language_loss": 0.73129296, + "learning_rate": 3.8462577220209114e-07, + "loss": 0.75293326, + "num_input_tokens_seen": 144504610, + "step": 6697, + "time_per_iteration": 2.5753207206726074 + }, + { + "auxiliary_loss_clip": 0.01063616, + "auxiliary_loss_mlp": 0.01000851, + "balance_loss_clip": 1.00736713, + "balance_loss_mlp": 0.99980778, + "epoch": 0.8053868815006313, + "flos": 67157875768320.0, + "grad_norm": 0.7187386942588324, + "language_loss": 0.58981639, + "learning_rate": 3.8416660422651127e-07, + "loss": 0.61046106, + "num_input_tokens_seen": 144574260, + "step": 6698, + "time_per_iteration": 3.0929293632507324 + }, + { + "auxiliary_loss_clip": 0.0113102, + "auxiliary_loss_mlp": 0.01027935, + "balance_loss_clip": 1.04319346, + "balance_loss_mlp": 1.01990056, + "epoch": 0.8055071243912704, + "flos": 23837000307840.0, + "grad_norm": 1.745468096706696, + "language_loss": 0.67867988, + "learning_rate": 3.837076813715723e-07, + "loss": 0.7002694, + "num_input_tokens_seen": 144594145, + "step": 6699, + "time_per_iteration": 2.545842170715332 + }, + { + "auxiliary_loss_clip": 0.01124329, + "auxiliary_loss_mlp": 0.01023252, + "balance_loss_clip": 1.04162419, + "balance_loss_mlp": 1.01526463, + "epoch": 0.8056273672819094, + "flos": 21324510760320.0, + "grad_norm": 2.0343220246589695, + "language_loss": 0.75051498, + "learning_rate": 3.832490037068941e-07, + "loss": 0.77199078, + "num_input_tokens_seen": 144612935, + "step": 6700, + "time_per_iteration": 2.533031940460205 + }, + { + "auxiliary_loss_clip": 0.01095814, + "auxiliary_loss_mlp": 0.01022431, + "balance_loss_clip": 1.0404923, + "balance_loss_mlp": 1.0150516, + "epoch": 0.8057476101725486, + "flos": 25768383626880.0, + "grad_norm": 2.187465695822132, + "language_loss": 0.76070487, + "learning_rate": 3.827905713020554e-07, + "loss": 0.78188729, + "num_input_tokens_seen": 144630580, + "step": 6701, + "time_per_iteration": 2.6461617946624756 + }, + { + "auxiliary_loss_clip": 0.01130186, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.04163289, + "balance_loss_mlp": 1.02181315, + "epoch": 0.8058678530631876, + "flos": 24535283679360.0, + "grad_norm": 1.9578889869417822, + "language_loss": 0.68616903, + "learning_rate": 3.823323842266017e-07, + "loss": 0.7077719, + "num_input_tokens_seen": 144649975, + "step": 6702, + "time_per_iteration": 2.6159229278564453 + }, + { + "auxiliary_loss_clip": 0.01155602, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.04354465, + "balance_loss_mlp": 1.01852512, + "epoch": 0.8059880959538267, + "flos": 24753728240640.0, + "grad_norm": 2.238400744861184, + "language_loss": 0.72931504, + "learning_rate": 3.818744425500393e-07, + "loss": 0.75112939, + "num_input_tokens_seen": 144667990, + "step": 6703, + "time_per_iteration": 2.4960548877716064 + }, + { + "auxiliary_loss_clip": 0.01120856, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.04119539, + "balance_loss_mlp": 1.01870084, + "epoch": 0.8061083388444659, + "flos": 22196349671040.0, + "grad_norm": 1.836934186117584, + "language_loss": 0.80629623, + "learning_rate": 3.8141674634183675e-07, + "loss": 0.82777596, + "num_input_tokens_seen": 144687020, + "step": 6704, + "time_per_iteration": 2.5382399559020996 + }, + { + "auxiliary_loss_clip": 0.01111402, + "auxiliary_loss_mlp": 0.0102598, + "balance_loss_clip": 1.04388905, + "balance_loss_mlp": 1.01914597, + "epoch": 0.8062285817351049, + "flos": 30044195735040.0, + "grad_norm": 2.1445973354152192, + "language_loss": 0.66810274, + "learning_rate": 3.809592956714278e-07, + "loss": 0.68947655, + "num_input_tokens_seen": 144710255, + "step": 6705, + "time_per_iteration": 2.6204850673675537 + }, + { + "auxiliary_loss_clip": 0.01159883, + "auxiliary_loss_mlp": 0.01028068, + "balance_loss_clip": 1.04810047, + "balance_loss_mlp": 1.02069497, + "epoch": 0.806348824625744, + "flos": 22782591544320.0, + "grad_norm": 2.0446352436521598, + "language_loss": 0.74694026, + "learning_rate": 3.805020906082057e-07, + "loss": 0.76881981, + "num_input_tokens_seen": 144728830, + "step": 6706, + "time_per_iteration": 3.258519172668457 + }, + { + "auxiliary_loss_clip": 0.01143996, + "auxiliary_loss_mlp": 0.0102458, + "balance_loss_clip": 1.04472363, + "balance_loss_mlp": 1.01673639, + "epoch": 0.8064690675163831, + "flos": 23404600385280.0, + "grad_norm": 2.220247850170993, + "language_loss": 0.80841911, + "learning_rate": 3.8004513122152917e-07, + "loss": 0.83010495, + "num_input_tokens_seen": 144747140, + "step": 6707, + "time_per_iteration": 3.2753114700317383 + }, + { + "auxiliary_loss_clip": 0.01130837, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.04570079, + "balance_loss_mlp": 1.02260685, + "epoch": 0.8065893104070222, + "flos": 24060903736320.0, + "grad_norm": 1.7561201781804616, + "language_loss": 0.67061383, + "learning_rate": 3.79588417580718e-07, + "loss": 0.69221693, + "num_input_tokens_seen": 144765250, + "step": 6708, + "time_per_iteration": 2.502037525177002 + }, + { + "auxiliary_loss_clip": 0.01158678, + "auxiliary_loss_mlp": 0.01023911, + "balance_loss_clip": 1.04895878, + "balance_loss_mlp": 1.01704454, + "epoch": 0.8067095532976613, + "flos": 22305410340480.0, + "grad_norm": 2.371556914020878, + "language_loss": 0.76730871, + "learning_rate": 3.791319497550558e-07, + "loss": 0.78913456, + "num_input_tokens_seen": 144783080, + "step": 6709, + "time_per_iteration": 3.2143876552581787 + }, + { + "auxiliary_loss_clip": 0.01132515, + "auxiliary_loss_mlp": 0.00761854, + "balance_loss_clip": 1.04496837, + "balance_loss_mlp": 1.00060594, + "epoch": 0.8068297961883004, + "flos": 17129498296320.0, + "grad_norm": 1.9000481312982145, + "language_loss": 0.70532072, + "learning_rate": 3.78675727813788e-07, + "loss": 0.72426438, + "num_input_tokens_seen": 144800645, + "step": 6710, + "time_per_iteration": 2.496975898742676 + }, + { + "auxiliary_loss_clip": 0.01140603, + "auxiliary_loss_mlp": 0.01020554, + "balance_loss_clip": 1.04554498, + "balance_loss_mlp": 1.01313353, + "epoch": 0.8069500390789395, + "flos": 22018843635840.0, + "grad_norm": 3.2352551107027665, + "language_loss": 0.73653567, + "learning_rate": 3.782197518261225e-07, + "loss": 0.75814724, + "num_input_tokens_seen": 144820085, + "step": 6711, + "time_per_iteration": 3.255855083465576 + }, + { + "auxiliary_loss_clip": 0.01147867, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.04720449, + "balance_loss_mlp": 1.02327347, + "epoch": 0.8070702819695785, + "flos": 19244241567360.0, + "grad_norm": 2.2553705018886747, + "language_loss": 0.9564867, + "learning_rate": 3.777640218612319e-07, + "loss": 0.97827232, + "num_input_tokens_seen": 144838070, + "step": 6712, + "time_per_iteration": 2.476952314376831 + }, + { + "auxiliary_loss_clip": 0.01149477, + "auxiliary_loss_mlp": 0.01024939, + "balance_loss_clip": 1.04518914, + "balance_loss_mlp": 1.01778674, + "epoch": 0.8071905248602176, + "flos": 21544320038400.0, + "grad_norm": 2.7384917264114375, + "language_loss": 0.71654981, + "learning_rate": 3.773085379882488e-07, + "loss": 0.73829401, + "num_input_tokens_seen": 144857125, + "step": 6713, + "time_per_iteration": 2.4687089920043945 + }, + { + "auxiliary_loss_clip": 0.01154546, + "auxiliary_loss_mlp": 0.00762437, + "balance_loss_clip": 1.04418349, + "balance_loss_mlp": 1.00054276, + "epoch": 0.8073107677508568, + "flos": 37268309105280.0, + "grad_norm": 2.886632841116915, + "language_loss": 0.7588675, + "learning_rate": 3.768533002762715e-07, + "loss": 0.77803737, + "num_input_tokens_seen": 144880660, + "step": 6714, + "time_per_iteration": 2.7385222911834717 + }, + { + "auxiliary_loss_clip": 0.01139402, + "auxiliary_loss_mlp": 0.01022084, + "balance_loss_clip": 1.04163265, + "balance_loss_mlp": 1.01520896, + "epoch": 0.8074310106414958, + "flos": 28366269759360.0, + "grad_norm": 1.7559674972181891, + "language_loss": 0.76821417, + "learning_rate": 3.763983087943572e-07, + "loss": 0.78982902, + "num_input_tokens_seen": 144900050, + "step": 6715, + "time_per_iteration": 2.6670424938201904 + }, + { + "auxiliary_loss_clip": 0.01143917, + "auxiliary_loss_mlp": 0.00762101, + "balance_loss_clip": 1.04223287, + "balance_loss_mlp": 1.00061083, + "epoch": 0.8075512535321349, + "flos": 24281646768000.0, + "grad_norm": 1.6876161915981822, + "language_loss": 0.80977446, + "learning_rate": 3.759435636115282e-07, + "loss": 0.82883459, + "num_input_tokens_seen": 144920835, + "step": 6716, + "time_per_iteration": 2.52382755279541 + }, + { + "auxiliary_loss_clip": 0.01093988, + "auxiliary_loss_mlp": 0.00761569, + "balance_loss_clip": 1.0430845, + "balance_loss_mlp": 1.00058198, + "epoch": 0.807671496422774, + "flos": 26030855283840.0, + "grad_norm": 1.85756788631271, + "language_loss": 0.72942519, + "learning_rate": 3.7548906479676967e-07, + "loss": 0.74798071, + "num_input_tokens_seen": 144940430, + "step": 6717, + "time_per_iteration": 2.674114227294922 + }, + { + "auxiliary_loss_clip": 0.01157654, + "auxiliary_loss_mlp": 0.01023415, + "balance_loss_clip": 1.04459119, + "balance_loss_mlp": 1.01624465, + "epoch": 0.8077917393134131, + "flos": 23730740899200.0, + "grad_norm": 1.6681715722310586, + "language_loss": 0.71092522, + "learning_rate": 3.7503481241902855e-07, + "loss": 0.73273587, + "num_input_tokens_seen": 144960405, + "step": 6718, + "time_per_iteration": 2.5689239501953125 + }, + { + "auxiliary_loss_clip": 0.01141193, + "auxiliary_loss_mlp": 0.00761996, + "balance_loss_clip": 1.0439918, + "balance_loss_mlp": 1.00061727, + "epoch": 0.8079119822040521, + "flos": 18402028398720.0, + "grad_norm": 2.956697112084838, + "language_loss": 0.80074775, + "learning_rate": 3.745808065472145e-07, + "loss": 0.81977963, + "num_input_tokens_seen": 144977700, + "step": 6719, + "time_per_iteration": 2.543985605239868 + }, + { + "auxiliary_loss_clip": 0.01150772, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.049227, + "balance_loss_mlp": 1.02030611, + "epoch": 0.8080322250946913, + "flos": 23621787970560.0, + "grad_norm": 1.604128176776891, + "language_loss": 0.76310456, + "learning_rate": 3.741270472501994e-07, + "loss": 0.78488517, + "num_input_tokens_seen": 144998340, + "step": 6720, + "time_per_iteration": 2.550520658493042 + }, + { + "auxiliary_loss_clip": 0.01140215, + "auxiliary_loss_mlp": 0.01022918, + "balance_loss_clip": 1.04708672, + "balance_loss_mlp": 1.01618576, + "epoch": 0.8081524679853304, + "flos": 22820692896000.0, + "grad_norm": 1.6416952902133692, + "language_loss": 0.72789317, + "learning_rate": 3.736735345968183e-07, + "loss": 0.74952447, + "num_input_tokens_seen": 145017950, + "step": 6721, + "time_per_iteration": 2.531588554382324 + }, + { + "auxiliary_loss_clip": 0.0115596, + "auxiliary_loss_mlp": 0.01024383, + "balance_loss_clip": 1.04664791, + "balance_loss_mlp": 1.01740885, + "epoch": 0.8082727108759694, + "flos": 17640004343040.0, + "grad_norm": 1.5511733084340908, + "language_loss": 0.78653502, + "learning_rate": 3.7322026865586986e-07, + "loss": 0.80833846, + "num_input_tokens_seen": 145036985, + "step": 6722, + "time_per_iteration": 2.5977141857147217 + }, + { + "auxiliary_loss_clip": 0.01162092, + "auxiliary_loss_mlp": 0.01025182, + "balance_loss_clip": 1.0496099, + "balance_loss_mlp": 1.01754057, + "epoch": 0.8083929537666086, + "flos": 25958172113280.0, + "grad_norm": 1.891686630381942, + "language_loss": 0.73587835, + "learning_rate": 3.7276724949611206e-07, + "loss": 0.75775111, + "num_input_tokens_seen": 145057095, + "step": 6723, + "time_per_iteration": 2.5652053356170654 + }, + { + "auxiliary_loss_clip": 0.01146104, + "auxiliary_loss_mlp": 0.01025175, + "balance_loss_clip": 1.0467608, + "balance_loss_mlp": 1.01688957, + "epoch": 0.8085131966572476, + "flos": 27089178629760.0, + "grad_norm": 2.651796165077198, + "language_loss": 0.75097728, + "learning_rate": 3.723144771862694e-07, + "loss": 0.77269006, + "num_input_tokens_seen": 145077735, + "step": 6724, + "time_per_iteration": 2.543401002883911 + }, + { + "auxiliary_loss_clip": 0.0112817, + "auxiliary_loss_mlp": 0.01023797, + "balance_loss_clip": 1.04076409, + "balance_loss_mlp": 1.01604211, + "epoch": 0.8086334395478867, + "flos": 23988543788160.0, + "grad_norm": 1.5056736582599202, + "language_loss": 0.76853573, + "learning_rate": 3.718619517950263e-07, + "loss": 0.79005539, + "num_input_tokens_seen": 145098330, + "step": 6725, + "time_per_iteration": 2.6382532119750977 + }, + { + "auxiliary_loss_clip": 0.0116943, + "auxiliary_loss_mlp": 0.01029025, + "balance_loss_clip": 1.04905534, + "balance_loss_mlp": 1.02169657, + "epoch": 0.8087536824385259, + "flos": 20405879406720.0, + "grad_norm": 1.9060161596181828, + "language_loss": 0.76453853, + "learning_rate": 3.714096733910301e-07, + "loss": 0.7865231, + "num_input_tokens_seen": 145115855, + "step": 6726, + "time_per_iteration": 2.484654426574707 + }, + { + "auxiliary_loss_clip": 0.01163829, + "auxiliary_loss_mlp": 0.01023396, + "balance_loss_clip": 1.04794693, + "balance_loss_mlp": 1.01542103, + "epoch": 0.8088739253291649, + "flos": 25919639798400.0, + "grad_norm": 2.4395123221014203, + "language_loss": 0.70512164, + "learning_rate": 3.709576420428926e-07, + "loss": 0.72699392, + "num_input_tokens_seen": 145136655, + "step": 6727, + "time_per_iteration": 2.606008768081665 + }, + { + "auxiliary_loss_clip": 0.0114027, + "auxiliary_loss_mlp": 0.01023822, + "balance_loss_clip": 1.04190493, + "balance_loss_mlp": 1.01682782, + "epoch": 0.808994168219804, + "flos": 28402072640640.0, + "grad_norm": 2.3897637619515546, + "language_loss": 0.73118722, + "learning_rate": 3.7050585781918463e-07, + "loss": 0.75282812, + "num_input_tokens_seen": 145156955, + "step": 6728, + "time_per_iteration": 2.607781410217285 + }, + { + "auxiliary_loss_clip": 0.01160049, + "auxiliary_loss_mlp": 0.01026119, + "balance_loss_clip": 1.04670322, + "balance_loss_mlp": 1.01809645, + "epoch": 0.8091144111104431, + "flos": 17421056991360.0, + "grad_norm": 2.227869335024842, + "language_loss": 0.68775189, + "learning_rate": 3.700543207884428e-07, + "loss": 0.70961356, + "num_input_tokens_seen": 145173865, + "step": 6729, + "time_per_iteration": 2.5681650638580322 + }, + { + "auxiliary_loss_clip": 0.0115377, + "auxiliary_loss_mlp": 0.01025285, + "balance_loss_clip": 1.04690838, + "balance_loss_mlp": 1.0182631, + "epoch": 0.8092346540010822, + "flos": 32153803361280.0, + "grad_norm": 1.6420427270713238, + "language_loss": 0.71104777, + "learning_rate": 3.6960303101916466e-07, + "loss": 0.73283827, + "num_input_tokens_seen": 145193780, + "step": 6730, + "time_per_iteration": 2.71044659614563 + }, + { + "auxiliary_loss_clip": 0.01063265, + "auxiliary_loss_mlp": 0.00752877, + "balance_loss_clip": 1.00693405, + "balance_loss_mlp": 1.00051832, + "epoch": 0.8093548968917212, + "flos": 58035093390720.0, + "grad_norm": 0.7387032988732145, + "language_loss": 0.55598211, + "learning_rate": 3.6915198857981047e-07, + "loss": 0.57414353, + "num_input_tokens_seen": 145258980, + "step": 6731, + "time_per_iteration": 3.056812286376953 + }, + { + "auxiliary_loss_clip": 0.01122652, + "auxiliary_loss_mlp": 0.0102376, + "balance_loss_clip": 1.04200363, + "balance_loss_mlp": 1.01553404, + "epoch": 0.8094751397823604, + "flos": 27381599251200.0, + "grad_norm": 1.7632202899278302, + "language_loss": 0.67880136, + "learning_rate": 3.687011935388027e-07, + "loss": 0.70026547, + "num_input_tokens_seen": 145281875, + "step": 6732, + "time_per_iteration": 3.3002750873565674 + }, + { + "auxiliary_loss_clip": 0.01155866, + "auxiliary_loss_mlp": 0.01021908, + "balance_loss_clip": 1.04703426, + "balance_loss_mlp": 1.01476455, + "epoch": 0.8095953826729995, + "flos": 24061083304320.0, + "grad_norm": 2.8291596768193568, + "language_loss": 0.72880697, + "learning_rate": 3.6825064596452646e-07, + "loss": 0.75058472, + "num_input_tokens_seen": 145302220, + "step": 6733, + "time_per_iteration": 3.3165602684020996 + }, + { + "auxiliary_loss_clip": 0.01154357, + "auxiliary_loss_mlp": 0.01022905, + "balance_loss_clip": 1.04506409, + "balance_loss_mlp": 1.01567459, + "epoch": 0.8097156255636385, + "flos": 23951412103680.0, + "grad_norm": 1.6862066417402752, + "language_loss": 0.70663977, + "learning_rate": 3.678003459253305e-07, + "loss": 0.72841239, + "num_input_tokens_seen": 145323070, + "step": 6734, + "time_per_iteration": 2.497838258743286 + }, + { + "auxiliary_loss_clip": 0.01125293, + "auxiliary_loss_mlp": 0.01021877, + "balance_loss_clip": 1.04281056, + "balance_loss_mlp": 1.01437891, + "epoch": 0.8098358684542777, + "flos": 21799142098560.0, + "grad_norm": 2.1935039250781445, + "language_loss": 0.74146724, + "learning_rate": 3.673502934895236e-07, + "loss": 0.76293898, + "num_input_tokens_seen": 145342575, + "step": 6735, + "time_per_iteration": 3.283790111541748 + }, + { + "auxiliary_loss_clip": 0.01063769, + "auxiliary_loss_mlp": 0.01000478, + "balance_loss_clip": 1.00729775, + "balance_loss_mlp": 0.99939936, + "epoch": 0.8099561113449167, + "flos": 68809515966720.0, + "grad_norm": 0.6915622511621066, + "language_loss": 0.5799638, + "learning_rate": 3.669004887253802e-07, + "loss": 0.60060632, + "num_input_tokens_seen": 145408865, + "step": 6736, + "time_per_iteration": 3.17562198638916 + }, + { + "auxiliary_loss_clip": 0.01144595, + "auxiliary_loss_mlp": 0.01024363, + "balance_loss_clip": 1.04697871, + "balance_loss_mlp": 1.01724041, + "epoch": 0.8100763542355558, + "flos": 23586056916480.0, + "grad_norm": 1.7084152581656076, + "language_loss": 0.78809273, + "learning_rate": 3.664509317011335e-07, + "loss": 0.80978233, + "num_input_tokens_seen": 145429200, + "step": 6737, + "time_per_iteration": 3.3316738605499268 + }, + { + "auxiliary_loss_clip": 0.01157541, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.05067456, + "balance_loss_mlp": 1.02097631, + "epoch": 0.810196597126195, + "flos": 31650408207360.0, + "grad_norm": 1.8482518030308748, + "language_loss": 0.7360847, + "learning_rate": 3.6600162248498134e-07, + "loss": 0.75794983, + "num_input_tokens_seen": 145452830, + "step": 6738, + "time_per_iteration": 2.5985937118530273 + }, + { + "auxiliary_loss_clip": 0.01082115, + "auxiliary_loss_mlp": 0.01025061, + "balance_loss_clip": 1.03728628, + "balance_loss_mlp": 1.01814413, + "epoch": 0.810316840016834, + "flos": 24900459298560.0, + "grad_norm": 1.7094870604702959, + "language_loss": 0.75860369, + "learning_rate": 3.6555256114508426e-07, + "loss": 0.77967542, + "num_input_tokens_seen": 145472625, + "step": 6739, + "time_per_iteration": 2.6258528232574463 + }, + { + "auxiliary_loss_clip": 0.01140049, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.04112792, + "balance_loss_mlp": 1.0177995, + "epoch": 0.8104370829074731, + "flos": 27965003950080.0, + "grad_norm": 1.9980784273215768, + "language_loss": 0.72773612, + "learning_rate": 3.651037477495642e-07, + "loss": 0.74939036, + "num_input_tokens_seen": 145494075, + "step": 6740, + "time_per_iteration": 2.6228160858154297 + }, + { + "auxiliary_loss_clip": 0.01168912, + "auxiliary_loss_mlp": 0.01025004, + "balance_loss_clip": 1.0460906, + "balance_loss_mlp": 1.01766026, + "epoch": 0.8105573257981122, + "flos": 24640752988800.0, + "grad_norm": 2.0807153306792383, + "language_loss": 0.68031788, + "learning_rate": 3.6465518236650584e-07, + "loss": 0.70225704, + "num_input_tokens_seen": 145514220, + "step": 6741, + "time_per_iteration": 2.4924111366271973 + }, + { + "auxiliary_loss_clip": 0.01126881, + "auxiliary_loss_mlp": 0.01027189, + "balance_loss_clip": 1.04253972, + "balance_loss_mlp": 1.02034068, + "epoch": 0.8106775686887513, + "flos": 26358935132160.0, + "grad_norm": 1.7395052046206145, + "language_loss": 0.78336632, + "learning_rate": 3.642068650639558e-07, + "loss": 0.80490696, + "num_input_tokens_seen": 145533965, + "step": 6742, + "time_per_iteration": 2.6539158821105957 + }, + { + "auxiliary_loss_clip": 0.01133356, + "auxiliary_loss_mlp": 0.01025029, + "balance_loss_clip": 1.03948331, + "balance_loss_mlp": 1.01757848, + "epoch": 0.8107978115793903, + "flos": 27271892136960.0, + "grad_norm": 3.0415150578628998, + "language_loss": 0.64494997, + "learning_rate": 3.6375879590992334e-07, + "loss": 0.66653383, + "num_input_tokens_seen": 145554310, + "step": 6743, + "time_per_iteration": 2.5995163917541504 + }, + { + "auxiliary_loss_clip": 0.0113733, + "auxiliary_loss_mlp": 0.01026608, + "balance_loss_clip": 1.04430795, + "balance_loss_mlp": 1.01872253, + "epoch": 0.8109180544700295, + "flos": 24934322845440.0, + "grad_norm": 1.974524156789001, + "language_loss": 0.810651, + "learning_rate": 3.6331097497238173e-07, + "loss": 0.83229035, + "num_input_tokens_seen": 145573755, + "step": 6744, + "time_per_iteration": 2.5906591415405273 + }, + { + "auxiliary_loss_clip": 0.01125217, + "auxiliary_loss_mlp": 0.01020733, + "balance_loss_clip": 1.0426538, + "balance_loss_mlp": 1.01341403, + "epoch": 0.8110382973606686, + "flos": 21105383840640.0, + "grad_norm": 2.1946474778479805, + "language_loss": 0.80037469, + "learning_rate": 3.628634023192627e-07, + "loss": 0.82183421, + "num_input_tokens_seen": 145594000, + "step": 6745, + "time_per_iteration": 2.5450026988983154 + }, + { + "auxiliary_loss_clip": 0.01157434, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.04662704, + "balance_loss_mlp": 1.02007747, + "epoch": 0.8111585402513076, + "flos": 15414081500160.0, + "grad_norm": 2.2831013092408576, + "language_loss": 0.75581974, + "learning_rate": 3.624160780184644e-07, + "loss": 0.77767795, + "num_input_tokens_seen": 145611215, + "step": 6746, + "time_per_iteration": 2.4365127086639404 + }, + { + "auxiliary_loss_clip": 0.01133819, + "auxiliary_loss_mlp": 0.01024961, + "balance_loss_clip": 1.0431459, + "balance_loss_mlp": 1.01748037, + "epoch": 0.8112787831419467, + "flos": 24095736950400.0, + "grad_norm": 2.2284418131493737, + "language_loss": 0.74403274, + "learning_rate": 3.6196900213784496e-07, + "loss": 0.76562047, + "num_input_tokens_seen": 145630530, + "step": 6747, + "time_per_iteration": 2.5509727001190186 + }, + { + "auxiliary_loss_clip": 0.01155391, + "auxiliary_loss_mlp": 0.01025452, + "balance_loss_clip": 1.0458107, + "balance_loss_mlp": 1.01846027, + "epoch": 0.8113990260325858, + "flos": 20483374999680.0, + "grad_norm": 1.8879139478338236, + "language_loss": 0.86800647, + "learning_rate": 3.6152217474522527e-07, + "loss": 0.88981485, + "num_input_tokens_seen": 145647345, + "step": 6748, + "time_per_iteration": 2.451894998550415 + }, + { + "auxiliary_loss_clip": 0.01153996, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.04763961, + "balance_loss_mlp": 1.02129412, + "epoch": 0.8115192689232249, + "flos": 24901141656960.0, + "grad_norm": 2.3236961617384195, + "language_loss": 0.72689146, + "learning_rate": 3.6107559590838975e-07, + "loss": 0.74871409, + "num_input_tokens_seen": 145666330, + "step": 6749, + "time_per_iteration": 2.4987666606903076 + }, + { + "auxiliary_loss_clip": 0.01092537, + "auxiliary_loss_mlp": 0.01023813, + "balance_loss_clip": 1.0398221, + "balance_loss_mlp": 1.01644611, + "epoch": 0.811639511813864, + "flos": 24057204635520.0, + "grad_norm": 2.149066955365205, + "language_loss": 0.66337603, + "learning_rate": 3.606292656950822e-07, + "loss": 0.68453956, + "num_input_tokens_seen": 145684740, + "step": 6750, + "time_per_iteration": 2.607827663421631 + }, + { + "auxiliary_loss_clip": 0.01137199, + "auxiliary_loss_mlp": 0.01024275, + "balance_loss_clip": 1.04234648, + "balance_loss_mlp": 1.01662815, + "epoch": 0.8117597547045031, + "flos": 23185150243200.0, + "grad_norm": 2.029941019151068, + "language_loss": 0.86790639, + "learning_rate": 3.601831841730121e-07, + "loss": 0.88952112, + "num_input_tokens_seen": 145702660, + "step": 6751, + "time_per_iteration": 2.530283212661743 + }, + { + "auxiliary_loss_clip": 0.01155782, + "auxiliary_loss_mlp": 0.01023485, + "balance_loss_clip": 1.04738915, + "balance_loss_mlp": 1.0159514, + "epoch": 0.8118799975951422, + "flos": 23040250778880.0, + "grad_norm": 1.9051075202063916, + "language_loss": 0.73005635, + "learning_rate": 3.5973735140984916e-07, + "loss": 0.75184906, + "num_input_tokens_seen": 145722830, + "step": 6752, + "time_per_iteration": 2.4777603149414062 + }, + { + "auxiliary_loss_clip": 0.01107041, + "auxiliary_loss_mlp": 0.00761372, + "balance_loss_clip": 1.03872609, + "balance_loss_mlp": 1.00070572, + "epoch": 0.8120002404857812, + "flos": 24639962889600.0, + "grad_norm": 2.7006679378444605, + "language_loss": 0.79306793, + "learning_rate": 3.5929176747322607e-07, + "loss": 0.81175208, + "num_input_tokens_seen": 145741935, + "step": 6753, + "time_per_iteration": 2.579991102218628 + }, + { + "auxiliary_loss_clip": 0.01046938, + "auxiliary_loss_mlp": 0.0100023, + "balance_loss_clip": 1.0081811, + "balance_loss_mlp": 0.99920511, + "epoch": 0.8121204833764204, + "flos": 57415742156160.0, + "grad_norm": 0.8112112166000741, + "language_loss": 0.5624584, + "learning_rate": 3.588464324307372e-07, + "loss": 0.58293009, + "num_input_tokens_seen": 145805560, + "step": 6754, + "time_per_iteration": 3.1148576736450195 + }, + { + "auxiliary_loss_clip": 0.01155157, + "auxiliary_loss_mlp": 0.01025776, + "balance_loss_clip": 1.04403663, + "balance_loss_mlp": 1.01865315, + "epoch": 0.8122407262670595, + "flos": 19464589549440.0, + "grad_norm": 1.6174961753161716, + "language_loss": 0.74985099, + "learning_rate": 3.584013463499391e-07, + "loss": 0.77166027, + "num_input_tokens_seen": 145824180, + "step": 6755, + "time_per_iteration": 2.450680732727051 + }, + { + "auxiliary_loss_clip": 0.0104523, + "auxiliary_loss_mlp": 0.01000957, + "balance_loss_clip": 1.00897801, + "balance_loss_mlp": 0.99990767, + "epoch": 0.8123609691576985, + "flos": 56425325472000.0, + "grad_norm": 0.7324339707970698, + "language_loss": 0.64409465, + "learning_rate": 3.579565092983521e-07, + "loss": 0.66455656, + "num_input_tokens_seen": 145885300, + "step": 6756, + "time_per_iteration": 2.955617904663086 + }, + { + "auxiliary_loss_clip": 0.01171066, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.04959095, + "balance_loss_mlp": 1.02507257, + "epoch": 0.8124812120483377, + "flos": 20631973564800.0, + "grad_norm": 2.093065900505286, + "language_loss": 0.83791673, + "learning_rate": 3.575119213434565e-07, + "loss": 0.85995209, + "num_input_tokens_seen": 145903815, + "step": 6757, + "time_per_iteration": 2.4253885746002197 + }, + { + "auxiliary_loss_clip": 0.01151272, + "auxiliary_loss_mlp": 0.01021158, + "balance_loss_clip": 1.04585838, + "balance_loss_mlp": 1.01407683, + "epoch": 0.8126014549389767, + "flos": 22492397566080.0, + "grad_norm": 2.0382638371773316, + "language_loss": 0.81697047, + "learning_rate": 3.5706758255269765e-07, + "loss": 0.83869481, + "num_input_tokens_seen": 145922270, + "step": 6758, + "time_per_iteration": 2.4710750579833984 + }, + { + "auxiliary_loss_clip": 0.01144477, + "auxiliary_loss_mlp": 0.01027792, + "balance_loss_clip": 1.04531288, + "balance_loss_mlp": 1.02005577, + "epoch": 0.8127216978296158, + "flos": 23287961946240.0, + "grad_norm": 1.907948868577448, + "language_loss": 0.69973403, + "learning_rate": 3.566234929934795e-07, + "loss": 0.72145677, + "num_input_tokens_seen": 145941470, + "step": 6759, + "time_per_iteration": 3.218820333480835 + }, + { + "auxiliary_loss_clip": 0.01151628, + "auxiliary_loss_mlp": 0.01029328, + "balance_loss_clip": 1.0482111, + "balance_loss_mlp": 1.0220685, + "epoch": 0.812841940720255, + "flos": 25154994049920.0, + "grad_norm": 1.4638336645882146, + "language_loss": 0.71530664, + "learning_rate": 3.561796527331706e-07, + "loss": 0.73711622, + "num_input_tokens_seen": 145963145, + "step": 6760, + "time_per_iteration": 3.284371852874756 + }, + { + "auxiliary_loss_clip": 0.01130106, + "auxiliary_loss_mlp": 0.01023799, + "balance_loss_clip": 1.04319739, + "balance_loss_mlp": 1.01640511, + "epoch": 0.812962183610894, + "flos": 26648446752000.0, + "grad_norm": 1.9995443024369342, + "language_loss": 0.77359354, + "learning_rate": 3.5573606183910163e-07, + "loss": 0.79513264, + "num_input_tokens_seen": 145983150, + "step": 6761, + "time_per_iteration": 2.5695559978485107 + }, + { + "auxiliary_loss_clip": 0.01159882, + "auxiliary_loss_mlp": 0.01025407, + "balance_loss_clip": 1.04525661, + "balance_loss_mlp": 1.01787007, + "epoch": 0.8130824265015331, + "flos": 24966965329920.0, + "grad_norm": 1.8215214699299056, + "language_loss": 0.78594345, + "learning_rate": 3.5529272037856493e-07, + "loss": 0.80779636, + "num_input_tokens_seen": 146001365, + "step": 6762, + "time_per_iteration": 3.258387804031372 + }, + { + "auxiliary_loss_clip": 0.01018017, + "auxiliary_loss_mlp": 0.01000567, + "balance_loss_clip": 1.00747788, + "balance_loss_mlp": 0.9995181, + "epoch": 0.8132026693921722, + "flos": 67622918175360.0, + "grad_norm": 0.709474462189831, + "language_loss": 0.53851241, + "learning_rate": 3.548496284188149e-07, + "loss": 0.55869824, + "num_input_tokens_seen": 146061570, + "step": 6763, + "time_per_iteration": 3.184030532836914 + }, + { + "auxiliary_loss_clip": 0.01109573, + "auxiliary_loss_mlp": 0.01023294, + "balance_loss_clip": 1.04530454, + "balance_loss_mlp": 1.01595628, + "epoch": 0.8133229122828113, + "flos": 19495149045120.0, + "grad_norm": 1.7414267898669973, + "language_loss": 0.79197991, + "learning_rate": 3.544067860270681e-07, + "loss": 0.8133086, + "num_input_tokens_seen": 146079145, + "step": 6764, + "time_per_iteration": 3.30715012550354 + }, + { + "auxiliary_loss_clip": 0.01128916, + "auxiliary_loss_mlp": 0.01021948, + "balance_loss_clip": 1.04369545, + "balance_loss_mlp": 1.01435483, + "epoch": 0.8134431551734503, + "flos": 20668135582080.0, + "grad_norm": 2.0017686806533606, + "language_loss": 0.7092914, + "learning_rate": 3.539641932705029e-07, + "loss": 0.73080003, + "num_input_tokens_seen": 146097625, + "step": 6765, + "time_per_iteration": 2.5138261318206787 + }, + { + "auxiliary_loss_clip": 0.01172208, + "auxiliary_loss_mlp": 0.01025611, + "balance_loss_clip": 1.04733419, + "balance_loss_mlp": 1.01754689, + "epoch": 0.8135633980640895, + "flos": 21507332008320.0, + "grad_norm": 2.3930482043199945, + "language_loss": 0.77114552, + "learning_rate": 3.53521850216262e-07, + "loss": 0.79312366, + "num_input_tokens_seen": 146117195, + "step": 6766, + "time_per_iteration": 2.4257283210754395 + }, + { + "auxiliary_loss_clip": 0.01170553, + "auxiliary_loss_mlp": 0.01025513, + "balance_loss_clip": 1.04829025, + "balance_loss_mlp": 1.0177052, + "epoch": 0.8136836409547286, + "flos": 20554442058240.0, + "grad_norm": 1.9128463995174563, + "language_loss": 0.76917267, + "learning_rate": 3.530797569314461e-07, + "loss": 0.7911334, + "num_input_tokens_seen": 146136220, + "step": 6767, + "time_per_iteration": 2.409191131591797 + }, + { + "auxiliary_loss_clip": 0.01169661, + "auxiliary_loss_mlp": 0.01020015, + "balance_loss_clip": 1.04841316, + "balance_loss_mlp": 1.01262379, + "epoch": 0.8138038838453676, + "flos": 20299045380480.0, + "grad_norm": 1.9473831667284993, + "language_loss": 0.77919686, + "learning_rate": 3.5263791348312235e-07, + "loss": 0.80109358, + "num_input_tokens_seen": 146155415, + "step": 6768, + "time_per_iteration": 2.4111969470977783 + }, + { + "auxiliary_loss_clip": 0.01139054, + "auxiliary_loss_mlp": 0.01020749, + "balance_loss_clip": 1.04349697, + "balance_loss_mlp": 1.01316702, + "epoch": 0.8139241267360068, + "flos": 29789840551680.0, + "grad_norm": 1.839671084150611, + "language_loss": 0.70705831, + "learning_rate": 3.521963199383171e-07, + "loss": 0.72865635, + "num_input_tokens_seen": 146178370, + "step": 6769, + "time_per_iteration": 2.5388343334198 + }, + { + "auxiliary_loss_clip": 0.01113765, + "auxiliary_loss_mlp": 0.01025317, + "balance_loss_clip": 1.04255521, + "balance_loss_mlp": 1.0171864, + "epoch": 0.8140443696266458, + "flos": 19713270384000.0, + "grad_norm": 1.9750170953081139, + "language_loss": 0.76658255, + "learning_rate": 3.517549763640197e-07, + "loss": 0.78797334, + "num_input_tokens_seen": 146196010, + "step": 6770, + "time_per_iteration": 2.535484790802002 + }, + { + "auxiliary_loss_clip": 0.01153608, + "auxiliary_loss_mlp": 0.00762156, + "balance_loss_clip": 1.04951417, + "balance_loss_mlp": 1.0005995, + "epoch": 0.8141646125172849, + "flos": 27160568910720.0, + "grad_norm": 1.917828718113755, + "language_loss": 0.71453559, + "learning_rate": 3.513138828271829e-07, + "loss": 0.73369324, + "num_input_tokens_seen": 146215880, + "step": 6771, + "time_per_iteration": 2.5075595378875732 + }, + { + "auxiliary_loss_clip": 0.01121682, + "auxiliary_loss_mlp": 0.01027369, + "balance_loss_clip": 1.04288793, + "balance_loss_mlp": 1.01998055, + "epoch": 0.8142848554079241, + "flos": 39673102700160.0, + "grad_norm": 1.7103719056024478, + "language_loss": 0.69861275, + "learning_rate": 3.508730393947179e-07, + "loss": 0.72010326, + "num_input_tokens_seen": 146239135, + "step": 6772, + "time_per_iteration": 2.6636803150177 + }, + { + "auxiliary_loss_clip": 0.01126621, + "auxiliary_loss_mlp": 0.01025317, + "balance_loss_clip": 1.04381657, + "balance_loss_mlp": 1.01780128, + "epoch": 0.8144050982985631, + "flos": 22237288197120.0, + "grad_norm": 1.7752718261837368, + "language_loss": 0.71914899, + "learning_rate": 3.504324461335024e-07, + "loss": 0.74066836, + "num_input_tokens_seen": 146259245, + "step": 6773, + "time_per_iteration": 2.523486614227295 + }, + { + "auxiliary_loss_clip": 0.01102944, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.03952622, + "balance_loss_mlp": 1.02042294, + "epoch": 0.8145253411892022, + "flos": 23038239617280.0, + "grad_norm": 3.9196731824310667, + "language_loss": 0.88223982, + "learning_rate": 3.499921031103732e-07, + "loss": 0.90355861, + "num_input_tokens_seen": 146280015, + "step": 6774, + "time_per_iteration": 2.583867073059082 + }, + { + "auxiliary_loss_clip": 0.01136598, + "auxiliary_loss_mlp": 0.01023094, + "balance_loss_clip": 1.04307222, + "balance_loss_mlp": 1.01553035, + "epoch": 0.8146455840798413, + "flos": 24827668387200.0, + "grad_norm": 1.962102230524152, + "language_loss": 0.78617227, + "learning_rate": 3.4955201039212987e-07, + "loss": 0.80776912, + "num_input_tokens_seen": 146300935, + "step": 6775, + "time_per_iteration": 2.5713372230529785 + }, + { + "auxiliary_loss_clip": 0.01162984, + "auxiliary_loss_mlp": 0.01024522, + "balance_loss_clip": 1.04896069, + "balance_loss_mlp": 1.01735139, + "epoch": 0.8147658269704804, + "flos": 19974520978560.0, + "grad_norm": 2.2404831240763237, + "language_loss": 0.65640616, + "learning_rate": 3.4911216804553465e-07, + "loss": 0.67828125, + "num_input_tokens_seen": 146319835, + "step": 6776, + "time_per_iteration": 2.45393443107605 + }, + { + "auxiliary_loss_clip": 0.01142132, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.045959, + "balance_loss_mlp": 1.01858592, + "epoch": 0.8148860698611194, + "flos": 21178031097600.0, + "grad_norm": 2.3851985221533285, + "language_loss": 0.70282435, + "learning_rate": 3.4867257613731017e-07, + "loss": 0.72451514, + "num_input_tokens_seen": 146339030, + "step": 6777, + "time_per_iteration": 2.5334768295288086 + }, + { + "auxiliary_loss_clip": 0.0114289, + "auxiliary_loss_mlp": 0.01025794, + "balance_loss_clip": 1.04533815, + "balance_loss_mlp": 1.01843023, + "epoch": 0.8150063127517585, + "flos": 19606903234560.0, + "grad_norm": 1.787288789965131, + "language_loss": 0.85883218, + "learning_rate": 3.4823323473414343e-07, + "loss": 0.88051903, + "num_input_tokens_seen": 146358550, + "step": 6778, + "time_per_iteration": 2.4793472290039062 + }, + { + "auxiliary_loss_clip": 0.01134416, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.04428566, + "balance_loss_mlp": 1.01984525, + "epoch": 0.8151265556423977, + "flos": 22638374438400.0, + "grad_norm": 2.5781215168650244, + "language_loss": 0.75817251, + "learning_rate": 3.477941439026812e-07, + "loss": 0.77980018, + "num_input_tokens_seen": 146376770, + "step": 6779, + "time_per_iteration": 2.520029306411743 + }, + { + "auxiliary_loss_clip": 0.01143447, + "auxiliary_loss_mlp": 0.01023932, + "balance_loss_clip": 1.04696572, + "balance_loss_mlp": 1.01691914, + "epoch": 0.8152467985330367, + "flos": 17968048277760.0, + "grad_norm": 1.867361144462091, + "language_loss": 0.73099577, + "learning_rate": 3.473553037095349e-07, + "loss": 0.75266957, + "num_input_tokens_seen": 146395795, + "step": 6780, + "time_per_iteration": 2.4621803760528564 + }, + { + "auxiliary_loss_clip": 0.01133892, + "auxiliary_loss_mlp": 0.01026659, + "balance_loss_clip": 1.04380977, + "balance_loss_mlp": 1.01947713, + "epoch": 0.8153670414236758, + "flos": 24969012405120.0, + "grad_norm": 1.7247661622140396, + "language_loss": 0.82854724, + "learning_rate": 3.469167142212743e-07, + "loss": 0.85015273, + "num_input_tokens_seen": 146417640, + "step": 6781, + "time_per_iteration": 2.5313186645507812 + }, + { + "auxiliary_loss_clip": 0.01156211, + "auxiliary_loss_mlp": 0.01022262, + "balance_loss_clip": 1.04702187, + "balance_loss_mlp": 1.0143106, + "epoch": 0.8154872843143149, + "flos": 31066069754880.0, + "grad_norm": 2.185590603163934, + "language_loss": 0.63617867, + "learning_rate": 3.4647837550443337e-07, + "loss": 0.6579634, + "num_input_tokens_seen": 146436205, + "step": 6782, + "time_per_iteration": 2.527846336364746 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01026284, + "balance_loss_clip": 1.04486966, + "balance_loss_mlp": 1.01916122, + "epoch": 0.815607527204954, + "flos": 19391654983680.0, + "grad_norm": 2.3884433413052735, + "language_loss": 0.74467981, + "learning_rate": 3.460402876255086e-07, + "loss": 0.76623887, + "num_input_tokens_seen": 146453595, + "step": 6783, + "time_per_iteration": 2.5120038986206055 + }, + { + "auxiliary_loss_clip": 0.01157665, + "auxiliary_loss_mlp": 0.01024157, + "balance_loss_clip": 1.04607224, + "balance_loss_mlp": 1.01662302, + "epoch": 0.815727770095593, + "flos": 26140418743680.0, + "grad_norm": 2.576811400357648, + "language_loss": 0.71536791, + "learning_rate": 3.456024506509574e-07, + "loss": 0.73718619, + "num_input_tokens_seen": 146474515, + "step": 6784, + "time_per_iteration": 2.4845926761627197 + }, + { + "auxiliary_loss_clip": 0.01156554, + "auxiliary_loss_mlp": 0.00762702, + "balance_loss_clip": 1.04934084, + "balance_loss_mlp": 1.00063562, + "epoch": 0.8158480129862322, + "flos": 25337527989120.0, + "grad_norm": 2.404047232539667, + "language_loss": 0.74173307, + "learning_rate": 3.4516486464719873e-07, + "loss": 0.76092565, + "num_input_tokens_seen": 146493905, + "step": 6785, + "time_per_iteration": 2.505258798599243 + }, + { + "auxiliary_loss_clip": 0.01107814, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.04105341, + "balance_loss_mlp": 1.02021337, + "epoch": 0.8159682558768713, + "flos": 34423645559040.0, + "grad_norm": 1.8035726101135916, + "language_loss": 0.61793959, + "learning_rate": 3.4472752968061445e-07, + "loss": 0.63929605, + "num_input_tokens_seen": 146518335, + "step": 6786, + "time_per_iteration": 3.4725584983825684 + }, + { + "auxiliary_loss_clip": 0.01153621, + "auxiliary_loss_mlp": 0.01025111, + "balance_loss_clip": 1.04488313, + "balance_loss_mlp": 1.0177114, + "epoch": 0.8160884987675103, + "flos": 18653223185280.0, + "grad_norm": 2.367722067050127, + "language_loss": 0.73845792, + "learning_rate": 3.442904458175475e-07, + "loss": 0.76024526, + "num_input_tokens_seen": 146535655, + "step": 6787, + "time_per_iteration": 3.250304698944092 + }, + { + "auxiliary_loss_clip": 0.01152483, + "auxiliary_loss_mlp": 0.01021976, + "balance_loss_clip": 1.04430032, + "balance_loss_mlp": 1.01454973, + "epoch": 0.8162087416581495, + "flos": 31430527102080.0, + "grad_norm": 1.480181623723431, + "language_loss": 0.75944126, + "learning_rate": 3.438536131243044e-07, + "loss": 0.78118587, + "num_input_tokens_seen": 146556815, + "step": 6788, + "time_per_iteration": 2.539072275161743 + }, + { + "auxiliary_loss_clip": 0.01145337, + "auxiliary_loss_mlp": 0.01023121, + "balance_loss_clip": 1.04566216, + "balance_loss_mlp": 1.01500845, + "epoch": 0.8163289845487885, + "flos": 37593910915200.0, + "grad_norm": 2.228258598650119, + "language_loss": 0.61797273, + "learning_rate": 3.434170316671503e-07, + "loss": 0.63965726, + "num_input_tokens_seen": 146581845, + "step": 6789, + "time_per_iteration": 3.3489108085632324 + }, + { + "auxiliary_loss_clip": 0.01121222, + "auxiliary_loss_mlp": 0.01023422, + "balance_loss_clip": 1.04569435, + "balance_loss_mlp": 1.01619768, + "epoch": 0.8164492274394276, + "flos": 13953989554560.0, + "grad_norm": 2.3713503852397797, + "language_loss": 0.89933366, + "learning_rate": 3.4298070151231583e-07, + "loss": 0.92078006, + "num_input_tokens_seen": 146597245, + "step": 6790, + "time_per_iteration": 2.4658877849578857 + }, + { + "auxiliary_loss_clip": 0.01145671, + "auxiliary_loss_mlp": 0.01024416, + "balance_loss_clip": 1.04510617, + "balance_loss_mlp": 1.01692939, + "epoch": 0.8165694703300668, + "flos": 28986554747520.0, + "grad_norm": 1.8937181847607278, + "language_loss": 0.60248351, + "learning_rate": 3.425446227259916e-07, + "loss": 0.62418431, + "num_input_tokens_seen": 146618210, + "step": 6791, + "time_per_iteration": 3.2829861640930176 + }, + { + "auxiliary_loss_clip": 0.01142231, + "auxiliary_loss_mlp": 0.01024042, + "balance_loss_clip": 1.04498112, + "balance_loss_mlp": 1.01750374, + "epoch": 0.8166897132207058, + "flos": 25118365155840.0, + "grad_norm": 1.9107805535787457, + "language_loss": 0.82264161, + "learning_rate": 3.421087953743296e-07, + "loss": 0.84430432, + "num_input_tokens_seen": 146637975, + "step": 6792, + "time_per_iteration": 2.518421173095703 + }, + { + "auxiliary_loss_clip": 0.01155187, + "auxiliary_loss_mlp": 0.01025354, + "balance_loss_clip": 1.04480755, + "balance_loss_mlp": 1.01748061, + "epoch": 0.8168099561113449, + "flos": 23148593176320.0, + "grad_norm": 4.8441107510244015, + "language_loss": 0.80046058, + "learning_rate": 3.416732195234464e-07, + "loss": 0.82226598, + "num_input_tokens_seen": 146658030, + "step": 6793, + "time_per_iteration": 2.4591002464294434 + }, + { + "auxiliary_loss_clip": 0.01157899, + "auxiliary_loss_mlp": 0.01021519, + "balance_loss_clip": 1.04605007, + "balance_loss_mlp": 1.01473951, + "epoch": 0.816930199001984, + "flos": 18407666833920.0, + "grad_norm": 1.4810000088002788, + "language_loss": 0.79474521, + "learning_rate": 3.4123789523941613e-07, + "loss": 0.81653941, + "num_input_tokens_seen": 146677855, + "step": 6794, + "time_per_iteration": 2.4487626552581787 + }, + { + "auxiliary_loss_clip": 0.01149053, + "auxiliary_loss_mlp": 0.01023118, + "balance_loss_clip": 1.04382503, + "balance_loss_mlp": 1.01533949, + "epoch": 0.8170504418926231, + "flos": 21251324799360.0, + "grad_norm": 2.2789964878894198, + "language_loss": 0.63439906, + "learning_rate": 3.4080282258827884e-07, + "loss": 0.65612078, + "num_input_tokens_seen": 146696230, + "step": 6795, + "time_per_iteration": 2.475374937057495 + }, + { + "auxiliary_loss_clip": 0.01155841, + "auxiliary_loss_mlp": 0.01023825, + "balance_loss_clip": 1.04531884, + "balance_loss_mlp": 1.01647568, + "epoch": 0.8171706847832622, + "flos": 19099234362240.0, + "grad_norm": 2.3043966574496415, + "language_loss": 0.72691917, + "learning_rate": 3.403680016360342e-07, + "loss": 0.74871582, + "num_input_tokens_seen": 146714835, + "step": 6796, + "time_per_iteration": 2.4403650760650635 + }, + { + "auxiliary_loss_clip": 0.01149274, + "auxiliary_loss_mlp": 0.01029611, + "balance_loss_clip": 1.0471096, + "balance_loss_mlp": 1.02134943, + "epoch": 0.8172909276739013, + "flos": 21470128496640.0, + "grad_norm": 1.5152312151479823, + "language_loss": 0.67604345, + "learning_rate": 3.3993343244864403e-07, + "loss": 0.69783223, + "num_input_tokens_seen": 146734425, + "step": 6797, + "time_per_iteration": 2.4528417587280273 + }, + { + "auxiliary_loss_clip": 0.01153267, + "auxiliary_loss_mlp": 0.01024526, + "balance_loss_clip": 1.04641342, + "balance_loss_mlp": 1.01719165, + "epoch": 0.8174111705645404, + "flos": 27599792417280.0, + "grad_norm": 2.0931783243763737, + "language_loss": 0.72701859, + "learning_rate": 3.394991150920323e-07, + "loss": 0.74879652, + "num_input_tokens_seen": 146757545, + "step": 6798, + "time_per_iteration": 2.5052883625030518 + }, + { + "auxiliary_loss_clip": 0.01116661, + "auxiliary_loss_mlp": 0.00763204, + "balance_loss_clip": 1.04334772, + "balance_loss_mlp": 1.00062692, + "epoch": 0.8175314134551794, + "flos": 14064594508800.0, + "grad_norm": 2.5642858656032637, + "language_loss": 0.74415404, + "learning_rate": 3.3906504963208396e-07, + "loss": 0.76295274, + "num_input_tokens_seen": 146774240, + "step": 6799, + "time_per_iteration": 2.524813413619995 + }, + { + "auxiliary_loss_clip": 0.01108143, + "auxiliary_loss_mlp": 0.01024665, + "balance_loss_clip": 1.04357147, + "balance_loss_mlp": 1.01710677, + "epoch": 0.8176516563458186, + "flos": 22708076780160.0, + "grad_norm": 1.9123074205618598, + "language_loss": 0.66544163, + "learning_rate": 3.3863123613464774e-07, + "loss": 0.68676966, + "num_input_tokens_seen": 146793140, + "step": 6800, + "time_per_iteration": 2.575735092163086 + }, + { + "auxiliary_loss_clip": 0.01140547, + "auxiliary_loss_mlp": 0.01025935, + "balance_loss_clip": 1.04015386, + "balance_loss_mlp": 1.01856804, + "epoch": 0.8177718992364577, + "flos": 21945406279680.0, + "grad_norm": 1.7139597049368656, + "language_loss": 0.75416553, + "learning_rate": 3.381976746655317e-07, + "loss": 0.77583039, + "num_input_tokens_seen": 146812895, + "step": 6801, + "time_per_iteration": 2.4958090782165527 + }, + { + "auxiliary_loss_clip": 0.01105084, + "auxiliary_loss_mlp": 0.01025204, + "balance_loss_clip": 1.04252148, + "balance_loss_mlp": 1.01803911, + "epoch": 0.8178921421270967, + "flos": 22017443005440.0, + "grad_norm": 2.0631466025693053, + "language_loss": 0.67109334, + "learning_rate": 3.3776436529050756e-07, + "loss": 0.69239616, + "num_input_tokens_seen": 146832445, + "step": 6802, + "time_per_iteration": 2.5386126041412354 + }, + { + "auxiliary_loss_clip": 0.01165014, + "auxiliary_loss_mlp": 0.0102653, + "balance_loss_clip": 1.04534614, + "balance_loss_mlp": 1.01892459, + "epoch": 0.8180123850177359, + "flos": 33183111496320.0, + "grad_norm": 1.6980134637098692, + "language_loss": 0.72378665, + "learning_rate": 3.373313080753073e-07, + "loss": 0.74570209, + "num_input_tokens_seen": 146856505, + "step": 6803, + "time_per_iteration": 2.5384466648101807 + }, + { + "auxiliary_loss_clip": 0.01148314, + "auxiliary_loss_mlp": 0.01025253, + "balance_loss_clip": 1.04335403, + "balance_loss_mlp": 1.01785028, + "epoch": 0.8181326279083749, + "flos": 22091167670400.0, + "grad_norm": 1.5706975041884594, + "language_loss": 0.77581704, + "learning_rate": 3.3689850308562527e-07, + "loss": 0.7975527, + "num_input_tokens_seen": 146876950, + "step": 6804, + "time_per_iteration": 2.4853057861328125 + }, + { + "auxiliary_loss_clip": 0.01102968, + "auxiliary_loss_mlp": 0.01026563, + "balance_loss_clip": 1.04255474, + "balance_loss_mlp": 1.01913893, + "epoch": 0.818252870799014, + "flos": 15705747936000.0, + "grad_norm": 1.8821338301379127, + "language_loss": 0.7753889, + "learning_rate": 3.364659503871183e-07, + "loss": 0.79668427, + "num_input_tokens_seen": 146894885, + "step": 6805, + "time_per_iteration": 2.5174450874328613 + }, + { + "auxiliary_loss_clip": 0.01124209, + "auxiliary_loss_mlp": 0.01023753, + "balance_loss_clip": 1.04195786, + "balance_loss_mlp": 1.01719618, + "epoch": 0.8183731136896532, + "flos": 18770687637120.0, + "grad_norm": 1.8869694364678842, + "language_loss": 0.8418715, + "learning_rate": 3.3603365004540417e-07, + "loss": 0.86335111, + "num_input_tokens_seen": 146913180, + "step": 6806, + "time_per_iteration": 2.5104849338531494 + }, + { + "auxiliary_loss_clip": 0.01168852, + "auxiliary_loss_mlp": 0.01025815, + "balance_loss_clip": 1.04931152, + "balance_loss_mlp": 1.01827788, + "epoch": 0.8184933565802922, + "flos": 26541792293760.0, + "grad_norm": 1.9520798221272015, + "language_loss": 0.77272904, + "learning_rate": 3.356016021260624e-07, + "loss": 0.79467571, + "num_input_tokens_seen": 146933510, + "step": 6807, + "time_per_iteration": 2.449103832244873 + }, + { + "auxiliary_loss_clip": 0.01156815, + "auxiliary_loss_mlp": 0.01025169, + "balance_loss_clip": 1.04741275, + "balance_loss_mlp": 1.01767397, + "epoch": 0.8186135994709313, + "flos": 17530117660800.0, + "grad_norm": 2.965480473362651, + "language_loss": 0.65525961, + "learning_rate": 3.35169806694634e-07, + "loss": 0.67707944, + "num_input_tokens_seen": 146951760, + "step": 6808, + "time_per_iteration": 2.4296209812164307 + }, + { + "auxiliary_loss_clip": 0.01031472, + "auxiliary_loss_mlp": 0.01005825, + "balance_loss_clip": 1.00924921, + "balance_loss_mlp": 1.00484717, + "epoch": 0.8187338423615703, + "flos": 63480300675840.0, + "grad_norm": 0.7251620708213754, + "language_loss": 0.60644639, + "learning_rate": 3.3473826381662186e-07, + "loss": 0.62681931, + "num_input_tokens_seen": 147022900, + "step": 6809, + "time_per_iteration": 3.206364631652832 + }, + { + "auxiliary_loss_clip": 0.01148648, + "auxiliary_loss_mlp": 0.01024419, + "balance_loss_clip": 1.04638529, + "balance_loss_mlp": 1.01701045, + "epoch": 0.8188540852522095, + "flos": 17529974006400.0, + "grad_norm": 1.863401785156458, + "language_loss": 0.81651384, + "learning_rate": 3.3430697355749216e-07, + "loss": 0.8382445, + "num_input_tokens_seen": 147040590, + "step": 6810, + "time_per_iteration": 2.434105634689331 + }, + { + "auxiliary_loss_clip": 0.01109695, + "auxiliary_loss_mlp": 0.01023827, + "balance_loss_clip": 1.04160464, + "balance_loss_mlp": 1.01576257, + "epoch": 0.8189743281428485, + "flos": 14392530702720.0, + "grad_norm": 1.7799072221914034, + "language_loss": 0.74778342, + "learning_rate": 3.3387593598266907e-07, + "loss": 0.76911867, + "num_input_tokens_seen": 147057200, + "step": 6811, + "time_per_iteration": 2.502495288848877 + }, + { + "auxiliary_loss_clip": 0.0111822, + "auxiliary_loss_mlp": 0.01021903, + "balance_loss_clip": 1.04121041, + "balance_loss_mlp": 1.01445842, + "epoch": 0.8190945710334876, + "flos": 25080479285760.0, + "grad_norm": 1.6499802279029865, + "language_loss": 0.78213227, + "learning_rate": 3.3344515115754225e-07, + "loss": 0.80353349, + "num_input_tokens_seen": 147076180, + "step": 6812, + "time_per_iteration": 2.588642120361328 + }, + { + "auxiliary_loss_clip": 0.01131681, + "auxiliary_loss_mlp": 0.01018214, + "balance_loss_clip": 1.04255736, + "balance_loss_mlp": 1.01102591, + "epoch": 0.8192148139241268, + "flos": 21507152440320.0, + "grad_norm": 2.7107718601820254, + "language_loss": 0.79787499, + "learning_rate": 3.33014619147461e-07, + "loss": 0.81937391, + "num_input_tokens_seen": 147094205, + "step": 6813, + "time_per_iteration": 4.076702117919922 + }, + { + "auxiliary_loss_clip": 0.01142499, + "auxiliary_loss_mlp": 0.01025944, + "balance_loss_clip": 1.04815531, + "balance_loss_mlp": 1.01855564, + "epoch": 0.8193350568147658, + "flos": 23952166289280.0, + "grad_norm": 1.7426282797233779, + "language_loss": 0.72061884, + "learning_rate": 3.325843400177362e-07, + "loss": 0.74230325, + "num_input_tokens_seen": 147115545, + "step": 6814, + "time_per_iteration": 2.5238120555877686 + }, + { + "auxiliary_loss_clip": 0.01158515, + "auxiliary_loss_mlp": 0.00762418, + "balance_loss_clip": 1.0470469, + "balance_loss_mlp": 1.00058889, + "epoch": 0.8194552997054049, + "flos": 20559469962240.0, + "grad_norm": 2.0988060463214437, + "language_loss": 0.73555452, + "learning_rate": 3.32154313833642e-07, + "loss": 0.75476384, + "num_input_tokens_seen": 147135700, + "step": 6815, + "time_per_iteration": 3.1592397689819336 + }, + { + "auxiliary_loss_clip": 0.01170665, + "auxiliary_loss_mlp": 0.01025872, + "balance_loss_clip": 1.04773593, + "balance_loss_mlp": 1.01817131, + "epoch": 0.819575542596044, + "flos": 26031753123840.0, + "grad_norm": 4.389714904567554, + "language_loss": 0.59670234, + "learning_rate": 3.3172454066041164e-07, + "loss": 0.61866772, + "num_input_tokens_seen": 147155205, + "step": 6816, + "time_per_iteration": 2.46238374710083 + }, + { + "auxiliary_loss_clip": 0.01096851, + "auxiliary_loss_mlp": 0.00761842, + "balance_loss_clip": 1.04169989, + "balance_loss_mlp": 1.00062633, + "epoch": 0.8196957854866831, + "flos": 29096944220160.0, + "grad_norm": 2.112178056484924, + "language_loss": 0.76649344, + "learning_rate": 3.3129502056324234e-07, + "loss": 0.78508037, + "num_input_tokens_seen": 147176570, + "step": 6817, + "time_per_iteration": 2.6418185234069824 + }, + { + "auxiliary_loss_clip": 0.01006304, + "auxiliary_loss_mlp": 0.0100209, + "balance_loss_clip": 1.01262045, + "balance_loss_mlp": 1.00109482, + "epoch": 0.8198160283773221, + "flos": 69033631898880.0, + "grad_norm": 0.8247715715682679, + "language_loss": 0.59812909, + "learning_rate": 3.3086575360729165e-07, + "loss": 0.61821306, + "num_input_tokens_seen": 147234105, + "step": 6818, + "time_per_iteration": 3.8540420532226562 + }, + { + "auxiliary_loss_clip": 0.01139962, + "auxiliary_loss_mlp": 0.01026694, + "balance_loss_clip": 1.04660523, + "balance_loss_mlp": 1.0190407, + "epoch": 0.8199362712679613, + "flos": 16618058496000.0, + "grad_norm": 1.8130345501899767, + "language_loss": 0.71633136, + "learning_rate": 3.3043673985767906e-07, + "loss": 0.73799789, + "num_input_tokens_seen": 147253170, + "step": 6819, + "time_per_iteration": 2.9980309009552 + }, + { + "auxiliary_loss_clip": 0.01117107, + "auxiliary_loss_mlp": 0.01027721, + "balance_loss_clip": 1.03971827, + "balance_loss_mlp": 1.01978731, + "epoch": 0.8200565141586004, + "flos": 21757664868480.0, + "grad_norm": 2.005922855705265, + "language_loss": 0.77637756, + "learning_rate": 3.3000797937948564e-07, + "loss": 0.79782581, + "num_input_tokens_seen": 147271465, + "step": 6820, + "time_per_iteration": 2.5927982330322266 + }, + { + "auxiliary_loss_clip": 0.01033085, + "auxiliary_loss_mlp": 0.01002409, + "balance_loss_clip": 1.00672579, + "balance_loss_mlp": 1.00131786, + "epoch": 0.8201767570492394, + "flos": 69807112392960.0, + "grad_norm": 0.9380687936044008, + "language_loss": 0.65021276, + "learning_rate": 3.295794722377534e-07, + "loss": 0.67056775, + "num_input_tokens_seen": 147335070, + "step": 6821, + "time_per_iteration": 3.1470398902893066 + }, + { + "auxiliary_loss_clip": 0.01165504, + "auxiliary_loss_mlp": 0.01024258, + "balance_loss_clip": 1.04482818, + "balance_loss_mlp": 1.01749897, + "epoch": 0.8202969999398786, + "flos": 23111892455040.0, + "grad_norm": 1.6089950729661964, + "language_loss": 0.80086339, + "learning_rate": 3.291512184974876e-07, + "loss": 0.822761, + "num_input_tokens_seen": 147355460, + "step": 6822, + "time_per_iteration": 2.4604554176330566 + }, + { + "auxiliary_loss_clip": 0.01138076, + "auxiliary_loss_mlp": 0.01025348, + "balance_loss_clip": 1.041471, + "balance_loss_mlp": 1.01764083, + "epoch": 0.8204172428305176, + "flos": 28220616109440.0, + "grad_norm": 1.5835368576466318, + "language_loss": 0.6655941, + "learning_rate": 3.2872321822365346e-07, + "loss": 0.68722832, + "num_input_tokens_seen": 147375675, + "step": 6823, + "time_per_iteration": 2.5748238563537598 + }, + { + "auxiliary_loss_clip": 0.01155104, + "auxiliary_loss_mlp": 0.01023716, + "balance_loss_clip": 1.0477277, + "balance_loss_mlp": 1.01628375, + "epoch": 0.8205374857211567, + "flos": 20887011106560.0, + "grad_norm": 1.7599456548854364, + "language_loss": 0.73676133, + "learning_rate": 3.282954714811783e-07, + "loss": 0.75854957, + "num_input_tokens_seen": 147394580, + "step": 6824, + "time_per_iteration": 2.523361921310425 + }, + { + "auxiliary_loss_clip": 0.0112745, + "auxiliary_loss_mlp": 0.01026946, + "balance_loss_clip": 1.04071724, + "balance_loss_mlp": 1.01907241, + "epoch": 0.8206577286117959, + "flos": 13152140294400.0, + "grad_norm": 2.5946802738446486, + "language_loss": 0.71192539, + "learning_rate": 3.2786797833495093e-07, + "loss": 0.73346937, + "num_input_tokens_seen": 147409935, + "step": 6825, + "time_per_iteration": 2.4706766605377197 + }, + { + "auxiliary_loss_clip": 0.01166576, + "auxiliary_loss_mlp": 0.01026611, + "balance_loss_clip": 1.04719973, + "balance_loss_mlp": 1.01976252, + "epoch": 0.8207779715024349, + "flos": 25265634917760.0, + "grad_norm": 1.7087275345334987, + "language_loss": 0.7241891, + "learning_rate": 3.274407388498213e-07, + "loss": 0.74612093, + "num_input_tokens_seen": 147428065, + "step": 6826, + "time_per_iteration": 2.5425801277160645 + }, + { + "auxiliary_loss_clip": 0.01120109, + "auxiliary_loss_mlp": 0.01031308, + "balance_loss_clip": 1.04121268, + "balance_loss_mlp": 1.02398849, + "epoch": 0.820898214393074, + "flos": 19610243199360.0, + "grad_norm": 1.9265746391501348, + "language_loss": 0.74368399, + "learning_rate": 3.270137530906021e-07, + "loss": 0.76519817, + "num_input_tokens_seen": 147447300, + "step": 6827, + "time_per_iteration": 2.5191755294799805 + }, + { + "auxiliary_loss_clip": 0.0110313, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_clip": 1.04314947, + "balance_loss_mlp": 1.01723385, + "epoch": 0.8210184572837131, + "flos": 15596615439360.0, + "grad_norm": 1.762737153232967, + "language_loss": 0.83272839, + "learning_rate": 3.265870211220665e-07, + "loss": 0.85400307, + "num_input_tokens_seen": 147465135, + "step": 6828, + "time_per_iteration": 2.525632858276367 + }, + { + "auxiliary_loss_clip": 0.01119291, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.04110861, + "balance_loss_mlp": 1.02077496, + "epoch": 0.8211387001743522, + "flos": 20813932886400.0, + "grad_norm": 2.288473511613395, + "language_loss": 0.81626177, + "learning_rate": 3.2616054300894934e-07, + "loss": 0.83774036, + "num_input_tokens_seen": 147484585, + "step": 6829, + "time_per_iteration": 2.5416476726531982 + }, + { + "auxiliary_loss_clip": 0.01128814, + "auxiliary_loss_mlp": 0.0102401, + "balance_loss_clip": 1.04361272, + "balance_loss_mlp": 1.01669645, + "epoch": 0.8212589430649913, + "flos": 27704579368320.0, + "grad_norm": 2.2415067218559694, + "language_loss": 0.84687936, + "learning_rate": 3.2573431881594693e-07, + "loss": 0.86840761, + "num_input_tokens_seen": 147504130, + "step": 6830, + "time_per_iteration": 2.533083915710449 + }, + { + "auxiliary_loss_clip": 0.01097968, + "auxiliary_loss_mlp": 0.01022356, + "balance_loss_clip": 1.03953671, + "balance_loss_mlp": 1.01515305, + "epoch": 0.8213791859556304, + "flos": 22455625017600.0, + "grad_norm": 2.129429054250629, + "language_loss": 0.66027832, + "learning_rate": 3.2530834860771663e-07, + "loss": 0.6814816, + "num_input_tokens_seen": 147523510, + "step": 6831, + "time_per_iteration": 2.6036078929901123 + }, + { + "auxiliary_loss_clip": 0.01155796, + "auxiliary_loss_mlp": 0.01026388, + "balance_loss_clip": 1.04563797, + "balance_loss_mlp": 1.01832366, + "epoch": 0.8214994288462695, + "flos": 16654471908480.0, + "grad_norm": 2.021419890159625, + "language_loss": 0.74061739, + "learning_rate": 3.248826324488794e-07, + "loss": 0.76243925, + "num_input_tokens_seen": 147540805, + "step": 6832, + "time_per_iteration": 2.483013391494751 + }, + { + "auxiliary_loss_clip": 0.0116941, + "auxiliary_loss_mlp": 0.01026351, + "balance_loss_clip": 1.05064321, + "balance_loss_mlp": 1.01894832, + "epoch": 0.8216196717369085, + "flos": 25221787390080.0, + "grad_norm": 1.9672069064505895, + "language_loss": 0.87848073, + "learning_rate": 3.244571704040138e-07, + "loss": 0.90043831, + "num_input_tokens_seen": 147560965, + "step": 6833, + "time_per_iteration": 2.4735333919525146 + }, + { + "auxiliary_loss_clip": 0.01150749, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.04297411, + "balance_loss_mlp": 1.01929045, + "epoch": 0.8217399146275477, + "flos": 25371930240000.0, + "grad_norm": 2.257340691349059, + "language_loss": 0.73625726, + "learning_rate": 3.2403196253766374e-07, + "loss": 0.75804532, + "num_input_tokens_seen": 147580045, + "step": 6834, + "time_per_iteration": 2.4854605197906494 + }, + { + "auxiliary_loss_clip": 0.01150486, + "auxiliary_loss_mlp": 0.01024088, + "balance_loss_clip": 1.04503655, + "balance_loss_mlp": 1.01595759, + "epoch": 0.8218601575181868, + "flos": 25629625388160.0, + "grad_norm": 2.5608110443854306, + "language_loss": 0.79193139, + "learning_rate": 3.2360700891433254e-07, + "loss": 0.81367713, + "num_input_tokens_seen": 147599070, + "step": 6835, + "time_per_iteration": 2.500368356704712 + }, + { + "auxiliary_loss_clip": 0.01024361, + "auxiliary_loss_mlp": 0.01000207, + "balance_loss_clip": 1.01074839, + "balance_loss_mlp": 0.99916428, + "epoch": 0.8219804004088258, + "flos": 67660229427840.0, + "grad_norm": 0.7852225873965438, + "language_loss": 0.57301223, + "learning_rate": 3.231823095984847e-07, + "loss": 0.5932579, + "num_input_tokens_seen": 147653710, + "step": 6836, + "time_per_iteration": 3.090465545654297 + }, + { + "auxiliary_loss_clip": 0.01140918, + "auxiliary_loss_mlp": 0.010248, + "balance_loss_clip": 1.04542863, + "balance_loss_mlp": 1.01778412, + "epoch": 0.822100643299465, + "flos": 19464266327040.0, + "grad_norm": 1.9991475294384693, + "language_loss": 0.75906956, + "learning_rate": 3.2275786465454814e-07, + "loss": 0.78072673, + "num_input_tokens_seen": 147670360, + "step": 6837, + "time_per_iteration": 2.4873552322387695 + }, + { + "auxiliary_loss_clip": 0.01125386, + "auxiliary_loss_mlp": 0.01022889, + "balance_loss_clip": 1.04380965, + "balance_loss_mlp": 1.01576042, + "epoch": 0.822220886190104, + "flos": 24681368292480.0, + "grad_norm": 2.0827734295370437, + "language_loss": 0.75735295, + "learning_rate": 3.2233367414690917e-07, + "loss": 0.77883571, + "num_input_tokens_seen": 147692550, + "step": 6838, + "time_per_iteration": 2.5702996253967285 + }, + { + "auxiliary_loss_clip": 0.01121801, + "auxiliary_loss_mlp": 0.01023393, + "balance_loss_clip": 1.03987265, + "balance_loss_mlp": 1.01652622, + "epoch": 0.8223411290807431, + "flos": 27819062991360.0, + "grad_norm": 2.135482816495203, + "language_loss": 0.85229301, + "learning_rate": 3.219097381399183e-07, + "loss": 0.87374496, + "num_input_tokens_seen": 147709725, + "step": 6839, + "time_per_iteration": 2.5780396461486816 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.01024914, + "balance_loss_clip": 1.046211, + "balance_loss_mlp": 1.01818466, + "epoch": 0.8224613719713821, + "flos": 23218546913280.0, + "grad_norm": 2.453410879656336, + "language_loss": 0.80962348, + "learning_rate": 3.2148605669788584e-07, + "loss": 0.83135629, + "num_input_tokens_seen": 147729615, + "step": 6840, + "time_per_iteration": 4.384998321533203 + }, + { + "auxiliary_loss_clip": 0.01141197, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.04602575, + "balance_loss_mlp": 1.01936102, + "epoch": 0.8225816148620213, + "flos": 15706250726400.0, + "grad_norm": 2.4839639839529433, + "language_loss": 0.76872766, + "learning_rate": 3.2106262988508405e-07, + "loss": 0.7904076, + "num_input_tokens_seen": 147747665, + "step": 6841, + "time_per_iteration": 2.496425151824951 + }, + { + "auxiliary_loss_clip": 0.01143418, + "auxiliary_loss_mlp": 0.01021917, + "balance_loss_clip": 1.04517972, + "balance_loss_mlp": 1.01474333, + "epoch": 0.8227018577526604, + "flos": 18515111391360.0, + "grad_norm": 2.3914459684555927, + "language_loss": 0.74155873, + "learning_rate": 3.206394577657465e-07, + "loss": 0.76321208, + "num_input_tokens_seen": 147765445, + "step": 6842, + "time_per_iteration": 3.2013769149780273 + }, + { + "auxiliary_loss_clip": 0.01158931, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.04759288, + "balance_loss_mlp": 1.02142048, + "epoch": 0.8228221006432994, + "flos": 22236785406720.0, + "grad_norm": 2.1226124059572555, + "language_loss": 0.72752184, + "learning_rate": 3.202165404040675e-07, + "loss": 0.74940431, + "num_input_tokens_seen": 147783365, + "step": 6843, + "time_per_iteration": 2.493269443511963 + }, + { + "auxiliary_loss_clip": 0.01097467, + "auxiliary_loss_mlp": 0.01027708, + "balance_loss_clip": 1.04045093, + "balance_loss_mlp": 1.01991153, + "epoch": 0.8229423435339386, + "flos": 24097532630400.0, + "grad_norm": 3.4405652900110066, + "language_loss": 0.74149132, + "learning_rate": 3.1979387786420396e-07, + "loss": 0.762743, + "num_input_tokens_seen": 147803605, + "step": 6844, + "time_per_iteration": 3.348378896713257 + }, + { + "auxiliary_loss_clip": 0.01140866, + "auxiliary_loss_mlp": 0.01021863, + "balance_loss_clip": 1.04184449, + "balance_loss_mlp": 1.01465046, + "epoch": 0.8230625864245776, + "flos": 23878549365120.0, + "grad_norm": 1.8524113011622831, + "language_loss": 0.82043439, + "learning_rate": 3.1937147021027346e-07, + "loss": 0.8420617, + "num_input_tokens_seen": 147822060, + "step": 6845, + "time_per_iteration": 2.5181055068969727 + }, + { + "auxiliary_loss_clip": 0.01153337, + "auxiliary_loss_mlp": 0.01021363, + "balance_loss_clip": 1.04567385, + "balance_loss_mlp": 1.0145148, + "epoch": 0.8231828293152167, + "flos": 16581106379520.0, + "grad_norm": 2.1572267212362193, + "language_loss": 0.76755524, + "learning_rate": 3.189493175063547e-07, + "loss": 0.78930223, + "num_input_tokens_seen": 147839295, + "step": 6846, + "time_per_iteration": 2.42720103263855 + }, + { + "auxiliary_loss_clip": 0.01140992, + "auxiliary_loss_mlp": 0.01024943, + "balance_loss_clip": 1.04507303, + "balance_loss_mlp": 1.01717615, + "epoch": 0.8233030722058559, + "flos": 18880071528960.0, + "grad_norm": 1.6305815924805847, + "language_loss": 0.67505383, + "learning_rate": 3.1852741981648776e-07, + "loss": 0.69671321, + "num_input_tokens_seen": 147857945, + "step": 6847, + "time_per_iteration": 2.5205559730529785 + }, + { + "auxiliary_loss_clip": 0.01113111, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.04241395, + "balance_loss_mlp": 1.02433419, + "epoch": 0.8234233150964949, + "flos": 28439024757120.0, + "grad_norm": 8.138355040879688, + "language_loss": 0.7027424, + "learning_rate": 3.1810577720467404e-07, + "loss": 0.72419226, + "num_input_tokens_seen": 147879675, + "step": 6848, + "time_per_iteration": 2.6182444095611572 + }, + { + "auxiliary_loss_clip": 0.01144329, + "auxiliary_loss_mlp": 0.01022197, + "balance_loss_clip": 1.04631329, + "balance_loss_mlp": 1.01443338, + "epoch": 0.823543557987134, + "flos": 33765941577600.0, + "grad_norm": 1.6328166069301608, + "language_loss": 0.56787437, + "learning_rate": 3.176843897348769e-07, + "loss": 0.58953965, + "num_input_tokens_seen": 147902870, + "step": 6849, + "time_per_iteration": 2.647188663482666 + }, + { + "auxiliary_loss_clip": 0.01136566, + "auxiliary_loss_mlp": 0.01023482, + "balance_loss_clip": 1.04395211, + "balance_loss_mlp": 1.0157156, + "epoch": 0.8236638008777731, + "flos": 17092366611840.0, + "grad_norm": 3.071920834552333, + "language_loss": 0.75756431, + "learning_rate": 3.1726325747102034e-07, + "loss": 0.77916479, + "num_input_tokens_seen": 147921245, + "step": 6850, + "time_per_iteration": 2.475806713104248 + }, + { + "auxiliary_loss_clip": 0.01102936, + "auxiliary_loss_mlp": 0.01024577, + "balance_loss_clip": 1.03640938, + "balance_loss_mlp": 1.0171082, + "epoch": 0.8237840437684122, + "flos": 61639982334720.0, + "grad_norm": 1.6778120390538036, + "language_loss": 0.64214969, + "learning_rate": 3.1684238047698974e-07, + "loss": 0.66342473, + "num_input_tokens_seen": 147949515, + "step": 6851, + "time_per_iteration": 2.936465263366699 + }, + { + "auxiliary_loss_clip": 0.01142462, + "auxiliary_loss_mlp": 0.01027308, + "balance_loss_clip": 1.04482818, + "balance_loss_mlp": 1.0194459, + "epoch": 0.8239042866590512, + "flos": 27309023821440.0, + "grad_norm": 2.305873132634242, + "language_loss": 0.5270943, + "learning_rate": 3.1642175881663155e-07, + "loss": 0.548792, + "num_input_tokens_seen": 147969245, + "step": 6852, + "time_per_iteration": 2.5487871170043945 + }, + { + "auxiliary_loss_clip": 0.01167728, + "auxiliary_loss_mlp": 0.01021678, + "balance_loss_clip": 1.04772758, + "balance_loss_mlp": 1.01487076, + "epoch": 0.8240245295496904, + "flos": 21726351187200.0, + "grad_norm": 2.391803169318608, + "language_loss": 0.84090292, + "learning_rate": 3.160013925537537e-07, + "loss": 0.8627969, + "num_input_tokens_seen": 147990080, + "step": 6853, + "time_per_iteration": 2.4877383708953857 + }, + { + "auxiliary_loss_clip": 0.01128863, + "auxiliary_loss_mlp": 0.01024618, + "balance_loss_clip": 1.04260886, + "balance_loss_mlp": 1.0170002, + "epoch": 0.8241447724403295, + "flos": 20009318279040.0, + "grad_norm": 2.1280756043072726, + "language_loss": 0.75588077, + "learning_rate": 3.155812817521266e-07, + "loss": 0.77741563, + "num_input_tokens_seen": 148010455, + "step": 6854, + "time_per_iteration": 2.5637617111206055 + }, + { + "auxiliary_loss_clip": 0.0114233, + "auxiliary_loss_mlp": 0.01024601, + "balance_loss_clip": 1.04612994, + "balance_loss_mlp": 1.01740348, + "epoch": 0.8242650153309685, + "flos": 22272983337600.0, + "grad_norm": 2.2123454948037193, + "language_loss": 0.77772772, + "learning_rate": 3.151614264754787e-07, + "loss": 0.79939705, + "num_input_tokens_seen": 148028400, + "step": 6855, + "time_per_iteration": 2.512528419494629 + }, + { + "auxiliary_loss_clip": 0.01167974, + "auxiliary_loss_mlp": 0.0102495, + "balance_loss_clip": 1.04587865, + "balance_loss_mlp": 1.01755321, + "epoch": 0.8243852582216077, + "flos": 22309971367680.0, + "grad_norm": 2.072142416039336, + "language_loss": 0.7950123, + "learning_rate": 3.147418267875035e-07, + "loss": 0.8169415, + "num_input_tokens_seen": 148046530, + "step": 6856, + "time_per_iteration": 2.5325679779052734 + }, + { + "auxiliary_loss_clip": 0.01096985, + "auxiliary_loss_mlp": 0.00762398, + "balance_loss_clip": 1.03964341, + "balance_loss_mlp": 1.00063157, + "epoch": 0.8245055011122467, + "flos": 24645421756800.0, + "grad_norm": 1.9849998614165507, + "language_loss": 0.65714216, + "learning_rate": 3.1432248275185315e-07, + "loss": 0.67573601, + "num_input_tokens_seen": 148067040, + "step": 6857, + "time_per_iteration": 2.6361546516418457 + }, + { + "auxiliary_loss_clip": 0.0115167, + "auxiliary_loss_mlp": 0.01022361, + "balance_loss_clip": 1.04646897, + "balance_loss_mlp": 1.01510084, + "epoch": 0.8246257440028858, + "flos": 17487275713920.0, + "grad_norm": 2.0453144518696096, + "language_loss": 0.76811182, + "learning_rate": 3.139033944321412e-07, + "loss": 0.78985214, + "num_input_tokens_seen": 148084400, + "step": 6858, + "time_per_iteration": 2.4555130004882812 + }, + { + "auxiliary_loss_clip": 0.01157442, + "auxiliary_loss_mlp": 0.01021258, + "balance_loss_clip": 1.04626656, + "balance_loss_mlp": 1.01420975, + "epoch": 0.824745986893525, + "flos": 25010130499200.0, + "grad_norm": 1.6774567321120026, + "language_loss": 0.78686225, + "learning_rate": 3.1348456189194507e-07, + "loss": 0.8086493, + "num_input_tokens_seen": 148104860, + "step": 6859, + "time_per_iteration": 2.49774432182312 + }, + { + "auxiliary_loss_clip": 0.01115107, + "auxiliary_loss_mlp": 0.01022693, + "balance_loss_clip": 1.03930998, + "balance_loss_mlp": 1.0150156, + "epoch": 0.824866229784164, + "flos": 18772698798720.0, + "grad_norm": 3.9064358805479977, + "language_loss": 0.82817447, + "learning_rate": 3.1306598519479876e-07, + "loss": 0.84955251, + "num_input_tokens_seen": 148124680, + "step": 6860, + "time_per_iteration": 2.5449752807617188 + }, + { + "auxiliary_loss_clip": 0.01136515, + "auxiliary_loss_mlp": 0.01022278, + "balance_loss_clip": 1.04394436, + "balance_loss_mlp": 1.01526833, + "epoch": 0.8249864726748031, + "flos": 23842171866240.0, + "grad_norm": 1.6316876723413205, + "language_loss": 0.78174353, + "learning_rate": 3.1264766440420177e-07, + "loss": 0.80333149, + "num_input_tokens_seen": 148147150, + "step": 6861, + "time_per_iteration": 2.571561336517334 + }, + { + "auxiliary_loss_clip": 0.01152174, + "auxiliary_loss_mlp": 0.01022883, + "balance_loss_clip": 1.04689085, + "balance_loss_mlp": 1.01549828, + "epoch": 0.8251067155654422, + "flos": 20303103617280.0, + "grad_norm": 2.068537068547157, + "language_loss": 0.69214523, + "learning_rate": 3.122295995836124e-07, + "loss": 0.7138958, + "num_input_tokens_seen": 148167020, + "step": 6862, + "time_per_iteration": 2.451986789703369 + }, + { + "auxiliary_loss_clip": 0.01158045, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.04413152, + "balance_loss_mlp": 1.01682401, + "epoch": 0.8252269584560813, + "flos": 25009699536000.0, + "grad_norm": 2.2684495807034346, + "language_loss": 0.77826118, + "learning_rate": 3.118117907964508e-07, + "loss": 0.80008751, + "num_input_tokens_seen": 148188965, + "step": 6863, + "time_per_iteration": 2.5081889629364014 + }, + { + "auxiliary_loss_clip": 0.01132656, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.04336286, + "balance_loss_mlp": 1.01685548, + "epoch": 0.8253472013467203, + "flos": 17128564542720.0, + "grad_norm": 1.9490921418309477, + "language_loss": 0.79989469, + "learning_rate": 3.1139423810609856e-07, + "loss": 0.82146168, + "num_input_tokens_seen": 148205660, + "step": 6864, + "time_per_iteration": 2.553708553314209 + }, + { + "auxiliary_loss_clip": 0.01165647, + "auxiliary_loss_mlp": 0.01026113, + "balance_loss_clip": 1.0437839, + "balance_loss_mlp": 1.0182091, + "epoch": 0.8254674442373595, + "flos": 22414794232320.0, + "grad_norm": 2.397503949219847, + "language_loss": 0.75359988, + "learning_rate": 3.1097694157589714e-07, + "loss": 0.77551746, + "num_input_tokens_seen": 148225545, + "step": 6865, + "time_per_iteration": 2.456087827682495 + }, + { + "auxiliary_loss_clip": 0.01152562, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.04761529, + "balance_loss_mlp": 1.02391148, + "epoch": 0.8255876871279986, + "flos": 24786765774720.0, + "grad_norm": 8.379831721596505, + "language_loss": 0.76336682, + "learning_rate": 3.105599012691511e-07, + "loss": 0.78520703, + "num_input_tokens_seen": 148243975, + "step": 6866, + "time_per_iteration": 3.2092344760894775 + }, + { + "auxiliary_loss_clip": 0.01151184, + "auxiliary_loss_mlp": 0.01024665, + "balance_loss_clip": 1.0453397, + "balance_loss_mlp": 1.01766694, + "epoch": 0.8257079300186376, + "flos": 27455431656960.0, + "grad_norm": 1.438327910592131, + "language_loss": 0.82103956, + "learning_rate": 3.101431172491249e-07, + "loss": 0.84279799, + "num_input_tokens_seen": 148265520, + "step": 6867, + "time_per_iteration": 3.2971341609954834 + }, + { + "auxiliary_loss_clip": 0.01129818, + "auxiliary_loss_mlp": 0.00762369, + "balance_loss_clip": 1.04167914, + "balance_loss_mlp": 1.00054967, + "epoch": 0.8258281729092768, + "flos": 16471866142080.0, + "grad_norm": 2.1813699827990956, + "language_loss": 0.72177386, + "learning_rate": 3.097265895790444e-07, + "loss": 0.74069571, + "num_input_tokens_seen": 148283730, + "step": 6868, + "time_per_iteration": 2.527533531188965 + }, + { + "auxiliary_loss_clip": 0.01128601, + "auxiliary_loss_mlp": 0.01020408, + "balance_loss_clip": 1.04369736, + "balance_loss_mlp": 1.01343989, + "epoch": 0.8259484157999158, + "flos": 21433822824960.0, + "grad_norm": 2.177891429193926, + "language_loss": 0.83132809, + "learning_rate": 3.093103183220962e-07, + "loss": 0.85281813, + "num_input_tokens_seen": 148303775, + "step": 6869, + "time_per_iteration": 3.286405324935913 + }, + { + "auxiliary_loss_clip": 0.01052652, + "auxiliary_loss_mlp": 0.01001618, + "balance_loss_clip": 1.00667822, + "balance_loss_mlp": 1.0006882, + "epoch": 0.8260686586905549, + "flos": 58322342453760.0, + "grad_norm": 0.8168887391360674, + "language_loss": 0.59413803, + "learning_rate": 3.0889430354142796e-07, + "loss": 0.61468071, + "num_input_tokens_seen": 148365285, + "step": 6870, + "time_per_iteration": 3.0292866230010986 + }, + { + "auxiliary_loss_clip": 0.0113088, + "auxiliary_loss_mlp": 0.01023881, + "balance_loss_clip": 1.0420866, + "balance_loss_mlp": 1.01626968, + "epoch": 0.826188901581194, + "flos": 27527288814720.0, + "grad_norm": 1.9186849607481415, + "language_loss": 0.70266563, + "learning_rate": 3.084785453001497e-07, + "loss": 0.72421324, + "num_input_tokens_seen": 148386200, + "step": 6871, + "time_per_iteration": 3.3318686485290527 + }, + { + "auxiliary_loss_clip": 0.01140431, + "auxiliary_loss_mlp": 0.00762493, + "balance_loss_clip": 1.04767227, + "balance_loss_mlp": 1.00064731, + "epoch": 0.8263091444718331, + "flos": 23696051339520.0, + "grad_norm": 2.1281817576543705, + "language_loss": 0.8181535, + "learning_rate": 3.080630436613314e-07, + "loss": 0.83718264, + "num_input_tokens_seen": 148403970, + "step": 6872, + "time_per_iteration": 2.559436559677124 + }, + { + "auxiliary_loss_clip": 0.01146234, + "auxiliary_loss_mlp": 0.01028331, + "balance_loss_clip": 1.04436541, + "balance_loss_mlp": 1.02092242, + "epoch": 0.8264293873624722, + "flos": 17165157523200.0, + "grad_norm": 2.21971303966148, + "language_loss": 0.85908443, + "learning_rate": 3.076477986880039e-07, + "loss": 0.88083011, + "num_input_tokens_seen": 148421765, + "step": 6873, + "time_per_iteration": 2.4544858932495117 + }, + { + "auxiliary_loss_clip": 0.01140478, + "auxiliary_loss_mlp": 0.01022863, + "balance_loss_clip": 1.04575562, + "balance_loss_mlp": 1.015553, + "epoch": 0.8265496302531112, + "flos": 24098645952000.0, + "grad_norm": 2.996151613228625, + "language_loss": 0.69011515, + "learning_rate": 3.0723281044315986e-07, + "loss": 0.71174854, + "num_input_tokens_seen": 148443720, + "step": 6874, + "time_per_iteration": 2.5110085010528564 + }, + { + "auxiliary_loss_clip": 0.01161958, + "auxiliary_loss_mlp": 0.0102514, + "balance_loss_clip": 1.04385209, + "balance_loss_mlp": 1.01825249, + "epoch": 0.8266698731437504, + "flos": 14099894599680.0, + "grad_norm": 2.1995631870941175, + "language_loss": 0.76498067, + "learning_rate": 3.068180789897521e-07, + "loss": 0.78685164, + "num_input_tokens_seen": 148462130, + "step": 6875, + "time_per_iteration": 2.4209389686584473 + }, + { + "auxiliary_loss_clip": 0.01157337, + "auxiliary_loss_mlp": 0.01023632, + "balance_loss_clip": 1.0448364, + "balance_loss_mlp": 1.01605666, + "epoch": 0.8267901160343895, + "flos": 30777563715840.0, + "grad_norm": 1.521036149425567, + "language_loss": 0.81793988, + "learning_rate": 3.064036043906966e-07, + "loss": 0.83974957, + "num_input_tokens_seen": 148485570, + "step": 6876, + "time_per_iteration": 2.555532455444336 + }, + { + "auxiliary_loss_clip": 0.01133398, + "auxiliary_loss_mlp": 0.01025885, + "balance_loss_clip": 1.04287839, + "balance_loss_mlp": 1.01780224, + "epoch": 0.8269103589250285, + "flos": 40624915242240.0, + "grad_norm": 2.1458081108298406, + "language_loss": 0.6778506, + "learning_rate": 3.059893867088668e-07, + "loss": 0.69944346, + "num_input_tokens_seen": 148509715, + "step": 6877, + "time_per_iteration": 2.7039690017700195 + }, + { + "auxiliary_loss_clip": 0.01149926, + "auxiliary_loss_mlp": 0.01025495, + "balance_loss_clip": 1.04479396, + "balance_loss_mlp": 1.01863992, + "epoch": 0.8270306018156677, + "flos": 30263645877120.0, + "grad_norm": 1.9649493342053095, + "language_loss": 0.6688984, + "learning_rate": 3.055754260071004e-07, + "loss": 0.69065261, + "num_input_tokens_seen": 148532010, + "step": 6878, + "time_per_iteration": 2.5357449054718018 + }, + { + "auxiliary_loss_clip": 0.01155159, + "auxiliary_loss_mlp": 0.01024709, + "balance_loss_clip": 1.04709983, + "balance_loss_mlp": 1.01799214, + "epoch": 0.8271508447063067, + "flos": 25226599812480.0, + "grad_norm": 2.0599631485446954, + "language_loss": 0.73714983, + "learning_rate": 3.051617223481948e-07, + "loss": 0.75894856, + "num_input_tokens_seen": 148553330, + "step": 6879, + "time_per_iteration": 2.5648770332336426 + }, + { + "auxiliary_loss_clip": 0.01138036, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.0442369, + "balance_loss_mlp": 1.02130556, + "epoch": 0.8272710875969458, + "flos": 17566602900480.0, + "grad_norm": 1.9428816915450509, + "language_loss": 0.75282902, + "learning_rate": 3.047482757949078e-07, + "loss": 0.77450168, + "num_input_tokens_seen": 148570960, + "step": 6880, + "time_per_iteration": 2.5075268745422363 + }, + { + "auxiliary_loss_clip": 0.01121484, + "auxiliary_loss_mlp": 0.00761583, + "balance_loss_clip": 1.0407784, + "balance_loss_mlp": 1.00064182, + "epoch": 0.827391330487585, + "flos": 19755465886080.0, + "grad_norm": 1.9848116227436554, + "language_loss": 0.85595477, + "learning_rate": 3.043350864099605e-07, + "loss": 0.87478536, + "num_input_tokens_seen": 148589520, + "step": 6881, + "time_per_iteration": 2.5157454013824463 + }, + { + "auxiliary_loss_clip": 0.01155351, + "auxiliary_loss_mlp": 0.01022992, + "balance_loss_clip": 1.04406857, + "balance_loss_mlp": 1.0156095, + "epoch": 0.827511573378224, + "flos": 16835174254080.0, + "grad_norm": 2.3986304205995133, + "language_loss": 0.80463576, + "learning_rate": 3.039221542560315e-07, + "loss": 0.82641923, + "num_input_tokens_seen": 148606085, + "step": 6882, + "time_per_iteration": 2.42742657661438 + }, + { + "auxiliary_loss_clip": 0.01154448, + "auxiliary_loss_mlp": 0.01022188, + "balance_loss_clip": 1.04782736, + "balance_loss_mlp": 1.01462746, + "epoch": 0.8276318162688631, + "flos": 18369242259840.0, + "grad_norm": 1.9370286095827705, + "language_loss": 0.7396698, + "learning_rate": 3.0350947939576356e-07, + "loss": 0.76143616, + "num_input_tokens_seen": 148625240, + "step": 6883, + "time_per_iteration": 2.434636116027832 + }, + { + "auxiliary_loss_clip": 0.01160869, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.04783869, + "balance_loss_mlp": 1.02076113, + "epoch": 0.8277520591595022, + "flos": 19352691705600.0, + "grad_norm": 1.7159291933641445, + "language_loss": 0.72183514, + "learning_rate": 3.0309706189175876e-07, + "loss": 0.74373364, + "num_input_tokens_seen": 148645075, + "step": 6884, + "time_per_iteration": 2.4515349864959717 + }, + { + "auxiliary_loss_clip": 0.01044661, + "auxiliary_loss_mlp": 0.01000487, + "balance_loss_clip": 1.00795984, + "balance_loss_mlp": 0.99940264, + "epoch": 0.8278723020501413, + "flos": 67918858329600.0, + "grad_norm": 0.761006475794876, + "language_loss": 0.57411993, + "learning_rate": 3.0268490180658045e-07, + "loss": 0.59457147, + "num_input_tokens_seen": 148707855, + "step": 6885, + "time_per_iteration": 3.0900514125823975 + }, + { + "auxiliary_loss_clip": 0.01172103, + "auxiliary_loss_mlp": 0.01022268, + "balance_loss_clip": 1.04976273, + "balance_loss_mlp": 1.01515079, + "epoch": 0.8279925449407803, + "flos": 18185738653440.0, + "grad_norm": 2.1531543269614013, + "language_loss": 0.79564524, + "learning_rate": 3.0227299920275305e-07, + "loss": 0.81758898, + "num_input_tokens_seen": 148724170, + "step": 6886, + "time_per_iteration": 2.394500494003296 + }, + { + "auxiliary_loss_clip": 0.01130727, + "auxiliary_loss_mlp": 0.01027184, + "balance_loss_clip": 1.04579854, + "balance_loss_mlp": 1.01897931, + "epoch": 0.8281127878314195, + "flos": 20631434860800.0, + "grad_norm": 1.9015192761423654, + "language_loss": 0.85827625, + "learning_rate": 3.018613541427613e-07, + "loss": 0.87985533, + "num_input_tokens_seen": 148743690, + "step": 6887, + "time_per_iteration": 2.603153944015503 + }, + { + "auxiliary_loss_clip": 0.01166074, + "auxiliary_loss_mlp": 0.01023254, + "balance_loss_clip": 1.04511118, + "balance_loss_mlp": 1.0160898, + "epoch": 0.8282330307220586, + "flos": 18004282122240.0, + "grad_norm": 1.7192766711166265, + "language_loss": 0.74003559, + "learning_rate": 3.0144996668905243e-07, + "loss": 0.7619288, + "num_input_tokens_seen": 148761070, + "step": 6888, + "time_per_iteration": 2.3966662883758545 + }, + { + "auxiliary_loss_clip": 0.01098933, + "auxiliary_loss_mlp": 0.00762263, + "balance_loss_clip": 1.03688002, + "balance_loss_mlp": 1.00059295, + "epoch": 0.8283532736126976, + "flos": 20084120352000.0, + "grad_norm": 1.9266126653626225, + "language_loss": 0.8172968, + "learning_rate": 3.010388369040331e-07, + "loss": 0.83590877, + "num_input_tokens_seen": 148779730, + "step": 6889, + "time_per_iteration": 2.6016435623168945 + }, + { + "auxiliary_loss_clip": 0.011563, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.0467627, + "balance_loss_mlp": 1.0180521, + "epoch": 0.8284735165033368, + "flos": 31868421805440.0, + "grad_norm": 1.4715205290974689, + "language_loss": 0.82840246, + "learning_rate": 3.0062796485007156e-07, + "loss": 0.85021919, + "num_input_tokens_seen": 148800670, + "step": 6890, + "time_per_iteration": 2.534233331680298 + }, + { + "auxiliary_loss_clip": 0.01168005, + "auxiliary_loss_mlp": 0.00762493, + "balance_loss_clip": 1.04658771, + "balance_loss_mlp": 1.00062299, + "epoch": 0.8285937593939758, + "flos": 26651319840000.0, + "grad_norm": 2.3814517086984477, + "language_loss": 0.65639448, + "learning_rate": 3.002173505894965e-07, + "loss": 0.67569947, + "num_input_tokens_seen": 148819820, + "step": 6891, + "time_per_iteration": 2.4526612758636475 + }, + { + "auxiliary_loss_clip": 0.01157969, + "auxiliary_loss_mlp": 0.01024494, + "balance_loss_clip": 1.04413867, + "balance_loss_mlp": 1.01624155, + "epoch": 0.8287140022846149, + "flos": 20193683811840.0, + "grad_norm": 2.838557899569873, + "language_loss": 0.62694651, + "learning_rate": 2.998069941845973e-07, + "loss": 0.64877117, + "num_input_tokens_seen": 148838890, + "step": 6892, + "time_per_iteration": 3.1946728229522705 + }, + { + "auxiliary_loss_clip": 0.01063125, + "auxiliary_loss_mlp": 0.01000924, + "balance_loss_clip": 1.00686574, + "balance_loss_mlp": 0.99986267, + "epoch": 0.8288342451752541, + "flos": 70755980019840.0, + "grad_norm": 0.7072879733066348, + "language_loss": 0.57427359, + "learning_rate": 2.993968956976258e-07, + "loss": 0.59491408, + "num_input_tokens_seen": 148906635, + "step": 6893, + "time_per_iteration": 3.12302565574646 + }, + { + "auxiliary_loss_clip": 0.01173561, + "auxiliary_loss_mlp": 0.01021659, + "balance_loss_clip": 1.04816425, + "balance_loss_mlp": 1.01325464, + "epoch": 0.8289544880658931, + "flos": 24572235795840.0, + "grad_norm": 2.4092361883180096, + "language_loss": 0.70158631, + "learning_rate": 2.9898705519079313e-07, + "loss": 0.72353852, + "num_input_tokens_seen": 148925740, + "step": 6894, + "time_per_iteration": 3.1992266178131104 + }, + { + "auxiliary_loss_clip": 0.01132856, + "auxiliary_loss_mlp": 0.01022863, + "balance_loss_clip": 1.04209208, + "balance_loss_mlp": 1.01543319, + "epoch": 0.8290747309565322, + "flos": 22273378387200.0, + "grad_norm": 2.1114566665487944, + "language_loss": 0.74828076, + "learning_rate": 2.985774727262715e-07, + "loss": 0.76983792, + "num_input_tokens_seen": 148944585, + "step": 6895, + "time_per_iteration": 2.484342575073242 + }, + { + "auxiliary_loss_clip": 0.01166494, + "auxiliary_loss_mlp": 0.01021391, + "balance_loss_clip": 1.04649603, + "balance_loss_mlp": 1.01450646, + "epoch": 0.8291949738471713, + "flos": 23255570856960.0, + "grad_norm": 2.2358774444661824, + "language_loss": 0.81641603, + "learning_rate": 2.981681483661949e-07, + "loss": 0.83829486, + "num_input_tokens_seen": 148964170, + "step": 6896, + "time_per_iteration": 3.212367534637451 + }, + { + "auxiliary_loss_clip": 0.01155904, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.04802918, + "balance_loss_mlp": 1.02244234, + "epoch": 0.8293152167378104, + "flos": 52555768185600.0, + "grad_norm": 1.5670942348970354, + "language_loss": 0.71138835, + "learning_rate": 2.9775908217265633e-07, + "loss": 0.73324192, + "num_input_tokens_seen": 148989405, + "step": 6897, + "time_per_iteration": 3.5577633380889893 + }, + { + "auxiliary_loss_clip": 0.01007748, + "auxiliary_loss_mlp": 0.01000325, + "balance_loss_clip": 1.00667632, + "balance_loss_mlp": 0.99916887, + "epoch": 0.8294354596284494, + "flos": 63356156294400.0, + "grad_norm": 0.8359829601904365, + "language_loss": 0.5036903, + "learning_rate": 2.9735027420771253e-07, + "loss": 0.52377105, + "num_input_tokens_seen": 149049740, + "step": 6898, + "time_per_iteration": 3.118032455444336 + }, + { + "auxiliary_loss_clip": 0.01137054, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.04912031, + "balance_loss_mlp": 1.02345705, + "epoch": 0.8295557025190886, + "flos": 24827021942400.0, + "grad_norm": 1.7521209436642817, + "language_loss": 0.71285594, + "learning_rate": 2.969417245333774e-07, + "loss": 0.73453057, + "num_input_tokens_seen": 149069120, + "step": 6899, + "time_per_iteration": 2.5524046421051025 + }, + { + "auxiliary_loss_clip": 0.01122949, + "auxiliary_loss_mlp": 0.01024067, + "balance_loss_clip": 1.04380655, + "balance_loss_mlp": 1.01721299, + "epoch": 0.8296759454097277, + "flos": 25118580637440.0, + "grad_norm": 2.281915054136508, + "language_loss": 0.78162783, + "learning_rate": 2.9653343321162915e-07, + "loss": 0.80309796, + "num_input_tokens_seen": 149088630, + "step": 6900, + "time_per_iteration": 2.5419974327087402 + }, + { + "auxiliary_loss_clip": 0.01126317, + "auxiliary_loss_mlp": 0.01021248, + "balance_loss_clip": 1.04508805, + "balance_loss_mlp": 1.01329684, + "epoch": 0.8297961883003667, + "flos": 24132581326080.0, + "grad_norm": 1.8827148271448404, + "language_loss": 0.6459105, + "learning_rate": 2.9612540030440446e-07, + "loss": 0.66738617, + "num_input_tokens_seen": 149109175, + "step": 6901, + "time_per_iteration": 2.5439395904541016 + }, + { + "auxiliary_loss_clip": 0.01042407, + "auxiliary_loss_mlp": 0.01001051, + "balance_loss_clip": 1.00667214, + "balance_loss_mlp": 0.99996608, + "epoch": 0.8299164311910058, + "flos": 67446561375360.0, + "grad_norm": 0.8637153666784296, + "language_loss": 0.64116156, + "learning_rate": 2.9571762587360206e-07, + "loss": 0.66159612, + "num_input_tokens_seen": 149165560, + "step": 6902, + "time_per_iteration": 3.0113656520843506 + }, + { + "auxiliary_loss_clip": 0.01106795, + "auxiliary_loss_mlp": 0.01024761, + "balance_loss_clip": 1.03546095, + "balance_loss_mlp": 1.01764393, + "epoch": 0.8300366740816449, + "flos": 25228682801280.0, + "grad_norm": 2.0580416532011516, + "language_loss": 0.73898017, + "learning_rate": 2.953101099810806e-07, + "loss": 0.76029575, + "num_input_tokens_seen": 149185165, + "step": 6903, + "time_per_iteration": 2.5931601524353027 + }, + { + "auxiliary_loss_clip": 0.01150379, + "auxiliary_loss_mlp": 0.01026929, + "balance_loss_clip": 1.04737234, + "balance_loss_mlp": 1.01964796, + "epoch": 0.830156916972284, + "flos": 18041018757120.0, + "grad_norm": 1.9778389025892593, + "language_loss": 0.82563925, + "learning_rate": 2.9490285268865965e-07, + "loss": 0.84741235, + "num_input_tokens_seen": 149202655, + "step": 6904, + "time_per_iteration": 2.436349630355835 + }, + { + "auxiliary_loss_clip": 0.01161341, + "auxiliary_loss_mlp": 0.01020974, + "balance_loss_clip": 1.04946506, + "balance_loss_mlp": 1.01375639, + "epoch": 0.830277159862923, + "flos": 26322485806080.0, + "grad_norm": 2.22299679757962, + "language_loss": 0.7957949, + "learning_rate": 2.9449585405812085e-07, + "loss": 0.81761807, + "num_input_tokens_seen": 149220035, + "step": 6905, + "time_per_iteration": 2.4924113750457764 + }, + { + "auxiliary_loss_clip": 0.01132141, + "auxiliary_loss_mlp": 0.01021746, + "balance_loss_clip": 1.04496288, + "balance_loss_mlp": 1.0143311, + "epoch": 0.8303974027535622, + "flos": 19938861751680.0, + "grad_norm": 1.864156272360866, + "language_loss": 0.74088466, + "learning_rate": 2.940891141512043e-07, + "loss": 0.76242352, + "num_input_tokens_seen": 149238055, + "step": 6906, + "time_per_iteration": 2.504330635070801 + }, + { + "auxiliary_loss_clip": 0.0114009, + "auxiliary_loss_mlp": 0.01028735, + "balance_loss_clip": 1.04447412, + "balance_loss_mlp": 1.02090943, + "epoch": 0.8305176456442013, + "flos": 17165552572800.0, + "grad_norm": 2.4441529476537114, + "language_loss": 0.7193982, + "learning_rate": 2.9368263302961385e-07, + "loss": 0.74108642, + "num_input_tokens_seen": 149256755, + "step": 6907, + "time_per_iteration": 2.461529493331909 + }, + { + "auxiliary_loss_clip": 0.01098312, + "auxiliary_loss_mlp": 0.01019485, + "balance_loss_clip": 1.03838885, + "balance_loss_mlp": 1.0121119, + "epoch": 0.8306378885348403, + "flos": 25627614226560.0, + "grad_norm": 1.8535706234130085, + "language_loss": 0.79873121, + "learning_rate": 2.9327641075501075e-07, + "loss": 0.81990916, + "num_input_tokens_seen": 149275745, + "step": 6908, + "time_per_iteration": 2.6077752113342285 + }, + { + "auxiliary_loss_clip": 0.01132825, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.040519, + "balance_loss_mlp": 1.02532148, + "epoch": 0.8307581314254795, + "flos": 33947864985600.0, + "grad_norm": 3.5794729345219394, + "language_loss": 0.66228974, + "learning_rate": 2.9287044738901866e-07, + "loss": 0.68395108, + "num_input_tokens_seen": 149293730, + "step": 6909, + "time_per_iteration": 2.6344223022460938 + }, + { + "auxiliary_loss_clip": 0.01155569, + "auxiliary_loss_mlp": 0.00761928, + "balance_loss_clip": 1.04620063, + "balance_loss_mlp": 1.00058508, + "epoch": 0.8308783743161186, + "flos": 17562724231680.0, + "grad_norm": 1.97838900658882, + "language_loss": 0.90729046, + "learning_rate": 2.9246474299322274e-07, + "loss": 0.92646539, + "num_input_tokens_seen": 149309290, + "step": 6910, + "time_per_iteration": 2.434816837310791 + }, + { + "auxiliary_loss_clip": 0.01027617, + "auxiliary_loss_mlp": 0.01003125, + "balance_loss_clip": 1.00587869, + "balance_loss_mlp": 1.00206399, + "epoch": 0.8309986172067576, + "flos": 69412885649280.0, + "grad_norm": 0.9001401891237406, + "language_loss": 0.63162136, + "learning_rate": 2.920592976291678e-07, + "loss": 0.65192878, + "num_input_tokens_seen": 149366620, + "step": 6911, + "time_per_iteration": 3.0356569290161133 + }, + { + "auxiliary_loss_clip": 0.01153781, + "auxiliary_loss_mlp": 0.01028538, + "balance_loss_clip": 1.04554367, + "balance_loss_mlp": 1.0210402, + "epoch": 0.8311188600973968, + "flos": 22309755886080.0, + "grad_norm": 2.0953941041219815, + "language_loss": 0.80394554, + "learning_rate": 2.916541113583595e-07, + "loss": 0.82576871, + "num_input_tokens_seen": 149385120, + "step": 6912, + "time_per_iteration": 2.480563163757324 + }, + { + "auxiliary_loss_clip": 0.01132062, + "auxiliary_loss_mlp": 0.01023966, + "balance_loss_clip": 1.04642487, + "balance_loss_mlp": 1.01611948, + "epoch": 0.8312391029880358, + "flos": 18770077105920.0, + "grad_norm": 2.192776849775292, + "language_loss": 0.66106826, + "learning_rate": 2.912491842422642e-07, + "loss": 0.68262851, + "num_input_tokens_seen": 149402825, + "step": 6913, + "time_per_iteration": 2.500399351119995 + }, + { + "auxiliary_loss_clip": 0.011556, + "auxiliary_loss_mlp": 0.01024358, + "balance_loss_clip": 1.04655766, + "balance_loss_mlp": 1.01686275, + "epoch": 0.8313593458786749, + "flos": 20376648714240.0, + "grad_norm": 2.2613292931900464, + "language_loss": 0.71195459, + "learning_rate": 2.9084451634230857e-07, + "loss": 0.73375422, + "num_input_tokens_seen": 149422125, + "step": 6914, + "time_per_iteration": 2.4619600772857666 + }, + { + "auxiliary_loss_clip": 0.01124609, + "auxiliary_loss_mlp": 0.01025608, + "balance_loss_clip": 1.04151106, + "balance_loss_mlp": 1.01790142, + "epoch": 0.831479588769314, + "flos": 32124069878400.0, + "grad_norm": 2.3758546383887835, + "language_loss": 0.71866322, + "learning_rate": 2.9044010771988125e-07, + "loss": 0.74016541, + "num_input_tokens_seen": 149441940, + "step": 6915, + "time_per_iteration": 2.630723237991333 + }, + { + "auxiliary_loss_clip": 0.01133734, + "auxiliary_loss_mlp": 0.01025002, + "balance_loss_clip": 1.04400349, + "balance_loss_mlp": 1.01792717, + "epoch": 0.8315998316599531, + "flos": 45185929338240.0, + "grad_norm": 1.7224193132342465, + "language_loss": 0.71894002, + "learning_rate": 2.900359584363303e-07, + "loss": 0.74052739, + "num_input_tokens_seen": 149465045, + "step": 6916, + "time_per_iteration": 2.757478713989258 + }, + { + "auxiliary_loss_clip": 0.01108159, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.04312372, + "balance_loss_mlp": 1.023561, + "epoch": 0.8317200745505922, + "flos": 18363747479040.0, + "grad_norm": 2.3459967243458224, + "language_loss": 0.8460502, + "learning_rate": 2.8963206855296494e-07, + "loss": 0.86745042, + "num_input_tokens_seen": 149481285, + "step": 6917, + "time_per_iteration": 2.560947895050049 + }, + { + "auxiliary_loss_clip": 0.01156246, + "auxiliary_loss_mlp": 0.01030202, + "balance_loss_clip": 1.04584169, + "balance_loss_mlp": 1.02241755, + "epoch": 0.8318403174412313, + "flos": 24206557386240.0, + "grad_norm": 4.63807049487764, + "language_loss": 0.76994276, + "learning_rate": 2.892284381310548e-07, + "loss": 0.79180717, + "num_input_tokens_seen": 149502700, + "step": 6918, + "time_per_iteration": 2.482891798019409 + }, + { + "auxiliary_loss_clip": 0.01136247, + "auxiliary_loss_mlp": 0.01025324, + "balance_loss_clip": 1.04281616, + "balance_loss_mlp": 1.01729214, + "epoch": 0.8319605603318704, + "flos": 22418780641920.0, + "grad_norm": 3.7248831661232256, + "language_loss": 0.7271651, + "learning_rate": 2.888250672318302e-07, + "loss": 0.74878073, + "num_input_tokens_seen": 149520100, + "step": 6919, + "time_per_iteration": 3.273258924484253 + }, + { + "auxiliary_loss_clip": 0.01173648, + "auxiliary_loss_mlp": 0.01029476, + "balance_loss_clip": 1.0512867, + "balance_loss_mlp": 1.02222824, + "epoch": 0.8320808032225094, + "flos": 37414501459200.0, + "grad_norm": 1.6660065905455879, + "language_loss": 0.68431592, + "learning_rate": 2.884219559164831e-07, + "loss": 0.70634711, + "num_input_tokens_seen": 149543245, + "step": 6920, + "time_per_iteration": 3.352215051651001 + }, + { + "auxiliary_loss_clip": 0.0115393, + "auxiliary_loss_mlp": 0.01024334, + "balance_loss_clip": 1.04633689, + "balance_loss_mlp": 1.01745272, + "epoch": 0.8322010461131486, + "flos": 12787395638400.0, + "grad_norm": 2.728712300509209, + "language_loss": 0.81462067, + "learning_rate": 2.880191042461635e-07, + "loss": 0.83640331, + "num_input_tokens_seen": 149559185, + "step": 6921, + "time_per_iteration": 2.457749366760254 + }, + { + "auxiliary_loss_clip": 0.01116369, + "auxiliary_loss_mlp": 0.01024343, + "balance_loss_clip": 1.04059315, + "balance_loss_mlp": 1.01739955, + "epoch": 0.8323212890037877, + "flos": 15815455050240.0, + "grad_norm": 1.7469412051186102, + "language_loss": 0.80032992, + "learning_rate": 2.876165122819849e-07, + "loss": 0.82173705, + "num_input_tokens_seen": 149577165, + "step": 6922, + "time_per_iteration": 3.30072283744812 + }, + { + "auxiliary_loss_clip": 0.01166739, + "auxiliary_loss_mlp": 0.01022577, + "balance_loss_clip": 1.04667318, + "balance_loss_mlp": 1.01572585, + "epoch": 0.8324415318944267, + "flos": 21719276208000.0, + "grad_norm": 1.744511430938835, + "language_loss": 0.79150164, + "learning_rate": 2.872141800850201e-07, + "loss": 0.81339484, + "num_input_tokens_seen": 149594340, + "step": 6923, + "time_per_iteration": 3.1985361576080322 + }, + { + "auxiliary_loss_clip": 0.01167216, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.04688358, + "balance_loss_mlp": 1.01782513, + "epoch": 0.8325617747850659, + "flos": 34198700636160.0, + "grad_norm": 1.8771538176340492, + "language_loss": 0.73552293, + "learning_rate": 2.868121077163024e-07, + "loss": 0.75744122, + "num_input_tokens_seen": 149613895, + "step": 6924, + "time_per_iteration": 2.519909620285034 + }, + { + "auxiliary_loss_clip": 0.01158233, + "auxiliary_loss_mlp": 0.01027065, + "balance_loss_clip": 1.04568696, + "balance_loss_mlp": 1.01981688, + "epoch": 0.8326820176757049, + "flos": 18369457741440.0, + "grad_norm": 1.8399019774407586, + "language_loss": 0.72211701, + "learning_rate": 2.864102952368257e-07, + "loss": 0.74396992, + "num_input_tokens_seen": 149631820, + "step": 6925, + "time_per_iteration": 2.433072328567505 + }, + { + "auxiliary_loss_clip": 0.01099527, + "auxiliary_loss_mlp": 0.01024036, + "balance_loss_clip": 1.03574336, + "balance_loss_mlp": 1.01620412, + "epoch": 0.832802260566344, + "flos": 35991325716480.0, + "grad_norm": 1.483870591704086, + "language_loss": 0.59236121, + "learning_rate": 2.860087427075444e-07, + "loss": 0.61359686, + "num_input_tokens_seen": 149656070, + "step": 6926, + "time_per_iteration": 2.6759090423583984 + }, + { + "auxiliary_loss_clip": 0.01134388, + "auxiliary_loss_mlp": 0.01027463, + "balance_loss_clip": 1.04379487, + "balance_loss_mlp": 1.02006614, + "epoch": 0.8329225034569832, + "flos": 14244434928000.0, + "grad_norm": 2.6527127967290576, + "language_loss": 0.85923243, + "learning_rate": 2.856074501893744e-07, + "loss": 0.88085097, + "num_input_tokens_seen": 149671270, + "step": 6927, + "time_per_iteration": 2.4399096965789795 + }, + { + "auxiliary_loss_clip": 0.01158933, + "auxiliary_loss_mlp": 0.01026026, + "balance_loss_clip": 1.04890871, + "balance_loss_mlp": 1.01907623, + "epoch": 0.8330427463476222, + "flos": 18077468083200.0, + "grad_norm": 1.7479415365188542, + "language_loss": 0.81797171, + "learning_rate": 2.8520641774319054e-07, + "loss": 0.83982122, + "num_input_tokens_seen": 149689360, + "step": 6928, + "time_per_iteration": 2.418778896331787 + }, + { + "auxiliary_loss_clip": 0.01140991, + "auxiliary_loss_mlp": 0.01025374, + "balance_loss_clip": 1.0406208, + "balance_loss_mlp": 1.01756549, + "epoch": 0.8331629892382613, + "flos": 18040839189120.0, + "grad_norm": 2.0174988833329675, + "language_loss": 0.75645924, + "learning_rate": 2.848056454298309e-07, + "loss": 0.7781229, + "num_input_tokens_seen": 149706685, + "step": 6929, + "time_per_iteration": 2.455765962600708 + }, + { + "auxiliary_loss_clip": 0.01139506, + "auxiliary_loss_mlp": 0.01022256, + "balance_loss_clip": 1.04602838, + "balance_loss_mlp": 1.01465583, + "epoch": 0.8332832321289004, + "flos": 17457398576640.0, + "grad_norm": 5.283792385257386, + "language_loss": 0.64988446, + "learning_rate": 2.844051333100905e-07, + "loss": 0.67150211, + "num_input_tokens_seen": 149724230, + "step": 6930, + "time_per_iteration": 2.443477153778076 + }, + { + "auxiliary_loss_clip": 0.01143176, + "auxiliary_loss_mlp": 0.01021969, + "balance_loss_clip": 1.04800892, + "balance_loss_mlp": 1.01551104, + "epoch": 0.8334034750195395, + "flos": 15084852416640.0, + "grad_norm": 1.9853100691780727, + "language_loss": 0.83911026, + "learning_rate": 2.840048814447269e-07, + "loss": 0.86076176, + "num_input_tokens_seen": 149742395, + "step": 6931, + "time_per_iteration": 2.4571101665496826 + }, + { + "auxiliary_loss_clip": 0.01134496, + "auxiliary_loss_mlp": 0.010234, + "balance_loss_clip": 1.04351461, + "balance_loss_mlp": 1.01643789, + "epoch": 0.8335237179101785, + "flos": 19427170556160.0, + "grad_norm": 2.496341361796886, + "language_loss": 0.74396229, + "learning_rate": 2.836048898944587e-07, + "loss": 0.76554126, + "num_input_tokens_seen": 149760820, + "step": 6932, + "time_per_iteration": 2.4659388065338135 + }, + { + "auxiliary_loss_clip": 0.01138297, + "auxiliary_loss_mlp": 0.0102484, + "balance_loss_clip": 1.04316044, + "balance_loss_mlp": 1.01797342, + "epoch": 0.8336439608008177, + "flos": 21762046327680.0, + "grad_norm": 2.8670041654403144, + "language_loss": 0.72388697, + "learning_rate": 2.832051587199642e-07, + "loss": 0.74551833, + "num_input_tokens_seen": 149778075, + "step": 6933, + "time_per_iteration": 2.477491855621338 + }, + { + "auxiliary_loss_clip": 0.01054282, + "auxiliary_loss_mlp": 0.01000844, + "balance_loss_clip": 1.00693035, + "balance_loss_mlp": 0.99981844, + "epoch": 0.8337642036914568, + "flos": 59702783990400.0, + "grad_norm": 0.8028503173708064, + "language_loss": 0.57702053, + "learning_rate": 2.828056879818821e-07, + "loss": 0.59757179, + "num_input_tokens_seen": 149837150, + "step": 6934, + "time_per_iteration": 2.9922473430633545 + }, + { + "auxiliary_loss_clip": 0.01124664, + "auxiliary_loss_mlp": 0.01022515, + "balance_loss_clip": 1.03994823, + "balance_loss_mlp": 1.01626587, + "epoch": 0.8338844465820958, + "flos": 27162185022720.0, + "grad_norm": 2.668016709240225, + "language_loss": 0.83548987, + "learning_rate": 2.824064777408117e-07, + "loss": 0.85696173, + "num_input_tokens_seen": 149856940, + "step": 6935, + "time_per_iteration": 2.578484058380127 + }, + { + "auxiliary_loss_clip": 0.0115292, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.04728484, + "balance_loss_mlp": 1.02020478, + "epoch": 0.8340046894727349, + "flos": 30481264425600.0, + "grad_norm": 2.097322635680896, + "language_loss": 0.76057589, + "learning_rate": 2.8200752805731263e-07, + "loss": 0.78237993, + "num_input_tokens_seen": 149879930, + "step": 6936, + "time_per_iteration": 2.511695384979248 + }, + { + "auxiliary_loss_clip": 0.01153743, + "auxiliary_loss_mlp": 0.01024091, + "balance_loss_clip": 1.04669118, + "balance_loss_mlp": 1.01674783, + "epoch": 0.834124932363374, + "flos": 27126166659840.0, + "grad_norm": 1.616758804866161, + "language_loss": 0.81169641, + "learning_rate": 2.8160883899190625e-07, + "loss": 0.83347476, + "num_input_tokens_seen": 149903200, + "step": 6937, + "time_per_iteration": 2.505402088165283 + }, + { + "auxiliary_loss_clip": 0.01117759, + "auxiliary_loss_mlp": 0.01026137, + "balance_loss_clip": 1.04483294, + "balance_loss_mlp": 1.01902878, + "epoch": 0.8342451752540131, + "flos": 24569865498240.0, + "grad_norm": 2.4976063719874997, + "language_loss": 0.72886574, + "learning_rate": 2.8121041060507234e-07, + "loss": 0.7503047, + "num_input_tokens_seen": 149922230, + "step": 6938, + "time_per_iteration": 2.536933422088623 + }, + { + "auxiliary_loss_clip": 0.01156881, + "auxiliary_loss_mlp": 0.01023652, + "balance_loss_clip": 1.04475105, + "balance_loss_mlp": 1.01622486, + "epoch": 0.8343654181446521, + "flos": 26615085995520.0, + "grad_norm": 2.362267238632959, + "language_loss": 0.7139625, + "learning_rate": 2.808122429572528e-07, + "loss": 0.73576784, + "num_input_tokens_seen": 149942435, + "step": 6939, + "time_per_iteration": 2.5024545192718506 + }, + { + "auxiliary_loss_clip": 0.01132663, + "auxiliary_loss_mlp": 0.01023961, + "balance_loss_clip": 1.04244578, + "balance_loss_mlp": 1.01681471, + "epoch": 0.8344856610352913, + "flos": 20777268078720.0, + "grad_norm": 2.929457629705969, + "language_loss": 0.76136941, + "learning_rate": 2.804143361088489e-07, + "loss": 0.78293568, + "num_input_tokens_seen": 149961615, + "step": 6940, + "time_per_iteration": 2.5023651123046875 + }, + { + "auxiliary_loss_clip": 0.01133397, + "auxiliary_loss_mlp": 0.01025045, + "balance_loss_clip": 1.04246056, + "balance_loss_mlp": 1.01707625, + "epoch": 0.8346059039259304, + "flos": 26095960684800.0, + "grad_norm": 2.213563728461599, + "language_loss": 0.78133631, + "learning_rate": 2.8001669012022277e-07, + "loss": 0.8029207, + "num_input_tokens_seen": 149979585, + "step": 6941, + "time_per_iteration": 2.511373281478882 + }, + { + "auxiliary_loss_clip": 0.01155625, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.04984033, + "balance_loss_mlp": 1.01884747, + "epoch": 0.8347261468165694, + "flos": 29027708755200.0, + "grad_norm": 1.9388275508455362, + "language_loss": 0.69087869, + "learning_rate": 2.7961930505169795e-07, + "loss": 0.71270287, + "num_input_tokens_seen": 150003830, + "step": 6942, + "time_per_iteration": 2.5546207427978516 + }, + { + "auxiliary_loss_clip": 0.01157705, + "auxiliary_loss_mlp": 0.00762293, + "balance_loss_clip": 1.04814863, + "balance_loss_mlp": 1.00057244, + "epoch": 0.8348463897072086, + "flos": 26396461866240.0, + "grad_norm": 2.300841701593095, + "language_loss": 0.76161587, + "learning_rate": 2.792221809635558e-07, + "loss": 0.78081584, + "num_input_tokens_seen": 150024460, + "step": 6943, + "time_per_iteration": 2.497102975845337 + }, + { + "auxiliary_loss_clip": 0.01086891, + "auxiliary_loss_mlp": 0.01023503, + "balance_loss_clip": 1.04095876, + "balance_loss_mlp": 1.015975, + "epoch": 0.8349666325978476, + "flos": 23367720096000.0, + "grad_norm": 1.7364165668150668, + "language_loss": 0.75295991, + "learning_rate": 2.788253179160411e-07, + "loss": 0.77406389, + "num_input_tokens_seen": 150045620, + "step": 6944, + "time_per_iteration": 2.6143996715545654 + }, + { + "auxiliary_loss_clip": 0.01140185, + "auxiliary_loss_mlp": 0.01026248, + "balance_loss_clip": 1.04499459, + "balance_loss_mlp": 1.01927149, + "epoch": 0.8350868754884867, + "flos": 12896528135040.0, + "grad_norm": 2.1841859745655614, + "language_loss": 0.6485846, + "learning_rate": 2.7842871596935725e-07, + "loss": 0.67024899, + "num_input_tokens_seen": 150064135, + "step": 6945, + "time_per_iteration": 2.4611427783966064 + }, + { + "auxiliary_loss_clip": 0.01157974, + "auxiliary_loss_mlp": 0.0101986, + "balance_loss_clip": 1.04569459, + "balance_loss_mlp": 1.01265943, + "epoch": 0.8352071183791259, + "flos": 26505522535680.0, + "grad_norm": 1.7154263909260363, + "language_loss": 0.69131207, + "learning_rate": 2.780323751836682e-07, + "loss": 0.71309042, + "num_input_tokens_seen": 150085350, + "step": 6946, + "time_per_iteration": 3.2908437252044678 + }, + { + "auxiliary_loss_clip": 0.01137884, + "auxiliary_loss_mlp": 0.0076172, + "balance_loss_clip": 1.04112983, + "balance_loss_mlp": 1.00065827, + "epoch": 0.8353273612697649, + "flos": 20668063754880.0, + "grad_norm": 1.7232838442371774, + "language_loss": 0.7849685, + "learning_rate": 2.7763629561909876e-07, + "loss": 0.80396456, + "num_input_tokens_seen": 150106180, + "step": 6947, + "time_per_iteration": 3.2757339477539062 + }, + { + "auxiliary_loss_clip": 0.01164961, + "auxiliary_loss_mlp": 0.01025537, + "balance_loss_clip": 1.04489732, + "balance_loss_mlp": 1.01792574, + "epoch": 0.835447604160404, + "flos": 19754137082880.0, + "grad_norm": 1.939473715695437, + "language_loss": 0.77235568, + "learning_rate": 2.772404773357335e-07, + "loss": 0.79426062, + "num_input_tokens_seen": 150125585, + "step": 6948, + "time_per_iteration": 2.404696226119995 + }, + { + "auxiliary_loss_clip": 0.01120983, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.04304743, + "balance_loss_mlp": 1.01958609, + "epoch": 0.8355678470510431, + "flos": 23435842239360.0, + "grad_norm": 2.623987656863197, + "language_loss": 0.78411567, + "learning_rate": 2.7684492039361853e-07, + "loss": 0.80559671, + "num_input_tokens_seen": 150144810, + "step": 6949, + "time_per_iteration": 3.279177188873291 + }, + { + "auxiliary_loss_clip": 0.0117013, + "auxiliary_loss_mlp": 0.01024716, + "balance_loss_clip": 1.04963517, + "balance_loss_mlp": 1.01762938, + "epoch": 0.8356880899416822, + "flos": 21214588164480.0, + "grad_norm": 2.301043794750192, + "language_loss": 0.83939511, + "learning_rate": 2.764496248527586e-07, + "loss": 0.8613435, + "num_input_tokens_seen": 150163785, + "step": 6950, + "time_per_iteration": 3.1740798950195312 + }, + { + "auxiliary_loss_clip": 0.0113568, + "auxiliary_loss_mlp": 0.01024004, + "balance_loss_clip": 1.04355454, + "balance_loss_mlp": 1.016711, + "epoch": 0.8358083328323213, + "flos": 28037543466240.0, + "grad_norm": 2.800579315448701, + "language_loss": 0.78251851, + "learning_rate": 2.760545907731211e-07, + "loss": 0.80411541, + "num_input_tokens_seen": 150184360, + "step": 6951, + "time_per_iteration": 2.55754017829895 + }, + { + "auxiliary_loss_clip": 0.01154319, + "auxiliary_loss_mlp": 0.01021213, + "balance_loss_clip": 1.04363585, + "balance_loss_mlp": 1.0136199, + "epoch": 0.8359285757229604, + "flos": 27783655159680.0, + "grad_norm": 3.5238737937213287, + "language_loss": 0.67733657, + "learning_rate": 2.75659818214631e-07, + "loss": 0.69909197, + "num_input_tokens_seen": 150205465, + "step": 6952, + "time_per_iteration": 2.509167432785034 + }, + { + "auxiliary_loss_clip": 0.01143203, + "auxiliary_loss_mlp": 0.01026086, + "balance_loss_clip": 1.04429901, + "balance_loss_mlp": 1.01870728, + "epoch": 0.8360488186135995, + "flos": 21435115714560.0, + "grad_norm": 2.0520642038761614, + "language_loss": 0.7823379, + "learning_rate": 2.752653072371749e-07, + "loss": 0.80403078, + "num_input_tokens_seen": 150224900, + "step": 6953, + "time_per_iteration": 2.5368189811706543 + }, + { + "auxiliary_loss_clip": 0.01121812, + "auxiliary_loss_mlp": 0.01026184, + "balance_loss_clip": 1.04394794, + "balance_loss_mlp": 1.01966298, + "epoch": 0.8361690615042385, + "flos": 27632327160960.0, + "grad_norm": 1.6902140066261, + "language_loss": 0.74587452, + "learning_rate": 2.7487105790060105e-07, + "loss": 0.76735443, + "num_input_tokens_seen": 150244310, + "step": 6954, + "time_per_iteration": 2.54534912109375 + }, + { + "auxiliary_loss_clip": 0.01156042, + "auxiliary_loss_mlp": 0.01024446, + "balance_loss_clip": 1.04578543, + "balance_loss_mlp": 1.01766574, + "epoch": 0.8362893043948777, + "flos": 39202529598720.0, + "grad_norm": 2.378674602512072, + "language_loss": 0.69462842, + "learning_rate": 2.7447707026471587e-07, + "loss": 0.71643329, + "num_input_tokens_seen": 150267285, + "step": 6955, + "time_per_iteration": 2.591282606124878 + }, + { + "auxiliary_loss_clip": 0.01126986, + "auxiliary_loss_mlp": 0.01023792, + "balance_loss_clip": 1.04245281, + "balance_loss_mlp": 1.01677048, + "epoch": 0.8364095472855168, + "flos": 24785329230720.0, + "grad_norm": 2.0710343475974375, + "language_loss": 0.79790086, + "learning_rate": 2.740833443892874e-07, + "loss": 0.81940866, + "num_input_tokens_seen": 150285455, + "step": 6956, + "time_per_iteration": 2.535041093826294 + }, + { + "auxiliary_loss_clip": 0.01139772, + "auxiliary_loss_mlp": 0.01021084, + "balance_loss_clip": 1.04449964, + "balance_loss_mlp": 1.01398468, + "epoch": 0.8365297901761558, + "flos": 22743412784640.0, + "grad_norm": 1.8311540433274505, + "language_loss": 0.79536384, + "learning_rate": 2.7368988033404327e-07, + "loss": 0.81697243, + "num_input_tokens_seen": 150302970, + "step": 6957, + "time_per_iteration": 2.480710506439209 + }, + { + "auxiliary_loss_clip": 0.01130098, + "auxiliary_loss_mlp": 0.01024802, + "balance_loss_clip": 1.04446387, + "balance_loss_mlp": 1.0180217, + "epoch": 0.836650033066795, + "flos": 28396003242240.0, + "grad_norm": 2.064486046132298, + "language_loss": 0.8465631, + "learning_rate": 2.732966781586712e-07, + "loss": 0.86811209, + "num_input_tokens_seen": 150322715, + "step": 6958, + "time_per_iteration": 2.568049430847168 + }, + { + "auxiliary_loss_clip": 0.01148074, + "auxiliary_loss_mlp": 0.01020449, + "balance_loss_clip": 1.0430491, + "balance_loss_mlp": 1.01353168, + "epoch": 0.836770275957434, + "flos": 22236857233920.0, + "grad_norm": 1.8236246792476394, + "language_loss": 0.66202778, + "learning_rate": 2.729037379228205e-07, + "loss": 0.68371302, + "num_input_tokens_seen": 150342900, + "step": 6959, + "time_per_iteration": 2.534950017929077 + }, + { + "auxiliary_loss_clip": 0.01139688, + "auxiliary_loss_mlp": 0.01026224, + "balance_loss_clip": 1.04723644, + "balance_loss_mlp": 1.01882756, + "epoch": 0.8368905188480731, + "flos": 22491930689280.0, + "grad_norm": 1.4420282700904177, + "language_loss": 0.80281317, + "learning_rate": 2.725110596860998e-07, + "loss": 0.82447231, + "num_input_tokens_seen": 150363580, + "step": 6960, + "time_per_iteration": 2.493654489517212 + }, + { + "auxiliary_loss_clip": 0.01107923, + "auxiliary_loss_mlp": 0.01023493, + "balance_loss_clip": 1.04213822, + "balance_loss_mlp": 1.01657891, + "epoch": 0.8370107617387123, + "flos": 13370405287680.0, + "grad_norm": 2.187056339235859, + "language_loss": 0.69697177, + "learning_rate": 2.7211864350807776e-07, + "loss": 0.71828592, + "num_input_tokens_seen": 150381780, + "step": 6961, + "time_per_iteration": 2.500687837600708 + }, + { + "auxiliary_loss_clip": 0.0116669, + "auxiliary_loss_mlp": 0.01028619, + "balance_loss_clip": 1.04607987, + "balance_loss_mlp": 1.02101982, + "epoch": 0.8371310046293513, + "flos": 25261289372160.0, + "grad_norm": 1.8209531152771317, + "language_loss": 0.73758471, + "learning_rate": 2.717264894482836e-07, + "loss": 0.75953782, + "num_input_tokens_seen": 150402120, + "step": 6962, + "time_per_iteration": 2.4511191844940186 + }, + { + "auxiliary_loss_clip": 0.01157166, + "auxiliary_loss_mlp": 0.01023397, + "balance_loss_clip": 1.04756784, + "balance_loss_mlp": 1.01545739, + "epoch": 0.8372512475199904, + "flos": 19792705311360.0, + "grad_norm": 1.8999421687406193, + "language_loss": 0.81147003, + "learning_rate": 2.7133459756620646e-07, + "loss": 0.83327568, + "num_input_tokens_seen": 150419315, + "step": 6963, + "time_per_iteration": 2.433993339538574 + }, + { + "auxiliary_loss_clip": 0.01149367, + "auxiliary_loss_mlp": 0.01030194, + "balance_loss_clip": 1.04529274, + "balance_loss_mlp": 1.02280021, + "epoch": 0.8373714904106295, + "flos": 19391224020480.0, + "grad_norm": 2.5520758534509445, + "language_loss": 0.73437512, + "learning_rate": 2.7094296792129733e-07, + "loss": 0.75617075, + "num_input_tokens_seen": 150438915, + "step": 6964, + "time_per_iteration": 2.4350101947784424 + }, + { + "auxiliary_loss_clip": 0.01152382, + "auxiliary_loss_mlp": 0.01021068, + "balance_loss_clip": 1.04470944, + "balance_loss_mlp": 1.01402545, + "epoch": 0.8374917333012686, + "flos": 14975935401600.0, + "grad_norm": 1.756873635631694, + "language_loss": 0.75473058, + "learning_rate": 2.7055160057296424e-07, + "loss": 0.77646506, + "num_input_tokens_seen": 150456155, + "step": 6965, + "time_per_iteration": 2.397631883621216 + }, + { + "auxiliary_loss_clip": 0.01124943, + "auxiliary_loss_mlp": 0.01023202, + "balance_loss_clip": 1.04264975, + "balance_loss_mlp": 1.01570678, + "epoch": 0.8376119761919076, + "flos": 30331839847680.0, + "grad_norm": 1.7500512627792528, + "language_loss": 0.72400534, + "learning_rate": 2.7016049558057896e-07, + "loss": 0.7454868, + "num_input_tokens_seen": 150478115, + "step": 6966, + "time_per_iteration": 2.5775721073150635 + }, + { + "auxiliary_loss_clip": 0.01154414, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.04757094, + "balance_loss_mlp": 1.02120292, + "epoch": 0.8377322190825467, + "flos": 29423336129280.0, + "grad_norm": 1.677970151939369, + "language_loss": 0.70685208, + "learning_rate": 2.6976965300347074e-07, + "loss": 0.72868824, + "num_input_tokens_seen": 150500725, + "step": 6967, + "time_per_iteration": 2.5005698204040527 + }, + { + "auxiliary_loss_clip": 0.01135112, + "auxiliary_loss_mlp": 0.01022126, + "balance_loss_clip": 1.04207242, + "balance_loss_mlp": 1.01437092, + "epoch": 0.8378524619731859, + "flos": 26687086807680.0, + "grad_norm": 2.592810548758192, + "language_loss": 0.6946764, + "learning_rate": 2.693790729009309e-07, + "loss": 0.71624875, + "num_input_tokens_seen": 150522335, + "step": 6968, + "time_per_iteration": 2.5111820697784424 + }, + { + "auxiliary_loss_clip": 0.01139822, + "auxiliary_loss_mlp": 0.01024055, + "balance_loss_clip": 1.04375196, + "balance_loss_mlp": 1.01677155, + "epoch": 0.8379727048638249, + "flos": 20703866636160.0, + "grad_norm": 1.8908984197862648, + "language_loss": 0.88369656, + "learning_rate": 2.6898875533220946e-07, + "loss": 0.90533531, + "num_input_tokens_seen": 150541640, + "step": 6969, + "time_per_iteration": 2.474102258682251 + }, + { + "auxiliary_loss_clip": 0.01165118, + "auxiliary_loss_mlp": 0.01024106, + "balance_loss_clip": 1.04832649, + "balance_loss_mlp": 1.01782358, + "epoch": 0.838092947754464, + "flos": 20084084438400.0, + "grad_norm": 1.7515720445794398, + "language_loss": 0.81794751, + "learning_rate": 2.685987003565171e-07, + "loss": 0.83983982, + "num_input_tokens_seen": 150559680, + "step": 6970, + "time_per_iteration": 2.4151904582977295 + }, + { + "auxiliary_loss_clip": 0.01120699, + "auxiliary_loss_mlp": 0.01027104, + "balance_loss_clip": 1.04654193, + "balance_loss_mlp": 1.01942062, + "epoch": 0.8382131906451031, + "flos": 18113270964480.0, + "grad_norm": 2.5973716541053524, + "language_loss": 0.75559193, + "learning_rate": 2.6820890803302566e-07, + "loss": 0.77706993, + "num_input_tokens_seen": 150575205, + "step": 6971, + "time_per_iteration": 2.523566246032715 + }, + { + "auxiliary_loss_clip": 0.01138126, + "auxiliary_loss_mlp": 0.01021967, + "balance_loss_clip": 1.04558229, + "balance_loss_mlp": 1.01474929, + "epoch": 0.8383334335357422, + "flos": 17092653920640.0, + "grad_norm": 2.7345302679577164, + "language_loss": 0.81869072, + "learning_rate": 2.6781937842086557e-07, + "loss": 0.84029162, + "num_input_tokens_seen": 150593995, + "step": 6972, + "time_per_iteration": 3.2770192623138428 + }, + { + "auxiliary_loss_clip": 0.01154017, + "auxiliary_loss_mlp": 0.01025381, + "balance_loss_clip": 1.04466569, + "balance_loss_mlp": 1.01808894, + "epoch": 0.8384536764263812, + "flos": 20704728562560.0, + "grad_norm": 1.9473354296799652, + "language_loss": 0.67495149, + "learning_rate": 2.6743011157912933e-07, + "loss": 0.69674551, + "num_input_tokens_seen": 150613715, + "step": 6973, + "time_per_iteration": 2.441216230392456 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01023716, + "balance_loss_clip": 1.03765738, + "balance_loss_mlp": 1.01585972, + "epoch": 0.8385739193170204, + "flos": 28986842056320.0, + "grad_norm": 1.74987833611273, + "language_loss": 0.65178001, + "learning_rate": 2.6704110756686725e-07, + "loss": 0.67311186, + "num_input_tokens_seen": 150634540, + "step": 6974, + "time_per_iteration": 3.277212619781494 + }, + { + "auxiliary_loss_clip": 0.01135311, + "auxiliary_loss_mlp": 0.00761935, + "balance_loss_clip": 1.04176617, + "balance_loss_mlp": 1.00054693, + "epoch": 0.8386941622076595, + "flos": 23438068882560.0, + "grad_norm": 1.8893060703372482, + "language_loss": 0.83894932, + "learning_rate": 2.6665236644309085e-07, + "loss": 0.85792178, + "num_input_tokens_seen": 150654850, + "step": 6975, + "time_per_iteration": 2.5240368843078613 + }, + { + "auxiliary_loss_clip": 0.01153288, + "auxiliary_loss_mlp": 0.01022454, + "balance_loss_clip": 1.04400384, + "balance_loss_mlp": 1.01565337, + "epoch": 0.8388144050982985, + "flos": 23002724044800.0, + "grad_norm": 1.8113085749618913, + "language_loss": 0.79385817, + "learning_rate": 2.662638882667727e-07, + "loss": 0.81561559, + "num_input_tokens_seen": 150673790, + "step": 6976, + "time_per_iteration": 3.9138596057891846 + }, + { + "auxiliary_loss_clip": 0.01170159, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.0472331, + "balance_loss_mlp": 1.01787519, + "epoch": 0.8389346479889377, + "flos": 24280353878400.0, + "grad_norm": 2.1017026088764443, + "language_loss": 0.731709, + "learning_rate": 2.658756730968443e-07, + "loss": 0.75366914, + "num_input_tokens_seen": 150692255, + "step": 6977, + "time_per_iteration": 2.435337543487549 + }, + { + "auxiliary_loss_clip": 0.01145054, + "auxiliary_loss_mlp": 0.01023883, + "balance_loss_clip": 1.04844856, + "balance_loss_mlp": 1.01659918, + "epoch": 0.8390548908795767, + "flos": 21215019127680.0, + "grad_norm": 2.8654352771177547, + "language_loss": 0.88438147, + "learning_rate": 2.654877209921975e-07, + "loss": 0.90607083, + "num_input_tokens_seen": 150709790, + "step": 6978, + "time_per_iteration": 2.4777920246124268 + }, + { + "auxiliary_loss_clip": 0.01117073, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.03962183, + "balance_loss_mlp": 1.02165806, + "epoch": 0.8391751337702158, + "flos": 35627299332480.0, + "grad_norm": 2.055729732370947, + "language_loss": 0.62979609, + "learning_rate": 2.651000320116843e-07, + "loss": 0.65127194, + "num_input_tokens_seen": 150730675, + "step": 6979, + "time_per_iteration": 2.6838929653167725 + }, + { + "auxiliary_loss_clip": 0.01122312, + "auxiliary_loss_mlp": 0.00762416, + "balance_loss_clip": 1.04212499, + "balance_loss_mlp": 1.00058198, + "epoch": 0.839295376660855, + "flos": 21325229032320.0, + "grad_norm": 1.9364783456600232, + "language_loss": 0.76311469, + "learning_rate": 2.647126062141163e-07, + "loss": 0.78196192, + "num_input_tokens_seen": 150749750, + "step": 6980, + "time_per_iteration": 2.5514771938323975 + }, + { + "auxiliary_loss_clip": 0.01142546, + "auxiliary_loss_mlp": 0.01023258, + "balance_loss_clip": 1.04210079, + "balance_loss_mlp": 1.01616228, + "epoch": 0.839415619551494, + "flos": 18442535961600.0, + "grad_norm": 1.7844720766909343, + "language_loss": 0.83785629, + "learning_rate": 2.643254436582669e-07, + "loss": 0.85951436, + "num_input_tokens_seen": 150769240, + "step": 6981, + "time_per_iteration": 2.4532687664031982 + }, + { + "auxiliary_loss_clip": 0.01114272, + "auxiliary_loss_mlp": 0.01023543, + "balance_loss_clip": 1.04279661, + "balance_loss_mlp": 1.01602042, + "epoch": 0.8395358624421331, + "flos": 23221958705280.0, + "grad_norm": 1.8829405636996113, + "language_loss": 0.82619548, + "learning_rate": 2.6393854440286743e-07, + "loss": 0.84757364, + "num_input_tokens_seen": 150788410, + "step": 6982, + "time_per_iteration": 2.568110227584839 + }, + { + "auxiliary_loss_clip": 0.01169014, + "auxiliary_loss_mlp": 0.01026294, + "balance_loss_clip": 1.04916048, + "balance_loss_mlp": 1.01901078, + "epoch": 0.8396561053327722, + "flos": 24381657210240.0, + "grad_norm": 2.285459049253663, + "language_loss": 0.70128256, + "learning_rate": 2.6355190850661045e-07, + "loss": 0.72323567, + "num_input_tokens_seen": 150805245, + "step": 6983, + "time_per_iteration": 2.4140801429748535 + }, + { + "auxiliary_loss_clip": 0.01140142, + "auxiliary_loss_mlp": 0.01024099, + "balance_loss_clip": 1.04544485, + "balance_loss_mlp": 1.01665425, + "epoch": 0.8397763482234113, + "flos": 22237755073920.0, + "grad_norm": 1.5922375750849036, + "language_loss": 0.86206651, + "learning_rate": 2.631655360281486e-07, + "loss": 0.88370895, + "num_input_tokens_seen": 150824920, + "step": 6984, + "time_per_iteration": 2.5169336795806885 + }, + { + "auxiliary_loss_clip": 0.01158974, + "auxiliary_loss_mlp": 0.00762381, + "balance_loss_clip": 1.04551792, + "balance_loss_mlp": 1.00058699, + "epoch": 0.8398965911140504, + "flos": 22163743100160.0, + "grad_norm": 2.2858183868899067, + "language_loss": 0.65579128, + "learning_rate": 2.6277942702609323e-07, + "loss": 0.67500478, + "num_input_tokens_seen": 150844400, + "step": 6985, + "time_per_iteration": 2.4487104415893555 + }, + { + "auxiliary_loss_clip": 0.01129074, + "auxiliary_loss_mlp": 0.01027786, + "balance_loss_clip": 1.04500711, + "balance_loss_mlp": 1.02021003, + "epoch": 0.8400168340046895, + "flos": 21542775753600.0, + "grad_norm": 1.965232393781806, + "language_loss": 0.87465, + "learning_rate": 2.623935815590186e-07, + "loss": 0.8962186, + "num_input_tokens_seen": 150862780, + "step": 6986, + "time_per_iteration": 2.5192952156066895 + }, + { + "auxiliary_loss_clip": 0.01142468, + "auxiliary_loss_mlp": 0.01025209, + "balance_loss_clip": 1.04835629, + "balance_loss_mlp": 1.01781821, + "epoch": 0.8401370768953286, + "flos": 22491966602880.0, + "grad_norm": 1.7743200842895288, + "language_loss": 0.80739331, + "learning_rate": 2.6200799968545516e-07, + "loss": 0.82907009, + "num_input_tokens_seen": 150883075, + "step": 6987, + "time_per_iteration": 2.4926273822784424 + }, + { + "auxiliary_loss_clip": 0.01039819, + "auxiliary_loss_mlp": 0.01008002, + "balance_loss_clip": 1.00934827, + "balance_loss_mlp": 1.00698841, + "epoch": 0.8402573197859676, + "flos": 59238890818560.0, + "grad_norm": 0.7834279507766049, + "language_loss": 0.56393236, + "learning_rate": 2.616226814638969e-07, + "loss": 0.58441055, + "num_input_tokens_seen": 150948180, + "step": 6988, + "time_per_iteration": 3.101137638092041 + }, + { + "auxiliary_loss_clip": 0.01141419, + "auxiliary_loss_mlp": 0.01024036, + "balance_loss_clip": 1.04626536, + "balance_loss_mlp": 1.01708579, + "epoch": 0.8403775626766068, + "flos": 22674608282880.0, + "grad_norm": 1.9214541435153556, + "language_loss": 0.77950728, + "learning_rate": 2.612376269527954e-07, + "loss": 0.80116189, + "num_input_tokens_seen": 150967885, + "step": 6989, + "time_per_iteration": 2.4877262115478516 + }, + { + "auxiliary_loss_clip": 0.01137038, + "auxiliary_loss_mlp": 0.01026324, + "balance_loss_clip": 1.04496634, + "balance_loss_mlp": 1.01920104, + "epoch": 0.8404978055672458, + "flos": 19609704495360.0, + "grad_norm": 1.8616374081272926, + "language_loss": 0.67906171, + "learning_rate": 2.608528362105635e-07, + "loss": 0.70069534, + "num_input_tokens_seen": 150987255, + "step": 6990, + "time_per_iteration": 2.465660333633423 + }, + { + "auxiliary_loss_clip": 0.01127179, + "auxiliary_loss_mlp": 0.01025122, + "balance_loss_clip": 1.04010761, + "balance_loss_mlp": 1.01779389, + "epoch": 0.8406180484578849, + "flos": 27526929678720.0, + "grad_norm": 2.161600153395586, + "language_loss": 0.73234314, + "learning_rate": 2.6046830929557374e-07, + "loss": 0.7538662, + "num_input_tokens_seen": 151006905, + "step": 6991, + "time_per_iteration": 2.554567337036133 + }, + { + "auxiliary_loss_clip": 0.0112371, + "auxiliary_loss_mlp": 0.010248, + "balance_loss_clip": 1.04457867, + "balance_loss_mlp": 1.01747775, + "epoch": 0.8407382913485241, + "flos": 22127473342080.0, + "grad_norm": 2.14749332437934, + "language_loss": 0.84931719, + "learning_rate": 2.6008404626615776e-07, + "loss": 0.87080234, + "num_input_tokens_seen": 151025405, + "step": 6992, + "time_per_iteration": 2.554913282394409 + }, + { + "auxiliary_loss_clip": 0.01160833, + "auxiliary_loss_mlp": 0.01025449, + "balance_loss_clip": 1.04898643, + "balance_loss_mlp": 1.01828408, + "epoch": 0.8408585342391631, + "flos": 13918473982080.0, + "grad_norm": 2.6322063235692723, + "language_loss": 0.73812586, + "learning_rate": 2.597000471806092e-07, + "loss": 0.75998867, + "num_input_tokens_seen": 151041970, + "step": 6993, + "time_per_iteration": 2.4317147731781006 + }, + { + "auxiliary_loss_clip": 0.01137974, + "auxiliary_loss_mlp": 0.01025337, + "balance_loss_clip": 1.04868627, + "balance_loss_mlp": 1.01664615, + "epoch": 0.8409787771298022, + "flos": 20187865808640.0, + "grad_norm": 2.0143166247713906, + "language_loss": 0.72936302, + "learning_rate": 2.593163120971793e-07, + "loss": 0.75099611, + "num_input_tokens_seen": 151060835, + "step": 6994, + "time_per_iteration": 2.4886975288391113 + }, + { + "auxiliary_loss_clip": 0.01101086, + "auxiliary_loss_mlp": 0.01026116, + "balance_loss_clip": 1.03689039, + "balance_loss_mlp": 1.01912487, + "epoch": 0.8410990200204413, + "flos": 23142523777920.0, + "grad_norm": 3.476974325255336, + "language_loss": 0.69032013, + "learning_rate": 2.5893284107408165e-07, + "loss": 0.71159214, + "num_input_tokens_seen": 151078205, + "step": 6995, + "time_per_iteration": 2.5573997497558594 + }, + { + "auxiliary_loss_clip": 0.01114345, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.04464769, + "balance_loss_mlp": 1.02147841, + "epoch": 0.8412192629110804, + "flos": 24027219757440.0, + "grad_norm": 2.434637885682437, + "language_loss": 0.78072345, + "learning_rate": 2.5854963416948726e-07, + "loss": 0.80216032, + "num_input_tokens_seen": 151100470, + "step": 6996, + "time_per_iteration": 2.640551805496216 + }, + { + "auxiliary_loss_clip": 0.01109402, + "auxiliary_loss_mlp": 0.01026457, + "balance_loss_clip": 1.03572166, + "balance_loss_mlp": 1.0191431, + "epoch": 0.8413395058017195, + "flos": 25591703604480.0, + "grad_norm": 2.9363310207261524, + "language_loss": 0.6948092, + "learning_rate": 2.5816669144152816e-07, + "loss": 0.71616775, + "num_input_tokens_seen": 151121650, + "step": 6997, + "time_per_iteration": 2.6781668663024902 + }, + { + "auxiliary_loss_clip": 0.01063308, + "auxiliary_loss_mlp": 0.01001166, + "balance_loss_clip": 1.00707304, + "balance_loss_mlp": 1.00022435, + "epoch": 0.8414597486923585, + "flos": 63635396624640.0, + "grad_norm": 0.8445080382103932, + "language_loss": 0.66317332, + "learning_rate": 2.5778401294829777e-07, + "loss": 0.6838181, + "num_input_tokens_seen": 151180390, + "step": 6998, + "time_per_iteration": 3.0814054012298584 + }, + { + "auxiliary_loss_clip": 0.01151308, + "auxiliary_loss_mlp": 0.00762151, + "balance_loss_clip": 1.04589462, + "balance_loss_mlp": 1.00066042, + "epoch": 0.8415799915829977, + "flos": 19098731571840.0, + "grad_norm": 1.682351147916638, + "language_loss": 0.64972079, + "learning_rate": 2.574015987478473e-07, + "loss": 0.66885531, + "num_input_tokens_seen": 151198520, + "step": 6999, + "time_per_iteration": 3.1543405055999756 + }, + { + "auxiliary_loss_clip": 0.0114422, + "auxiliary_loss_mlp": 0.01026856, + "balance_loss_clip": 1.04449415, + "balance_loss_mlp": 1.01951241, + "epoch": 0.8417002344736367, + "flos": 19821612781440.0, + "grad_norm": 2.0027613804907167, + "language_loss": 0.86758637, + "learning_rate": 2.570194488981887e-07, + "loss": 0.88929713, + "num_input_tokens_seen": 151215065, + "step": 7000, + "time_per_iteration": 3.245760679244995 + }, + { + "auxiliary_loss_clip": 0.01063715, + "auxiliary_loss_mlp": 0.0100232, + "balance_loss_clip": 1.00740051, + "balance_loss_mlp": 1.00134206, + "epoch": 0.8418204773642758, + "flos": 62161516834560.0, + "grad_norm": 0.8365305207924869, + "language_loss": 0.60315102, + "learning_rate": 2.566375634572939e-07, + "loss": 0.62381136, + "num_input_tokens_seen": 151275705, + "step": 7001, + "time_per_iteration": 2.952176094055176 + }, + { + "auxiliary_loss_clip": 0.01130805, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.04187727, + "balance_loss_mlp": 1.02282047, + "epoch": 0.841940720254915, + "flos": 17092905315840.0, + "grad_norm": 1.8577997917567743, + "language_loss": 0.76433909, + "learning_rate": 2.562559424830943e-07, + "loss": 0.7859453, + "num_input_tokens_seen": 151293665, + "step": 7002, + "time_per_iteration": 2.4901254177093506 + }, + { + "auxiliary_loss_clip": 0.01138536, + "auxiliary_loss_mlp": 0.01026132, + "balance_loss_clip": 1.0438807, + "balance_loss_mlp": 1.01856506, + "epoch": 0.842060963145554, + "flos": 16283586026880.0, + "grad_norm": 2.3551481325788077, + "language_loss": 0.70430696, + "learning_rate": 2.5587458603348256e-07, + "loss": 0.7259537, + "num_input_tokens_seen": 151310955, + "step": 7003, + "time_per_iteration": 4.277583599090576 + }, + { + "auxiliary_loss_clip": 0.01120997, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.04289627, + "balance_loss_mlp": 1.01989484, + "epoch": 0.8421812060361931, + "flos": 21908238681600.0, + "grad_norm": 1.8509304429421265, + "language_loss": 0.84062493, + "learning_rate": 2.554934941663085e-07, + "loss": 0.86210877, + "num_input_tokens_seen": 151328490, + "step": 7004, + "time_per_iteration": 2.534914255142212 + }, + { + "auxiliary_loss_clip": 0.0112543, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.04262447, + "balance_loss_mlp": 1.02118754, + "epoch": 0.8423014489268322, + "flos": 27777693502080.0, + "grad_norm": 1.9292011512674314, + "language_loss": 0.73605597, + "learning_rate": 2.5511266693938484e-07, + "loss": 0.75760102, + "num_input_tokens_seen": 151346950, + "step": 7005, + "time_per_iteration": 2.5786213874816895 + }, + { + "auxiliary_loss_clip": 0.01139064, + "auxiliary_loss_mlp": 0.01023516, + "balance_loss_clip": 1.0468719, + "balance_loss_mlp": 1.01548374, + "epoch": 0.8424216918174713, + "flos": 25117610970240.0, + "grad_norm": 1.4919419574774313, + "language_loss": 0.78014266, + "learning_rate": 2.547321044104822e-07, + "loss": 0.80176848, + "num_input_tokens_seen": 151368445, + "step": 7006, + "time_per_iteration": 2.5109426975250244 + }, + { + "auxiliary_loss_clip": 0.01170494, + "auxiliary_loss_mlp": 0.01024766, + "balance_loss_clip": 1.04820013, + "balance_loss_mlp": 1.01695836, + "epoch": 0.8425419347081103, + "flos": 24748448941440.0, + "grad_norm": 2.5368698893271873, + "language_loss": 0.76663572, + "learning_rate": 2.5435180663733113e-07, + "loss": 0.78858835, + "num_input_tokens_seen": 151388745, + "step": 7007, + "time_per_iteration": 2.46748423576355 + }, + { + "auxiliary_loss_clip": 0.01120961, + "auxiliary_loss_mlp": 0.01025575, + "balance_loss_clip": 1.04156554, + "balance_loss_mlp": 1.01806784, + "epoch": 0.8426621775987495, + "flos": 24820916630400.0, + "grad_norm": 2.422339124126465, + "language_loss": 0.71449769, + "learning_rate": 2.539717736776241e-07, + "loss": 0.73596311, + "num_input_tokens_seen": 151404970, + "step": 7008, + "time_per_iteration": 2.5830142498016357 + }, + { + "auxiliary_loss_clip": 0.01151559, + "auxiliary_loss_mlp": 0.01019968, + "balance_loss_clip": 1.0464747, + "balance_loss_mlp": 1.01277113, + "epoch": 0.8427824204893886, + "flos": 23550074467200.0, + "grad_norm": 1.4132298835942054, + "language_loss": 0.76208246, + "learning_rate": 2.535920055890097e-07, + "loss": 0.78379768, + "num_input_tokens_seen": 151426265, + "step": 7009, + "time_per_iteration": 2.4861574172973633 + }, + { + "auxiliary_loss_clip": 0.01106593, + "auxiliary_loss_mlp": 0.01026002, + "balance_loss_clip": 1.04019928, + "balance_loss_mlp": 1.01781821, + "epoch": 0.8429026633800276, + "flos": 16143858120960.0, + "grad_norm": 2.614146118247875, + "language_loss": 0.64188731, + "learning_rate": 2.5321250242910006e-07, + "loss": 0.66321325, + "num_input_tokens_seen": 151444180, + "step": 7010, + "time_per_iteration": 2.5254085063934326 + }, + { + "auxiliary_loss_clip": 0.01169085, + "auxiliary_loss_mlp": 0.01019881, + "balance_loss_clip": 1.04915309, + "balance_loss_mlp": 1.01271391, + "epoch": 0.8430229062706668, + "flos": 22198540400640.0, + "grad_norm": 1.6860232913138185, + "language_loss": 0.86381155, + "learning_rate": 2.5283326425546493e-07, + "loss": 0.88570118, + "num_input_tokens_seen": 151463290, + "step": 7011, + "time_per_iteration": 2.416750907897949 + }, + { + "auxiliary_loss_clip": 0.01119921, + "auxiliary_loss_mlp": 0.01022729, + "balance_loss_clip": 1.04480588, + "balance_loss_mlp": 1.01564813, + "epoch": 0.8431431491613058, + "flos": 35330317683840.0, + "grad_norm": 2.0615458343470903, + "language_loss": 0.69406796, + "learning_rate": 2.5245429112563443e-07, + "loss": 0.71549445, + "num_input_tokens_seen": 151483965, + "step": 7012, + "time_per_iteration": 2.630713939666748 + }, + { + "auxiliary_loss_clip": 0.01155272, + "auxiliary_loss_mlp": 0.01023803, + "balance_loss_clip": 1.04795647, + "balance_loss_mlp": 1.01654875, + "epoch": 0.8432633920519449, + "flos": 25812374808960.0, + "grad_norm": 1.9423220964313352, + "language_loss": 0.82055902, + "learning_rate": 2.5207558309709865e-07, + "loss": 0.84234983, + "num_input_tokens_seen": 151503700, + "step": 7013, + "time_per_iteration": 2.4918699264526367 + }, + { + "auxiliary_loss_clip": 0.01036527, + "auxiliary_loss_mlp": 0.00752813, + "balance_loss_clip": 1.0067488, + "balance_loss_mlp": 1.00052392, + "epoch": 0.8433836349425841, + "flos": 64959531592320.0, + "grad_norm": 0.6569127234440182, + "language_loss": 0.56280971, + "learning_rate": 2.516971402273065e-07, + "loss": 0.58070302, + "num_input_tokens_seen": 151569765, + "step": 7014, + "time_per_iteration": 3.099980592727661 + }, + { + "auxiliary_loss_clip": 0.01141514, + "auxiliary_loss_mlp": 0.01021305, + "balance_loss_clip": 1.04334664, + "balance_loss_mlp": 1.0138123, + "epoch": 0.8435038778332231, + "flos": 20229989483520.0, + "grad_norm": 2.0553432782330736, + "language_loss": 0.67841852, + "learning_rate": 2.513189625736687e-07, + "loss": 0.70004672, + "num_input_tokens_seen": 151586660, + "step": 7015, + "time_per_iteration": 2.5066864490509033 + }, + { + "auxiliary_loss_clip": 0.01131151, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.04449034, + "balance_loss_mlp": 1.02273619, + "epoch": 0.8436241207238622, + "flos": 20992229020800.0, + "grad_norm": 2.755947194929615, + "language_loss": 0.7182399, + "learning_rate": 2.509410501935534e-07, + "loss": 0.73985851, + "num_input_tokens_seen": 151602295, + "step": 7016, + "time_per_iteration": 2.5365467071533203 + }, + { + "auxiliary_loss_clip": 0.01141414, + "auxiliary_loss_mlp": 0.01025592, + "balance_loss_clip": 1.04419291, + "balance_loss_mlp": 1.01770008, + "epoch": 0.8437443636145013, + "flos": 14682257804160.0, + "grad_norm": 2.6702225492839724, + "language_loss": 0.75444531, + "learning_rate": 2.5056340314429116e-07, + "loss": 0.77611542, + "num_input_tokens_seen": 151619760, + "step": 7017, + "time_per_iteration": 2.4589955806732178 + }, + { + "auxiliary_loss_clip": 0.01115885, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.04075265, + "balance_loss_mlp": 1.01874912, + "epoch": 0.8438646065051404, + "flos": 21608814908160.0, + "grad_norm": 2.089713804972125, + "language_loss": 0.80624306, + "learning_rate": 2.5018602148316904e-07, + "loss": 0.82767051, + "num_input_tokens_seen": 151635795, + "step": 7018, + "time_per_iteration": 2.5542335510253906 + }, + { + "auxiliary_loss_clip": 0.01167768, + "auxiliary_loss_mlp": 0.01024268, + "balance_loss_clip": 1.049474, + "balance_loss_mlp": 1.0172112, + "epoch": 0.8439848493957794, + "flos": 23289937194240.0, + "grad_norm": 1.83796998565791, + "language_loss": 0.80477774, + "learning_rate": 2.498089052674359e-07, + "loss": 0.82669806, + "num_input_tokens_seen": 151653770, + "step": 7019, + "time_per_iteration": 2.4259159564971924 + }, + { + "auxiliary_loss_clip": 0.01154581, + "auxiliary_loss_mlp": 0.01031547, + "balance_loss_clip": 1.0472424, + "balance_loss_mlp": 1.02390015, + "epoch": 0.8441050922864186, + "flos": 19719339782400.0, + "grad_norm": 1.9688488560038782, + "language_loss": 0.7545135, + "learning_rate": 2.494320545543007e-07, + "loss": 0.77637482, + "num_input_tokens_seen": 151673340, + "step": 7020, + "time_per_iteration": 2.4665603637695312 + }, + { + "auxiliary_loss_clip": 0.01171846, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.04804623, + "balance_loss_mlp": 1.02033305, + "epoch": 0.8442253351770577, + "flos": 21835268202240.0, + "grad_norm": 1.824427514333625, + "language_loss": 0.66347456, + "learning_rate": 2.490554694009308e-07, + "loss": 0.68547791, + "num_input_tokens_seen": 151694205, + "step": 7021, + "time_per_iteration": 2.438016414642334 + }, + { + "auxiliary_loss_clip": 0.01157772, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.0455215, + "balance_loss_mlp": 1.0204376, + "epoch": 0.8443455780676967, + "flos": 34346365447680.0, + "grad_norm": 1.670534450432678, + "language_loss": 0.78423417, + "learning_rate": 2.4867914986445426e-07, + "loss": 0.80608678, + "num_input_tokens_seen": 151716595, + "step": 7022, + "time_per_iteration": 2.561007499694824 + }, + { + "auxiliary_loss_clip": 0.01143064, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.04327321, + "balance_loss_mlp": 1.02060652, + "epoch": 0.8444658209583359, + "flos": 48214599281280.0, + "grad_norm": 1.996121878638722, + "language_loss": 0.71180314, + "learning_rate": 2.483030960019581e-07, + "loss": 0.73351103, + "num_input_tokens_seen": 151740525, + "step": 7023, + "time_per_iteration": 2.721406936645508 + }, + { + "auxiliary_loss_clip": 0.01020184, + "auxiliary_loss_mlp": 0.01001243, + "balance_loss_clip": 1.00777507, + "balance_loss_mlp": 1.00023603, + "epoch": 0.8445860638489749, + "flos": 68484773105280.0, + "grad_norm": 0.7302638063044736, + "language_loss": 0.55468637, + "learning_rate": 2.479273078704891e-07, + "loss": 0.57490063, + "num_input_tokens_seen": 151793890, + "step": 7024, + "time_per_iteration": 2.964556932449341 + }, + { + "auxiliary_loss_clip": 0.01011993, + "auxiliary_loss_mlp": 0.01001869, + "balance_loss_clip": 1.00723648, + "balance_loss_mlp": 1.0007906, + "epoch": 0.844706306739614, + "flos": 62833331882880.0, + "grad_norm": 0.7781757905091449, + "language_loss": 0.64715898, + "learning_rate": 2.475517855270552e-07, + "loss": 0.6672976, + "num_input_tokens_seen": 151853970, + "step": 7025, + "time_per_iteration": 3.0873799324035645 + }, + { + "auxiliary_loss_clip": 0.01168209, + "auxiliary_loss_mlp": 0.0102477, + "balance_loss_clip": 1.04887891, + "balance_loss_mlp": 1.01797545, + "epoch": 0.8448265496302532, + "flos": 14976114969600.0, + "grad_norm": 2.019787176381427, + "language_loss": 0.72488904, + "learning_rate": 2.4717652902862143e-07, + "loss": 0.74681878, + "num_input_tokens_seen": 151872945, + "step": 7026, + "time_per_iteration": 3.149320602416992 + }, + { + "auxiliary_loss_clip": 0.01145566, + "auxiliary_loss_mlp": 0.0102316, + "balance_loss_clip": 1.04615736, + "balance_loss_mlp": 1.01601303, + "epoch": 0.8449467925208922, + "flos": 23441265192960.0, + "grad_norm": 1.8510073217911334, + "language_loss": 0.81397879, + "learning_rate": 2.4680153843211495e-07, + "loss": 0.83566606, + "num_input_tokens_seen": 151892875, + "step": 7027, + "time_per_iteration": 3.2803590297698975 + }, + { + "auxiliary_loss_clip": 0.01141054, + "auxiliary_loss_mlp": 0.01026227, + "balance_loss_clip": 1.04597235, + "balance_loss_mlp": 1.01873434, + "epoch": 0.8450670354115313, + "flos": 22748045639040.0, + "grad_norm": 1.6798213034595841, + "language_loss": 0.72428513, + "learning_rate": 2.464268137944212e-07, + "loss": 0.74595791, + "num_input_tokens_seen": 151914170, + "step": 7028, + "time_per_iteration": 2.5283913612365723 + }, + { + "auxiliary_loss_clip": 0.01101336, + "auxiliary_loss_mlp": 0.0102838, + "balance_loss_clip": 1.03981757, + "balance_loss_mlp": 1.01993382, + "epoch": 0.8451872783021703, + "flos": 29825571605760.0, + "grad_norm": 1.8840780298095239, + "language_loss": 0.78209949, + "learning_rate": 2.46052355172385e-07, + "loss": 0.80339664, + "num_input_tokens_seen": 151932210, + "step": 7029, + "time_per_iteration": 4.125337362289429 + }, + { + "auxiliary_loss_clip": 0.01167174, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.04623771, + "balance_loss_mlp": 1.02263665, + "epoch": 0.8453075211928095, + "flos": 21870029589120.0, + "grad_norm": 1.8261703983494522, + "language_loss": 0.74739993, + "learning_rate": 2.456781626228128e-07, + "loss": 0.76937789, + "num_input_tokens_seen": 151951715, + "step": 7030, + "time_per_iteration": 2.433302402496338 + }, + { + "auxiliary_loss_clip": 0.01023151, + "auxiliary_loss_mlp": 0.00752672, + "balance_loss_clip": 1.00683618, + "balance_loss_mlp": 1.00042593, + "epoch": 0.8454277640834486, + "flos": 58751869288320.0, + "grad_norm": 0.9225930110683923, + "language_loss": 0.66355199, + "learning_rate": 2.453042362024675e-07, + "loss": 0.68131018, + "num_input_tokens_seen": 152004960, + "step": 7031, + "time_per_iteration": 3.1254518032073975 + }, + { + "auxiliary_loss_clip": 0.01166997, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.04693604, + "balance_loss_mlp": 1.02049732, + "epoch": 0.8455480069740876, + "flos": 27090076469760.0, + "grad_norm": 1.4703193999350195, + "language_loss": 0.73007464, + "learning_rate": 2.449305759680751e-07, + "loss": 0.75201917, + "num_input_tokens_seen": 152026285, + "step": 7032, + "time_per_iteration": 2.4868085384368896 + }, + { + "auxiliary_loss_clip": 0.01124909, + "auxiliary_loss_mlp": 0.01026098, + "balance_loss_clip": 1.04489732, + "balance_loss_mlp": 1.01871336, + "epoch": 0.8456682498647268, + "flos": 27198670262400.0, + "grad_norm": 1.5525274209474516, + "language_loss": 0.75192696, + "learning_rate": 2.445571819763188e-07, + "loss": 0.77343702, + "num_input_tokens_seen": 152048585, + "step": 7033, + "time_per_iteration": 2.5513522624969482 + }, + { + "auxiliary_loss_clip": 0.01169209, + "auxiliary_loss_mlp": 0.01025732, + "balance_loss_clip": 1.04956174, + "balance_loss_mlp": 1.01785851, + "epoch": 0.8457884927553658, + "flos": 20631901737600.0, + "grad_norm": 1.9483530044555588, + "language_loss": 0.58257961, + "learning_rate": 2.4418405428384227e-07, + "loss": 0.60452908, + "num_input_tokens_seen": 152068795, + "step": 7034, + "time_per_iteration": 2.438037633895874 + }, + { + "auxiliary_loss_clip": 0.01170882, + "auxiliary_loss_mlp": 0.00762415, + "balance_loss_clip": 1.04889023, + "balance_loss_mlp": 1.00063348, + "epoch": 0.8459087356460049, + "flos": 15299023259520.0, + "grad_norm": 1.768923895153959, + "language_loss": 0.71918058, + "learning_rate": 2.4381119294724864e-07, + "loss": 0.73851359, + "num_input_tokens_seen": 152086240, + "step": 7035, + "time_per_iteration": 2.404186964035034 + }, + { + "auxiliary_loss_clip": 0.01169396, + "auxiliary_loss_mlp": 0.01021497, + "balance_loss_clip": 1.04816151, + "balance_loss_mlp": 1.01446414, + "epoch": 0.846028978536644, + "flos": 18843155326080.0, + "grad_norm": 2.4472012980474815, + "language_loss": 0.54022896, + "learning_rate": 2.434385980231004e-07, + "loss": 0.56213784, + "num_input_tokens_seen": 152105080, + "step": 7036, + "time_per_iteration": 2.392099380493164 + }, + { + "auxiliary_loss_clip": 0.01153906, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.04589057, + "balance_loss_mlp": 1.02123308, + "epoch": 0.8461492214272831, + "flos": 52661740285440.0, + "grad_norm": 1.4894677751473533, + "language_loss": 0.65301996, + "learning_rate": 2.4306626956792043e-07, + "loss": 0.67484778, + "num_input_tokens_seen": 152130025, + "step": 7037, + "time_per_iteration": 2.7297046184539795 + }, + { + "auxiliary_loss_clip": 0.01151752, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.0434252, + "balance_loss_mlp": 1.02142417, + "epoch": 0.8462694643179222, + "flos": 18588405093120.0, + "grad_norm": 1.718735598088722, + "language_loss": 0.75448132, + "learning_rate": 2.4269420763819017e-07, + "loss": 0.77628183, + "num_input_tokens_seen": 152148070, + "step": 7038, + "time_per_iteration": 2.441145658493042 + }, + { + "auxiliary_loss_clip": 0.01149927, + "auxiliary_loss_mlp": 0.01026673, + "balance_loss_clip": 1.04467201, + "balance_loss_mlp": 1.01960361, + "epoch": 0.8463897072085613, + "flos": 24387080163840.0, + "grad_norm": 2.5370987170642523, + "language_loss": 0.83536661, + "learning_rate": 2.4232241229035223e-07, + "loss": 0.85713261, + "num_input_tokens_seen": 152165825, + "step": 7039, + "time_per_iteration": 2.4721784591674805 + }, + { + "auxiliary_loss_clip": 0.01055086, + "auxiliary_loss_mlp": 0.01000788, + "balance_loss_clip": 1.00796342, + "balance_loss_mlp": 0.99979848, + "epoch": 0.8465099500992004, + "flos": 68702140258560.0, + "grad_norm": 0.7865011284093479, + "language_loss": 0.5668124, + "learning_rate": 2.419508835808064e-07, + "loss": 0.58737111, + "num_input_tokens_seen": 152222380, + "step": 7040, + "time_per_iteration": 3.0024290084838867 + }, + { + "auxiliary_loss_clip": 0.01140572, + "auxiliary_loss_mlp": 0.01023251, + "balance_loss_clip": 1.04555964, + "balance_loss_mlp": 1.0157795, + "epoch": 0.8466301929898394, + "flos": 13735724561280.0, + "grad_norm": 2.1338059197753467, + "language_loss": 0.62852162, + "learning_rate": 2.415796215659134e-07, + "loss": 0.65015984, + "num_input_tokens_seen": 152239085, + "step": 7041, + "time_per_iteration": 2.4681270122528076 + }, + { + "auxiliary_loss_clip": 0.01128276, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.04073882, + "balance_loss_mlp": 1.02368927, + "epoch": 0.8467504358804786, + "flos": 19241260738560.0, + "grad_norm": 2.5118571740719284, + "language_loss": 0.77330363, + "learning_rate": 2.412086263019939e-07, + "loss": 0.79489619, + "num_input_tokens_seen": 152257110, + "step": 7042, + "time_per_iteration": 2.516371488571167 + }, + { + "auxiliary_loss_clip": 0.01163192, + "auxiliary_loss_mlp": 0.0102265, + "balance_loss_clip": 1.04641986, + "balance_loss_mlp": 1.01565838, + "epoch": 0.8468706787711177, + "flos": 21324115710720.0, + "grad_norm": 1.7273361161293772, + "language_loss": 0.79846692, + "learning_rate": 2.408378978453276e-07, + "loss": 0.82032537, + "num_input_tokens_seen": 152277230, + "step": 7043, + "time_per_iteration": 2.4192042350769043 + }, + { + "auxiliary_loss_clip": 0.01053577, + "auxiliary_loss_mlp": 0.01001148, + "balance_loss_clip": 1.00643599, + "balance_loss_mlp": 1.00009942, + "epoch": 0.8469909216617567, + "flos": 64877439058560.0, + "grad_norm": 0.8124792868356707, + "language_loss": 0.639507, + "learning_rate": 2.404674362521533e-07, + "loss": 0.66005427, + "num_input_tokens_seen": 152335725, + "step": 7044, + "time_per_iteration": 2.9512150287628174 + }, + { + "auxiliary_loss_clip": 0.01154797, + "auxiliary_loss_mlp": 0.01025106, + "balance_loss_clip": 1.04785991, + "balance_loss_mlp": 1.01857615, + "epoch": 0.8471111645523959, + "flos": 19280583152640.0, + "grad_norm": 2.5413047969226303, + "language_loss": 0.74715251, + "learning_rate": 2.4009724157866997e-07, + "loss": 0.76895154, + "num_input_tokens_seen": 152352785, + "step": 7045, + "time_per_iteration": 2.4702515602111816 + }, + { + "auxiliary_loss_clip": 0.01165953, + "auxiliary_loss_mlp": 0.01021482, + "balance_loss_clip": 1.04684305, + "balance_loss_mlp": 1.0146277, + "epoch": 0.8472314074430349, + "flos": 22015826893440.0, + "grad_norm": 1.9688647735032963, + "language_loss": 0.76579964, + "learning_rate": 2.3972731388103564e-07, + "loss": 0.78767407, + "num_input_tokens_seen": 152371265, + "step": 7046, + "time_per_iteration": 2.436272382736206 + }, + { + "auxiliary_loss_clip": 0.01006055, + "auxiliary_loss_mlp": 0.01000721, + "balance_loss_clip": 1.00675619, + "balance_loss_mlp": 0.9996959, + "epoch": 0.847351650333674, + "flos": 57882580243200.0, + "grad_norm": 0.804490146413904, + "language_loss": 0.6242851, + "learning_rate": 2.393576532153687e-07, + "loss": 0.64435279, + "num_input_tokens_seen": 152435050, + "step": 7047, + "time_per_iteration": 3.2677061557769775 + }, + { + "auxiliary_loss_clip": 0.01051203, + "auxiliary_loss_mlp": 0.01002228, + "balance_loss_clip": 1.00678086, + "balance_loss_mlp": 1.00119102, + "epoch": 0.8474718932243132, + "flos": 41284238313600.0, + "grad_norm": 0.9306802291549738, + "language_loss": 0.57809722, + "learning_rate": 2.389882596377453e-07, + "loss": 0.5986315, + "num_input_tokens_seen": 152489315, + "step": 7048, + "time_per_iteration": 3.278942823410034 + }, + { + "auxiliary_loss_clip": 0.01166243, + "auxiliary_loss_mlp": 0.01025077, + "balance_loss_clip": 1.04593039, + "balance_loss_mlp": 1.01798701, + "epoch": 0.8475921361149522, + "flos": 38180906974080.0, + "grad_norm": 1.7937068223294184, + "language_loss": 0.76369333, + "learning_rate": 2.386191332042031e-07, + "loss": 0.7856065, + "num_input_tokens_seen": 152511210, + "step": 7049, + "time_per_iteration": 2.555948495864868 + }, + { + "auxiliary_loss_clip": 0.01173019, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.04856491, + "balance_loss_mlp": 1.02415109, + "epoch": 0.8477123790055913, + "flos": 25375054723200.0, + "grad_norm": 2.1506751371245625, + "language_loss": 0.72863793, + "learning_rate": 2.3825027397073794e-07, + "loss": 0.75068104, + "num_input_tokens_seen": 152531685, + "step": 7050, + "time_per_iteration": 2.4534146785736084 + }, + { + "auxiliary_loss_clip": 0.01151617, + "auxiliary_loss_mlp": 0.01025528, + "balance_loss_clip": 1.04854774, + "balance_loss_mlp": 1.01836944, + "epoch": 0.8478326218962304, + "flos": 30225185389440.0, + "grad_norm": 2.3790620675044667, + "language_loss": 0.66906172, + "learning_rate": 2.3788168199330515e-07, + "loss": 0.69083315, + "num_input_tokens_seen": 152553245, + "step": 7051, + "time_per_iteration": 2.5378990173339844 + }, + { + "auxiliary_loss_clip": 0.01123192, + "auxiliary_loss_mlp": 0.01024948, + "balance_loss_clip": 1.03840494, + "balance_loss_mlp": 1.01720548, + "epoch": 0.8479528647868695, + "flos": 38213800853760.0, + "grad_norm": 1.5738754513542392, + "language_loss": 0.72332114, + "learning_rate": 2.3751335732782074e-07, + "loss": 0.74480259, + "num_input_tokens_seen": 152574505, + "step": 7052, + "time_per_iteration": 3.626823663711548 + }, + { + "auxiliary_loss_clip": 0.01153741, + "auxiliary_loss_mlp": 0.01023864, + "balance_loss_clip": 1.04867232, + "balance_loss_mlp": 1.01697969, + "epoch": 0.8480731076775085, + "flos": 20957790856320.0, + "grad_norm": 2.775159675656574, + "language_loss": 0.79301089, + "learning_rate": 2.371453000301582e-07, + "loss": 0.81478703, + "num_input_tokens_seen": 152593190, + "step": 7053, + "time_per_iteration": 3.305734872817993 + }, + { + "auxiliary_loss_clip": 0.01120673, + "auxiliary_loss_mlp": 0.0102259, + "balance_loss_clip": 1.04235172, + "balance_loss_mlp": 1.01534796, + "epoch": 0.8481933505681477, + "flos": 32596510487040.0, + "grad_norm": 1.8982056278663153, + "language_loss": 0.74491227, + "learning_rate": 2.3677751015615222e-07, + "loss": 0.7663449, + "num_input_tokens_seen": 152615265, + "step": 7054, + "time_per_iteration": 2.6534676551818848 + }, + { + "auxiliary_loss_clip": 0.01128495, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.04211235, + "balance_loss_mlp": 1.02250385, + "epoch": 0.8483135934587868, + "flos": 20741177888640.0, + "grad_norm": 6.4113317822500955, + "language_loss": 0.85446525, + "learning_rate": 2.3640998776159593e-07, + "loss": 0.87605125, + "num_input_tokens_seen": 152632770, + "step": 7055, + "time_per_iteration": 2.5368616580963135 + }, + { + "auxiliary_loss_clip": 0.01142583, + "auxiliary_loss_mlp": 0.01024078, + "balance_loss_clip": 1.04661322, + "balance_loss_mlp": 1.01757503, + "epoch": 0.8484338363494258, + "flos": 21653057485440.0, + "grad_norm": 1.696637914807235, + "language_loss": 0.80836135, + "learning_rate": 2.3604273290224253e-07, + "loss": 0.83002794, + "num_input_tokens_seen": 152653485, + "step": 7056, + "time_per_iteration": 4.111876010894775 + }, + { + "auxiliary_loss_clip": 0.01143, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.04697442, + "balance_loss_mlp": 1.01926374, + "epoch": 0.848554079240065, + "flos": 15013964926080.0, + "grad_norm": 5.354178601298593, + "language_loss": 0.74532712, + "learning_rate": 2.356757456338039e-07, + "loss": 0.76703167, + "num_input_tokens_seen": 152670970, + "step": 7057, + "time_per_iteration": 2.5151333808898926 + }, + { + "auxiliary_loss_clip": 0.01041311, + "auxiliary_loss_mlp": 0.01004485, + "balance_loss_clip": 1.01219511, + "balance_loss_mlp": 1.00358486, + "epoch": 0.848674322130704, + "flos": 68060453742720.0, + "grad_norm": 0.7587836478282766, + "language_loss": 0.59053481, + "learning_rate": 2.3530902601195147e-07, + "loss": 0.61099279, + "num_input_tokens_seen": 152739460, + "step": 7058, + "time_per_iteration": 3.232900619506836 + }, + { + "auxiliary_loss_clip": 0.01153256, + "auxiliary_loss_mlp": 0.01026659, + "balance_loss_clip": 1.04589605, + "balance_loss_mlp": 1.01919007, + "epoch": 0.8487945650213431, + "flos": 18475788977280.0, + "grad_norm": 2.288663788107731, + "language_loss": 0.7845397, + "learning_rate": 2.34942574092317e-07, + "loss": 0.80633879, + "num_input_tokens_seen": 152754710, + "step": 7059, + "time_per_iteration": 2.4912662506103516 + }, + { + "auxiliary_loss_clip": 0.01157789, + "auxiliary_loss_mlp": 0.01027294, + "balance_loss_clip": 1.0466367, + "balance_loss_mlp": 1.01956964, + "epoch": 0.8489148079119821, + "flos": 23473189405440.0, + "grad_norm": 2.046672342978728, + "language_loss": 0.76982927, + "learning_rate": 2.3457638993049045e-07, + "loss": 0.7916801, + "num_input_tokens_seen": 152772700, + "step": 7060, + "time_per_iteration": 2.485705614089966 + }, + { + "auxiliary_loss_clip": 0.01103012, + "auxiliary_loss_mlp": 0.01024996, + "balance_loss_clip": 1.04459226, + "balance_loss_mlp": 1.01712263, + "epoch": 0.8490350508026213, + "flos": 19937604775680.0, + "grad_norm": 1.9836885594549305, + "language_loss": 0.64139098, + "learning_rate": 2.3421047358202252e-07, + "loss": 0.66267109, + "num_input_tokens_seen": 152791550, + "step": 7061, + "time_per_iteration": 2.6032285690307617 + }, + { + "auxiliary_loss_clip": 0.01156212, + "auxiliary_loss_mlp": 0.0102549, + "balance_loss_clip": 1.04772854, + "balance_loss_mlp": 1.01815271, + "epoch": 0.8491552936932604, + "flos": 24279958828800.0, + "grad_norm": 2.378411864153359, + "language_loss": 0.83323956, + "learning_rate": 2.3384482510242144e-07, + "loss": 0.85505664, + "num_input_tokens_seen": 152809410, + "step": 7062, + "time_per_iteration": 2.5427939891815186 + }, + { + "auxiliary_loss_clip": 0.01169812, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.04686034, + "balance_loss_mlp": 1.02133393, + "epoch": 0.8492755365838994, + "flos": 22522526098560.0, + "grad_norm": 2.1011232943617295, + "language_loss": 0.77031207, + "learning_rate": 2.3347944454715575e-07, + "loss": 0.79230094, + "num_input_tokens_seen": 152825800, + "step": 7063, + "time_per_iteration": 2.4446041584014893 + }, + { + "auxiliary_loss_clip": 0.01172263, + "auxiliary_loss_mlp": 0.01026911, + "balance_loss_clip": 1.04819655, + "balance_loss_mlp": 1.01917481, + "epoch": 0.8493957794745386, + "flos": 26980441182720.0, + "grad_norm": 1.6680447041578148, + "language_loss": 0.67359096, + "learning_rate": 2.331143319716542e-07, + "loss": 0.69558269, + "num_input_tokens_seen": 152845330, + "step": 7064, + "time_per_iteration": 2.5089316368103027 + }, + { + "auxiliary_loss_clip": 0.01129792, + "auxiliary_loss_mlp": 0.01024405, + "balance_loss_clip": 1.04310048, + "balance_loss_mlp": 1.01690078, + "epoch": 0.8495160223651776, + "flos": 29861985018240.0, + "grad_norm": 2.1123509315111204, + "language_loss": 0.65628272, + "learning_rate": 2.3274948743130363e-07, + "loss": 0.67782468, + "num_input_tokens_seen": 152865165, + "step": 7065, + "time_per_iteration": 2.575930595397949 + }, + { + "auxiliary_loss_clip": 0.01167406, + "auxiliary_loss_mlp": 0.01022546, + "balance_loss_clip": 1.04544401, + "balance_loss_mlp": 1.01488066, + "epoch": 0.8496362652558167, + "flos": 23075443128960.0, + "grad_norm": 1.6074538234146951, + "language_loss": 0.79418659, + "learning_rate": 2.3238491098145085e-07, + "loss": 0.81608611, + "num_input_tokens_seen": 152884695, + "step": 7066, + "time_per_iteration": 2.438753366470337 + }, + { + "auxiliary_loss_clip": 0.01151934, + "auxiliary_loss_mlp": 0.01023731, + "balance_loss_clip": 1.04515088, + "balance_loss_mlp": 1.01657271, + "epoch": 0.8497565081464559, + "flos": 14609107756800.0, + "grad_norm": 2.362393907336275, + "language_loss": 0.73456085, + "learning_rate": 2.3202060267740141e-07, + "loss": 0.75631756, + "num_input_tokens_seen": 152902220, + "step": 7067, + "time_per_iteration": 2.422459602355957 + }, + { + "auxiliary_loss_clip": 0.01105649, + "auxiliary_loss_mlp": 0.0102206, + "balance_loss_clip": 1.03790438, + "balance_loss_mlp": 1.01512194, + "epoch": 0.8498767510370949, + "flos": 21136446126720.0, + "grad_norm": 4.330845913354848, + "language_loss": 0.77395004, + "learning_rate": 2.3165656257442044e-07, + "loss": 0.79522717, + "num_input_tokens_seen": 152920740, + "step": 7068, + "time_per_iteration": 2.60725998878479 + }, + { + "auxiliary_loss_clip": 0.01150387, + "auxiliary_loss_mlp": 0.01019828, + "balance_loss_clip": 1.04544473, + "balance_loss_mlp": 1.01318479, + "epoch": 0.849996993927734, + "flos": 23654538195840.0, + "grad_norm": 2.0285131909466214, + "language_loss": 0.90283847, + "learning_rate": 2.31292790727734e-07, + "loss": 0.92454058, + "num_input_tokens_seen": 152938305, + "step": 7069, + "time_per_iteration": 2.4926342964172363 + }, + { + "auxiliary_loss_clip": 0.01164987, + "auxiliary_loss_mlp": 0.0102392, + "balance_loss_clip": 1.04546714, + "balance_loss_mlp": 1.01706576, + "epoch": 0.8501172368183731, + "flos": 20558069331840.0, + "grad_norm": 2.5003186858193485, + "language_loss": 0.80253559, + "learning_rate": 2.3092928719252392e-07, + "loss": 0.82442468, + "num_input_tokens_seen": 152956705, + "step": 7070, + "time_per_iteration": 2.4435787200927734 + }, + { + "auxiliary_loss_clip": 0.01150121, + "auxiliary_loss_mlp": 0.01025582, + "balance_loss_clip": 1.0440917, + "balance_loss_mlp": 1.01815248, + "epoch": 0.8502374797090122, + "flos": 22272624201600.0, + "grad_norm": 1.9339965885010126, + "language_loss": 0.78398669, + "learning_rate": 2.3056605202393475e-07, + "loss": 0.80574375, + "num_input_tokens_seen": 152974265, + "step": 7071, + "time_per_iteration": 2.4622950553894043 + }, + { + "auxiliary_loss_clip": 0.01148632, + "auxiliary_loss_mlp": 0.00762745, + "balance_loss_clip": 1.04228556, + "balance_loss_mlp": 1.00059915, + "epoch": 0.8503577225996513, + "flos": 23659817495040.0, + "grad_norm": 1.910195539815755, + "language_loss": 0.66962284, + "learning_rate": 2.3020308527706888e-07, + "loss": 0.68873656, + "num_input_tokens_seen": 152993680, + "step": 7072, + "time_per_iteration": 2.509488821029663 + }, + { + "auxiliary_loss_clip": 0.01143644, + "auxiliary_loss_mlp": 0.01026619, + "balance_loss_clip": 1.04285133, + "balance_loss_mlp": 1.0192306, + "epoch": 0.8504779654902904, + "flos": 26758513002240.0, + "grad_norm": 1.69815993317552, + "language_loss": 0.88684362, + "learning_rate": 2.2984038700698715e-07, + "loss": 0.90854621, + "num_input_tokens_seen": 153012990, + "step": 7073, + "time_per_iteration": 2.5324513912200928 + }, + { + "auxiliary_loss_clip": 0.01151781, + "auxiliary_loss_mlp": 0.01026533, + "balance_loss_clip": 1.04787588, + "balance_loss_mlp": 1.01894546, + "epoch": 0.8505982083809295, + "flos": 26468247196800.0, + "grad_norm": 1.6192265688314937, + "language_loss": 0.79033172, + "learning_rate": 2.2947795726871222e-07, + "loss": 0.81211483, + "num_input_tokens_seen": 153034015, + "step": 7074, + "time_per_iteration": 2.4963085651397705 + }, + { + "auxiliary_loss_clip": 0.01152229, + "auxiliary_loss_mlp": 0.00762087, + "balance_loss_clip": 1.04907775, + "balance_loss_mlp": 1.00069904, + "epoch": 0.8507184512715685, + "flos": 20303390926080.0, + "grad_norm": 1.9456002213235553, + "language_loss": 0.85696316, + "learning_rate": 2.2911579611722253e-07, + "loss": 0.87610626, + "num_input_tokens_seen": 153053160, + "step": 7075, + "time_per_iteration": 2.4573702812194824 + }, + { + "auxiliary_loss_clip": 0.01138503, + "auxiliary_loss_mlp": 0.01025835, + "balance_loss_clip": 1.04435968, + "balance_loss_mlp": 1.01805639, + "epoch": 0.8508386941622077, + "flos": 19025186474880.0, + "grad_norm": 1.8603989557751914, + "language_loss": 0.87337416, + "learning_rate": 2.2875390360745905e-07, + "loss": 0.89501756, + "num_input_tokens_seen": 153072565, + "step": 7076, + "time_per_iteration": 2.485783576965332 + }, + { + "auxiliary_loss_clip": 0.01130465, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.04274929, + "balance_loss_mlp": 1.02310729, + "epoch": 0.8509589370528468, + "flos": 16433405654400.0, + "grad_norm": 1.704377323230328, + "language_loss": 0.77604008, + "learning_rate": 2.2839227979432008e-07, + "loss": 0.79765004, + "num_input_tokens_seen": 153090215, + "step": 7077, + "time_per_iteration": 2.5181801319122314 + }, + { + "auxiliary_loss_clip": 0.01140031, + "auxiliary_loss_mlp": 0.01028884, + "balance_loss_clip": 1.04362404, + "balance_loss_mlp": 1.02142453, + "epoch": 0.8510791799434858, + "flos": 18259714713600.0, + "grad_norm": 1.8159895597018392, + "language_loss": 0.84726083, + "learning_rate": 2.2803092473266373e-07, + "loss": 0.86894995, + "num_input_tokens_seen": 153107740, + "step": 7078, + "time_per_iteration": 2.4769530296325684 + }, + { + "auxiliary_loss_clip": 0.01171232, + "auxiliary_loss_mlp": 0.01025977, + "balance_loss_clip": 1.04963446, + "balance_loss_mlp": 1.01928949, + "epoch": 0.851199422834125, + "flos": 23441372933760.0, + "grad_norm": 2.2391023970680624, + "language_loss": 0.86929262, + "learning_rate": 2.2766983847730724e-07, + "loss": 0.89126468, + "num_input_tokens_seen": 153127410, + "step": 7079, + "time_per_iteration": 3.2882609367370605 + }, + { + "auxiliary_loss_clip": 0.01134264, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.0422889, + "balance_loss_mlp": 1.02163124, + "epoch": 0.851319665724764, + "flos": 16289404030080.0, + "grad_norm": 5.626774725558899, + "language_loss": 0.67018205, + "learning_rate": 2.2730902108302663e-07, + "loss": 0.69181955, + "num_input_tokens_seen": 153144325, + "step": 7080, + "time_per_iteration": 3.5582540035247803 + }, + { + "auxiliary_loss_clip": 0.0113234, + "auxiliary_loss_mlp": 0.01025557, + "balance_loss_clip": 1.04184926, + "balance_loss_mlp": 1.01784396, + "epoch": 0.8514399086154031, + "flos": 18989347680000.0, + "grad_norm": 1.6729068064777024, + "language_loss": 0.68959808, + "learning_rate": 2.269484726045583e-07, + "loss": 0.71117711, + "num_input_tokens_seen": 153163240, + "step": 7081, + "time_per_iteration": 2.5687451362609863 + }, + { + "auxiliary_loss_clip": 0.01130417, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.04517281, + "balance_loss_mlp": 1.02081704, + "epoch": 0.8515601515060423, + "flos": 24571194301440.0, + "grad_norm": 1.9369659198052203, + "language_loss": 0.79248059, + "learning_rate": 2.2658819309659672e-07, + "loss": 0.81406927, + "num_input_tokens_seen": 153183440, + "step": 7082, + "time_per_iteration": 2.5854835510253906 + }, + { + "auxiliary_loss_clip": 0.01137143, + "auxiliary_loss_mlp": 0.01021045, + "balance_loss_clip": 1.04640722, + "balance_loss_mlp": 1.01451588, + "epoch": 0.8516803943966813, + "flos": 19529443555200.0, + "grad_norm": 1.9894473980612943, + "language_loss": 0.84921724, + "learning_rate": 2.2622818261379706e-07, + "loss": 0.87079906, + "num_input_tokens_seen": 153200460, + "step": 7083, + "time_per_iteration": 3.2478187084198 + }, + { + "auxiliary_loss_clip": 0.01136886, + "auxiliary_loss_mlp": 0.010243, + "balance_loss_clip": 1.04319143, + "balance_loss_mlp": 1.01671803, + "epoch": 0.8518006372873204, + "flos": 20265792364800.0, + "grad_norm": 1.6995677278765775, + "language_loss": 0.74692696, + "learning_rate": 2.2586844121077142e-07, + "loss": 0.76853883, + "num_input_tokens_seen": 153218970, + "step": 7084, + "time_per_iteration": 2.48114013671875 + }, + { + "auxiliary_loss_clip": 0.01112882, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.03988266, + "balance_loss_mlp": 1.02249002, + "epoch": 0.8519208801779595, + "flos": 24133227770880.0, + "grad_norm": 1.7775019546384483, + "language_loss": 0.71884084, + "learning_rate": 2.2550896894209215e-07, + "loss": 0.74027646, + "num_input_tokens_seen": 153238485, + "step": 7085, + "time_per_iteration": 2.5791821479797363 + }, + { + "auxiliary_loss_clip": 0.01013622, + "auxiliary_loss_mlp": 0.010011, + "balance_loss_clip": 1.00920832, + "balance_loss_mlp": 1.00005651, + "epoch": 0.8520411230685986, + "flos": 63035223252480.0, + "grad_norm": 0.6878643599054319, + "language_loss": 0.56649196, + "learning_rate": 2.2514976586229184e-07, + "loss": 0.58663917, + "num_input_tokens_seen": 153306430, + "step": 7086, + "time_per_iteration": 3.449052095413208 + }, + { + "auxiliary_loss_clip": 0.01053604, + "auxiliary_loss_mlp": 0.01001274, + "balance_loss_clip": 1.00700831, + "balance_loss_mlp": 1.00029624, + "epoch": 0.8521613659592376, + "flos": 65836865283840.0, + "grad_norm": 0.7505314754217998, + "language_loss": 0.54632092, + "learning_rate": 2.247908320258609e-07, + "loss": 0.56686962, + "num_input_tokens_seen": 153366520, + "step": 7087, + "time_per_iteration": 3.138150215148926 + }, + { + "auxiliary_loss_clip": 0.01104989, + "auxiliary_loss_mlp": 0.01024692, + "balance_loss_clip": 1.04218483, + "balance_loss_mlp": 1.01697874, + "epoch": 0.8522816088498768, + "flos": 23112323418240.0, + "grad_norm": 1.9842334131561823, + "language_loss": 0.79552442, + "learning_rate": 2.2443216748724914e-07, + "loss": 0.81682122, + "num_input_tokens_seen": 153387230, + "step": 7088, + "time_per_iteration": 2.5801069736480713 + }, + { + "auxiliary_loss_clip": 0.01158763, + "auxiliary_loss_mlp": 0.0076235, + "balance_loss_clip": 1.04752946, + "balance_loss_mlp": 1.00054884, + "epoch": 0.8524018517405159, + "flos": 31758140073600.0, + "grad_norm": 1.9527292208581866, + "language_loss": 0.74267197, + "learning_rate": 2.2407377230086588e-07, + "loss": 0.76188314, + "num_input_tokens_seen": 153409585, + "step": 7089, + "time_per_iteration": 2.5487563610076904 + }, + { + "auxiliary_loss_clip": 0.01123283, + "auxiliary_loss_mlp": 0.01023226, + "balance_loss_clip": 1.04441154, + "balance_loss_mlp": 1.01612711, + "epoch": 0.8525220946311549, + "flos": 18690318956160.0, + "grad_norm": 2.9186596162693252, + "language_loss": 0.83430994, + "learning_rate": 2.23715646521079e-07, + "loss": 0.85577506, + "num_input_tokens_seen": 153427105, + "step": 7090, + "time_per_iteration": 2.5282247066497803 + }, + { + "auxiliary_loss_clip": 0.01156808, + "auxiliary_loss_mlp": 0.00762609, + "balance_loss_clip": 1.0450654, + "balance_loss_mlp": 1.00054455, + "epoch": 0.852642337521794, + "flos": 21793216354560.0, + "grad_norm": 1.9146302900501169, + "language_loss": 0.84378958, + "learning_rate": 2.2335779020221724e-07, + "loss": 0.8629837, + "num_input_tokens_seen": 153443725, + "step": 7091, + "time_per_iteration": 2.4751174449920654 + }, + { + "auxiliary_loss_clip": 0.0105479, + "auxiliary_loss_mlp": 0.0100098, + "balance_loss_clip": 1.01467896, + "balance_loss_mlp": 0.99965686, + "epoch": 0.8527625804124331, + "flos": 69040132260480.0, + "grad_norm": 0.7978339220704614, + "language_loss": 0.56463754, + "learning_rate": 2.2300020339856497e-07, + "loss": 0.58519518, + "num_input_tokens_seen": 153506410, + "step": 7092, + "time_per_iteration": 3.118492841720581 + }, + { + "auxiliary_loss_clip": 0.01134405, + "auxiliary_loss_mlp": 0.01023249, + "balance_loss_clip": 1.04295361, + "balance_loss_mlp": 1.01590252, + "epoch": 0.8528828233030722, + "flos": 26979399688320.0, + "grad_norm": 8.071407832106518, + "language_loss": 0.77997911, + "learning_rate": 2.2264288616436966e-07, + "loss": 0.80155563, + "num_input_tokens_seen": 153526665, + "step": 7093, + "time_per_iteration": 2.5772597789764404 + }, + { + "auxiliary_loss_clip": 0.01133116, + "auxiliary_loss_mlp": 0.01026227, + "balance_loss_clip": 1.04327726, + "balance_loss_mlp": 1.01896751, + "epoch": 0.8530030661937112, + "flos": 17487598936320.0, + "grad_norm": 2.2006988551464683, + "language_loss": 0.72743994, + "learning_rate": 2.222858385538351e-07, + "loss": 0.74903333, + "num_input_tokens_seen": 153543465, + "step": 7094, + "time_per_iteration": 2.463207244873047 + }, + { + "auxiliary_loss_clip": 0.0114844, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.04353642, + "balance_loss_mlp": 1.02168763, + "epoch": 0.8531233090843504, + "flos": 22160798184960.0, + "grad_norm": 1.9006423515019613, + "language_loss": 0.67845345, + "learning_rate": 2.2192906062112527e-07, + "loss": 0.7002281, + "num_input_tokens_seen": 153563340, + "step": 7095, + "time_per_iteration": 2.463075876235962 + }, + { + "auxiliary_loss_clip": 0.01166872, + "auxiliary_loss_mlp": 0.01024047, + "balance_loss_clip": 1.04537368, + "balance_loss_mlp": 1.01714456, + "epoch": 0.8532435519749895, + "flos": 37635388145280.0, + "grad_norm": 1.583750325132169, + "language_loss": 0.7077511, + "learning_rate": 2.2157255242036377e-07, + "loss": 0.72966033, + "num_input_tokens_seen": 153587005, + "step": 7096, + "time_per_iteration": 2.5845799446105957 + }, + { + "auxiliary_loss_clip": 0.01122089, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.04250836, + "balance_loss_mlp": 1.02037263, + "epoch": 0.8533637948656285, + "flos": 21398163598080.0, + "grad_norm": 1.6751974751757737, + "language_loss": 0.74195194, + "learning_rate": 2.2121631400563135e-07, + "loss": 0.76344919, + "num_input_tokens_seen": 153606835, + "step": 7097, + "time_per_iteration": 2.5481960773468018 + }, + { + "auxiliary_loss_clip": 0.01051504, + "auxiliary_loss_mlp": 0.01003085, + "balance_loss_clip": 1.01231909, + "balance_loss_mlp": 1.00217915, + "epoch": 0.8534840377562677, + "flos": 53345122490880.0, + "grad_norm": 0.7684350215163896, + "language_loss": 0.52953911, + "learning_rate": 2.208603454309701e-07, + "loss": 0.55008495, + "num_input_tokens_seen": 153664925, + "step": 7098, + "time_per_iteration": 3.0026113986968994 + }, + { + "auxiliary_loss_clip": 0.01111202, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.04138982, + "balance_loss_mlp": 1.01767659, + "epoch": 0.8536042806469067, + "flos": 20814148368000.0, + "grad_norm": 1.7440843856178512, + "language_loss": 0.70948231, + "learning_rate": 2.2050464675037994e-07, + "loss": 0.73085427, + "num_input_tokens_seen": 153683550, + "step": 7099, + "time_per_iteration": 2.5573785305023193 + }, + { + "auxiliary_loss_clip": 0.01139638, + "auxiliary_loss_mlp": 0.01025758, + "balance_loss_clip": 1.04532659, + "balance_loss_mlp": 1.01821768, + "epoch": 0.8537245235375458, + "flos": 24681368292480.0, + "grad_norm": 2.277390987620403, + "language_loss": 0.72903591, + "learning_rate": 2.2014921801782016e-07, + "loss": 0.75068986, + "num_input_tokens_seen": 153703040, + "step": 7100, + "time_per_iteration": 2.515376567840576 + }, + { + "auxiliary_loss_clip": 0.01139342, + "auxiliary_loss_mlp": 0.0102186, + "balance_loss_clip": 1.04016209, + "balance_loss_mlp": 1.01482666, + "epoch": 0.853844766428185, + "flos": 24384817607040.0, + "grad_norm": 2.0368959340787836, + "language_loss": 0.73891008, + "learning_rate": 2.1979405928720872e-07, + "loss": 0.76052207, + "num_input_tokens_seen": 153722695, + "step": 7101, + "time_per_iteration": 2.509791135787964 + }, + { + "auxiliary_loss_clip": 0.01142541, + "auxiliary_loss_mlp": 0.01022023, + "balance_loss_clip": 1.04470611, + "balance_loss_mlp": 1.01506758, + "epoch": 0.853965009318824, + "flos": 20955707867520.0, + "grad_norm": 1.5036657623999992, + "language_loss": 0.79190749, + "learning_rate": 2.1943917061242257e-07, + "loss": 0.81355315, + "num_input_tokens_seen": 153742550, + "step": 7102, + "time_per_iteration": 2.4884681701660156 + }, + { + "auxiliary_loss_clip": 0.01161231, + "auxiliary_loss_mlp": 0.00762364, + "balance_loss_clip": 1.04677629, + "balance_loss_mlp": 1.00051117, + "epoch": 0.8540852522094631, + "flos": 24201816791040.0, + "grad_norm": 1.6281872098458645, + "language_loss": 0.66315329, + "learning_rate": 2.1908455204729903e-07, + "loss": 0.68238926, + "num_input_tokens_seen": 153761700, + "step": 7103, + "time_per_iteration": 2.49631667137146 + }, + { + "auxiliary_loss_clip": 0.01138177, + "auxiliary_loss_mlp": 0.01023887, + "balance_loss_clip": 1.04211342, + "balance_loss_mlp": 1.01626408, + "epoch": 0.8542054951001022, + "flos": 25082921410560.0, + "grad_norm": 3.890889454559976, + "language_loss": 0.78422016, + "learning_rate": 2.1873020364563265e-07, + "loss": 0.80584079, + "num_input_tokens_seen": 153780765, + "step": 7104, + "time_per_iteration": 2.51365065574646 + }, + { + "auxiliary_loss_clip": 0.01150703, + "auxiliary_loss_mlp": 0.01026738, + "balance_loss_clip": 1.04642844, + "balance_loss_mlp": 1.01941586, + "epoch": 0.8543257379907413, + "flos": 24316551809280.0, + "grad_norm": 2.5079438505284126, + "language_loss": 0.76150352, + "learning_rate": 2.183761254611789e-07, + "loss": 0.78327799, + "num_input_tokens_seen": 153801090, + "step": 7105, + "time_per_iteration": 3.4796016216278076 + }, + { + "auxiliary_loss_clip": 0.01153445, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.04648066, + "balance_loss_mlp": 1.01839614, + "epoch": 0.8544459808813804, + "flos": 55286630467200.0, + "grad_norm": 2.2560242923423686, + "language_loss": 0.70271051, + "learning_rate": 2.1802231754764987e-07, + "loss": 0.72450209, + "num_input_tokens_seen": 153826530, + "step": 7106, + "time_per_iteration": 3.5585222244262695 + }, + { + "auxiliary_loss_clip": 0.01140328, + "auxiliary_loss_mlp": 0.01025052, + "balance_loss_clip": 1.04175377, + "balance_loss_mlp": 1.01706505, + "epoch": 0.8545662237720195, + "flos": 25776248705280.0, + "grad_norm": 1.967527053655893, + "language_loss": 0.76404595, + "learning_rate": 2.17668779958718e-07, + "loss": 0.78569973, + "num_input_tokens_seen": 153849110, + "step": 7107, + "time_per_iteration": 2.5463385581970215 + }, + { + "auxiliary_loss_clip": 0.01168321, + "auxiliary_loss_mlp": 0.01025931, + "balance_loss_clip": 1.04697847, + "balance_loss_mlp": 1.01834905, + "epoch": 0.8546864666626586, + "flos": 11108320427520.0, + "grad_norm": 2.257951033006694, + "language_loss": 0.80164886, + "learning_rate": 2.1731551274801553e-07, + "loss": 0.82359135, + "num_input_tokens_seen": 153865550, + "step": 7108, + "time_per_iteration": 2.3902180194854736 + }, + { + "auxiliary_loss_clip": 0.01140726, + "auxiliary_loss_mlp": 0.01021975, + "balance_loss_clip": 1.04568028, + "balance_loss_mlp": 1.01436949, + "epoch": 0.8548067095532976, + "flos": 25520169669120.0, + "grad_norm": 1.994072968271586, + "language_loss": 0.61626464, + "learning_rate": 2.169625159691324e-07, + "loss": 0.63789165, + "num_input_tokens_seen": 153885425, + "step": 7109, + "time_per_iteration": 3.2775559425354004 + }, + { + "auxiliary_loss_clip": 0.01121202, + "auxiliary_loss_mlp": 0.01022286, + "balance_loss_clip": 1.04166436, + "balance_loss_mlp": 1.01521134, + "epoch": 0.8549269524439368, + "flos": 24717853532160.0, + "grad_norm": 2.164589095132412, + "language_loss": 0.74129927, + "learning_rate": 2.1660978967561784e-07, + "loss": 0.76273412, + "num_input_tokens_seen": 153904760, + "step": 7110, + "time_per_iteration": 3.3629095554351807 + }, + { + "auxiliary_loss_clip": 0.01166027, + "auxiliary_loss_mlp": 0.01022707, + "balance_loss_clip": 1.04485536, + "balance_loss_mlp": 1.01575112, + "epoch": 0.8550471953345758, + "flos": 19825599191040.0, + "grad_norm": 4.5410121963819225, + "language_loss": 0.78477895, + "learning_rate": 2.1625733392098035e-07, + "loss": 0.80666637, + "num_input_tokens_seen": 153920370, + "step": 7111, + "time_per_iteration": 2.4143998622894287 + }, + { + "auxiliary_loss_clip": 0.0116629, + "auxiliary_loss_mlp": 0.01025474, + "balance_loss_clip": 1.04537714, + "balance_loss_mlp": 1.01822329, + "epoch": 0.8551674382252149, + "flos": 22820441500800.0, + "grad_norm": 1.7843276078189738, + "language_loss": 0.79452318, + "learning_rate": 2.159051487586867e-07, + "loss": 0.81644082, + "num_input_tokens_seen": 153940500, + "step": 7112, + "time_per_iteration": 2.42858624458313 + }, + { + "auxiliary_loss_clip": 0.01143775, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.04682612, + "balance_loss_mlp": 1.02556467, + "epoch": 0.8552876811158541, + "flos": 20631255292800.0, + "grad_norm": 2.6117120078732166, + "language_loss": 0.71841645, + "learning_rate": 2.155532342421642e-07, + "loss": 0.74019164, + "num_input_tokens_seen": 153958500, + "step": 7113, + "time_per_iteration": 2.4635138511657715 + }, + { + "auxiliary_loss_clip": 0.01157849, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.04653573, + "balance_loss_mlp": 1.02366161, + "epoch": 0.8554079240064931, + "flos": 23112359331840.0, + "grad_norm": 1.8475153809395193, + "language_loss": 0.78349978, + "learning_rate": 2.1520159042479636e-07, + "loss": 0.80539441, + "num_input_tokens_seen": 153976790, + "step": 7114, + "time_per_iteration": 2.4637362957000732 + }, + { + "auxiliary_loss_clip": 0.01154721, + "auxiliary_loss_mlp": 0.01026044, + "balance_loss_clip": 1.04764843, + "balance_loss_mlp": 1.01855767, + "epoch": 0.8555281668971322, + "flos": 22128047959680.0, + "grad_norm": 2.226674205460541, + "language_loss": 0.70851231, + "learning_rate": 2.148502173599287e-07, + "loss": 0.73031992, + "num_input_tokens_seen": 153994930, + "step": 7115, + "time_per_iteration": 2.4620587825775146 + }, + { + "auxiliary_loss_clip": 0.01134448, + "auxiliary_loss_mlp": 0.01022256, + "balance_loss_clip": 1.04321086, + "balance_loss_mlp": 1.0142746, + "epoch": 0.8556484097877713, + "flos": 31139040234240.0, + "grad_norm": 1.8785082683952645, + "language_loss": 0.65718937, + "learning_rate": 2.1449911510086372e-07, + "loss": 0.67875648, + "num_input_tokens_seen": 154014400, + "step": 7116, + "time_per_iteration": 2.5571329593658447 + }, + { + "auxiliary_loss_clip": 0.01150601, + "auxiliary_loss_mlp": 0.01026395, + "balance_loss_clip": 1.0445447, + "balance_loss_mlp": 1.01920378, + "epoch": 0.8557686526784104, + "flos": 24316551809280.0, + "grad_norm": 2.777417532366955, + "language_loss": 0.76800478, + "learning_rate": 2.141482837008628e-07, + "loss": 0.78977472, + "num_input_tokens_seen": 154034940, + "step": 7117, + "time_per_iteration": 2.511093854904175 + }, + { + "auxiliary_loss_clip": 0.01144649, + "auxiliary_loss_mlp": 0.01028779, + "balance_loss_clip": 1.04316139, + "balance_loss_mlp": 1.02093518, + "epoch": 0.8558888955690495, + "flos": 17712723427200.0, + "grad_norm": 2.0636987182164472, + "language_loss": 0.72205663, + "learning_rate": 2.1379772321314826e-07, + "loss": 0.74379086, + "num_input_tokens_seen": 154052985, + "step": 7118, + "time_per_iteration": 2.422373056411743 + }, + { + "auxiliary_loss_clip": 0.01088209, + "auxiliary_loss_mlp": 0.010272, + "balance_loss_clip": 1.03986716, + "balance_loss_mlp": 1.01939213, + "epoch": 0.8560091384596886, + "flos": 19171702051200.0, + "grad_norm": 1.924947415374973, + "language_loss": 0.81422567, + "learning_rate": 2.1344743369089802e-07, + "loss": 0.83537972, + "num_input_tokens_seen": 154068765, + "step": 7119, + "time_per_iteration": 2.5576541423797607 + }, + { + "auxiliary_loss_clip": 0.0113746, + "auxiliary_loss_mlp": 0.01021407, + "balance_loss_clip": 1.04440653, + "balance_loss_mlp": 1.01445973, + "epoch": 0.8561293813503277, + "flos": 23914855036800.0, + "grad_norm": 1.8820127243925793, + "language_loss": 0.81906247, + "learning_rate": 2.130974151872522e-07, + "loss": 0.84065109, + "num_input_tokens_seen": 154089100, + "step": 7120, + "time_per_iteration": 2.5269672870635986 + }, + { + "auxiliary_loss_clip": 0.0112776, + "auxiliary_loss_mlp": 0.01025255, + "balance_loss_clip": 1.04449749, + "balance_loss_mlp": 1.01812005, + "epoch": 0.8562496242409667, + "flos": 22529206028160.0, + "grad_norm": 1.7409732935050308, + "language_loss": 0.78602576, + "learning_rate": 2.1274766775530773e-07, + "loss": 0.80755591, + "num_input_tokens_seen": 154108965, + "step": 7121, + "time_per_iteration": 2.5429272651672363 + }, + { + "auxiliary_loss_clip": 0.0116865, + "auxiliary_loss_mlp": 0.01022471, + "balance_loss_clip": 1.0457201, + "balance_loss_mlp": 1.01506841, + "epoch": 0.8563698671316058, + "flos": 14712745472640.0, + "grad_norm": 2.015725870501803, + "language_loss": 0.7966398, + "learning_rate": 2.1239819144812077e-07, + "loss": 0.81855106, + "num_input_tokens_seen": 154123425, + "step": 7122, + "time_per_iteration": 2.3755223751068115 + }, + { + "auxiliary_loss_clip": 0.01118726, + "auxiliary_loss_mlp": 0.01025415, + "balance_loss_clip": 1.04018056, + "balance_loss_mlp": 1.01805425, + "epoch": 0.856490110022245, + "flos": 39167768211840.0, + "grad_norm": 1.7037229312192503, + "language_loss": 0.70037019, + "learning_rate": 2.1204898631870716e-07, + "loss": 0.72181159, + "num_input_tokens_seen": 154148315, + "step": 7123, + "time_per_iteration": 2.665858745574951 + }, + { + "auxiliary_loss_clip": 0.01140931, + "auxiliary_loss_mlp": 0.01021887, + "balance_loss_clip": 1.04651046, + "balance_loss_mlp": 1.01507401, + "epoch": 0.856610352912884, + "flos": 29059345658880.0, + "grad_norm": 1.9543245263948303, + "language_loss": 0.75883383, + "learning_rate": 2.1170005242004006e-07, + "loss": 0.78046203, + "num_input_tokens_seen": 154169665, + "step": 7124, + "time_per_iteration": 2.5466041564941406 + }, + { + "auxiliary_loss_clip": 0.01144023, + "auxiliary_loss_mlp": 0.01022491, + "balance_loss_clip": 1.04380989, + "balance_loss_mlp": 1.01540709, + "epoch": 0.8567305958035231, + "flos": 23878333883520.0, + "grad_norm": 1.894273827089325, + "language_loss": 0.77871102, + "learning_rate": 2.1135138980505384e-07, + "loss": 0.80037618, + "num_input_tokens_seen": 154190335, + "step": 7125, + "time_per_iteration": 2.4987032413482666 + }, + { + "auxiliary_loss_clip": 0.01136412, + "auxiliary_loss_mlp": 0.01021508, + "balance_loss_clip": 1.04560459, + "balance_loss_mlp": 1.01412916, + "epoch": 0.8568508386941622, + "flos": 22200120599040.0, + "grad_norm": 1.6845734419386529, + "language_loss": 0.72160375, + "learning_rate": 2.110029985266395e-07, + "loss": 0.74318302, + "num_input_tokens_seen": 154210040, + "step": 7126, + "time_per_iteration": 2.4778850078582764 + }, + { + "auxiliary_loss_clip": 0.01142901, + "auxiliary_loss_mlp": 0.01024918, + "balance_loss_clip": 1.04284513, + "balance_loss_mlp": 1.01768804, + "epoch": 0.8569710815848013, + "flos": 17307507121920.0, + "grad_norm": 1.7976607658842012, + "language_loss": 0.73911989, + "learning_rate": 2.1065487863764787e-07, + "loss": 0.76079804, + "num_input_tokens_seen": 154228385, + "step": 7127, + "time_per_iteration": 2.45661997795105 + }, + { + "auxiliary_loss_clip": 0.01101778, + "auxiliary_loss_mlp": 0.01023584, + "balance_loss_clip": 1.03612638, + "balance_loss_mlp": 1.01570415, + "epoch": 0.8570913244754403, + "flos": 23732285184000.0, + "grad_norm": 1.4586540121751612, + "language_loss": 0.85714626, + "learning_rate": 2.1030703019088846e-07, + "loss": 0.87839985, + "num_input_tokens_seen": 154249015, + "step": 7128, + "time_per_iteration": 2.5649757385253906 + }, + { + "auxiliary_loss_clip": 0.01147998, + "auxiliary_loss_mlp": 0.01023704, + "balance_loss_clip": 1.04453754, + "balance_loss_mlp": 1.01647699, + "epoch": 0.8572115673660795, + "flos": 20048748433920.0, + "grad_norm": 2.141865277818988, + "language_loss": 0.70563638, + "learning_rate": 2.099594532391291e-07, + "loss": 0.72735345, + "num_input_tokens_seen": 154267700, + "step": 7129, + "time_per_iteration": 2.454463243484497 + }, + { + "auxiliary_loss_clip": 0.01145018, + "auxiliary_loss_mlp": 0.01023888, + "balance_loss_clip": 1.04438853, + "balance_loss_mlp": 1.01667047, + "epoch": 0.8573318102567186, + "flos": 27160389342720.0, + "grad_norm": 1.4866498890170112, + "language_loss": 0.79037493, + "learning_rate": 2.0961214783509806e-07, + "loss": 0.81206399, + "num_input_tokens_seen": 154290580, + "step": 7130, + "time_per_iteration": 2.5432825088500977 + }, + { + "auxiliary_loss_clip": 0.01143418, + "auxiliary_loss_mlp": 0.0102263, + "balance_loss_clip": 1.04336429, + "balance_loss_mlp": 1.01531935, + "epoch": 0.8574520531473576, + "flos": 24936585402240.0, + "grad_norm": 1.807141131663687, + "language_loss": 0.74890745, + "learning_rate": 2.0926511403148051e-07, + "loss": 0.77056789, + "num_input_tokens_seen": 154309545, + "step": 7131, + "time_per_iteration": 3.252516269683838 + }, + { + "auxiliary_loss_clip": 0.01135282, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.04577148, + "balance_loss_mlp": 1.02346802, + "epoch": 0.8575722960379968, + "flos": 18771154513920.0, + "grad_norm": 1.8556911154116151, + "language_loss": 0.76217788, + "learning_rate": 2.0891835188092143e-07, + "loss": 0.78383553, + "num_input_tokens_seen": 154326545, + "step": 7132, + "time_per_iteration": 2.5149612426757812 + }, + { + "auxiliary_loss_clip": 0.01133434, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.04306674, + "balance_loss_mlp": 1.0190835, + "epoch": 0.8576925389286358, + "flos": 22200300167040.0, + "grad_norm": 2.1654746180115527, + "language_loss": 0.81112581, + "learning_rate": 2.0857186143602434e-07, + "loss": 0.83272892, + "num_input_tokens_seen": 154345190, + "step": 7133, + "time_per_iteration": 3.2992610931396484 + }, + { + "auxiliary_loss_clip": 0.01114878, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.03962362, + "balance_loss_mlp": 1.02193999, + "epoch": 0.8578127818192749, + "flos": 22894345733760.0, + "grad_norm": 1.893246493519847, + "language_loss": 0.67462105, + "learning_rate": 2.0822564274935094e-07, + "loss": 0.69607019, + "num_input_tokens_seen": 154364615, + "step": 7134, + "time_per_iteration": 2.5340800285339355 + }, + { + "auxiliary_loss_clip": 0.01140091, + "auxiliary_loss_mlp": 0.01024241, + "balance_loss_clip": 1.04686797, + "balance_loss_mlp": 1.01635265, + "epoch": 0.8579330247099141, + "flos": 34824839541120.0, + "grad_norm": 1.721951474833702, + "language_loss": 0.66901219, + "learning_rate": 2.078796958734239e-07, + "loss": 0.69065547, + "num_input_tokens_seen": 154387335, + "step": 7135, + "time_per_iteration": 2.6089277267456055 + }, + { + "auxiliary_loss_clip": 0.0115391, + "auxiliary_loss_mlp": 0.01023307, + "balance_loss_clip": 1.04603982, + "balance_loss_mlp": 1.01613939, + "epoch": 0.8580532676005531, + "flos": 19755681367680.0, + "grad_norm": 2.163940977471808, + "language_loss": 0.74882799, + "learning_rate": 2.0753402086072124e-07, + "loss": 0.77060014, + "num_input_tokens_seen": 154405965, + "step": 7136, + "time_per_iteration": 3.977055549621582 + }, + { + "auxiliary_loss_clip": 0.0109478, + "auxiliary_loss_mlp": 0.01029934, + "balance_loss_clip": 1.04114413, + "balance_loss_mlp": 1.02225637, + "epoch": 0.8581735104911922, + "flos": 22739318634240.0, + "grad_norm": 2.1742036435055208, + "language_loss": 0.75080645, + "learning_rate": 2.071886177636828e-07, + "loss": 0.7720536, + "num_input_tokens_seen": 154422750, + "step": 7137, + "time_per_iteration": 2.729172945022583 + }, + { + "auxiliary_loss_clip": 0.01151197, + "auxiliary_loss_mlp": 0.0102386, + "balance_loss_clip": 1.04633021, + "balance_loss_mlp": 1.0167191, + "epoch": 0.8582937533818313, + "flos": 23149131880320.0, + "grad_norm": 2.745246369045839, + "language_loss": 0.83021796, + "learning_rate": 2.0684348663470575e-07, + "loss": 0.85196853, + "num_input_tokens_seen": 154442930, + "step": 7138, + "time_per_iteration": 2.945892572402954 + }, + { + "auxiliary_loss_clip": 0.01137513, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.04019344, + "balance_loss_mlp": 1.01925492, + "epoch": 0.8584139962724704, + "flos": 19498668577920.0, + "grad_norm": 2.095917485038604, + "language_loss": 0.61638254, + "learning_rate": 2.0649862752614555e-07, + "loss": 0.63802582, + "num_input_tokens_seen": 154461640, + "step": 7139, + "time_per_iteration": 2.4858579635620117 + }, + { + "auxiliary_loss_clip": 0.01045125, + "auxiliary_loss_mlp": 0.01001626, + "balance_loss_clip": 1.00841045, + "balance_loss_mlp": 1.00058842, + "epoch": 0.8585342391631094, + "flos": 71276577788160.0, + "grad_norm": 0.7507253156288719, + "language_loss": 0.56998563, + "learning_rate": 2.0615404049031838e-07, + "loss": 0.59045315, + "num_input_tokens_seen": 154518610, + "step": 7140, + "time_per_iteration": 3.0815317630767822 + }, + { + "auxiliary_loss_clip": 0.01155621, + "auxiliary_loss_mlp": 0.01025106, + "balance_loss_clip": 1.04696381, + "balance_loss_mlp": 1.01717269, + "epoch": 0.8586544820537486, + "flos": 10815432929280.0, + "grad_norm": 3.0586041259113226, + "language_loss": 0.78288537, + "learning_rate": 2.0580972557949616e-07, + "loss": 0.80469263, + "num_input_tokens_seen": 154533700, + "step": 7141, + "time_per_iteration": 2.4626331329345703 + }, + { + "auxiliary_loss_clip": 0.01055149, + "auxiliary_loss_mlp": 0.01001404, + "balance_loss_clip": 1.00759959, + "balance_loss_mlp": 1.0004859, + "epoch": 0.8587747249443877, + "flos": 64811184422400.0, + "grad_norm": 0.7951211748235975, + "language_loss": 0.54283273, + "learning_rate": 2.054656828459125e-07, + "loss": 0.56339824, + "num_input_tokens_seen": 154597810, + "step": 7142, + "time_per_iteration": 3.0784835815429688 + }, + { + "auxiliary_loss_clip": 0.01107829, + "auxiliary_loss_mlp": 0.0102796, + "balance_loss_clip": 1.04119682, + "balance_loss_mlp": 1.02020884, + "epoch": 0.8588949678350267, + "flos": 26834607964800.0, + "grad_norm": 1.7536533451594936, + "language_loss": 0.77334994, + "learning_rate": 2.051219123417578e-07, + "loss": 0.79470789, + "num_input_tokens_seen": 154617870, + "step": 7143, + "time_per_iteration": 2.5888254642486572 + }, + { + "auxiliary_loss_clip": 0.01169855, + "auxiliary_loss_mlp": 0.01022853, + "balance_loss_clip": 1.0462079, + "balance_loss_mlp": 1.01457429, + "epoch": 0.8590152107256659, + "flos": 26104256726400.0, + "grad_norm": 2.1634749688059216, + "language_loss": 0.59777266, + "learning_rate": 2.0477841411918196e-07, + "loss": 0.61969972, + "num_input_tokens_seen": 154637395, + "step": 7144, + "time_per_iteration": 2.458035707473755 + }, + { + "auxiliary_loss_clip": 0.01147736, + "auxiliary_loss_mlp": 0.01024129, + "balance_loss_clip": 1.04380345, + "balance_loss_mlp": 1.01685095, + "epoch": 0.859135453616305, + "flos": 26140885620480.0, + "grad_norm": 2.5808894777225038, + "language_loss": 0.74600959, + "learning_rate": 2.0443518823029326e-07, + "loss": 0.76772827, + "num_input_tokens_seen": 154657935, + "step": 7145, + "time_per_iteration": 2.5063374042510986 + }, + { + "auxiliary_loss_clip": 0.01120374, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.04149008, + "balance_loss_mlp": 1.02045298, + "epoch": 0.859255696506944, + "flos": 12969319046400.0, + "grad_norm": 2.1913946169143226, + "language_loss": 0.76431233, + "learning_rate": 2.0409223472715854e-07, + "loss": 0.78579843, + "num_input_tokens_seen": 154675080, + "step": 7146, + "time_per_iteration": 2.481884717941284 + }, + { + "auxiliary_loss_clip": 0.01127606, + "auxiliary_loss_mlp": 0.00761938, + "balance_loss_clip": 1.04333711, + "balance_loss_mlp": 1.00071836, + "epoch": 0.8593759393975832, + "flos": 18475753063680.0, + "grad_norm": 2.05350130370955, + "language_loss": 0.74921083, + "learning_rate": 2.0374955366180434e-07, + "loss": 0.76810628, + "num_input_tokens_seen": 154692720, + "step": 7147, + "time_per_iteration": 2.5115485191345215 + }, + { + "auxiliary_loss_clip": 0.01129058, + "auxiliary_loss_mlp": 0.01022617, + "balance_loss_clip": 1.04105234, + "balance_loss_mlp": 1.01485038, + "epoch": 0.8594961822882222, + "flos": 22200156512640.0, + "grad_norm": 1.6928107248890771, + "language_loss": 0.72828341, + "learning_rate": 2.034071450862147e-07, + "loss": 0.74980021, + "num_input_tokens_seen": 154710190, + "step": 7148, + "time_per_iteration": 2.517585515975952 + }, + { + "auxiliary_loss_clip": 0.01141059, + "auxiliary_loss_mlp": 0.01024565, + "balance_loss_clip": 1.04253483, + "balance_loss_mlp": 1.0167985, + "epoch": 0.8596164251788613, + "flos": 23294749616640.0, + "grad_norm": 1.6593175331200605, + "language_loss": 0.76522529, + "learning_rate": 2.030650090523327e-07, + "loss": 0.78688157, + "num_input_tokens_seen": 154729380, + "step": 7149, + "time_per_iteration": 2.4822306632995605 + }, + { + "auxiliary_loss_clip": 0.01123436, + "auxiliary_loss_mlp": 0.01022648, + "balance_loss_clip": 1.04141986, + "balance_loss_mlp": 1.01502502, + "epoch": 0.8597366680695004, + "flos": 31649905416960.0, + "grad_norm": 1.6434773885091594, + "language_loss": 0.59370327, + "learning_rate": 2.0272314561205995e-07, + "loss": 0.6151641, + "num_input_tokens_seen": 154749775, + "step": 7150, + "time_per_iteration": 2.5925605297088623 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01019619, + "balance_loss_clip": 1.04010677, + "balance_loss_mlp": 1.01265168, + "epoch": 0.8598569109601395, + "flos": 21287738211840.0, + "grad_norm": 1.921797688535412, + "language_loss": 0.72701579, + "learning_rate": 2.023815548172567e-07, + "loss": 0.74840009, + "num_input_tokens_seen": 154769845, + "step": 7151, + "time_per_iteration": 2.548100709915161 + }, + { + "auxiliary_loss_clip": 0.01153943, + "auxiliary_loss_mlp": 0.01024529, + "balance_loss_clip": 1.04516816, + "balance_loss_mlp": 1.01688147, + "epoch": 0.8599771538507786, + "flos": 25447809720960.0, + "grad_norm": 1.5685394225966678, + "language_loss": 0.66113973, + "learning_rate": 2.0204023671974267e-07, + "loss": 0.68292445, + "num_input_tokens_seen": 154789230, + "step": 7152, + "time_per_iteration": 2.4891810417175293 + }, + { + "auxiliary_loss_clip": 0.01147458, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.04351807, + "balance_loss_mlp": 1.0197531, + "epoch": 0.8600973967414177, + "flos": 16723958768640.0, + "grad_norm": 1.9581683129244998, + "language_loss": 0.80804336, + "learning_rate": 2.0169919137129532e-07, + "loss": 0.82979465, + "num_input_tokens_seen": 154807670, + "step": 7153, + "time_per_iteration": 2.4296035766601562 + }, + { + "auxiliary_loss_clip": 0.01156716, + "auxiliary_loss_mlp": 0.01023871, + "balance_loss_clip": 1.04835749, + "balance_loss_mlp": 1.01597977, + "epoch": 0.8602176396320568, + "flos": 25227928615680.0, + "grad_norm": 2.1642444647114987, + "language_loss": 0.70632815, + "learning_rate": 2.013584188236508e-07, + "loss": 0.72813404, + "num_input_tokens_seen": 154825575, + "step": 7154, + "time_per_iteration": 2.5422861576080322 + }, + { + "auxiliary_loss_clip": 0.01170623, + "auxiliary_loss_mlp": 0.01023324, + "balance_loss_clip": 1.04831791, + "balance_loss_mlp": 1.01537907, + "epoch": 0.8603378825226958, + "flos": 20412236113920.0, + "grad_norm": 1.6341846391824693, + "language_loss": 0.79261434, + "learning_rate": 2.0101791912850396e-07, + "loss": 0.81455386, + "num_input_tokens_seen": 154845115, + "step": 7155, + "time_per_iteration": 2.4166481494903564 + }, + { + "auxiliary_loss_clip": 0.01141149, + "auxiliary_loss_mlp": 0.01021843, + "balance_loss_clip": 1.04703295, + "balance_loss_mlp": 1.01450825, + "epoch": 0.8604581254133349, + "flos": 34930201109760.0, + "grad_norm": 7.838791042666217, + "language_loss": 0.63977087, + "learning_rate": 2.006776923375082e-07, + "loss": 0.66140079, + "num_input_tokens_seen": 154866770, + "step": 7156, + "time_per_iteration": 2.5938730239868164 + }, + { + "auxiliary_loss_clip": 0.01168088, + "auxiliary_loss_mlp": 0.010211, + "balance_loss_clip": 1.046803, + "balance_loss_mlp": 1.0136379, + "epoch": 0.860578368303974, + "flos": 22596538072320.0, + "grad_norm": 1.648113954539244, + "language_loss": 0.71120083, + "learning_rate": 2.003377385022764e-07, + "loss": 0.73309267, + "num_input_tokens_seen": 154885595, + "step": 7157, + "time_per_iteration": 2.420905590057373 + }, + { + "auxiliary_loss_clip": 0.01139844, + "auxiliary_loss_mlp": 0.01027149, + "balance_loss_clip": 1.04235578, + "balance_loss_mlp": 1.0201906, + "epoch": 0.8606986111946131, + "flos": 21324331192320.0, + "grad_norm": 1.8256404216734028, + "language_loss": 0.77266097, + "learning_rate": 1.9999805767437826e-07, + "loss": 0.79433084, + "num_input_tokens_seen": 154904485, + "step": 7158, + "time_per_iteration": 3.3149585723876953 + }, + { + "auxiliary_loss_clip": 0.0113381, + "auxiliary_loss_mlp": 0.01022171, + "balance_loss_clip": 1.04231477, + "balance_loss_mlp": 1.01503611, + "epoch": 0.8608188540852522, + "flos": 28877206769280.0, + "grad_norm": 1.647009716794953, + "language_loss": 0.71974409, + "learning_rate": 1.9965864990534386e-07, + "loss": 0.74130392, + "num_input_tokens_seen": 154925010, + "step": 7159, + "time_per_iteration": 2.552638530731201 + }, + { + "auxiliary_loss_clip": 0.01117366, + "auxiliary_loss_mlp": 0.01022571, + "balance_loss_clip": 1.03832269, + "balance_loss_mlp": 1.01577866, + "epoch": 0.8609390969758913, + "flos": 29716187713920.0, + "grad_norm": 1.9553027936076615, + "language_loss": 0.77523756, + "learning_rate": 1.9931951524666092e-07, + "loss": 0.79663694, + "num_input_tokens_seen": 154946100, + "step": 7160, + "time_per_iteration": 3.351903200149536 + }, + { + "auxiliary_loss_clip": 0.01156327, + "auxiliary_loss_mlp": 0.00762324, + "balance_loss_clip": 1.04613984, + "balance_loss_mlp": 1.00062418, + "epoch": 0.8610593398665304, + "flos": 21249349551360.0, + "grad_norm": 1.588489438852303, + "language_loss": 0.81307054, + "learning_rate": 1.9898065374977534e-07, + "loss": 0.83225703, + "num_input_tokens_seen": 154966305, + "step": 7161, + "time_per_iteration": 2.45837664604187 + }, + { + "auxiliary_loss_clip": 0.01121971, + "auxiliary_loss_mlp": 0.01020521, + "balance_loss_clip": 1.04085541, + "balance_loss_mlp": 1.01409316, + "epoch": 0.8611795827571694, + "flos": 14830102183680.0, + "grad_norm": 1.9567455753871597, + "language_loss": 0.72694671, + "learning_rate": 1.9864206546609342e-07, + "loss": 0.74837166, + "num_input_tokens_seen": 154985145, + "step": 7162, + "time_per_iteration": 2.474337577819824 + }, + { + "auxiliary_loss_clip": 0.01166618, + "auxiliary_loss_mlp": 0.01018918, + "balance_loss_clip": 1.04636192, + "balance_loss_mlp": 1.0118165, + "epoch": 0.8612998256478086, + "flos": 24243258107520.0, + "grad_norm": 2.409396812387493, + "language_loss": 0.84357387, + "learning_rate": 1.983037504469771e-07, + "loss": 0.86542928, + "num_input_tokens_seen": 155003855, + "step": 7163, + "time_per_iteration": 3.1708056926727295 + }, + { + "auxiliary_loss_clip": 0.0115803, + "auxiliary_loss_mlp": 0.01024594, + "balance_loss_clip": 1.04850125, + "balance_loss_mlp": 1.01705992, + "epoch": 0.8614200685384477, + "flos": 21252653602560.0, + "grad_norm": 1.8781622979002455, + "language_loss": 0.66666114, + "learning_rate": 1.9796570874374984e-07, + "loss": 0.68848741, + "num_input_tokens_seen": 155023960, + "step": 7164, + "time_per_iteration": 2.4534566402435303 + }, + { + "auxiliary_loss_clip": 0.01140227, + "auxiliary_loss_mlp": 0.01020419, + "balance_loss_clip": 1.0432303, + "balance_loss_mlp": 1.01290333, + "epoch": 0.8615403114290867, + "flos": 20007738080640.0, + "grad_norm": 1.7240894661316901, + "language_loss": 0.77568352, + "learning_rate": 1.976279404076917e-07, + "loss": 0.79728997, + "num_input_tokens_seen": 155043360, + "step": 7165, + "time_per_iteration": 2.493433952331543 + }, + { + "auxiliary_loss_clip": 0.01125642, + "auxiliary_loss_mlp": 0.01025578, + "balance_loss_clip": 1.04454398, + "balance_loss_mlp": 1.01827049, + "epoch": 0.8616605543197259, + "flos": 29789373674880.0, + "grad_norm": 1.9263810411023923, + "language_loss": 0.76047814, + "learning_rate": 1.9729044549004193e-07, + "loss": 0.78199029, + "num_input_tokens_seen": 155064745, + "step": 7166, + "time_per_iteration": 2.6244959831237793 + }, + { + "auxiliary_loss_clip": 0.01151823, + "auxiliary_loss_mlp": 0.0102373, + "balance_loss_clip": 1.04586828, + "balance_loss_mlp": 1.01632071, + "epoch": 0.8617807972103649, + "flos": 28911609020160.0, + "grad_norm": 1.5743700905377283, + "language_loss": 0.70229203, + "learning_rate": 1.9695322404199822e-07, + "loss": 0.72404754, + "num_input_tokens_seen": 155086790, + "step": 7167, + "time_per_iteration": 2.508148670196533 + }, + { + "auxiliary_loss_clip": 0.01140661, + "auxiliary_loss_mlp": 0.01027694, + "balance_loss_clip": 1.04638267, + "balance_loss_mlp": 1.01990926, + "epoch": 0.861901040101004, + "flos": 27673804391040.0, + "grad_norm": 1.9312615751617386, + "language_loss": 0.82280719, + "learning_rate": 1.9661627611471654e-07, + "loss": 0.84449077, + "num_input_tokens_seen": 155106585, + "step": 7168, + "time_per_iteration": 2.524948835372925 + }, + { + "auxiliary_loss_clip": 0.01145957, + "auxiliary_loss_mlp": 0.01021177, + "balance_loss_clip": 1.04464924, + "balance_loss_mlp": 1.01312482, + "epoch": 0.8620212829916432, + "flos": 49748056755840.0, + "grad_norm": 10.470037174644933, + "language_loss": 0.7011193, + "learning_rate": 1.9627960175931246e-07, + "loss": 0.72279072, + "num_input_tokens_seen": 155131285, + "step": 7169, + "time_per_iteration": 2.73283052444458 + }, + { + "auxiliary_loss_clip": 0.01155195, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.04855633, + "balance_loss_mlp": 1.02155948, + "epoch": 0.8621415258822822, + "flos": 21138672769920.0, + "grad_norm": 2.0695176549805505, + "language_loss": 0.74237061, + "learning_rate": 1.9594320102685847e-07, + "loss": 0.76420611, + "num_input_tokens_seen": 155150555, + "step": 7170, + "time_per_iteration": 2.4531359672546387 + }, + { + "auxiliary_loss_clip": 0.01129237, + "auxiliary_loss_mlp": 0.00761781, + "balance_loss_clip": 1.04195654, + "balance_loss_mlp": 1.0005939, + "epoch": 0.8622617687729213, + "flos": 21689039934720.0, + "grad_norm": 1.812907768089721, + "language_loss": 0.64231086, + "learning_rate": 1.956070739683864e-07, + "loss": 0.66122103, + "num_input_tokens_seen": 155169890, + "step": 7171, + "time_per_iteration": 2.494382858276367 + }, + { + "auxiliary_loss_clip": 0.01110619, + "auxiliary_loss_mlp": 0.01020327, + "balance_loss_clip": 1.0395565, + "balance_loss_mlp": 1.01289392, + "epoch": 0.8623820116635604, + "flos": 26250592734720.0, + "grad_norm": 1.870078796677333, + "language_loss": 0.74192846, + "learning_rate": 1.9527122063488678e-07, + "loss": 0.76323789, + "num_input_tokens_seen": 155191005, + "step": 7172, + "time_per_iteration": 2.5667150020599365 + }, + { + "auxiliary_loss_clip": 0.01134717, + "auxiliary_loss_mlp": 0.01020892, + "balance_loss_clip": 1.03921962, + "balance_loss_mlp": 1.01420164, + "epoch": 0.8625022545541995, + "flos": 19647554451840.0, + "grad_norm": 1.6654178172016643, + "language_loss": 0.80381, + "learning_rate": 1.9493564107730755e-07, + "loss": 0.82536608, + "num_input_tokens_seen": 155211005, + "step": 7173, + "time_per_iteration": 2.485018491744995 + }, + { + "auxiliary_loss_clip": 0.01133602, + "auxiliary_loss_mlp": 0.01024685, + "balance_loss_clip": 1.04060018, + "balance_loss_mlp": 1.01777112, + "epoch": 0.8626224974448385, + "flos": 21908382336000.0, + "grad_norm": 1.8758320886358502, + "language_loss": 0.60921741, + "learning_rate": 1.9460033534655684e-07, + "loss": 0.63080031, + "num_input_tokens_seen": 155230365, + "step": 7174, + "time_per_iteration": 2.4872629642486572 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01024064, + "balance_loss_clip": 1.03768349, + "balance_loss_mlp": 1.01679838, + "epoch": 0.8627427403354777, + "flos": 23331198942720.0, + "grad_norm": 1.5609312884836346, + "language_loss": 0.8427788, + "learning_rate": 1.9426530349349978e-07, + "loss": 0.86432648, + "num_input_tokens_seen": 155250815, + "step": 7175, + "time_per_iteration": 2.5262598991394043 + }, + { + "auxiliary_loss_clip": 0.01151482, + "auxiliary_loss_mlp": 0.00762005, + "balance_loss_clip": 1.04381597, + "balance_loss_mlp": 1.00066924, + "epoch": 0.8628629832261168, + "flos": 16362877299840.0, + "grad_norm": 1.9785662645411293, + "language_loss": 0.64773816, + "learning_rate": 1.9393054556896038e-07, + "loss": 0.66687298, + "num_input_tokens_seen": 155268515, + "step": 7176, + "time_per_iteration": 2.4438424110412598 + }, + { + "auxiliary_loss_clip": 0.01122323, + "auxiliary_loss_mlp": 0.01024079, + "balance_loss_clip": 1.04029489, + "balance_loss_mlp": 1.01634789, + "epoch": 0.8629832261167558, + "flos": 28103941756800.0, + "grad_norm": 2.361277886437131, + "language_loss": 0.69271123, + "learning_rate": 1.9359606162372133e-07, + "loss": 0.71417528, + "num_input_tokens_seen": 155290120, + "step": 7177, + "time_per_iteration": 2.5799570083618164 + }, + { + "auxiliary_loss_clip": 0.01166596, + "auxiliary_loss_mlp": 0.01022204, + "balance_loss_clip": 1.04736686, + "balance_loss_mlp": 1.0150212, + "epoch": 0.863103469007395, + "flos": 20230061310720.0, + "grad_norm": 1.73311269519003, + "language_loss": 0.70419252, + "learning_rate": 1.9326185170852293e-07, + "loss": 0.72608054, + "num_input_tokens_seen": 155309085, + "step": 7178, + "time_per_iteration": 2.4422030448913574 + }, + { + "auxiliary_loss_clip": 0.01151916, + "auxiliary_loss_mlp": 0.0102346, + "balance_loss_clip": 1.04497802, + "balance_loss_mlp": 1.01626885, + "epoch": 0.863223711898034, + "flos": 24498547044480.0, + "grad_norm": 2.297700355589306, + "language_loss": 0.72574514, + "learning_rate": 1.9292791587406598e-07, + "loss": 0.74749887, + "num_input_tokens_seen": 155327945, + "step": 7179, + "time_per_iteration": 2.52087664604187 + }, + { + "auxiliary_loss_clip": 0.01150682, + "auxiliary_loss_mlp": 0.00762293, + "balance_loss_clip": 1.04343057, + "balance_loss_mlp": 1.00063062, + "epoch": 0.8633439547886731, + "flos": 17675376261120.0, + "grad_norm": 2.133877451213358, + "language_loss": 0.87113881, + "learning_rate": 1.9259425417100661e-07, + "loss": 0.89026856, + "num_input_tokens_seen": 155344060, + "step": 7180, + "time_per_iteration": 2.417158603668213 + }, + { + "auxiliary_loss_clip": 0.0109166, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.03385425, + "balance_loss_mlp": 1.02052569, + "epoch": 0.8634641976793123, + "flos": 12895055677440.0, + "grad_norm": 2.355286251179499, + "language_loss": 0.74674714, + "learning_rate": 1.9226086664996234e-07, + "loss": 0.76794732, + "num_input_tokens_seen": 155362305, + "step": 7181, + "time_per_iteration": 2.5655789375305176 + }, + { + "auxiliary_loss_clip": 0.01143767, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.04758, + "balance_loss_mlp": 1.02245998, + "epoch": 0.8635844405699513, + "flos": 23878980328320.0, + "grad_norm": 2.0470197807698582, + "language_loss": 0.74353993, + "learning_rate": 1.9192775336150712e-07, + "loss": 0.76527536, + "num_input_tokens_seen": 155382605, + "step": 7182, + "time_per_iteration": 2.50984787940979 + }, + { + "auxiliary_loss_clip": 0.0104983, + "auxiliary_loss_mlp": 0.01000342, + "balance_loss_clip": 1.00775397, + "balance_loss_mlp": 0.99931127, + "epoch": 0.8637046834605904, + "flos": 60453387521280.0, + "grad_norm": 0.8522077305548875, + "language_loss": 0.56279421, + "learning_rate": 1.915949143561739e-07, + "loss": 0.58329594, + "num_input_tokens_seen": 155437280, + "step": 7183, + "time_per_iteration": 3.01050066947937 + }, + { + "auxiliary_loss_clip": 0.01153606, + "auxiliary_loss_mlp": 0.01026514, + "balance_loss_clip": 1.04668498, + "balance_loss_mlp": 1.01927161, + "epoch": 0.8638249263512295, + "flos": 20558751690240.0, + "grad_norm": 1.8913393247432477, + "language_loss": 0.77720582, + "learning_rate": 1.9126234968445498e-07, + "loss": 0.799007, + "num_input_tokens_seen": 155456970, + "step": 7184, + "time_per_iteration": 2.4586644172668457 + }, + { + "auxiliary_loss_clip": 0.01169958, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.04898381, + "balance_loss_mlp": 1.02048373, + "epoch": 0.8639451692418686, + "flos": 26615768353920.0, + "grad_norm": 1.5481305423364744, + "language_loss": 0.67736274, + "learning_rate": 1.9093005939679884e-07, + "loss": 0.69934297, + "num_input_tokens_seen": 155478925, + "step": 7185, + "time_per_iteration": 3.266057014465332 + }, + { + "auxiliary_loss_clip": 0.01153644, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.04582262, + "balance_loss_mlp": 1.02098691, + "epoch": 0.8640654121325076, + "flos": 15122450977920.0, + "grad_norm": 1.9486732351725584, + "language_loss": 0.76749265, + "learning_rate": 1.9059804354361452e-07, + "loss": 0.78931475, + "num_input_tokens_seen": 155496700, + "step": 7186, + "time_per_iteration": 3.215562343597412 + }, + { + "auxiliary_loss_clip": 0.01130694, + "auxiliary_loss_mlp": 0.0102212, + "balance_loss_clip": 1.03891611, + "balance_loss_mlp": 1.0144105, + "epoch": 0.8641856550231467, + "flos": 31869068250240.0, + "grad_norm": 1.5721149573912647, + "language_loss": 0.70277578, + "learning_rate": 1.902663021752684e-07, + "loss": 0.72430396, + "num_input_tokens_seen": 155518130, + "step": 7187, + "time_per_iteration": 2.566303014755249 + }, + { + "auxiliary_loss_clip": 0.01170906, + "auxiliary_loss_mlp": 0.01020551, + "balance_loss_clip": 1.04875827, + "balance_loss_mlp": 1.01363134, + "epoch": 0.8643058979137859, + "flos": 14976545932800.0, + "grad_norm": 2.5598939513476067, + "language_loss": 0.82129109, + "learning_rate": 1.8993483534208556e-07, + "loss": 0.84320569, + "num_input_tokens_seen": 155537040, + "step": 7188, + "time_per_iteration": 2.4011571407318115 + }, + { + "auxiliary_loss_clip": 0.01133079, + "auxiliary_loss_mlp": 0.01025072, + "balance_loss_clip": 1.04406118, + "balance_loss_mlp": 1.01723695, + "epoch": 0.8644261408044249, + "flos": 13115726881920.0, + "grad_norm": 2.4148332189386252, + "language_loss": 0.74538851, + "learning_rate": 1.8960364309434884e-07, + "loss": 0.7669701, + "num_input_tokens_seen": 155554535, + "step": 7189, + "time_per_iteration": 2.447122097015381 + }, + { + "auxiliary_loss_clip": 0.01092361, + "auxiliary_loss_mlp": 0.00761818, + "balance_loss_clip": 1.03795099, + "balance_loss_mlp": 1.00062752, + "epoch": 0.864546383695064, + "flos": 20850920916480.0, + "grad_norm": 2.5408335006769125, + "language_loss": 0.78345829, + "learning_rate": 1.8927272548229967e-07, + "loss": 0.80200005, + "num_input_tokens_seen": 155574225, + "step": 7190, + "time_per_iteration": 3.391019344329834 + }, + { + "auxiliary_loss_clip": 0.01112355, + "auxiliary_loss_mlp": 0.01025389, + "balance_loss_clip": 1.04165459, + "balance_loss_mlp": 1.01826882, + "epoch": 0.8646666265857031, + "flos": 21324582587520.0, + "grad_norm": 1.9106372749266585, + "language_loss": 0.82986778, + "learning_rate": 1.8894208255613876e-07, + "loss": 0.85124516, + "num_input_tokens_seen": 155593540, + "step": 7191, + "time_per_iteration": 2.6008009910583496 + }, + { + "auxiliary_loss_clip": 0.01167006, + "auxiliary_loss_mlp": 0.01022945, + "balance_loss_clip": 1.04747689, + "balance_loss_mlp": 1.01570868, + "epoch": 0.8647868694763422, + "flos": 19750833031680.0, + "grad_norm": 1.882976010484817, + "language_loss": 0.77663457, + "learning_rate": 1.8861171436602397e-07, + "loss": 0.79853404, + "num_input_tokens_seen": 155610655, + "step": 7192, + "time_per_iteration": 2.4546051025390625 + }, + { + "auxiliary_loss_clip": 0.01155542, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.04686308, + "balance_loss_mlp": 1.01997685, + "epoch": 0.8649071123669813, + "flos": 26176760328960.0, + "grad_norm": 2.1985067649791827, + "language_loss": 0.80360889, + "learning_rate": 1.882816209620719e-07, + "loss": 0.82544106, + "num_input_tokens_seen": 155627365, + "step": 7193, + "time_per_iteration": 2.4991962909698486 + }, + { + "auxiliary_loss_clip": 0.01143697, + "auxiliary_loss_mlp": 0.01025201, + "balance_loss_clip": 1.04863596, + "balance_loss_mlp": 1.01709461, + "epoch": 0.8650273552576204, + "flos": 20302888135680.0, + "grad_norm": 1.926467400979457, + "language_loss": 0.76695168, + "learning_rate": 1.8795180239435738e-07, + "loss": 0.78864062, + "num_input_tokens_seen": 155646220, + "step": 7194, + "time_per_iteration": 2.471895217895508 + }, + { + "auxiliary_loss_clip": 0.01145304, + "auxiliary_loss_mlp": 0.01025849, + "balance_loss_clip": 1.04530478, + "balance_loss_mlp": 1.01838982, + "epoch": 0.8651475981482595, + "flos": 23951088881280.0, + "grad_norm": 3.3744842081656112, + "language_loss": 0.75735509, + "learning_rate": 1.8762225871291348e-07, + "loss": 0.77906668, + "num_input_tokens_seen": 155662095, + "step": 7195, + "time_per_iteration": 2.5095112323760986 + }, + { + "auxiliary_loss_clip": 0.01168771, + "auxiliary_loss_mlp": 0.00761964, + "balance_loss_clip": 1.04750669, + "balance_loss_mlp": 1.00061941, + "epoch": 0.8652678410388985, + "flos": 21684622561920.0, + "grad_norm": 2.560520196077845, + "language_loss": 0.81004345, + "learning_rate": 1.8729298996773201e-07, + "loss": 0.82935083, + "num_input_tokens_seen": 155680845, + "step": 7196, + "time_per_iteration": 2.424546003341675 + }, + { + "auxiliary_loss_clip": 0.01047322, + "auxiliary_loss_mlp": 0.0100063, + "balance_loss_clip": 1.00653028, + "balance_loss_mlp": 0.9995808, + "epoch": 0.8653880839295377, + "flos": 65224660855680.0, + "grad_norm": 0.8345049884520256, + "language_loss": 0.60988533, + "learning_rate": 1.8696399620876301e-07, + "loss": 0.63036478, + "num_input_tokens_seen": 155737875, + "step": 7197, + "time_per_iteration": 3.0031089782714844 + }, + { + "auxiliary_loss_clip": 0.01122318, + "auxiliary_loss_mlp": 0.01024397, + "balance_loss_clip": 1.03859413, + "balance_loss_mlp": 1.01635599, + "epoch": 0.8655083268201768, + "flos": 17749172753280.0, + "grad_norm": 1.9465771434007244, + "language_loss": 0.79184473, + "learning_rate": 1.866352774859141e-07, + "loss": 0.81331193, + "num_input_tokens_seen": 155753100, + "step": 7198, + "time_per_iteration": 2.488027334213257 + }, + { + "auxiliary_loss_clip": 0.01128283, + "auxiliary_loss_mlp": 0.01022043, + "balance_loss_clip": 1.04095352, + "balance_loss_mlp": 1.01531363, + "epoch": 0.8656285697108158, + "flos": 20703974376960.0, + "grad_norm": 2.243267134415929, + "language_loss": 0.69813752, + "learning_rate": 1.8630683384905188e-07, + "loss": 0.71964073, + "num_input_tokens_seen": 155772430, + "step": 7199, + "time_per_iteration": 2.529702663421631 + }, + { + "auxiliary_loss_clip": 0.01170133, + "auxiliary_loss_mlp": 0.00762174, + "balance_loss_clip": 1.04976535, + "balance_loss_mlp": 1.00065517, + "epoch": 0.865748812601455, + "flos": 18653833716480.0, + "grad_norm": 3.2762271028283263, + "language_loss": 0.88699514, + "learning_rate": 1.8597866534800045e-07, + "loss": 0.90631819, + "num_input_tokens_seen": 155787545, + "step": 7200, + "time_per_iteration": 2.3877909183502197 + }, + { + "auxiliary_loss_clip": 0.01156943, + "auxiliary_loss_mlp": 0.00762523, + "balance_loss_clip": 1.04668844, + "balance_loss_mlp": 1.00064898, + "epoch": 0.865869055492094, + "flos": 70652554807680.0, + "grad_norm": 1.9440705561106788, + "language_loss": 0.74353778, + "learning_rate": 1.8565077203254398e-07, + "loss": 0.76273239, + "num_input_tokens_seen": 155813005, + "step": 7201, + "time_per_iteration": 2.8600401878356934 + }, + { + "auxiliary_loss_clip": 0.01126452, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.0471102, + "balance_loss_mlp": 1.0197885, + "epoch": 0.8659892983827331, + "flos": 17383961220480.0, + "grad_norm": 2.6251826327402283, + "language_loss": 0.72491628, + "learning_rate": 1.8532315395242203e-07, + "loss": 0.74645436, + "num_input_tokens_seen": 155829455, + "step": 7202, + "time_per_iteration": 2.4796769618988037 + }, + { + "auxiliary_loss_clip": 0.01127262, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.04180336, + "balance_loss_mlp": 1.01788998, + "epoch": 0.8661095412733723, + "flos": 17895221452800.0, + "grad_norm": 2.053432134599784, + "language_loss": 0.72031504, + "learning_rate": 1.849958111573353e-07, + "loss": 0.74183828, + "num_input_tokens_seen": 155848060, + "step": 7203, + "time_per_iteration": 2.4987759590148926 + }, + { + "auxiliary_loss_clip": 0.01164553, + "auxiliary_loss_mlp": 0.01022771, + "balance_loss_clip": 1.04559159, + "balance_loss_mlp": 1.01581836, + "epoch": 0.8662297841640113, + "flos": 18224163227520.0, + "grad_norm": 1.6519049626105675, + "language_loss": 0.64023733, + "learning_rate": 1.8466874369694074e-07, + "loss": 0.66211057, + "num_input_tokens_seen": 155865755, + "step": 7204, + "time_per_iteration": 2.4254636764526367 + }, + { + "auxiliary_loss_clip": 0.01125294, + "auxiliary_loss_mlp": 0.01026209, + "balance_loss_clip": 1.03886914, + "balance_loss_mlp": 1.0191046, + "epoch": 0.8663500270546504, + "flos": 16362159027840.0, + "grad_norm": 3.048458687644922, + "language_loss": 0.70169717, + "learning_rate": 1.843419516208542e-07, + "loss": 0.72321218, + "num_input_tokens_seen": 155882680, + "step": 7205, + "time_per_iteration": 2.4784743785858154 + }, + { + "auxiliary_loss_clip": 0.0115607, + "auxiliary_loss_mlp": 0.01025022, + "balance_loss_clip": 1.04867697, + "balance_loss_mlp": 1.01714849, + "epoch": 0.8664702699452895, + "flos": 17894431353600.0, + "grad_norm": 2.258971985648335, + "language_loss": 0.79345548, + "learning_rate": 1.8401543497865047e-07, + "loss": 0.81526637, + "num_input_tokens_seen": 155900680, + "step": 7206, + "time_per_iteration": 2.4223670959472656 + }, + { + "auxiliary_loss_clip": 0.01156193, + "auxiliary_loss_mlp": 0.00762194, + "balance_loss_clip": 1.04632843, + "balance_loss_mlp": 1.00056767, + "epoch": 0.8665905128359286, + "flos": 30736373794560.0, + "grad_norm": 2.1947268630474563, + "language_loss": 0.64264572, + "learning_rate": 1.836891938198608e-07, + "loss": 0.66182965, + "num_input_tokens_seen": 155921105, + "step": 7207, + "time_per_iteration": 2.5265614986419678 + }, + { + "auxiliary_loss_clip": 0.01140221, + "auxiliary_loss_mlp": 0.01026069, + "balance_loss_clip": 1.04625595, + "balance_loss_mlp": 1.01849604, + "epoch": 0.8667107557265676, + "flos": 18656419495680.0, + "grad_norm": 2.2844126662028312, + "language_loss": 0.71098113, + "learning_rate": 1.8336322819397677e-07, + "loss": 0.73264408, + "num_input_tokens_seen": 155938640, + "step": 7208, + "time_per_iteration": 2.4496958255767822 + }, + { + "auxiliary_loss_clip": 0.01127663, + "auxiliary_loss_mlp": 0.01027565, + "balance_loss_clip": 1.0388937, + "balance_loss_mlp": 1.02026665, + "epoch": 0.8668309986172068, + "flos": 20083725302400.0, + "grad_norm": 2.0300094630321794, + "language_loss": 0.62650889, + "learning_rate": 1.8303753815044654e-07, + "loss": 0.64806116, + "num_input_tokens_seen": 155957945, + "step": 7209, + "time_per_iteration": 2.519683599472046 + }, + { + "auxiliary_loss_clip": 0.01148318, + "auxiliary_loss_mlp": 0.01023286, + "balance_loss_clip": 1.04395032, + "balance_loss_mlp": 1.01515639, + "epoch": 0.8669512415078459, + "flos": 21615099788160.0, + "grad_norm": 2.112979303122484, + "language_loss": 0.70235336, + "learning_rate": 1.827121237386773e-07, + "loss": 0.72406936, + "num_input_tokens_seen": 155975390, + "step": 7210, + "time_per_iteration": 2.484200954437256 + }, + { + "auxiliary_loss_clip": 0.01142802, + "auxiliary_loss_mlp": 0.01027039, + "balance_loss_clip": 1.04394484, + "balance_loss_mlp": 1.01957643, + "epoch": 0.8670714843984849, + "flos": 17703601372800.0, + "grad_norm": 2.238611612456096, + "language_loss": 0.75005591, + "learning_rate": 1.8238698500803374e-07, + "loss": 0.77175438, + "num_input_tokens_seen": 155988155, + "step": 7211, + "time_per_iteration": 3.2145442962646484 + }, + { + "auxiliary_loss_clip": 0.0105495, + "auxiliary_loss_mlp": 0.01000767, + "balance_loss_clip": 1.00769067, + "balance_loss_mlp": 0.99971181, + "epoch": 0.8671917272891241, + "flos": 60705483125760.0, + "grad_norm": 0.7552687383376634, + "language_loss": 0.56299829, + "learning_rate": 1.820621220078391e-07, + "loss": 0.58355546, + "num_input_tokens_seen": 156052065, + "step": 7212, + "time_per_iteration": 3.109221935272217 + }, + { + "auxiliary_loss_clip": 0.01167187, + "auxiliary_loss_mlp": 0.01020555, + "balance_loss_clip": 1.0464735, + "balance_loss_mlp": 1.01320553, + "epoch": 0.8673119701797631, + "flos": 20451881750400.0, + "grad_norm": 1.6419826277036824, + "language_loss": 0.67727172, + "learning_rate": 1.8173753478737553e-07, + "loss": 0.69914913, + "num_input_tokens_seen": 156072500, + "step": 7213, + "time_per_iteration": 3.237321376800537 + }, + { + "auxiliary_loss_clip": 0.01171234, + "auxiliary_loss_mlp": 0.01028033, + "balance_loss_clip": 1.04818594, + "balance_loss_mlp": 1.02044225, + "epoch": 0.8674322130704022, + "flos": 19647410797440.0, + "grad_norm": 2.059081566775878, + "language_loss": 0.79411089, + "learning_rate": 1.8141322339588205e-07, + "loss": 0.81610358, + "num_input_tokens_seen": 156089840, + "step": 7214, + "time_per_iteration": 2.416290044784546 + }, + { + "auxiliary_loss_clip": 0.01168233, + "auxiliary_loss_mlp": 0.01026484, + "balance_loss_clip": 1.04833937, + "balance_loss_mlp": 1.01922989, + "epoch": 0.8675524559610414, + "flos": 26025001367040.0, + "grad_norm": 2.0283827979583178, + "language_loss": 0.70222461, + "learning_rate": 1.810891878825569e-07, + "loss": 0.72417176, + "num_input_tokens_seen": 156109815, + "step": 7215, + "time_per_iteration": 2.4516968727111816 + }, + { + "auxiliary_loss_clip": 0.0113738, + "auxiliary_loss_mlp": 0.01021907, + "balance_loss_clip": 1.04092765, + "balance_loss_mlp": 1.01443899, + "epoch": 0.8676726988516804, + "flos": 15049444584960.0, + "grad_norm": 2.2652035408165365, + "language_loss": 0.71761483, + "learning_rate": 1.8076542829655561e-07, + "loss": 0.73920774, + "num_input_tokens_seen": 156128620, + "step": 7216, + "time_per_iteration": 2.47200083732605 + }, + { + "auxiliary_loss_clip": 0.01140282, + "auxiliary_loss_mlp": 0.01024472, + "balance_loss_clip": 1.04566145, + "balance_loss_mlp": 1.01640773, + "epoch": 0.8677929417423195, + "flos": 16288111140480.0, + "grad_norm": 1.9972244975416538, + "language_loss": 0.79336822, + "learning_rate": 1.8044194468699203e-07, + "loss": 0.81501579, + "num_input_tokens_seen": 156145930, + "step": 7217, + "time_per_iteration": 3.271275520324707 + }, + { + "auxiliary_loss_clip": 0.01136743, + "auxiliary_loss_mlp": 0.01024338, + "balance_loss_clip": 1.04609537, + "balance_loss_mlp": 1.01707482, + "epoch": 0.8679131846329585, + "flos": 18844160906880.0, + "grad_norm": 3.204844356544063, + "language_loss": 0.75669336, + "learning_rate": 1.8011873710293912e-07, + "loss": 0.77830416, + "num_input_tokens_seen": 156164435, + "step": 7218, + "time_per_iteration": 2.4763712882995605 + }, + { + "auxiliary_loss_clip": 0.0115031, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.04595399, + "balance_loss_mlp": 1.01845837, + "epoch": 0.8680334275235977, + "flos": 33620718890880.0, + "grad_norm": 2.3379526842627825, + "language_loss": 0.69524056, + "learning_rate": 1.7979580559342677e-07, + "loss": 0.71700358, + "num_input_tokens_seen": 156185165, + "step": 7219, + "time_per_iteration": 2.5571913719177246 + }, + { + "auxiliary_loss_clip": 0.01139161, + "auxiliary_loss_mlp": 0.01024123, + "balance_loss_clip": 1.04527986, + "balance_loss_mlp": 1.01679218, + "epoch": 0.8681536704142367, + "flos": 24681152810880.0, + "grad_norm": 1.6118718054683352, + "language_loss": 0.66382933, + "learning_rate": 1.7947315020744358e-07, + "loss": 0.68546212, + "num_input_tokens_seen": 156206260, + "step": 7220, + "time_per_iteration": 2.5165181159973145 + }, + { + "auxiliary_loss_clip": 0.01138238, + "auxiliary_loss_mlp": 0.01020971, + "balance_loss_clip": 1.04331064, + "balance_loss_mlp": 1.01397038, + "epoch": 0.8682739133048758, + "flos": 20011042131840.0, + "grad_norm": 1.8354539237941427, + "language_loss": 0.80140448, + "learning_rate": 1.7915077099393594e-07, + "loss": 0.8229965, + "num_input_tokens_seen": 156222860, + "step": 7221, + "time_per_iteration": 2.464486837387085 + }, + { + "auxiliary_loss_clip": 0.01154688, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.04333603, + "balance_loss_mlp": 1.01989198, + "epoch": 0.868394156195515, + "flos": 16654759217280.0, + "grad_norm": 1.796164580789187, + "language_loss": 0.73360479, + "learning_rate": 1.788286680018083e-07, + "loss": 0.75542742, + "num_input_tokens_seen": 156241570, + "step": 7222, + "time_per_iteration": 2.430434226989746 + }, + { + "auxiliary_loss_clip": 0.0114492, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.0449028, + "balance_loss_mlp": 1.01841843, + "epoch": 0.868514399086154, + "flos": 28001381448960.0, + "grad_norm": 1.5591288147340288, + "language_loss": 0.7263878, + "learning_rate": 1.7850684127992443e-07, + "loss": 0.74809098, + "num_input_tokens_seen": 156261315, + "step": 7223, + "time_per_iteration": 2.543583631515503 + }, + { + "auxiliary_loss_clip": 0.01125911, + "auxiliary_loss_mlp": 0.01024945, + "balance_loss_clip": 1.04527569, + "balance_loss_mlp": 1.01763165, + "epoch": 0.8686346419767931, + "flos": 20084587228800.0, + "grad_norm": 1.8495102429282817, + "language_loss": 0.70343494, + "learning_rate": 1.7818529087710378e-07, + "loss": 0.72494352, + "num_input_tokens_seen": 156281670, + "step": 7224, + "time_per_iteration": 2.5334997177124023 + }, + { + "auxiliary_loss_clip": 0.01150934, + "auxiliary_loss_mlp": 0.00762209, + "balance_loss_clip": 1.04359138, + "balance_loss_mlp": 1.00061905, + "epoch": 0.8687548848674322, + "flos": 18223516782720.0, + "grad_norm": 1.6765916963725427, + "language_loss": 0.84227264, + "learning_rate": 1.7786401684212637e-07, + "loss": 0.86140406, + "num_input_tokens_seen": 156300500, + "step": 7225, + "time_per_iteration": 2.4642088413238525 + }, + { + "auxiliary_loss_clip": 0.01031113, + "auxiliary_loss_mlp": 0.01005443, + "balance_loss_clip": 1.00916755, + "balance_loss_mlp": 1.00448918, + "epoch": 0.8688751277580713, + "flos": 70457885049600.0, + "grad_norm": 0.7293799676000451, + "language_loss": 0.5594396, + "learning_rate": 1.7754301922372883e-07, + "loss": 0.57980514, + "num_input_tokens_seen": 156350145, + "step": 7226, + "time_per_iteration": 2.9419915676116943 + }, + { + "auxiliary_loss_clip": 0.01102266, + "auxiliary_loss_mlp": 0.01026546, + "balance_loss_clip": 1.0398376, + "balance_loss_mlp": 1.01867795, + "epoch": 0.8689953706487104, + "flos": 26906788344960.0, + "grad_norm": 1.8639215274395349, + "language_loss": 0.80787373, + "learning_rate": 1.7722229807060617e-07, + "loss": 0.82916188, + "num_input_tokens_seen": 156368725, + "step": 7227, + "time_per_iteration": 2.612855911254883 + }, + { + "auxiliary_loss_clip": 0.01114729, + "auxiliary_loss_mlp": 0.01022909, + "balance_loss_clip": 1.03871822, + "balance_loss_mlp": 1.01573884, + "epoch": 0.8691156135393495, + "flos": 34637385438720.0, + "grad_norm": 2.0310136459579957, + "language_loss": 0.81725407, + "learning_rate": 1.7690185343141172e-07, + "loss": 0.83863044, + "num_input_tokens_seen": 156388640, + "step": 7228, + "time_per_iteration": 2.627864122390747 + }, + { + "auxiliary_loss_clip": 0.01138626, + "auxiliary_loss_mlp": 0.01021877, + "balance_loss_clip": 1.04212224, + "balance_loss_mlp": 1.01507926, + "epoch": 0.8692358564299886, + "flos": 18989814556800.0, + "grad_norm": 2.08386469961784, + "language_loss": 0.69734257, + "learning_rate": 1.7658168535475615e-07, + "loss": 0.71894759, + "num_input_tokens_seen": 156406425, + "step": 7229, + "time_per_iteration": 2.505333185195923 + }, + { + "auxiliary_loss_clip": 0.01145038, + "auxiliary_loss_mlp": 0.01028085, + "balance_loss_clip": 1.04608059, + "balance_loss_mlp": 1.02033615, + "epoch": 0.8693560993206276, + "flos": 30370839039360.0, + "grad_norm": 1.6038040133472549, + "language_loss": 0.64287859, + "learning_rate": 1.7626179388920948e-07, + "loss": 0.66460985, + "num_input_tokens_seen": 156427705, + "step": 7230, + "time_per_iteration": 2.550971031188965 + }, + { + "auxiliary_loss_clip": 0.01141073, + "auxiliary_loss_mlp": 0.00761999, + "balance_loss_clip": 1.04553413, + "balance_loss_mlp": 1.00060308, + "epoch": 0.8694763422112668, + "flos": 27200430028800.0, + "grad_norm": 1.6111708289997166, + "language_loss": 0.80495405, + "learning_rate": 1.7594217908329866e-07, + "loss": 0.82398474, + "num_input_tokens_seen": 156449890, + "step": 7231, + "time_per_iteration": 2.5555942058563232 + }, + { + "auxiliary_loss_clip": 0.01131775, + "auxiliary_loss_mlp": 0.01022429, + "balance_loss_clip": 1.04393005, + "balance_loss_mlp": 1.01540136, + "epoch": 0.8695965851019059, + "flos": 26139161767680.0, + "grad_norm": 4.389278943085361, + "language_loss": 0.74075276, + "learning_rate": 1.7562284098550895e-07, + "loss": 0.76229477, + "num_input_tokens_seen": 156469600, + "step": 7232, + "time_per_iteration": 2.5398471355438232 + }, + { + "auxiliary_loss_clip": 0.01040465, + "auxiliary_loss_mlp": 0.01003213, + "balance_loss_clip": 1.0127759, + "balance_loss_mlp": 1.0023483, + "epoch": 0.8697168279925449, + "flos": 67332616456320.0, + "grad_norm": 0.8402372236546429, + "language_loss": 0.62286258, + "learning_rate": 1.753037796442838e-07, + "loss": 0.64329934, + "num_input_tokens_seen": 156529040, + "step": 7233, + "time_per_iteration": 3.0574426651000977 + }, + { + "auxiliary_loss_clip": 0.01168088, + "auxiliary_loss_mlp": 0.01023182, + "balance_loss_clip": 1.04637957, + "balance_loss_mlp": 1.0152303, + "epoch": 0.8698370708831841, + "flos": 19718693337600.0, + "grad_norm": 2.1905172629875485, + "language_loss": 0.75139672, + "learning_rate": 1.74984995108024e-07, + "loss": 0.77330947, + "num_input_tokens_seen": 156546970, + "step": 7234, + "time_per_iteration": 2.4121339321136475 + }, + { + "auxiliary_loss_clip": 0.01155883, + "auxiliary_loss_mlp": 0.01023336, + "balance_loss_clip": 1.04569602, + "balance_loss_mlp": 1.0161984, + "epoch": 0.8699573137738231, + "flos": 12859971068160.0, + "grad_norm": 2.1105725048963304, + "language_loss": 0.83151925, + "learning_rate": 1.7466648742508981e-07, + "loss": 0.85331142, + "num_input_tokens_seen": 156563155, + "step": 7235, + "time_per_iteration": 2.42167329788208 + }, + { + "auxiliary_loss_clip": 0.01138886, + "auxiliary_loss_mlp": 0.01029059, + "balance_loss_clip": 1.0457921, + "balance_loss_mlp": 1.02127445, + "epoch": 0.8700775566644622, + "flos": 17420733768960.0, + "grad_norm": 1.8515066783234584, + "language_loss": 0.84482408, + "learning_rate": 1.7434825664379837e-07, + "loss": 0.86650354, + "num_input_tokens_seen": 156581660, + "step": 7236, + "time_per_iteration": 2.4634251594543457 + }, + { + "auxiliary_loss_clip": 0.01154889, + "auxiliary_loss_mlp": 0.01021953, + "balance_loss_clip": 1.04564297, + "balance_loss_mlp": 1.01427615, + "epoch": 0.8701977995551013, + "flos": 13735221770880.0, + "grad_norm": 2.8926282511387047, + "language_loss": 0.85692137, + "learning_rate": 1.740303028124246e-07, + "loss": 0.87868983, + "num_input_tokens_seen": 156597720, + "step": 7237, + "time_per_iteration": 2.4643502235412598 + }, + { + "auxiliary_loss_clip": 0.01087119, + "auxiliary_loss_mlp": 0.01024332, + "balance_loss_clip": 1.03642845, + "balance_loss_mlp": 1.01699734, + "epoch": 0.8703180424457404, + "flos": 30555707362560.0, + "grad_norm": 1.900785771780783, + "language_loss": 0.75429654, + "learning_rate": 1.7371262597920212e-07, + "loss": 0.77541113, + "num_input_tokens_seen": 156619780, + "step": 7238, + "time_per_iteration": 3.3717408180236816 + }, + { + "auxiliary_loss_clip": 0.01111113, + "auxiliary_loss_mlp": 0.01031573, + "balance_loss_clip": 1.04516482, + "balance_loss_mlp": 1.02400637, + "epoch": 0.8704382853363795, + "flos": 19608986223360.0, + "grad_norm": 1.488872093088757, + "language_loss": 0.76060796, + "learning_rate": 1.7339522619232195e-07, + "loss": 0.78203487, + "num_input_tokens_seen": 156638160, + "step": 7239, + "time_per_iteration": 2.5523416996002197 + }, + { + "auxiliary_loss_clip": 0.01145969, + "auxiliary_loss_mlp": 0.01027515, + "balance_loss_clip": 1.0425998, + "balance_loss_mlp": 1.01958752, + "epoch": 0.8705585282270186, + "flos": 26613900846720.0, + "grad_norm": 3.023764093603866, + "language_loss": 0.75570363, + "learning_rate": 1.730781034999338e-07, + "loss": 0.77743846, + "num_input_tokens_seen": 156659740, + "step": 7240, + "time_per_iteration": 3.3144447803497314 + }, + { + "auxiliary_loss_clip": 0.01166211, + "auxiliary_loss_mlp": 0.01024938, + "balance_loss_clip": 1.04952669, + "balance_loss_mlp": 1.01770198, + "epoch": 0.8706787711176577, + "flos": 34090465979520.0, + "grad_norm": 1.6704564431310251, + "language_loss": 0.73361278, + "learning_rate": 1.7276125795014497e-07, + "loss": 0.75552428, + "num_input_tokens_seen": 156678190, + "step": 7241, + "time_per_iteration": 2.593013048171997 + }, + { + "auxiliary_loss_clip": 0.0114192, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.04193306, + "balance_loss_mlp": 1.01882815, + "epoch": 0.8707990140082967, + "flos": 14611513968000.0, + "grad_norm": 2.0603969652418743, + "language_loss": 0.67072302, + "learning_rate": 1.7244468959102054e-07, + "loss": 0.69240606, + "num_input_tokens_seen": 156695245, + "step": 7242, + "time_per_iteration": 2.579695224761963 + }, + { + "auxiliary_loss_clip": 0.0115565, + "auxiliary_loss_mlp": 0.01021283, + "balance_loss_clip": 1.04834664, + "balance_loss_mlp": 1.01358807, + "epoch": 0.8709192568989359, + "flos": 20084156265600.0, + "grad_norm": 2.427425366523876, + "language_loss": 0.8495096, + "learning_rate": 1.7212839847058348e-07, + "loss": 0.87127888, + "num_input_tokens_seen": 156710375, + "step": 7243, + "time_per_iteration": 3.218540906906128 + }, + { + "auxiliary_loss_clip": 0.01104192, + "auxiliary_loss_mlp": 0.01021716, + "balance_loss_clip": 1.03831434, + "balance_loss_mlp": 1.01515675, + "epoch": 0.871039499789575, + "flos": 16727083251840.0, + "grad_norm": 3.4130785165111317, + "language_loss": 0.73784888, + "learning_rate": 1.718123846368147e-07, + "loss": 0.75910795, + "num_input_tokens_seen": 156729420, + "step": 7244, + "time_per_iteration": 2.589989423751831 + }, + { + "auxiliary_loss_clip": 0.01141071, + "auxiliary_loss_mlp": 0.00761774, + "balance_loss_clip": 1.04740131, + "balance_loss_mlp": 1.00062656, + "epoch": 0.871159742680214, + "flos": 21068790860160.0, + "grad_norm": 1.790644179809158, + "language_loss": 0.71783817, + "learning_rate": 1.714966481376543e-07, + "loss": 0.73686659, + "num_input_tokens_seen": 156746100, + "step": 7245, + "time_per_iteration": 2.498697280883789 + }, + { + "auxiliary_loss_clip": 0.01152972, + "auxiliary_loss_mlp": 0.0102606, + "balance_loss_clip": 1.04457426, + "balance_loss_mlp": 1.01878822, + "epoch": 0.8712799855708532, + "flos": 28256526731520.0, + "grad_norm": 3.1455435941059604, + "language_loss": 0.82881308, + "learning_rate": 1.7118118902099797e-07, + "loss": 0.85060334, + "num_input_tokens_seen": 156764185, + "step": 7246, + "time_per_iteration": 2.5038108825683594 + }, + { + "auxiliary_loss_clip": 0.01154269, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.04594791, + "balance_loss_mlp": 1.02204633, + "epoch": 0.8714002284614922, + "flos": 22236677665920.0, + "grad_norm": 1.8254458169885968, + "language_loss": 0.80311453, + "learning_rate": 1.7086600733470146e-07, + "loss": 0.82495207, + "num_input_tokens_seen": 156784855, + "step": 7247, + "time_per_iteration": 2.4574482440948486 + }, + { + "auxiliary_loss_clip": 0.01150793, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.04558468, + "balance_loss_mlp": 1.0206306, + "epoch": 0.8715204713521313, + "flos": 21431919404160.0, + "grad_norm": 1.783111827542661, + "language_loss": 0.76731271, + "learning_rate": 1.7055110312657738e-07, + "loss": 0.78909677, + "num_input_tokens_seen": 156804350, + "step": 7248, + "time_per_iteration": 2.4635331630706787 + }, + { + "auxiliary_loss_clip": 0.01134678, + "auxiliary_loss_mlp": 0.01025417, + "balance_loss_clip": 1.04478955, + "balance_loss_mlp": 1.01753736, + "epoch": 0.8716407142427703, + "flos": 23440439180160.0, + "grad_norm": 2.2883811392494535, + "language_loss": 0.74100697, + "learning_rate": 1.702364764443962e-07, + "loss": 0.76260793, + "num_input_tokens_seen": 156823425, + "step": 7249, + "time_per_iteration": 2.505491256713867 + }, + { + "auxiliary_loss_clip": 0.01091494, + "auxiliary_loss_mlp": 0.01022315, + "balance_loss_clip": 1.03674853, + "balance_loss_mlp": 1.01437879, + "epoch": 0.8717609571334095, + "flos": 27958683156480.0, + "grad_norm": 2.4482292660379925, + "language_loss": 0.72551775, + "learning_rate": 1.6992212733588685e-07, + "loss": 0.74665582, + "num_input_tokens_seen": 156843090, + "step": 7250, + "time_per_iteration": 2.647191047668457 + }, + { + "auxiliary_loss_clip": 0.0113688, + "auxiliary_loss_mlp": 0.01026501, + "balance_loss_clip": 1.04372287, + "balance_loss_mlp": 1.01924062, + "epoch": 0.8718812000240486, + "flos": 25479482538240.0, + "grad_norm": 1.8678970736169211, + "language_loss": 0.74597335, + "learning_rate": 1.6960805584873538e-07, + "loss": 0.76760715, + "num_input_tokens_seen": 156861090, + "step": 7251, + "time_per_iteration": 2.5255050659179688 + }, + { + "auxiliary_loss_clip": 0.01112584, + "auxiliary_loss_mlp": 0.01023414, + "balance_loss_clip": 1.03892255, + "balance_loss_mlp": 1.01629972, + "epoch": 0.8720014429146876, + "flos": 23403056100480.0, + "grad_norm": 1.5842692537467116, + "language_loss": 0.78299236, + "learning_rate": 1.6929426203058684e-07, + "loss": 0.80435228, + "num_input_tokens_seen": 156881515, + "step": 7252, + "time_per_iteration": 2.573591709136963 + }, + { + "auxiliary_loss_clip": 0.01170709, + "auxiliary_loss_mlp": 0.00762634, + "balance_loss_clip": 1.04586446, + "balance_loss_mlp": 1.00054526, + "epoch": 0.8721216858053268, + "flos": 24352821567360.0, + "grad_norm": 2.545401192652501, + "language_loss": 0.80146092, + "learning_rate": 1.689807459290431e-07, + "loss": 0.82079434, + "num_input_tokens_seen": 156900170, + "step": 7253, + "time_per_iteration": 2.4598841667175293 + }, + { + "auxiliary_loss_clip": 0.01139015, + "auxiliary_loss_mlp": 0.01025362, + "balance_loss_clip": 1.04374874, + "balance_loss_mlp": 1.01867449, + "epoch": 0.8722419286959658, + "flos": 33869687034240.0, + "grad_norm": 4.785571639840565, + "language_loss": 0.70792472, + "learning_rate": 1.6866750759166437e-07, + "loss": 0.72956854, + "num_input_tokens_seen": 156920150, + "step": 7254, + "time_per_iteration": 2.6331443786621094 + }, + { + "auxiliary_loss_clip": 0.01119434, + "auxiliary_loss_mlp": 0.01021898, + "balance_loss_clip": 1.0390625, + "balance_loss_mlp": 1.01425028, + "epoch": 0.8723621715866049, + "flos": 18369385914240.0, + "grad_norm": 2.1867455364461255, + "language_loss": 0.77552652, + "learning_rate": 1.6835454706596865e-07, + "loss": 0.79693985, + "num_input_tokens_seen": 156937980, + "step": 7255, + "time_per_iteration": 2.503242015838623 + }, + { + "auxiliary_loss_clip": 0.01169735, + "auxiliary_loss_mlp": 0.01026245, + "balance_loss_clip": 1.04869366, + "balance_loss_mlp": 1.01865721, + "epoch": 0.8724824144772441, + "flos": 22013348855040.0, + "grad_norm": 1.8002761308910535, + "language_loss": 0.73818541, + "learning_rate": 1.680418643994317e-07, + "loss": 0.76014519, + "num_input_tokens_seen": 156956550, + "step": 7256, + "time_per_iteration": 2.427302360534668 + }, + { + "auxiliary_loss_clip": 0.01063989, + "auxiliary_loss_mlp": 0.01000558, + "balance_loss_clip": 1.00806367, + "balance_loss_mlp": 0.99956876, + "epoch": 0.8726026573678831, + "flos": 66698720213760.0, + "grad_norm": 0.8885413832190315, + "language_loss": 0.64525819, + "learning_rate": 1.6772945963948738e-07, + "loss": 0.66590369, + "num_input_tokens_seen": 157014715, + "step": 7257, + "time_per_iteration": 3.0272269248962402 + }, + { + "auxiliary_loss_clip": 0.01135954, + "auxiliary_loss_mlp": 0.01025014, + "balance_loss_clip": 1.04455626, + "balance_loss_mlp": 1.01785803, + "epoch": 0.8727229002585222, + "flos": 13370908078080.0, + "grad_norm": 8.26715674683408, + "language_loss": 0.77039188, + "learning_rate": 1.6741733283352733e-07, + "loss": 0.79200161, + "num_input_tokens_seen": 157032320, + "step": 7258, + "time_per_iteration": 2.4531474113464355 + }, + { + "auxiliary_loss_clip": 0.01118904, + "auxiliary_loss_mlp": 0.0102646, + "balance_loss_clip": 1.04329729, + "balance_loss_mlp": 1.01923585, + "epoch": 0.8728431431491613, + "flos": 21796987282560.0, + "grad_norm": 1.567524982783893, + "language_loss": 0.83831143, + "learning_rate": 1.6710548402890102e-07, + "loss": 0.85976505, + "num_input_tokens_seen": 157052845, + "step": 7259, + "time_per_iteration": 2.542407989501953 + }, + { + "auxiliary_loss_clip": 0.01171615, + "auxiliary_loss_mlp": 0.01022685, + "balance_loss_clip": 1.04750514, + "balance_loss_mlp": 1.01505542, + "epoch": 0.8729633860398004, + "flos": 36173823742080.0, + "grad_norm": 1.9567450235194663, + "language_loss": 0.66541606, + "learning_rate": 1.6679391327291527e-07, + "loss": 0.68735909, + "num_input_tokens_seen": 157074050, + "step": 7260, + "time_per_iteration": 2.5676631927490234 + }, + { + "auxiliary_loss_clip": 0.01138491, + "auxiliary_loss_mlp": 0.01026156, + "balance_loss_clip": 1.0417304, + "balance_loss_mlp": 1.01903939, + "epoch": 0.8730836289304394, + "flos": 16359680989440.0, + "grad_norm": 2.7022682435056304, + "language_loss": 0.68303096, + "learning_rate": 1.6648262061283492e-07, + "loss": 0.70467746, + "num_input_tokens_seen": 157089350, + "step": 7261, + "time_per_iteration": 2.437488079071045 + }, + { + "auxiliary_loss_clip": 0.01125633, + "auxiliary_loss_mlp": 0.01024484, + "balance_loss_clip": 1.04018819, + "balance_loss_mlp": 1.01764774, + "epoch": 0.8732038718210786, + "flos": 21215126868480.0, + "grad_norm": 2.0388537219437586, + "language_loss": 0.73522091, + "learning_rate": 1.6617160609588353e-07, + "loss": 0.75672209, + "num_input_tokens_seen": 157108525, + "step": 7262, + "time_per_iteration": 2.5025341510772705 + }, + { + "auxiliary_loss_clip": 0.01145148, + "auxiliary_loss_mlp": 0.01024093, + "balance_loss_clip": 1.04560935, + "balance_loss_mlp": 1.01627934, + "epoch": 0.8733241147117177, + "flos": 16610696208000.0, + "grad_norm": 2.5270497430868564, + "language_loss": 0.71992147, + "learning_rate": 1.6586086976924163e-07, + "loss": 0.74161386, + "num_input_tokens_seen": 157124025, + "step": 7263, + "time_per_iteration": 2.435153007507324 + }, + { + "auxiliary_loss_clip": 0.0115355, + "auxiliary_loss_mlp": 0.01023723, + "balance_loss_clip": 1.04424787, + "balance_loss_mlp": 1.01665115, + "epoch": 0.8734443576023567, + "flos": 20193935207040.0, + "grad_norm": 1.8578342101097685, + "language_loss": 0.78419983, + "learning_rate": 1.6555041168004747e-07, + "loss": 0.80597258, + "num_input_tokens_seen": 157143345, + "step": 7264, + "time_per_iteration": 3.2527129650115967 + }, + { + "auxiliary_loss_clip": 0.01133772, + "auxiliary_loss_mlp": 0.01021843, + "balance_loss_clip": 1.04273403, + "balance_loss_mlp": 1.01497388, + "epoch": 0.8735646004929959, + "flos": 18041162411520.0, + "grad_norm": 1.6611499422899898, + "language_loss": 0.68632704, + "learning_rate": 1.6524023187539715e-07, + "loss": 0.70788318, + "num_input_tokens_seen": 157161630, + "step": 7265, + "time_per_iteration": 2.47082781791687 + }, + { + "auxiliary_loss_clip": 0.01142128, + "auxiliary_loss_mlp": 0.01022317, + "balance_loss_clip": 1.04496288, + "balance_loss_mlp": 1.01527214, + "epoch": 0.873684843383635, + "flos": 20262344659200.0, + "grad_norm": 2.2968641027434518, + "language_loss": 0.74436069, + "learning_rate": 1.649303304023446e-07, + "loss": 0.76600516, + "num_input_tokens_seen": 157181385, + "step": 7266, + "time_per_iteration": 3.305288314819336 + }, + { + "auxiliary_loss_clip": 0.01120678, + "auxiliary_loss_mlp": 0.01023647, + "balance_loss_clip": 1.04416847, + "balance_loss_mlp": 1.0163666, + "epoch": 0.873805086274274, + "flos": 16947287579520.0, + "grad_norm": 3.0036350371295293, + "language_loss": 0.78639388, + "learning_rate": 1.6462070730790246e-07, + "loss": 0.80783713, + "num_input_tokens_seen": 157200545, + "step": 7267, + "time_per_iteration": 2.501474618911743 + }, + { + "auxiliary_loss_clip": 0.01134699, + "auxiliary_loss_mlp": 0.01024196, + "balance_loss_clip": 1.04051161, + "balance_loss_mlp": 1.0166682, + "epoch": 0.8739253291649132, + "flos": 18041270152320.0, + "grad_norm": 2.514821978980379, + "language_loss": 0.78040469, + "learning_rate": 1.6431136263903912e-07, + "loss": 0.80199361, + "num_input_tokens_seen": 157219545, + "step": 7268, + "time_per_iteration": 2.464240074157715 + }, + { + "auxiliary_loss_clip": 0.01156211, + "auxiliary_loss_mlp": 0.00761818, + "balance_loss_clip": 1.04383969, + "balance_loss_mlp": 1.00059974, + "epoch": 0.8740455720555522, + "flos": 21325085377920.0, + "grad_norm": 1.825819095443998, + "language_loss": 0.73340523, + "learning_rate": 1.6400229644268282e-07, + "loss": 0.75258553, + "num_input_tokens_seen": 157237900, + "step": 7269, + "time_per_iteration": 3.2037551403045654 + }, + { + "auxiliary_loss_clip": 0.01117793, + "auxiliary_loss_mlp": 0.01025988, + "balance_loss_clip": 1.04431319, + "balance_loss_mlp": 1.01808405, + "epoch": 0.8741658149461913, + "flos": 15158684822400.0, + "grad_norm": 2.022641333291837, + "language_loss": 0.81003594, + "learning_rate": 1.6369350876571852e-07, + "loss": 0.83147377, + "num_input_tokens_seen": 157256055, + "step": 7270, + "time_per_iteration": 3.2486302852630615 + }, + { + "auxiliary_loss_clip": 0.01106386, + "auxiliary_loss_mlp": 0.0102204, + "balance_loss_clip": 1.03912699, + "balance_loss_mlp": 1.01474416, + "epoch": 0.8742860578368304, + "flos": 23039855729280.0, + "grad_norm": 2.0023778109048473, + "language_loss": 0.81363308, + "learning_rate": 1.6338499965498874e-07, + "loss": 0.83491743, + "num_input_tokens_seen": 157274785, + "step": 7271, + "time_per_iteration": 2.5870137214660645 + }, + { + "auxiliary_loss_clip": 0.0111934, + "auxiliary_loss_mlp": 0.01027457, + "balance_loss_clip": 1.04129171, + "balance_loss_mlp": 1.01979494, + "epoch": 0.8744063007274695, + "flos": 28145347159680.0, + "grad_norm": 1.6776112866578337, + "language_loss": 0.77332222, + "learning_rate": 1.630767691572943e-07, + "loss": 0.79479015, + "num_input_tokens_seen": 157294805, + "step": 7272, + "time_per_iteration": 2.5699243545532227 + }, + { + "auxiliary_loss_clip": 0.01044765, + "auxiliary_loss_mlp": 0.01000964, + "balance_loss_clip": 1.00736618, + "balance_loss_mlp": 0.99991477, + "epoch": 0.8745265436181086, + "flos": 64034076654720.0, + "grad_norm": 0.7406722596553733, + "language_loss": 0.53521466, + "learning_rate": 1.6276881731939306e-07, + "loss": 0.55567199, + "num_input_tokens_seen": 157356695, + "step": 7273, + "time_per_iteration": 3.1418018341064453 + }, + { + "auxiliary_loss_clip": 0.01148839, + "auxiliary_loss_mlp": 0.01023583, + "balance_loss_clip": 1.04551768, + "balance_loss_mlp": 1.01642752, + "epoch": 0.8746467865087477, + "flos": 28658618553600.0, + "grad_norm": 2.20721444834012, + "language_loss": 0.75432479, + "learning_rate": 1.6246114418800193e-07, + "loss": 0.77604902, + "num_input_tokens_seen": 157376975, + "step": 7274, + "time_per_iteration": 2.513587474822998 + }, + { + "auxiliary_loss_clip": 0.01147352, + "auxiliary_loss_mlp": 0.01025195, + "balance_loss_clip": 1.04417825, + "balance_loss_mlp": 1.0176549, + "epoch": 0.8747670293993868, + "flos": 23985850268160.0, + "grad_norm": 1.7504502522363012, + "language_loss": 0.76524734, + "learning_rate": 1.6215374980979423e-07, + "loss": 0.78697276, + "num_input_tokens_seen": 157397385, + "step": 7275, + "time_per_iteration": 2.4860260486602783 + }, + { + "auxiliary_loss_clip": 0.01148705, + "auxiliary_loss_mlp": 0.01024725, + "balance_loss_clip": 1.04638147, + "balance_loss_mlp": 1.0181272, + "epoch": 0.8748872722900258, + "flos": 45221624478720.0, + "grad_norm": 3.4076457160854132, + "language_loss": 0.68724912, + "learning_rate": 1.6184663423140133e-07, + "loss": 0.70898342, + "num_input_tokens_seen": 157417685, + "step": 7276, + "time_per_iteration": 2.650820732116699 + }, + { + "auxiliary_loss_clip": 0.01113533, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.04199839, + "balance_loss_mlp": 1.022367, + "epoch": 0.875007515180665, + "flos": 19754280737280.0, + "grad_norm": 3.728110841805341, + "language_loss": 0.63959265, + "learning_rate": 1.615397974994126e-07, + "loss": 0.66102707, + "num_input_tokens_seen": 157435490, + "step": 7277, + "time_per_iteration": 2.632469415664673 + }, + { + "auxiliary_loss_clip": 0.0116606, + "auxiliary_loss_mlp": 0.01024399, + "balance_loss_clip": 1.04769993, + "balance_loss_mlp": 1.01760411, + "epoch": 0.875127758071304, + "flos": 22710734386560.0, + "grad_norm": 1.5856042522365734, + "language_loss": 0.80717713, + "learning_rate": 1.6123323966037438e-07, + "loss": 0.82908171, + "num_input_tokens_seen": 157454010, + "step": 7278, + "time_per_iteration": 2.474921941757202 + }, + { + "auxiliary_loss_clip": 0.01167813, + "auxiliary_loss_mlp": 0.01029103, + "balance_loss_clip": 1.04769325, + "balance_loss_mlp": 1.02183104, + "epoch": 0.8752480009619431, + "flos": 23403846199680.0, + "grad_norm": 1.938785531875469, + "language_loss": 0.78661382, + "learning_rate": 1.6092696076079216e-07, + "loss": 0.8085829, + "num_input_tokens_seen": 157472385, + "step": 7279, + "time_per_iteration": 2.452253818511963 + }, + { + "auxiliary_loss_clip": 0.01114531, + "auxiliary_loss_mlp": 0.01021201, + "balance_loss_clip": 1.04175019, + "balance_loss_mlp": 1.01423907, + "epoch": 0.8753682438525822, + "flos": 26213101914240.0, + "grad_norm": 2.3627136766229078, + "language_loss": 0.74001855, + "learning_rate": 1.6062096084712785e-07, + "loss": 0.76137584, + "num_input_tokens_seen": 157493735, + "step": 7280, + "time_per_iteration": 2.5794754028320312 + }, + { + "auxiliary_loss_clip": 0.01129823, + "auxiliary_loss_mlp": 0.00761794, + "balance_loss_clip": 1.03973269, + "balance_loss_mlp": 1.00062704, + "epoch": 0.8754884867432213, + "flos": 23326745656320.0, + "grad_norm": 1.7198714662072903, + "language_loss": 0.70486349, + "learning_rate": 1.6031523996580098e-07, + "loss": 0.72377968, + "num_input_tokens_seen": 157511295, + "step": 7281, + "time_per_iteration": 2.487858772277832 + }, + { + "auxiliary_loss_clip": 0.01133879, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.04369032, + "balance_loss_mlp": 1.0180707, + "epoch": 0.8756087296338604, + "flos": 12495226412160.0, + "grad_norm": 2.329440606601495, + "language_loss": 0.66529953, + "learning_rate": 1.6000979816318981e-07, + "loss": 0.68689549, + "num_input_tokens_seen": 157529760, + "step": 7282, + "time_per_iteration": 2.513392448425293 + }, + { + "auxiliary_loss_clip": 0.01149262, + "auxiliary_loss_mlp": 0.0102303, + "balance_loss_clip": 1.04664683, + "balance_loss_mlp": 1.0157758, + "epoch": 0.8757289725244994, + "flos": 18952898353920.0, + "grad_norm": 2.707645109340736, + "language_loss": 0.7493974, + "learning_rate": 1.5970463548562886e-07, + "loss": 0.77112031, + "num_input_tokens_seen": 157548915, + "step": 7283, + "time_per_iteration": 2.442610263824463 + }, + { + "auxiliary_loss_clip": 0.01136394, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.0439539, + "balance_loss_mlp": 1.01765823, + "epoch": 0.8758492154151386, + "flos": 25265958140160.0, + "grad_norm": 1.7298406611906445, + "language_loss": 0.71595562, + "learning_rate": 1.5939975197941192e-07, + "loss": 0.73756891, + "num_input_tokens_seen": 157570570, + "step": 7284, + "time_per_iteration": 2.5291292667388916 + }, + { + "auxiliary_loss_clip": 0.01045285, + "auxiliary_loss_mlp": 0.01001042, + "balance_loss_clip": 1.00795972, + "balance_loss_mlp": 1.00010014, + "epoch": 0.8759694583057777, + "flos": 65571664193280.0, + "grad_norm": 0.8042764237743366, + "language_loss": 0.53367209, + "learning_rate": 1.5909514769078892e-07, + "loss": 0.55413538, + "num_input_tokens_seen": 157635675, + "step": 7285, + "time_per_iteration": 3.1377346515655518 + }, + { + "auxiliary_loss_clip": 0.01115831, + "auxiliary_loss_mlp": 0.01026095, + "balance_loss_clip": 1.04414952, + "balance_loss_mlp": 1.01924062, + "epoch": 0.8760897011964167, + "flos": 25446193608960.0, + "grad_norm": 1.5667903417237847, + "language_loss": 0.77479243, + "learning_rate": 1.5879082266596867e-07, + "loss": 0.79621166, + "num_input_tokens_seen": 157657015, + "step": 7286, + "time_per_iteration": 2.5858185291290283 + }, + { + "auxiliary_loss_clip": 0.01131431, + "auxiliary_loss_mlp": 0.01023333, + "balance_loss_clip": 1.03909338, + "balance_loss_mlp": 1.01573634, + "epoch": 0.8762099440870559, + "flos": 28984830894720.0, + "grad_norm": 3.5498229269029955, + "language_loss": 0.72040069, + "learning_rate": 1.5848677695111645e-07, + "loss": 0.74194831, + "num_input_tokens_seen": 157678615, + "step": 7287, + "time_per_iteration": 2.5438461303710938 + }, + { + "auxiliary_loss_clip": 0.01131395, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.04390407, + "balance_loss_mlp": 1.0173595, + "epoch": 0.8763301869776949, + "flos": 21609461352960.0, + "grad_norm": 2.9484200308056194, + "language_loss": 0.69631249, + "learning_rate": 1.5818301059235562e-07, + "loss": 0.7178793, + "num_input_tokens_seen": 157693790, + "step": 7288, + "time_per_iteration": 2.5057103633880615 + }, + { + "auxiliary_loss_clip": 0.01141937, + "auxiliary_loss_mlp": 0.01024273, + "balance_loss_clip": 1.04707742, + "balance_loss_mlp": 1.0165484, + "epoch": 0.876450429868334, + "flos": 24644416176000.0, + "grad_norm": 1.7172123877395842, + "language_loss": 0.81386268, + "learning_rate": 1.578795236357684e-07, + "loss": 0.8355248, + "num_input_tokens_seen": 157715255, + "step": 7289, + "time_per_iteration": 2.5689780712127686 + }, + { + "auxiliary_loss_clip": 0.01140888, + "auxiliary_loss_mlp": 0.01022212, + "balance_loss_clip": 1.04631114, + "balance_loss_mlp": 1.01497269, + "epoch": 0.8765706727589732, + "flos": 20260046188800.0, + "grad_norm": 2.1391647053707463, + "language_loss": 0.85584152, + "learning_rate": 1.5757631612739218e-07, + "loss": 0.87747252, + "num_input_tokens_seen": 157728800, + "step": 7290, + "time_per_iteration": 2.496398687362671 + }, + { + "auxiliary_loss_clip": 0.01064398, + "auxiliary_loss_mlp": 0.01000922, + "balance_loss_clip": 1.00827456, + "balance_loss_mlp": 0.99994469, + "epoch": 0.8766909156496122, + "flos": 71371165276800.0, + "grad_norm": 0.7776042901990383, + "language_loss": 0.6143437, + "learning_rate": 1.572733881132242e-07, + "loss": 0.63499689, + "num_input_tokens_seen": 157789445, + "step": 7291, + "time_per_iteration": 3.868009567260742 + }, + { + "auxiliary_loss_clip": 0.01032146, + "auxiliary_loss_mlp": 0.01001988, + "balance_loss_clip": 1.01359701, + "balance_loss_mlp": 1.00111735, + "epoch": 0.8768111585402513, + "flos": 69523490603520.0, + "grad_norm": 0.7989915254914749, + "language_loss": 0.58562267, + "learning_rate": 1.5697073963921814e-07, + "loss": 0.60596395, + "num_input_tokens_seen": 157848685, + "step": 7292, + "time_per_iteration": 3.7982516288757324 + }, + { + "auxiliary_loss_clip": 0.01155257, + "auxiliary_loss_mlp": 0.01021674, + "balance_loss_clip": 1.04732358, + "balance_loss_mlp": 1.01398253, + "epoch": 0.8769314014308904, + "flos": 18838558385280.0, + "grad_norm": 2.5945359256088216, + "language_loss": 0.84735918, + "learning_rate": 1.566683707512857e-07, + "loss": 0.86912853, + "num_input_tokens_seen": 157866360, + "step": 7293, + "time_per_iteration": 2.4518752098083496 + }, + { + "auxiliary_loss_clip": 0.01136494, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.04372025, + "balance_loss_mlp": 1.01877964, + "epoch": 0.8770516443215295, + "flos": 14976402278400.0, + "grad_norm": 2.0152112653590524, + "language_loss": 0.79059994, + "learning_rate": 1.5636628149529553e-07, + "loss": 0.81223202, + "num_input_tokens_seen": 157884150, + "step": 7294, + "time_per_iteration": 2.477919101715088 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.01021949, + "balance_loss_clip": 1.04235935, + "balance_loss_mlp": 1.01513076, + "epoch": 0.8771718872121685, + "flos": 31649654021760.0, + "grad_norm": 2.0299405691121613, + "language_loss": 0.79951781, + "learning_rate": 1.560644719170743e-07, + "loss": 0.82109439, + "num_input_tokens_seen": 157905020, + "step": 7295, + "time_per_iteration": 3.296365976333618 + }, + { + "auxiliary_loss_clip": 0.01123742, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.04167736, + "balance_loss_mlp": 1.01852882, + "epoch": 0.8772921301028077, + "flos": 36095466222720.0, + "grad_norm": 1.698525217704603, + "language_loss": 0.72046316, + "learning_rate": 1.5576294206240692e-07, + "loss": 0.74196446, + "num_input_tokens_seen": 157924545, + "step": 7296, + "time_per_iteration": 2.6460647583007812 + }, + { + "auxiliary_loss_clip": 0.01133609, + "auxiliary_loss_mlp": 0.0102601, + "balance_loss_clip": 1.04270422, + "balance_loss_mlp": 1.01901793, + "epoch": 0.8774123729934468, + "flos": 57116961849600.0, + "grad_norm": 2.646071818542406, + "language_loss": 0.67364681, + "learning_rate": 1.5546169197703507e-07, + "loss": 0.695243, + "num_input_tokens_seen": 157950820, + "step": 7297, + "time_per_iteration": 3.4730262756347656 + }, + { + "auxiliary_loss_clip": 0.01142868, + "auxiliary_loss_mlp": 0.01026422, + "balance_loss_clip": 1.0415442, + "balance_loss_mlp": 1.01912975, + "epoch": 0.8775326158840858, + "flos": 23914495900800.0, + "grad_norm": 2.884842021147055, + "language_loss": 0.77131116, + "learning_rate": 1.5516072170665774e-07, + "loss": 0.79300404, + "num_input_tokens_seen": 157968790, + "step": 7298, + "time_per_iteration": 2.5129735469818115 + }, + { + "auxiliary_loss_clip": 0.01154581, + "auxiliary_loss_mlp": 0.01021963, + "balance_loss_clip": 1.04545689, + "balance_loss_mlp": 1.01475048, + "epoch": 0.877652858774725, + "flos": 17123285243520.0, + "grad_norm": 1.8437566385658564, + "language_loss": 0.86955845, + "learning_rate": 1.5486003129693214e-07, + "loss": 0.89132386, + "num_input_tokens_seen": 157986155, + "step": 7299, + "time_per_iteration": 2.4747507572174072 + }, + { + "auxiliary_loss_clip": 0.0115526, + "auxiliary_loss_mlp": 0.01021004, + "balance_loss_clip": 1.04642916, + "balance_loss_mlp": 1.01345491, + "epoch": 0.877773101665364, + "flos": 16508961912960.0, + "grad_norm": 2.188468109788711, + "language_loss": 0.7814765, + "learning_rate": 1.545596207934725e-07, + "loss": 0.80323911, + "num_input_tokens_seen": 158004640, + "step": 7300, + "time_per_iteration": 2.444408416748047 + }, + { + "auxiliary_loss_clip": 0.01131101, + "auxiliary_loss_mlp": 0.01023221, + "balance_loss_clip": 1.04240203, + "balance_loss_mlp": 1.01600552, + "epoch": 0.8778933445560031, + "flos": 22053209973120.0, + "grad_norm": 1.7638521241166716, + "language_loss": 0.77620685, + "learning_rate": 1.5425949024185147e-07, + "loss": 0.79775012, + "num_input_tokens_seen": 158024665, + "step": 7301, + "time_per_iteration": 2.5064914226531982 + }, + { + "auxiliary_loss_clip": 0.0114167, + "auxiliary_loss_mlp": 0.0102388, + "balance_loss_clip": 1.04381275, + "balance_loss_mlp": 1.01669192, + "epoch": 0.8780135874466423, + "flos": 22564757514240.0, + "grad_norm": 2.034785988744606, + "language_loss": 0.67379487, + "learning_rate": 1.5395963968759818e-07, + "loss": 0.69545043, + "num_input_tokens_seen": 158044940, + "step": 7302, + "time_per_iteration": 2.5484111309051514 + }, + { + "auxiliary_loss_clip": 0.0113987, + "auxiliary_loss_mlp": 0.01021413, + "balance_loss_clip": 1.04203296, + "balance_loss_mlp": 1.01433516, + "epoch": 0.8781338303372813, + "flos": 61531999073280.0, + "grad_norm": 1.5845796405709744, + "language_loss": 0.64502025, + "learning_rate": 1.536600691761998e-07, + "loss": 0.66663307, + "num_input_tokens_seen": 158070770, + "step": 7303, + "time_per_iteration": 2.8626606464385986 + }, + { + "auxiliary_loss_clip": 0.01130074, + "auxiliary_loss_mlp": 0.01025433, + "balance_loss_clip": 1.04694891, + "balance_loss_mlp": 1.01890635, + "epoch": 0.8782540732279204, + "flos": 22674751937280.0, + "grad_norm": 2.5089555654476157, + "language_loss": 0.71642685, + "learning_rate": 1.5336077875310084e-07, + "loss": 0.73798192, + "num_input_tokens_seen": 158089995, + "step": 7304, + "time_per_iteration": 2.536034345626831 + }, + { + "auxiliary_loss_clip": 0.011157, + "auxiliary_loss_mlp": 0.01023128, + "balance_loss_clip": 1.04143071, + "balance_loss_mlp": 1.01598132, + "epoch": 0.8783743161185595, + "flos": 16070348937600.0, + "grad_norm": 2.0380892222344524, + "language_loss": 0.73882413, + "learning_rate": 1.5306176846370321e-07, + "loss": 0.76021242, + "num_input_tokens_seen": 158108140, + "step": 7305, + "time_per_iteration": 2.5283987522125244 + }, + { + "auxiliary_loss_clip": 0.01146685, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.04347491, + "balance_loss_mlp": 1.02398801, + "epoch": 0.8784945590091986, + "flos": 26067879227520.0, + "grad_norm": 1.8977398461408956, + "language_loss": 0.740376, + "learning_rate": 1.5276303835336712e-07, + "loss": 0.76216185, + "num_input_tokens_seen": 158128680, + "step": 7306, + "time_per_iteration": 2.5447189807891846 + }, + { + "auxiliary_loss_clip": 0.01056086, + "auxiliary_loss_mlp": 0.01001694, + "balance_loss_clip": 1.00856709, + "balance_loss_mlp": 1.00075781, + "epoch": 0.8786148018998376, + "flos": 62720643939840.0, + "grad_norm": 0.7608806971196155, + "language_loss": 0.53515017, + "learning_rate": 1.524645884674094e-07, + "loss": 0.55572802, + "num_input_tokens_seen": 158185610, + "step": 7307, + "time_per_iteration": 3.04211163520813 + }, + { + "auxiliary_loss_clip": 0.01167036, + "auxiliary_loss_mlp": 0.00763171, + "balance_loss_clip": 1.04495668, + "balance_loss_mlp": 1.00056601, + "epoch": 0.8787350447904768, + "flos": 21652734263040.0, + "grad_norm": 2.057423037670531, + "language_loss": 0.78669339, + "learning_rate": 1.521664188511047e-07, + "loss": 0.80599546, + "num_input_tokens_seen": 158205635, + "step": 7308, + "time_per_iteration": 2.4521446228027344 + }, + { + "auxiliary_loss_clip": 0.01139633, + "auxiliary_loss_mlp": 0.00762002, + "balance_loss_clip": 1.04676402, + "balance_loss_mlp": 1.00069892, + "epoch": 0.8788552876811159, + "flos": 25478476957440.0, + "grad_norm": 1.9302078226767405, + "language_loss": 0.80086792, + "learning_rate": 1.518685295496851e-07, + "loss": 0.8198843, + "num_input_tokens_seen": 158223495, + "step": 7309, + "time_per_iteration": 2.5139172077178955 + }, + { + "auxiliary_loss_clip": 0.01153048, + "auxiliary_loss_mlp": 0.01025642, + "balance_loss_clip": 1.04328704, + "balance_loss_mlp": 1.01863813, + "epoch": 0.8789755305717549, + "flos": 22310222762880.0, + "grad_norm": 1.6517337111766295, + "language_loss": 0.85409319, + "learning_rate": 1.5157092060833975e-07, + "loss": 0.87588018, + "num_input_tokens_seen": 158243145, + "step": 7310, + "time_per_iteration": 2.4703609943389893 + }, + { + "auxiliary_loss_clip": 0.01136544, + "auxiliary_loss_mlp": 0.01020021, + "balance_loss_clip": 1.04183614, + "balance_loss_mlp": 1.01292813, + "epoch": 0.879095773462394, + "flos": 29310971408640.0, + "grad_norm": 1.5158351213359036, + "language_loss": 0.65816075, + "learning_rate": 1.5127359207221658e-07, + "loss": 0.67972642, + "num_input_tokens_seen": 158262625, + "step": 7311, + "time_per_iteration": 2.541424512863159 + }, + { + "auxiliary_loss_clip": 0.01085868, + "auxiliary_loss_mlp": 0.01025557, + "balance_loss_clip": 1.03520536, + "balance_loss_mlp": 1.01751053, + "epoch": 0.8792160163530331, + "flos": 16690023394560.0, + "grad_norm": 1.9619524984498096, + "language_loss": 0.73149401, + "learning_rate": 1.5097654398641923e-07, + "loss": 0.7526083, + "num_input_tokens_seen": 158280530, + "step": 7312, + "time_per_iteration": 2.5611469745635986 + }, + { + "auxiliary_loss_clip": 0.0115788, + "auxiliary_loss_mlp": 0.01026833, + "balance_loss_clip": 1.04671478, + "balance_loss_mlp": 1.01925743, + "epoch": 0.8793362592436722, + "flos": 24499301230080.0, + "grad_norm": 1.4883054032024454, + "language_loss": 0.73286927, + "learning_rate": 1.5067977639601014e-07, + "loss": 0.7547164, + "num_input_tokens_seen": 158303290, + "step": 7313, + "time_per_iteration": 2.49361252784729 + }, + { + "auxiliary_loss_clip": 0.01136598, + "auxiliary_loss_mlp": 0.01026145, + "balance_loss_clip": 1.04418325, + "balance_loss_mlp": 1.01862884, + "epoch": 0.8794565021343113, + "flos": 14538399834240.0, + "grad_norm": 2.088028772231195, + "language_loss": 0.71237749, + "learning_rate": 1.5038328934600864e-07, + "loss": 0.73400497, + "num_input_tokens_seen": 158319925, + "step": 7314, + "time_per_iteration": 2.4740824699401855 + }, + { + "auxiliary_loss_clip": 0.01137789, + "auxiliary_loss_mlp": 0.01025805, + "balance_loss_clip": 1.04484391, + "balance_loss_mlp": 1.01880765, + "epoch": 0.8795767450249504, + "flos": 39530286224640.0, + "grad_norm": 2.2559842204846103, + "language_loss": 0.69873983, + "learning_rate": 1.5008708288139161e-07, + "loss": 0.72037578, + "num_input_tokens_seen": 158342285, + "step": 7315, + "time_per_iteration": 2.6884381771087646 + }, + { + "auxiliary_loss_clip": 0.01152069, + "auxiliary_loss_mlp": 0.01025245, + "balance_loss_clip": 1.04554045, + "balance_loss_mlp": 1.0178566, + "epoch": 0.8796969879155895, + "flos": 22960672197120.0, + "grad_norm": 2.3596761077206128, + "language_loss": 0.73308492, + "learning_rate": 1.497911570470931e-07, + "loss": 0.75485802, + "num_input_tokens_seen": 158362290, + "step": 7316, + "time_per_iteration": 2.47281813621521 + }, + { + "auxiliary_loss_clip": 0.01115572, + "auxiliary_loss_mlp": 0.01026513, + "balance_loss_clip": 1.0421207, + "balance_loss_mlp": 1.01918781, + "epoch": 0.8798172308062285, + "flos": 28362427004160.0, + "grad_norm": 1.809237533367953, + "language_loss": 0.85597265, + "learning_rate": 1.494955118880048e-07, + "loss": 0.87739354, + "num_input_tokens_seen": 158383275, + "step": 7317, + "time_per_iteration": 2.57771635055542 + }, + { + "auxiliary_loss_clip": 0.01152921, + "auxiliary_loss_mlp": 0.01023988, + "balance_loss_clip": 1.04423952, + "balance_loss_mlp": 1.01668096, + "epoch": 0.8799374736968677, + "flos": 23988974751360.0, + "grad_norm": 3.4607866999919885, + "language_loss": 0.72924304, + "learning_rate": 1.4920014744897634e-07, + "loss": 0.75101209, + "num_input_tokens_seen": 158402690, + "step": 7318, + "time_per_iteration": 3.2707629203796387 + }, + { + "auxiliary_loss_clip": 0.01131811, + "auxiliary_loss_mlp": 0.01021022, + "balance_loss_clip": 1.04352283, + "balance_loss_mlp": 1.01355362, + "epoch": 0.8800577165875068, + "flos": 25630271832960.0, + "grad_norm": 2.0105681597401968, + "language_loss": 0.8652485, + "learning_rate": 1.4890506377481392e-07, + "loss": 0.8867768, + "num_input_tokens_seen": 158421780, + "step": 7319, + "time_per_iteration": 3.2946431636810303 + }, + { + "auxiliary_loss_clip": 0.01094126, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.040452, + "balance_loss_mlp": 1.02350569, + "epoch": 0.8801779594781458, + "flos": 23440331439360.0, + "grad_norm": 1.4131944247577488, + "language_loss": 0.63810283, + "learning_rate": 1.486102609102815e-07, + "loss": 0.65934837, + "num_input_tokens_seen": 158442330, + "step": 7320, + "time_per_iteration": 2.5893232822418213 + }, + { + "auxiliary_loss_clip": 0.01132077, + "auxiliary_loss_mlp": 0.01023746, + "balance_loss_clip": 1.04276848, + "balance_loss_mlp": 1.01654601, + "epoch": 0.880298202368785, + "flos": 11508580656000.0, + "grad_norm": 2.670581403688963, + "language_loss": 0.86019558, + "learning_rate": 1.483157389001004e-07, + "loss": 0.8817538, + "num_input_tokens_seen": 158459890, + "step": 7321, + "time_per_iteration": 2.4754602909088135 + }, + { + "auxiliary_loss_clip": 0.01138614, + "auxiliary_loss_mlp": 0.01023776, + "balance_loss_clip": 1.04192436, + "balance_loss_mlp": 1.01562166, + "epoch": 0.880418445259424, + "flos": 22671447886080.0, + "grad_norm": 2.2608544184642563, + "language_loss": 0.78871012, + "learning_rate": 1.4802149778894933e-07, + "loss": 0.81033397, + "num_input_tokens_seen": 158478680, + "step": 7322, + "time_per_iteration": 3.219449043273926 + }, + { + "auxiliary_loss_clip": 0.01142286, + "auxiliary_loss_mlp": 0.01021652, + "balance_loss_clip": 1.04071879, + "balance_loss_mlp": 1.01489234, + "epoch": 0.8805386881500631, + "flos": 20522158709760.0, + "grad_norm": 1.7353574923629225, + "language_loss": 0.87582809, + "learning_rate": 1.4772753762146484e-07, + "loss": 0.89746749, + "num_input_tokens_seen": 158497935, + "step": 7323, + "time_per_iteration": 2.4749724864959717 + }, + { + "auxiliary_loss_clip": 0.01146516, + "auxiliary_loss_mlp": 0.01021404, + "balance_loss_clip": 1.04343581, + "balance_loss_mlp": 1.01367903, + "epoch": 0.8806589310407023, + "flos": 36538891620480.0, + "grad_norm": 2.3147309725415863, + "language_loss": 0.70532537, + "learning_rate": 1.474338584422401e-07, + "loss": 0.72700453, + "num_input_tokens_seen": 158523145, + "step": 7324, + "time_per_iteration": 3.2977213859558105 + }, + { + "auxiliary_loss_clip": 0.0114854, + "auxiliary_loss_mlp": 0.01022464, + "balance_loss_clip": 1.04540873, + "balance_loss_mlp": 1.01523972, + "epoch": 0.8807791739313413, + "flos": 23440187784960.0, + "grad_norm": 1.6755798169333675, + "language_loss": 0.75741774, + "learning_rate": 1.4714046029582595e-07, + "loss": 0.77912778, + "num_input_tokens_seen": 158542210, + "step": 7325, + "time_per_iteration": 2.4863028526306152 + }, + { + "auxiliary_loss_clip": 0.01127679, + "auxiliary_loss_mlp": 0.01021278, + "balance_loss_clip": 1.04242551, + "balance_loss_mlp": 1.01383293, + "epoch": 0.8808994168219804, + "flos": 25956843310080.0, + "grad_norm": 2.0244605864238, + "language_loss": 0.7575978, + "learning_rate": 1.46847343226731e-07, + "loss": 0.77908731, + "num_input_tokens_seen": 158563250, + "step": 7326, + "time_per_iteration": 2.5566837787628174 + }, + { + "auxiliary_loss_clip": 0.01152888, + "auxiliary_loss_mlp": 0.0102281, + "balance_loss_clip": 1.04303956, + "balance_loss_mlp": 1.01543355, + "epoch": 0.8810196597126195, + "flos": 17092079303040.0, + "grad_norm": 1.9393187719833394, + "language_loss": 0.69505167, + "learning_rate": 1.465545072794203e-07, + "loss": 0.71680862, + "num_input_tokens_seen": 158581125, + "step": 7327, + "time_per_iteration": 2.4781532287597656 + }, + { + "auxiliary_loss_clip": 0.01106624, + "auxiliary_loss_mlp": 0.01024224, + "balance_loss_clip": 1.0427196, + "balance_loss_mlp": 1.01670766, + "epoch": 0.8811399026032586, + "flos": 23002831785600.0, + "grad_norm": 2.0245195028005734, + "language_loss": 0.75535828, + "learning_rate": 1.4626195249831774e-07, + "loss": 0.77666676, + "num_input_tokens_seen": 158602025, + "step": 7328, + "time_per_iteration": 2.5797619819641113 + }, + { + "auxiliary_loss_clip": 0.01150782, + "auxiliary_loss_mlp": 0.01022793, + "balance_loss_clip": 1.0439043, + "balance_loss_mlp": 1.01567662, + "epoch": 0.8812601454938976, + "flos": 14463813242880.0, + "grad_norm": 2.042457489107333, + "language_loss": 0.71560121, + "learning_rate": 1.4596967892780244e-07, + "loss": 0.73733687, + "num_input_tokens_seen": 158618355, + "step": 7329, + "time_per_iteration": 2.4455814361572266 + }, + { + "auxiliary_loss_clip": 0.01165347, + "auxiliary_loss_mlp": 0.01024113, + "balance_loss_clip": 1.04685736, + "balance_loss_mlp": 1.01677608, + "epoch": 0.8813803883845368, + "flos": 22493223578880.0, + "grad_norm": 1.7786348411590196, + "language_loss": 0.74815834, + "learning_rate": 1.4567768661221314e-07, + "loss": 0.77005297, + "num_input_tokens_seen": 158638925, + "step": 7330, + "time_per_iteration": 2.4341068267822266 + }, + { + "auxiliary_loss_clip": 0.01157334, + "auxiliary_loss_mlp": 0.00762315, + "balance_loss_clip": 1.04700339, + "balance_loss_mlp": 1.00058472, + "epoch": 0.8815006312751759, + "flos": 21506901045120.0, + "grad_norm": 1.9107441604304392, + "language_loss": 0.74257326, + "learning_rate": 1.4538597559584442e-07, + "loss": 0.76176977, + "num_input_tokens_seen": 158656715, + "step": 7331, + "time_per_iteration": 2.4794609546661377 + }, + { + "auxiliary_loss_clip": 0.01134459, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.04269552, + "balance_loss_mlp": 1.02139938, + "epoch": 0.8816208741658149, + "flos": 22784566792320.0, + "grad_norm": 1.810702719045528, + "language_loss": 0.78879464, + "learning_rate": 1.4509454592294823e-07, + "loss": 0.81043077, + "num_input_tokens_seen": 158677200, + "step": 7332, + "time_per_iteration": 2.4955148696899414 + }, + { + "auxiliary_loss_clip": 0.01125522, + "auxiliary_loss_mlp": 0.00762218, + "balance_loss_clip": 1.04421377, + "balance_loss_mlp": 1.00057244, + "epoch": 0.8817411170564541, + "flos": 17779409026560.0, + "grad_norm": 2.3029144282152854, + "language_loss": 0.79111397, + "learning_rate": 1.448033976377354e-07, + "loss": 0.80999136, + "num_input_tokens_seen": 158692185, + "step": 7333, + "time_per_iteration": 2.509106159210205 + }, + { + "auxiliary_loss_clip": 0.01153728, + "auxiliary_loss_mlp": 0.01021087, + "balance_loss_clip": 1.04408193, + "balance_loss_mlp": 1.01419353, + "epoch": 0.8818613599470931, + "flos": 18551812112640.0, + "grad_norm": 1.9075361032414335, + "language_loss": 0.74136347, + "learning_rate": 1.445125307843713e-07, + "loss": 0.76311159, + "num_input_tokens_seen": 158710410, + "step": 7334, + "time_per_iteration": 2.441840171813965 + }, + { + "auxiliary_loss_clip": 0.01151814, + "auxiliary_loss_mlp": 0.01022601, + "balance_loss_clip": 1.04698992, + "balance_loss_mlp": 1.01596403, + "epoch": 0.8819816028377322, + "flos": 27599792417280.0, + "grad_norm": 1.8695668684393756, + "language_loss": 0.75793469, + "learning_rate": 1.442219454069813e-07, + "loss": 0.77967882, + "num_input_tokens_seen": 158731435, + "step": 7335, + "time_per_iteration": 2.566807746887207 + }, + { + "auxiliary_loss_clip": 0.01112418, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.04104745, + "balance_loss_mlp": 1.02183509, + "epoch": 0.8821018457283714, + "flos": 23404600385280.0, + "grad_norm": 2.244394298274669, + "language_loss": 0.66638738, + "learning_rate": 1.4393164154964676e-07, + "loss": 0.68780124, + "num_input_tokens_seen": 158750965, + "step": 7336, + "time_per_iteration": 2.5563507080078125 + }, + { + "auxiliary_loss_clip": 0.0114976, + "auxiliary_loss_mlp": 0.01025807, + "balance_loss_clip": 1.04631782, + "balance_loss_mlp": 1.01882696, + "epoch": 0.8822220886190104, + "flos": 29132459792640.0, + "grad_norm": 1.7256219375992343, + "language_loss": 0.9397099, + "learning_rate": 1.4364161925640649e-07, + "loss": 0.96146554, + "num_input_tokens_seen": 158772365, + "step": 7337, + "time_per_iteration": 2.5269577503204346 + }, + { + "auxiliary_loss_clip": 0.01165232, + "auxiliary_loss_mlp": 0.01022503, + "balance_loss_clip": 1.04600835, + "balance_loss_mlp": 1.01554692, + "epoch": 0.8823423315096495, + "flos": 20485422074880.0, + "grad_norm": 1.814866774818243, + "language_loss": 0.84876752, + "learning_rate": 1.4335187857125663e-07, + "loss": 0.87064481, + "num_input_tokens_seen": 158791065, + "step": 7338, + "time_per_iteration": 2.42106294631958 + }, + { + "auxiliary_loss_clip": 0.01154406, + "auxiliary_loss_mlp": 0.01021333, + "balance_loss_clip": 1.04516971, + "balance_loss_mlp": 1.01434159, + "epoch": 0.8824625744002886, + "flos": 24206377818240.0, + "grad_norm": 1.6384798652361117, + "language_loss": 0.7516036, + "learning_rate": 1.4306241953815023e-07, + "loss": 0.77336097, + "num_input_tokens_seen": 158812125, + "step": 7339, + "time_per_iteration": 2.513108730316162 + }, + { + "auxiliary_loss_clip": 0.01155099, + "auxiliary_loss_mlp": 0.01022718, + "balance_loss_clip": 1.04592705, + "balance_loss_mlp": 1.01561928, + "epoch": 0.8825828172909277, + "flos": 24679500785280.0, + "grad_norm": 2.027334800283928, + "language_loss": 0.70878744, + "learning_rate": 1.4277324220099862e-07, + "loss": 0.73056567, + "num_input_tokens_seen": 158834035, + "step": 7340, + "time_per_iteration": 2.5377185344696045 + }, + { + "auxiliary_loss_clip": 0.0111932, + "auxiliary_loss_mlp": 0.01025037, + "balance_loss_clip": 1.04041696, + "balance_loss_mlp": 1.01820004, + "epoch": 0.8827030601815667, + "flos": 22456163721600.0, + "grad_norm": 1.9380869607267517, + "language_loss": 0.74217021, + "learning_rate": 1.4248434660366938e-07, + "loss": 0.76361382, + "num_input_tokens_seen": 158853510, + "step": 7341, + "time_per_iteration": 2.611093282699585 + }, + { + "auxiliary_loss_clip": 0.01138042, + "auxiliary_loss_mlp": 0.01025013, + "balance_loss_clip": 1.04537439, + "balance_loss_mlp": 1.01790178, + "epoch": 0.8828233030722058, + "flos": 19865639877120.0, + "grad_norm": 2.11051079170759, + "language_loss": 0.70511472, + "learning_rate": 1.4219573278998808e-07, + "loss": 0.72674531, + "num_input_tokens_seen": 158871970, + "step": 7342, + "time_per_iteration": 2.4752860069274902 + }, + { + "auxiliary_loss_clip": 0.01135848, + "auxiliary_loss_mlp": 0.01022764, + "balance_loss_clip": 1.04059076, + "balance_loss_mlp": 1.01502109, + "epoch": 0.882943545962845, + "flos": 39347213581440.0, + "grad_norm": 2.340981071837258, + "language_loss": 0.64702392, + "learning_rate": 1.4190740080373685e-07, + "loss": 0.6686101, + "num_input_tokens_seen": 158892250, + "step": 7343, + "time_per_iteration": 2.643040180206299 + }, + { + "auxiliary_loss_clip": 0.01111745, + "auxiliary_loss_mlp": 0.01024384, + "balance_loss_clip": 1.04280281, + "balance_loss_mlp": 1.01638496, + "epoch": 0.883063788853484, + "flos": 19054524908160.0, + "grad_norm": 1.9716460686576502, + "language_loss": 0.84043813, + "learning_rate": 1.4161935068865538e-07, + "loss": 0.86179936, + "num_input_tokens_seen": 158907395, + "step": 7344, + "time_per_iteration": 3.236603021621704 + }, + { + "auxiliary_loss_clip": 0.01167213, + "auxiliary_loss_mlp": 0.01021796, + "balance_loss_clip": 1.04664123, + "balance_loss_mlp": 1.01401711, + "epoch": 0.8831840317441231, + "flos": 18733196816640.0, + "grad_norm": 1.98761535632202, + "language_loss": 0.75538301, + "learning_rate": 1.4133158248844113e-07, + "loss": 0.77727306, + "num_input_tokens_seen": 158926300, + "step": 7345, + "time_per_iteration": 3.1870760917663574 + }, + { + "auxiliary_loss_clip": 0.01128052, + "auxiliary_loss_mlp": 0.01026316, + "balance_loss_clip": 1.04328251, + "balance_loss_mlp": 1.0182755, + "epoch": 0.8833042746347622, + "flos": 26827712553600.0, + "grad_norm": 1.7306976450826885, + "language_loss": 0.73131925, + "learning_rate": 1.4104409624674785e-07, + "loss": 0.75286293, + "num_input_tokens_seen": 158946085, + "step": 7346, + "time_per_iteration": 2.553011655807495 + }, + { + "auxiliary_loss_clip": 0.0115724, + "auxiliary_loss_mlp": 0.0101844, + "balance_loss_clip": 1.05001521, + "balance_loss_mlp": 1.01149881, + "epoch": 0.8834245175254013, + "flos": 26104077158400.0, + "grad_norm": 2.019574905059198, + "language_loss": 0.78594458, + "learning_rate": 1.407568920071873e-07, + "loss": 0.80770141, + "num_input_tokens_seen": 158964950, + "step": 7347, + "time_per_iteration": 2.5098865032196045 + }, + { + "auxiliary_loss_clip": 0.01172893, + "auxiliary_loss_mlp": 0.01025424, + "balance_loss_clip": 1.04847717, + "balance_loss_mlp": 1.01746094, + "epoch": 0.8835447604160404, + "flos": 30629036977920.0, + "grad_norm": 1.7061077068117374, + "language_loss": 0.68003678, + "learning_rate": 1.4046996981332782e-07, + "loss": 0.70201993, + "num_input_tokens_seen": 158984835, + "step": 7348, + "time_per_iteration": 3.246121883392334 + }, + { + "auxiliary_loss_clip": 0.0112657, + "auxiliary_loss_mlp": 0.01024191, + "balance_loss_clip": 1.04315209, + "balance_loss_mlp": 1.01618564, + "epoch": 0.8836650033066795, + "flos": 24718356322560.0, + "grad_norm": 2.1422938670578984, + "language_loss": 0.78458548, + "learning_rate": 1.4018332970869516e-07, + "loss": 0.8060931, + "num_input_tokens_seen": 159002775, + "step": 7349, + "time_per_iteration": 2.5553195476531982 + }, + { + "auxiliary_loss_clip": 0.01131365, + "auxiliary_loss_mlp": 0.01024736, + "balance_loss_clip": 1.04336262, + "balance_loss_mlp": 1.0171423, + "epoch": 0.8837852461973186, + "flos": 25413371556480.0, + "grad_norm": 1.8104461911341434, + "language_loss": 0.85321897, + "learning_rate": 1.398969717367733e-07, + "loss": 0.87478, + "num_input_tokens_seen": 159024100, + "step": 7350, + "time_per_iteration": 2.5315303802490234 + }, + { + "auxiliary_loss_clip": 0.01110808, + "auxiliary_loss_mlp": 0.01026997, + "balance_loss_clip": 1.04450536, + "balance_loss_mlp": 1.01999021, + "epoch": 0.8839054890879576, + "flos": 17822574195840.0, + "grad_norm": 1.7283589273753592, + "language_loss": 0.76029164, + "learning_rate": 1.396108959410014e-07, + "loss": 0.78166974, + "num_input_tokens_seen": 159043315, + "step": 7351, + "time_per_iteration": 3.2885615825653076 + }, + { + "auxiliary_loss_clip": 0.01152836, + "auxiliary_loss_mlp": 0.00762304, + "balance_loss_clip": 1.04699409, + "balance_loss_mlp": 1.00052035, + "epoch": 0.8840257319785968, + "flos": 23769021818880.0, + "grad_norm": 1.6407344735643339, + "language_loss": 0.81518865, + "learning_rate": 1.3932510236477745e-07, + "loss": 0.83433998, + "num_input_tokens_seen": 159063985, + "step": 7352, + "time_per_iteration": 2.5149285793304443 + }, + { + "auxiliary_loss_clip": 0.01151682, + "auxiliary_loss_mlp": 0.01023575, + "balance_loss_clip": 1.04325294, + "balance_loss_mlp": 1.0157882, + "epoch": 0.8841459748692359, + "flos": 29059776622080.0, + "grad_norm": 1.904816464421521, + "language_loss": 0.55741119, + "learning_rate": 1.3903959105145636e-07, + "loss": 0.57916373, + "num_input_tokens_seen": 159084475, + "step": 7353, + "time_per_iteration": 2.5325565338134766 + }, + { + "auxiliary_loss_clip": 0.01166967, + "auxiliary_loss_mlp": 0.01023229, + "balance_loss_clip": 1.04655766, + "balance_loss_mlp": 1.01599288, + "epoch": 0.8842662177598749, + "flos": 24311523905280.0, + "grad_norm": 2.070580523763091, + "language_loss": 0.83306193, + "learning_rate": 1.387543620443492e-07, + "loss": 0.8549639, + "num_input_tokens_seen": 159101320, + "step": 7354, + "time_per_iteration": 2.4423272609710693 + }, + { + "auxiliary_loss_clip": 0.01167103, + "auxiliary_loss_mlp": 0.01023416, + "balance_loss_clip": 1.04754901, + "balance_loss_mlp": 1.01629305, + "epoch": 0.8843864606505141, + "flos": 25007867942400.0, + "grad_norm": 1.5652600756934711, + "language_loss": 0.84138691, + "learning_rate": 1.3846941538672606e-07, + "loss": 0.8632921, + "num_input_tokens_seen": 159120025, + "step": 7355, + "time_per_iteration": 2.466055154800415 + }, + { + "auxiliary_loss_clip": 0.01115138, + "auxiliary_loss_mlp": 0.01023156, + "balance_loss_clip": 1.04310131, + "balance_loss_mlp": 1.01589012, + "epoch": 0.8845067035411531, + "flos": 28183915388160.0, + "grad_norm": 2.196133189201758, + "language_loss": 0.80793935, + "learning_rate": 1.3818475112181193e-07, + "loss": 0.82932228, + "num_input_tokens_seen": 159138820, + "step": 7356, + "time_per_iteration": 2.598966360092163 + }, + { + "auxiliary_loss_clip": 0.01138979, + "auxiliary_loss_mlp": 0.01025239, + "balance_loss_clip": 1.04481471, + "balance_loss_mlp": 1.01836658, + "epoch": 0.8846269464317922, + "flos": 12853219311360.0, + "grad_norm": 2.1114227658124562, + "language_loss": 0.79691994, + "learning_rate": 1.3790036929279091e-07, + "loss": 0.81856215, + "num_input_tokens_seen": 159155975, + "step": 7357, + "time_per_iteration": 2.485149621963501 + }, + { + "auxiliary_loss_clip": 0.01157336, + "auxiliary_loss_mlp": 0.00762527, + "balance_loss_clip": 1.04770255, + "balance_loss_mlp": 1.00068569, + "epoch": 0.8847471893224313, + "flos": 18624351628800.0, + "grad_norm": 2.11189474360998, + "language_loss": 0.58997214, + "learning_rate": 1.3761626994280363e-07, + "loss": 0.60917073, + "num_input_tokens_seen": 159173445, + "step": 7358, + "time_per_iteration": 2.422142744064331 + }, + { + "auxiliary_loss_clip": 0.01129157, + "auxiliary_loss_mlp": 0.0102487, + "balance_loss_clip": 1.04331303, + "balance_loss_mlp": 1.01757467, + "epoch": 0.8848674322130704, + "flos": 35769433449600.0, + "grad_norm": 1.9285989481168178, + "language_loss": 0.73470283, + "learning_rate": 1.3733245311494735e-07, + "loss": 0.75624305, + "num_input_tokens_seen": 159196100, + "step": 7359, + "time_per_iteration": 2.6651382446289062 + }, + { + "auxiliary_loss_clip": 0.01157756, + "auxiliary_loss_mlp": 0.01027346, + "balance_loss_clip": 1.04912949, + "balance_loss_mlp": 1.0194608, + "epoch": 0.8849876751037095, + "flos": 24243760897920.0, + "grad_norm": 2.078464397841499, + "language_loss": 0.70802486, + "learning_rate": 1.3704891885227676e-07, + "loss": 0.72987586, + "num_input_tokens_seen": 159216145, + "step": 7360, + "time_per_iteration": 2.4758851528167725 + }, + { + "auxiliary_loss_clip": 0.01125039, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.04075384, + "balance_loss_mlp": 1.02124619, + "epoch": 0.8851079179943486, + "flos": 21500580251520.0, + "grad_norm": 2.1295427857907887, + "language_loss": 0.78014934, + "learning_rate": 1.367656671978037e-07, + "loss": 0.80169368, + "num_input_tokens_seen": 159233610, + "step": 7361, + "time_per_iteration": 2.5424818992614746 + }, + { + "auxiliary_loss_clip": 0.01145553, + "auxiliary_loss_mlp": 0.01025124, + "balance_loss_clip": 1.04362297, + "balance_loss_mlp": 1.018103, + "epoch": 0.8852281608849877, + "flos": 15300711198720.0, + "grad_norm": 1.892417768609137, + "language_loss": 0.73706836, + "learning_rate": 1.36482698194498e-07, + "loss": 0.75877512, + "num_input_tokens_seen": 159250155, + "step": 7362, + "time_per_iteration": 2.459303379058838 + }, + { + "auxiliary_loss_clip": 0.01139617, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.0433749, + "balance_loss_mlp": 1.01956332, + "epoch": 0.8853484037756267, + "flos": 23295719283840.0, + "grad_norm": 1.8754422495688374, + "language_loss": 0.72124159, + "learning_rate": 1.3620001188528506e-07, + "loss": 0.7429108, + "num_input_tokens_seen": 159270875, + "step": 7363, + "time_per_iteration": 2.5157954692840576 + }, + { + "auxiliary_loss_clip": 0.01155496, + "auxiliary_loss_mlp": 0.01026418, + "balance_loss_clip": 1.0440675, + "balance_loss_mlp": 1.01821685, + "epoch": 0.8854686466662659, + "flos": 25114773795840.0, + "grad_norm": 2.570810991991841, + "language_loss": 0.7341736, + "learning_rate": 1.3591760831304865e-07, + "loss": 0.75599277, + "num_input_tokens_seen": 159288565, + "step": 7364, + "time_per_iteration": 2.481921434402466 + }, + { + "auxiliary_loss_clip": 0.01167579, + "auxiliary_loss_mlp": 0.01025564, + "balance_loss_clip": 1.04737484, + "balance_loss_mlp": 1.01799464, + "epoch": 0.885588889556905, + "flos": 21390873137280.0, + "grad_norm": 2.067432259836448, + "language_loss": 0.79192352, + "learning_rate": 1.356354875206287e-07, + "loss": 0.81385493, + "num_input_tokens_seen": 159306400, + "step": 7365, + "time_per_iteration": 2.460566759109497 + }, + { + "auxiliary_loss_clip": 0.01123809, + "auxiliary_loss_mlp": 0.01021645, + "balance_loss_clip": 1.04502869, + "balance_loss_mlp": 1.01450443, + "epoch": 0.885709132447544, + "flos": 26906752431360.0, + "grad_norm": 1.9745981203304996, + "language_loss": 0.69599515, + "learning_rate": 1.3535364955082296e-07, + "loss": 0.71744967, + "num_input_tokens_seen": 159326250, + "step": 7366, + "time_per_iteration": 2.554753065109253 + }, + { + "auxiliary_loss_clip": 0.01164481, + "auxiliary_loss_mlp": 0.01024508, + "balance_loss_clip": 1.04639399, + "balance_loss_mlp": 1.01729941, + "epoch": 0.8858293753381832, + "flos": 26103394800000.0, + "grad_norm": 1.7667350862966584, + "language_loss": 0.64695472, + "learning_rate": 1.3507209444638613e-07, + "loss": 0.66884458, + "num_input_tokens_seen": 159348250, + "step": 7367, + "time_per_iteration": 2.4745445251464844 + }, + { + "auxiliary_loss_clip": 0.01154361, + "auxiliary_loss_mlp": 0.01026553, + "balance_loss_clip": 1.04668784, + "balance_loss_mlp": 1.01914418, + "epoch": 0.8859496182288222, + "flos": 23292810282240.0, + "grad_norm": 1.896840387866395, + "language_loss": 0.74248421, + "learning_rate": 1.347908222500298e-07, + "loss": 0.76429337, + "num_input_tokens_seen": 159368325, + "step": 7368, + "time_per_iteration": 2.4611334800720215 + }, + { + "auxiliary_loss_clip": 0.01112028, + "auxiliary_loss_mlp": 0.01024439, + "balance_loss_clip": 1.04262304, + "balance_loss_mlp": 1.01741195, + "epoch": 0.8860698611194613, + "flos": 16872916469760.0, + "grad_norm": 1.8525085343569176, + "language_loss": 0.69531977, + "learning_rate": 1.3450983300442276e-07, + "loss": 0.7166844, + "num_input_tokens_seen": 159387555, + "step": 7369, + "time_per_iteration": 2.519171714782715 + }, + { + "auxiliary_loss_clip": 0.01154736, + "auxiliary_loss_mlp": 0.01025927, + "balance_loss_clip": 1.04597759, + "balance_loss_mlp": 1.01876795, + "epoch": 0.8861901040101005, + "flos": 24681404206080.0, + "grad_norm": 1.8992220001809965, + "language_loss": 0.7354821, + "learning_rate": 1.3422912675219068e-07, + "loss": 0.75728869, + "num_input_tokens_seen": 159407310, + "step": 7370, + "time_per_iteration": 2.482877016067505 + }, + { + "auxiliary_loss_clip": 0.01165782, + "auxiliary_loss_mlp": 0.01021926, + "balance_loss_clip": 1.04902065, + "balance_loss_mlp": 1.01531005, + "epoch": 0.8863103469007395, + "flos": 24423026699520.0, + "grad_norm": 1.8495088599531955, + "language_loss": 0.79438639, + "learning_rate": 1.339487035359166e-07, + "loss": 0.81626344, + "num_input_tokens_seen": 159427680, + "step": 7371, + "time_per_iteration": 3.2174251079559326 + }, + { + "auxiliary_loss_clip": 0.01141571, + "auxiliary_loss_mlp": 0.00761661, + "balance_loss_clip": 1.04682112, + "balance_loss_mlp": 1.0006907, + "epoch": 0.8864305897913786, + "flos": 22053964158720.0, + "grad_norm": 1.597455437237237, + "language_loss": 0.84979659, + "learning_rate": 1.336685633981409e-07, + "loss": 0.86882889, + "num_input_tokens_seen": 159448765, + "step": 7372, + "time_per_iteration": 3.2861554622650146 + }, + { + "auxiliary_loss_clip": 0.01156029, + "auxiliary_loss_mlp": 0.0102379, + "balance_loss_clip": 1.04496908, + "balance_loss_mlp": 1.01616359, + "epoch": 0.8865508326820177, + "flos": 19099449843840.0, + "grad_norm": 2.393551071163638, + "language_loss": 0.75068486, + "learning_rate": 1.333887063813597e-07, + "loss": 0.77248311, + "num_input_tokens_seen": 159466870, + "step": 7373, + "time_per_iteration": 2.4485280513763428 + }, + { + "auxiliary_loss_clip": 0.01139322, + "auxiliary_loss_mlp": 0.01021816, + "balance_loss_clip": 1.04176819, + "balance_loss_mlp": 1.01479506, + "epoch": 0.8866710755726568, + "flos": 15414189240960.0, + "grad_norm": 1.7334510513623052, + "language_loss": 0.65906167, + "learning_rate": 1.331091325280278e-07, + "loss": 0.680673, + "num_input_tokens_seen": 159485840, + "step": 7374, + "time_per_iteration": 2.4637110233306885 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01025358, + "balance_loss_clip": 1.04041338, + "balance_loss_mlp": 1.01790786, + "epoch": 0.8867913184632958, + "flos": 20083689388800.0, + "grad_norm": 1.5869549028558485, + "language_loss": 0.78619242, + "learning_rate": 1.3282984188055625e-07, + "loss": 0.80747998, + "num_input_tokens_seen": 159505630, + "step": 7375, + "time_per_iteration": 3.3116140365600586 + }, + { + "auxiliary_loss_clip": 0.01165356, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.04520607, + "balance_loss_mlp": 1.02407992, + "epoch": 0.8869115613539349, + "flos": 23365852588800.0, + "grad_norm": 1.8681797849803032, + "language_loss": 0.79563189, + "learning_rate": 1.3255083448131288e-07, + "loss": 0.81759524, + "num_input_tokens_seen": 159524675, + "step": 7376, + "time_per_iteration": 2.4327404499053955 + }, + { + "auxiliary_loss_clip": 0.01156094, + "auxiliary_loss_mlp": 0.01025941, + "balance_loss_clip": 1.04431033, + "balance_loss_mlp": 1.01851392, + "epoch": 0.8870318042445741, + "flos": 21286840371840.0, + "grad_norm": 2.2320715663946693, + "language_loss": 0.78522199, + "learning_rate": 1.3227211037262365e-07, + "loss": 0.80704236, + "num_input_tokens_seen": 159541915, + "step": 7377, + "time_per_iteration": 3.2543017864227295 + }, + { + "auxiliary_loss_clip": 0.01112433, + "auxiliary_loss_mlp": 0.01030738, + "balance_loss_clip": 1.03900743, + "balance_loss_mlp": 1.02284086, + "epoch": 0.8871520471352131, + "flos": 20010862563840.0, + "grad_norm": 2.5299780303748576, + "language_loss": 0.85412812, + "learning_rate": 1.319936695967696e-07, + "loss": 0.87555981, + "num_input_tokens_seen": 159559740, + "step": 7378, + "time_per_iteration": 2.5456631183624268 + }, + { + "auxiliary_loss_clip": 0.01174974, + "auxiliary_loss_mlp": 0.01024219, + "balance_loss_clip": 1.04869854, + "balance_loss_mlp": 1.01604748, + "epoch": 0.8872722900258522, + "flos": 22601422321920.0, + "grad_norm": 2.2894699093811908, + "language_loss": 0.82098043, + "learning_rate": 1.3171551219599097e-07, + "loss": 0.84297234, + "num_input_tokens_seen": 159578265, + "step": 7379, + "time_per_iteration": 2.4450671672821045 + }, + { + "auxiliary_loss_clip": 0.01168673, + "auxiliary_loss_mlp": 0.0102406, + "balance_loss_clip": 1.04939282, + "balance_loss_mlp": 1.01650834, + "epoch": 0.8873925329164913, + "flos": 22163276223360.0, + "grad_norm": 2.2641514125220734, + "language_loss": 0.77878988, + "learning_rate": 1.3143763821248377e-07, + "loss": 0.80071723, + "num_input_tokens_seen": 159595350, + "step": 7380, + "time_per_iteration": 2.429564952850342 + }, + { + "auxiliary_loss_clip": 0.01164302, + "auxiliary_loss_mlp": 0.01027756, + "balance_loss_clip": 1.04546475, + "balance_loss_mlp": 1.0208087, + "epoch": 0.8875127758071304, + "flos": 19208223204480.0, + "grad_norm": 2.0939792022138963, + "language_loss": 0.72096193, + "learning_rate": 1.3116004768840118e-07, + "loss": 0.74288249, + "num_input_tokens_seen": 159613725, + "step": 7381, + "time_per_iteration": 2.409867286682129 + }, + { + "auxiliary_loss_clip": 0.01168622, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.04647708, + "balance_loss_mlp": 1.01956105, + "epoch": 0.8876330186977694, + "flos": 18110900666880.0, + "grad_norm": 1.7113047893776026, + "language_loss": 0.74156475, + "learning_rate": 1.3088274066585348e-07, + "loss": 0.76352143, + "num_input_tokens_seen": 159631335, + "step": 7382, + "time_per_iteration": 2.41460919380188 + }, + { + "auxiliary_loss_clip": 0.01132479, + "auxiliary_loss_mlp": 0.01023351, + "balance_loss_clip": 1.04241705, + "balance_loss_mlp": 1.01647615, + "epoch": 0.8877532615884086, + "flos": 22009434272640.0, + "grad_norm": 2.0893222713479536, + "language_loss": 0.90294135, + "learning_rate": 1.3060571718690749e-07, + "loss": 0.92449963, + "num_input_tokens_seen": 159648830, + "step": 7383, + "time_per_iteration": 2.5223071575164795 + }, + { + "auxiliary_loss_clip": 0.0103392, + "auxiliary_loss_mlp": 0.00752691, + "balance_loss_clip": 1.00722361, + "balance_loss_mlp": 1.00041008, + "epoch": 0.8878735044790477, + "flos": 72136924346880.0, + "grad_norm": 0.7450133385266835, + "language_loss": 0.56879723, + "learning_rate": 1.3032897729358805e-07, + "loss": 0.58666337, + "num_input_tokens_seen": 159709785, + "step": 7384, + "time_per_iteration": 3.11431884765625 + }, + { + "auxiliary_loss_clip": 0.0108548, + "auxiliary_loss_mlp": 0.00762425, + "balance_loss_clip": 1.03577411, + "balance_loss_mlp": 1.00063252, + "epoch": 0.8879937473696867, + "flos": 27526355061120.0, + "grad_norm": 2.6002315832579406, + "language_loss": 0.79840124, + "learning_rate": 1.3005252102787645e-07, + "loss": 0.81688035, + "num_input_tokens_seen": 159728725, + "step": 7385, + "time_per_iteration": 2.6529507637023926 + }, + { + "auxiliary_loss_clip": 0.01156034, + "auxiliary_loss_mlp": 0.01025648, + "balance_loss_clip": 1.0455699, + "balance_loss_mlp": 1.01821494, + "epoch": 0.8881139902603259, + "flos": 22234091886720.0, + "grad_norm": 1.6086419503139429, + "language_loss": 0.73571682, + "learning_rate": 1.297763484317105e-07, + "loss": 0.75753361, + "num_input_tokens_seen": 159747020, + "step": 7386, + "time_per_iteration": 2.469247817993164 + }, + { + "auxiliary_loss_clip": 0.01109256, + "auxiliary_loss_mlp": 0.00762831, + "balance_loss_clip": 1.03923583, + "balance_loss_mlp": 1.00059783, + "epoch": 0.888234233150965, + "flos": 20299548170880.0, + "grad_norm": 21.57819370066847, + "language_loss": 0.7041471, + "learning_rate": 1.2950045954698551e-07, + "loss": 0.72286803, + "num_input_tokens_seen": 159764855, + "step": 7387, + "time_per_iteration": 2.5509190559387207 + }, + { + "auxiliary_loss_clip": 0.01117191, + "auxiliary_loss_mlp": 0.0102361, + "balance_loss_clip": 1.04297781, + "balance_loss_mlp": 1.01665711, + "epoch": 0.888354476041604, + "flos": 18147996437760.0, + "grad_norm": 1.7903512803658719, + "language_loss": 0.75432396, + "learning_rate": 1.2922485441555343e-07, + "loss": 0.77573192, + "num_input_tokens_seen": 159783935, + "step": 7388, + "time_per_iteration": 2.507554769515991 + }, + { + "auxiliary_loss_clip": 0.01164333, + "auxiliary_loss_mlp": 0.01021899, + "balance_loss_clip": 1.04506981, + "balance_loss_mlp": 1.01448989, + "epoch": 0.8884747189322432, + "flos": 22014282608640.0, + "grad_norm": 1.801259134611573, + "language_loss": 0.81610358, + "learning_rate": 1.2894953307922363e-07, + "loss": 0.83796585, + "num_input_tokens_seen": 159802895, + "step": 7389, + "time_per_iteration": 2.4406957626342773 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01026156, + "balance_loss_clip": 1.04197013, + "balance_loss_mlp": 1.01908112, + "epoch": 0.8885949618228822, + "flos": 19786779567360.0, + "grad_norm": 2.492466305475135, + "language_loss": 0.83958477, + "learning_rate": 1.2867449557976208e-07, + "loss": 0.86104178, + "num_input_tokens_seen": 159820995, + "step": 7390, + "time_per_iteration": 2.563081979751587 + }, + { + "auxiliary_loss_clip": 0.01151658, + "auxiliary_loss_mlp": 0.01026246, + "balance_loss_clip": 1.04697251, + "balance_loss_mlp": 1.01906681, + "epoch": 0.8887152047135213, + "flos": 20047599198720.0, + "grad_norm": 1.9957992359130128, + "language_loss": 0.75766367, + "learning_rate": 1.283997419588916e-07, + "loss": 0.77944267, + "num_input_tokens_seen": 159840465, + "step": 7391, + "time_per_iteration": 2.486354351043701 + }, + { + "auxiliary_loss_clip": 0.01157183, + "auxiliary_loss_mlp": 0.01024872, + "balance_loss_clip": 1.04556096, + "balance_loss_mlp": 1.01765037, + "epoch": 0.8888354476041604, + "flos": 18588117784320.0, + "grad_norm": 1.8765563675864634, + "language_loss": 0.61793852, + "learning_rate": 1.2812527225829216e-07, + "loss": 0.63975906, + "num_input_tokens_seen": 159858690, + "step": 7392, + "time_per_iteration": 2.43646502494812 + }, + { + "auxiliary_loss_clip": 0.01160625, + "auxiliary_loss_mlp": 0.01023203, + "balance_loss_clip": 1.04917264, + "balance_loss_mlp": 1.01479828, + "epoch": 0.8889556904947995, + "flos": 21689794120320.0, + "grad_norm": 2.1427836091179326, + "language_loss": 0.76476014, + "learning_rate": 1.2785108651960052e-07, + "loss": 0.78659844, + "num_input_tokens_seen": 159880325, + "step": 7393, + "time_per_iteration": 2.4937798976898193 + }, + { + "auxiliary_loss_clip": 0.01156679, + "auxiliary_loss_mlp": 0.01022877, + "balance_loss_clip": 1.04586458, + "balance_loss_mlp": 1.0154984, + "epoch": 0.8890759333854386, + "flos": 27381204201600.0, + "grad_norm": 2.2025011248332174, + "language_loss": 0.80393076, + "learning_rate": 1.2757718478441094e-07, + "loss": 0.82572633, + "num_input_tokens_seen": 159901070, + "step": 7394, + "time_per_iteration": 2.507291078567505 + }, + { + "auxiliary_loss_clip": 0.01138375, + "auxiliary_loss_mlp": 0.01020471, + "balance_loss_clip": 1.04283285, + "balance_loss_mlp": 1.01356912, + "epoch": 0.8891961762760777, + "flos": 24498834353280.0, + "grad_norm": 2.4954016256573817, + "language_loss": 0.77470911, + "learning_rate": 1.2730356709427302e-07, + "loss": 0.79629761, + "num_input_tokens_seen": 159919750, + "step": 7395, + "time_per_iteration": 2.5235209465026855 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.046942, + "balance_loss_mlp": 1.02141058, + "epoch": 0.8893164191667168, + "flos": 41499770895360.0, + "grad_norm": 3.140251589928309, + "language_loss": 0.59922242, + "learning_rate": 1.2703023349069542e-07, + "loss": 0.62102902, + "num_input_tokens_seen": 159944600, + "step": 7396, + "time_per_iteration": 2.6222829818725586 + }, + { + "auxiliary_loss_clip": 0.01150175, + "auxiliary_loss_mlp": 0.0102297, + "balance_loss_clip": 1.04586279, + "balance_loss_mlp": 1.01578736, + "epoch": 0.8894366620573558, + "flos": 33583623120000.0, + "grad_norm": 2.04605227293675, + "language_loss": 0.61551476, + "learning_rate": 1.2675718401514223e-07, + "loss": 0.63724619, + "num_input_tokens_seen": 159968780, + "step": 7397, + "time_per_iteration": 2.5623044967651367 + }, + { + "auxiliary_loss_clip": 0.01138675, + "auxiliary_loss_mlp": 0.01022825, + "balance_loss_clip": 1.04346871, + "balance_loss_mlp": 1.01547885, + "epoch": 0.889556904947995, + "flos": 16909832672640.0, + "grad_norm": 2.2087019239566663, + "language_loss": 0.74697673, + "learning_rate": 1.264844187090346e-07, + "loss": 0.76859176, + "num_input_tokens_seen": 159985905, + "step": 7398, + "time_per_iteration": 2.4616124629974365 + }, + { + "auxiliary_loss_clip": 0.01134186, + "auxiliary_loss_mlp": 0.0102461, + "balance_loss_clip": 1.04252565, + "balance_loss_mlp": 1.01730204, + "epoch": 0.889677147838634, + "flos": 26030855283840.0, + "grad_norm": 1.6254442675665486, + "language_loss": 0.7494024, + "learning_rate": 1.262119376137516e-07, + "loss": 0.77099037, + "num_input_tokens_seen": 160006965, + "step": 7399, + "time_per_iteration": 4.0892791748046875 + }, + { + "auxiliary_loss_clip": 0.0114357, + "auxiliary_loss_mlp": 0.01021613, + "balance_loss_clip": 1.0426991, + "balance_loss_mlp": 1.01459777, + "epoch": 0.8897973907292731, + "flos": 26468283110400.0, + "grad_norm": 1.8798290790044325, + "language_loss": 0.85209107, + "learning_rate": 1.2593974077062707e-07, + "loss": 0.87374288, + "num_input_tokens_seen": 160028585, + "step": 7400, + "time_per_iteration": 2.489576816558838 + }, + { + "auxiliary_loss_clip": 0.01116713, + "auxiliary_loss_mlp": 0.0102594, + "balance_loss_clip": 1.04117346, + "balance_loss_mlp": 1.01857901, + "epoch": 0.8899176336199123, + "flos": 26249694894720.0, + "grad_norm": 1.524444900540369, + "language_loss": 0.63654023, + "learning_rate": 1.2566782822095423e-07, + "loss": 0.65796673, + "num_input_tokens_seen": 160048840, + "step": 7401, + "time_per_iteration": 3.3170604705810547 + }, + { + "auxiliary_loss_clip": 0.01133269, + "auxiliary_loss_mlp": 0.01026345, + "balance_loss_clip": 1.04625535, + "balance_loss_mlp": 1.01901972, + "epoch": 0.8900378765105513, + "flos": 20811742156800.0, + "grad_norm": 1.794475972552434, + "language_loss": 0.71347725, + "learning_rate": 1.2539620000598162e-07, + "loss": 0.73507339, + "num_input_tokens_seen": 160068175, + "step": 7402, + "time_per_iteration": 2.544241189956665 + }, + { + "auxiliary_loss_clip": 0.01167667, + "auxiliary_loss_mlp": 0.01027031, + "balance_loss_clip": 1.04798758, + "balance_loss_mlp": 1.01973891, + "epoch": 0.8901581194011904, + "flos": 16472333018880.0, + "grad_norm": 2.2891329264318805, + "language_loss": 0.79443431, + "learning_rate": 1.2512485616691492e-07, + "loss": 0.81638128, + "num_input_tokens_seen": 160085230, + "step": 7403, + "time_per_iteration": 2.3922884464263916 + }, + { + "auxiliary_loss_clip": 0.01125136, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.04137945, + "balance_loss_mlp": 1.02304411, + "epoch": 0.8902783622918296, + "flos": 35155253773440.0, + "grad_norm": 1.4628026390654727, + "language_loss": 0.81032574, + "learning_rate": 1.2485379674491681e-07, + "loss": 0.83188629, + "num_input_tokens_seen": 160111425, + "step": 7404, + "time_per_iteration": 3.4156012535095215 + }, + { + "auxiliary_loss_clip": 0.01139825, + "auxiliary_loss_mlp": 0.01026778, + "balance_loss_clip": 1.04644895, + "balance_loss_mlp": 1.01944995, + "epoch": 0.8903986051824686, + "flos": 17201068145280.0, + "grad_norm": 2.4248463856397118, + "language_loss": 0.78963703, + "learning_rate": 1.2458302178110657e-07, + "loss": 0.81130308, + "num_input_tokens_seen": 160129790, + "step": 7405, + "time_per_iteration": 2.455798864364624 + }, + { + "auxiliary_loss_clip": 0.0111363, + "auxiliary_loss_mlp": 0.01020272, + "balance_loss_clip": 1.04010963, + "balance_loss_mlp": 1.0134052, + "epoch": 0.8905188480731077, + "flos": 25483863997440.0, + "grad_norm": 1.9194225776729414, + "language_loss": 0.82208145, + "learning_rate": 1.2431253131656118e-07, + "loss": 0.84342045, + "num_input_tokens_seen": 160149265, + "step": 7406, + "time_per_iteration": 2.5640387535095215 + }, + { + "auxiliary_loss_clip": 0.01130587, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.04266119, + "balance_loss_mlp": 1.01802278, + "epoch": 0.8906390909637467, + "flos": 23365888502400.0, + "grad_norm": 4.964329447262334, + "language_loss": 0.76756471, + "learning_rate": 1.240423253923133e-07, + "loss": 0.78912777, + "num_input_tokens_seen": 160168870, + "step": 7407, + "time_per_iteration": 2.4877445697784424 + }, + { + "auxiliary_loss_clip": 0.01154264, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.04540658, + "balance_loss_mlp": 1.01954412, + "epoch": 0.8907593338543859, + "flos": 21068790860160.0, + "grad_norm": 2.0157047379972073, + "language_loss": 0.69415069, + "learning_rate": 1.237724040493533e-07, + "loss": 0.71596438, + "num_input_tokens_seen": 160187495, + "step": 7408, + "time_per_iteration": 2.4683520793914795 + }, + { + "auxiliary_loss_clip": 0.01174665, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.05062377, + "balance_loss_mlp": 1.02437091, + "epoch": 0.8908795767450249, + "flos": 21869562712320.0, + "grad_norm": 2.5350501051224077, + "language_loss": 0.72957933, + "learning_rate": 1.2350276732862773e-07, + "loss": 0.75164938, + "num_input_tokens_seen": 160208520, + "step": 7409, + "time_per_iteration": 2.434563636779785 + }, + { + "auxiliary_loss_clip": 0.01053747, + "auxiliary_loss_mlp": 0.01001852, + "balance_loss_clip": 1.00711465, + "balance_loss_mlp": 1.00077903, + "epoch": 0.890999819635664, + "flos": 66307869348480.0, + "grad_norm": 0.8297630076655466, + "language_loss": 0.56715524, + "learning_rate": 1.2323341527103993e-07, + "loss": 0.58771122, + "num_input_tokens_seen": 160263720, + "step": 7410, + "time_per_iteration": 2.965568780899048 + }, + { + "auxiliary_loss_clip": 0.01166266, + "auxiliary_loss_mlp": 0.0102673, + "balance_loss_clip": 1.04692447, + "balance_loss_mlp": 1.01969647, + "epoch": 0.8911200625263032, + "flos": 26869908055680.0, + "grad_norm": 2.284601855552799, + "language_loss": 0.84845793, + "learning_rate": 1.2296434791745135e-07, + "loss": 0.87038785, + "num_input_tokens_seen": 160282170, + "step": 7411, + "time_per_iteration": 2.4676764011383057 + }, + { + "auxiliary_loss_clip": 0.01157376, + "auxiliary_loss_mlp": 0.01022986, + "balance_loss_clip": 1.0462712, + "balance_loss_mlp": 1.01547885, + "epoch": 0.8912403054169422, + "flos": 20885825957760.0, + "grad_norm": 1.6823399266161354, + "language_loss": 0.76654994, + "learning_rate": 1.2269556530867875e-07, + "loss": 0.78835356, + "num_input_tokens_seen": 160300725, + "step": 7412, + "time_per_iteration": 2.451011896133423 + }, + { + "auxiliary_loss_clip": 0.01173488, + "auxiliary_loss_mlp": 0.01027478, + "balance_loss_clip": 1.04921556, + "balance_loss_mlp": 1.01906776, + "epoch": 0.8913605483075813, + "flos": 27016567286400.0, + "grad_norm": 2.618810490799991, + "language_loss": 0.82079768, + "learning_rate": 1.2242706748549614e-07, + "loss": 0.84280741, + "num_input_tokens_seen": 160318720, + "step": 7413, + "time_per_iteration": 2.4778783321380615 + }, + { + "auxiliary_loss_clip": 0.01137995, + "auxiliary_loss_mlp": 0.0102041, + "balance_loss_clip": 1.03998899, + "balance_loss_mlp": 1.01327229, + "epoch": 0.8914807911982204, + "flos": 23621500661760.0, + "grad_norm": 1.8563474331367313, + "language_loss": 0.81897271, + "learning_rate": 1.2215885448863473e-07, + "loss": 0.84055674, + "num_input_tokens_seen": 160339595, + "step": 7414, + "time_per_iteration": 2.567824363708496 + }, + { + "auxiliary_loss_clip": 0.01139721, + "auxiliary_loss_mlp": 0.01026474, + "balance_loss_clip": 1.04557467, + "balance_loss_mlp": 1.01966393, + "epoch": 0.8916010340888595, + "flos": 24462277286400.0, + "grad_norm": 1.8552645195085022, + "language_loss": 0.80380356, + "learning_rate": 1.2189092635878152e-07, + "loss": 0.82546556, + "num_input_tokens_seen": 160361045, + "step": 7415, + "time_per_iteration": 2.5069491863250732 + }, + { + "auxiliary_loss_clip": 0.01115373, + "auxiliary_loss_mlp": 0.01024201, + "balance_loss_clip": 1.04124439, + "balance_loss_mlp": 1.01660752, + "epoch": 0.8917212769794985, + "flos": 21215773313280.0, + "grad_norm": 1.9879952093251831, + "language_loss": 0.77427626, + "learning_rate": 1.216232831365822e-07, + "loss": 0.79567206, + "num_input_tokens_seen": 160379990, + "step": 7416, + "time_per_iteration": 2.5228335857391357 + }, + { + "auxiliary_loss_clip": 0.01145359, + "auxiliary_loss_mlp": 0.01026449, + "balance_loss_clip": 1.04558849, + "balance_loss_mlp": 1.01876271, + "epoch": 0.8918415198701377, + "flos": 25513992529920.0, + "grad_norm": 1.973386362880644, + "language_loss": 0.80712914, + "learning_rate": 1.2135592486263678e-07, + "loss": 0.82884729, + "num_input_tokens_seen": 160399240, + "step": 7417, + "time_per_iteration": 2.5268094539642334 + }, + { + "auxiliary_loss_clip": 0.0113822, + "auxiliary_loss_mlp": 0.01026596, + "balance_loss_clip": 1.04279196, + "balance_loss_mlp": 1.01943731, + "epoch": 0.8919617627607768, + "flos": 37853006693760.0, + "grad_norm": 1.5584384674977865, + "language_loss": 0.61063468, + "learning_rate": 1.2108885157750415e-07, + "loss": 0.63228285, + "num_input_tokens_seen": 160421600, + "step": 7418, + "time_per_iteration": 2.6339428424835205 + }, + { + "auxiliary_loss_clip": 0.01123349, + "auxiliary_loss_mlp": 0.00761766, + "balance_loss_clip": 1.04506135, + "balance_loss_mlp": 1.00064206, + "epoch": 0.8920820056514158, + "flos": 26213676531840.0, + "grad_norm": 2.4142118824753447, + "language_loss": 0.80449665, + "learning_rate": 1.2082206332169897e-07, + "loss": 0.82334775, + "num_input_tokens_seen": 160441695, + "step": 7419, + "time_per_iteration": 2.5832650661468506 + }, + { + "auxiliary_loss_clip": 0.01134273, + "auxiliary_loss_mlp": 0.01025589, + "balance_loss_clip": 1.04563379, + "balance_loss_mlp": 1.01765513, + "epoch": 0.892202248542055, + "flos": 17383135207680.0, + "grad_norm": 3.1783574462972637, + "language_loss": 0.73165262, + "learning_rate": 1.2055556013569225e-07, + "loss": 0.75325131, + "num_input_tokens_seen": 160457205, + "step": 7420, + "time_per_iteration": 2.456822633743286 + }, + { + "auxiliary_loss_clip": 0.01142332, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.04448247, + "balance_loss_mlp": 1.0180614, + "epoch": 0.892322491432694, + "flos": 21324223451520.0, + "grad_norm": 1.6371491983947797, + "language_loss": 0.82104754, + "learning_rate": 1.2028934205991315e-07, + "loss": 0.84272337, + "num_input_tokens_seen": 160476525, + "step": 7421, + "time_per_iteration": 2.5037317276000977 + }, + { + "auxiliary_loss_clip": 0.01152937, + "auxiliary_loss_mlp": 0.01024361, + "balance_loss_clip": 1.04413295, + "balance_loss_mlp": 1.01668119, + "epoch": 0.8924427343233331, + "flos": 24029374573440.0, + "grad_norm": 2.844990711544825, + "language_loss": 0.76800466, + "learning_rate": 1.2002340913474607e-07, + "loss": 0.78977764, + "num_input_tokens_seen": 160500160, + "step": 7422, + "time_per_iteration": 2.545806407928467 + }, + { + "auxiliary_loss_clip": 0.01168758, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.04716313, + "balance_loss_mlp": 1.01911688, + "epoch": 0.8925629772139723, + "flos": 30008069631360.0, + "grad_norm": 2.885559915371445, + "language_loss": 0.74062645, + "learning_rate": 1.1975776140053317e-07, + "loss": 0.76258576, + "num_input_tokens_seen": 160520130, + "step": 7423, + "time_per_iteration": 2.514486074447632 + }, + { + "auxiliary_loss_clip": 0.01112743, + "auxiliary_loss_mlp": 0.01025473, + "balance_loss_clip": 1.04154682, + "balance_loss_mlp": 1.0177213, + "epoch": 0.8926832201046113, + "flos": 22601709630720.0, + "grad_norm": 1.9698788583358022, + "language_loss": 0.73148167, + "learning_rate": 1.194923988975729e-07, + "loss": 0.75286388, + "num_input_tokens_seen": 160539730, + "step": 7424, + "time_per_iteration": 2.5673859119415283 + }, + { + "auxiliary_loss_clip": 0.01120937, + "auxiliary_loss_mlp": 0.01020713, + "balance_loss_clip": 1.04237342, + "balance_loss_mlp": 1.01230919, + "epoch": 0.8928034629952504, + "flos": 13297722117120.0, + "grad_norm": 2.2229386798960737, + "language_loss": 0.73536915, + "learning_rate": 1.192273216661206e-07, + "loss": 0.75678563, + "num_input_tokens_seen": 160557820, + "step": 7425, + "time_per_iteration": 3.2604598999023438 + }, + { + "auxiliary_loss_clip": 0.01011652, + "auxiliary_loss_mlp": 0.01001746, + "balance_loss_clip": 1.00703037, + "balance_loss_mlp": 1.0007689, + "epoch": 0.8929237058858895, + "flos": 54854556744960.0, + "grad_norm": 0.7635403439457834, + "language_loss": 0.57450205, + "learning_rate": 1.189625297463881e-07, + "loss": 0.59463602, + "num_input_tokens_seen": 160619510, + "step": 7426, + "time_per_iteration": 3.888928174972534 + }, + { + "auxiliary_loss_clip": 0.01092773, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.03833973, + "balance_loss_mlp": 1.01818967, + "epoch": 0.8930439487765286, + "flos": 28883850785280.0, + "grad_norm": 1.772171773923728, + "language_loss": 0.79602784, + "learning_rate": 1.1869802317854394e-07, + "loss": 0.81720996, + "num_input_tokens_seen": 160643295, + "step": 7427, + "time_per_iteration": 2.701374053955078 + }, + { + "auxiliary_loss_clip": 0.01115786, + "auxiliary_loss_mlp": 0.01023383, + "balance_loss_clip": 1.04155231, + "balance_loss_mlp": 1.01582778, + "epoch": 0.8931641916671677, + "flos": 22419283432320.0, + "grad_norm": 2.505065085727083, + "language_loss": 0.72039276, + "learning_rate": 1.1843380200271425e-07, + "loss": 0.74178445, + "num_input_tokens_seen": 160662495, + "step": 7428, + "time_per_iteration": 3.259114980697632 + }, + { + "auxiliary_loss_clip": 0.01119673, + "auxiliary_loss_mlp": 0.01018484, + "balance_loss_clip": 1.04289842, + "balance_loss_mlp": 1.01081276, + "epoch": 0.8932844345578068, + "flos": 25843149786240.0, + "grad_norm": 1.7383370737544463, + "language_loss": 0.80495358, + "learning_rate": 1.181698662589805e-07, + "loss": 0.82633513, + "num_input_tokens_seen": 160682080, + "step": 7429, + "time_per_iteration": 2.5609099864959717 + }, + { + "auxiliary_loss_clip": 0.01153241, + "auxiliary_loss_mlp": 0.01026582, + "balance_loss_clip": 1.04466999, + "balance_loss_mlp": 1.01892865, + "epoch": 0.8934046774484459, + "flos": 22925803069440.0, + "grad_norm": 1.8788434368957736, + "language_loss": 0.76101273, + "learning_rate": 1.1790621598738249e-07, + "loss": 0.78281093, + "num_input_tokens_seen": 160700395, + "step": 7430, + "time_per_iteration": 2.4717624187469482 + }, + { + "auxiliary_loss_clip": 0.01164917, + "auxiliary_loss_mlp": 0.01024487, + "balance_loss_clip": 1.04737186, + "balance_loss_mlp": 1.01785302, + "epoch": 0.8935249203390849, + "flos": 24462097718400.0, + "grad_norm": 3.2713201570923207, + "language_loss": 0.74919331, + "learning_rate": 1.1764285122791461e-07, + "loss": 0.77108741, + "num_input_tokens_seen": 160721115, + "step": 7431, + "time_per_iteration": 3.2282261848449707 + }, + { + "auxiliary_loss_clip": 0.01151606, + "auxiliary_loss_mlp": 0.01021601, + "balance_loss_clip": 1.04213524, + "balance_loss_mlp": 1.01439786, + "epoch": 0.8936451632297241, + "flos": 15742735966080.0, + "grad_norm": 1.8201986144801403, + "language_loss": 0.77070105, + "learning_rate": 1.173797720205294e-07, + "loss": 0.79243308, + "num_input_tokens_seen": 160739150, + "step": 7432, + "time_per_iteration": 2.431166410446167 + }, + { + "auxiliary_loss_clip": 0.01156021, + "auxiliary_loss_mlp": 0.01028485, + "balance_loss_clip": 1.04715037, + "balance_loss_mlp": 1.02043509, + "epoch": 0.8937654061203631, + "flos": 35115500396160.0, + "grad_norm": 2.3029185382578508, + "language_loss": 0.71623325, + "learning_rate": 1.1711697840513602e-07, + "loss": 0.73807836, + "num_input_tokens_seen": 160758585, + "step": 7433, + "time_per_iteration": 2.5717546939849854 + }, + { + "auxiliary_loss_clip": 0.01144853, + "auxiliary_loss_mlp": 0.0102391, + "balance_loss_clip": 1.04251432, + "balance_loss_mlp": 1.01653719, + "epoch": 0.8938856490110022, + "flos": 16107444708480.0, + "grad_norm": 2.7526445132056288, + "language_loss": 0.70577073, + "learning_rate": 1.1685447042160012e-07, + "loss": 0.72745836, + "num_input_tokens_seen": 160776620, + "step": 7434, + "time_per_iteration": 2.4610936641693115 + }, + { + "auxiliary_loss_clip": 0.01170157, + "auxiliary_loss_mlp": 0.01027444, + "balance_loss_clip": 1.04778767, + "balance_loss_mlp": 1.01989186, + "epoch": 0.8940058919016414, + "flos": 20704189858560.0, + "grad_norm": 2.0940095344187406, + "language_loss": 0.71497583, + "learning_rate": 1.1659224810974367e-07, + "loss": 0.73695183, + "num_input_tokens_seen": 160796580, + "step": 7435, + "time_per_iteration": 2.4341869354248047 + }, + { + "auxiliary_loss_clip": 0.01138555, + "auxiliary_loss_mlp": 0.01024919, + "balance_loss_clip": 1.04583943, + "balance_loss_mlp": 1.01742387, + "epoch": 0.8941261347922804, + "flos": 25229041937280.0, + "grad_norm": 2.6759962518509774, + "language_loss": 0.68334919, + "learning_rate": 1.1633031150934591e-07, + "loss": 0.70498395, + "num_input_tokens_seen": 160819610, + "step": 7436, + "time_per_iteration": 2.6037685871124268 + }, + { + "auxiliary_loss_clip": 0.01156168, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.04790461, + "balance_loss_mlp": 1.02357757, + "epoch": 0.8942463776829195, + "flos": 19537236806400.0, + "grad_norm": 1.872565109186289, + "language_loss": 0.79467511, + "learning_rate": 1.1606866066014176e-07, + "loss": 0.81654859, + "num_input_tokens_seen": 160838660, + "step": 7437, + "time_per_iteration": 2.4481379985809326 + }, + { + "auxiliary_loss_clip": 0.01122826, + "auxiliary_loss_mlp": 0.01023907, + "balance_loss_clip": 1.04346251, + "balance_loss_mlp": 1.01628971, + "epoch": 0.8943666205735585, + "flos": 22301567585280.0, + "grad_norm": 2.437993321651224, + "language_loss": 0.75193417, + "learning_rate": 1.1580729560182434e-07, + "loss": 0.7734015, + "num_input_tokens_seen": 160854515, + "step": 7438, + "time_per_iteration": 2.5338168144226074 + }, + { + "auxiliary_loss_clip": 0.01168931, + "auxiliary_loss_mlp": 0.00762272, + "balance_loss_clip": 1.04820538, + "balance_loss_mlp": 1.00061321, + "epoch": 0.8944868634641977, + "flos": 18912893581440.0, + "grad_norm": 2.095694392194354, + "language_loss": 0.70972657, + "learning_rate": 1.1554621637404171e-07, + "loss": 0.7290386, + "num_input_tokens_seen": 160872605, + "step": 7439, + "time_per_iteration": 2.4663138389587402 + }, + { + "auxiliary_loss_clip": 0.01156454, + "auxiliary_loss_mlp": 0.01020717, + "balance_loss_clip": 1.04665613, + "balance_loss_mlp": 1.01337934, + "epoch": 0.8946071063548368, + "flos": 14460904241280.0, + "grad_norm": 2.371260816809581, + "language_loss": 0.61038452, + "learning_rate": 1.1528542301639999e-07, + "loss": 0.63215619, + "num_input_tokens_seen": 160889395, + "step": 7440, + "time_per_iteration": 2.4941580295562744 + }, + { + "auxiliary_loss_clip": 0.01127112, + "auxiliary_loss_mlp": 0.01019759, + "balance_loss_clip": 1.04050052, + "balance_loss_mlp": 1.01284134, + "epoch": 0.8947273492454758, + "flos": 20084084438400.0, + "grad_norm": 2.5513670028655047, + "language_loss": 0.82723725, + "learning_rate": 1.1502491556846105e-07, + "loss": 0.84870589, + "num_input_tokens_seen": 160907890, + "step": 7441, + "time_per_iteration": 2.503692150115967 + }, + { + "auxiliary_loss_clip": 0.01140347, + "auxiliary_loss_mlp": 0.01025978, + "balance_loss_clip": 1.04480207, + "balance_loss_mlp": 1.0186826, + "epoch": 0.894847592136115, + "flos": 18550555136640.0, + "grad_norm": 2.3773858035298403, + "language_loss": 0.81362367, + "learning_rate": 1.1476469406974331e-07, + "loss": 0.83528686, + "num_input_tokens_seen": 160923490, + "step": 7442, + "time_per_iteration": 2.4775238037109375 + }, + { + "auxiliary_loss_clip": 0.01166509, + "auxiliary_loss_mlp": 0.01027007, + "balance_loss_clip": 1.04793596, + "balance_loss_mlp": 1.02002418, + "epoch": 0.894967835026754, + "flos": 23478468704640.0, + "grad_norm": 1.659363070308815, + "language_loss": 0.77199137, + "learning_rate": 1.1450475855972341e-07, + "loss": 0.7939266, + "num_input_tokens_seen": 160944280, + "step": 7443, + "time_per_iteration": 2.4394636154174805 + }, + { + "auxiliary_loss_clip": 0.01138212, + "auxiliary_loss_mlp": 0.00762387, + "balance_loss_clip": 1.04223466, + "balance_loss_mlp": 1.00060666, + "epoch": 0.8950880779173931, + "flos": 15188310564480.0, + "grad_norm": 1.934174244433168, + "language_loss": 0.70731318, + "learning_rate": 1.1424510907783158e-07, + "loss": 0.72631919, + "num_input_tokens_seen": 160961560, + "step": 7444, + "time_per_iteration": 2.5068490505218506 + }, + { + "auxiliary_loss_clip": 0.01142474, + "auxiliary_loss_mlp": 0.01026969, + "balance_loss_clip": 1.0425601, + "balance_loss_mlp": 1.02014995, + "epoch": 0.8952083208080323, + "flos": 22091957769600.0, + "grad_norm": 1.6867283865722358, + "language_loss": 0.82539356, + "learning_rate": 1.1398574566345787e-07, + "loss": 0.84708798, + "num_input_tokens_seen": 160982195, + "step": 7445, + "time_per_iteration": 2.5007164478302 + }, + { + "auxiliary_loss_clip": 0.01142385, + "auxiliary_loss_mlp": 0.01023423, + "balance_loss_clip": 1.04128921, + "balance_loss_mlp": 1.0157634, + "epoch": 0.8953285636986713, + "flos": 23254026572160.0, + "grad_norm": 2.3385399583908373, + "language_loss": 0.82609391, + "learning_rate": 1.1372666835594702e-07, + "loss": 0.84775198, + "num_input_tokens_seen": 161000520, + "step": 7446, + "time_per_iteration": 2.5431675910949707 + }, + { + "auxiliary_loss_clip": 0.01138998, + "auxiliary_loss_mlp": 0.01021808, + "balance_loss_clip": 1.04486752, + "balance_loss_mlp": 1.01509094, + "epoch": 0.8954488065893104, + "flos": 16362661818240.0, + "grad_norm": 1.9635575208924345, + "language_loss": 0.71729922, + "learning_rate": 1.1346787719460071e-07, + "loss": 0.73890728, + "num_input_tokens_seen": 161019405, + "step": 7447, + "time_per_iteration": 2.466113805770874 + }, + { + "auxiliary_loss_clip": 0.0113946, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.04558456, + "balance_loss_mlp": 1.02061653, + "epoch": 0.8955690494799495, + "flos": 18257883120000.0, + "grad_norm": 1.796947602327518, + "language_loss": 0.72480601, + "learning_rate": 1.1320937221867732e-07, + "loss": 0.74648058, + "num_input_tokens_seen": 161036985, + "step": 7448, + "time_per_iteration": 2.502088785171509 + }, + { + "auxiliary_loss_clip": 0.01137536, + "auxiliary_loss_mlp": 0.01022155, + "balance_loss_clip": 1.04276967, + "balance_loss_mlp": 1.01570606, + "epoch": 0.8956892923705886, + "flos": 25447486498560.0, + "grad_norm": 2.0201014529300734, + "language_loss": 0.79535413, + "learning_rate": 1.1295115346739192e-07, + "loss": 0.81695104, + "num_input_tokens_seen": 161056985, + "step": 7449, + "time_per_iteration": 2.534937620162964 + }, + { + "auxiliary_loss_clip": 0.01142937, + "auxiliary_loss_mlp": 0.01026114, + "balance_loss_clip": 1.04555821, + "balance_loss_mlp": 1.01881278, + "epoch": 0.8958095352612276, + "flos": 52661883939840.0, + "grad_norm": 2.312103204824123, + "language_loss": 0.72793537, + "learning_rate": 1.1269322097991629e-07, + "loss": 0.74962592, + "num_input_tokens_seen": 161080270, + "step": 7450, + "time_per_iteration": 2.7983410358428955 + }, + { + "auxiliary_loss_clip": 0.01159081, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.04896879, + "balance_loss_mlp": 1.01834846, + "epoch": 0.8959297781518668, + "flos": 23186335392000.0, + "grad_norm": 2.13071466255611, + "language_loss": 0.68177712, + "learning_rate": 1.1243557479537846e-07, + "loss": 0.70363092, + "num_input_tokens_seen": 161100160, + "step": 7451, + "time_per_iteration": 3.2611303329467773 + }, + { + "auxiliary_loss_clip": 0.01165798, + "auxiliary_loss_mlp": 0.01021124, + "balance_loss_clip": 1.04498887, + "balance_loss_mlp": 1.01374233, + "epoch": 0.8960500210425059, + "flos": 20334309557760.0, + "grad_norm": 3.607117019472578, + "language_loss": 0.68461329, + "learning_rate": 1.121782149528634e-07, + "loss": 0.70648247, + "num_input_tokens_seen": 161117260, + "step": 7452, + "time_per_iteration": 3.2269232273101807 + }, + { + "auxiliary_loss_clip": 0.01142081, + "auxiliary_loss_mlp": 0.01018675, + "balance_loss_clip": 1.04522514, + "balance_loss_mlp": 1.01159692, + "epoch": 0.8961702639331449, + "flos": 19901694153600.0, + "grad_norm": 2.0473039198619705, + "language_loss": 0.78457558, + "learning_rate": 1.1192114149141208e-07, + "loss": 0.80618316, + "num_input_tokens_seen": 161136895, + "step": 7453, + "time_per_iteration": 2.4855098724365234 + }, + { + "auxiliary_loss_clip": 0.01143275, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.04316044, + "balance_loss_mlp": 1.02052629, + "epoch": 0.8962905068237841, + "flos": 12896348567040.0, + "grad_norm": 2.2545343198548857, + "language_loss": 0.65043402, + "learning_rate": 1.1166435445002197e-07, + "loss": 0.67215109, + "num_input_tokens_seen": 161154565, + "step": 7454, + "time_per_iteration": 3.204529047012329 + }, + { + "auxiliary_loss_clip": 0.01156966, + "auxiliary_loss_mlp": 0.01025674, + "balance_loss_clip": 1.04670167, + "balance_loss_mlp": 1.017851, + "epoch": 0.8964107497144231, + "flos": 23440331439360.0, + "grad_norm": 2.619624727781681, + "language_loss": 0.67936754, + "learning_rate": 1.1140785386764818e-07, + "loss": 0.70119387, + "num_input_tokens_seen": 161173265, + "step": 7455, + "time_per_iteration": 2.464535713195801 + }, + { + "auxiliary_loss_clip": 0.01148771, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.04476953, + "balance_loss_mlp": 1.0191431, + "epoch": 0.8965309926050622, + "flos": 19500176949120.0, + "grad_norm": 2.0732874092651965, + "language_loss": 0.69341463, + "learning_rate": 1.1115163978320153e-07, + "loss": 0.71517169, + "num_input_tokens_seen": 161191995, + "step": 7456, + "time_per_iteration": 2.451580286026001 + }, + { + "auxiliary_loss_clip": 0.01158998, + "auxiliary_loss_mlp": 0.00762459, + "balance_loss_clip": 1.04640353, + "balance_loss_mlp": 1.0005933, + "epoch": 0.8966512354957014, + "flos": 28658008022400.0, + "grad_norm": 1.9682957341615581, + "language_loss": 0.82402259, + "learning_rate": 1.1089571223554917e-07, + "loss": 0.84323716, + "num_input_tokens_seen": 161212880, + "step": 7457, + "time_per_iteration": 3.291071653366089 + }, + { + "auxiliary_loss_clip": 0.01154702, + "auxiliary_loss_mlp": 0.01024583, + "balance_loss_clip": 1.04462433, + "balance_loss_mlp": 1.0170188, + "epoch": 0.8967714783863404, + "flos": 23370916406400.0, + "grad_norm": 3.8595202914122853, + "language_loss": 0.85375875, + "learning_rate": 1.1064007126351537e-07, + "loss": 0.87555158, + "num_input_tokens_seen": 161233595, + "step": 7458, + "time_per_iteration": 2.489938259124756 + }, + { + "auxiliary_loss_clip": 0.01135406, + "auxiliary_loss_mlp": 0.01022928, + "balance_loss_clip": 1.04495382, + "balance_loss_mlp": 1.01554346, + "epoch": 0.8968917212769795, + "flos": 24535175938560.0, + "grad_norm": 2.18535424437314, + "language_loss": 0.76240367, + "learning_rate": 1.1038471690588003e-07, + "loss": 0.78398705, + "num_input_tokens_seen": 161252740, + "step": 7459, + "time_per_iteration": 2.513246774673462 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.01024963, + "balance_loss_clip": 1.04465365, + "balance_loss_mlp": 1.01772988, + "epoch": 0.8970119641676186, + "flos": 23475416048640.0, + "grad_norm": 2.1891274414166797, + "language_loss": 0.7995826, + "learning_rate": 1.1012964920138145e-07, + "loss": 0.82094705, + "num_input_tokens_seen": 161272325, + "step": 7460, + "time_per_iteration": 2.57185697555542 + }, + { + "auxiliary_loss_clip": 0.01132577, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.04013658, + "balance_loss_mlp": 1.01890361, + "epoch": 0.8971322070582577, + "flos": 24538192680960.0, + "grad_norm": 1.589976756073563, + "language_loss": 0.75847507, + "learning_rate": 1.0987486818871205e-07, + "loss": 0.78006035, + "num_input_tokens_seen": 161295915, + "step": 7461, + "time_per_iteration": 2.619816780090332 + }, + { + "auxiliary_loss_clip": 0.01153783, + "auxiliary_loss_mlp": 0.00762421, + "balance_loss_clip": 1.04540229, + "balance_loss_mlp": 1.00066304, + "epoch": 0.8972524499488967, + "flos": 21797454159360.0, + "grad_norm": 2.180413159575725, + "language_loss": 0.7331717, + "learning_rate": 1.0962037390652245e-07, + "loss": 0.75233376, + "num_input_tokens_seen": 161314935, + "step": 7462, + "time_per_iteration": 2.45479154586792 + }, + { + "auxiliary_loss_clip": 0.01138494, + "auxiliary_loss_mlp": 0.01024621, + "balance_loss_clip": 1.04387653, + "balance_loss_mlp": 1.01720357, + "epoch": 0.8973726928395359, + "flos": 21726243446400.0, + "grad_norm": 1.946855835748748, + "language_loss": 0.72067374, + "learning_rate": 1.0936616639341911e-07, + "loss": 0.74230492, + "num_input_tokens_seen": 161335225, + "step": 7463, + "time_per_iteration": 2.5017812252044678 + }, + { + "auxiliary_loss_clip": 0.01048472, + "auxiliary_loss_mlp": 0.010068, + "balance_loss_clip": 1.0091356, + "balance_loss_mlp": 1.00582886, + "epoch": 0.897492935730175, + "flos": 53837100097920.0, + "grad_norm": 0.7355110833793101, + "language_loss": 0.54717159, + "learning_rate": 1.0911224568796473e-07, + "loss": 0.56772429, + "num_input_tokens_seen": 161393420, + "step": 7464, + "time_per_iteration": 3.0704543590545654 + }, + { + "auxiliary_loss_clip": 0.01153933, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.04756308, + "balance_loss_mlp": 1.02427626, + "epoch": 0.897613178620814, + "flos": 18290346036480.0, + "grad_norm": 1.8314608794628733, + "language_loss": 0.70915514, + "learning_rate": 1.0885861182867984e-07, + "loss": 0.73100793, + "num_input_tokens_seen": 161411525, + "step": 7465, + "time_per_iteration": 2.4488158226013184 + }, + { + "auxiliary_loss_clip": 0.01142826, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.0436461, + "balance_loss_mlp": 1.02004886, + "epoch": 0.8977334215114532, + "flos": 32993718059520.0, + "grad_norm": 1.980000549972533, + "language_loss": 0.70677495, + "learning_rate": 1.0860526485403942e-07, + "loss": 0.72847962, + "num_input_tokens_seen": 161432800, + "step": 7466, + "time_per_iteration": 2.5926225185394287 + }, + { + "auxiliary_loss_clip": 0.01167381, + "auxiliary_loss_mlp": 0.01024187, + "balance_loss_clip": 1.04714906, + "balance_loss_mlp": 1.01718903, + "epoch": 0.8978536644020922, + "flos": 15195636938880.0, + "grad_norm": 1.652185279341757, + "language_loss": 0.77265203, + "learning_rate": 1.0835220480247675e-07, + "loss": 0.7945677, + "num_input_tokens_seen": 161451295, + "step": 7467, + "time_per_iteration": 2.414560317993164 + }, + { + "auxiliary_loss_clip": 0.01138291, + "auxiliary_loss_mlp": 0.01026346, + "balance_loss_clip": 1.04477894, + "balance_loss_mlp": 1.01891875, + "epoch": 0.8979739072927313, + "flos": 18004389863040.0, + "grad_norm": 2.0370523683675055, + "language_loss": 0.84224844, + "learning_rate": 1.0809943171238067e-07, + "loss": 0.8638947, + "num_input_tokens_seen": 161469220, + "step": 7468, + "time_per_iteration": 2.4664456844329834 + }, + { + "auxiliary_loss_clip": 0.01147336, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.04567075, + "balance_loss_mlp": 1.02150071, + "epoch": 0.8980941501833704, + "flos": 22271546793600.0, + "grad_norm": 2.2140306402831085, + "language_loss": 0.6300866, + "learning_rate": 1.078469456220965e-07, + "loss": 0.65186095, + "num_input_tokens_seen": 161489375, + "step": 7469, + "time_per_iteration": 2.528177499771118 + }, + { + "auxiliary_loss_clip": 0.01153385, + "auxiliary_loss_mlp": 0.01024397, + "balance_loss_clip": 1.04361963, + "balance_loss_mlp": 1.0169251, + "epoch": 0.8982143930740095, + "flos": 37560729726720.0, + "grad_norm": 2.4771915521427528, + "language_loss": 0.69786018, + "learning_rate": 1.0759474656992606e-07, + "loss": 0.71963799, + "num_input_tokens_seen": 161512145, + "step": 7470, + "time_per_iteration": 2.5917084217071533 + }, + { + "auxiliary_loss_clip": 0.01143984, + "auxiliary_loss_mlp": 0.01025833, + "balance_loss_clip": 1.04292488, + "balance_loss_mlp": 1.01811445, + "epoch": 0.8983346359646486, + "flos": 18076893465600.0, + "grad_norm": 2.4653886131279843, + "language_loss": 0.78153151, + "learning_rate": 1.0734283459412785e-07, + "loss": 0.80322969, + "num_input_tokens_seen": 161528995, + "step": 7471, + "time_per_iteration": 2.482351064682007 + }, + { + "auxiliary_loss_clip": 0.01114729, + "auxiliary_loss_mlp": 0.0102761, + "balance_loss_clip": 1.04110157, + "balance_loss_mlp": 1.01928318, + "epoch": 0.8984548788552876, + "flos": 20558895344640.0, + "grad_norm": 1.9121021083336944, + "language_loss": 0.80575848, + "learning_rate": 1.0709120973291707e-07, + "loss": 0.82718182, + "num_input_tokens_seen": 161548775, + "step": 7472, + "time_per_iteration": 2.5761332511901855 + }, + { + "auxiliary_loss_clip": 0.01170281, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.0479368, + "balance_loss_mlp": 1.02002668, + "epoch": 0.8985751217459268, + "flos": 17785442511360.0, + "grad_norm": 2.013073053229545, + "language_loss": 0.77481848, + "learning_rate": 1.0683987202446475e-07, + "loss": 0.79680037, + "num_input_tokens_seen": 161566960, + "step": 7473, + "time_per_iteration": 2.4046192169189453 + }, + { + "auxiliary_loss_clip": 0.01156555, + "auxiliary_loss_mlp": 0.01021525, + "balance_loss_clip": 1.04534125, + "balance_loss_mlp": 1.01406837, + "epoch": 0.8986953646365659, + "flos": 21617003208960.0, + "grad_norm": 1.8378907430604072, + "language_loss": 0.69983363, + "learning_rate": 1.0658882150689862e-07, + "loss": 0.72161448, + "num_input_tokens_seen": 161585820, + "step": 7474, + "time_per_iteration": 2.450406789779663 + }, + { + "auxiliary_loss_clip": 0.01130092, + "auxiliary_loss_mlp": 0.01023943, + "balance_loss_clip": 1.04363775, + "balance_loss_mlp": 1.01627791, + "epoch": 0.8988156075272049, + "flos": 14027355083520.0, + "grad_norm": 2.334734537841935, + "language_loss": 0.77516782, + "learning_rate": 1.0633805821830288e-07, + "loss": 0.79670823, + "num_input_tokens_seen": 161602505, + "step": 7475, + "time_per_iteration": 2.4986226558685303 + }, + { + "auxiliary_loss_clip": 0.0114116, + "auxiliary_loss_mlp": 0.01025346, + "balance_loss_clip": 1.04514563, + "balance_loss_mlp": 1.01742458, + "epoch": 0.8989358504178441, + "flos": 29059202004480.0, + "grad_norm": 3.87003735027866, + "language_loss": 0.83018863, + "learning_rate": 1.0608758219671753e-07, + "loss": 0.85185373, + "num_input_tokens_seen": 161621545, + "step": 7476, + "time_per_iteration": 2.5478601455688477 + }, + { + "auxiliary_loss_clip": 0.01144102, + "auxiliary_loss_mlp": 0.0102125, + "balance_loss_clip": 1.04403448, + "balance_loss_mlp": 1.01436901, + "epoch": 0.8990560933084831, + "flos": 20230420446720.0, + "grad_norm": 1.6307525054418994, + "language_loss": 0.70706922, + "learning_rate": 1.0583739348014065e-07, + "loss": 0.72872269, + "num_input_tokens_seen": 161642630, + "step": 7477, + "time_per_iteration": 2.5312917232513428 + }, + { + "auxiliary_loss_clip": 0.01169245, + "auxiliary_loss_mlp": 0.0102445, + "balance_loss_clip": 1.04958773, + "balance_loss_mlp": 1.01718986, + "epoch": 0.8991763361991222, + "flos": 25520672459520.0, + "grad_norm": 1.8631860272225649, + "language_loss": 0.84614086, + "learning_rate": 1.0558749210652518e-07, + "loss": 0.86807787, + "num_input_tokens_seen": 161662560, + "step": 7478, + "time_per_iteration": 3.9073450565338135 + }, + { + "auxiliary_loss_clip": 0.011302, + "auxiliary_loss_mlp": 0.01021932, + "balance_loss_clip": 1.04384899, + "balance_loss_mlp": 1.01485109, + "epoch": 0.8992965790897613, + "flos": 25119191168640.0, + "grad_norm": 1.7394046311183315, + "language_loss": 0.85209298, + "learning_rate": 1.053378781137808e-07, + "loss": 0.87361425, + "num_input_tokens_seen": 161683480, + "step": 7479, + "time_per_iteration": 2.552726984024048 + }, + { + "auxiliary_loss_clip": 0.01142679, + "auxiliary_loss_mlp": 0.0102657, + "balance_loss_clip": 1.04427111, + "balance_loss_mlp": 1.01902413, + "epoch": 0.8994168219804004, + "flos": 16070815814400.0, + "grad_norm": 2.29169726738745, + "language_loss": 0.77782017, + "learning_rate": 1.0508855153977392e-07, + "loss": 0.79951262, + "num_input_tokens_seen": 161699945, + "step": 7480, + "time_per_iteration": 2.4476003646850586 + }, + { + "auxiliary_loss_clip": 0.01156203, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.04484165, + "balance_loss_mlp": 1.0213654, + "epoch": 0.8995370648710395, + "flos": 24825764966400.0, + "grad_norm": 2.6152140375878874, + "language_loss": 0.67078817, + "learning_rate": 1.0483951242232669e-07, + "loss": 0.6926409, + "num_input_tokens_seen": 161720420, + "step": 7481, + "time_per_iteration": 3.252464532852173 + }, + { + "auxiliary_loss_clip": 0.01063075, + "auxiliary_loss_mlp": 0.01001835, + "balance_loss_clip": 1.00722206, + "balance_loss_mlp": 1.00086904, + "epoch": 0.8996573077616786, + "flos": 63116238378240.0, + "grad_norm": 0.9751521316970158, + "language_loss": 0.57756341, + "learning_rate": 1.0459076079921936e-07, + "loss": 0.59821248, + "num_input_tokens_seen": 161773080, + "step": 7482, + "time_per_iteration": 3.0508599281311035 + }, + { + "auxiliary_loss_clip": 0.01134286, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.0439477, + "balance_loss_mlp": 1.02219892, + "epoch": 0.8997775506523177, + "flos": 18219674027520.0, + "grad_norm": 2.2451708641428705, + "language_loss": 0.85041869, + "learning_rate": 1.0434229670818618e-07, + "loss": 0.87206489, + "num_input_tokens_seen": 161789755, + "step": 7483, + "time_per_iteration": 2.4608266353607178 + }, + { + "auxiliary_loss_clip": 0.01131571, + "auxiliary_loss_mlp": 0.01023433, + "balance_loss_clip": 1.0422169, + "balance_loss_mlp": 1.01586902, + "epoch": 0.8998977935429567, + "flos": 24166768095360.0, + "grad_norm": 1.446359350685097, + "language_loss": 0.79885775, + "learning_rate": 1.0409412018691944e-07, + "loss": 0.82040775, + "num_input_tokens_seen": 161810220, + "step": 7484, + "time_per_iteration": 3.1845691204071045 + }, + { + "auxiliary_loss_clip": 0.0113638, + "auxiliary_loss_mlp": 0.01024605, + "balance_loss_clip": 1.04458559, + "balance_loss_mlp": 1.01723766, + "epoch": 0.9000180364335959, + "flos": 20773030273920.0, + "grad_norm": 1.7785806992330773, + "language_loss": 0.74833721, + "learning_rate": 1.0384623127306724e-07, + "loss": 0.76994711, + "num_input_tokens_seen": 161827565, + "step": 7485, + "time_per_iteration": 2.476339340209961 + }, + { + "auxiliary_loss_clip": 0.01122816, + "auxiliary_loss_mlp": 0.01025903, + "balance_loss_clip": 1.04109347, + "balance_loss_mlp": 1.01894152, + "epoch": 0.900138279324235, + "flos": 19205745166080.0, + "grad_norm": 1.7876560674517339, + "language_loss": 0.7933346, + "learning_rate": 1.0359863000423397e-07, + "loss": 0.81482184, + "num_input_tokens_seen": 161845700, + "step": 7486, + "time_per_iteration": 2.4994113445281982 + }, + { + "auxiliary_loss_clip": 0.01169239, + "auxiliary_loss_mlp": 0.01024285, + "balance_loss_clip": 1.04822397, + "balance_loss_mlp": 1.01716852, + "epoch": 0.900258522214874, + "flos": 28731158069760.0, + "grad_norm": 1.6718493120816262, + "language_loss": 0.71290994, + "learning_rate": 1.0335131641798112e-07, + "loss": 0.73484522, + "num_input_tokens_seen": 161867660, + "step": 7487, + "time_per_iteration": 2.4739151000976562 + }, + { + "auxiliary_loss_clip": 0.01041923, + "auxiliary_loss_mlp": 0.01000328, + "balance_loss_clip": 1.00681996, + "balance_loss_mlp": 0.99930924, + "epoch": 0.9003787651055132, + "flos": 58280685655680.0, + "grad_norm": 0.9982033625698457, + "language_loss": 0.55665976, + "learning_rate": 1.0310429055182512e-07, + "loss": 0.57708228, + "num_input_tokens_seen": 161921980, + "step": 7488, + "time_per_iteration": 2.9019582271575928 + }, + { + "auxiliary_loss_clip": 0.01127439, + "auxiliary_loss_mlp": 0.01027618, + "balance_loss_clip": 1.04257977, + "balance_loss_mlp": 1.02031624, + "epoch": 0.9004990079961522, + "flos": 25556475340800.0, + "grad_norm": 1.9739696636953579, + "language_loss": 0.73972243, + "learning_rate": 1.0285755244324024e-07, + "loss": 0.76127303, + "num_input_tokens_seen": 161942725, + "step": 7489, + "time_per_iteration": 2.5952131748199463 + }, + { + "auxiliary_loss_clip": 0.01141065, + "auxiliary_loss_mlp": 0.00761707, + "balance_loss_clip": 1.04165006, + "balance_loss_mlp": 1.00055146, + "epoch": 0.9006192508867913, + "flos": 23335185352320.0, + "grad_norm": 2.068751875384326, + "language_loss": 0.68481946, + "learning_rate": 1.0261110212965629e-07, + "loss": 0.70384717, + "num_input_tokens_seen": 161964520, + "step": 7490, + "time_per_iteration": 2.536444902420044 + }, + { + "auxiliary_loss_clip": 0.01142035, + "auxiliary_loss_mlp": 0.01024432, + "balance_loss_clip": 1.04539311, + "balance_loss_mlp": 1.01755667, + "epoch": 0.9007394937774305, + "flos": 18040300485120.0, + "grad_norm": 2.5867850548672457, + "language_loss": 0.79151618, + "learning_rate": 1.023649396484596e-07, + "loss": 0.8131808, + "num_input_tokens_seen": 161983575, + "step": 7491, + "time_per_iteration": 2.4618587493896484 + }, + { + "auxiliary_loss_clip": 0.01167792, + "auxiliary_loss_mlp": 0.0102677, + "balance_loss_clip": 1.04661179, + "balance_loss_mlp": 1.01980472, + "epoch": 0.9008597366680695, + "flos": 43068456633600.0, + "grad_norm": 1.9433675385652829, + "language_loss": 0.67541468, + "learning_rate": 1.0211906503699275e-07, + "loss": 0.69736028, + "num_input_tokens_seen": 162006550, + "step": 7492, + "time_per_iteration": 2.619664430618286 + }, + { + "auxiliary_loss_clip": 0.01158486, + "auxiliary_loss_mlp": 0.01025693, + "balance_loss_clip": 1.04890311, + "balance_loss_mlp": 1.01739037, + "epoch": 0.9009799795587086, + "flos": 14939055112320.0, + "grad_norm": 6.303371702064974, + "language_loss": 0.82660317, + "learning_rate": 1.0187347833255455e-07, + "loss": 0.84844494, + "num_input_tokens_seen": 162022455, + "step": 7493, + "time_per_iteration": 2.419987916946411 + }, + { + "auxiliary_loss_clip": 0.01166535, + "auxiliary_loss_mlp": 0.01026902, + "balance_loss_clip": 1.04820323, + "balance_loss_mlp": 1.01973128, + "epoch": 0.9011002224493477, + "flos": 21579584215680.0, + "grad_norm": 1.809566074713343, + "language_loss": 0.79078817, + "learning_rate": 1.0162817957240056e-07, + "loss": 0.81272256, + "num_input_tokens_seen": 162042350, + "step": 7494, + "time_per_iteration": 2.4185807704925537 + }, + { + "auxiliary_loss_clip": 0.01053352, + "auxiliary_loss_mlp": 0.01000706, + "balance_loss_clip": 1.00701439, + "balance_loss_mlp": 0.9996565, + "epoch": 0.9012204653399868, + "flos": 71166367883520.0, + "grad_norm": 0.9703938280569944, + "language_loss": 0.62991118, + "learning_rate": 1.0138316879374253e-07, + "loss": 0.65045178, + "num_input_tokens_seen": 162111640, + "step": 7495, + "time_per_iteration": 3.179142475128174 + }, + { + "auxiliary_loss_clip": 0.01144441, + "auxiliary_loss_mlp": 0.01021966, + "balance_loss_clip": 1.04779077, + "balance_loss_mlp": 1.01459563, + "epoch": 0.9013407082306258, + "flos": 15594963413760.0, + "grad_norm": 2.397982989267322, + "language_loss": 0.73877203, + "learning_rate": 1.0113844603374833e-07, + "loss": 0.76043606, + "num_input_tokens_seen": 162128165, + "step": 7496, + "time_per_iteration": 2.4597349166870117 + }, + { + "auxiliary_loss_clip": 0.01140467, + "auxiliary_loss_mlp": 0.01024165, + "balance_loss_clip": 1.04267406, + "balance_loss_mlp": 1.01589751, + "epoch": 0.901460951121265, + "flos": 15049157276160.0, + "grad_norm": 2.2659863894246826, + "language_loss": 0.71825391, + "learning_rate": 1.0089401132954178e-07, + "loss": 0.73990023, + "num_input_tokens_seen": 162146145, + "step": 7497, + "time_per_iteration": 2.4608070850372314 + }, + { + "auxiliary_loss_clip": 0.01140762, + "auxiliary_loss_mlp": 0.01023734, + "balance_loss_clip": 1.04659343, + "balance_loss_mlp": 1.01684034, + "epoch": 0.9015811940119041, + "flos": 22236857233920.0, + "grad_norm": 1.8360068319718819, + "language_loss": 0.72280157, + "learning_rate": 1.006498647182037e-07, + "loss": 0.74444652, + "num_input_tokens_seen": 162164800, + "step": 7498, + "time_per_iteration": 2.5012319087982178 + }, + { + "auxiliary_loss_clip": 0.01094455, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.03852916, + "balance_loss_mlp": 1.02342057, + "epoch": 0.9017014369025431, + "flos": 24973824827520.0, + "grad_norm": 2.076669650742831, + "language_loss": 0.71439272, + "learning_rate": 1.004060062367713e-07, + "loss": 0.73564821, + "num_input_tokens_seen": 162185895, + "step": 7499, + "time_per_iteration": 2.6170780658721924 + }, + { + "auxiliary_loss_clip": 0.01155833, + "auxiliary_loss_mlp": 0.01023652, + "balance_loss_clip": 1.04558146, + "balance_loss_mlp": 1.01578987, + "epoch": 0.9018216797931822, + "flos": 18114168804480.0, + "grad_norm": 1.9703947209891877, + "language_loss": 0.69498992, + "learning_rate": 1.0016243592223728e-07, + "loss": 0.71678472, + "num_input_tokens_seen": 162206295, + "step": 7500, + "time_per_iteration": 2.4527032375335693 + }, + { + "auxiliary_loss_clip": 0.01095113, + "auxiliary_loss_mlp": 0.01022888, + "balance_loss_clip": 1.03965271, + "balance_loss_mlp": 1.01544011, + "epoch": 0.9019419226838213, + "flos": 37268452759680.0, + "grad_norm": 1.8800412685652304, + "language_loss": 0.65795141, + "learning_rate": 9.991915381155114e-08, + "loss": 0.67913139, + "num_input_tokens_seen": 162229275, + "step": 7501, + "time_per_iteration": 2.765042781829834 + }, + { + "auxiliary_loss_clip": 0.01157405, + "auxiliary_loss_mlp": 0.01023055, + "balance_loss_clip": 1.0460639, + "balance_loss_mlp": 1.01558077, + "epoch": 0.9020621655744604, + "flos": 23441121538560.0, + "grad_norm": 2.4624322291358434, + "language_loss": 0.74894655, + "learning_rate": 9.967615994161871e-08, + "loss": 0.77075124, + "num_input_tokens_seen": 162248935, + "step": 7502, + "time_per_iteration": 2.48335599899292 + }, + { + "auxiliary_loss_clip": 0.01167237, + "auxiliary_loss_mlp": 0.01019558, + "balance_loss_clip": 1.04651451, + "balance_loss_mlp": 1.01236999, + "epoch": 0.9021824084650995, + "flos": 22857465444480.0, + "grad_norm": 1.8840937688781694, + "language_loss": 0.77936471, + "learning_rate": 9.943345434930161e-08, + "loss": 0.8012327, + "num_input_tokens_seen": 162269185, + "step": 7503, + "time_per_iteration": 2.4477124214172363 + }, + { + "auxiliary_loss_clip": 0.01127433, + "auxiliary_loss_mlp": 0.01025971, + "balance_loss_clip": 1.04572034, + "balance_loss_mlp": 1.01858568, + "epoch": 0.9023026513557386, + "flos": 22127581082880.0, + "grad_norm": 1.9527315048575342, + "language_loss": 0.68894815, + "learning_rate": 9.919103707141885e-08, + "loss": 0.71048212, + "num_input_tokens_seen": 162288065, + "step": 7504, + "time_per_iteration": 3.29168701171875 + }, + { + "auxiliary_loss_clip": 0.01152848, + "auxiliary_loss_mlp": 0.01024966, + "balance_loss_clip": 1.04547787, + "balance_loss_mlp": 1.01719952, + "epoch": 0.9024228942463777, + "flos": 24199087357440.0, + "grad_norm": 1.969227268959718, + "language_loss": 0.76490772, + "learning_rate": 9.89489081447441e-08, + "loss": 0.78668582, + "num_input_tokens_seen": 162305265, + "step": 7505, + "time_per_iteration": 3.2232251167297363 + }, + { + "auxiliary_loss_clip": 0.01139618, + "auxiliary_loss_mlp": 0.01023153, + "balance_loss_clip": 1.04286695, + "balance_loss_mlp": 1.01544595, + "epoch": 0.9025431371370167, + "flos": 25008262992000.0, + "grad_norm": 2.579532175774338, + "language_loss": 0.82806182, + "learning_rate": 9.870706760600844e-08, + "loss": 0.84968948, + "num_input_tokens_seen": 162325215, + "step": 7506, + "time_per_iteration": 2.515490770339966 + }, + { + "auxiliary_loss_clip": 0.01120799, + "auxiliary_loss_mlp": 0.01026148, + "balance_loss_clip": 1.04731715, + "balance_loss_mlp": 1.01842928, + "epoch": 0.9026633800276559, + "flos": 18952862440320.0, + "grad_norm": 1.9531577401095015, + "language_loss": 0.72483552, + "learning_rate": 9.846551549189918e-08, + "loss": 0.74630493, + "num_input_tokens_seen": 162344820, + "step": 7507, + "time_per_iteration": 2.5364110469818115 + }, + { + "auxiliary_loss_clip": 0.01137588, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.04437542, + "balance_loss_mlp": 1.01784277, + "epoch": 0.902783622918295, + "flos": 32416059536640.0, + "grad_norm": 2.4322601560763206, + "language_loss": 0.68295652, + "learning_rate": 9.822425183905902e-08, + "loss": 0.70458722, + "num_input_tokens_seen": 162365345, + "step": 7508, + "time_per_iteration": 3.338870048522949 + }, + { + "auxiliary_loss_clip": 0.01032459, + "auxiliary_loss_mlp": 0.01000979, + "balance_loss_clip": 1.00695491, + "balance_loss_mlp": 0.99994773, + "epoch": 0.902903865808934, + "flos": 63717453244800.0, + "grad_norm": 0.9075134129601353, + "language_loss": 0.75151765, + "learning_rate": 9.798327668408823e-08, + "loss": 0.77185202, + "num_input_tokens_seen": 162426980, + "step": 7509, + "time_per_iteration": 3.200796365737915 + }, + { + "auxiliary_loss_clip": 0.01170786, + "auxiliary_loss_mlp": 0.01027977, + "balance_loss_clip": 1.04685295, + "balance_loss_mlp": 1.02007365, + "epoch": 0.9030241086995732, + "flos": 23804034600960.0, + "grad_norm": 2.068858058066389, + "language_loss": 0.6894874, + "learning_rate": 9.774259006354158e-08, + "loss": 0.71147507, + "num_input_tokens_seen": 162447050, + "step": 7510, + "time_per_iteration": 2.4622485637664795 + }, + { + "auxiliary_loss_clip": 0.01145113, + "auxiliary_loss_mlp": 0.0102576, + "balance_loss_clip": 1.04387283, + "balance_loss_mlp": 1.0183692, + "epoch": 0.9031443515902122, + "flos": 26395887248640.0, + "grad_norm": 1.9666830637044788, + "language_loss": 0.76129806, + "learning_rate": 9.750219201393184e-08, + "loss": 0.78300679, + "num_input_tokens_seen": 162467015, + "step": 7511, + "time_per_iteration": 3.28875994682312 + }, + { + "auxiliary_loss_clip": 0.01152229, + "auxiliary_loss_mlp": 0.01020042, + "balance_loss_clip": 1.04488504, + "balance_loss_mlp": 1.01264834, + "epoch": 0.9032645944808513, + "flos": 24939350749440.0, + "grad_norm": 5.840958640146394, + "language_loss": 0.77775908, + "learning_rate": 9.726208257172697e-08, + "loss": 0.79948175, + "num_input_tokens_seen": 162488710, + "step": 7512, + "time_per_iteration": 2.5514774322509766 + }, + { + "auxiliary_loss_clip": 0.01167212, + "auxiliary_loss_mlp": 0.01022325, + "balance_loss_clip": 1.04689503, + "balance_loss_mlp": 1.01476717, + "epoch": 0.9033848373714904, + "flos": 21178821196800.0, + "grad_norm": 1.9246519085461173, + "language_loss": 0.74749744, + "learning_rate": 9.702226177335115e-08, + "loss": 0.76939279, + "num_input_tokens_seen": 162507205, + "step": 7513, + "time_per_iteration": 2.410644054412842 + }, + { + "auxiliary_loss_clip": 0.01141997, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.04708505, + "balance_loss_mlp": 1.02236915, + "epoch": 0.9035050802621295, + "flos": 26286359702400.0, + "grad_norm": 1.666635071776909, + "language_loss": 0.72351444, + "learning_rate": 9.67827296551853e-08, + "loss": 0.74524081, + "num_input_tokens_seen": 162528490, + "step": 7514, + "time_per_iteration": 2.5546743869781494 + }, + { + "auxiliary_loss_clip": 0.01131468, + "auxiliary_loss_mlp": 0.00762272, + "balance_loss_clip": 1.0410639, + "balance_loss_mlp": 1.00066876, + "epoch": 0.9036253231527686, + "flos": 24204546224640.0, + "grad_norm": 2.311566923091549, + "language_loss": 0.68624938, + "learning_rate": 9.65434862535659e-08, + "loss": 0.70518672, + "num_input_tokens_seen": 162547860, + "step": 7515, + "time_per_iteration": 2.5298755168914795 + }, + { + "auxiliary_loss_clip": 0.01140536, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.04273295, + "balance_loss_mlp": 1.02221394, + "epoch": 0.9037455660434077, + "flos": 18072655660800.0, + "grad_norm": 6.486773873803285, + "language_loss": 0.64949232, + "learning_rate": 9.630453160478635e-08, + "loss": 0.67119622, + "num_input_tokens_seen": 162563215, + "step": 7516, + "time_per_iteration": 2.462792158126831 + }, + { + "auxiliary_loss_clip": 0.01114105, + "auxiliary_loss_mlp": 0.01024457, + "balance_loss_clip": 1.04223311, + "balance_loss_mlp": 1.01701784, + "epoch": 0.9038658089340468, + "flos": 24060795995520.0, + "grad_norm": 1.83137114393229, + "language_loss": 0.82553911, + "learning_rate": 9.60658657450959e-08, + "loss": 0.84692466, + "num_input_tokens_seen": 162583515, + "step": 7517, + "time_per_iteration": 2.58362078666687 + }, + { + "auxiliary_loss_clip": 0.01125106, + "auxiliary_loss_mlp": 0.01024572, + "balance_loss_clip": 1.03963351, + "balance_loss_mlp": 1.01731503, + "epoch": 0.9039860518246858, + "flos": 21834298535040.0, + "grad_norm": 1.5371463033916737, + "language_loss": 0.79198819, + "learning_rate": 9.582748871069979e-08, + "loss": 0.81348503, + "num_input_tokens_seen": 162602955, + "step": 7518, + "time_per_iteration": 2.51606822013855 + }, + { + "auxiliary_loss_clip": 0.0114199, + "auxiliary_loss_mlp": 0.0076154, + "balance_loss_clip": 1.04355717, + "balance_loss_mlp": 1.00061321, + "epoch": 0.904106294715325, + "flos": 26614870513920.0, + "grad_norm": 2.1215857694285143, + "language_loss": 0.8370502, + "learning_rate": 9.558940053775954e-08, + "loss": 0.85608554, + "num_input_tokens_seen": 162621595, + "step": 7519, + "time_per_iteration": 2.5291664600372314 + }, + { + "auxiliary_loss_clip": 0.01153749, + "auxiliary_loss_mlp": 0.01026132, + "balance_loss_clip": 1.04653382, + "balance_loss_mlp": 1.01874721, + "epoch": 0.904226537605964, + "flos": 17785693906560.0, + "grad_norm": 1.9785362444003276, + "language_loss": 0.6828714, + "learning_rate": 9.535160126239294e-08, + "loss": 0.70467025, + "num_input_tokens_seen": 162638220, + "step": 7520, + "time_per_iteration": 2.44897723197937 + }, + { + "auxiliary_loss_clip": 0.01152121, + "auxiliary_loss_mlp": 0.01025899, + "balance_loss_clip": 1.04603446, + "balance_loss_mlp": 1.01861501, + "epoch": 0.9043467804966031, + "flos": 24790428961920.0, + "grad_norm": 1.7141149980643737, + "language_loss": 0.70384598, + "learning_rate": 9.511409092067424e-08, + "loss": 0.72562623, + "num_input_tokens_seen": 162658575, + "step": 7521, + "time_per_iteration": 2.4861838817596436 + }, + { + "auxiliary_loss_clip": 0.01142208, + "auxiliary_loss_mlp": 0.01022294, + "balance_loss_clip": 1.04656863, + "balance_loss_mlp": 1.0149684, + "epoch": 0.9044670233872423, + "flos": 22632125472000.0, + "grad_norm": 1.858793125397998, + "language_loss": 0.67469102, + "learning_rate": 9.487686954863327e-08, + "loss": 0.69633603, + "num_input_tokens_seen": 162678295, + "step": 7522, + "time_per_iteration": 2.509335517883301 + }, + { + "auxiliary_loss_clip": 0.0115355, + "auxiliary_loss_mlp": 0.01023479, + "balance_loss_clip": 1.04669893, + "balance_loss_mlp": 1.01630878, + "epoch": 0.9045872662778813, + "flos": 23771320289280.0, + "grad_norm": 2.5151446862364653, + "language_loss": 0.77184308, + "learning_rate": 9.46399371822566e-08, + "loss": 0.79361331, + "num_input_tokens_seen": 162698070, + "step": 7523, + "time_per_iteration": 2.4762980937957764 + }, + { + "auxiliary_loss_clip": 0.01169265, + "auxiliary_loss_mlp": 0.01024107, + "balance_loss_clip": 1.04823887, + "balance_loss_mlp": 1.01643634, + "epoch": 0.9047075091685204, + "flos": 15191039998080.0, + "grad_norm": 3.2833600092622346, + "language_loss": 0.72346842, + "learning_rate": 9.440329385748657e-08, + "loss": 0.74540222, + "num_input_tokens_seen": 162715140, + "step": 7524, + "time_per_iteration": 2.39872145652771 + }, + { + "auxiliary_loss_clip": 0.01125514, + "auxiliary_loss_mlp": 0.01017282, + "balance_loss_clip": 1.04399002, + "balance_loss_mlp": 1.01076102, + "epoch": 0.9048277520591596, + "flos": 18003707504640.0, + "grad_norm": 2.162751211048667, + "language_loss": 0.70811832, + "learning_rate": 9.416693961022137e-08, + "loss": 0.72954631, + "num_input_tokens_seen": 162733390, + "step": 7525, + "time_per_iteration": 2.4975833892822266 + }, + { + "auxiliary_loss_clip": 0.01084363, + "auxiliary_loss_mlp": 0.01024077, + "balance_loss_clip": 1.03814173, + "balance_loss_mlp": 1.01667738, + "epoch": 0.9049479949497986, + "flos": 21872471713920.0, + "grad_norm": 1.987055477679091, + "language_loss": 0.76997256, + "learning_rate": 9.393087447631654e-08, + "loss": 0.79105699, + "num_input_tokens_seen": 162751670, + "step": 7526, + "time_per_iteration": 2.6181704998016357 + }, + { + "auxiliary_loss_clip": 0.01141099, + "auxiliary_loss_mlp": 0.01021475, + "balance_loss_clip": 1.04328668, + "balance_loss_mlp": 1.01462603, + "epoch": 0.9050682378404377, + "flos": 20773928113920.0, + "grad_norm": 1.698716490987202, + "language_loss": 0.72837192, + "learning_rate": 9.36950984915823e-08, + "loss": 0.74999768, + "num_input_tokens_seen": 162770025, + "step": 7527, + "time_per_iteration": 2.506044387817383 + }, + { + "auxiliary_loss_clip": 0.01170802, + "auxiliary_loss_mlp": 0.01025427, + "balance_loss_clip": 1.04984283, + "balance_loss_mlp": 1.01783288, + "epoch": 0.9051884807310768, + "flos": 21580015178880.0, + "grad_norm": 1.7653158997329197, + "language_loss": 0.69208878, + "learning_rate": 9.345961169178607e-08, + "loss": 0.71405113, + "num_input_tokens_seen": 162789710, + "step": 7528, + "time_per_iteration": 2.444981336593628 + }, + { + "auxiliary_loss_clip": 0.01113175, + "auxiliary_loss_mlp": 0.01025175, + "balance_loss_clip": 1.04525554, + "balance_loss_mlp": 1.01821899, + "epoch": 0.9053087236217159, + "flos": 21908059113600.0, + "grad_norm": 1.486251637750028, + "language_loss": 0.72977948, + "learning_rate": 9.322441411265081e-08, + "loss": 0.75116301, + "num_input_tokens_seen": 162810695, + "step": 7529, + "time_per_iteration": 2.519649028778076 + }, + { + "auxiliary_loss_clip": 0.01137103, + "auxiliary_loss_mlp": 0.01026525, + "balance_loss_clip": 1.04527771, + "balance_loss_mlp": 1.01918459, + "epoch": 0.9054289665123549, + "flos": 17055809544960.0, + "grad_norm": 1.7504273332872147, + "language_loss": 0.72875792, + "learning_rate": 9.298950578985554e-08, + "loss": 0.75039423, + "num_input_tokens_seen": 162827770, + "step": 7530, + "time_per_iteration": 2.466503381729126 + }, + { + "auxiliary_loss_clip": 0.01150439, + "auxiliary_loss_mlp": 0.00762582, + "balance_loss_clip": 1.04608798, + "balance_loss_mlp": 1.00059867, + "epoch": 0.905549209402994, + "flos": 20777268078720.0, + "grad_norm": 1.65653010678439, + "language_loss": 0.70831716, + "learning_rate": 9.275488675903665e-08, + "loss": 0.72744739, + "num_input_tokens_seen": 162846715, + "step": 7531, + "time_per_iteration": 3.253729820251465 + }, + { + "auxiliary_loss_clip": 0.01110052, + "auxiliary_loss_mlp": 0.0102387, + "balance_loss_clip": 1.0423944, + "balance_loss_mlp": 1.01652694, + "epoch": 0.9056694522936332, + "flos": 21686813291520.0, + "grad_norm": 2.061532485275794, + "language_loss": 0.73962098, + "learning_rate": 9.252055705578454e-08, + "loss": 0.76096016, + "num_input_tokens_seen": 162866215, + "step": 7532, + "time_per_iteration": 3.354048013687134 + }, + { + "auxiliary_loss_clip": 0.01152087, + "auxiliary_loss_mlp": 0.01024669, + "balance_loss_clip": 1.04370475, + "balance_loss_mlp": 1.01767111, + "epoch": 0.9057896951842722, + "flos": 29569133433600.0, + "grad_norm": 1.634598435220296, + "language_loss": 0.72020185, + "learning_rate": 9.228651671564747e-08, + "loss": 0.74196947, + "num_input_tokens_seen": 162888245, + "step": 7533, + "time_per_iteration": 2.5383460521698 + }, + { + "auxiliary_loss_clip": 0.01108129, + "auxiliary_loss_mlp": 0.01024142, + "balance_loss_clip": 1.04404569, + "balance_loss_mlp": 1.01711714, + "epoch": 0.9059099380749113, + "flos": 27892248952320.0, + "grad_norm": 1.5794755931091622, + "language_loss": 0.77857113, + "learning_rate": 9.205276577412901e-08, + "loss": 0.7998938, + "num_input_tokens_seen": 162911025, + "step": 7534, + "time_per_iteration": 2.6048240661621094 + }, + { + "auxiliary_loss_clip": 0.01146382, + "auxiliary_loss_mlp": 0.00762087, + "balance_loss_clip": 1.04320788, + "balance_loss_mlp": 1.00058055, + "epoch": 0.9060301809655504, + "flos": 17748993185280.0, + "grad_norm": 2.459759920993589, + "language_loss": 0.77106678, + "learning_rate": 9.181930426668905e-08, + "loss": 0.79015148, + "num_input_tokens_seen": 162927820, + "step": 7535, + "time_per_iteration": 3.2118759155273438 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01025929, + "balance_loss_clip": 1.04205596, + "balance_loss_mlp": 1.0191853, + "epoch": 0.9061504238561895, + "flos": 31759432963200.0, + "grad_norm": 1.5825074263306385, + "language_loss": 0.6745854, + "learning_rate": 9.158613222874346e-08, + "loss": 0.69592357, + "num_input_tokens_seen": 162949445, + "step": 7536, + "time_per_iteration": 2.629523992538452 + }, + { + "auxiliary_loss_clip": 0.01138026, + "auxiliary_loss_mlp": 0.0102148, + "balance_loss_clip": 1.04339516, + "balance_loss_mlp": 1.01429737, + "epoch": 0.9062706667468285, + "flos": 20048066075520.0, + "grad_norm": 1.5659477420899628, + "language_loss": 0.81848061, + "learning_rate": 9.135324969566394e-08, + "loss": 0.84007561, + "num_input_tokens_seen": 162968945, + "step": 7537, + "time_per_iteration": 3.3342010974884033 + }, + { + "auxiliary_loss_clip": 0.01158757, + "auxiliary_loss_mlp": 0.01022614, + "balance_loss_clip": 1.04733634, + "balance_loss_mlp": 1.01545238, + "epoch": 0.9063909096374677, + "flos": 18437292576000.0, + "grad_norm": 2.2345298803333224, + "language_loss": 0.7588892, + "learning_rate": 9.112065670277913e-08, + "loss": 0.78070283, + "num_input_tokens_seen": 162985310, + "step": 7538, + "time_per_iteration": 2.522016763687134 + }, + { + "auxiliary_loss_clip": 0.0113644, + "auxiliary_loss_mlp": 0.01021006, + "balance_loss_clip": 1.04238939, + "balance_loss_mlp": 1.01403832, + "epoch": 0.9065111525281068, + "flos": 33547353361920.0, + "grad_norm": 1.778730652820774, + "language_loss": 0.72769725, + "learning_rate": 9.088835328537303e-08, + "loss": 0.74927163, + "num_input_tokens_seen": 163006900, + "step": 7539, + "time_per_iteration": 2.5802993774414062 + }, + { + "auxiliary_loss_clip": 0.01144526, + "auxiliary_loss_mlp": 0.01022116, + "balance_loss_clip": 1.0458858, + "balance_loss_mlp": 1.01471305, + "epoch": 0.9066313954187458, + "flos": 23367863750400.0, + "grad_norm": 2.289992721547135, + "language_loss": 0.71374834, + "learning_rate": 9.065633947868568e-08, + "loss": 0.73541468, + "num_input_tokens_seen": 163026505, + "step": 7540, + "time_per_iteration": 2.50004506111145 + }, + { + "auxiliary_loss_clip": 0.01126419, + "auxiliary_loss_mlp": 0.00761998, + "balance_loss_clip": 1.04614735, + "balance_loss_mlp": 1.00056839, + "epoch": 0.906751638309385, + "flos": 26249623067520.0, + "grad_norm": 3.082398168815239, + "language_loss": 0.80325931, + "learning_rate": 9.042461531791379e-08, + "loss": 0.8221435, + "num_input_tokens_seen": 163044925, + "step": 7541, + "time_per_iteration": 2.547110080718994 + }, + { + "auxiliary_loss_clip": 0.01164061, + "auxiliary_loss_mlp": 0.01023916, + "balance_loss_clip": 1.04562438, + "balance_loss_mlp": 1.01703429, + "epoch": 0.906871881200024, + "flos": 16544477485440.0, + "grad_norm": 1.7329479903540137, + "language_loss": 0.78199518, + "learning_rate": 9.019318083820903e-08, + "loss": 0.80387497, + "num_input_tokens_seen": 163063505, + "step": 7542, + "time_per_iteration": 2.390437602996826 + }, + { + "auxiliary_loss_clip": 0.01152981, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.04750896, + "balance_loss_mlp": 1.02103829, + "epoch": 0.9069921240906631, + "flos": 24605129675520.0, + "grad_norm": 2.0023785075331126, + "language_loss": 0.85184461, + "learning_rate": 8.996203607468045e-08, + "loss": 0.87365878, + "num_input_tokens_seen": 163082505, + "step": 7543, + "time_per_iteration": 2.477757692337036 + }, + { + "auxiliary_loss_clip": 0.0114886, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.0428853, + "balance_loss_mlp": 1.01927567, + "epoch": 0.9071123669813023, + "flos": 25374731500800.0, + "grad_norm": 1.4645532119734124, + "language_loss": 0.75440693, + "learning_rate": 8.973118106239241e-08, + "loss": 0.77616334, + "num_input_tokens_seen": 163105110, + "step": 7544, + "time_per_iteration": 2.505614995956421 + }, + { + "auxiliary_loss_clip": 0.01095216, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.03688478, + "balance_loss_mlp": 1.02041233, + "epoch": 0.9072326098719413, + "flos": 26725798690560.0, + "grad_norm": 2.0905302540153463, + "language_loss": 0.94804525, + "learning_rate": 8.95006158363656e-08, + "loss": 0.96927732, + "num_input_tokens_seen": 163125295, + "step": 7545, + "time_per_iteration": 2.6325249671936035 + }, + { + "auxiliary_loss_clip": 0.01153383, + "auxiliary_loss_mlp": 0.01027474, + "balance_loss_clip": 1.04777241, + "balance_loss_mlp": 1.01956487, + "epoch": 0.9073528527625804, + "flos": 23878800760320.0, + "grad_norm": 2.2245065383330087, + "language_loss": 0.76932657, + "learning_rate": 8.9270340431576e-08, + "loss": 0.79113507, + "num_input_tokens_seen": 163144385, + "step": 7546, + "time_per_iteration": 2.473992109298706 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01024576, + "balance_loss_clip": 1.04430819, + "balance_loss_mlp": 1.01757884, + "epoch": 0.9074730956532195, + "flos": 37852144767360.0, + "grad_norm": 2.059092161266508, + "language_loss": 0.72981203, + "learning_rate": 8.904035488295658e-08, + "loss": 0.75160372, + "num_input_tokens_seen": 163163885, + "step": 7547, + "time_per_iteration": 2.586333751678467 + }, + { + "auxiliary_loss_clip": 0.01053067, + "auxiliary_loss_mlp": 0.00752849, + "balance_loss_clip": 1.00724471, + "balance_loss_mlp": 1.00057411, + "epoch": 0.9075933385438586, + "flos": 65173307385600.0, + "grad_norm": 0.6607876875412431, + "language_loss": 0.53266406, + "learning_rate": 8.881065922539632e-08, + "loss": 0.5507232, + "num_input_tokens_seen": 163224325, + "step": 7548, + "time_per_iteration": 2.9836766719818115 + }, + { + "auxiliary_loss_clip": 0.01119647, + "auxiliary_loss_mlp": 0.01019765, + "balance_loss_clip": 1.04273593, + "balance_loss_mlp": 1.01315498, + "epoch": 0.9077135814344977, + "flos": 19931571290880.0, + "grad_norm": 1.961537818154808, + "language_loss": 0.73301464, + "learning_rate": 8.85812534937389e-08, + "loss": 0.75440872, + "num_input_tokens_seen": 163242425, + "step": 7549, + "time_per_iteration": 2.5072293281555176 + }, + { + "auxiliary_loss_clip": 0.01160627, + "auxiliary_loss_mlp": 0.01025513, + "balance_loss_clip": 1.04718399, + "balance_loss_mlp": 1.01790738, + "epoch": 0.9078338243251368, + "flos": 17529650784000.0, + "grad_norm": 4.985164339625351, + "language_loss": 0.67327619, + "learning_rate": 8.835213772278583e-08, + "loss": 0.69513756, + "num_input_tokens_seen": 163259280, + "step": 7550, + "time_per_iteration": 2.4775190353393555 + }, + { + "auxiliary_loss_clip": 0.0111421, + "auxiliary_loss_mlp": 0.010219, + "balance_loss_clip": 1.04270339, + "balance_loss_mlp": 1.01484835, + "epoch": 0.9079540672157759, + "flos": 28803410277120.0, + "grad_norm": 1.863287117329654, + "language_loss": 0.79371458, + "learning_rate": 8.812331194729373e-08, + "loss": 0.81507564, + "num_input_tokens_seen": 163278925, + "step": 7551, + "time_per_iteration": 2.5729520320892334 + }, + { + "auxiliary_loss_clip": 0.01176493, + "auxiliary_loss_mlp": 0.01026329, + "balance_loss_clip": 1.05369735, + "balance_loss_mlp": 1.018718, + "epoch": 0.9080743101064149, + "flos": 23513840622720.0, + "grad_norm": 2.2199359459030896, + "language_loss": 0.72335291, + "learning_rate": 8.789477620197461e-08, + "loss": 0.74538112, + "num_input_tokens_seen": 163298450, + "step": 7552, + "time_per_iteration": 2.463181495666504 + }, + { + "auxiliary_loss_clip": 0.01138948, + "auxiliary_loss_mlp": 0.01024396, + "balance_loss_clip": 1.04393649, + "balance_loss_mlp": 1.01699591, + "epoch": 0.9081945529970541, + "flos": 22778102344320.0, + "grad_norm": 3.1582162234323934, + "language_loss": 0.78779966, + "learning_rate": 8.766653052149831e-08, + "loss": 0.80943316, + "num_input_tokens_seen": 163313635, + "step": 7553, + "time_per_iteration": 2.479616403579712 + }, + { + "auxiliary_loss_clip": 0.01137744, + "auxiliary_loss_mlp": 0.01026231, + "balance_loss_clip": 1.04336774, + "balance_loss_mlp": 1.01833355, + "epoch": 0.9083147958876931, + "flos": 18873714821760.0, + "grad_norm": 2.2264079878012377, + "language_loss": 0.74323857, + "learning_rate": 8.743857494048823e-08, + "loss": 0.76487839, + "num_input_tokens_seen": 163330450, + "step": 7554, + "time_per_iteration": 2.468980550765991 + }, + { + "auxiliary_loss_clip": 0.01123895, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.04284251, + "balance_loss_mlp": 1.0191431, + "epoch": 0.9084350387783322, + "flos": 18909374048640.0, + "grad_norm": 2.5556114341414258, + "language_loss": 0.62721753, + "learning_rate": 8.721090949352605e-08, + "loss": 0.64872485, + "num_input_tokens_seen": 163346690, + "step": 7555, + "time_per_iteration": 2.506692886352539 + }, + { + "auxiliary_loss_clip": 0.01163485, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.04874635, + "balance_loss_mlp": 1.01841497, + "epoch": 0.9085552816689714, + "flos": 20595488325120.0, + "grad_norm": 1.8287228030855922, + "language_loss": 0.72739619, + "learning_rate": 8.698353421514793e-08, + "loss": 0.74929643, + "num_input_tokens_seen": 163365065, + "step": 7556, + "time_per_iteration": 2.4697444438934326 + }, + { + "auxiliary_loss_clip": 0.0115437, + "auxiliary_loss_mlp": 0.01024914, + "balance_loss_clip": 1.04712594, + "balance_loss_mlp": 1.0181433, + "epoch": 0.9086755245596104, + "flos": 18113163223680.0, + "grad_norm": 3.06881531089631, + "language_loss": 0.80415642, + "learning_rate": 8.67564491398467e-08, + "loss": 0.82594931, + "num_input_tokens_seen": 163382070, + "step": 7557, + "time_per_iteration": 2.495985269546509 + }, + { + "auxiliary_loss_clip": 0.01154875, + "auxiliary_loss_mlp": 0.01025913, + "balance_loss_clip": 1.044361, + "balance_loss_mlp": 1.01772976, + "epoch": 0.9087957674502495, + "flos": 19129793857920.0, + "grad_norm": 2.300809917047527, + "language_loss": 0.73428833, + "learning_rate": 8.652965430207104e-08, + "loss": 0.75609612, + "num_input_tokens_seen": 163399975, + "step": 7558, + "time_per_iteration": 3.2663745880126953 + }, + { + "auxiliary_loss_clip": 0.01157855, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.04597902, + "balance_loss_mlp": 1.02188492, + "epoch": 0.9089160103408886, + "flos": 18109930999680.0, + "grad_norm": 2.2361342094669086, + "language_loss": 0.65614027, + "learning_rate": 8.630314973622521e-08, + "loss": 0.67801642, + "num_input_tokens_seen": 163417520, + "step": 7559, + "time_per_iteration": 3.217677354812622 + }, + { + "auxiliary_loss_clip": 0.01151243, + "auxiliary_loss_mlp": 0.01025444, + "balance_loss_clip": 1.04780793, + "balance_loss_mlp": 1.01842296, + "epoch": 0.9090362532315277, + "flos": 33364855336320.0, + "grad_norm": 1.9017871485582785, + "language_loss": 0.70535803, + "learning_rate": 8.607693547666995e-08, + "loss": 0.72712493, + "num_input_tokens_seen": 163440060, + "step": 7560, + "time_per_iteration": 2.5627450942993164 + }, + { + "auxiliary_loss_clip": 0.01034547, + "auxiliary_loss_mlp": 0.01000568, + "balance_loss_clip": 1.00758457, + "balance_loss_mlp": 0.99951857, + "epoch": 0.9091564961221668, + "flos": 71480585082240.0, + "grad_norm": 0.9002536647570955, + "language_loss": 0.57930243, + "learning_rate": 8.585101155772201e-08, + "loss": 0.59965354, + "num_input_tokens_seen": 163502180, + "step": 7561, + "time_per_iteration": 3.833277940750122 + }, + { + "auxiliary_loss_clip": 0.0113205, + "auxiliary_loss_mlp": 0.01026556, + "balance_loss_clip": 1.04020631, + "balance_loss_mlp": 1.01919436, + "epoch": 0.9092767390128058, + "flos": 24712574232960.0, + "grad_norm": 1.8630762312675433, + "language_loss": 0.68773091, + "learning_rate": 8.562537801365377e-08, + "loss": 0.70931697, + "num_input_tokens_seen": 163521915, + "step": 7562, + "time_per_iteration": 2.555105447769165 + }, + { + "auxiliary_loss_clip": 0.0116974, + "auxiliary_loss_mlp": 0.01027955, + "balance_loss_clip": 1.04808116, + "balance_loss_mlp": 1.02045715, + "epoch": 0.909396981903445, + "flos": 23586487879680.0, + "grad_norm": 4.827409962437152, + "language_loss": 0.6979624, + "learning_rate": 8.540003487869362e-08, + "loss": 0.71993935, + "num_input_tokens_seen": 163543585, + "step": 7563, + "time_per_iteration": 2.499483585357666 + }, + { + "auxiliary_loss_clip": 0.01114026, + "auxiliary_loss_mlp": 0.01024538, + "balance_loss_clip": 1.04041624, + "balance_loss_mlp": 1.01704574, + "epoch": 0.909517224794084, + "flos": 23404169422080.0, + "grad_norm": 2.358086896197855, + "language_loss": 0.79645872, + "learning_rate": 8.517498218702557e-08, + "loss": 0.81784439, + "num_input_tokens_seen": 163561515, + "step": 7564, + "time_per_iteration": 3.302440881729126 + }, + { + "auxiliary_loss_clip": 0.01121234, + "auxiliary_loss_mlp": 0.01019602, + "balance_loss_clip": 1.04238927, + "balance_loss_mlp": 1.0124228, + "epoch": 0.9096374676847231, + "flos": 19208618254080.0, + "grad_norm": 2.618285000129647, + "language_loss": 0.6942271, + "learning_rate": 8.49502199727905e-08, + "loss": 0.71563542, + "num_input_tokens_seen": 163579540, + "step": 7565, + "time_per_iteration": 2.514634370803833 + }, + { + "auxiliary_loss_clip": 0.01149598, + "auxiliary_loss_mlp": 0.01025239, + "balance_loss_clip": 1.04260457, + "balance_loss_mlp": 1.0175916, + "epoch": 0.9097577105753623, + "flos": 33292495388160.0, + "grad_norm": 2.4836085706986, + "language_loss": 0.66127467, + "learning_rate": 8.472574827008428e-08, + "loss": 0.6830231, + "num_input_tokens_seen": 163600425, + "step": 7566, + "time_per_iteration": 2.5547547340393066 + }, + { + "auxiliary_loss_clip": 0.01151815, + "auxiliary_loss_mlp": 0.01026705, + "balance_loss_clip": 1.04304481, + "balance_loss_mlp": 1.01939738, + "epoch": 0.9098779534660013, + "flos": 21906443001600.0, + "grad_norm": 1.7324059850052218, + "language_loss": 0.83921808, + "learning_rate": 8.450156711295942e-08, + "loss": 0.86100328, + "num_input_tokens_seen": 163620595, + "step": 7567, + "time_per_iteration": 2.465272903442383 + }, + { + "auxiliary_loss_clip": 0.01140892, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.04828143, + "balance_loss_mlp": 1.01939154, + "epoch": 0.9099981963566404, + "flos": 25730354102400.0, + "grad_norm": 1.9274042419024267, + "language_loss": 0.8670702, + "learning_rate": 8.427767653542383e-08, + "loss": 0.88874519, + "num_input_tokens_seen": 163635765, + "step": 7568, + "time_per_iteration": 2.516515016555786 + }, + { + "auxiliary_loss_clip": 0.0110552, + "auxiliary_loss_mlp": 0.01025317, + "balance_loss_clip": 1.03896046, + "balance_loss_mlp": 1.01845336, + "epoch": 0.9101184392472795, + "flos": 21069437304960.0, + "grad_norm": 1.8539315334396054, + "language_loss": 0.698852, + "learning_rate": 8.405407657144125e-08, + "loss": 0.72016042, + "num_input_tokens_seen": 163654925, + "step": 7569, + "time_per_iteration": 2.5505597591400146 + }, + { + "auxiliary_loss_clip": 0.01134209, + "auxiliary_loss_mlp": 0.01024141, + "balance_loss_clip": 1.04303598, + "balance_loss_mlp": 1.01679826, + "epoch": 0.9102386821379186, + "flos": 24752614919040.0, + "grad_norm": 1.7247758900466597, + "language_loss": 0.72149831, + "learning_rate": 8.383076725493232e-08, + "loss": 0.74308181, + "num_input_tokens_seen": 163672245, + "step": 7570, + "time_per_iteration": 2.5145318508148193 + }, + { + "auxiliary_loss_clip": 0.011554, + "auxiliary_loss_mlp": 0.01021254, + "balance_loss_clip": 1.04620111, + "balance_loss_mlp": 1.01435184, + "epoch": 0.9103589250285576, + "flos": 22562818179840.0, + "grad_norm": 2.2029409913935214, + "language_loss": 0.67712522, + "learning_rate": 8.360774861977216e-08, + "loss": 0.69889176, + "num_input_tokens_seen": 163691365, + "step": 7571, + "time_per_iteration": 2.459878921508789 + }, + { + "auxiliary_loss_clip": 0.01138014, + "auxiliary_loss_mlp": 0.01021957, + "balance_loss_clip": 1.04037762, + "balance_loss_mlp": 1.01480412, + "epoch": 0.9104791679191968, + "flos": 25373474524800.0, + "grad_norm": 1.8909629897914773, + "language_loss": 0.74721098, + "learning_rate": 8.338502069979281e-08, + "loss": 0.76881063, + "num_input_tokens_seen": 163711675, + "step": 7572, + "time_per_iteration": 2.5305027961730957 + }, + { + "auxiliary_loss_clip": 0.01155029, + "auxiliary_loss_mlp": 0.01025337, + "balance_loss_clip": 1.04426026, + "balance_loss_mlp": 1.01767802, + "epoch": 0.9105994108098359, + "flos": 14426681558400.0, + "grad_norm": 3.0938423577157916, + "language_loss": 0.8003304, + "learning_rate": 8.316258352878214e-08, + "loss": 0.82213408, + "num_input_tokens_seen": 163728095, + "step": 7573, + "time_per_iteration": 2.431338310241699 + }, + { + "auxiliary_loss_clip": 0.01158831, + "auxiliary_loss_mlp": 0.01025935, + "balance_loss_clip": 1.04529881, + "balance_loss_mlp": 1.01838279, + "epoch": 0.9107196537004749, + "flos": 26718292748160.0, + "grad_norm": 1.9119145923716458, + "language_loss": 0.71373779, + "learning_rate": 8.294043714048338e-08, + "loss": 0.73558545, + "num_input_tokens_seen": 163747175, + "step": 7574, + "time_per_iteration": 2.5157461166381836 + }, + { + "auxiliary_loss_clip": 0.01044298, + "auxiliary_loss_mlp": 0.01002015, + "balance_loss_clip": 1.00767434, + "balance_loss_mlp": 1.00100219, + "epoch": 0.9108398965911141, + "flos": 66532634703360.0, + "grad_norm": 0.7551381941104454, + "language_loss": 0.6046322, + "learning_rate": 8.271858156859624e-08, + "loss": 0.62509531, + "num_input_tokens_seen": 163812545, + "step": 7575, + "time_per_iteration": 3.143444776535034 + }, + { + "auxiliary_loss_clip": 0.01165207, + "auxiliary_loss_mlp": 0.01021354, + "balance_loss_clip": 1.04682755, + "balance_loss_mlp": 1.01380169, + "epoch": 0.9109601394817531, + "flos": 25411073086080.0, + "grad_norm": 1.7419752261321302, + "language_loss": 0.73619723, + "learning_rate": 8.249701684677557e-08, + "loss": 0.75806284, + "num_input_tokens_seen": 163833870, + "step": 7576, + "time_per_iteration": 2.463787078857422 + }, + { + "auxiliary_loss_clip": 0.01156084, + "auxiliary_loss_mlp": 0.01022588, + "balance_loss_clip": 1.04913282, + "balance_loss_mlp": 1.01526606, + "epoch": 0.9110803823723922, + "flos": 22747794243840.0, + "grad_norm": 1.9703148005968019, + "language_loss": 0.80803311, + "learning_rate": 8.227574300863294e-08, + "loss": 0.82981992, + "num_input_tokens_seen": 163854040, + "step": 7577, + "time_per_iteration": 2.4812889099121094 + }, + { + "auxiliary_loss_clip": 0.01143806, + "auxiliary_loss_mlp": 0.01023849, + "balance_loss_clip": 1.04683089, + "balance_loss_mlp": 1.01601088, + "epoch": 0.9112006252630314, + "flos": 48469924131840.0, + "grad_norm": 1.617871875021163, + "language_loss": 0.69462633, + "learning_rate": 8.205476008773548e-08, + "loss": 0.71630287, + "num_input_tokens_seen": 163878040, + "step": 7578, + "time_per_iteration": 2.7361176013946533 + }, + { + "auxiliary_loss_clip": 0.01116711, + "auxiliary_loss_mlp": 0.01025094, + "balance_loss_clip": 1.0422703, + "balance_loss_mlp": 1.01764631, + "epoch": 0.9113208681536704, + "flos": 30009649829760.0, + "grad_norm": 2.545324968656029, + "language_loss": 0.82227695, + "learning_rate": 8.183406811760596e-08, + "loss": 0.84369504, + "num_input_tokens_seen": 163897770, + "step": 7579, + "time_per_iteration": 2.588406801223755 + }, + { + "auxiliary_loss_clip": 0.01111443, + "auxiliary_loss_mlp": 0.01022101, + "balance_loss_clip": 1.04007339, + "balance_loss_mlp": 1.01496005, + "epoch": 0.9114411110443095, + "flos": 25594971742080.0, + "grad_norm": 1.7099112658447218, + "language_loss": 0.74198896, + "learning_rate": 8.161366713172313e-08, + "loss": 0.76332438, + "num_input_tokens_seen": 163920160, + "step": 7580, + "time_per_iteration": 2.5842747688293457 + }, + { + "auxiliary_loss_clip": 0.01131655, + "auxiliary_loss_mlp": 0.01027424, + "balance_loss_clip": 1.04321718, + "balance_loss_mlp": 1.0197053, + "epoch": 0.9115613539349486, + "flos": 18399729928320.0, + "grad_norm": 3.1921627133436807, + "language_loss": 0.84123647, + "learning_rate": 8.139355716352137e-08, + "loss": 0.86282724, + "num_input_tokens_seen": 163935000, + "step": 7581, + "time_per_iteration": 2.483145236968994 + }, + { + "auxiliary_loss_clip": 0.01142473, + "auxiliary_loss_mlp": 0.01025954, + "balance_loss_clip": 1.0432446, + "balance_loss_mlp": 1.01824164, + "epoch": 0.9116815968255877, + "flos": 21726171619200.0, + "grad_norm": 1.6174939935877248, + "language_loss": 0.69978321, + "learning_rate": 8.117373824639196e-08, + "loss": 0.72146749, + "num_input_tokens_seen": 163955265, + "step": 7582, + "time_per_iteration": 2.4885759353637695 + }, + { + "auxiliary_loss_clip": 0.01063331, + "auxiliary_loss_mlp": 0.01001496, + "balance_loss_clip": 1.00744033, + "balance_loss_mlp": 1.0005064, + "epoch": 0.9118018397162267, + "flos": 65363526835200.0, + "grad_norm": 0.7206252529193478, + "language_loss": 0.59245944, + "learning_rate": 8.095421041368067e-08, + "loss": 0.61310768, + "num_input_tokens_seen": 164014680, + "step": 7583, + "time_per_iteration": 2.9515910148620605 + }, + { + "auxiliary_loss_clip": 0.01138002, + "auxiliary_loss_mlp": 0.00762384, + "balance_loss_clip": 1.04478502, + "balance_loss_mlp": 1.00071633, + "epoch": 0.9119220826068659, + "flos": 20922885815040.0, + "grad_norm": 2.2161953280522457, + "language_loss": 0.70773506, + "learning_rate": 8.073497369868999e-08, + "loss": 0.72673893, + "num_input_tokens_seen": 164033140, + "step": 7584, + "time_per_iteration": 3.2537271976470947 + }, + { + "auxiliary_loss_clip": 0.01149078, + "auxiliary_loss_mlp": 0.01025455, + "balance_loss_clip": 1.04595184, + "balance_loss_mlp": 1.0177685, + "epoch": 0.912042325497505, + "flos": 28366449327360.0, + "grad_norm": 2.0142722585221824, + "language_loss": 0.75469691, + "learning_rate": 8.051602813467772e-08, + "loss": 0.77644223, + "num_input_tokens_seen": 164054995, + "step": 7585, + "time_per_iteration": 3.3457400798797607 + }, + { + "auxiliary_loss_clip": 0.01158157, + "auxiliary_loss_mlp": 0.01023933, + "balance_loss_clip": 1.04724169, + "balance_loss_mlp": 1.01692355, + "epoch": 0.912162568388144, + "flos": 17566782468480.0, + "grad_norm": 1.7847070998808714, + "language_loss": 0.70973802, + "learning_rate": 8.029737375485756e-08, + "loss": 0.73155892, + "num_input_tokens_seen": 164074225, + "step": 7586, + "time_per_iteration": 2.4412057399749756 + }, + { + "auxiliary_loss_clip": 0.01167194, + "auxiliary_loss_mlp": 0.01021733, + "balance_loss_clip": 1.04678166, + "balance_loss_mlp": 1.01458681, + "epoch": 0.9122828112787832, + "flos": 19827897661440.0, + "grad_norm": 1.889662669197226, + "language_loss": 0.72412205, + "learning_rate": 8.007901059239986e-08, + "loss": 0.74601126, + "num_input_tokens_seen": 164093505, + "step": 7587, + "time_per_iteration": 3.19319486618042 + }, + { + "auxiliary_loss_clip": 0.01138808, + "auxiliary_loss_mlp": 0.01022402, + "balance_loss_clip": 1.04094493, + "balance_loss_mlp": 1.01523149, + "epoch": 0.9124030541694222, + "flos": 20813789232000.0, + "grad_norm": 1.5843078522756022, + "language_loss": 0.80037647, + "learning_rate": 7.986093868042964e-08, + "loss": 0.82198858, + "num_input_tokens_seen": 164113750, + "step": 7588, + "time_per_iteration": 2.5071604251861572 + }, + { + "auxiliary_loss_clip": 0.01152524, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.04553294, + "balance_loss_mlp": 1.02268922, + "epoch": 0.9125232970600613, + "flos": 25192305302400.0, + "grad_norm": 1.7644920222354536, + "language_loss": 0.67760116, + "learning_rate": 7.964315805202826e-08, + "loss": 0.69942367, + "num_input_tokens_seen": 164134330, + "step": 7589, + "time_per_iteration": 2.4938485622406006 + }, + { + "auxiliary_loss_clip": 0.01137006, + "auxiliary_loss_mlp": 0.01023195, + "balance_loss_clip": 1.04345107, + "balance_loss_mlp": 1.01508915, + "epoch": 0.9126435399507005, + "flos": 19719591177600.0, + "grad_norm": 1.725406982103428, + "language_loss": 0.7303623, + "learning_rate": 7.942566874023304e-08, + "loss": 0.75196433, + "num_input_tokens_seen": 164153515, + "step": 7590, + "time_per_iteration": 2.567249059677124 + }, + { + "auxiliary_loss_clip": 0.01136103, + "auxiliary_loss_mlp": 0.01024514, + "balance_loss_clip": 1.04197621, + "balance_loss_mlp": 1.01694989, + "epoch": 0.9127637828413395, + "flos": 19573614305280.0, + "grad_norm": 2.0964218498266534, + "language_loss": 0.69705927, + "learning_rate": 7.920847077803649e-08, + "loss": 0.71866548, + "num_input_tokens_seen": 164171305, + "step": 7591, + "time_per_iteration": 3.2427401542663574 + }, + { + "auxiliary_loss_clip": 0.01099572, + "auxiliary_loss_mlp": 0.0102652, + "balance_loss_clip": 1.03607821, + "balance_loss_mlp": 1.01933157, + "epoch": 0.9128840257319786, + "flos": 20230635928320.0, + "grad_norm": 1.957113516152195, + "language_loss": 0.82797909, + "learning_rate": 7.899156419838826e-08, + "loss": 0.84923995, + "num_input_tokens_seen": 164190275, + "step": 7592, + "time_per_iteration": 2.533723831176758 + }, + { + "auxiliary_loss_clip": 0.01122023, + "auxiliary_loss_mlp": 0.01021318, + "balance_loss_clip": 1.04182243, + "balance_loss_mlp": 1.01416254, + "epoch": 0.9130042686226177, + "flos": 24858658846080.0, + "grad_norm": 1.856686746127236, + "language_loss": 0.65660691, + "learning_rate": 7.87749490341918e-08, + "loss": 0.67804033, + "num_input_tokens_seen": 164210550, + "step": 7593, + "time_per_iteration": 2.566314458847046 + }, + { + "auxiliary_loss_clip": 0.01171443, + "auxiliary_loss_mlp": 0.01022488, + "balance_loss_clip": 1.04889953, + "balance_loss_mlp": 1.01504338, + "epoch": 0.9131245115132568, + "flos": 23581747284480.0, + "grad_norm": 2.012438173210636, + "language_loss": 0.83595526, + "learning_rate": 7.855862531830836e-08, + "loss": 0.8578946, + "num_input_tokens_seen": 164226660, + "step": 7594, + "time_per_iteration": 2.4269392490386963 + }, + { + "auxiliary_loss_clip": 0.01151492, + "auxiliary_loss_mlp": 0.01024326, + "balance_loss_clip": 1.04406226, + "balance_loss_mlp": 1.01710784, + "epoch": 0.9132447544038959, + "flos": 19931607204480.0, + "grad_norm": 1.700456050237155, + "language_loss": 0.72826207, + "learning_rate": 7.834259308355373e-08, + "loss": 0.75002027, + "num_input_tokens_seen": 164245425, + "step": 7595, + "time_per_iteration": 2.4644222259521484 + }, + { + "auxiliary_loss_clip": 0.01080654, + "auxiliary_loss_mlp": 0.01024931, + "balance_loss_clip": 1.03759778, + "balance_loss_mlp": 1.01759708, + "epoch": 0.9133649972945349, + "flos": 21981747864960.0, + "grad_norm": 2.275367801823131, + "language_loss": 0.7489031, + "learning_rate": 7.812685236269989e-08, + "loss": 0.76995897, + "num_input_tokens_seen": 164264085, + "step": 7596, + "time_per_iteration": 2.58955979347229 + }, + { + "auxiliary_loss_clip": 0.01029447, + "auxiliary_loss_mlp": 0.01001986, + "balance_loss_clip": 1.00943291, + "balance_loss_mlp": 1.00101459, + "epoch": 0.9134852401851741, + "flos": 71240523511680.0, + "grad_norm": 0.7893316413292809, + "language_loss": 0.58652204, + "learning_rate": 7.791140318847445e-08, + "loss": 0.60683638, + "num_input_tokens_seen": 164322220, + "step": 7597, + "time_per_iteration": 3.119609832763672 + }, + { + "auxiliary_loss_clip": 0.01136148, + "auxiliary_loss_mlp": 0.01019813, + "balance_loss_clip": 1.04694915, + "balance_loss_mlp": 1.01312554, + "epoch": 0.9136054830758131, + "flos": 23626923615360.0, + "grad_norm": 1.4063111316042651, + "language_loss": 0.8025018, + "learning_rate": 7.769624559356081e-08, + "loss": 0.82406139, + "num_input_tokens_seen": 164345615, + "step": 7598, + "time_per_iteration": 2.567007303237915 + }, + { + "auxiliary_loss_clip": 0.01151662, + "auxiliary_loss_mlp": 0.01025752, + "balance_loss_clip": 1.04316771, + "balance_loss_mlp": 1.01715684, + "epoch": 0.9137257259664522, + "flos": 23438858981760.0, + "grad_norm": 2.4245123931130794, + "language_loss": 0.75162363, + "learning_rate": 7.748137961059842e-08, + "loss": 0.7733978, + "num_input_tokens_seen": 164359595, + "step": 7599, + "time_per_iteration": 2.5099644660949707 + }, + { + "auxiliary_loss_clip": 0.01165764, + "auxiliary_loss_mlp": 0.01023986, + "balance_loss_clip": 1.04780245, + "balance_loss_mlp": 1.01640129, + "epoch": 0.9138459688570914, + "flos": 19127854523520.0, + "grad_norm": 2.323245860134046, + "language_loss": 0.64930695, + "learning_rate": 7.726680527218211e-08, + "loss": 0.67120445, + "num_input_tokens_seen": 164376635, + "step": 7600, + "time_per_iteration": 2.395509719848633 + }, + { + "auxiliary_loss_clip": 0.01166636, + "auxiliary_loss_mlp": 0.01023138, + "balance_loss_clip": 1.04475999, + "balance_loss_mlp": 1.01607442, + "epoch": 0.9139662117477304, + "flos": 46281240714240.0, + "grad_norm": 1.6431855438185088, + "language_loss": 0.75513351, + "learning_rate": 7.70525226108627e-08, + "loss": 0.77703124, + "num_input_tokens_seen": 164400305, + "step": 7601, + "time_per_iteration": 2.633046865463257 + }, + { + "auxiliary_loss_clip": 0.01155728, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.04853559, + "balance_loss_mlp": 1.02225232, + "epoch": 0.9140864546383695, + "flos": 22273198819200.0, + "grad_norm": 2.373922196977757, + "language_loss": 0.79541147, + "learning_rate": 7.683853165914666e-08, + "loss": 0.81726581, + "num_input_tokens_seen": 164418075, + "step": 7602, + "time_per_iteration": 2.465198516845703 + }, + { + "auxiliary_loss_clip": 0.01111855, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.04286325, + "balance_loss_mlp": 1.02103782, + "epoch": 0.9142066975290086, + "flos": 17530009920000.0, + "grad_norm": 1.7433339553451817, + "language_loss": 0.77085471, + "learning_rate": 7.662483244949602e-08, + "loss": 0.79225349, + "num_input_tokens_seen": 164435335, + "step": 7603, + "time_per_iteration": 2.528021812438965 + }, + { + "auxiliary_loss_clip": 0.01119161, + "auxiliary_loss_mlp": 0.01021522, + "balance_loss_clip": 1.04242384, + "balance_loss_mlp": 1.01410103, + "epoch": 0.9143269404196477, + "flos": 17712148809600.0, + "grad_norm": 2.3973854210555725, + "language_loss": 0.81116593, + "learning_rate": 7.641142501432951e-08, + "loss": 0.8325727, + "num_input_tokens_seen": 164451530, + "step": 7604, + "time_per_iteration": 2.4836556911468506 + }, + { + "auxiliary_loss_clip": 0.01133655, + "auxiliary_loss_mlp": 0.0102455, + "balance_loss_clip": 1.04243684, + "balance_loss_mlp": 1.01745749, + "epoch": 0.9144471833102867, + "flos": 33323414019840.0, + "grad_norm": 1.7768517789577625, + "language_loss": 0.73837185, + "learning_rate": 7.619830938602013e-08, + "loss": 0.75995386, + "num_input_tokens_seen": 164472755, + "step": 7605, + "time_per_iteration": 2.584273338317871 + }, + { + "auxiliary_loss_clip": 0.01149253, + "auxiliary_loss_mlp": 0.01023474, + "balance_loss_clip": 1.04549098, + "balance_loss_mlp": 1.01595831, + "epoch": 0.9145674262009259, + "flos": 21068970428160.0, + "grad_norm": 2.0743400099323943, + "language_loss": 0.82555723, + "learning_rate": 7.598548559689777e-08, + "loss": 0.8472845, + "num_input_tokens_seen": 164491155, + "step": 7606, + "time_per_iteration": 2.4580600261688232 + }, + { + "auxiliary_loss_clip": 0.0111996, + "auxiliary_loss_mlp": 0.01020666, + "balance_loss_clip": 1.04133022, + "balance_loss_mlp": 1.01344836, + "epoch": 0.914687669091565, + "flos": 16800269212800.0, + "grad_norm": 2.4927302830753555, + "language_loss": 0.8134371, + "learning_rate": 7.577295367924751e-08, + "loss": 0.83484334, + "num_input_tokens_seen": 164507555, + "step": 7607, + "time_per_iteration": 2.4817614555358887 + }, + { + "auxiliary_loss_clip": 0.01143987, + "auxiliary_loss_mlp": 0.01021044, + "balance_loss_clip": 1.04562354, + "balance_loss_mlp": 1.01333141, + "epoch": 0.914807911982204, + "flos": 25773627012480.0, + "grad_norm": 1.8422414323896854, + "language_loss": 0.8217833, + "learning_rate": 7.556071366531002e-08, + "loss": 0.84343362, + "num_input_tokens_seen": 164528525, + "step": 7608, + "time_per_iteration": 2.540740489959717 + }, + { + "auxiliary_loss_clip": 0.01154671, + "auxiliary_loss_mlp": 0.01026339, + "balance_loss_clip": 1.04788399, + "balance_loss_mlp": 1.01874518, + "epoch": 0.9149281548728432, + "flos": 19208043636480.0, + "grad_norm": 2.6538509479992562, + "language_loss": 0.78970075, + "learning_rate": 7.53487655872822e-08, + "loss": 0.8115108, + "num_input_tokens_seen": 164547695, + "step": 7609, + "time_per_iteration": 2.469477891921997 + }, + { + "auxiliary_loss_clip": 0.01113819, + "auxiliary_loss_mlp": 0.01022414, + "balance_loss_clip": 1.03895295, + "balance_loss_mlp": 1.01442742, + "epoch": 0.9150483977634822, + "flos": 26870554500480.0, + "grad_norm": 3.164975619453336, + "language_loss": 0.74143666, + "learning_rate": 7.513710947731656e-08, + "loss": 0.76279902, + "num_input_tokens_seen": 164568905, + "step": 7610, + "time_per_iteration": 2.625819206237793 + }, + { + "auxiliary_loss_clip": 0.01131724, + "auxiliary_loss_mlp": 0.01024756, + "balance_loss_clip": 1.04302859, + "balance_loss_mlp": 1.01737666, + "epoch": 0.9151686406541213, + "flos": 21908956953600.0, + "grad_norm": 1.7751651254015304, + "language_loss": 0.85304993, + "learning_rate": 7.492574536752095e-08, + "loss": 0.87461472, + "num_input_tokens_seen": 164588895, + "step": 7611, + "time_per_iteration": 3.211615562438965 + }, + { + "auxiliary_loss_clip": 0.01149621, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.04586899, + "balance_loss_mlp": 1.02231181, + "epoch": 0.9152888835447605, + "flos": 27308556944640.0, + "grad_norm": 1.7974786614467873, + "language_loss": 0.78131914, + "learning_rate": 7.471467328995907e-08, + "loss": 0.80310827, + "num_input_tokens_seen": 164607705, + "step": 7612, + "time_per_iteration": 3.297025680541992 + }, + { + "auxiliary_loss_clip": 0.01076288, + "auxiliary_loss_mlp": 0.01021981, + "balance_loss_clip": 1.03697014, + "balance_loss_mlp": 1.01403308, + "epoch": 0.9154091264353995, + "flos": 13370728510080.0, + "grad_norm": 3.3019009176221785, + "language_loss": 0.60441947, + "learning_rate": 7.450389327665018e-08, + "loss": 0.62540215, + "num_input_tokens_seen": 164625540, + "step": 7613, + "time_per_iteration": 2.702507495880127 + }, + { + "auxiliary_loss_clip": 0.0112704, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.04687226, + "balance_loss_mlp": 1.01886964, + "epoch": 0.9155293693260386, + "flos": 20193037367040.0, + "grad_norm": 3.9911826065536036, + "language_loss": 0.67663938, + "learning_rate": 7.429340535957029e-08, + "loss": 0.69817424, + "num_input_tokens_seen": 164640735, + "step": 7614, + "time_per_iteration": 4.049654483795166 + }, + { + "auxiliary_loss_clip": 0.01140323, + "auxiliary_loss_mlp": 0.01024743, + "balance_loss_clip": 1.04374647, + "balance_loss_mlp": 1.01777244, + "epoch": 0.9156496122166777, + "flos": 19354990176000.0, + "grad_norm": 3.168264045363415, + "language_loss": 0.70422965, + "learning_rate": 7.40832095706494e-08, + "loss": 0.72588032, + "num_input_tokens_seen": 164657430, + "step": 7615, + "time_per_iteration": 2.5118844509124756 + }, + { + "auxiliary_loss_clip": 0.01131112, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.04512203, + "balance_loss_mlp": 1.02305818, + "epoch": 0.9157698551073168, + "flos": 21107287261440.0, + "grad_norm": 1.8331047130299258, + "language_loss": 0.80384666, + "learning_rate": 7.387330594177443e-08, + "loss": 0.82545817, + "num_input_tokens_seen": 164679505, + "step": 7616, + "time_per_iteration": 2.595047950744629 + }, + { + "auxiliary_loss_clip": 0.01119603, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.04258823, + "balance_loss_mlp": 1.02262831, + "epoch": 0.9158900979979558, + "flos": 25193167228800.0, + "grad_norm": 1.5707934877216148, + "language_loss": 0.79326099, + "learning_rate": 7.366369450478749e-08, + "loss": 0.81475329, + "num_input_tokens_seen": 164700615, + "step": 7617, + "time_per_iteration": 2.598253011703491 + }, + { + "auxiliary_loss_clip": 0.01120977, + "auxiliary_loss_mlp": 0.01026684, + "balance_loss_clip": 1.04185867, + "balance_loss_mlp": 1.01943922, + "epoch": 0.916010340888595, + "flos": 30146648302080.0, + "grad_norm": 2.0330016640653428, + "language_loss": 0.6633811, + "learning_rate": 7.345437529148646e-08, + "loss": 0.68485773, + "num_input_tokens_seen": 164719625, + "step": 7618, + "time_per_iteration": 3.2803022861480713 + }, + { + "auxiliary_loss_clip": 0.01125103, + "auxiliary_loss_mlp": 0.01025886, + "balance_loss_clip": 1.04278326, + "balance_loss_mlp": 1.01879632, + "epoch": 0.9161305837792341, + "flos": 17091827907840.0, + "grad_norm": 2.0811223929823437, + "language_loss": 0.7276392, + "learning_rate": 7.324534833362483e-08, + "loss": 0.74914908, + "num_input_tokens_seen": 164737200, + "step": 7619, + "time_per_iteration": 2.5398528575897217 + }, + { + "auxiliary_loss_clip": 0.01140454, + "auxiliary_loss_mlp": 0.01023228, + "balance_loss_clip": 1.04558301, + "balance_loss_mlp": 1.01602769, + "epoch": 0.9162508266698731, + "flos": 22893699288960.0, + "grad_norm": 1.8847259921663775, + "language_loss": 0.68643057, + "learning_rate": 7.303661366291192e-08, + "loss": 0.70806742, + "num_input_tokens_seen": 164757870, + "step": 7620, + "time_per_iteration": 2.5309293270111084 + }, + { + "auxiliary_loss_clip": 0.011097, + "auxiliary_loss_mlp": 0.01023316, + "balance_loss_clip": 1.04165268, + "balance_loss_mlp": 1.01632118, + "epoch": 0.9163710695605123, + "flos": 19974808287360.0, + "grad_norm": 2.2117682578568147, + "language_loss": 0.81390023, + "learning_rate": 7.28281713110126e-08, + "loss": 0.83523035, + "num_input_tokens_seen": 164775945, + "step": 7621, + "time_per_iteration": 2.544813871383667 + }, + { + "auxiliary_loss_clip": 0.01134072, + "auxiliary_loss_mlp": 0.01024838, + "balance_loss_clip": 1.04380405, + "balance_loss_mlp": 1.01760793, + "epoch": 0.9164913124511513, + "flos": 22783812606720.0, + "grad_norm": 2.0369453171079, + "language_loss": 0.77240491, + "learning_rate": 7.262002130954759e-08, + "loss": 0.79399395, + "num_input_tokens_seen": 164794400, + "step": 7622, + "time_per_iteration": 2.5005905628204346 + }, + { + "auxiliary_loss_clip": 0.01115331, + "auxiliary_loss_mlp": 0.01027558, + "balance_loss_clip": 1.04211473, + "balance_loss_mlp": 1.01975, + "epoch": 0.9166115553417904, + "flos": 24900854348160.0, + "grad_norm": 1.7158397649099861, + "language_loss": 0.78895134, + "learning_rate": 7.241216369009296e-08, + "loss": 0.81038022, + "num_input_tokens_seen": 164814585, + "step": 7623, + "time_per_iteration": 2.6300055980682373 + }, + { + "auxiliary_loss_clip": 0.01165572, + "auxiliary_loss_mlp": 0.01019465, + "balance_loss_clip": 1.04509056, + "balance_loss_mlp": 1.01238441, + "epoch": 0.9167317982324296, + "flos": 25702919089920.0, + "grad_norm": 2.1489607620833997, + "language_loss": 0.66076863, + "learning_rate": 7.220459848418037e-08, + "loss": 0.68261898, + "num_input_tokens_seen": 164834660, + "step": 7624, + "time_per_iteration": 2.4667165279388428 + }, + { + "auxiliary_loss_clip": 0.01165494, + "auxiliary_loss_mlp": 0.01022116, + "balance_loss_clip": 1.04793823, + "balance_loss_mlp": 1.0152173, + "epoch": 0.9168520411230686, + "flos": 15632813370240.0, + "grad_norm": 1.9222961481438838, + "language_loss": 0.79765308, + "learning_rate": 7.199732572329708e-08, + "loss": 0.81952924, + "num_input_tokens_seen": 164852560, + "step": 7625, + "time_per_iteration": 2.402921676635742 + }, + { + "auxiliary_loss_clip": 0.01128175, + "auxiliary_loss_mlp": 0.01028282, + "balance_loss_clip": 1.04274273, + "balance_loss_mlp": 1.02077532, + "epoch": 0.9169722840137077, + "flos": 30258151096320.0, + "grad_norm": 2.7762311682138914, + "language_loss": 0.75864762, + "learning_rate": 7.179034543888684e-08, + "loss": 0.78021222, + "num_input_tokens_seen": 164872065, + "step": 7626, + "time_per_iteration": 2.5769577026367188 + }, + { + "auxiliary_loss_clip": 0.01154783, + "auxiliary_loss_mlp": 0.01025675, + "balance_loss_clip": 1.04426694, + "balance_loss_mlp": 1.01874948, + "epoch": 0.9170925269043467, + "flos": 22491643380480.0, + "grad_norm": 2.9622356049962164, + "language_loss": 0.77313322, + "learning_rate": 7.158365766234808e-08, + "loss": 0.79493779, + "num_input_tokens_seen": 164890915, + "step": 7627, + "time_per_iteration": 2.4899754524230957 + }, + { + "auxiliary_loss_clip": 0.01116174, + "auxiliary_loss_mlp": 0.01024764, + "balance_loss_clip": 1.03923678, + "balance_loss_mlp": 1.01684284, + "epoch": 0.9172127697949859, + "flos": 22893914770560.0, + "grad_norm": 2.097620117537633, + "language_loss": 0.72508037, + "learning_rate": 7.137726242503527e-08, + "loss": 0.74648976, + "num_input_tokens_seen": 164909835, + "step": 7628, + "time_per_iteration": 2.536228656768799 + }, + { + "auxiliary_loss_clip": 0.01152625, + "auxiliary_loss_mlp": 0.00762729, + "balance_loss_clip": 1.04623961, + "balance_loss_mlp": 1.00070119, + "epoch": 0.917333012685625, + "flos": 17451867882240.0, + "grad_norm": 3.7585544006438187, + "language_loss": 0.77978384, + "learning_rate": 7.11711597582585e-08, + "loss": 0.79893744, + "num_input_tokens_seen": 164927195, + "step": 7629, + "time_per_iteration": 2.423119306564331 + }, + { + "auxiliary_loss_clip": 0.01123479, + "auxiliary_loss_mlp": 0.01022499, + "balance_loss_clip": 1.03914762, + "balance_loss_mlp": 1.0157752, + "epoch": 0.917453255576264, + "flos": 14318949692160.0, + "grad_norm": 1.6879541719493323, + "language_loss": 0.79933763, + "learning_rate": 7.096534969328271e-08, + "loss": 0.82079738, + "num_input_tokens_seen": 164944640, + "step": 7630, + "time_per_iteration": 2.495929718017578 + }, + { + "auxiliary_loss_clip": 0.01142267, + "auxiliary_loss_mlp": 0.0102299, + "balance_loss_clip": 1.0421108, + "balance_loss_mlp": 1.01595354, + "epoch": 0.9175734984669032, + "flos": 20741177888640.0, + "grad_norm": 1.9412917174838438, + "language_loss": 0.84095049, + "learning_rate": 7.075983226132987e-08, + "loss": 0.86260307, + "num_input_tokens_seen": 164963570, + "step": 7631, + "time_per_iteration": 2.5489253997802734 + }, + { + "auxiliary_loss_clip": 0.01142744, + "auxiliary_loss_mlp": 0.00762668, + "balance_loss_clip": 1.04210997, + "balance_loss_mlp": 1.00060618, + "epoch": 0.9176937413575422, + "flos": 14830497233280.0, + "grad_norm": 3.2119892660368032, + "language_loss": 0.79500616, + "learning_rate": 7.055460749357656e-08, + "loss": 0.81406027, + "num_input_tokens_seen": 164979850, + "step": 7632, + "time_per_iteration": 2.468475341796875 + }, + { + "auxiliary_loss_clip": 0.01139534, + "auxiliary_loss_mlp": 0.01027535, + "balance_loss_clip": 1.04558897, + "balance_loss_mlp": 1.01983094, + "epoch": 0.9178139842481813, + "flos": 18474603828480.0, + "grad_norm": 2.016153253485065, + "language_loss": 0.70155376, + "learning_rate": 7.034967542115521e-08, + "loss": 0.7232244, + "num_input_tokens_seen": 164998115, + "step": 7633, + "time_per_iteration": 2.4581222534179688 + }, + { + "auxiliary_loss_clip": 0.01143684, + "auxiliary_loss_mlp": 0.00762105, + "balance_loss_clip": 1.04370272, + "balance_loss_mlp": 1.00069952, + "epoch": 0.9179342271388204, + "flos": 20047455544320.0, + "grad_norm": 4.893710727332219, + "language_loss": 0.75591648, + "learning_rate": 7.014503607515388e-08, + "loss": 0.77497441, + "num_input_tokens_seen": 165017420, + "step": 7634, + "time_per_iteration": 2.4529385566711426 + }, + { + "auxiliary_loss_clip": 0.01141198, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.04827392, + "balance_loss_mlp": 1.0208683, + "epoch": 0.9180544700294595, + "flos": 24676232647680.0, + "grad_norm": 1.9430213833667742, + "language_loss": 0.68277222, + "learning_rate": 6.994068948661592e-08, + "loss": 0.70446801, + "num_input_tokens_seen": 165035575, + "step": 7635, + "time_per_iteration": 2.5071194171905518 + }, + { + "auxiliary_loss_clip": 0.01153568, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.04625487, + "balance_loss_mlp": 1.01860046, + "epoch": 0.9181747129200986, + "flos": 16727478301440.0, + "grad_norm": 2.1431843965507578, + "language_loss": 0.76752269, + "learning_rate": 6.973663568654142e-08, + "loss": 0.78932863, + "num_input_tokens_seen": 165053280, + "step": 7636, + "time_per_iteration": 2.4725358486175537 + }, + { + "auxiliary_loss_clip": 0.01166707, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.04771996, + "balance_loss_mlp": 1.02276993, + "epoch": 0.9182949558107377, + "flos": 24271626873600.0, + "grad_norm": 3.2761092188591223, + "language_loss": 0.64867306, + "learning_rate": 6.953287470588386e-08, + "loss": 0.67064381, + "num_input_tokens_seen": 165071235, + "step": 7637, + "time_per_iteration": 2.4866883754730225 + }, + { + "auxiliary_loss_clip": 0.01157132, + "auxiliary_loss_mlp": 0.01023756, + "balance_loss_clip": 1.04475665, + "balance_loss_mlp": 1.01628137, + "epoch": 0.9184151987013768, + "flos": 22082117443200.0, + "grad_norm": 2.3067173349941887, + "language_loss": 0.85919297, + "learning_rate": 6.932940657555452e-08, + "loss": 0.88100183, + "num_input_tokens_seen": 165087365, + "step": 7638, + "time_per_iteration": 3.2562851905822754 + }, + { + "auxiliary_loss_clip": 0.01162321, + "auxiliary_loss_mlp": 0.01020854, + "balance_loss_clip": 1.04583395, + "balance_loss_mlp": 1.01400542, + "epoch": 0.9185354415920158, + "flos": 32166732257280.0, + "grad_norm": 1.57740963718009, + "language_loss": 0.76343656, + "learning_rate": 6.912623132641938e-08, + "loss": 0.78526831, + "num_input_tokens_seen": 165112455, + "step": 7639, + "time_per_iteration": 3.2675745487213135 + }, + { + "auxiliary_loss_clip": 0.01141562, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.04492676, + "balance_loss_mlp": 1.01984072, + "epoch": 0.918655684482655, + "flos": 20997831542400.0, + "grad_norm": 2.112823964863413, + "language_loss": 0.76451522, + "learning_rate": 6.892334898929952e-08, + "loss": 0.78620565, + "num_input_tokens_seen": 165132700, + "step": 7640, + "time_per_iteration": 2.4812746047973633 + }, + { + "auxiliary_loss_clip": 0.01148183, + "auxiliary_loss_mlp": 0.0102517, + "balance_loss_clip": 1.04445386, + "balance_loss_mlp": 1.01789236, + "epoch": 0.918775927373294, + "flos": 15560704817280.0, + "grad_norm": 1.9921242154300096, + "language_loss": 0.84760797, + "learning_rate": 6.872075959497236e-08, + "loss": 0.86934149, + "num_input_tokens_seen": 165151475, + "step": 7641, + "time_per_iteration": 3.176488161087036 + }, + { + "auxiliary_loss_clip": 0.01155121, + "auxiliary_loss_mlp": 0.01023927, + "balance_loss_clip": 1.04414678, + "balance_loss_mlp": 1.01696467, + "epoch": 0.9188961702639331, + "flos": 29934057657600.0, + "grad_norm": 1.8211438140388514, + "language_loss": 0.82433367, + "learning_rate": 6.85184631741702e-08, + "loss": 0.84612417, + "num_input_tokens_seen": 165172040, + "step": 7642, + "time_per_iteration": 2.514589786529541 + }, + { + "auxiliary_loss_clip": 0.01152075, + "auxiliary_loss_mlp": 0.01023347, + "balance_loss_clip": 1.04449272, + "balance_loss_mlp": 1.01572931, + "epoch": 0.9190164131545723, + "flos": 20701244943360.0, + "grad_norm": 1.8272526314194117, + "language_loss": 0.76960087, + "learning_rate": 6.831645975758161e-08, + "loss": 0.79135501, + "num_input_tokens_seen": 165189980, + "step": 7643, + "time_per_iteration": 2.439962148666382 + }, + { + "auxiliary_loss_clip": 0.01131371, + "auxiliary_loss_mlp": 0.01024979, + "balance_loss_clip": 1.04402733, + "balance_loss_mlp": 1.01799941, + "epoch": 0.9191366560452113, + "flos": 25629912696960.0, + "grad_norm": 2.3436004769947747, + "language_loss": 0.67119181, + "learning_rate": 6.811474937585026e-08, + "loss": 0.69275534, + "num_input_tokens_seen": 165209770, + "step": 7644, + "time_per_iteration": 3.218221664428711 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01024612, + "balance_loss_clip": 1.041682, + "balance_loss_mlp": 1.01759028, + "epoch": 0.9192568989358504, + "flos": 21434325615360.0, + "grad_norm": 1.6279581507090297, + "language_loss": 0.79364294, + "learning_rate": 6.79133320595755e-08, + "loss": 0.81509405, + "num_input_tokens_seen": 165229690, + "step": 7645, + "time_per_iteration": 2.5398435592651367 + }, + { + "auxiliary_loss_clip": 0.01142564, + "auxiliary_loss_mlp": 0.01024291, + "balance_loss_clip": 1.04635549, + "balance_loss_mlp": 1.01720726, + "epoch": 0.9193771418264896, + "flos": 23185078416000.0, + "grad_norm": 1.838475806989485, + "language_loss": 0.75577044, + "learning_rate": 6.771220783931198e-08, + "loss": 0.777439, + "num_input_tokens_seen": 165249850, + "step": 7646, + "time_per_iteration": 2.497814178466797 + }, + { + "auxiliary_loss_clip": 0.0099241, + "auxiliary_loss_mlp": 0.00753706, + "balance_loss_clip": 1.01532578, + "balance_loss_mlp": 1.0005089, + "epoch": 0.9194973847171286, + "flos": 70582963184640.0, + "grad_norm": 0.859370688223324, + "language_loss": 0.64596868, + "learning_rate": 6.751137674556994e-08, + "loss": 0.66342986, + "num_input_tokens_seen": 165310235, + "step": 7647, + "time_per_iteration": 3.628962516784668 + }, + { + "auxiliary_loss_clip": 0.01154253, + "auxiliary_loss_mlp": 0.01020328, + "balance_loss_clip": 1.04324329, + "balance_loss_mlp": 1.0131427, + "epoch": 0.9196176276077677, + "flos": 14720682378240.0, + "grad_norm": 2.013674929147741, + "language_loss": 0.77455539, + "learning_rate": 6.731083880881572e-08, + "loss": 0.79630113, + "num_input_tokens_seen": 165326455, + "step": 7648, + "time_per_iteration": 2.730668067932129 + }, + { + "auxiliary_loss_clip": 0.01139399, + "auxiliary_loss_mlp": 0.0102152, + "balance_loss_clip": 1.04368377, + "balance_loss_mlp": 1.014341, + "epoch": 0.9197378704984068, + "flos": 23294893271040.0, + "grad_norm": 1.9803007723072652, + "language_loss": 0.80884218, + "learning_rate": 6.711059405947072e-08, + "loss": 0.83045131, + "num_input_tokens_seen": 165344645, + "step": 7649, + "time_per_iteration": 2.5019328594207764 + }, + { + "auxiliary_loss_clip": 0.01123479, + "auxiliary_loss_mlp": 0.01022017, + "balance_loss_clip": 1.0437305, + "balance_loss_mlp": 1.01436996, + "epoch": 0.9198581133890459, + "flos": 20302564913280.0, + "grad_norm": 1.6809332858085493, + "language_loss": 0.77105451, + "learning_rate": 6.691064252791156e-08, + "loss": 0.79250944, + "num_input_tokens_seen": 165364120, + "step": 7650, + "time_per_iteration": 2.505009412765503 + }, + { + "auxiliary_loss_clip": 0.01103707, + "auxiliary_loss_mlp": 0.01023353, + "balance_loss_clip": 1.0414319, + "balance_loss_mlp": 1.01595032, + "epoch": 0.9199783562796849, + "flos": 17675663569920.0, + "grad_norm": 1.5790818303335383, + "language_loss": 0.77716386, + "learning_rate": 6.67109842444713e-08, + "loss": 0.7984345, + "num_input_tokens_seen": 165383050, + "step": 7651, + "time_per_iteration": 2.5292787551879883 + }, + { + "auxiliary_loss_clip": 0.01152421, + "auxiliary_loss_mlp": 0.00762643, + "balance_loss_clip": 1.047997, + "balance_loss_mlp": 1.00067997, + "epoch": 0.9200985991703241, + "flos": 17676022705920.0, + "grad_norm": 1.8943220942717613, + "language_loss": 0.76755357, + "learning_rate": 6.651161923943704e-08, + "loss": 0.78670424, + "num_input_tokens_seen": 165400955, + "step": 7652, + "time_per_iteration": 2.4240994453430176 + }, + { + "auxiliary_loss_clip": 0.01147778, + "auxiliary_loss_mlp": 0.01026043, + "balance_loss_clip": 1.0435096, + "balance_loss_mlp": 1.01843429, + "epoch": 0.9202188420609632, + "flos": 20996574566400.0, + "grad_norm": 1.8444407949045518, + "language_loss": 0.76825958, + "learning_rate": 6.631254754305326e-08, + "loss": 0.78999782, + "num_input_tokens_seen": 165420415, + "step": 7653, + "time_per_iteration": 2.455369710922241 + }, + { + "auxiliary_loss_clip": 0.01169533, + "auxiliary_loss_mlp": 0.01025649, + "balance_loss_clip": 1.04723263, + "balance_loss_mlp": 1.01815367, + "epoch": 0.9203390849516022, + "flos": 13918222586880.0, + "grad_norm": 1.9388279938809312, + "language_loss": 0.78289115, + "learning_rate": 6.611376918551848e-08, + "loss": 0.80484301, + "num_input_tokens_seen": 165439200, + "step": 7654, + "time_per_iteration": 2.441417932510376 + }, + { + "auxiliary_loss_clip": 0.01120562, + "auxiliary_loss_mlp": 0.0076224, + "balance_loss_clip": 1.04055476, + "balance_loss_mlp": 1.00054574, + "epoch": 0.9204593278422414, + "flos": 21175912195200.0, + "grad_norm": 2.3395538613343394, + "language_loss": 0.79334784, + "learning_rate": 6.591528419698744e-08, + "loss": 0.81217587, + "num_input_tokens_seen": 165458985, + "step": 7655, + "time_per_iteration": 2.526383876800537 + }, + { + "auxiliary_loss_clip": 0.01142633, + "auxiliary_loss_mlp": 0.01022472, + "balance_loss_clip": 1.04362571, + "balance_loss_mlp": 1.0156889, + "epoch": 0.9205795707328804, + "flos": 14501375890560.0, + "grad_norm": 2.268667782475982, + "language_loss": 0.83252263, + "learning_rate": 6.571709260756986e-08, + "loss": 0.85417378, + "num_input_tokens_seen": 165475630, + "step": 7656, + "time_per_iteration": 2.4644079208374023 + }, + { + "auxiliary_loss_clip": 0.01159384, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.05075705, + "balance_loss_mlp": 1.0205673, + "epoch": 0.9206998136235195, + "flos": 22417559579520.0, + "grad_norm": 2.195891064060851, + "language_loss": 0.76677293, + "learning_rate": 6.551919444733122e-08, + "loss": 0.7886529, + "num_input_tokens_seen": 165493445, + "step": 7657, + "time_per_iteration": 2.452890396118164 + }, + { + "auxiliary_loss_clip": 0.01136519, + "auxiliary_loss_mlp": 0.01026776, + "balance_loss_clip": 1.04473805, + "balance_loss_mlp": 1.01891983, + "epoch": 0.9208200565141585, + "flos": 53358407544960.0, + "grad_norm": 2.0114105901983277, + "language_loss": 0.66119134, + "learning_rate": 6.53215897462931e-08, + "loss": 0.68282425, + "num_input_tokens_seen": 165517200, + "step": 7658, + "time_per_iteration": 2.7636711597442627 + }, + { + "auxiliary_loss_clip": 0.01151, + "auxiliary_loss_mlp": 0.01027872, + "balance_loss_clip": 1.04482055, + "balance_loss_mlp": 1.0201118, + "epoch": 0.9209402994047977, + "flos": 30589139946240.0, + "grad_norm": 2.10503029650995, + "language_loss": 0.7463243, + "learning_rate": 6.512427853443103e-08, + "loss": 0.76811296, + "num_input_tokens_seen": 165539280, + "step": 7659, + "time_per_iteration": 2.5083746910095215 + }, + { + "auxiliary_loss_clip": 0.01156585, + "auxiliary_loss_mlp": 0.0101983, + "balance_loss_clip": 1.04565823, + "balance_loss_mlp": 1.01239729, + "epoch": 0.9210605422954368, + "flos": 29132711187840.0, + "grad_norm": 1.580797989825861, + "language_loss": 0.75852782, + "learning_rate": 6.492726084167799e-08, + "loss": 0.78029197, + "num_input_tokens_seen": 165561395, + "step": 7660, + "time_per_iteration": 2.5086421966552734 + }, + { + "auxiliary_loss_clip": 0.01063305, + "auxiliary_loss_mlp": 0.01001362, + "balance_loss_clip": 1.0075357, + "balance_loss_mlp": 1.00033665, + "epoch": 0.9211807851860758, + "flos": 54853838472960.0, + "grad_norm": 0.7779149509831746, + "language_loss": 0.57527936, + "learning_rate": 6.473053669792072e-08, + "loss": 0.59592605, + "num_input_tokens_seen": 165616085, + "step": 7661, + "time_per_iteration": 2.8847568035125732 + }, + { + "auxiliary_loss_clip": 0.0115213, + "auxiliary_loss_mlp": 0.01026055, + "balance_loss_clip": 1.04408586, + "balance_loss_mlp": 1.018098, + "epoch": 0.921301028076715, + "flos": 19201974238080.0, + "grad_norm": 1.9417370153143974, + "language_loss": 0.72880614, + "learning_rate": 6.453410613300248e-08, + "loss": 0.75058806, + "num_input_tokens_seen": 165634015, + "step": 7662, + "time_per_iteration": 2.4312002658843994 + }, + { + "auxiliary_loss_clip": 0.01096904, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.04088449, + "balance_loss_mlp": 1.02308941, + "epoch": 0.921421270967354, + "flos": 27526893765120.0, + "grad_norm": 1.717006370072091, + "language_loss": 0.58264768, + "learning_rate": 6.43379691767214e-08, + "loss": 0.60392332, + "num_input_tokens_seen": 165653220, + "step": 7663, + "time_per_iteration": 2.6437273025512695 + }, + { + "auxiliary_loss_clip": 0.01022765, + "auxiliary_loss_mlp": 0.01001084, + "balance_loss_clip": 1.0073967, + "balance_loss_mlp": 1.00008285, + "epoch": 0.9215415138579931, + "flos": 70209311955840.0, + "grad_norm": 0.733988581536461, + "language_loss": 0.55191684, + "learning_rate": 6.414212585883105e-08, + "loss": 0.57215536, + "num_input_tokens_seen": 165715850, + "step": 7664, + "time_per_iteration": 3.197953462600708 + }, + { + "auxiliary_loss_clip": 0.01142482, + "auxiliary_loss_mlp": 0.01022631, + "balance_loss_clip": 1.04489899, + "balance_loss_mlp": 1.0145967, + "epoch": 0.9216617567486323, + "flos": 35553107790720.0, + "grad_norm": 1.723047246695315, + "language_loss": 0.70046151, + "learning_rate": 6.394657620904143e-08, + "loss": 0.7221126, + "num_input_tokens_seen": 165738960, + "step": 7665, + "time_per_iteration": 3.394010066986084 + }, + { + "auxiliary_loss_clip": 0.01171881, + "auxiliary_loss_mlp": 0.01026126, + "balance_loss_clip": 1.04865396, + "balance_loss_mlp": 1.01866937, + "epoch": 0.9217819996392713, + "flos": 29533330552320.0, + "grad_norm": 1.6875176407332761, + "language_loss": 0.71556067, + "learning_rate": 6.375132025701657e-08, + "loss": 0.73754078, + "num_input_tokens_seen": 165761260, + "step": 7666, + "time_per_iteration": 3.270878553390503 + }, + { + "auxiliary_loss_clip": 0.01173198, + "auxiliary_loss_mlp": 0.01023907, + "balance_loss_clip": 1.04999137, + "balance_loss_mlp": 1.016433, + "epoch": 0.9219022425299104, + "flos": 14574669592320.0, + "grad_norm": 2.2490841967143136, + "language_loss": 0.6907115, + "learning_rate": 6.355635803237724e-08, + "loss": 0.7126826, + "num_input_tokens_seen": 165776960, + "step": 7667, + "time_per_iteration": 3.1182291507720947 + }, + { + "auxiliary_loss_clip": 0.01152783, + "auxiliary_loss_mlp": 0.01025632, + "balance_loss_clip": 1.04462433, + "balance_loss_mlp": 1.01809835, + "epoch": 0.9220224854205495, + "flos": 18077503996800.0, + "grad_norm": 2.2003281656663596, + "language_loss": 0.7994926, + "learning_rate": 6.336168956469867e-08, + "loss": 0.82127666, + "num_input_tokens_seen": 165795435, + "step": 7668, + "time_per_iteration": 2.4267101287841797 + }, + { + "auxiliary_loss_clip": 0.01131505, + "auxiliary_loss_mlp": 0.0102476, + "balance_loss_clip": 1.04322243, + "balance_loss_mlp": 1.01823902, + "epoch": 0.9221427283111886, + "flos": 24790464875520.0, + "grad_norm": 2.0238693578290365, + "language_loss": 0.72029638, + "learning_rate": 6.316731488351168e-08, + "loss": 0.74185908, + "num_input_tokens_seen": 165816625, + "step": 7669, + "time_per_iteration": 2.536612033843994 + }, + { + "auxiliary_loss_clip": 0.01153214, + "auxiliary_loss_mlp": 0.01025079, + "balance_loss_clip": 1.04613686, + "balance_loss_mlp": 1.01780117, + "epoch": 0.9222629712018277, + "flos": 13845036625920.0, + "grad_norm": 3.9963045292852444, + "language_loss": 0.63264608, + "learning_rate": 6.297323401830334e-08, + "loss": 0.65442908, + "num_input_tokens_seen": 165835410, + "step": 7670, + "time_per_iteration": 2.4229650497436523 + }, + { + "auxiliary_loss_clip": 0.01155828, + "auxiliary_loss_mlp": 0.01023514, + "balance_loss_clip": 1.04532671, + "balance_loss_mlp": 1.01619411, + "epoch": 0.9223832140924668, + "flos": 21616177196160.0, + "grad_norm": 1.9540217701215514, + "language_loss": 0.6909582, + "learning_rate": 6.277944699851523e-08, + "loss": 0.71275163, + "num_input_tokens_seen": 165854930, + "step": 7671, + "time_per_iteration": 3.218696355819702 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01026643, + "balance_loss_clip": 1.04512215, + "balance_loss_mlp": 1.01913834, + "epoch": 0.9225034569831059, + "flos": 21142084561920.0, + "grad_norm": 3.318015928870639, + "language_loss": 0.73315656, + "learning_rate": 6.25859538535447e-08, + "loss": 0.75507349, + "num_input_tokens_seen": 165875725, + "step": 7672, + "time_per_iteration": 2.452723979949951 + }, + { + "auxiliary_loss_clip": 0.01140405, + "auxiliary_loss_mlp": 0.01020662, + "balance_loss_clip": 1.04553103, + "balance_loss_mlp": 1.01319313, + "epoch": 0.9226236998737449, + "flos": 12495046844160.0, + "grad_norm": 2.5247960818183386, + "language_loss": 0.78119886, + "learning_rate": 6.239275461274474e-08, + "loss": 0.80280948, + "num_input_tokens_seen": 165892100, + "step": 7673, + "time_per_iteration": 2.4519901275634766 + }, + { + "auxiliary_loss_clip": 0.01154529, + "auxiliary_loss_mlp": 0.01025056, + "balance_loss_clip": 1.0458796, + "balance_loss_mlp": 1.01793623, + "epoch": 0.9227439427643841, + "flos": 26214071581440.0, + "grad_norm": 2.3454104255523203, + "language_loss": 0.8557725, + "learning_rate": 6.219984930542299e-08, + "loss": 0.87756836, + "num_input_tokens_seen": 165912840, + "step": 7674, + "time_per_iteration": 2.4833035469055176 + }, + { + "auxiliary_loss_clip": 0.01154959, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.04528403, + "balance_loss_mlp": 1.02156925, + "epoch": 0.9228641856550232, + "flos": 17967581400960.0, + "grad_norm": 5.510109498640113, + "language_loss": 0.76464188, + "learning_rate": 6.200723796084383e-08, + "loss": 0.78647792, + "num_input_tokens_seen": 165930935, + "step": 7675, + "time_per_iteration": 2.449655771255493 + }, + { + "auxiliary_loss_clip": 0.01036816, + "auxiliary_loss_mlp": 0.01000888, + "balance_loss_clip": 1.00779688, + "balance_loss_mlp": 0.99994624, + "epoch": 0.9229844285456622, + "flos": 70420609710720.0, + "grad_norm": 0.759164166297114, + "language_loss": 0.63012421, + "learning_rate": 6.181492060822546e-08, + "loss": 0.65050125, + "num_input_tokens_seen": 165991110, + "step": 7676, + "time_per_iteration": 3.0109939575195312 + }, + { + "auxiliary_loss_clip": 0.01109581, + "auxiliary_loss_mlp": 0.010235, + "balance_loss_clip": 1.04080379, + "balance_loss_mlp": 1.01608253, + "epoch": 0.9231046714363014, + "flos": 17967832796160.0, + "grad_norm": 2.995387055612862, + "language_loss": 0.81363493, + "learning_rate": 6.162289727674274e-08, + "loss": 0.83496571, + "num_input_tokens_seen": 166008790, + "step": 7677, + "time_per_iteration": 2.56949520111084 + }, + { + "auxiliary_loss_clip": 0.0112474, + "auxiliary_loss_mlp": 0.01025018, + "balance_loss_clip": 1.04145122, + "balance_loss_mlp": 1.01805949, + "epoch": 0.9232249143269404, + "flos": 17858233422720.0, + "grad_norm": 2.1975303998185574, + "language_loss": 0.88074636, + "learning_rate": 6.143116799552527e-08, + "loss": 0.90224385, + "num_input_tokens_seen": 166025035, + "step": 7678, + "time_per_iteration": 2.5324158668518066 + }, + { + "auxiliary_loss_clip": 0.01157814, + "auxiliary_loss_mlp": 0.01020368, + "balance_loss_clip": 1.0474, + "balance_loss_mlp": 1.01306677, + "epoch": 0.9233451572175795, + "flos": 23404384903680.0, + "grad_norm": 2.141532404053057, + "language_loss": 0.55926836, + "learning_rate": 6.123973279365802e-08, + "loss": 0.58105022, + "num_input_tokens_seen": 166044010, + "step": 7679, + "time_per_iteration": 2.4716298580169678 + }, + { + "auxiliary_loss_clip": 0.01159434, + "auxiliary_loss_mlp": 0.01023726, + "balance_loss_clip": 1.04785776, + "balance_loss_mlp": 1.016675, + "epoch": 0.9234654001082186, + "flos": 17999326045440.0, + "grad_norm": 1.8838751861464764, + "language_loss": 0.77930182, + "learning_rate": 6.10485917001824e-08, + "loss": 0.80113339, + "num_input_tokens_seen": 166061865, + "step": 7680, + "time_per_iteration": 2.449502468109131 + }, + { + "auxiliary_loss_clip": 0.01142857, + "auxiliary_loss_mlp": 0.01021789, + "balance_loss_clip": 1.04411149, + "balance_loss_mlp": 1.01490724, + "epoch": 0.9235856429988577, + "flos": 24750747411840.0, + "grad_norm": 1.6735428336951235, + "language_loss": 0.80875278, + "learning_rate": 6.085774474409322e-08, + "loss": 0.83039927, + "num_input_tokens_seen": 166082425, + "step": 7681, + "time_per_iteration": 2.5327255725860596 + }, + { + "auxiliary_loss_clip": 0.011422, + "auxiliary_loss_mlp": 0.01026999, + "balance_loss_clip": 1.04760444, + "balance_loss_mlp": 1.01985288, + "epoch": 0.9237058858894968, + "flos": 14099894599680.0, + "grad_norm": 1.961362977456114, + "language_loss": 0.69879234, + "learning_rate": 6.066719195434267e-08, + "loss": 0.72048438, + "num_input_tokens_seen": 166100225, + "step": 7682, + "time_per_iteration": 2.451974868774414 + }, + { + "auxiliary_loss_clip": 0.01156503, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.04698825, + "balance_loss_mlp": 1.02122355, + "epoch": 0.9238261287801359, + "flos": 28694529175680.0, + "grad_norm": 3.4387801918883096, + "language_loss": 0.66416365, + "learning_rate": 6.047693335983717e-08, + "loss": 0.68601781, + "num_input_tokens_seen": 166122570, + "step": 7683, + "time_per_iteration": 2.5161654949188232 + }, + { + "auxiliary_loss_clip": 0.01155956, + "auxiliary_loss_mlp": 0.0102431, + "balance_loss_clip": 1.04498529, + "balance_loss_mlp": 1.0168829, + "epoch": 0.923946371670775, + "flos": 23111856541440.0, + "grad_norm": 2.5773597587972286, + "language_loss": 0.83013809, + "learning_rate": 6.028696898943853e-08, + "loss": 0.85194069, + "num_input_tokens_seen": 166141630, + "step": 7684, + "time_per_iteration": 2.4455151557922363 + }, + { + "auxiliary_loss_clip": 0.0113928, + "auxiliary_loss_mlp": 0.00762844, + "balance_loss_clip": 1.04141331, + "balance_loss_mlp": 1.00061846, + "epoch": 0.924066614561414, + "flos": 21867120587520.0, + "grad_norm": 2.0128977487784425, + "language_loss": 0.70598978, + "learning_rate": 6.00972988719648e-08, + "loss": 0.72501105, + "num_input_tokens_seen": 166159865, + "step": 7685, + "time_per_iteration": 2.5226376056671143 + }, + { + "auxiliary_loss_clip": 0.0112797, + "auxiliary_loss_mlp": 0.00762639, + "balance_loss_clip": 1.04360771, + "balance_loss_mlp": 1.00054479, + "epoch": 0.9241868574520532, + "flos": 28511887495680.0, + "grad_norm": 2.5237570055937697, + "language_loss": 0.70597476, + "learning_rate": 5.990792303618807e-08, + "loss": 0.72488081, + "num_input_tokens_seen": 166179445, + "step": 7686, + "time_per_iteration": 2.600367307662964 + }, + { + "auxiliary_loss_clip": 0.01123554, + "auxiliary_loss_mlp": 0.01021267, + "balance_loss_clip": 1.04291296, + "balance_loss_mlp": 1.01374459, + "epoch": 0.9243071003426923, + "flos": 30518324282880.0, + "grad_norm": 1.9849346669385324, + "language_loss": 0.69514239, + "learning_rate": 5.971884151083695e-08, + "loss": 0.71659058, + "num_input_tokens_seen": 166201855, + "step": 7687, + "time_per_iteration": 2.5934078693389893 + }, + { + "auxiliary_loss_clip": 0.01141899, + "auxiliary_loss_mlp": 0.01024635, + "balance_loss_clip": 1.04368114, + "balance_loss_mlp": 1.01760793, + "epoch": 0.9244273432333313, + "flos": 28658331244800.0, + "grad_norm": 1.7522689612155604, + "language_loss": 0.74460596, + "learning_rate": 5.9530054324595124e-08, + "loss": 0.76627129, + "num_input_tokens_seen": 166221970, + "step": 7688, + "time_per_iteration": 2.5509495735168457 + }, + { + "auxiliary_loss_clip": 0.0104737, + "auxiliary_loss_mlp": 0.00752626, + "balance_loss_clip": 1.00661397, + "balance_loss_mlp": 1.00047922, + "epoch": 0.9245475861239704, + "flos": 66230589237120.0, + "grad_norm": 0.7140939454913553, + "language_loss": 0.57551765, + "learning_rate": 5.934156150610103e-08, + "loss": 0.59351766, + "num_input_tokens_seen": 166279335, + "step": 7689, + "time_per_iteration": 3.0808489322662354 + }, + { + "auxiliary_loss_clip": 0.0113693, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.04285383, + "balance_loss_mlp": 1.02062607, + "epoch": 0.9246678290146095, + "flos": 24239918142720.0, + "grad_norm": 10.598578025434877, + "language_loss": 0.79633486, + "learning_rate": 5.915336308394914e-08, + "loss": 0.81798923, + "num_input_tokens_seen": 166298170, + "step": 7690, + "time_per_iteration": 2.5216658115386963 + }, + { + "auxiliary_loss_clip": 0.0114752, + "auxiliary_loss_mlp": 0.01021265, + "balance_loss_clip": 1.0440495, + "balance_loss_mlp": 1.01475966, + "epoch": 0.9247880719052486, + "flos": 18988808976000.0, + "grad_norm": 1.6754619249397797, + "language_loss": 0.7714628, + "learning_rate": 5.89654590866886e-08, + "loss": 0.79315066, + "num_input_tokens_seen": 166317670, + "step": 7691, + "time_per_iteration": 3.248448371887207 + }, + { + "auxiliary_loss_clip": 0.01102023, + "auxiliary_loss_mlp": 0.01023733, + "balance_loss_clip": 1.04396057, + "balance_loss_mlp": 1.01596081, + "epoch": 0.9249083147958876, + "flos": 24024095274240.0, + "grad_norm": 2.096013892199512, + "language_loss": 0.88399017, + "learning_rate": 5.877784954282483e-08, + "loss": 0.90524769, + "num_input_tokens_seen": 166337010, + "step": 7692, + "time_per_iteration": 3.3954405784606934 + }, + { + "auxiliary_loss_clip": 0.01156722, + "auxiliary_loss_mlp": 0.01021287, + "balance_loss_clip": 1.04538274, + "balance_loss_mlp": 1.01342475, + "epoch": 0.9250285576865268, + "flos": 30773972355840.0, + "grad_norm": 2.6038069734888265, + "language_loss": 0.72158515, + "learning_rate": 5.8590534480817963e-08, + "loss": 0.74336529, + "num_input_tokens_seen": 166358735, + "step": 7693, + "time_per_iteration": 2.53037428855896 + }, + { + "auxiliary_loss_clip": 0.01170377, + "auxiliary_loss_mlp": 0.01024398, + "balance_loss_clip": 1.04947066, + "balance_loss_mlp": 1.01681685, + "epoch": 0.9251488005771659, + "flos": 10633581348480.0, + "grad_norm": 2.7039539438773788, + "language_loss": 0.72441328, + "learning_rate": 5.840351392908349e-08, + "loss": 0.74636102, + "num_input_tokens_seen": 166374455, + "step": 7694, + "time_per_iteration": 3.0788426399230957 + }, + { + "auxiliary_loss_clip": 0.01146233, + "auxiliary_loss_mlp": 0.0076262, + "balance_loss_clip": 1.0440352, + "balance_loss_mlp": 1.00063682, + "epoch": 0.9252690434678049, + "flos": 23586416052480.0, + "grad_norm": 2.1315015500959866, + "language_loss": 0.70505023, + "learning_rate": 5.821678791599205e-08, + "loss": 0.72413868, + "num_input_tokens_seen": 166393900, + "step": 7695, + "time_per_iteration": 2.5130515098571777 + }, + { + "auxiliary_loss_clip": 0.0113812, + "auxiliary_loss_mlp": 0.01023926, + "balance_loss_clip": 1.04551351, + "balance_loss_mlp": 1.0170331, + "epoch": 0.9253892863584441, + "flos": 21469158829440.0, + "grad_norm": 1.9100906957075183, + "language_loss": 0.80669183, + "learning_rate": 5.803035646986965e-08, + "loss": 0.82831228, + "num_input_tokens_seen": 166413235, + "step": 7696, + "time_per_iteration": 2.4835433959960938 + }, + { + "auxiliary_loss_clip": 0.01170047, + "auxiliary_loss_mlp": 0.0102414, + "balance_loss_clip": 1.04784274, + "balance_loss_mlp": 1.01608729, + "epoch": 0.9255095292490831, + "flos": 17456680304640.0, + "grad_norm": 3.6611904069633487, + "language_loss": 0.67324424, + "learning_rate": 5.7844219618998766e-08, + "loss": 0.69518614, + "num_input_tokens_seen": 166427560, + "step": 7697, + "time_per_iteration": 3.22959303855896 + }, + { + "auxiliary_loss_clip": 0.01109715, + "auxiliary_loss_mlp": 0.01026232, + "balance_loss_clip": 1.0375042, + "balance_loss_mlp": 1.01863575, + "epoch": 0.9256297721397222, + "flos": 24750675584640.0, + "grad_norm": 2.1305756471739814, + "language_loss": 0.71369725, + "learning_rate": 5.765837739161505e-08, + "loss": 0.73505676, + "num_input_tokens_seen": 166446680, + "step": 7698, + "time_per_iteration": 2.534395694732666 + }, + { + "auxiliary_loss_clip": 0.01125462, + "auxiliary_loss_mlp": 0.01021536, + "balance_loss_clip": 1.04289269, + "balance_loss_mlp": 1.01424634, + "epoch": 0.9257500150303614, + "flos": 23112215677440.0, + "grad_norm": 1.675786311763946, + "language_loss": 0.74224788, + "learning_rate": 5.7472829815911504e-08, + "loss": 0.76371783, + "num_input_tokens_seen": 166465505, + "step": 7699, + "time_per_iteration": 2.5213816165924072 + }, + { + "auxiliary_loss_clip": 0.01136942, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.04469419, + "balance_loss_mlp": 1.02185273, + "epoch": 0.9258702579210004, + "flos": 22564685687040.0, + "grad_norm": 1.9849638293053042, + "language_loss": 0.81492102, + "learning_rate": 5.7287576920035164e-08, + "loss": 0.83658731, + "num_input_tokens_seen": 166484520, + "step": 7700, + "time_per_iteration": 2.500467300415039 + }, + { + "auxiliary_loss_clip": 0.01122377, + "auxiliary_loss_mlp": 0.01024131, + "balance_loss_clip": 1.04347765, + "balance_loss_mlp": 1.01738119, + "epoch": 0.9259905008116395, + "flos": 30004298703360.0, + "grad_norm": 1.8160262865525085, + "language_loss": 0.76538223, + "learning_rate": 5.7102618732088435e-08, + "loss": 0.78684735, + "num_input_tokens_seen": 166503850, + "step": 7701, + "time_per_iteration": 2.5791163444519043 + }, + { + "auxiliary_loss_clip": 0.01145489, + "auxiliary_loss_mlp": 0.01025739, + "balance_loss_clip": 1.04561174, + "balance_loss_mlp": 1.01916432, + "epoch": 0.9261107437022786, + "flos": 24572128055040.0, + "grad_norm": 1.8921008394901089, + "language_loss": 0.74530882, + "learning_rate": 5.6917955280130216e-08, + "loss": 0.76702106, + "num_input_tokens_seen": 166525330, + "step": 7702, + "time_per_iteration": 2.5264928340911865 + }, + { + "auxiliary_loss_clip": 0.01150707, + "auxiliary_loss_mlp": 0.01028483, + "balance_loss_clip": 1.04535246, + "balance_loss_mlp": 1.02115715, + "epoch": 0.9262309865929177, + "flos": 22018448586240.0, + "grad_norm": 2.2259415487760656, + "language_loss": 0.71993113, + "learning_rate": 5.6733586592172755e-08, + "loss": 0.74172306, + "num_input_tokens_seen": 166544825, + "step": 7703, + "time_per_iteration": 2.455397367477417 + }, + { + "auxiliary_loss_clip": 0.0113273, + "auxiliary_loss_mlp": 0.00761776, + "balance_loss_clip": 1.0414269, + "balance_loss_mlp": 1.00060701, + "epoch": 0.9263512294835567, + "flos": 20339481116160.0, + "grad_norm": 1.881586619652047, + "language_loss": 0.80031556, + "learning_rate": 5.6549512696185244e-08, + "loss": 0.8192606, + "num_input_tokens_seen": 166563325, + "step": 7704, + "time_per_iteration": 2.4780008792877197 + }, + { + "auxiliary_loss_clip": 0.01165836, + "auxiliary_loss_mlp": 0.01020694, + "balance_loss_clip": 1.04684091, + "balance_loss_mlp": 1.0131923, + "epoch": 0.9264714723741959, + "flos": 21215378263680.0, + "grad_norm": 1.7783046904483053, + "language_loss": 0.68215978, + "learning_rate": 5.636573362009156e-08, + "loss": 0.70402515, + "num_input_tokens_seen": 166583385, + "step": 7705, + "time_per_iteration": 2.4337048530578613 + }, + { + "auxiliary_loss_clip": 0.01170122, + "auxiliary_loss_mlp": 0.01025485, + "balance_loss_clip": 1.04783285, + "balance_loss_mlp": 1.01798677, + "epoch": 0.926591715264835, + "flos": 18004964480640.0, + "grad_norm": 1.9418285682406315, + "language_loss": 0.77093118, + "learning_rate": 5.618224939177074e-08, + "loss": 0.79288727, + "num_input_tokens_seen": 166601290, + "step": 7706, + "time_per_iteration": 2.3997740745544434 + }, + { + "auxiliary_loss_clip": 0.01128178, + "auxiliary_loss_mlp": 0.01025428, + "balance_loss_clip": 1.04198945, + "balance_loss_mlp": 1.01767361, + "epoch": 0.926711958155474, + "flos": 36167969825280.0, + "grad_norm": 1.7421050364753696, + "language_loss": 0.7018503, + "learning_rate": 5.599906003905719e-08, + "loss": 0.72338641, + "num_input_tokens_seen": 166623835, + "step": 7707, + "time_per_iteration": 2.6004676818847656 + }, + { + "auxiliary_loss_clip": 0.01148941, + "auxiliary_loss_mlp": 0.0102451, + "balance_loss_clip": 1.04678369, + "balance_loss_mlp": 1.01676738, + "epoch": 0.9268322010461132, + "flos": 21032736583680.0, + "grad_norm": 2.9806327560525188, + "language_loss": 0.81735021, + "learning_rate": 5.581616558974023e-08, + "loss": 0.83908474, + "num_input_tokens_seen": 166642400, + "step": 7708, + "time_per_iteration": 2.436887264251709 + }, + { + "auxiliary_loss_clip": 0.01160279, + "auxiliary_loss_mlp": 0.00762644, + "balance_loss_clip": 1.0467155, + "balance_loss_mlp": 1.00059175, + "epoch": 0.9269524439367522, + "flos": 22964838174720.0, + "grad_norm": 1.7367121796857168, + "language_loss": 0.79120314, + "learning_rate": 5.5633566071565444e-08, + "loss": 0.81043231, + "num_input_tokens_seen": 166661640, + "step": 7709, + "time_per_iteration": 2.460806369781494 + }, + { + "auxiliary_loss_clip": 0.01100382, + "auxiliary_loss_mlp": 0.01021398, + "balance_loss_clip": 1.03973699, + "balance_loss_mlp": 1.01437664, + "epoch": 0.9270726868273913, + "flos": 41975551468800.0, + "grad_norm": 2.0227381978061882, + "language_loss": 0.70734727, + "learning_rate": 5.5451261512232896e-08, + "loss": 0.7285651, + "num_input_tokens_seen": 166684320, + "step": 7710, + "time_per_iteration": 2.8055975437164307 + }, + { + "auxiliary_loss_clip": 0.01157239, + "auxiliary_loss_mlp": 0.01023553, + "balance_loss_clip": 1.04365754, + "balance_loss_mlp": 1.01582277, + "epoch": 0.9271929297180305, + "flos": 19791771557760.0, + "grad_norm": 1.9330613288743874, + "language_loss": 0.6246407, + "learning_rate": 5.5269251939397576e-08, + "loss": 0.64644861, + "num_input_tokens_seen": 166703835, + "step": 7711, + "time_per_iteration": 2.4434733390808105 + }, + { + "auxiliary_loss_clip": 0.01124602, + "auxiliary_loss_mlp": 0.01022477, + "balance_loss_clip": 1.03957546, + "balance_loss_mlp": 1.01512814, + "epoch": 0.9273131726086695, + "flos": 19968343839360.0, + "grad_norm": 2.94802167749872, + "language_loss": 0.7642951, + "learning_rate": 5.508753738067073e-08, + "loss": 0.78576589, + "num_input_tokens_seen": 166723375, + "step": 7712, + "time_per_iteration": 2.5319905281066895 + }, + { + "auxiliary_loss_clip": 0.01155307, + "auxiliary_loss_mlp": 0.01024028, + "balance_loss_clip": 1.04340255, + "balance_loss_mlp": 1.0166316, + "epoch": 0.9274334154993086, + "flos": 23258587599360.0, + "grad_norm": 1.8688148592148275, + "language_loss": 0.78870416, + "learning_rate": 5.4906117863617875e-08, + "loss": 0.81049752, + "num_input_tokens_seen": 166742760, + "step": 7713, + "time_per_iteration": 2.477344512939453 + }, + { + "auxiliary_loss_clip": 0.0112131, + "auxiliary_loss_mlp": 0.01023171, + "balance_loss_clip": 1.04012203, + "balance_loss_mlp": 1.01586902, + "epoch": 0.9275536583899477, + "flos": 31795343585280.0, + "grad_norm": 1.8782603810555907, + "language_loss": 0.77882683, + "learning_rate": 5.4724993415760533e-08, + "loss": 0.80027157, + "num_input_tokens_seen": 166761115, + "step": 7714, + "time_per_iteration": 2.590710401535034 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.00762811, + "balance_loss_clip": 1.04230046, + "balance_loss_mlp": 1.00071216, + "epoch": 0.9276739012805868, + "flos": 18696998885760.0, + "grad_norm": 4.574307584379913, + "language_loss": 0.74485362, + "learning_rate": 5.454416406457496e-08, + "loss": 0.76382256, + "num_input_tokens_seen": 166780210, + "step": 7715, + "time_per_iteration": 2.5015735626220703 + }, + { + "auxiliary_loss_clip": 0.01154099, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.04601264, + "balance_loss_mlp": 1.02139115, + "epoch": 0.9277941441712259, + "flos": 13879079740800.0, + "grad_norm": 2.718192655969446, + "language_loss": 0.73612267, + "learning_rate": 5.436362983749299e-08, + "loss": 0.75794464, + "num_input_tokens_seen": 166795380, + "step": 7716, + "time_per_iteration": 2.426255941390991 + }, + { + "auxiliary_loss_clip": 0.0111826, + "auxiliary_loss_mlp": 0.01025336, + "balance_loss_clip": 1.04353654, + "balance_loss_mlp": 1.01863611, + "epoch": 0.927914387061865, + "flos": 23258659426560.0, + "grad_norm": 2.0357339326797863, + "language_loss": 0.64435148, + "learning_rate": 5.418339076190137e-08, + "loss": 0.6657874, + "num_input_tokens_seen": 166814890, + "step": 7717, + "time_per_iteration": 2.534334897994995 + }, + { + "auxiliary_loss_clip": 0.01133741, + "auxiliary_loss_mlp": 0.010192, + "balance_loss_clip": 1.04435909, + "balance_loss_mlp": 1.0119524, + "epoch": 0.9280346299525041, + "flos": 18073733068800.0, + "grad_norm": 1.9402309055850935, + "language_loss": 0.8857125, + "learning_rate": 5.400344686514202e-08, + "loss": 0.90724194, + "num_input_tokens_seen": 166832475, + "step": 7718, + "time_per_iteration": 3.1925737857818604 + }, + { + "auxiliary_loss_clip": 0.01153726, + "auxiliary_loss_mlp": 0.01020876, + "balance_loss_clip": 1.04772925, + "balance_loss_mlp": 1.01347923, + "epoch": 0.9281548728431431, + "flos": 22342901160960.0, + "grad_norm": 1.8450719505933295, + "language_loss": 0.6683799, + "learning_rate": 5.38237981745131e-08, + "loss": 0.69012588, + "num_input_tokens_seen": 166850590, + "step": 7719, + "time_per_iteration": 2.482804775238037 + }, + { + "auxiliary_loss_clip": 0.01156647, + "auxiliary_loss_mlp": 0.00762314, + "balance_loss_clip": 1.04633236, + "balance_loss_mlp": 1.00064731, + "epoch": 0.9282751157337822, + "flos": 18843765857280.0, + "grad_norm": 1.8024746532824132, + "language_loss": 0.81573415, + "learning_rate": 5.364444471726592e-08, + "loss": 0.83492374, + "num_input_tokens_seen": 166869795, + "step": 7720, + "time_per_iteration": 3.934846878051758 + }, + { + "auxiliary_loss_clip": 0.01150947, + "auxiliary_loss_mlp": 0.01021978, + "balance_loss_clip": 1.04343629, + "balance_loss_mlp": 1.01481652, + "epoch": 0.9283953586244214, + "flos": 25556834476800.0, + "grad_norm": 2.069737650774822, + "language_loss": 0.8012054, + "learning_rate": 5.346538652060939e-08, + "loss": 0.82293463, + "num_input_tokens_seen": 166891150, + "step": 7721, + "time_per_iteration": 2.4889321327209473 + }, + { + "auxiliary_loss_clip": 0.01135794, + "auxiliary_loss_mlp": 0.01021946, + "balance_loss_clip": 1.04500055, + "balance_loss_mlp": 1.01486182, + "epoch": 0.9285156015150604, + "flos": 18223480869120.0, + "grad_norm": 1.9233301808911212, + "language_loss": 0.70256656, + "learning_rate": 5.3286623611705994e-08, + "loss": 0.72414392, + "num_input_tokens_seen": 166909195, + "step": 7722, + "time_per_iteration": 2.486859083175659 + }, + { + "auxiliary_loss_clip": 0.01063195, + "auxiliary_loss_mlp": 0.01000572, + "balance_loss_clip": 1.00737143, + "balance_loss_mlp": 0.99960041, + "epoch": 0.9286358444056995, + "flos": 66400017690240.0, + "grad_norm": 0.84346739268616, + "language_loss": 0.60502636, + "learning_rate": 5.3108156017673824e-08, + "loss": 0.625664, + "num_input_tokens_seen": 166970955, + "step": 7723, + "time_per_iteration": 3.0794718265533447 + }, + { + "auxiliary_loss_clip": 0.01146673, + "auxiliary_loss_mlp": 0.01024121, + "balance_loss_clip": 1.04592216, + "balance_loss_mlp": 1.01606274, + "epoch": 0.9287560872963386, + "flos": 22345630594560.0, + "grad_norm": 1.9181033835974528, + "language_loss": 0.7154842, + "learning_rate": 5.2929983765586775e-08, + "loss": 0.73719215, + "num_input_tokens_seen": 166989735, + "step": 7724, + "time_per_iteration": 3.2754366397857666 + }, + { + "auxiliary_loss_clip": 0.01169926, + "auxiliary_loss_mlp": 0.01024223, + "balance_loss_clip": 1.04993391, + "balance_loss_mlp": 1.01762211, + "epoch": 0.9288763301869777, + "flos": 25700225569920.0, + "grad_norm": 1.7232063330869178, + "language_loss": 0.62475479, + "learning_rate": 5.275210688247278e-08, + "loss": 0.64669627, + "num_input_tokens_seen": 167010060, + "step": 7725, + "time_per_iteration": 2.462496757507324 + }, + { + "auxiliary_loss_clip": 0.01109508, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.04158449, + "balance_loss_mlp": 1.01848888, + "epoch": 0.9289965730776167, + "flos": 12312046028160.0, + "grad_norm": 1.9825540298767756, + "language_loss": 0.85249341, + "learning_rate": 5.257452539531604e-08, + "loss": 0.87384474, + "num_input_tokens_seen": 167027130, + "step": 7726, + "time_per_iteration": 2.5266404151916504 + }, + { + "auxiliary_loss_clip": 0.01154048, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.04501355, + "balance_loss_mlp": 1.01976705, + "epoch": 0.9291168159682559, + "flos": 26685973486080.0, + "grad_norm": 1.9172699149588017, + "language_loss": 0.68334115, + "learning_rate": 5.2397239331055445e-08, + "loss": 0.705154, + "num_input_tokens_seen": 167049130, + "step": 7727, + "time_per_iteration": 2.4812376499176025 + }, + { + "auxiliary_loss_clip": 0.01137788, + "auxiliary_loss_mlp": 0.01021964, + "balance_loss_clip": 1.04688728, + "balance_loss_mlp": 1.01458836, + "epoch": 0.929237058858895, + "flos": 14538256179840.0, + "grad_norm": 2.5130556492944214, + "language_loss": 0.81001782, + "learning_rate": 5.2220248716585036e-08, + "loss": 0.83161533, + "num_input_tokens_seen": 167066810, + "step": 7728, + "time_per_iteration": 2.4652581214904785 + }, + { + "auxiliary_loss_clip": 0.01145429, + "auxiliary_loss_mlp": 0.01027759, + "balance_loss_clip": 1.04333436, + "balance_loss_mlp": 1.02027225, + "epoch": 0.929357301749534, + "flos": 23835456023040.0, + "grad_norm": 2.307459436688935, + "language_loss": 0.75542855, + "learning_rate": 5.204355357875445e-08, + "loss": 0.77716047, + "num_input_tokens_seen": 167085155, + "step": 7729, + "time_per_iteration": 2.4746837615966797 + }, + { + "auxiliary_loss_clip": 0.0113665, + "auxiliary_loss_mlp": 0.01021773, + "balance_loss_clip": 1.04202151, + "balance_loss_mlp": 1.01436973, + "epoch": 0.9294775446401732, + "flos": 12969319046400.0, + "grad_norm": 3.096310964332137, + "language_loss": 0.70309055, + "learning_rate": 5.1867153944367584e-08, + "loss": 0.72467482, + "num_input_tokens_seen": 167101545, + "step": 7730, + "time_per_iteration": 2.490938663482666 + }, + { + "auxiliary_loss_clip": 0.01131855, + "auxiliary_loss_mlp": 0.01028022, + "balance_loss_clip": 1.04497457, + "balance_loss_mlp": 1.02067316, + "epoch": 0.9295977875308122, + "flos": 26211809024640.0, + "grad_norm": 1.619803555091549, + "language_loss": 0.73254812, + "learning_rate": 5.16910498401848e-08, + "loss": 0.75414693, + "num_input_tokens_seen": 167120995, + "step": 7731, + "time_per_iteration": 2.5727860927581787 + }, + { + "auxiliary_loss_clip": 0.01167178, + "auxiliary_loss_mlp": 0.0102381, + "balance_loss_clip": 1.04856026, + "balance_loss_mlp": 1.01664519, + "epoch": 0.9297180304214513, + "flos": 16472297105280.0, + "grad_norm": 5.101071757606467, + "language_loss": 0.83559084, + "learning_rate": 5.151524129292073e-08, + "loss": 0.85750079, + "num_input_tokens_seen": 167138890, + "step": 7732, + "time_per_iteration": 2.406557321548462 + }, + { + "auxiliary_loss_clip": 0.01150592, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.04440498, + "balance_loss_mlp": 1.02095032, + "epoch": 0.9298382733120905, + "flos": 24060436859520.0, + "grad_norm": 1.7952635958347178, + "language_loss": 0.66486007, + "learning_rate": 5.1339728329245155e-08, + "loss": 0.68664759, + "num_input_tokens_seen": 167159455, + "step": 7733, + "time_per_iteration": 2.4784913063049316 + }, + { + "auxiliary_loss_clip": 0.01172751, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.04818416, + "balance_loss_mlp": 1.02224588, + "epoch": 0.9299585162027295, + "flos": 22127652910080.0, + "grad_norm": 2.332469815470943, + "language_loss": 0.79360735, + "learning_rate": 5.116451097578367e-08, + "loss": 0.81563699, + "num_input_tokens_seen": 167178495, + "step": 7734, + "time_per_iteration": 2.435184955596924 + }, + { + "auxiliary_loss_clip": 0.0112545, + "auxiliary_loss_mlp": 0.01026045, + "balance_loss_clip": 1.04343855, + "balance_loss_mlp": 1.01883841, + "epoch": 0.9300787590933686, + "flos": 21471780522240.0, + "grad_norm": 1.7193743013446632, + "language_loss": 0.74058425, + "learning_rate": 5.0989589259115895e-08, + "loss": 0.76209927, + "num_input_tokens_seen": 167199380, + "step": 7735, + "time_per_iteration": 2.580061197280884 + }, + { + "auxiliary_loss_clip": 0.01150157, + "auxiliary_loss_mlp": 0.01024653, + "balance_loss_clip": 1.04237485, + "balance_loss_mlp": 1.01654124, + "epoch": 0.9301990019840077, + "flos": 17779588594560.0, + "grad_norm": 1.8537483659497571, + "language_loss": 0.71717215, + "learning_rate": 5.081496320577816e-08, + "loss": 0.73892021, + "num_input_tokens_seen": 167216500, + "step": 7736, + "time_per_iteration": 2.522118330001831 + }, + { + "auxiliary_loss_clip": 0.0104507, + "auxiliary_loss_mlp": 0.01001246, + "balance_loss_clip": 1.01448488, + "balance_loss_mlp": 1.00001228, + "epoch": 0.9303192448746468, + "flos": 58896122307840.0, + "grad_norm": 0.9123924974367614, + "language_loss": 0.61160827, + "learning_rate": 5.0640632842260835e-08, + "loss": 0.6320715, + "num_input_tokens_seen": 167276760, + "step": 7737, + "time_per_iteration": 3.14267635345459 + }, + { + "auxiliary_loss_clip": 0.01124782, + "auxiliary_loss_mlp": 0.00762396, + "balance_loss_clip": 1.04562378, + "balance_loss_mlp": 1.00060868, + "epoch": 0.9304394877652858, + "flos": 57663522172800.0, + "grad_norm": 1.421512627758082, + "language_loss": 0.72752571, + "learning_rate": 5.0466598195009426e-08, + "loss": 0.7463975, + "num_input_tokens_seen": 167303630, + "step": 7738, + "time_per_iteration": 2.8754804134368896 + }, + { + "auxiliary_loss_clip": 0.01126879, + "auxiliary_loss_mlp": 0.01025306, + "balance_loss_clip": 1.04372931, + "balance_loss_mlp": 1.01789129, + "epoch": 0.930559730655925, + "flos": 20996143603200.0, + "grad_norm": 2.2811174529906393, + "language_loss": 0.70205283, + "learning_rate": 5.0292859290425036e-08, + "loss": 0.7235747, + "num_input_tokens_seen": 167321500, + "step": 7739, + "time_per_iteration": 2.503455877304077 + }, + { + "auxiliary_loss_clip": 0.01166099, + "auxiliary_loss_mlp": 0.01022184, + "balance_loss_clip": 1.04725266, + "balance_loss_mlp": 1.01541305, + "epoch": 0.9306799735465641, + "flos": 23258264376960.0, + "grad_norm": 2.2634299704563743, + "language_loss": 0.77631128, + "learning_rate": 5.011941615486348e-08, + "loss": 0.79819405, + "num_input_tokens_seen": 167340615, + "step": 7740, + "time_per_iteration": 2.46232271194458 + }, + { + "auxiliary_loss_clip": 0.01165991, + "auxiliary_loss_mlp": 0.01023352, + "balance_loss_clip": 1.04549277, + "balance_loss_mlp": 1.01592529, + "epoch": 0.9308002164372031, + "flos": 15231547560960.0, + "grad_norm": 2.104684344233991, + "language_loss": 0.84534574, + "learning_rate": 4.994626881463659e-08, + "loss": 0.86723918, + "num_input_tokens_seen": 167356870, + "step": 7741, + "time_per_iteration": 2.390636444091797 + }, + { + "auxiliary_loss_clip": 0.01096382, + "auxiliary_loss_mlp": 0.01022698, + "balance_loss_clip": 1.0391854, + "balance_loss_mlp": 1.01535463, + "epoch": 0.9309204593278423, + "flos": 30847481539200.0, + "grad_norm": 1.8858911154362206, + "language_loss": 0.70847976, + "learning_rate": 4.9773417296009814e-08, + "loss": 0.72967058, + "num_input_tokens_seen": 167378390, + "step": 7742, + "time_per_iteration": 2.6384434700012207 + }, + { + "auxiliary_loss_clip": 0.01159313, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.0468266, + "balance_loss_mlp": 1.02342963, + "epoch": 0.9310407022184813, + "flos": 23037269950080.0, + "grad_norm": 4.20702843634947, + "language_loss": 0.65535766, + "learning_rate": 4.960086162520527e-08, + "loss": 0.67725694, + "num_input_tokens_seen": 167398480, + "step": 7743, + "time_per_iteration": 2.492748975753784 + }, + { + "auxiliary_loss_clip": 0.01116856, + "auxiliary_loss_mlp": 0.01026046, + "balance_loss_clip": 1.04183447, + "balance_loss_mlp": 1.01896477, + "epoch": 0.9311609451091204, + "flos": 22127976132480.0, + "grad_norm": 2.017693094553869, + "language_loss": 0.82419872, + "learning_rate": 4.942860182839936e-08, + "loss": 0.84562778, + "num_input_tokens_seen": 167416825, + "step": 7744, + "time_per_iteration": 3.3346023559570312 + }, + { + "auxiliary_loss_clip": 0.01138882, + "auxiliary_loss_mlp": 0.01024904, + "balance_loss_clip": 1.04526079, + "balance_loss_mlp": 1.01756978, + "epoch": 0.9312811879997596, + "flos": 21099206701440.0, + "grad_norm": 1.8609541799765275, + "language_loss": 0.79984605, + "learning_rate": 4.925663793172341e-08, + "loss": 0.82148397, + "num_input_tokens_seen": 167434785, + "step": 7745, + "time_per_iteration": 2.486422300338745 + }, + { + "auxiliary_loss_clip": 0.01038291, + "auxiliary_loss_mlp": 0.00752663, + "balance_loss_clip": 1.00747752, + "balance_loss_mlp": 1.00051165, + "epoch": 0.9314014308903986, + "flos": 67148179096320.0, + "grad_norm": 0.780922869403287, + "language_loss": 0.56509423, + "learning_rate": 4.908496996126477e-08, + "loss": 0.58300376, + "num_input_tokens_seen": 167498245, + "step": 7746, + "time_per_iteration": 3.8717048168182373 + }, + { + "auxiliary_loss_clip": 0.01154121, + "auxiliary_loss_mlp": 0.01026407, + "balance_loss_clip": 1.04906392, + "balance_loss_mlp": 1.01879835, + "epoch": 0.9315216737810377, + "flos": 22565583527040.0, + "grad_norm": 1.5563820934123935, + "language_loss": 0.76501381, + "learning_rate": 4.89135979430646e-08, + "loss": 0.7868191, + "num_input_tokens_seen": 167518290, + "step": 7747, + "time_per_iteration": 3.2636313438415527 + }, + { + "auxiliary_loss_clip": 0.01168909, + "auxiliary_loss_mlp": 0.01023342, + "balance_loss_clip": 1.04907119, + "balance_loss_mlp": 1.01553941, + "epoch": 0.9316419166716768, + "flos": 23984054588160.0, + "grad_norm": 1.6860887224985297, + "language_loss": 0.85476017, + "learning_rate": 4.874252190312078e-08, + "loss": 0.87668264, + "num_input_tokens_seen": 167538675, + "step": 7748, + "time_per_iteration": 2.4832141399383545 + }, + { + "auxiliary_loss_clip": 0.01157733, + "auxiliary_loss_mlp": 0.01024071, + "balance_loss_clip": 1.04631138, + "balance_loss_mlp": 1.01672733, + "epoch": 0.9317621595623159, + "flos": 30230464688640.0, + "grad_norm": 1.809055340452101, + "language_loss": 0.64729279, + "learning_rate": 4.857174186738477e-08, + "loss": 0.66911083, + "num_input_tokens_seen": 167562025, + "step": 7749, + "time_per_iteration": 2.5448849201202393 + }, + { + "auxiliary_loss_clip": 0.01169065, + "auxiliary_loss_mlp": 0.01024235, + "balance_loss_clip": 1.04857707, + "balance_loss_mlp": 1.0166353, + "epoch": 0.931882402452955, + "flos": 15742735966080.0, + "grad_norm": 2.451920694043136, + "language_loss": 0.73077047, + "learning_rate": 4.840125786176408e-08, + "loss": 0.75270343, + "num_input_tokens_seen": 167578230, + "step": 7750, + "time_per_iteration": 2.3938069343566895 + }, + { + "auxiliary_loss_clip": 0.01135647, + "auxiliary_loss_mlp": 0.01023041, + "balance_loss_clip": 1.04316247, + "balance_loss_mlp": 1.01602006, + "epoch": 0.932002645343594, + "flos": 28366521154560.0, + "grad_norm": 1.7685535834537507, + "language_loss": 0.77175421, + "learning_rate": 4.823106991212067e-08, + "loss": 0.79334104, + "num_input_tokens_seen": 167597470, + "step": 7751, + "time_per_iteration": 3.3278825283050537 + }, + { + "auxiliary_loss_clip": 0.01155494, + "auxiliary_loss_mlp": 0.01023049, + "balance_loss_clip": 1.04626536, + "balance_loss_mlp": 1.01586103, + "epoch": 0.9321228882342332, + "flos": 15341146934400.0, + "grad_norm": 1.961208590766289, + "language_loss": 0.83382338, + "learning_rate": 4.806117804427212e-08, + "loss": 0.85560876, + "num_input_tokens_seen": 167615405, + "step": 7752, + "time_per_iteration": 2.4304287433624268 + }, + { + "auxiliary_loss_clip": 0.01149135, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.04383516, + "balance_loss_mlp": 1.02029526, + "epoch": 0.9322431311248722, + "flos": 17895365107200.0, + "grad_norm": 2.3252554181628655, + "language_loss": 0.64426142, + "learning_rate": 4.7891582283990926e-08, + "loss": 0.66603196, + "num_input_tokens_seen": 167634130, + "step": 7753, + "time_per_iteration": 2.4121787548065186 + }, + { + "auxiliary_loss_clip": 0.01126588, + "auxiliary_loss_mlp": 0.01020101, + "balance_loss_clip": 1.04240823, + "balance_loss_mlp": 1.01288033, + "epoch": 0.9323633740155113, + "flos": 24169713010560.0, + "grad_norm": 1.6388245176584502, + "language_loss": 0.73062283, + "learning_rate": 4.772228265700473e-08, + "loss": 0.75208974, + "num_input_tokens_seen": 167654990, + "step": 7754, + "time_per_iteration": 2.543905735015869 + }, + { + "auxiliary_loss_clip": 0.01157471, + "auxiliary_loss_mlp": 0.01026552, + "balance_loss_clip": 1.04689109, + "balance_loss_mlp": 1.01907468, + "epoch": 0.9324836169061504, + "flos": 15043482927360.0, + "grad_norm": 2.271434902830792, + "language_loss": 0.75875652, + "learning_rate": 4.75532791889961e-08, + "loss": 0.78059673, + "num_input_tokens_seen": 167671690, + "step": 7755, + "time_per_iteration": 2.4054393768310547 + }, + { + "auxiliary_loss_clip": 0.01151185, + "auxiliary_loss_mlp": 0.01024918, + "balance_loss_clip": 1.04366624, + "balance_loss_mlp": 1.0174768, + "epoch": 0.9326038597967895, + "flos": 18624890332800.0, + "grad_norm": 2.122412330964856, + "language_loss": 0.65473139, + "learning_rate": 4.738457190560252e-08, + "loss": 0.67649245, + "num_input_tokens_seen": 167690800, + "step": 7756, + "time_per_iteration": 2.4357893466949463 + }, + { + "auxiliary_loss_clip": 0.01112935, + "auxiliary_loss_mlp": 0.0102348, + "balance_loss_clip": 1.04439425, + "balance_loss_mlp": 1.01615167, + "epoch": 0.9327241026874286, + "flos": 18952646958720.0, + "grad_norm": 2.0218145390074582, + "language_loss": 0.78779572, + "learning_rate": 4.721616083241664e-08, + "loss": 0.80915987, + "num_input_tokens_seen": 167709055, + "step": 7757, + "time_per_iteration": 2.540067672729492 + }, + { + "auxiliary_loss_clip": 0.01149778, + "auxiliary_loss_mlp": 0.01024712, + "balance_loss_clip": 1.04528439, + "balance_loss_mlp": 1.01752925, + "epoch": 0.9328443455780677, + "flos": 29570282668800.0, + "grad_norm": 1.74613509472, + "language_loss": 0.77616692, + "learning_rate": 4.7048045994986684e-08, + "loss": 0.79791182, + "num_input_tokens_seen": 167729915, + "step": 7758, + "time_per_iteration": 2.5292460918426514 + }, + { + "auxiliary_loss_clip": 0.01160153, + "auxiliary_loss_mlp": 0.01023023, + "balance_loss_clip": 1.04783273, + "balance_loss_mlp": 1.01562655, + "epoch": 0.9329645884687068, + "flos": 30081722469120.0, + "grad_norm": 3.1257223032779367, + "language_loss": 0.90721643, + "learning_rate": 4.688022741881559e-08, + "loss": 0.92904824, + "num_input_tokens_seen": 167750440, + "step": 7759, + "time_per_iteration": 2.5612568855285645 + }, + { + "auxiliary_loss_clip": 0.0114984, + "auxiliary_loss_mlp": 0.01025817, + "balance_loss_clip": 1.04373288, + "balance_loss_mlp": 1.0192101, + "epoch": 0.9330848313593458, + "flos": 21867982513920.0, + "grad_norm": 1.5206656103908, + "language_loss": 0.7524339, + "learning_rate": 4.671270512936076e-08, + "loss": 0.77419049, + "num_input_tokens_seen": 167769600, + "step": 7760, + "time_per_iteration": 2.4687018394470215 + }, + { + "auxiliary_loss_clip": 0.0111905, + "auxiliary_loss_mlp": 0.0102227, + "balance_loss_clip": 1.04169965, + "balance_loss_mlp": 1.01503968, + "epoch": 0.933205074249985, + "flos": 22127221946880.0, + "grad_norm": 1.949754964393329, + "language_loss": 0.82719713, + "learning_rate": 4.6545479152035884e-08, + "loss": 0.8486104, + "num_input_tokens_seen": 167788770, + "step": 7761, + "time_per_iteration": 2.5331623554229736 + }, + { + "auxiliary_loss_clip": 0.01152181, + "auxiliary_loss_mlp": 0.01023284, + "balance_loss_clip": 1.04455757, + "balance_loss_mlp": 1.01615858, + "epoch": 0.9333253171406241, + "flos": 15341254675200.0, + "grad_norm": 2.0702568639357053, + "language_loss": 0.76255763, + "learning_rate": 4.637854951220821e-08, + "loss": 0.78431225, + "num_input_tokens_seen": 167805555, + "step": 7762, + "time_per_iteration": 2.431830644607544 + }, + { + "auxiliary_loss_clip": 0.0111884, + "auxiliary_loss_mlp": 0.01021534, + "balance_loss_clip": 1.04058409, + "balance_loss_mlp": 1.01445019, + "epoch": 0.9334455600312631, + "flos": 15706142985600.0, + "grad_norm": 1.9355184934939547, + "language_loss": 0.74704212, + "learning_rate": 4.621191623520171e-08, + "loss": 0.76844585, + "num_input_tokens_seen": 167823985, + "step": 7763, + "time_per_iteration": 2.5038564205169678 + }, + { + "auxiliary_loss_clip": 0.0111182, + "auxiliary_loss_mlp": 0.01025303, + "balance_loss_clip": 1.04270625, + "balance_loss_mlp": 1.01773965, + "epoch": 0.9335658029219023, + "flos": 22163563532160.0, + "grad_norm": 5.6004385786172595, + "language_loss": 0.84348029, + "learning_rate": 4.604557934629372e-08, + "loss": 0.86485153, + "num_input_tokens_seen": 167843060, + "step": 7764, + "time_per_iteration": 2.5822134017944336 + }, + { + "auxiliary_loss_clip": 0.01135083, + "auxiliary_loss_mlp": 0.01021735, + "balance_loss_clip": 1.04522264, + "balance_loss_mlp": 1.01516652, + "epoch": 0.9336860458125413, + "flos": 20266833859200.0, + "grad_norm": 1.8423442587934618, + "language_loss": 0.80346429, + "learning_rate": 4.587953887071805e-08, + "loss": 0.82503247, + "num_input_tokens_seen": 167862880, + "step": 7765, + "time_per_iteration": 2.4883809089660645 + }, + { + "auxiliary_loss_clip": 0.01135615, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.04189467, + "balance_loss_mlp": 1.02093935, + "epoch": 0.9338062887031804, + "flos": 20919689504640.0, + "grad_norm": 1.8998275813197074, + "language_loss": 0.85778427, + "learning_rate": 4.5713794833662554e-08, + "loss": 0.87942588, + "num_input_tokens_seen": 167882095, + "step": 7766, + "time_per_iteration": 2.4818527698516846 + }, + { + "auxiliary_loss_clip": 0.01168434, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.04765522, + "balance_loss_mlp": 1.01814437, + "epoch": 0.9339265315938196, + "flos": 23221635482880.0, + "grad_norm": 1.7276202630387312, + "language_loss": 0.63347232, + "learning_rate": 4.5548347260270236e-08, + "loss": 0.65541488, + "num_input_tokens_seen": 167901385, + "step": 7767, + "time_per_iteration": 2.4282264709472656 + }, + { + "auxiliary_loss_clip": 0.0112022, + "auxiliary_loss_mlp": 0.01022665, + "balance_loss_clip": 1.04228354, + "balance_loss_mlp": 1.01574826, + "epoch": 0.9340467744844586, + "flos": 22820261932800.0, + "grad_norm": 1.6516552051205475, + "language_loss": 0.69199467, + "learning_rate": 4.538319617564012e-08, + "loss": 0.71342349, + "num_input_tokens_seen": 167920405, + "step": 7768, + "time_per_iteration": 2.539844036102295 + }, + { + "auxiliary_loss_clip": 0.01137868, + "auxiliary_loss_mlp": 0.01021834, + "balance_loss_clip": 1.0421834, + "balance_loss_mlp": 1.01453841, + "epoch": 0.9341670173750977, + "flos": 23660428026240.0, + "grad_norm": 2.051451457619955, + "language_loss": 0.74606818, + "learning_rate": 4.521834160482485e-08, + "loss": 0.76766515, + "num_input_tokens_seen": 167939145, + "step": 7769, + "time_per_iteration": 2.5057058334350586 + }, + { + "auxiliary_loss_clip": 0.01155813, + "auxiliary_loss_mlp": 0.01028966, + "balance_loss_clip": 1.04549897, + "balance_loss_mlp": 1.02129793, + "epoch": 0.9342872602657368, + "flos": 24824256595200.0, + "grad_norm": 3.1090720985959037, + "language_loss": 0.81990516, + "learning_rate": 4.5053783572832846e-08, + "loss": 0.84175301, + "num_input_tokens_seen": 167959325, + "step": 7770, + "time_per_iteration": 2.4914896488189697 + }, + { + "auxiliary_loss_clip": 0.01154957, + "auxiliary_loss_mlp": 0.01026234, + "balance_loss_clip": 1.04713011, + "balance_loss_mlp": 1.01923907, + "epoch": 0.9344075031563759, + "flos": 25771831332480.0, + "grad_norm": 2.3062484525244726, + "language_loss": 0.7629475, + "learning_rate": 4.488952210462771e-08, + "loss": 0.7847594, + "num_input_tokens_seen": 167979530, + "step": 7771, + "time_per_iteration": 3.274014949798584 + }, + { + "auxiliary_loss_clip": 0.01165438, + "auxiliary_loss_mlp": 0.01022501, + "balance_loss_clip": 1.04652774, + "balance_loss_mlp": 1.01555109, + "epoch": 0.9345277460470149, + "flos": 25551303782400.0, + "grad_norm": 1.8724899762396883, + "language_loss": 0.8566125, + "learning_rate": 4.4725557225127495e-08, + "loss": 0.87849188, + "num_input_tokens_seen": 167997870, + "step": 7772, + "time_per_iteration": 2.4937944412231445 + }, + { + "auxiliary_loss_clip": 0.01154339, + "auxiliary_loss_mlp": 0.01026478, + "balance_loss_clip": 1.04761422, + "balance_loss_mlp": 1.01982355, + "epoch": 0.9346479889376541, + "flos": 34313112432000.0, + "grad_norm": 2.784939976730529, + "language_loss": 0.79494673, + "learning_rate": 4.456188895920565e-08, + "loss": 0.81675494, + "num_input_tokens_seen": 168019625, + "step": 7773, + "time_per_iteration": 3.3695473670959473 + }, + { + "auxiliary_loss_clip": 0.01169503, + "auxiliary_loss_mlp": 0.01023793, + "balance_loss_clip": 1.04843163, + "balance_loss_mlp": 1.01616621, + "epoch": 0.9347682318282932, + "flos": 19093739581440.0, + "grad_norm": 1.9031234829656736, + "language_loss": 0.85162151, + "learning_rate": 4.439851733169031e-08, + "loss": 0.87355447, + "num_input_tokens_seen": 168037415, + "step": 7774, + "time_per_iteration": 2.3998022079467773 + }, + { + "auxiliary_loss_clip": 0.0112668, + "auxiliary_loss_mlp": 0.01027453, + "balance_loss_clip": 1.04224074, + "balance_loss_mlp": 1.02061319, + "epoch": 0.9348884747189322, + "flos": 26249587153920.0, + "grad_norm": 2.757126907656063, + "language_loss": 0.69364548, + "learning_rate": 4.4235442367365204e-08, + "loss": 0.71518677, + "num_input_tokens_seen": 168057725, + "step": 7775, + "time_per_iteration": 2.561352014541626 + }, + { + "auxiliary_loss_clip": 0.01133949, + "auxiliary_loss_mlp": 0.01025587, + "balance_loss_clip": 1.04107821, + "balance_loss_mlp": 1.01802897, + "epoch": 0.9350087176095714, + "flos": 18333080242560.0, + "grad_norm": 2.5142500296259205, + "language_loss": 0.79407549, + "learning_rate": 4.4072664090968545e-08, + "loss": 0.81567085, + "num_input_tokens_seen": 168076110, + "step": 7776, + "time_per_iteration": 2.4600725173950195 + }, + { + "auxiliary_loss_clip": 0.01140618, + "auxiliary_loss_mlp": 0.01026301, + "balance_loss_clip": 1.04298997, + "balance_loss_mlp": 1.01914907, + "epoch": 0.9351289605002104, + "flos": 19318253541120.0, + "grad_norm": 1.804543980890633, + "language_loss": 0.84372717, + "learning_rate": 4.391018252719347e-08, + "loss": 0.86539644, + "num_input_tokens_seen": 168095905, + "step": 7777, + "time_per_iteration": 2.5006606578826904 + }, + { + "auxiliary_loss_clip": 0.01140321, + "auxiliary_loss_mlp": 0.0102592, + "balance_loss_clip": 1.04291701, + "balance_loss_mlp": 1.0185349, + "epoch": 0.9352492033908495, + "flos": 18799990156800.0, + "grad_norm": 1.6395451934347522, + "language_loss": 0.68730843, + "learning_rate": 4.374799770068849e-08, + "loss": 0.70897079, + "num_input_tokens_seen": 168112580, + "step": 7778, + "time_per_iteration": 3.2621312141418457 + }, + { + "auxiliary_loss_clip": 0.01150947, + "auxiliary_loss_mlp": 0.01023369, + "balance_loss_clip": 1.04638124, + "balance_loss_mlp": 1.01590633, + "epoch": 0.9353694462814887, + "flos": 29530134241920.0, + "grad_norm": 2.2088570093569113, + "language_loss": 0.74629378, + "learning_rate": 4.358610963605658e-08, + "loss": 0.76803696, + "num_input_tokens_seen": 168133030, + "step": 7779, + "time_per_iteration": 2.5178260803222656 + }, + { + "auxiliary_loss_clip": 0.01171655, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.04923868, + "balance_loss_mlp": 1.02942312, + "epoch": 0.9354896891721277, + "flos": 30665450390400.0, + "grad_norm": 1.9958206410828383, + "language_loss": 0.68209743, + "learning_rate": 4.342451835785677e-08, + "loss": 0.70418203, + "num_input_tokens_seen": 168153940, + "step": 7780, + "time_per_iteration": 2.5115773677825928 + }, + { + "auxiliary_loss_clip": 0.01136066, + "auxiliary_loss_mlp": 0.010234, + "balance_loss_clip": 1.04310894, + "balance_loss_mlp": 1.01648855, + "epoch": 0.9356099320627668, + "flos": 19463907191040.0, + "grad_norm": 1.5244695501284478, + "language_loss": 0.74869299, + "learning_rate": 4.3263223890601665e-08, + "loss": 0.77028763, + "num_input_tokens_seen": 168172650, + "step": 7781, + "time_per_iteration": 2.4714760780334473 + }, + { + "auxiliary_loss_clip": 0.01149469, + "auxiliary_loss_mlp": 0.00761937, + "balance_loss_clip": 1.04722393, + "balance_loss_mlp": 1.00066006, + "epoch": 0.9357301749534058, + "flos": 19098156954240.0, + "grad_norm": 1.8008277084266564, + "language_loss": 0.7942009, + "learning_rate": 4.31022262587597e-08, + "loss": 0.81331491, + "num_input_tokens_seen": 168191325, + "step": 7782, + "time_per_iteration": 2.4477033615112305 + }, + { + "auxiliary_loss_clip": 0.01153868, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.04721665, + "balance_loss_mlp": 1.02023101, + "epoch": 0.935850417844045, + "flos": 23550361776000.0, + "grad_norm": 1.9247084645853048, + "language_loss": 0.65898091, + "learning_rate": 4.2941525486754225e-08, + "loss": 0.68080556, + "num_input_tokens_seen": 168211645, + "step": 7783, + "time_per_iteration": 2.4782402515411377 + }, + { + "auxiliary_loss_clip": 0.01120385, + "auxiliary_loss_mlp": 0.01024399, + "balance_loss_clip": 1.04267859, + "balance_loss_mlp": 1.01783943, + "epoch": 0.935970660734684, + "flos": 18588333265920.0, + "grad_norm": 1.8585185891306255, + "language_loss": 0.79671198, + "learning_rate": 4.278112159896286e-08, + "loss": 0.81815976, + "num_input_tokens_seen": 168229485, + "step": 7784, + "time_per_iteration": 2.5393271446228027 + }, + { + "auxiliary_loss_clip": 0.01131392, + "auxiliary_loss_mlp": 0.01023026, + "balance_loss_clip": 1.03984475, + "balance_loss_mlp": 1.01644254, + "epoch": 0.9360909036253231, + "flos": 20631255292800.0, + "grad_norm": 2.0598980794418886, + "language_loss": 0.67468482, + "learning_rate": 4.2621014619719896e-08, + "loss": 0.69622898, + "num_input_tokens_seen": 168247250, + "step": 7785, + "time_per_iteration": 2.5019028186798096 + }, + { + "auxiliary_loss_clip": 0.0104205, + "auxiliary_loss_mlp": 0.01002144, + "balance_loss_clip": 1.00835729, + "balance_loss_mlp": 1.00106514, + "epoch": 0.9362111465159623, + "flos": 61791421052160.0, + "grad_norm": 0.7204162990322758, + "language_loss": 0.58674622, + "learning_rate": 4.246120457331215e-08, + "loss": 0.60718811, + "num_input_tokens_seen": 168309425, + "step": 7786, + "time_per_iteration": 3.113050937652588 + }, + { + "auxiliary_loss_clip": 0.01132681, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.04405093, + "balance_loss_mlp": 1.0193336, + "epoch": 0.9363313894066013, + "flos": 24170395368960.0, + "grad_norm": 2.3149594950405374, + "language_loss": 0.7216152, + "learning_rate": 4.2301691483983325e-08, + "loss": 0.74321389, + "num_input_tokens_seen": 168329545, + "step": 7787, + "time_per_iteration": 2.50471830368042 + }, + { + "auxiliary_loss_clip": 0.01155783, + "auxiliary_loss_mlp": 0.01021175, + "balance_loss_clip": 1.04489696, + "balance_loss_mlp": 1.01361132, + "epoch": 0.9364516322972404, + "flos": 20120354196480.0, + "grad_norm": 1.7627217359258793, + "language_loss": 0.75730914, + "learning_rate": 4.214247537593163e-08, + "loss": 0.77907872, + "num_input_tokens_seen": 168348795, + "step": 7788, + "time_per_iteration": 2.4466326236724854 + }, + { + "auxiliary_loss_clip": 0.01138548, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.04285026, + "balance_loss_mlp": 1.02399945, + "epoch": 0.9365718751878795, + "flos": 20703758895360.0, + "grad_norm": 2.1273945890790467, + "language_loss": 0.80595803, + "learning_rate": 4.1983556273309293e-08, + "loss": 0.82765377, + "num_input_tokens_seen": 168367545, + "step": 7789, + "time_per_iteration": 2.4870927333831787 + }, + { + "auxiliary_loss_clip": 0.01171061, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.04820704, + "balance_loss_mlp": 1.02202344, + "epoch": 0.9366921180785186, + "flos": 18655270260480.0, + "grad_norm": 2.607109776881646, + "language_loss": 0.69133621, + "learning_rate": 4.182493420022526e-08, + "loss": 0.71334517, + "num_input_tokens_seen": 168383215, + "step": 7790, + "time_per_iteration": 2.3710358142852783 + }, + { + "auxiliary_loss_clip": 0.01125809, + "auxiliary_loss_mlp": 0.01023802, + "balance_loss_clip": 1.04210448, + "balance_loss_mlp": 1.01702499, + "epoch": 0.9368123609691577, + "flos": 25774955815680.0, + "grad_norm": 2.010507199889832, + "language_loss": 0.78649557, + "learning_rate": 4.166660918074139e-08, + "loss": 0.80799168, + "num_input_tokens_seen": 168403120, + "step": 7791, + "time_per_iteration": 2.558145761489868 + }, + { + "auxiliary_loss_clip": 0.01120244, + "auxiliary_loss_mlp": 0.01024027, + "balance_loss_clip": 1.04063344, + "balance_loss_mlp": 1.01662099, + "epoch": 0.9369326038597968, + "flos": 25553386771200.0, + "grad_norm": 1.5597357198480821, + "language_loss": 0.73549384, + "learning_rate": 4.15085812388758e-08, + "loss": 0.75693655, + "num_input_tokens_seen": 168425340, + "step": 7792, + "time_per_iteration": 2.5560269355773926 + }, + { + "auxiliary_loss_clip": 0.01138286, + "auxiliary_loss_mlp": 0.01026076, + "balance_loss_clip": 1.0442555, + "balance_loss_mlp": 1.01866066, + "epoch": 0.9370528467504359, + "flos": 23220019370880.0, + "grad_norm": 2.9071545076157053, + "language_loss": 0.78572118, + "learning_rate": 4.135085039860153e-08, + "loss": 0.80736482, + "num_input_tokens_seen": 168444740, + "step": 7793, + "time_per_iteration": 2.517411947250366 + }, + { + "auxiliary_loss_clip": 0.01140417, + "auxiliary_loss_mlp": 0.01022623, + "balance_loss_clip": 1.047966, + "balance_loss_mlp": 1.01509476, + "epoch": 0.9371730896410749, + "flos": 24967468120320.0, + "grad_norm": 2.150998935095886, + "language_loss": 0.78364915, + "learning_rate": 4.1193416683845906e-08, + "loss": 0.80527955, + "num_input_tokens_seen": 168463670, + "step": 7794, + "time_per_iteration": 2.523966073989868 + }, + { + "auxiliary_loss_clip": 0.01129543, + "auxiliary_loss_mlp": 0.01025529, + "balance_loss_clip": 1.0455842, + "balance_loss_mlp": 1.01894867, + "epoch": 0.9372933325317141, + "flos": 15553091134080.0, + "grad_norm": 2.5779923543156236, + "language_loss": 0.833197, + "learning_rate": 4.103628011849136e-08, + "loss": 0.85474771, + "num_input_tokens_seen": 168479030, + "step": 7795, + "time_per_iteration": 2.5277864933013916 + }, + { + "auxiliary_loss_clip": 0.01140128, + "auxiliary_loss_mlp": 0.01025555, + "balance_loss_clip": 1.04504073, + "balance_loss_mlp": 1.01812541, + "epoch": 0.9374135754223532, + "flos": 21871861182720.0, + "grad_norm": 2.2506362288028883, + "language_loss": 0.75852734, + "learning_rate": 4.0879440726375506e-08, + "loss": 0.78018415, + "num_input_tokens_seen": 168496815, + "step": 7796, + "time_per_iteration": 2.5206353664398193 + }, + { + "auxiliary_loss_clip": 0.01136424, + "auxiliary_loss_mlp": 0.01022139, + "balance_loss_clip": 1.04107738, + "balance_loss_mlp": 1.01470637, + "epoch": 0.9375338183129922, + "flos": 22631048064000.0, + "grad_norm": 2.5629553838901304, + "language_loss": 0.56267172, + "learning_rate": 4.0722898531291074e-08, + "loss": 0.58425736, + "num_input_tokens_seen": 168514055, + "step": 7797, + "time_per_iteration": 2.5532777309417725 + }, + { + "auxiliary_loss_clip": 0.01145226, + "auxiliary_loss_mlp": 0.01023181, + "balance_loss_clip": 1.04407644, + "balance_loss_mlp": 1.01554, + "epoch": 0.9376540612036314, + "flos": 26104292640000.0, + "grad_norm": 1.7361415351381928, + "language_loss": 0.76755524, + "learning_rate": 4.0566653556985295e-08, + "loss": 0.78923929, + "num_input_tokens_seen": 168534600, + "step": 7798, + "time_per_iteration": 3.3162167072296143 + }, + { + "auxiliary_loss_clip": 0.01085676, + "auxiliary_loss_mlp": 0.01028844, + "balance_loss_clip": 1.03872085, + "balance_loss_mlp": 1.02051997, + "epoch": 0.9377743040942704, + "flos": 19717580016000.0, + "grad_norm": 2.5496667825055392, + "language_loss": 0.81701159, + "learning_rate": 4.0410705827159886e-08, + "loss": 0.8381567, + "num_input_tokens_seen": 168551895, + "step": 7799, + "time_per_iteration": 2.6989023685455322 + }, + { + "auxiliary_loss_clip": 0.01134122, + "auxiliary_loss_mlp": 0.01024715, + "balance_loss_clip": 1.04024577, + "balance_loss_mlp": 1.01705003, + "epoch": 0.9378945469849095, + "flos": 15267530010240.0, + "grad_norm": 1.8673395105223762, + "language_loss": 0.71160042, + "learning_rate": 4.0255055365472356e-08, + "loss": 0.73318875, + "num_input_tokens_seen": 168569990, + "step": 7800, + "time_per_iteration": 4.253475189208984 + }, + { + "auxiliary_loss_clip": 0.01097467, + "auxiliary_loss_mlp": 0.01026116, + "balance_loss_clip": 1.03711736, + "balance_loss_mlp": 1.01907921, + "epoch": 0.9380147898755486, + "flos": 20591394174720.0, + "grad_norm": 3.0014982637280774, + "language_loss": 0.74573958, + "learning_rate": 4.009970219553471e-08, + "loss": 0.7669754, + "num_input_tokens_seen": 168586940, + "step": 7801, + "time_per_iteration": 2.6053988933563232 + }, + { + "auxiliary_loss_clip": 0.01156961, + "auxiliary_loss_mlp": 0.01025098, + "balance_loss_clip": 1.04499888, + "balance_loss_mlp": 1.01696789, + "epoch": 0.9381350327661877, + "flos": 26281116316800.0, + "grad_norm": 2.872935354994669, + "language_loss": 0.76661658, + "learning_rate": 3.99446463409141e-08, + "loss": 0.78843719, + "num_input_tokens_seen": 168604795, + "step": 7802, + "time_per_iteration": 2.4914040565490723 + }, + { + "auxiliary_loss_clip": 0.01157228, + "auxiliary_loss_mlp": 0.01027268, + "balance_loss_clip": 1.0436244, + "balance_loss_mlp": 1.01954365, + "epoch": 0.9382552756568268, + "flos": 23586344225280.0, + "grad_norm": 2.2606709662971167, + "language_loss": 0.68607688, + "learning_rate": 3.978988782513215e-08, + "loss": 0.7079218, + "num_input_tokens_seen": 168622290, + "step": 7803, + "time_per_iteration": 2.473031997680664 + }, + { + "auxiliary_loss_clip": 0.01158101, + "auxiliary_loss_mlp": 0.01021117, + "balance_loss_clip": 1.0460062, + "balance_loss_mlp": 1.01394331, + "epoch": 0.9383755185474659, + "flos": 28438809275520.0, + "grad_norm": 1.6221331804895633, + "language_loss": 0.76198906, + "learning_rate": 3.963542667166586e-08, + "loss": 0.78378123, + "num_input_tokens_seen": 168642395, + "step": 7804, + "time_per_iteration": 3.2487833499908447 + }, + { + "auxiliary_loss_clip": 0.01130296, + "auxiliary_loss_mlp": 0.01025506, + "balance_loss_clip": 1.04848313, + "balance_loss_mlp": 1.01847243, + "epoch": 0.938495761438105, + "flos": 20449583280000.0, + "grad_norm": 2.3162879065217385, + "language_loss": 0.6806643, + "learning_rate": 3.9481262903946486e-08, + "loss": 0.70222235, + "num_input_tokens_seen": 168661840, + "step": 7805, + "time_per_iteration": 2.529344081878662 + }, + { + "auxiliary_loss_clip": 0.01025699, + "auxiliary_loss_mlp": 0.01001028, + "balance_loss_clip": 1.00768113, + "balance_loss_mlp": 1.0000807, + "epoch": 0.938616004328744, + "flos": 69302711658240.0, + "grad_norm": 0.7674971263429821, + "language_loss": 0.54476541, + "learning_rate": 3.932739654536066e-08, + "loss": 0.56503272, + "num_input_tokens_seen": 168724540, + "step": 7806, + "time_per_iteration": 3.1295969486236572 + }, + { + "auxiliary_loss_clip": 0.0115403, + "auxiliary_loss_mlp": 0.01023876, + "balance_loss_clip": 1.04732513, + "balance_loss_mlp": 1.0172565, + "epoch": 0.9387362472193832, + "flos": 18911636605440.0, + "grad_norm": 2.1996502046611988, + "language_loss": 0.74094957, + "learning_rate": 3.917382761925014e-08, + "loss": 0.76272863, + "num_input_tokens_seen": 168740375, + "step": 7807, + "time_per_iteration": 2.445462226867676 + }, + { + "auxiliary_loss_clip": 0.01150919, + "auxiliary_loss_mlp": 0.01028124, + "balance_loss_clip": 1.04594564, + "balance_loss_mlp": 1.02128696, + "epoch": 0.9388564901100223, + "flos": 26501967089280.0, + "grad_norm": 1.640341859522848, + "language_loss": 0.79248202, + "learning_rate": 3.9020556148910754e-08, + "loss": 0.81427246, + "num_input_tokens_seen": 168759730, + "step": 7808, + "time_per_iteration": 2.4942984580993652 + }, + { + "auxiliary_loss_clip": 0.01046109, + "auxiliary_loss_mlp": 0.01000887, + "balance_loss_clip": 1.00842142, + "balance_loss_mlp": 0.9999153, + "epoch": 0.9389767330006613, + "flos": 58941083157120.0, + "grad_norm": 0.7090820242197226, + "language_loss": 0.56650305, + "learning_rate": 3.8867582157593895e-08, + "loss": 0.58697307, + "num_input_tokens_seen": 168813935, + "step": 7809, + "time_per_iteration": 2.905857801437378 + }, + { + "auxiliary_loss_clip": 0.01154728, + "auxiliary_loss_mlp": 0.0102265, + "balance_loss_clip": 1.04848194, + "balance_loss_mlp": 1.01524091, + "epoch": 0.9390969758913005, + "flos": 31102554994560.0, + "grad_norm": 1.6903599231415016, + "language_loss": 0.76339483, + "learning_rate": 3.871490566850544e-08, + "loss": 0.78516853, + "num_input_tokens_seen": 168838145, + "step": 7810, + "time_per_iteration": 2.5311310291290283 + }, + { + "auxiliary_loss_clip": 0.01136987, + "auxiliary_loss_mlp": 0.01023756, + "balance_loss_clip": 1.04518938, + "balance_loss_mlp": 1.01650178, + "epoch": 0.9392172187819395, + "flos": 22419391173120.0, + "grad_norm": 1.540085546648502, + "language_loss": 0.70627373, + "learning_rate": 3.856252670480642e-08, + "loss": 0.72788119, + "num_input_tokens_seen": 168856805, + "step": 7811, + "time_per_iteration": 2.4954991340637207 + }, + { + "auxiliary_loss_clip": 0.01136239, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.04129934, + "balance_loss_mlp": 1.01841235, + "epoch": 0.9393374616725786, + "flos": 19719483436800.0, + "grad_norm": 2.97660100874608, + "language_loss": 0.81217742, + "learning_rate": 3.841044528961279e-08, + "loss": 0.83380133, + "num_input_tokens_seen": 168874600, + "step": 7812, + "time_per_iteration": 2.481539726257324 + }, + { + "auxiliary_loss_clip": 0.01167038, + "auxiliary_loss_mlp": 0.0102229, + "balance_loss_clip": 1.04492974, + "balance_loss_mlp": 1.01483965, + "epoch": 0.9394577045632178, + "flos": 24170215800960.0, + "grad_norm": 2.041387815896717, + "language_loss": 0.79025894, + "learning_rate": 3.825866144599477e-08, + "loss": 0.81215221, + "num_input_tokens_seen": 168893655, + "step": 7813, + "time_per_iteration": 2.4443559646606445 + }, + { + "auxiliary_loss_clip": 0.01139737, + "auxiliary_loss_mlp": 0.01022771, + "balance_loss_clip": 1.04323089, + "balance_loss_mlp": 1.01542819, + "epoch": 0.9395779474538568, + "flos": 19023929498880.0, + "grad_norm": 2.251033563369909, + "language_loss": 0.75454736, + "learning_rate": 3.8107175196978145e-08, + "loss": 0.7761724, + "num_input_tokens_seen": 168909960, + "step": 7814, + "time_per_iteration": 2.4697349071502686 + }, + { + "auxiliary_loss_clip": 0.01119759, + "auxiliary_loss_mlp": 0.01026472, + "balance_loss_clip": 1.0414784, + "balance_loss_mlp": 1.01971304, + "epoch": 0.9396981903444959, + "flos": 14319129260160.0, + "grad_norm": 2.1397716355828247, + "language_loss": 0.77109623, + "learning_rate": 3.7955986565542996e-08, + "loss": 0.79255855, + "num_input_tokens_seen": 168928040, + "step": 7815, + "time_per_iteration": 2.4963245391845703 + }, + { + "auxiliary_loss_clip": 0.0112461, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.04197645, + "balance_loss_mlp": 1.02155375, + "epoch": 0.9398184332351349, + "flos": 34787564202240.0, + "grad_norm": 1.8515574704007218, + "language_loss": 0.68223011, + "learning_rate": 3.780509557462497e-08, + "loss": 0.70376337, + "num_input_tokens_seen": 168948240, + "step": 7816, + "time_per_iteration": 2.63737154006958 + }, + { + "auxiliary_loss_clip": 0.0113693, + "auxiliary_loss_mlp": 0.01021352, + "balance_loss_clip": 1.04211688, + "balance_loss_mlp": 1.01360345, + "epoch": 0.9399386761257741, + "flos": 25372253462400.0, + "grad_norm": 1.7212503171289255, + "language_loss": 0.75551355, + "learning_rate": 3.765450224711375e-08, + "loss": 0.77709633, + "num_input_tokens_seen": 168968745, + "step": 7817, + "time_per_iteration": 2.5372376441955566 + }, + { + "auxiliary_loss_clip": 0.01133746, + "auxiliary_loss_mlp": 0.01024477, + "balance_loss_clip": 1.04473543, + "balance_loss_mlp": 1.01722324, + "epoch": 0.9400589190164131, + "flos": 27304965584640.0, + "grad_norm": 1.6721132179280866, + "language_loss": 0.79929125, + "learning_rate": 3.750420660585396e-08, + "loss": 0.82087344, + "num_input_tokens_seen": 168990685, + "step": 7818, + "time_per_iteration": 2.5367321968078613 + }, + { + "auxiliary_loss_clip": 0.01166804, + "auxiliary_loss_mlp": 0.01024361, + "balance_loss_clip": 1.04795051, + "balance_loss_mlp": 1.01731527, + "epoch": 0.9401791619070522, + "flos": 23399859790080.0, + "grad_norm": 1.5583949699781308, + "language_loss": 0.79763186, + "learning_rate": 3.735420867364603e-08, + "loss": 0.81954354, + "num_input_tokens_seen": 169011665, + "step": 7819, + "time_per_iteration": 2.483991861343384 + }, + { + "auxiliary_loss_clip": 0.01087201, + "auxiliary_loss_mlp": 0.01021338, + "balance_loss_clip": 1.03545308, + "balance_loss_mlp": 1.01428723, + "epoch": 0.9402994047976914, + "flos": 35881403120640.0, + "grad_norm": 1.7075724329385014, + "language_loss": 0.61450464, + "learning_rate": 3.7204508473244186e-08, + "loss": 0.63559002, + "num_input_tokens_seen": 169035290, + "step": 7820, + "time_per_iteration": 2.7397570610046387 + }, + { + "auxiliary_loss_clip": 0.01079353, + "auxiliary_loss_mlp": 0.0102102, + "balance_loss_clip": 1.03947186, + "balance_loss_mlp": 1.01451111, + "epoch": 0.9404196476883304, + "flos": 22236821320320.0, + "grad_norm": 1.6439200948044488, + "language_loss": 0.69096196, + "learning_rate": 3.7055106027357395e-08, + "loss": 0.71196568, + "num_input_tokens_seen": 169055155, + "step": 7821, + "time_per_iteration": 2.653526544570923 + }, + { + "auxiliary_loss_clip": 0.01149279, + "auxiliary_loss_mlp": 0.01024628, + "balance_loss_clip": 1.04553986, + "balance_loss_mlp": 1.01681948, + "epoch": 0.9405398905789695, + "flos": 18915802583040.0, + "grad_norm": 2.0712108255145516, + "language_loss": 0.71650368, + "learning_rate": 3.690600135865063e-08, + "loss": 0.73824275, + "num_input_tokens_seen": 169072080, + "step": 7822, + "time_per_iteration": 2.4421417713165283 + }, + { + "auxiliary_loss_clip": 0.01024716, + "auxiliary_loss_mlp": 0.01001015, + "balance_loss_clip": 1.00932145, + "balance_loss_mlp": 1.00001967, + "epoch": 0.9406601334696086, + "flos": 70274130048000.0, + "grad_norm": 0.8797976314830713, + "language_loss": 0.58090436, + "learning_rate": 3.675719448974246e-08, + "loss": 0.6011616, + "num_input_tokens_seen": 169137170, + "step": 7823, + "time_per_iteration": 3.1906909942626953 + }, + { + "auxiliary_loss_clip": 0.01106942, + "auxiliary_loss_mlp": 0.00761804, + "balance_loss_clip": 1.04033065, + "balance_loss_mlp": 1.00062764, + "epoch": 0.9407803763602477, + "flos": 22165071903360.0, + "grad_norm": 2.2259009334911837, + "language_loss": 0.60026896, + "learning_rate": 3.6608685443207054e-08, + "loss": 0.61895645, + "num_input_tokens_seen": 169156320, + "step": 7824, + "time_per_iteration": 3.326925039291382 + }, + { + "auxiliary_loss_clip": 0.01125949, + "auxiliary_loss_mlp": 0.01024081, + "balance_loss_clip": 1.04246306, + "balance_loss_mlp": 1.01728892, + "epoch": 0.9409006192508867, + "flos": 18879496911360.0, + "grad_norm": 2.441699868820006, + "language_loss": 0.66821343, + "learning_rate": 3.646047424157306e-08, + "loss": 0.68971372, + "num_input_tokens_seen": 169173295, + "step": 7825, + "time_per_iteration": 2.506852388381958 + }, + { + "auxiliary_loss_clip": 0.01141382, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.04585218, + "balance_loss_mlp": 1.02108288, + "epoch": 0.9410208621415259, + "flos": 23368258800000.0, + "grad_norm": 4.2974355682344685, + "language_loss": 0.68853915, + "learning_rate": 3.631256090732382e-08, + "loss": 0.7102412, + "num_input_tokens_seen": 169193755, + "step": 7826, + "time_per_iteration": 3.2572150230407715 + }, + { + "auxiliary_loss_clip": 0.01125927, + "auxiliary_loss_mlp": 0.01025824, + "balance_loss_clip": 1.04471397, + "balance_loss_mlp": 1.01907063, + "epoch": 0.941141105032165, + "flos": 22742227635840.0, + "grad_norm": 1.773014252468627, + "language_loss": 0.82478762, + "learning_rate": 3.6164945462897833e-08, + "loss": 0.84630513, + "num_input_tokens_seen": 169213045, + "step": 7827, + "time_per_iteration": 3.4466664791107178 + }, + { + "auxiliary_loss_clip": 0.01153084, + "auxiliary_loss_mlp": 0.00761593, + "balance_loss_clip": 1.04708493, + "balance_loss_mlp": 1.0006454, + "epoch": 0.941261347922804, + "flos": 20704908130560.0, + "grad_norm": 1.8660756258078526, + "language_loss": 0.75540292, + "learning_rate": 3.6017627930687856e-08, + "loss": 0.77454972, + "num_input_tokens_seen": 169232870, + "step": 7828, + "time_per_iteration": 2.5382049083709717 + }, + { + "auxiliary_loss_clip": 0.0110683, + "auxiliary_loss_mlp": 0.0102153, + "balance_loss_clip": 1.03818345, + "balance_loss_mlp": 1.01450908, + "epoch": 0.9413815908134432, + "flos": 19421998997760.0, + "grad_norm": 2.2492095902532205, + "language_loss": 0.77066344, + "learning_rate": 3.587060833304267e-08, + "loss": 0.79194707, + "num_input_tokens_seen": 169251060, + "step": 7829, + "time_per_iteration": 2.5624101161956787 + }, + { + "auxiliary_loss_clip": 0.01158093, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.04792202, + "balance_loss_mlp": 1.02062285, + "epoch": 0.9415018337040822, + "flos": 17493452853120.0, + "grad_norm": 2.2681525350946634, + "language_loss": 0.63911527, + "learning_rate": 3.5723886692264225e-08, + "loss": 0.66097975, + "num_input_tokens_seen": 169268600, + "step": 7830, + "time_per_iteration": 2.5145158767700195 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.04165375, + "balance_loss_mlp": 1.0214107, + "epoch": 0.9416220765947213, + "flos": 31831613343360.0, + "grad_norm": 2.939982367543481, + "language_loss": 0.61811924, + "learning_rate": 3.557746303061071e-08, + "loss": 0.6397537, + "num_input_tokens_seen": 169290355, + "step": 7831, + "time_per_iteration": 3.469024896621704 + }, + { + "auxiliary_loss_clip": 0.01137689, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_clip": 1.0434351, + "balance_loss_mlp": 1.01732945, + "epoch": 0.9417423194853605, + "flos": 23511973115520.0, + "grad_norm": 1.9232738425016322, + "language_loss": 0.72216856, + "learning_rate": 3.543133737029391e-08, + "loss": 0.74378884, + "num_input_tokens_seen": 169310865, + "step": 7832, + "time_per_iteration": 2.5206756591796875 + }, + { + "auxiliary_loss_clip": 0.01157117, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.04558527, + "balance_loss_mlp": 1.02032042, + "epoch": 0.9418625623759995, + "flos": 23915106432000.0, + "grad_norm": 4.473242461906273, + "language_loss": 0.6955083, + "learning_rate": 3.5285509733481214e-08, + "loss": 0.7173568, + "num_input_tokens_seen": 169330590, + "step": 7833, + "time_per_iteration": 2.5150740146636963 + }, + { + "auxiliary_loss_clip": 0.0114967, + "auxiliary_loss_mlp": 0.01027859, + "balance_loss_clip": 1.0442667, + "balance_loss_mlp": 1.01988435, + "epoch": 0.9419828052666386, + "flos": 18076965292800.0, + "grad_norm": 2.10342240689359, + "language_loss": 0.76452875, + "learning_rate": 3.513998014229469e-08, + "loss": 0.78630406, + "num_input_tokens_seen": 169349540, + "step": 7834, + "time_per_iteration": 2.5162785053253174 + }, + { + "auxiliary_loss_clip": 0.01139669, + "auxiliary_loss_mlp": 0.01026414, + "balance_loss_clip": 1.0455395, + "balance_loss_mlp": 1.01954448, + "epoch": 0.9421030481572777, + "flos": 17712328377600.0, + "grad_norm": 2.0011462802998787, + "language_loss": 0.86550587, + "learning_rate": 3.499474861881069e-08, + "loss": 0.88716674, + "num_input_tokens_seen": 169366765, + "step": 7835, + "time_per_iteration": 2.4638938903808594 + }, + { + "auxiliary_loss_clip": 0.01098489, + "auxiliary_loss_mlp": 0.01021188, + "balance_loss_clip": 1.0413965, + "balance_loss_mlp": 1.01417851, + "epoch": 0.9422232910479168, + "flos": 20194114775040.0, + "grad_norm": 1.9745656304812236, + "language_loss": 0.68131816, + "learning_rate": 3.4849815185061136e-08, + "loss": 0.70251495, + "num_input_tokens_seen": 169386655, + "step": 7836, + "time_per_iteration": 2.5990102291107178 + }, + { + "auxiliary_loss_clip": 0.01149804, + "auxiliary_loss_mlp": 0.01022227, + "balance_loss_clip": 1.04169643, + "balance_loss_mlp": 1.0157454, + "epoch": 0.9423435339385559, + "flos": 18442571875200.0, + "grad_norm": 1.902095159688308, + "language_loss": 0.7594974, + "learning_rate": 3.470517986303223e-08, + "loss": 0.78121769, + "num_input_tokens_seen": 169405640, + "step": 7837, + "time_per_iteration": 2.4487578868865967 + }, + { + "auxiliary_loss_clip": 0.01124142, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.04601932, + "balance_loss_mlp": 1.02543235, + "epoch": 0.942463776829195, + "flos": 20080636732800.0, + "grad_norm": 4.192024716974497, + "language_loss": 0.79550481, + "learning_rate": 3.4560842674664856e-08, + "loss": 0.81707335, + "num_input_tokens_seen": 169424155, + "step": 7838, + "time_per_iteration": 2.5332679748535156 + }, + { + "auxiliary_loss_clip": 0.01154818, + "auxiliary_loss_mlp": 0.01020507, + "balance_loss_clip": 1.04362142, + "balance_loss_mlp": 1.01296687, + "epoch": 0.9425840197198341, + "flos": 22636255536000.0, + "grad_norm": 1.8157585670748864, + "language_loss": 0.75101733, + "learning_rate": 3.441680364185506e-08, + "loss": 0.77277058, + "num_input_tokens_seen": 169444025, + "step": 7839, + "time_per_iteration": 2.477332830429077 + }, + { + "auxiliary_loss_clip": 0.01143163, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.04820418, + "balance_loss_mlp": 1.0194571, + "epoch": 0.9427042626104731, + "flos": 19937892084480.0, + "grad_norm": 6.968448372239925, + "language_loss": 0.7458061, + "learning_rate": 3.427306278645314e-08, + "loss": 0.76750433, + "num_input_tokens_seen": 169462480, + "step": 7840, + "time_per_iteration": 2.4831836223602295 + }, + { + "auxiliary_loss_clip": 0.01111539, + "auxiliary_loss_mlp": 0.01025223, + "balance_loss_clip": 1.04172218, + "balance_loss_mlp": 1.01802862, + "epoch": 0.9428245055011123, + "flos": 22856998567680.0, + "grad_norm": 1.9341249584881812, + "language_loss": 0.73245782, + "learning_rate": 3.4129620130264767e-08, + "loss": 0.75382543, + "num_input_tokens_seen": 169480840, + "step": 7841, + "time_per_iteration": 2.5484869480133057 + }, + { + "auxiliary_loss_clip": 0.01143923, + "auxiliary_loss_mlp": 0.00761766, + "balance_loss_clip": 1.04631972, + "balance_loss_mlp": 1.00065351, + "epoch": 0.9429447483917514, + "flos": 20951757371520.0, + "grad_norm": 2.688575199557219, + "language_loss": 0.77577627, + "learning_rate": 3.398647569505009e-08, + "loss": 0.79483312, + "num_input_tokens_seen": 169498265, + "step": 7842, + "time_per_iteration": 2.4871723651885986 + }, + { + "auxiliary_loss_clip": 0.01132646, + "auxiliary_loss_mlp": 0.01024541, + "balance_loss_clip": 1.04372263, + "balance_loss_mlp": 1.01688731, + "epoch": 0.9430649912823904, + "flos": 18843658116480.0, + "grad_norm": 2.5204046347188616, + "language_loss": 0.74876559, + "learning_rate": 3.384362950252373e-08, + "loss": 0.77033746, + "num_input_tokens_seen": 169515235, + "step": 7843, + "time_per_iteration": 2.5132296085357666 + }, + { + "auxiliary_loss_clip": 0.01135776, + "auxiliary_loss_mlp": 0.01022059, + "balance_loss_clip": 1.04152405, + "balance_loss_mlp": 1.01470375, + "epoch": 0.9431852341730296, + "flos": 32556038837760.0, + "grad_norm": 3.0255581221675922, + "language_loss": 0.5698024, + "learning_rate": 3.3701081574355473e-08, + "loss": 0.59138083, + "num_input_tokens_seen": 169537195, + "step": 7844, + "time_per_iteration": 2.571640729904175 + }, + { + "auxiliary_loss_clip": 0.01045585, + "auxiliary_loss_mlp": 0.010007, + "balance_loss_clip": 1.00848818, + "balance_loss_mlp": 0.99971068, + "epoch": 0.9433054770636686, + "flos": 66904490252160.0, + "grad_norm": 0.6372563802566831, + "language_loss": 0.51662171, + "learning_rate": 3.3558831932169796e-08, + "loss": 0.53708458, + "num_input_tokens_seen": 169605865, + "step": 7845, + "time_per_iteration": 3.1349146366119385 + }, + { + "auxiliary_loss_clip": 0.01150393, + "auxiliary_loss_mlp": 0.01023624, + "balance_loss_clip": 1.04430556, + "balance_loss_mlp": 1.01653421, + "epoch": 0.9434257199543077, + "flos": 26140346916480.0, + "grad_norm": 1.8849879800354046, + "language_loss": 0.88468242, + "learning_rate": 3.341688059754588e-08, + "loss": 0.90642256, + "num_input_tokens_seen": 169621520, + "step": 7846, + "time_per_iteration": 2.5236575603485107 + }, + { + "auxiliary_loss_clip": 0.0113105, + "auxiliary_loss_mlp": 0.007616, + "balance_loss_clip": 1.04099905, + "balance_loss_mlp": 1.00064921, + "epoch": 0.9435459628449467, + "flos": 25003486483200.0, + "grad_norm": 2.1937980703278375, + "language_loss": 0.7784574, + "learning_rate": 3.327522759201762e-08, + "loss": 0.7973839, + "num_input_tokens_seen": 169641390, + "step": 7847, + "time_per_iteration": 2.559006452560425 + }, + { + "auxiliary_loss_clip": 0.01124041, + "auxiliary_loss_mlp": 0.01026059, + "balance_loss_clip": 1.04313672, + "balance_loss_mlp": 1.01832795, + "epoch": 0.9436662057355859, + "flos": 22163240309760.0, + "grad_norm": 2.430544572255115, + "language_loss": 0.67025274, + "learning_rate": 3.313387293707359e-08, + "loss": 0.69175369, + "num_input_tokens_seen": 169660095, + "step": 7848, + "time_per_iteration": 2.5209567546844482 + }, + { + "auxiliary_loss_clip": 0.0112114, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.04308033, + "balance_loss_mlp": 1.01998496, + "epoch": 0.943786448626225, + "flos": 20118522602880.0, + "grad_norm": 1.8986736043403019, + "language_loss": 0.68618894, + "learning_rate": 3.29928166541571e-08, + "loss": 0.70767999, + "num_input_tokens_seen": 169679050, + "step": 7849, + "time_per_iteration": 2.505182981491089 + }, + { + "auxiliary_loss_clip": 0.01129301, + "auxiliary_loss_mlp": 0.01024118, + "balance_loss_clip": 1.04253173, + "balance_loss_mlp": 1.0165484, + "epoch": 0.943906691516864, + "flos": 22090808534400.0, + "grad_norm": 2.0696658593637114, + "language_loss": 0.80739301, + "learning_rate": 3.2852058764666346e-08, + "loss": 0.82892722, + "num_input_tokens_seen": 169698150, + "step": 7850, + "time_per_iteration": 2.491576671600342 + }, + { + "auxiliary_loss_clip": 0.01113331, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.04445875, + "balance_loss_mlp": 1.01961684, + "epoch": 0.9440269344075032, + "flos": 35298501212160.0, + "grad_norm": 1.7731869424513413, + "language_loss": 0.68446577, + "learning_rate": 3.2711599289954264e-08, + "loss": 0.70586443, + "num_input_tokens_seen": 169722185, + "step": 7851, + "time_per_iteration": 3.3777778148651123 + }, + { + "auxiliary_loss_clip": 0.01097772, + "auxiliary_loss_mlp": 0.01027172, + "balance_loss_clip": 1.0398562, + "balance_loss_mlp": 1.02033877, + "epoch": 0.9441471772981422, + "flos": 19238136255360.0, + "grad_norm": 1.7662591229983857, + "language_loss": 0.77909303, + "learning_rate": 3.257143825132847e-08, + "loss": 0.80034244, + "num_input_tokens_seen": 169740355, + "step": 7852, + "time_per_iteration": 2.5904383659362793 + }, + { + "auxiliary_loss_clip": 0.01139631, + "auxiliary_loss_mlp": 0.01020983, + "balance_loss_clip": 1.04407406, + "balance_loss_mlp": 1.01386905, + "epoch": 0.9442674201887813, + "flos": 25739799379200.0, + "grad_norm": 1.790530148938152, + "language_loss": 0.76067042, + "learning_rate": 3.243157567005106e-08, + "loss": 0.78227651, + "num_input_tokens_seen": 169758535, + "step": 7853, + "time_per_iteration": 4.052370548248291 + }, + { + "auxiliary_loss_clip": 0.01172602, + "auxiliary_loss_mlp": 0.01024419, + "balance_loss_clip": 1.05027413, + "balance_loss_mlp": 1.01699829, + "epoch": 0.9443876630794205, + "flos": 15523321737600.0, + "grad_norm": 1.9120879341117991, + "language_loss": 0.63550484, + "learning_rate": 3.2292011567339296e-08, + "loss": 0.65747511, + "num_input_tokens_seen": 169776340, + "step": 7854, + "time_per_iteration": 2.4021694660186768 + }, + { + "auxiliary_loss_clip": 0.01153783, + "auxiliary_loss_mlp": 0.00761768, + "balance_loss_clip": 1.0443902, + "balance_loss_mlp": 1.00068355, + "epoch": 0.9445079059700595, + "flos": 13400821128960.0, + "grad_norm": 6.015107062322036, + "language_loss": 0.55420125, + "learning_rate": 3.21527459643649e-08, + "loss": 0.57335681, + "num_input_tokens_seen": 169793225, + "step": 7855, + "time_per_iteration": 2.4423179626464844 + }, + { + "auxiliary_loss_clip": 0.011562, + "auxiliary_loss_mlp": 0.0102652, + "balance_loss_clip": 1.04611528, + "balance_loss_mlp": 1.01895654, + "epoch": 0.9446281488606986, + "flos": 23659242877440.0, + "grad_norm": 2.033297673336326, + "language_loss": 0.74278504, + "learning_rate": 3.2013778882254536e-08, + "loss": 0.7646122, + "num_input_tokens_seen": 169812020, + "step": 7856, + "time_per_iteration": 2.4818532466888428 + }, + { + "auxiliary_loss_clip": 0.01145049, + "auxiliary_loss_mlp": 0.0102738, + "balance_loss_clip": 1.04407668, + "balance_loss_mlp": 1.02015901, + "epoch": 0.9447483917513377, + "flos": 25557337267200.0, + "grad_norm": 1.679289547103525, + "language_loss": 0.75965554, + "learning_rate": 3.1875110342088676e-08, + "loss": 0.78137976, + "num_input_tokens_seen": 169833470, + "step": 7857, + "time_per_iteration": 3.349987030029297 + }, + { + "auxiliary_loss_clip": 0.011334, + "auxiliary_loss_mlp": 0.01024248, + "balance_loss_clip": 1.04449069, + "balance_loss_mlp": 1.01719356, + "epoch": 0.9448686346419768, + "flos": 24535463247360.0, + "grad_norm": 1.5991947234523798, + "language_loss": 0.65257645, + "learning_rate": 3.1736740364904035e-08, + "loss": 0.67415291, + "num_input_tokens_seen": 169854000, + "step": 7858, + "time_per_iteration": 2.5295093059539795 + }, + { + "auxiliary_loss_clip": 0.01107514, + "auxiliary_loss_mlp": 0.00762358, + "balance_loss_clip": 1.04053831, + "balance_loss_mlp": 1.0005728, + "epoch": 0.9449888775326158, + "flos": 14721256995840.0, + "grad_norm": 2.254821413162969, + "language_loss": 0.77037656, + "learning_rate": 3.159866897169094e-08, + "loss": 0.78907526, + "num_input_tokens_seen": 169872200, + "step": 7859, + "time_per_iteration": 2.5321850776672363 + }, + { + "auxiliary_loss_clip": 0.01134218, + "auxiliary_loss_mlp": 0.01026001, + "balance_loss_clip": 1.04497373, + "balance_loss_mlp": 1.01861548, + "epoch": 0.945109120423255, + "flos": 15447873219840.0, + "grad_norm": 2.2156589353723666, + "language_loss": 0.7563237, + "learning_rate": 3.146089618339487e-08, + "loss": 0.77792597, + "num_input_tokens_seen": 169889055, + "step": 7860, + "time_per_iteration": 2.4813249111175537 + }, + { + "auxiliary_loss_clip": 0.01124923, + "auxiliary_loss_mlp": 0.0101954, + "balance_loss_clip": 1.04159427, + "balance_loss_mlp": 1.0122807, + "epoch": 0.9452293633138941, + "flos": 25448097029760.0, + "grad_norm": 1.8197782411903283, + "language_loss": 0.67940646, + "learning_rate": 3.132342202091554e-08, + "loss": 0.70085108, + "num_input_tokens_seen": 169909280, + "step": 7861, + "time_per_iteration": 2.5569498538970947 + }, + { + "auxiliary_loss_clip": 0.01167336, + "auxiliary_loss_mlp": 0.0102476, + "balance_loss_clip": 1.04606557, + "balance_loss_mlp": 1.01735163, + "epoch": 0.9453496062045331, + "flos": 21215342350080.0, + "grad_norm": 2.1022986801635217, + "language_loss": 0.68117166, + "learning_rate": 3.1186246505107595e-08, + "loss": 0.70309258, + "num_input_tokens_seen": 169928420, + "step": 7862, + "time_per_iteration": 2.4130027294158936 + }, + { + "auxiliary_loss_clip": 0.01155123, + "auxiliary_loss_mlp": 0.01024869, + "balance_loss_clip": 1.04822934, + "balance_loss_mlp": 1.01705766, + "epoch": 0.9454698490951723, + "flos": 20010898477440.0, + "grad_norm": 1.6927474119253154, + "language_loss": 0.8404386, + "learning_rate": 3.104936965678084e-08, + "loss": 0.86223853, + "num_input_tokens_seen": 169946750, + "step": 7863, + "time_per_iteration": 2.4573543071746826 + }, + { + "auxiliary_loss_clip": 0.01152727, + "auxiliary_loss_mlp": 0.01021869, + "balance_loss_clip": 1.04446459, + "balance_loss_mlp": 1.01410866, + "epoch": 0.9455900919858113, + "flos": 21069652786560.0, + "grad_norm": 2.2165953610526667, + "language_loss": 0.81716323, + "learning_rate": 3.091279149669956e-08, + "loss": 0.83890915, + "num_input_tokens_seen": 169965540, + "step": 7864, + "time_per_iteration": 2.459774971008301 + }, + { + "auxiliary_loss_clip": 0.01152763, + "auxiliary_loss_mlp": 0.00761818, + "balance_loss_clip": 1.04499292, + "balance_loss_mlp": 1.0006144, + "epoch": 0.9457103348764504, + "flos": 20740854666240.0, + "grad_norm": 2.9521290602523957, + "language_loss": 0.73923409, + "learning_rate": 3.0776512045581624e-08, + "loss": 0.75837994, + "num_input_tokens_seen": 169984330, + "step": 7865, + "time_per_iteration": 2.463352918624878 + }, + { + "auxiliary_loss_clip": 0.01132686, + "auxiliary_loss_mlp": 0.01026448, + "balance_loss_clip": 1.04401779, + "balance_loss_mlp": 1.01881254, + "epoch": 0.9458305777670896, + "flos": 21428363957760.0, + "grad_norm": 2.63681941587417, + "language_loss": 0.77686292, + "learning_rate": 3.0640531324101384e-08, + "loss": 0.79845428, + "num_input_tokens_seen": 170002095, + "step": 7866, + "time_per_iteration": 2.487238883972168 + }, + { + "auxiliary_loss_clip": 0.01157603, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_clip": 1.04967022, + "balance_loss_mlp": 1.01797175, + "epoch": 0.9459508206577286, + "flos": 20011185786240.0, + "grad_norm": 2.1544380003260803, + "language_loss": 0.76178002, + "learning_rate": 3.0504849352886554e-08, + "loss": 0.78361678, + "num_input_tokens_seen": 170020240, + "step": 7867, + "time_per_iteration": 2.4411051273345947 + }, + { + "auxiliary_loss_clip": 0.01151739, + "auxiliary_loss_mlp": 0.01023161, + "balance_loss_clip": 1.04558301, + "balance_loss_mlp": 1.01615715, + "epoch": 0.9460710635483677, + "flos": 12166428291840.0, + "grad_norm": 11.937432109775708, + "language_loss": 0.71737665, + "learning_rate": 3.036946615252023e-08, + "loss": 0.73912561, + "num_input_tokens_seen": 170035770, + "step": 7868, + "time_per_iteration": 2.4135582447052 + }, + { + "auxiliary_loss_clip": 0.01144296, + "auxiliary_loss_mlp": 0.01024442, + "balance_loss_clip": 1.04449797, + "balance_loss_mlp": 1.01703942, + "epoch": 0.9461913064390068, + "flos": 34276196229120.0, + "grad_norm": 2.7069440621728527, + "language_loss": 0.66710621, + "learning_rate": 3.0234381743539984e-08, + "loss": 0.6887936, + "num_input_tokens_seen": 170053385, + "step": 7869, + "time_per_iteration": 2.557217836380005 + }, + { + "auxiliary_loss_clip": 0.01144094, + "auxiliary_loss_mlp": 0.01023101, + "balance_loss_clip": 1.04334354, + "balance_loss_mlp": 1.01582599, + "epoch": 0.9463115493296459, + "flos": 19463763536640.0, + "grad_norm": 2.2500773255696274, + "language_loss": 0.800699, + "learning_rate": 3.0099596146437863e-08, + "loss": 0.82237095, + "num_input_tokens_seen": 170070490, + "step": 7870, + "time_per_iteration": 2.4789507389068604 + }, + { + "auxiliary_loss_clip": 0.01063303, + "auxiliary_loss_mlp": 0.01000452, + "balance_loss_clip": 1.00740623, + "balance_loss_mlp": 0.9994325, + "epoch": 0.946431792220285, + "flos": 70570824387840.0, + "grad_norm": 0.7769486032261099, + "language_loss": 0.60044223, + "learning_rate": 2.996510938166086e-08, + "loss": 0.6210798, + "num_input_tokens_seen": 170133465, + "step": 7871, + "time_per_iteration": 3.1053287982940674 + }, + { + "auxiliary_loss_clip": 0.0115186, + "auxiliary_loss_mlp": 0.01025384, + "balance_loss_clip": 1.04744053, + "balance_loss_mlp": 1.01836824, + "epoch": 0.9465520351109241, + "flos": 18947906363520.0, + "grad_norm": 1.9006696660010554, + "language_loss": 0.73456025, + "learning_rate": 2.983092146960997e-08, + "loss": 0.75633264, + "num_input_tokens_seen": 170150810, + "step": 7872, + "time_per_iteration": 2.433053493499756 + }, + { + "auxiliary_loss_clip": 0.0113857, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.0417546, + "balance_loss_mlp": 1.01983857, + "epoch": 0.9466722780015632, + "flos": 19135647774720.0, + "grad_norm": 2.478180244307679, + "language_loss": 0.79765606, + "learning_rate": 2.9697032430642256e-08, + "loss": 0.81932187, + "num_input_tokens_seen": 170169025, + "step": 7873, + "time_per_iteration": 2.4914791584014893 + }, + { + "auxiliary_loss_clip": 0.01164634, + "auxiliary_loss_mlp": 0.01020057, + "balance_loss_clip": 1.04641092, + "balance_loss_mlp": 1.01330709, + "epoch": 0.9467925208922022, + "flos": 17237912520960.0, + "grad_norm": 2.2798603933509805, + "language_loss": 0.73267829, + "learning_rate": 2.9563442285067906e-08, + "loss": 0.75452518, + "num_input_tokens_seen": 170186070, + "step": 7874, + "time_per_iteration": 2.389665126800537 + }, + { + "auxiliary_loss_clip": 0.01155915, + "auxiliary_loss_mlp": 0.01025368, + "balance_loss_clip": 1.04622304, + "balance_loss_mlp": 1.01777458, + "epoch": 0.9469127637828414, + "flos": 29169016859520.0, + "grad_norm": 1.9375735900126045, + "language_loss": 0.79453397, + "learning_rate": 2.943015105315294e-08, + "loss": 0.81634676, + "num_input_tokens_seen": 170206265, + "step": 7875, + "time_per_iteration": 2.5478789806365967 + }, + { + "auxiliary_loss_clip": 0.01111893, + "auxiliary_loss_mlp": 0.01024008, + "balance_loss_clip": 1.03972125, + "balance_loss_mlp": 1.01572347, + "epoch": 0.9470330066734804, + "flos": 26030460234240.0, + "grad_norm": 2.193657260572584, + "language_loss": 0.66511893, + "learning_rate": 2.929715875511718e-08, + "loss": 0.68647802, + "num_input_tokens_seen": 170225300, + "step": 7876, + "time_per_iteration": 2.577680826187134 + }, + { + "auxiliary_loss_clip": 0.01153193, + "auxiliary_loss_mlp": 0.01024992, + "balance_loss_clip": 1.0421381, + "balance_loss_mlp": 1.01750588, + "epoch": 0.9471532495641195, + "flos": 23440906056960.0, + "grad_norm": 1.7401615073579482, + "language_loss": 0.70155597, + "learning_rate": 2.9164465411135375e-08, + "loss": 0.72333777, + "num_input_tokens_seen": 170245070, + "step": 7877, + "time_per_iteration": 2.4757533073425293 + }, + { + "auxiliary_loss_clip": 0.01155783, + "auxiliary_loss_mlp": 0.01021885, + "balance_loss_clip": 1.04843187, + "balance_loss_mlp": 1.01460409, + "epoch": 0.9472734924547586, + "flos": 15815850099840.0, + "grad_norm": 1.8213127652002588, + "language_loss": 0.80818367, + "learning_rate": 2.9032071041337426e-08, + "loss": 0.82996029, + "num_input_tokens_seen": 170263305, + "step": 7878, + "time_per_iteration": 3.3132503032684326 + }, + { + "auxiliary_loss_clip": 0.01130491, + "auxiliary_loss_mlp": 0.01027338, + "balance_loss_clip": 1.04240155, + "balance_loss_mlp": 1.02048385, + "epoch": 0.9473937353453977, + "flos": 11181793697280.0, + "grad_norm": 1.9328647952301456, + "language_loss": 0.72793168, + "learning_rate": 2.889997566580704e-08, + "loss": 0.74950993, + "num_input_tokens_seen": 170281460, + "step": 7879, + "time_per_iteration": 3.250657796859741 + }, + { + "auxiliary_loss_clip": 0.01168128, + "auxiliary_loss_mlp": 0.01027791, + "balance_loss_clip": 1.0461477, + "balance_loss_mlp": 1.01992047, + "epoch": 0.9475139782360368, + "flos": 25775530433280.0, + "grad_norm": 1.8241016635782676, + "language_loss": 0.70424378, + "learning_rate": 2.8768179304583086e-08, + "loss": 0.72620302, + "num_input_tokens_seen": 170303515, + "step": 7880, + "time_per_iteration": 3.226572275161743 + }, + { + "auxiliary_loss_clip": 0.01126744, + "auxiliary_loss_mlp": 0.01027162, + "balance_loss_clip": 1.04599857, + "balance_loss_mlp": 1.01994407, + "epoch": 0.9476342211266758, + "flos": 22820046451200.0, + "grad_norm": 1.648772691557507, + "language_loss": 0.73451066, + "learning_rate": 2.8636681977659117e-08, + "loss": 0.75604975, + "num_input_tokens_seen": 170323165, + "step": 7881, + "time_per_iteration": 2.5197861194610596 + }, + { + "auxiliary_loss_clip": 0.01110347, + "auxiliary_loss_mlp": 0.01025151, + "balance_loss_clip": 1.04442477, + "balance_loss_mlp": 1.01774526, + "epoch": 0.947754464017315, + "flos": 20193611984640.0, + "grad_norm": 2.023591184025256, + "language_loss": 0.78151703, + "learning_rate": 2.850548370498318e-08, + "loss": 0.80287194, + "num_input_tokens_seen": 170341005, + "step": 7882, + "time_per_iteration": 2.5740411281585693 + }, + { + "auxiliary_loss_clip": 0.01150683, + "auxiliary_loss_mlp": 0.01022735, + "balance_loss_clip": 1.04276133, + "balance_loss_mlp": 1.01602697, + "epoch": 0.9478747069079541, + "flos": 24717925359360.0, + "grad_norm": 1.5293481482905742, + "language_loss": 0.71448457, + "learning_rate": 2.8374584506457798e-08, + "loss": 0.73621875, + "num_input_tokens_seen": 170362280, + "step": 7883, + "time_per_iteration": 2.5171284675598145 + }, + { + "auxiliary_loss_clip": 0.01137961, + "auxiliary_loss_mlp": 0.01020829, + "balance_loss_clip": 1.04538429, + "balance_loss_mlp": 1.01298797, + "epoch": 0.9479949497985931, + "flos": 21361355136000.0, + "grad_norm": 2.3184299247509457, + "language_loss": 0.67247462, + "learning_rate": 2.824398440193998e-08, + "loss": 0.69406247, + "num_input_tokens_seen": 170381080, + "step": 7884, + "time_per_iteration": 3.295213222503662 + }, + { + "auxiliary_loss_clip": 0.01105731, + "auxiliary_loss_mlp": 0.01022935, + "balance_loss_clip": 1.04068637, + "balance_loss_mlp": 1.01532376, + "epoch": 0.9481151926892323, + "flos": 18148606968960.0, + "grad_norm": 2.744098498309799, + "language_loss": 0.715464, + "learning_rate": 2.811368341124232e-08, + "loss": 0.7367506, + "num_input_tokens_seen": 170400150, + "step": 7885, + "time_per_iteration": 2.5287275314331055 + }, + { + "auxiliary_loss_clip": 0.01137066, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.04406357, + "balance_loss_mlp": 1.02121544, + "epoch": 0.9482354355798713, + "flos": 22128012046080.0, + "grad_norm": 2.413623957523302, + "language_loss": 0.68119484, + "learning_rate": 2.7983681554131222e-08, + "loss": 0.70284891, + "num_input_tokens_seen": 170420410, + "step": 7886, + "time_per_iteration": 2.5147194862365723 + }, + { + "auxiliary_loss_clip": 0.01137222, + "auxiliary_loss_mlp": 0.01022656, + "balance_loss_clip": 1.04301214, + "balance_loss_mlp": 1.01498437, + "epoch": 0.9483556784705104, + "flos": 19063072344960.0, + "grad_norm": 2.4235120722437804, + "language_loss": 0.70433962, + "learning_rate": 2.7853978850327365e-08, + "loss": 0.72593838, + "num_input_tokens_seen": 170439580, + "step": 7887, + "time_per_iteration": 2.4699199199676514 + }, + { + "auxiliary_loss_clip": 0.01126491, + "auxiliary_loss_mlp": 0.01026534, + "balance_loss_clip": 1.04761386, + "balance_loss_mlp": 1.01913142, + "epoch": 0.9484759213611496, + "flos": 25777110631680.0, + "grad_norm": 1.7700296345179978, + "language_loss": 0.87076008, + "learning_rate": 2.7724575319507225e-08, + "loss": 0.89229035, + "num_input_tokens_seen": 170459290, + "step": 7888, + "time_per_iteration": 2.557633399963379 + }, + { + "auxiliary_loss_clip": 0.01151108, + "auxiliary_loss_mlp": 0.01023015, + "balance_loss_clip": 1.04287362, + "balance_loss_mlp": 1.01605356, + "epoch": 0.9485961642517886, + "flos": 20667740532480.0, + "grad_norm": 1.8023812195808455, + "language_loss": 0.77391684, + "learning_rate": 2.759547098130044e-08, + "loss": 0.79565799, + "num_input_tokens_seen": 170478020, + "step": 7889, + "time_per_iteration": 2.4671363830566406 + }, + { + "auxiliary_loss_clip": 0.01164103, + "auxiliary_loss_mlp": 0.0102407, + "balance_loss_clip": 1.04621804, + "balance_loss_mlp": 1.01695919, + "epoch": 0.9487164071424277, + "flos": 22674069578880.0, + "grad_norm": 2.2283592794782834, + "language_loss": 0.76850259, + "learning_rate": 2.746666585529267e-08, + "loss": 0.79038429, + "num_input_tokens_seen": 170498295, + "step": 7890, + "time_per_iteration": 2.458873987197876 + }, + { + "auxiliary_loss_clip": 0.01144409, + "auxiliary_loss_mlp": 0.01024927, + "balance_loss_clip": 1.04308474, + "balance_loss_mlp": 1.01764965, + "epoch": 0.9488366500330668, + "flos": 38726461716480.0, + "grad_norm": 2.1363871330631308, + "language_loss": 0.74660015, + "learning_rate": 2.73381599610234e-08, + "loss": 0.76829356, + "num_input_tokens_seen": 170518695, + "step": 7891, + "time_per_iteration": 2.589759588241577 + }, + { + "auxiliary_loss_clip": 0.01145727, + "auxiliary_loss_mlp": 0.01025368, + "balance_loss_clip": 1.04074073, + "balance_loss_mlp": 1.01773572, + "epoch": 0.9489568929237059, + "flos": 27890920149120.0, + "grad_norm": 1.8167232839210818, + "language_loss": 0.71172696, + "learning_rate": 2.7209953317987033e-08, + "loss": 0.73343796, + "num_input_tokens_seen": 170539735, + "step": 7892, + "time_per_iteration": 2.523150682449341 + }, + { + "auxiliary_loss_clip": 0.01154732, + "auxiliary_loss_mlp": 0.01022035, + "balance_loss_clip": 1.0465939, + "balance_loss_mlp": 1.01472795, + "epoch": 0.9490771358143449, + "flos": 33580642291200.0, + "grad_norm": 2.2591897656683293, + "language_loss": 0.77909696, + "learning_rate": 2.7082045945631793e-08, + "loss": 0.80086458, + "num_input_tokens_seen": 170561950, + "step": 7893, + "time_per_iteration": 2.5628318786621094 + }, + { + "auxiliary_loss_clip": 0.01116972, + "auxiliary_loss_mlp": 0.01024435, + "balance_loss_clip": 1.04174805, + "balance_loss_mlp": 1.01691306, + "epoch": 0.9491973787049841, + "flos": 14793796512000.0, + "grad_norm": 2.2608846411385644, + "language_loss": 0.69370341, + "learning_rate": 2.6954437863361712e-08, + "loss": 0.71511745, + "num_input_tokens_seen": 170579865, + "step": 7894, + "time_per_iteration": 2.5249414443969727 + }, + { + "auxiliary_loss_clip": 0.01098126, + "auxiliary_loss_mlp": 0.01022487, + "balance_loss_clip": 1.03948796, + "balance_loss_mlp": 1.0157342, + "epoch": 0.9493176215956232, + "flos": 25332535998720.0, + "grad_norm": 1.9617637761217566, + "language_loss": 0.70911467, + "learning_rate": 2.6827129090534862e-08, + "loss": 0.73032081, + "num_input_tokens_seen": 170600165, + "step": 7895, + "time_per_iteration": 2.6433522701263428 + }, + { + "auxiliary_loss_clip": 0.01135469, + "auxiliary_loss_mlp": 0.01023564, + "balance_loss_clip": 1.04367185, + "balance_loss_mlp": 1.01598883, + "epoch": 0.9494378644862622, + "flos": 21029971236480.0, + "grad_norm": 1.7722487151224426, + "language_loss": 0.77622533, + "learning_rate": 2.670011964646335e-08, + "loss": 0.79781568, + "num_input_tokens_seen": 170618845, + "step": 7896, + "time_per_iteration": 2.483421802520752 + }, + { + "auxiliary_loss_clip": 0.01086451, + "auxiliary_loss_mlp": 0.01024628, + "balance_loss_clip": 1.03379261, + "balance_loss_mlp": 1.01652741, + "epoch": 0.9495581073769014, + "flos": 15195134148480.0, + "grad_norm": 1.8608789085778754, + "language_loss": 0.6833809, + "learning_rate": 2.657340955041487e-08, + "loss": 0.70449167, + "num_input_tokens_seen": 170637620, + "step": 7897, + "time_per_iteration": 2.7571418285369873 + }, + { + "auxiliary_loss_clip": 0.01137975, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.04581702, + "balance_loss_mlp": 1.01876199, + "epoch": 0.9496783502675404, + "flos": 28616566705920.0, + "grad_norm": 2.026387094236624, + "language_loss": 0.71866488, + "learning_rate": 2.6446998821611167e-08, + "loss": 0.74030912, + "num_input_tokens_seen": 170657815, + "step": 7898, + "time_per_iteration": 2.7262370586395264 + }, + { + "auxiliary_loss_clip": 0.01112136, + "auxiliary_loss_mlp": 0.0102491, + "balance_loss_clip": 1.04098344, + "balance_loss_mlp": 1.01781428, + "epoch": 0.9497985931581795, + "flos": 14866874732160.0, + "grad_norm": 2.329107224732416, + "language_loss": 0.71905053, + "learning_rate": 2.6320887479228228e-08, + "loss": 0.74042094, + "num_input_tokens_seen": 170674415, + "step": 7899, + "time_per_iteration": 2.5336270332336426 + }, + { + "auxiliary_loss_clip": 0.01141211, + "auxiliary_loss_mlp": 0.01027941, + "balance_loss_clip": 1.04339337, + "balance_loss_mlp": 1.02073503, + "epoch": 0.9499188360488187, + "flos": 27193319136000.0, + "grad_norm": 2.254025569010067, + "language_loss": 0.72189438, + "learning_rate": 2.619507554239786e-08, + "loss": 0.74358594, + "num_input_tokens_seen": 170692975, + "step": 7900, + "time_per_iteration": 2.524916887283325 + }, + { + "auxiliary_loss_clip": 0.01137504, + "auxiliary_loss_mlp": 0.0102723, + "balance_loss_clip": 1.04312384, + "balance_loss_mlp": 1.0195291, + "epoch": 0.9500390789394577, + "flos": 24316479982080.0, + "grad_norm": 1.6941089576007635, + "language_loss": 0.69680512, + "learning_rate": 2.606956303020502e-08, + "loss": 0.71845245, + "num_input_tokens_seen": 170713780, + "step": 7901, + "time_per_iteration": 2.513561248779297 + }, + { + "auxiliary_loss_clip": 0.01155408, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.04794049, + "balance_loss_mlp": 1.0160861, + "epoch": 0.9501593218300968, + "flos": 14354752573440.0, + "grad_norm": 1.7272547049784672, + "language_loss": 0.8411088, + "learning_rate": 2.5944349961690036e-08, + "loss": 0.86290097, + "num_input_tokens_seen": 170730800, + "step": 7902, + "time_per_iteration": 2.510406494140625 + }, + { + "auxiliary_loss_clip": 0.01124241, + "auxiliary_loss_mlp": 0.0102325, + "balance_loss_clip": 1.04203343, + "balance_loss_mlp": 1.01576376, + "epoch": 0.9502795647207359, + "flos": 38728113742080.0, + "grad_norm": 1.7773480167761502, + "language_loss": 0.72993547, + "learning_rate": 2.581943635584749e-08, + "loss": 0.75141037, + "num_input_tokens_seen": 170753630, + "step": 7903, + "time_per_iteration": 2.687701463699341 + }, + { + "auxiliary_loss_clip": 0.01132038, + "auxiliary_loss_mlp": 0.01018864, + "balance_loss_clip": 1.04404688, + "balance_loss_mlp": 1.0122211, + "epoch": 0.950399807611375, + "flos": 40808023799040.0, + "grad_norm": 1.500091782503605, + "language_loss": 0.6530503, + "learning_rate": 2.569482223162689e-08, + "loss": 0.6745593, + "num_input_tokens_seen": 170777605, + "step": 7904, + "time_per_iteration": 2.662578821182251 + }, + { + "auxiliary_loss_clip": 0.01152437, + "auxiliary_loss_mlp": 0.01022262, + "balance_loss_clip": 1.04358518, + "balance_loss_mlp": 1.01462674, + "epoch": 0.950520050502014, + "flos": 23440403266560.0, + "grad_norm": 1.67459294843724, + "language_loss": 0.72344089, + "learning_rate": 2.5570507607932e-08, + "loss": 0.74518788, + "num_input_tokens_seen": 170797520, + "step": 7905, + "time_per_iteration": 3.2642040252685547 + }, + { + "auxiliary_loss_clip": 0.011572, + "auxiliary_loss_mlp": 0.01024636, + "balance_loss_clip": 1.04517388, + "balance_loss_mlp": 1.01687002, + "epoch": 0.9506402933926532, + "flos": 17783718658560.0, + "grad_norm": 3.1381619024372305, + "language_loss": 0.63879329, + "learning_rate": 2.54464925036213e-08, + "loss": 0.66061169, + "num_input_tokens_seen": 170814810, + "step": 7906, + "time_per_iteration": 4.010417699813843 + }, + { + "auxiliary_loss_clip": 0.01152466, + "auxiliary_loss_mlp": 0.0102301, + "balance_loss_clip": 1.04591393, + "balance_loss_mlp": 1.01515412, + "epoch": 0.9507605362832923, + "flos": 32561928668160.0, + "grad_norm": 1.9107488534011299, + "language_loss": 0.6091153, + "learning_rate": 2.532277693750773e-08, + "loss": 0.6308701, + "num_input_tokens_seen": 170835735, + "step": 7907, + "time_per_iteration": 2.5971455574035645 + }, + { + "auxiliary_loss_clip": 0.01107363, + "auxiliary_loss_mlp": 0.01026159, + "balance_loss_clip": 1.04339802, + "balance_loss_mlp": 1.01867819, + "epoch": 0.9508807791739313, + "flos": 19602054898560.0, + "grad_norm": 1.8910455603787553, + "language_loss": 0.75597787, + "learning_rate": 2.5199360928358948e-08, + "loss": 0.77731311, + "num_input_tokens_seen": 170852970, + "step": 7908, + "time_per_iteration": 2.540691375732422 + }, + { + "auxiliary_loss_clip": 0.01142494, + "auxiliary_loss_mlp": 0.00761807, + "balance_loss_clip": 1.04207861, + "balance_loss_mlp": 1.00068486, + "epoch": 0.9510010220645704, + "flos": 21471852349440.0, + "grad_norm": 1.7561116639946484, + "language_loss": 0.87050295, + "learning_rate": 2.507624449489665e-08, + "loss": 0.88954604, + "num_input_tokens_seen": 170871600, + "step": 7909, + "time_per_iteration": 2.469482898712158 + }, + { + "auxiliary_loss_clip": 0.01141611, + "auxiliary_loss_mlp": 0.01029525, + "balance_loss_clip": 1.04701686, + "balance_loss_mlp": 1.0215975, + "epoch": 0.9511212649552095, + "flos": 18879999701760.0, + "grad_norm": 1.794718907990066, + "language_loss": 0.64999592, + "learning_rate": 2.495342765579811e-08, + "loss": 0.67170727, + "num_input_tokens_seen": 170890260, + "step": 7910, + "time_per_iteration": 2.4899497032165527 + }, + { + "auxiliary_loss_clip": 0.01107601, + "auxiliary_loss_mlp": 0.010222, + "balance_loss_clip": 1.04264808, + "balance_loss_mlp": 1.01522875, + "epoch": 0.9512415078458486, + "flos": 20810521094400.0, + "grad_norm": 1.998990655672778, + "language_loss": 0.70954859, + "learning_rate": 2.4830910429693984e-08, + "loss": 0.73084664, + "num_input_tokens_seen": 170910220, + "step": 7911, + "time_per_iteration": 3.637373208999634 + }, + { + "auxiliary_loss_clip": 0.0116619, + "auxiliary_loss_mlp": 0.01028007, + "balance_loss_clip": 1.04571605, + "balance_loss_mlp": 1.02065194, + "epoch": 0.9513617507364877, + "flos": 18369565482240.0, + "grad_norm": 2.2050948563353336, + "language_loss": 0.79820001, + "learning_rate": 2.470869283517052e-08, + "loss": 0.82014197, + "num_input_tokens_seen": 170928255, + "step": 7912, + "time_per_iteration": 2.4419476985931396 + }, + { + "auxiliary_loss_clip": 0.01145946, + "auxiliary_loss_mlp": 0.0102661, + "balance_loss_clip": 1.04326212, + "balance_loss_mlp": 1.01947284, + "epoch": 0.9514819936271268, + "flos": 25010166412800.0, + "grad_norm": 1.670974635178606, + "language_loss": 0.77095056, + "learning_rate": 2.458677489076777e-08, + "loss": 0.79267609, + "num_input_tokens_seen": 170949265, + "step": 7913, + "time_per_iteration": 2.563901424407959 + }, + { + "auxiliary_loss_clip": 0.01143736, + "auxiliary_loss_mlp": 0.01025715, + "balance_loss_clip": 1.04365897, + "balance_loss_mlp": 1.01870847, + "epoch": 0.9516022365177659, + "flos": 18662129758080.0, + "grad_norm": 1.7013719527210418, + "language_loss": 0.8288995, + "learning_rate": 2.446515661498072e-08, + "loss": 0.85059404, + "num_input_tokens_seen": 170968595, + "step": 7914, + "time_per_iteration": 2.470118522644043 + }, + { + "auxiliary_loss_clip": 0.01093274, + "auxiliary_loss_mlp": 0.01027222, + "balance_loss_clip": 1.03930616, + "balance_loss_mlp": 1.02002203, + "epoch": 0.9517224794084049, + "flos": 25372109808000.0, + "grad_norm": 6.464058437136102, + "language_loss": 0.74365914, + "learning_rate": 2.434383802625861e-08, + "loss": 0.76486409, + "num_input_tokens_seen": 170987550, + "step": 7915, + "time_per_iteration": 2.672823667526245 + }, + { + "auxiliary_loss_clip": 0.01123246, + "auxiliary_loss_mlp": 0.01020643, + "balance_loss_clip": 1.03958988, + "balance_loss_mlp": 1.01360726, + "epoch": 0.9518427222990441, + "flos": 21470918595840.0, + "grad_norm": 1.9078463586291645, + "language_loss": 0.73910666, + "learning_rate": 2.4222819143005168e-08, + "loss": 0.76054561, + "num_input_tokens_seen": 171007145, + "step": 7916, + "time_per_iteration": 2.600278854370117 + }, + { + "auxiliary_loss_clip": 0.01165833, + "auxiliary_loss_mlp": 0.01023841, + "balance_loss_clip": 1.04798961, + "balance_loss_mlp": 1.01650333, + "epoch": 0.9519629651896832, + "flos": 21033634423680.0, + "grad_norm": 1.7767235580662304, + "language_loss": 0.80788648, + "learning_rate": 2.4102099983579706e-08, + "loss": 0.82978326, + "num_input_tokens_seen": 171026295, + "step": 7917, + "time_per_iteration": 2.462700366973877 + }, + { + "auxiliary_loss_clip": 0.01151938, + "auxiliary_loss_mlp": 0.01025731, + "balance_loss_clip": 1.0441134, + "balance_loss_mlp": 1.01772881, + "epoch": 0.9520832080803222, + "flos": 21689219502720.0, + "grad_norm": 1.7915659785026907, + "language_loss": 0.77021074, + "learning_rate": 2.3981680566294236e-08, + "loss": 0.79198742, + "num_input_tokens_seen": 171045895, + "step": 7918, + "time_per_iteration": 2.4611759185791016 + }, + { + "auxiliary_loss_clip": 0.01165163, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.04769027, + "balance_loss_mlp": 1.02004719, + "epoch": 0.9522034509709614, + "flos": 23145289125120.0, + "grad_norm": 1.9226171614126006, + "language_loss": 0.73020673, + "learning_rate": 2.3861560909416822e-08, + "loss": 0.75212681, + "num_input_tokens_seen": 171065445, + "step": 7919, + "time_per_iteration": 2.4732003211975098 + }, + { + "auxiliary_loss_clip": 0.01113452, + "auxiliary_loss_mlp": 0.01027069, + "balance_loss_clip": 1.04408836, + "balance_loss_mlp": 1.01989532, + "epoch": 0.9523236938616004, + "flos": 24679428958080.0, + "grad_norm": 1.7378303580034993, + "language_loss": 0.82392061, + "learning_rate": 2.3741741031169325e-08, + "loss": 0.84532583, + "num_input_tokens_seen": 171085015, + "step": 7920, + "time_per_iteration": 2.6072213649749756 + }, + { + "auxiliary_loss_clip": 0.01105359, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.03853416, + "balance_loss_mlp": 1.01954985, + "epoch": 0.9524439367522395, + "flos": 22672309812480.0, + "grad_norm": 2.1132135571031765, + "language_loss": 0.71090758, + "learning_rate": 2.3622220949728544e-08, + "loss": 0.73222399, + "num_input_tokens_seen": 171103900, + "step": 7921, + "time_per_iteration": 2.591789484024048 + }, + { + "auxiliary_loss_clip": 0.01146809, + "auxiliary_loss_mlp": 0.01023358, + "balance_loss_clip": 1.04488635, + "balance_loss_mlp": 1.01518679, + "epoch": 0.9525641796428787, + "flos": 34055525024640.0, + "grad_norm": 4.304813193760286, + "language_loss": 0.61537629, + "learning_rate": 2.3503000683225526e-08, + "loss": 0.63707805, + "num_input_tokens_seen": 171121615, + "step": 7922, + "time_per_iteration": 2.5769894123077393 + }, + { + "auxiliary_loss_clip": 0.01169168, + "auxiliary_loss_mlp": 0.01024156, + "balance_loss_clip": 1.04725671, + "balance_loss_mlp": 1.01639009, + "epoch": 0.9526844225335177, + "flos": 16727083251840.0, + "grad_norm": 2.0017987589898056, + "language_loss": 0.8430469, + "learning_rate": 2.3384080249745585e-08, + "loss": 0.8649801, + "num_input_tokens_seen": 171139505, + "step": 7923, + "time_per_iteration": 2.4349024295806885 + }, + { + "auxiliary_loss_clip": 0.0111445, + "auxiliary_loss_mlp": 0.01021325, + "balance_loss_clip": 1.04116631, + "balance_loss_mlp": 1.0145421, + "epoch": 0.9528046654241568, + "flos": 36939367330560.0, + "grad_norm": 2.197803934667128, + "language_loss": 0.83116734, + "learning_rate": 2.3265459667329178e-08, + "loss": 0.85252506, + "num_input_tokens_seen": 171158995, + "step": 7924, + "time_per_iteration": 2.6892409324645996 + }, + { + "auxiliary_loss_clip": 0.0114176, + "auxiliary_loss_mlp": 0.01022015, + "balance_loss_clip": 1.04482675, + "balance_loss_mlp": 1.01436722, + "epoch": 0.9529249083147959, + "flos": 18255010032000.0, + "grad_norm": 2.3355818932170656, + "language_loss": 0.86501676, + "learning_rate": 2.31471389539708e-08, + "loss": 0.8866545, + "num_input_tokens_seen": 171176120, + "step": 7925, + "time_per_iteration": 2.503582715988159 + }, + { + "auxiliary_loss_clip": 0.01155348, + "auxiliary_loss_mlp": 0.00761757, + "balance_loss_clip": 1.04712057, + "balance_loss_mlp": 1.000633, + "epoch": 0.953045151205435, + "flos": 28658438985600.0, + "grad_norm": 2.080176832669102, + "language_loss": 0.7281484, + "learning_rate": 2.3029118127619872e-08, + "loss": 0.74731946, + "num_input_tokens_seen": 171195835, + "step": 7926, + "time_per_iteration": 2.5262534618377686 + }, + { + "auxiliary_loss_clip": 0.01130786, + "auxiliary_loss_mlp": 0.01022108, + "balance_loss_clip": 1.04256713, + "balance_loss_mlp": 1.01462793, + "epoch": 0.953165394096074, + "flos": 21835232288640.0, + "grad_norm": 21.48595469645213, + "language_loss": 0.87281501, + "learning_rate": 2.2911397206179628e-08, + "loss": 0.89434391, + "num_input_tokens_seen": 171212585, + "step": 7927, + "time_per_iteration": 2.5355429649353027 + }, + { + "auxiliary_loss_clip": 0.01164969, + "auxiliary_loss_mlp": 0.01024882, + "balance_loss_clip": 1.04675174, + "balance_loss_mlp": 1.0179621, + "epoch": 0.9532856369867132, + "flos": 19975059682560.0, + "grad_norm": 2.415493121562455, + "language_loss": 0.6274755, + "learning_rate": 2.279397620750845e-08, + "loss": 0.64937401, + "num_input_tokens_seen": 171231630, + "step": 7928, + "time_per_iteration": 2.460563898086548 + }, + { + "auxiliary_loss_clip": 0.01135466, + "auxiliary_loss_mlp": 0.0102275, + "balance_loss_clip": 1.04232585, + "balance_loss_mlp": 1.01618767, + "epoch": 0.9534058798773523, + "flos": 15049588239360.0, + "grad_norm": 48.66252220076234, + "language_loss": 0.78408706, + "learning_rate": 2.2676855149419195e-08, + "loss": 0.80566925, + "num_input_tokens_seen": 171248800, + "step": 7929, + "time_per_iteration": 2.4837958812713623 + }, + { + "auxiliary_loss_clip": 0.01137015, + "auxiliary_loss_mlp": 0.01025241, + "balance_loss_clip": 1.04768658, + "balance_loss_mlp": 1.0177846, + "epoch": 0.9535261227679913, + "flos": 17602800831360.0, + "grad_norm": 2.30271493573535, + "language_loss": 0.75170535, + "learning_rate": 2.2560034049678988e-08, + "loss": 0.77332795, + "num_input_tokens_seen": 171263150, + "step": 7930, + "time_per_iteration": 2.529278039932251 + }, + { + "auxiliary_loss_clip": 0.01171574, + "auxiliary_loss_mlp": 0.01025701, + "balance_loss_clip": 1.0491128, + "balance_loss_mlp": 1.01834285, + "epoch": 0.9536463656586305, + "flos": 23142954741120.0, + "grad_norm": 1.8909857648677215, + "language_loss": 0.7541455, + "learning_rate": 2.2443512926008988e-08, + "loss": 0.77611828, + "num_input_tokens_seen": 171282480, + "step": 7931, + "time_per_iteration": 2.483874559402466 + }, + { + "auxiliary_loss_clip": 0.0112505, + "auxiliary_loss_mlp": 0.01024318, + "balance_loss_clip": 1.04144573, + "balance_loss_mlp": 1.01717734, + "epoch": 0.9537666085492695, + "flos": 18625033987200.0, + "grad_norm": 2.1538562170872004, + "language_loss": 0.69707793, + "learning_rate": 2.2327291796085946e-08, + "loss": 0.7185716, + "num_input_tokens_seen": 171300840, + "step": 7932, + "time_per_iteration": 3.369271993637085 + }, + { + "auxiliary_loss_clip": 0.01167552, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.04605281, + "balance_loss_mlp": 1.01920354, + "epoch": 0.9538868514399086, + "flos": 18989347680000.0, + "grad_norm": 2.563967281951092, + "language_loss": 0.77286577, + "learning_rate": 2.2211370677540197e-08, + "loss": 0.79480875, + "num_input_tokens_seen": 171317365, + "step": 7933, + "time_per_iteration": 3.30440092086792 + }, + { + "auxiliary_loss_clip": 0.01168394, + "auxiliary_loss_mlp": 0.01025312, + "balance_loss_clip": 1.04768491, + "balance_loss_mlp": 1.01806402, + "epoch": 0.9540070943305478, + "flos": 16800556521600.0, + "grad_norm": 2.8616267230236025, + "language_loss": 0.78240693, + "learning_rate": 2.2095749587957012e-08, + "loss": 0.80434394, + "num_input_tokens_seen": 171335270, + "step": 7934, + "time_per_iteration": 2.4244015216827393 + }, + { + "auxiliary_loss_clip": 0.01133817, + "auxiliary_loss_mlp": 0.01024206, + "balance_loss_clip": 1.04022515, + "balance_loss_mlp": 1.01663601, + "epoch": 0.9541273372211868, + "flos": 20156911263360.0, + "grad_norm": 1.86165842536052, + "language_loss": 0.69329822, + "learning_rate": 2.1980428544876138e-08, + "loss": 0.71487844, + "num_input_tokens_seen": 171353910, + "step": 7935, + "time_per_iteration": 2.5143494606018066 + }, + { + "auxiliary_loss_clip": 0.01103039, + "auxiliary_loss_mlp": 0.01022793, + "balance_loss_clip": 1.03647888, + "balance_loss_mlp": 1.01522911, + "epoch": 0.9542475801118259, + "flos": 26725511381760.0, + "grad_norm": 1.8651741051571638, + "language_loss": 0.74112409, + "learning_rate": 2.1865407565791584e-08, + "loss": 0.76238239, + "num_input_tokens_seen": 171375480, + "step": 7936, + "time_per_iteration": 2.6297202110290527 + }, + { + "auxiliary_loss_clip": 0.01139458, + "auxiliary_loss_mlp": 0.01022896, + "balance_loss_clip": 1.04287863, + "balance_loss_mlp": 1.01515365, + "epoch": 0.954367823002465, + "flos": 23330911633920.0, + "grad_norm": 2.1274034993248847, + "language_loss": 0.77304661, + "learning_rate": 2.175068666815183e-08, + "loss": 0.7946701, + "num_input_tokens_seen": 171396320, + "step": 7937, + "time_per_iteration": 2.5490288734436035 + }, + { + "auxiliary_loss_clip": 0.01126779, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.04229927, + "balance_loss_mlp": 1.02327347, + "epoch": 0.9544880658931041, + "flos": 14902713527040.0, + "grad_norm": 2.0383459201940104, + "language_loss": 0.79152167, + "learning_rate": 2.163626586935985e-08, + "loss": 0.81309736, + "num_input_tokens_seen": 171412860, + "step": 7938, + "time_per_iteration": 3.2546017169952393 + }, + { + "auxiliary_loss_clip": 0.01150423, + "auxiliary_loss_mlp": 0.01030428, + "balance_loss_clip": 1.04459572, + "balance_loss_mlp": 1.02266741, + "epoch": 0.9546083087837431, + "flos": 29095902725760.0, + "grad_norm": 2.4003110423664586, + "language_loss": 0.63033986, + "learning_rate": 2.1522145186773755e-08, + "loss": 0.65214837, + "num_input_tokens_seen": 171431780, + "step": 7939, + "time_per_iteration": 2.5469436645507812 + }, + { + "auxiliary_loss_clip": 0.01137957, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.044438, + "balance_loss_mlp": 1.01954627, + "epoch": 0.9547285516743822, + "flos": 21142335957120.0, + "grad_norm": 2.8209626501586156, + "language_loss": 0.85655785, + "learning_rate": 2.140832463770481e-08, + "loss": 0.8782053, + "num_input_tokens_seen": 171450975, + "step": 7940, + "time_per_iteration": 2.4924919605255127 + }, + { + "auxiliary_loss_clip": 0.0114324, + "auxiliary_loss_mlp": 0.01023354, + "balance_loss_clip": 1.04404736, + "balance_loss_mlp": 1.01622796, + "epoch": 0.9548487945650214, + "flos": 27490157130240.0, + "grad_norm": 4.195662600667238, + "language_loss": 0.76066333, + "learning_rate": 2.129480423941987e-08, + "loss": 0.78232932, + "num_input_tokens_seen": 171467645, + "step": 7941, + "time_per_iteration": 2.5655570030212402 + }, + { + "auxiliary_loss_clip": 0.01141406, + "auxiliary_loss_mlp": 0.01022543, + "balance_loss_clip": 1.04398799, + "balance_loss_mlp": 1.01565552, + "epoch": 0.9549690374556604, + "flos": 22273198819200.0, + "grad_norm": 2.3721284608731033, + "language_loss": 0.80274725, + "learning_rate": 2.1181584009140052e-08, + "loss": 0.82438672, + "num_input_tokens_seen": 171487185, + "step": 7942, + "time_per_iteration": 2.483475685119629 + }, + { + "auxiliary_loss_clip": 0.01133791, + "auxiliary_loss_mlp": 0.01024978, + "balance_loss_clip": 1.04435742, + "balance_loss_mlp": 1.01803398, + "epoch": 0.9550892803462995, + "flos": 17595294888960.0, + "grad_norm": 2.077565588647142, + "language_loss": 0.83976197, + "learning_rate": 2.10686639640405e-08, + "loss": 0.8613497, + "num_input_tokens_seen": 171501275, + "step": 7943, + "time_per_iteration": 2.4974820613861084 + }, + { + "auxiliary_loss_clip": 0.01156542, + "auxiliary_loss_mlp": 0.0102408, + "balance_loss_clip": 1.04590535, + "balance_loss_mlp": 1.01665604, + "epoch": 0.9552095232369386, + "flos": 24353144789760.0, + "grad_norm": 1.922431043148255, + "language_loss": 0.8114962, + "learning_rate": 2.0956044121251294e-08, + "loss": 0.83330238, + "num_input_tokens_seen": 171520060, + "step": 7944, + "time_per_iteration": 2.514356851577759 + }, + { + "auxiliary_loss_clip": 0.01124621, + "auxiliary_loss_mlp": 0.01025425, + "balance_loss_clip": 1.04468203, + "balance_loss_mlp": 1.01784909, + "epoch": 0.9553297661275777, + "flos": 22746860490240.0, + "grad_norm": 1.7580174189150535, + "language_loss": 0.80754423, + "learning_rate": 2.084372449785654e-08, + "loss": 0.8290447, + "num_input_tokens_seen": 171539895, + "step": 7945, + "time_per_iteration": 2.57291316986084 + }, + { + "auxiliary_loss_clip": 0.01134779, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.04179549, + "balance_loss_mlp": 1.02088809, + "epoch": 0.9554500090182168, + "flos": 15413866018560.0, + "grad_norm": 2.035685630151634, + "language_loss": 0.68805921, + "learning_rate": 2.0731705110895282e-08, + "loss": 0.70969176, + "num_input_tokens_seen": 171557385, + "step": 7946, + "time_per_iteration": 2.4673078060150146 + }, + { + "auxiliary_loss_clip": 0.01157096, + "auxiliary_loss_mlp": 0.01027933, + "balance_loss_clip": 1.04843593, + "balance_loss_mlp": 1.02007699, + "epoch": 0.9555702519088559, + "flos": 23513517400320.0, + "grad_norm": 2.1875575900766027, + "language_loss": 0.86712408, + "learning_rate": 2.0619985977360587e-08, + "loss": 0.88897443, + "num_input_tokens_seen": 171575705, + "step": 7947, + "time_per_iteration": 2.5045478343963623 + }, + { + "auxiliary_loss_clip": 0.01110769, + "auxiliary_loss_mlp": 0.01027215, + "balance_loss_clip": 1.0380429, + "balance_loss_mlp": 1.01998174, + "epoch": 0.955690494799495, + "flos": 22962072827520.0, + "grad_norm": 1.7482551263965806, + "language_loss": 0.76780587, + "learning_rate": 2.0508567114200237e-08, + "loss": 0.7891857, + "num_input_tokens_seen": 171595620, + "step": 7948, + "time_per_iteration": 2.5872154235839844 + }, + { + "auxiliary_loss_clip": 0.01141861, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_clip": 1.04430294, + "balance_loss_mlp": 1.01656687, + "epoch": 0.955810737690134, + "flos": 26031250333440.0, + "grad_norm": 3.584120957994768, + "language_loss": 0.79070807, + "learning_rate": 2.0397448538316485e-08, + "loss": 0.81236207, + "num_input_tokens_seen": 171616660, + "step": 7949, + "time_per_iteration": 2.5795981884002686 + }, + { + "auxiliary_loss_clip": 0.01119887, + "auxiliary_loss_mlp": 0.01022427, + "balance_loss_clip": 1.04210448, + "balance_loss_mlp": 1.01539958, + "epoch": 0.9559309805807732, + "flos": 20849951249280.0, + "grad_norm": 2.369258650859101, + "language_loss": 0.67247331, + "learning_rate": 2.028663026656563e-08, + "loss": 0.69389653, + "num_input_tokens_seen": 171635515, + "step": 7950, + "time_per_iteration": 2.5277466773986816 + }, + { + "auxiliary_loss_clip": 0.01166227, + "auxiliary_loss_mlp": 0.00762575, + "balance_loss_clip": 1.04668283, + "balance_loss_mlp": 1.00067866, + "epoch": 0.9560512234714122, + "flos": 21578219498880.0, + "grad_norm": 2.519315070829844, + "language_loss": 0.71910357, + "learning_rate": 2.0176112315758885e-08, + "loss": 0.73839164, + "num_input_tokens_seen": 171653305, + "step": 7951, + "time_per_iteration": 2.4397518634796143 + }, + { + "auxiliary_loss_clip": 0.01117823, + "auxiliary_loss_mlp": 0.0102918, + "balance_loss_clip": 1.04224515, + "balance_loss_mlp": 1.02144289, + "epoch": 0.9561714663620513, + "flos": 17450144029440.0, + "grad_norm": 2.7177359945969464, + "language_loss": 0.69533056, + "learning_rate": 2.0065894702661957e-08, + "loss": 0.71680057, + "num_input_tokens_seen": 171669980, + "step": 7952, + "time_per_iteration": 2.5279130935668945 + }, + { + "auxiliary_loss_clip": 0.01116079, + "auxiliary_loss_mlp": 0.00761957, + "balance_loss_clip": 1.03969622, + "balance_loss_mlp": 1.00060487, + "epoch": 0.9562917092526905, + "flos": 26098510550400.0, + "grad_norm": 2.8231738390124326, + "language_loss": 0.77933931, + "learning_rate": 1.9955977443994577e-08, + "loss": 0.79811966, + "num_input_tokens_seen": 171689970, + "step": 7953, + "time_per_iteration": 2.6149849891662598 + }, + { + "auxiliary_loss_clip": 0.01139726, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.04488254, + "balance_loss_mlp": 1.02277255, + "epoch": 0.9564119521433295, + "flos": 24096742531200.0, + "grad_norm": 2.1687485209992845, + "language_loss": 0.62245053, + "learning_rate": 1.9846360556430965e-08, + "loss": 0.64415711, + "num_input_tokens_seen": 171708270, + "step": 7954, + "time_per_iteration": 2.5506606101989746 + }, + { + "auxiliary_loss_clip": 0.01164531, + "auxiliary_loss_mlp": 0.01024556, + "balance_loss_clip": 1.045295, + "balance_loss_mlp": 1.01752877, + "epoch": 0.9565321950339686, + "flos": 32008903896960.0, + "grad_norm": 2.816951182778877, + "language_loss": 0.61347544, + "learning_rate": 1.973704405660004e-08, + "loss": 0.63536626, + "num_input_tokens_seen": 171729385, + "step": 7955, + "time_per_iteration": 2.5553534030914307 + }, + { + "auxiliary_loss_clip": 0.01094824, + "auxiliary_loss_mlp": 0.01023828, + "balance_loss_clip": 1.04002631, + "balance_loss_mlp": 1.0170536, + "epoch": 0.9566524379246077, + "flos": 23588642695680.0, + "grad_norm": 1.757269158442338, + "language_loss": 0.78140914, + "learning_rate": 1.9628027961085203e-08, + "loss": 0.80259567, + "num_input_tokens_seen": 171752615, + "step": 7956, + "time_per_iteration": 2.6562161445617676 + }, + { + "auxiliary_loss_clip": 0.01111479, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.03752184, + "balance_loss_mlp": 1.01565719, + "epoch": 0.9567726808152468, + "flos": 38067716240640.0, + "grad_norm": 1.736405700314914, + "language_loss": 0.83813471, + "learning_rate": 1.9519312286423894e-08, + "loss": 0.85947633, + "num_input_tokens_seen": 171775810, + "step": 7957, + "time_per_iteration": 2.703594446182251 + }, + { + "auxiliary_loss_clip": 0.01151309, + "auxiliary_loss_mlp": 0.01022311, + "balance_loss_clip": 1.04699039, + "balance_loss_mlp": 1.01473475, + "epoch": 0.9568929237058859, + "flos": 22744059229440.0, + "grad_norm": 1.648190833565484, + "language_loss": 0.77792501, + "learning_rate": 1.9410897049108255e-08, + "loss": 0.79966116, + "num_input_tokens_seen": 171795090, + "step": 7958, + "time_per_iteration": 2.480990409851074 + }, + { + "auxiliary_loss_clip": 0.01175379, + "auxiliary_loss_mlp": 0.01024649, + "balance_loss_clip": 1.05173278, + "balance_loss_mlp": 1.01662588, + "epoch": 0.957013166596525, + "flos": 23841633162240.0, + "grad_norm": 2.1444042880767333, + "language_loss": 0.91281188, + "learning_rate": 1.9302782265584905e-08, + "loss": 0.93481219, + "num_input_tokens_seen": 171815755, + "step": 7959, + "time_per_iteration": 3.2261593341827393 + }, + { + "auxiliary_loss_clip": 0.01098355, + "auxiliary_loss_mlp": 0.01023523, + "balance_loss_clip": 1.04127049, + "balance_loss_mlp": 1.01615322, + "epoch": 0.9571334094871641, + "flos": 17639286071040.0, + "grad_norm": 2.0445912641052946, + "language_loss": 0.87240481, + "learning_rate": 1.9194967952254282e-08, + "loss": 0.89362359, + "num_input_tokens_seen": 171834330, + "step": 7960, + "time_per_iteration": 3.3562941551208496 + }, + { + "auxiliary_loss_clip": 0.0115325, + "auxiliary_loss_mlp": 0.01023375, + "balance_loss_clip": 1.04647398, + "balance_loss_mlp": 1.01555753, + "epoch": 0.9572536523778031, + "flos": 15369623441280.0, + "grad_norm": 2.279523889503837, + "language_loss": 0.80548728, + "learning_rate": 1.9087454125472635e-08, + "loss": 0.82725346, + "num_input_tokens_seen": 171848805, + "step": 7961, + "time_per_iteration": 2.412428379058838 + }, + { + "auxiliary_loss_clip": 0.01168584, + "auxiliary_loss_mlp": 0.01023756, + "balance_loss_clip": 1.04768932, + "balance_loss_mlp": 1.01622832, + "epoch": 0.9573738952684423, + "flos": 24969838417920.0, + "grad_norm": 2.3363445290031852, + "language_loss": 0.78659159, + "learning_rate": 1.8980240801548696e-08, + "loss": 0.80851495, + "num_input_tokens_seen": 171867995, + "step": 7962, + "time_per_iteration": 2.4563794136047363 + }, + { + "auxiliary_loss_clip": 0.01138193, + "auxiliary_loss_mlp": 0.01020179, + "balance_loss_clip": 1.04716754, + "balance_loss_mlp": 1.01312459, + "epoch": 0.9574941381590814, + "flos": 25769461034880.0, + "grad_norm": 1.6612742737852795, + "language_loss": 0.74075723, + "learning_rate": 1.8873327996747458e-08, + "loss": 0.76234102, + "num_input_tokens_seen": 171886495, + "step": 7963, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.01154723, + "auxiliary_loss_mlp": 0.01022947, + "balance_loss_clip": 1.04398835, + "balance_loss_mlp": 1.01608944, + "epoch": 0.9576143810497204, + "flos": 32307178435200.0, + "grad_norm": 2.080848366968484, + "language_loss": 0.66011906, + "learning_rate": 1.8766715727287053e-08, + "loss": 0.68189573, + "num_input_tokens_seen": 171908200, + "step": 7964, + "time_per_iteration": 3.2997586727142334 + }, + { + "auxiliary_loss_clip": 0.01156026, + "auxiliary_loss_mlp": 0.00762323, + "balance_loss_clip": 1.04509306, + "balance_loss_mlp": 1.00059032, + "epoch": 0.9577346239403596, + "flos": 27745733376000.0, + "grad_norm": 1.7335937878750487, + "language_loss": 0.79408598, + "learning_rate": 1.8660404009340546e-08, + "loss": 0.8132695, + "num_input_tokens_seen": 171928650, + "step": 7965, + "time_per_iteration": 2.4980146884918213 + }, + { + "auxiliary_loss_clip": 0.01055087, + "auxiliary_loss_mlp": 0.01001256, + "balance_loss_clip": 1.0079236, + "balance_loss_mlp": 1.00025451, + "epoch": 0.9578548668309986, + "flos": 57468313710720.0, + "grad_norm": 0.9077311413332132, + "language_loss": 0.59507883, + "learning_rate": 1.8554392859035485e-08, + "loss": 0.61564231, + "num_input_tokens_seen": 171986400, + "step": 7966, + "time_per_iteration": 3.038370132446289 + }, + { + "auxiliary_loss_clip": 0.0108629, + "auxiliary_loss_mlp": 0.0102285, + "balance_loss_clip": 1.03778553, + "balance_loss_mlp": 1.01562595, + "epoch": 0.9579751097216377, + "flos": 19756040503680.0, + "grad_norm": 1.6929392399228895, + "language_loss": 0.78715944, + "learning_rate": 1.8448682292453444e-08, + "loss": 0.80825078, + "num_input_tokens_seen": 172005475, + "step": 7967, + "time_per_iteration": 2.621426582336426 + }, + { + "auxiliary_loss_clip": 0.01166883, + "auxiliary_loss_mlp": 0.01028541, + "balance_loss_clip": 1.04763973, + "balance_loss_mlp": 1.02115297, + "epoch": 0.9580953526122769, + "flos": 18041270152320.0, + "grad_norm": 1.9059823978717527, + "language_loss": 0.66590071, + "learning_rate": 1.8343272325631154e-08, + "loss": 0.68785495, + "num_input_tokens_seen": 172024420, + "step": 7968, + "time_per_iteration": 2.412973165512085 + }, + { + "auxiliary_loss_clip": 0.01085699, + "auxiliary_loss_mlp": 0.00762432, + "balance_loss_clip": 1.03815877, + "balance_loss_mlp": 1.00061238, + "epoch": 0.9582155955029159, + "flos": 24270154416000.0, + "grad_norm": 2.170539441113069, + "language_loss": 0.77966064, + "learning_rate": 1.8238162974558492e-08, + "loss": 0.79814196, + "num_input_tokens_seen": 172038350, + "step": 7969, + "time_per_iteration": 2.6503403186798096 + }, + { + "auxiliary_loss_clip": 0.01135907, + "auxiliary_loss_mlp": 0.01026735, + "balance_loss_clip": 1.04443729, + "balance_loss_mlp": 1.01932597, + "epoch": 0.958335838393555, + "flos": 22783309816320.0, + "grad_norm": 1.834595370654246, + "language_loss": 0.74750292, + "learning_rate": 1.8133354255181144e-08, + "loss": 0.76912928, + "num_input_tokens_seen": 172058665, + "step": 7970, + "time_per_iteration": 2.5052437782287598 + }, + { + "auxiliary_loss_clip": 0.01146193, + "auxiliary_loss_mlp": 0.01026273, + "balance_loss_clip": 1.04276943, + "balance_loss_mlp": 1.01886988, + "epoch": 0.958456081284194, + "flos": 16911484698240.0, + "grad_norm": 2.838318254242547, + "language_loss": 0.74378955, + "learning_rate": 1.802884618339795e-08, + "loss": 0.76551414, + "num_input_tokens_seen": 172077470, + "step": 7971, + "time_per_iteration": 2.464690923690796 + }, + { + "auxiliary_loss_clip": 0.01155732, + "auxiliary_loss_mlp": 0.01020436, + "balance_loss_clip": 1.0488987, + "balance_loss_mlp": 1.01292562, + "epoch": 0.9585763241748332, + "flos": 19974951941760.0, + "grad_norm": 2.0377945951117127, + "language_loss": 0.80838656, + "learning_rate": 1.7924638775062894e-08, + "loss": 0.83014822, + "num_input_tokens_seen": 172096590, + "step": 7972, + "time_per_iteration": 2.453888177871704 + }, + { + "auxiliary_loss_clip": 0.01119734, + "auxiliary_loss_mlp": 0.0102551, + "balance_loss_clip": 1.04432225, + "balance_loss_mlp": 1.0182445, + "epoch": 0.9586965670654722, + "flos": 21395649646080.0, + "grad_norm": 2.0921427408481397, + "language_loss": 0.81547821, + "learning_rate": 1.7820732045984444e-08, + "loss": 0.83693063, + "num_input_tokens_seen": 172116735, + "step": 7973, + "time_per_iteration": 2.534499168395996 + }, + { + "auxiliary_loss_clip": 0.01151281, + "auxiliary_loss_mlp": 0.01025798, + "balance_loss_clip": 1.04487014, + "balance_loss_mlp": 1.01812077, + "epoch": 0.9588168099561113, + "flos": 21435115714560.0, + "grad_norm": 1.855269011364932, + "language_loss": 0.74120224, + "learning_rate": 1.7717126011924655e-08, + "loss": 0.76297301, + "num_input_tokens_seen": 172138320, + "step": 7974, + "time_per_iteration": 2.493234157562256 + }, + { + "auxiliary_loss_clip": 0.01102314, + "auxiliary_loss_mlp": 0.01025166, + "balance_loss_clip": 1.03576231, + "balance_loss_mlp": 1.01792669, + "epoch": 0.9589370528467505, + "flos": 11763761852160.0, + "grad_norm": 2.622941434220424, + "language_loss": 0.76398134, + "learning_rate": 1.7613820688600957e-08, + "loss": 0.78525615, + "num_input_tokens_seen": 172154225, + "step": 7975, + "time_per_iteration": 2.530590772628784 + }, + { + "auxiliary_loss_clip": 0.01145316, + "auxiliary_loss_mlp": 0.01024158, + "balance_loss_clip": 1.04275393, + "balance_loss_mlp": 1.01705921, + "epoch": 0.9590572957373895, + "flos": 23441516588160.0, + "grad_norm": 1.7723082350184474, + "language_loss": 0.78292656, + "learning_rate": 1.7510816091684588e-08, + "loss": 0.80462134, + "num_input_tokens_seen": 172174150, + "step": 7976, + "time_per_iteration": 2.5033633708953857 + }, + { + "auxiliary_loss_clip": 0.01141308, + "auxiliary_loss_mlp": 0.01027722, + "balance_loss_clip": 1.04516625, + "balance_loss_mlp": 1.02014577, + "epoch": 0.9591775386280286, + "flos": 22528272274560.0, + "grad_norm": 3.0412419835931153, + "language_loss": 0.78726292, + "learning_rate": 1.740811223680083e-08, + "loss": 0.80895317, + "num_input_tokens_seen": 172191005, + "step": 7977, + "time_per_iteration": 2.5281789302825928 + }, + { + "auxiliary_loss_clip": 0.01167573, + "auxiliary_loss_mlp": 0.01025114, + "balance_loss_clip": 1.04705143, + "balance_loss_mlp": 1.01731181, + "epoch": 0.9592977815186677, + "flos": 18186959715840.0, + "grad_norm": 3.6156212976462534, + "language_loss": 0.74440444, + "learning_rate": 1.7305709139530334e-08, + "loss": 0.76633132, + "num_input_tokens_seen": 172209785, + "step": 7978, + "time_per_iteration": 2.4147422313690186 + }, + { + "auxiliary_loss_clip": 0.01145769, + "auxiliary_loss_mlp": 0.01023315, + "balance_loss_clip": 1.04346013, + "balance_loss_mlp": 1.01588249, + "epoch": 0.9594180244093068, + "flos": 16537797555840.0, + "grad_norm": 2.369707388391738, + "language_loss": 0.74481088, + "learning_rate": 1.7203606815407334e-08, + "loss": 0.76650167, + "num_input_tokens_seen": 172224380, + "step": 7979, + "time_per_iteration": 2.4234097003936768 + }, + { + "auxiliary_loss_clip": 0.0114626, + "auxiliary_loss_mlp": 0.01024425, + "balance_loss_clip": 1.04847085, + "balance_loss_mlp": 1.01698041, + "epoch": 0.9595382672999458, + "flos": 20554334317440.0, + "grad_norm": 1.711202937973293, + "language_loss": 0.7929368, + "learning_rate": 1.7101805279920557e-08, + "loss": 0.81464368, + "num_input_tokens_seen": 172242540, + "step": 7980, + "time_per_iteration": 2.4875075817108154 + }, + { + "auxiliary_loss_clip": 0.01168192, + "auxiliary_loss_mlp": 0.01024164, + "balance_loss_clip": 1.0477308, + "balance_loss_mlp": 1.01628435, + "epoch": 0.959658510190585, + "flos": 22638266697600.0, + "grad_norm": 1.9632819105356938, + "language_loss": 0.8136816, + "learning_rate": 1.7000304548513643e-08, + "loss": 0.83560514, + "num_input_tokens_seen": 172262645, + "step": 7981, + "time_per_iteration": 2.433427572250366 + }, + { + "auxiliary_loss_clip": 0.01121474, + "auxiliary_loss_mlp": 0.01027912, + "balance_loss_clip": 1.04025424, + "balance_loss_mlp": 1.02056861, + "epoch": 0.9597787530812241, + "flos": 19135252725120.0, + "grad_norm": 2.476873379431039, + "language_loss": 0.8256653, + "learning_rate": 1.6899104636583394e-08, + "loss": 0.84715915, + "num_input_tokens_seen": 172280695, + "step": 7982, + "time_per_iteration": 2.552259922027588 + }, + { + "auxiliary_loss_clip": 0.01055098, + "auxiliary_loss_mlp": 0.01001248, + "balance_loss_clip": 1.00755823, + "balance_loss_mlp": 1.00022304, + "epoch": 0.9598989959718631, + "flos": 60098124055680.0, + "grad_norm": 0.7236116621717783, + "language_loss": 0.61968577, + "learning_rate": 1.6798205559482638e-08, + "loss": 0.64024925, + "num_input_tokens_seen": 172343075, + "step": 7983, + "time_per_iteration": 3.2202296257019043 + }, + { + "auxiliary_loss_clip": 0.01128072, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.04496121, + "balance_loss_mlp": 1.01963687, + "epoch": 0.9600192388625023, + "flos": 20886795624960.0, + "grad_norm": 1.8716225638036523, + "language_loss": 0.76529104, + "learning_rate": 1.669760733251713e-08, + "loss": 0.78684694, + "num_input_tokens_seen": 172361950, + "step": 7984, + "time_per_iteration": 2.5230154991149902 + }, + { + "auxiliary_loss_clip": 0.01106367, + "auxiliary_loss_mlp": 0.0102309, + "balance_loss_clip": 1.04071701, + "balance_loss_mlp": 1.01627135, + "epoch": 0.9601394817531413, + "flos": 20445740524800.0, + "grad_norm": 1.6331319679826761, + "language_loss": 0.82302696, + "learning_rate": 1.659730997094755e-08, + "loss": 0.84432149, + "num_input_tokens_seen": 172380440, + "step": 7985, + "time_per_iteration": 2.581662654876709 + }, + { + "auxiliary_loss_clip": 0.01146623, + "auxiliary_loss_mlp": 0.01025279, + "balance_loss_clip": 1.04348469, + "balance_loss_mlp": 1.0182575, + "epoch": 0.9602597246437804, + "flos": 21507152440320.0, + "grad_norm": 1.8042856578891642, + "language_loss": 0.62016195, + "learning_rate": 1.6497313489989283e-08, + "loss": 0.64188099, + "num_input_tokens_seen": 172400265, + "step": 7986, + "time_per_iteration": 3.2817022800445557 + }, + { + "auxiliary_loss_clip": 0.01109267, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.03494859, + "balance_loss_mlp": 1.01963437, + "epoch": 0.9603799675344196, + "flos": 29935099152000.0, + "grad_norm": 2.7714527742918453, + "language_loss": 0.6996268, + "learning_rate": 1.639761790481131e-08, + "loss": 0.72099006, + "num_input_tokens_seen": 172421145, + "step": 7987, + "time_per_iteration": 3.3748395442962646 + }, + { + "auxiliary_loss_clip": 0.01157061, + "auxiliary_loss_mlp": 0.01023753, + "balance_loss_clip": 1.04680431, + "balance_loss_mlp": 1.01668096, + "epoch": 0.9605002104250586, + "flos": 28001525103360.0, + "grad_norm": 1.890946926036063, + "language_loss": 0.79012394, + "learning_rate": 1.6298223230537754e-08, + "loss": 0.81193209, + "num_input_tokens_seen": 172438945, + "step": 7988, + "time_per_iteration": 2.501830816268921 + }, + { + "auxiliary_loss_clip": 0.01137086, + "auxiliary_loss_mlp": 0.00762449, + "balance_loss_clip": 1.04305685, + "balance_loss_mlp": 1.00055957, + "epoch": 0.9606204533156977, + "flos": 35590490870400.0, + "grad_norm": 2.1495108846194113, + "language_loss": 0.69519734, + "learning_rate": 1.619912948224611e-08, + "loss": 0.71419275, + "num_input_tokens_seen": 172460150, + "step": 7989, + "time_per_iteration": 2.5941388607025146 + }, + { + "auxiliary_loss_clip": 0.01119287, + "auxiliary_loss_mlp": 0.01026585, + "balance_loss_clip": 1.04361618, + "balance_loss_mlp": 1.01869917, + "epoch": 0.9607406962063368, + "flos": 26574614346240.0, + "grad_norm": 2.5086573585542005, + "language_loss": 0.61087775, + "learning_rate": 1.6100336674969682e-08, + "loss": 0.6323365, + "num_input_tokens_seen": 172478990, + "step": 7990, + "time_per_iteration": 2.565671682357788 + }, + { + "auxiliary_loss_clip": 0.01114038, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.04016662, + "balance_loss_mlp": 1.02398002, + "epoch": 0.9608609390969759, + "flos": 25331781813120.0, + "grad_norm": 2.079289169864331, + "language_loss": 0.76338995, + "learning_rate": 1.600184482369449e-08, + "loss": 0.78484714, + "num_input_tokens_seen": 172498905, + "step": 7991, + "time_per_iteration": 3.292734146118164 + }, + { + "auxiliary_loss_clip": 0.01130085, + "auxiliary_loss_mlp": 0.01023374, + "balance_loss_clip": 1.04278541, + "balance_loss_mlp": 1.01493382, + "epoch": 0.960981181987615, + "flos": 21069114082560.0, + "grad_norm": 3.0958854291666467, + "language_loss": 0.88798308, + "learning_rate": 1.5903653943362126e-08, + "loss": 0.90951765, + "num_input_tokens_seen": 172517900, + "step": 7992, + "time_per_iteration": 2.5271735191345215 + }, + { + "auxiliary_loss_clip": 0.01140586, + "auxiliary_loss_mlp": 0.01022441, + "balance_loss_clip": 1.04503298, + "balance_loss_mlp": 1.01550341, + "epoch": 0.9611014248782541, + "flos": 17823256554240.0, + "grad_norm": 1.9674536363203192, + "language_loss": 0.77112669, + "learning_rate": 1.580576404886802e-08, + "loss": 0.79275692, + "num_input_tokens_seen": 172536430, + "step": 7993, + "time_per_iteration": 2.476681709289551 + }, + { + "auxiliary_loss_clip": 0.0115349, + "auxiliary_loss_mlp": 0.01024774, + "balance_loss_clip": 1.04513049, + "balance_loss_mlp": 1.0178901, + "epoch": 0.9612216677688932, + "flos": 19354631040000.0, + "grad_norm": 2.036270857847344, + "language_loss": 0.79949272, + "learning_rate": 1.570817515506162e-08, + "loss": 0.82127535, + "num_input_tokens_seen": 172555120, + "step": 7994, + "time_per_iteration": 2.493501901626587 + }, + { + "auxiliary_loss_clip": 0.01165593, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.04730844, + "balance_loss_mlp": 1.02113748, + "epoch": 0.9613419106595322, + "flos": 15808739207040.0, + "grad_norm": 1.962117824876856, + "language_loss": 0.81333292, + "learning_rate": 1.561088727674753e-08, + "loss": 0.83526981, + "num_input_tokens_seen": 172569330, + "step": 7995, + "time_per_iteration": 2.3851006031036377 + }, + { + "auxiliary_loss_clip": 0.01126723, + "auxiliary_loss_mlp": 0.01030252, + "balance_loss_clip": 1.04318905, + "balance_loss_mlp": 1.02193439, + "epoch": 0.9614621535501714, + "flos": 25702488126720.0, + "grad_norm": 8.807361951222864, + "language_loss": 0.71130747, + "learning_rate": 1.551390042868417e-08, + "loss": 0.73287719, + "num_input_tokens_seen": 172591100, + "step": 7996, + "time_per_iteration": 2.6281070709228516 + }, + { + "auxiliary_loss_clip": 0.01154817, + "auxiliary_loss_mlp": 0.0102361, + "balance_loss_clip": 1.04669523, + "balance_loss_mlp": 1.01598024, + "epoch": 0.9615823964408104, + "flos": 17819054663040.0, + "grad_norm": 1.7569469832161309, + "language_loss": 0.70748198, + "learning_rate": 1.5417214625584207e-08, + "loss": 0.72926629, + "num_input_tokens_seen": 172608755, + "step": 7997, + "time_per_iteration": 2.4399664402008057 + }, + { + "auxiliary_loss_clip": 0.01146601, + "auxiliary_loss_mlp": 0.01024368, + "balance_loss_clip": 1.04201865, + "balance_loss_mlp": 1.01698291, + "epoch": 0.9617026393314495, + "flos": 20190020624640.0, + "grad_norm": 1.8515155832899564, + "language_loss": 0.84984696, + "learning_rate": 1.5320829882114806e-08, + "loss": 0.87155664, + "num_input_tokens_seen": 172626830, + "step": 7998, + "time_per_iteration": 2.4596874713897705 + }, + { + "auxiliary_loss_clip": 0.01165707, + "auxiliary_loss_mlp": 0.01026143, + "balance_loss_clip": 1.04485643, + "balance_loss_mlp": 1.0189395, + "epoch": 0.9618228822220887, + "flos": 20267013427200.0, + "grad_norm": 2.0225400117295913, + "language_loss": 0.79134965, + "learning_rate": 1.5224746212897378e-08, + "loss": 0.81326807, + "num_input_tokens_seen": 172646125, + "step": 7999, + "time_per_iteration": 2.4311530590057373 + }, + { + "auxiliary_loss_clip": 0.01164726, + "auxiliary_loss_mlp": 0.01022366, + "balance_loss_clip": 1.04580212, + "balance_loss_mlp": 1.01529074, + "epoch": 0.9619431251127277, + "flos": 21031300039680.0, + "grad_norm": 1.6592142581285243, + "language_loss": 0.77348912, + "learning_rate": 1.512896363250804e-08, + "loss": 0.79536009, + "num_input_tokens_seen": 172666235, + "step": 8000, + "time_per_iteration": 2.431659460067749 + }, + { + "auxiliary_loss_clip": 0.01152381, + "auxiliary_loss_mlp": 0.01025703, + "balance_loss_clip": 1.04304826, + "balance_loss_mlp": 1.01866364, + "epoch": 0.9620633680033668, + "flos": 22382654538240.0, + "grad_norm": 2.9105417453396547, + "language_loss": 0.7560873, + "learning_rate": 1.503348215547673e-08, + "loss": 0.77786815, + "num_input_tokens_seen": 172687325, + "step": 8001, + "time_per_iteration": 2.473311185836792 + }, + { + "auxiliary_loss_clip": 0.01136546, + "auxiliary_loss_mlp": 0.01024196, + "balance_loss_clip": 1.04361761, + "balance_loss_mlp": 1.01711249, + "epoch": 0.962183610894006, + "flos": 18471730740480.0, + "grad_norm": 1.8314212260682077, + "language_loss": 0.80740333, + "learning_rate": 1.4938301796288078e-08, + "loss": 0.82901073, + "num_input_tokens_seen": 172703895, + "step": 8002, + "time_per_iteration": 2.455267906188965 + }, + { + "auxiliary_loss_clip": 0.01166648, + "auxiliary_loss_mlp": 0.01024362, + "balance_loss_clip": 1.0468204, + "balance_loss_mlp": 1.01664901, + "epoch": 0.962303853784645, + "flos": 18435245500800.0, + "grad_norm": 6.833046417937243, + "language_loss": 0.82007343, + "learning_rate": 1.4843422569380537e-08, + "loss": 0.84198356, + "num_input_tokens_seen": 172720650, + "step": 8003, + "time_per_iteration": 2.374662160873413 + }, + { + "auxiliary_loss_clip": 0.01108538, + "auxiliary_loss_mlp": 0.01020436, + "balance_loss_clip": 1.03900182, + "balance_loss_mlp": 1.01343489, + "epoch": 0.9624240966752841, + "flos": 26391074826240.0, + "grad_norm": 2.169748428777136, + "language_loss": 0.83160806, + "learning_rate": 1.4748844489147483e-08, + "loss": 0.85289776, + "num_input_tokens_seen": 172737640, + "step": 8004, + "time_per_iteration": 2.589351177215576 + }, + { + "auxiliary_loss_clip": 0.0113847, + "auxiliary_loss_mlp": 0.0102153, + "balance_loss_clip": 1.04233217, + "balance_loss_mlp": 1.0149374, + "epoch": 0.9625443395659231, + "flos": 14647675985280.0, + "grad_norm": 2.7552114587699172, + "language_loss": 0.71063918, + "learning_rate": 1.4654567569936326e-08, + "loss": 0.73223925, + "num_input_tokens_seen": 172755215, + "step": 8005, + "time_per_iteration": 2.4369215965270996 + }, + { + "auxiliary_loss_clip": 0.01106605, + "auxiliary_loss_mlp": 0.01030953, + "balance_loss_clip": 1.0407207, + "balance_loss_mlp": 1.02358603, + "epoch": 0.9626645824565623, + "flos": 18367626147840.0, + "grad_norm": 2.2366282759723903, + "language_loss": 0.83063912, + "learning_rate": 1.456059182604874e-08, + "loss": 0.85201466, + "num_input_tokens_seen": 172774020, + "step": 8006, + "time_per_iteration": 2.548853874206543 + }, + { + "auxiliary_loss_clip": 0.01169318, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.0489949, + "balance_loss_mlp": 1.02038217, + "epoch": 0.9627848253472013, + "flos": 16580424021120.0, + "grad_norm": 1.9967247713779508, + "language_loss": 0.76727498, + "learning_rate": 1.4466917271740653e-08, + "loss": 0.78925055, + "num_input_tokens_seen": 172792220, + "step": 8007, + "time_per_iteration": 2.4310503005981445 + }, + { + "auxiliary_loss_clip": 0.01134866, + "auxiliary_loss_mlp": 0.01027045, + "balance_loss_clip": 1.04246259, + "balance_loss_mlp": 1.01939213, + "epoch": 0.9629050682378404, + "flos": 20886867452160.0, + "grad_norm": 2.2421300834134428, + "language_loss": 0.67874563, + "learning_rate": 1.4373543921222697e-08, + "loss": 0.70036471, + "num_input_tokens_seen": 172811805, + "step": 8008, + "time_per_iteration": 2.4909770488739014 + }, + { + "auxiliary_loss_clip": 0.01136754, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.04463696, + "balance_loss_mlp": 1.01774883, + "epoch": 0.9630253111284796, + "flos": 17019252478080.0, + "grad_norm": 1.752448587405767, + "language_loss": 0.77753413, + "learning_rate": 1.428047178865932e-08, + "loss": 0.79915416, + "num_input_tokens_seen": 172828595, + "step": 8009, + "time_per_iteration": 2.4377002716064453 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01026546, + "balance_loss_clip": 1.04160213, + "balance_loss_mlp": 1.01873231, + "epoch": 0.9631455540191186, + "flos": 20338942412160.0, + "grad_norm": 1.6250636386267765, + "language_loss": 0.74172699, + "learning_rate": 1.4187700888169451e-08, + "loss": 0.7633723, + "num_input_tokens_seen": 172847770, + "step": 8010, + "time_per_iteration": 2.4824717044830322 + }, + { + "auxiliary_loss_clip": 0.01052775, + "auxiliary_loss_mlp": 0.01000821, + "balance_loss_clip": 1.00780964, + "balance_loss_mlp": 0.99978942, + "epoch": 0.9632657969097577, + "flos": 65956700033280.0, + "grad_norm": 0.7497974868088093, + "language_loss": 0.57001805, + "learning_rate": 1.40952312338265e-08, + "loss": 0.590554, + "num_input_tokens_seen": 172912415, + "step": 8011, + "time_per_iteration": 3.086881637573242 + }, + { + "auxiliary_loss_clip": 0.01126289, + "auxiliary_loss_mlp": 0.0102523, + "balance_loss_clip": 1.04117346, + "balance_loss_mlp": 1.01764202, + "epoch": 0.9633860398003968, + "flos": 44419523823360.0, + "grad_norm": 1.7653380501763924, + "language_loss": 0.685013, + "learning_rate": 1.4003062839657909e-08, + "loss": 0.70652819, + "num_input_tokens_seen": 172934895, + "step": 8012, + "time_per_iteration": 2.7042393684387207 + }, + { + "auxiliary_loss_clip": 0.01127922, + "auxiliary_loss_mlp": 0.01020076, + "balance_loss_clip": 1.04337013, + "balance_loss_mlp": 1.01314449, + "epoch": 0.9635062826910359, + "flos": 24827704300800.0, + "grad_norm": 1.6015461557352422, + "language_loss": 0.80307853, + "learning_rate": 1.391119571964583e-08, + "loss": 0.82455856, + "num_input_tokens_seen": 172955835, + "step": 8013, + "time_per_iteration": 4.928661346435547 + }, + { + "auxiliary_loss_clip": 0.01153099, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.0469048, + "balance_loss_mlp": 1.01929808, + "epoch": 0.9636265255816749, + "flos": 15961360095360.0, + "grad_norm": 2.0715256702365568, + "language_loss": 0.72576565, + "learning_rate": 1.3819629887726225e-08, + "loss": 0.74756718, + "num_input_tokens_seen": 172973925, + "step": 8014, + "time_per_iteration": 2.4374892711639404 + }, + { + "auxiliary_loss_clip": 0.01144324, + "auxiliary_loss_mlp": 0.0102507, + "balance_loss_clip": 1.04750764, + "balance_loss_mlp": 1.01762581, + "epoch": 0.9637467684723141, + "flos": 22601781457920.0, + "grad_norm": 2.3860304989121377, + "language_loss": 0.76105368, + "learning_rate": 1.3728365357789317e-08, + "loss": 0.78274763, + "num_input_tokens_seen": 172993290, + "step": 8015, + "time_per_iteration": 2.5137031078338623 + }, + { + "auxiliary_loss_clip": 0.01089317, + "auxiliary_loss_mlp": 0.01023227, + "balance_loss_clip": 1.03890347, + "balance_loss_mlp": 1.01493645, + "epoch": 0.9638670113629532, + "flos": 17565812801280.0, + "grad_norm": 2.804951891784436, + "language_loss": 0.76126933, + "learning_rate": 1.3637402143680254e-08, + "loss": 0.78239483, + "num_input_tokens_seen": 173008190, + "step": 8016, + "time_per_iteration": 2.5531187057495117 + }, + { + "auxiliary_loss_clip": 0.01028686, + "auxiliary_loss_mlp": 0.01005508, + "balance_loss_clip": 1.00855577, + "balance_loss_mlp": 1.00457227, + "epoch": 0.9639872542535922, + "flos": 55072139379840.0, + "grad_norm": 0.7243435339688528, + "language_loss": 0.55060482, + "learning_rate": 1.3546740259197998e-08, + "loss": 0.57094675, + "num_input_tokens_seen": 173061000, + "step": 8017, + "time_per_iteration": 3.803385019302368 + }, + { + "auxiliary_loss_clip": 0.01139818, + "auxiliary_loss_mlp": 0.01028205, + "balance_loss_clip": 1.04451418, + "balance_loss_mlp": 1.02064717, + "epoch": 0.9641074971442314, + "flos": 24134484746880.0, + "grad_norm": 3.191638201036174, + "language_loss": 0.70163953, + "learning_rate": 1.3456379718095989e-08, + "loss": 0.72331977, + "num_input_tokens_seen": 173081415, + "step": 8018, + "time_per_iteration": 2.5065958499908447 + }, + { + "auxiliary_loss_clip": 0.0103933, + "auxiliary_loss_mlp": 0.01001429, + "balance_loss_clip": 1.00667906, + "balance_loss_mlp": 1.00048769, + "epoch": 0.9642277400348704, + "flos": 66747416077440.0, + "grad_norm": 2.027099662614555, + "language_loss": 0.6200465, + "learning_rate": 1.3366320534081487e-08, + "loss": 0.64045405, + "num_input_tokens_seen": 173144095, + "step": 8019, + "time_per_iteration": 3.1063430309295654 + }, + { + "auxiliary_loss_clip": 0.01153792, + "auxiliary_loss_mlp": 0.01021106, + "balance_loss_clip": 1.04555678, + "balance_loss_mlp": 1.01378667, + "epoch": 0.9643479829255095, + "flos": 30920272450560.0, + "grad_norm": 4.514097332109828, + "language_loss": 0.7591114, + "learning_rate": 1.3276562720816675e-08, + "loss": 0.78086042, + "num_input_tokens_seen": 173165605, + "step": 8020, + "time_per_iteration": 2.5232558250427246 + }, + { + "auxiliary_loss_clip": 0.01167782, + "auxiliary_loss_mlp": 0.01025305, + "balance_loss_clip": 1.04653406, + "balance_loss_mlp": 1.01797056, + "epoch": 0.9644682258161487, + "flos": 20048245643520.0, + "grad_norm": 2.2367460036338476, + "language_loss": 0.82540226, + "learning_rate": 1.3187106291917549e-08, + "loss": 0.84733319, + "num_input_tokens_seen": 173182595, + "step": 8021, + "time_per_iteration": 2.4007842540740967 + }, + { + "auxiliary_loss_clip": 0.01148655, + "auxiliary_loss_mlp": 0.01022381, + "balance_loss_clip": 1.0454638, + "balance_loss_mlp": 1.0158627, + "epoch": 0.9645884687067877, + "flos": 21178713456000.0, + "grad_norm": 2.332089017488306, + "language_loss": 0.70515442, + "learning_rate": 1.309795126095503e-08, + "loss": 0.72686481, + "num_input_tokens_seen": 173200895, + "step": 8022, + "time_per_iteration": 2.4498631954193115 + }, + { + "auxiliary_loss_clip": 0.01077637, + "auxiliary_loss_mlp": 0.01027533, + "balance_loss_clip": 1.0365566, + "balance_loss_mlp": 1.01990914, + "epoch": 0.9647087115974268, + "flos": 18945967029120.0, + "grad_norm": 2.330807481950482, + "language_loss": 0.81060308, + "learning_rate": 1.3009097641453192e-08, + "loss": 0.83165479, + "num_input_tokens_seen": 173218745, + "step": 8023, + "time_per_iteration": 2.5872962474823 + }, + { + "auxiliary_loss_clip": 0.01136741, + "auxiliary_loss_mlp": 0.01022253, + "balance_loss_clip": 1.04389703, + "balance_loss_mlp": 1.01501405, + "epoch": 0.9648289544880659, + "flos": 16545088016640.0, + "grad_norm": 1.6598694306329322, + "language_loss": 0.76005781, + "learning_rate": 1.2920545446891474e-08, + "loss": 0.78164774, + "num_input_tokens_seen": 173235465, + "step": 8024, + "time_per_iteration": 2.4415509700775146 + }, + { + "auxiliary_loss_clip": 0.01142013, + "auxiliary_loss_mlp": 0.0103122, + "balance_loss_clip": 1.0470171, + "balance_loss_mlp": 1.02332854, + "epoch": 0.964949197378705, + "flos": 24057527857920.0, + "grad_norm": 1.6016061196065985, + "language_loss": 0.70756853, + "learning_rate": 1.2832294690703127e-08, + "loss": 0.7293008, + "num_input_tokens_seen": 173254440, + "step": 8025, + "time_per_iteration": 2.4899356365203857 + }, + { + "auxiliary_loss_clip": 0.01152712, + "auxiliary_loss_mlp": 0.01026485, + "balance_loss_clip": 1.04694998, + "balance_loss_mlp": 1.0187602, + "epoch": 0.965069440269344, + "flos": 23365565280000.0, + "grad_norm": 2.6295805140959208, + "language_loss": 0.77575946, + "learning_rate": 1.2744345386275668e-08, + "loss": 0.79755139, + "num_input_tokens_seen": 173273980, + "step": 8026, + "time_per_iteration": 2.4643962383270264 + }, + { + "auxiliary_loss_clip": 0.01148954, + "auxiliary_loss_mlp": 0.0102485, + "balance_loss_clip": 1.04966092, + "balance_loss_mlp": 1.01733065, + "epoch": 0.9651896831599832, + "flos": 25374875155200.0, + "grad_norm": 3.1624524728647927, + "language_loss": 0.78811193, + "learning_rate": 1.265669754695109e-08, + "loss": 0.80984998, + "num_input_tokens_seen": 173293550, + "step": 8027, + "time_per_iteration": 2.5575897693634033 + }, + { + "auxiliary_loss_clip": 0.01096791, + "auxiliary_loss_mlp": 0.01029019, + "balance_loss_clip": 1.03697562, + "balance_loss_mlp": 1.0213778, + "epoch": 0.9653099260506223, + "flos": 22272875596800.0, + "grad_norm": 1.9539853467348203, + "language_loss": 0.82040668, + "learning_rate": 1.2569351186025201e-08, + "loss": 0.84166473, + "num_input_tokens_seen": 173312005, + "step": 8028, + "time_per_iteration": 2.5741868019104004 + }, + { + "auxiliary_loss_clip": 0.01111965, + "auxiliary_loss_mlp": 0.01023748, + "balance_loss_clip": 1.03924918, + "balance_loss_mlp": 1.01649404, + "epoch": 0.9654301689412613, + "flos": 26760847386240.0, + "grad_norm": 1.8511248610831106, + "language_loss": 0.75488448, + "learning_rate": 1.2482306316748737e-08, + "loss": 0.7762416, + "num_input_tokens_seen": 173332450, + "step": 8029, + "time_per_iteration": 2.54937481880188 + }, + { + "auxiliary_loss_clip": 0.01156794, + "auxiliary_loss_mlp": 0.01024313, + "balance_loss_clip": 1.04489267, + "balance_loss_mlp": 1.01746774, + "epoch": 0.9655504118319005, + "flos": 17412689122560.0, + "grad_norm": 2.1419745850580183, + "language_loss": 0.78395593, + "learning_rate": 1.2395562952326021e-08, + "loss": 0.805767, + "num_input_tokens_seen": 173349610, + "step": 8030, + "time_per_iteration": 2.421171188354492 + }, + { + "auxiliary_loss_clip": 0.01149691, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.04707778, + "balance_loss_mlp": 1.02248883, + "epoch": 0.9656706547225395, + "flos": 22126970551680.0, + "grad_norm": 2.4326111017236367, + "language_loss": 0.80971301, + "learning_rate": 1.2309121105916309e-08, + "loss": 0.83151895, + "num_input_tokens_seen": 173367900, + "step": 8031, + "time_per_iteration": 2.505610466003418 + }, + { + "auxiliary_loss_clip": 0.01156806, + "auxiliary_loss_mlp": 0.01024424, + "balance_loss_clip": 1.04622829, + "balance_loss_mlp": 1.0173068, + "epoch": 0.9657908976131786, + "flos": 37049289926400.0, + "grad_norm": 1.8606587905861858, + "language_loss": 0.69356799, + "learning_rate": 1.222298079063222e-08, + "loss": 0.71538031, + "num_input_tokens_seen": 173389040, + "step": 8032, + "time_per_iteration": 2.644381523132324 + }, + { + "auxiliary_loss_clip": 0.01152277, + "auxiliary_loss_mlp": 0.01024186, + "balance_loss_clip": 1.04541278, + "balance_loss_mlp": 1.01720893, + "epoch": 0.9659111405038178, + "flos": 24389809597440.0, + "grad_norm": 2.1156326676099857, + "language_loss": 0.72487485, + "learning_rate": 1.2137142019541524e-08, + "loss": 0.74663949, + "num_input_tokens_seen": 173407595, + "step": 8033, + "time_per_iteration": 2.5079541206359863 + }, + { + "auxiliary_loss_clip": 0.01145958, + "auxiliary_loss_mlp": 0.01025711, + "balance_loss_clip": 1.04442894, + "balance_loss_mlp": 1.01864827, + "epoch": 0.9660313833944568, + "flos": 25009412227200.0, + "grad_norm": 1.9314309635836888, + "language_loss": 0.73693746, + "learning_rate": 1.2051604805666027e-08, + "loss": 0.75865418, + "num_input_tokens_seen": 173424720, + "step": 8034, + "time_per_iteration": 2.5104217529296875 + }, + { + "auxiliary_loss_clip": 0.01167649, + "auxiliary_loss_mlp": 0.00762115, + "balance_loss_clip": 1.04757667, + "balance_loss_mlp": 1.00070572, + "epoch": 0.9661516262850959, + "flos": 11801575895040.0, + "grad_norm": 2.092332334457056, + "language_loss": 0.78375548, + "learning_rate": 1.196636916198135e-08, + "loss": 0.80305314, + "num_input_tokens_seen": 173442260, + "step": 8035, + "time_per_iteration": 2.4015579223632812 + }, + { + "auxiliary_loss_clip": 0.01168767, + "auxiliary_loss_mlp": 0.01020901, + "balance_loss_clip": 1.0474515, + "balance_loss_mlp": 1.01370716, + "epoch": 0.9662718691757349, + "flos": 20047778766720.0, + "grad_norm": 2.0127602803346667, + "language_loss": 0.76855898, + "learning_rate": 1.1881435101418036e-08, + "loss": 0.7904557, + "num_input_tokens_seen": 173461675, + "step": 8036, + "time_per_iteration": 2.435908555984497 + }, + { + "auxiliary_loss_clip": 0.01043085, + "auxiliary_loss_mlp": 0.01000733, + "balance_loss_clip": 1.00818658, + "balance_loss_mlp": 0.99974912, + "epoch": 0.9663921120663741, + "flos": 68027703517440.0, + "grad_norm": 0.7302208708272737, + "language_loss": 0.65556467, + "learning_rate": 1.1796802636860003e-08, + "loss": 0.67600286, + "num_input_tokens_seen": 173530205, + "step": 8037, + "time_per_iteration": 3.1178343296051025 + }, + { + "auxiliary_loss_clip": 0.01166572, + "auxiliary_loss_mlp": 0.0102647, + "balance_loss_clip": 1.04561818, + "balance_loss_mlp": 1.01889467, + "epoch": 0.9665123549570132, + "flos": 26322916769280.0, + "grad_norm": 2.8557517640742214, + "language_loss": 0.73892391, + "learning_rate": 1.1712471781146316e-08, + "loss": 0.76085436, + "num_input_tokens_seen": 173549540, + "step": 8038, + "time_per_iteration": 2.447465181350708 + }, + { + "auxiliary_loss_clip": 0.01164629, + "auxiliary_loss_mlp": 0.010265, + "balance_loss_clip": 1.04474783, + "balance_loss_mlp": 1.01917434, + "epoch": 0.9666325978476522, + "flos": 43941121557120.0, + "grad_norm": 2.0152379688183903, + "language_loss": 0.67098987, + "learning_rate": 1.1628442547069628e-08, + "loss": 0.69290113, + "num_input_tokens_seen": 173571740, + "step": 8039, + "time_per_iteration": 3.3800137042999268 + }, + { + "auxiliary_loss_clip": 0.01156093, + "auxiliary_loss_mlp": 0.00762406, + "balance_loss_clip": 1.04473567, + "balance_loss_mlp": 1.00067043, + "epoch": 0.9667528407382914, + "flos": 21543422198400.0, + "grad_norm": 2.520186446037593, + "language_loss": 0.77241576, + "learning_rate": 1.1544714947377521e-08, + "loss": 0.7916007, + "num_input_tokens_seen": 173589425, + "step": 8040, + "time_per_iteration": 4.075325012207031 + }, + { + "auxiliary_loss_clip": 0.01168878, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.04742837, + "balance_loss_mlp": 1.02092671, + "epoch": 0.9668730836289304, + "flos": 23878585278720.0, + "grad_norm": 56.01021971545861, + "language_loss": 0.70019454, + "learning_rate": 1.1461288994770945e-08, + "loss": 0.72217643, + "num_input_tokens_seen": 173608500, + "step": 8041, + "time_per_iteration": 2.4201924800872803 + }, + { + "auxiliary_loss_clip": 0.0116876, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.04575694, + "balance_loss_mlp": 1.02188826, + "epoch": 0.9669933265195695, + "flos": 28293011971200.0, + "grad_norm": 1.9983955295097686, + "language_loss": 0.77087688, + "learning_rate": 1.1378164701906002e-08, + "loss": 0.79285896, + "num_input_tokens_seen": 173630265, + "step": 8042, + "time_per_iteration": 2.463726282119751 + }, + { + "auxiliary_loss_clip": 0.01170056, + "auxiliary_loss_mlp": 0.01026431, + "balance_loss_clip": 1.04778385, + "balance_loss_mlp": 1.01913822, + "epoch": 0.9671135694102087, + "flos": 22454763091200.0, + "grad_norm": 3.899061178011049, + "language_loss": 0.66443056, + "learning_rate": 1.1295342081392156e-08, + "loss": 0.68639541, + "num_input_tokens_seen": 173649625, + "step": 8043, + "time_per_iteration": 2.4613916873931885 + }, + { + "auxiliary_loss_clip": 0.01141045, + "auxiliary_loss_mlp": 0.01023408, + "balance_loss_clip": 1.04302907, + "balance_loss_mlp": 1.01588547, + "epoch": 0.9672338123008477, + "flos": 20155941596160.0, + "grad_norm": 1.6486887675436754, + "language_loss": 0.69102621, + "learning_rate": 1.1212821145793804e-08, + "loss": 0.71267068, + "num_input_tokens_seen": 173669240, + "step": 8044, + "time_per_iteration": 3.2383525371551514 + }, + { + "auxiliary_loss_clip": 0.01141, + "auxiliary_loss_mlp": 0.01024771, + "balance_loss_clip": 1.04370904, + "balance_loss_mlp": 1.01734471, + "epoch": 0.9673540551914868, + "flos": 16977487939200.0, + "grad_norm": 1.9601130012199868, + "language_loss": 0.78689933, + "learning_rate": 1.1130601907629156e-08, + "loss": 0.80855703, + "num_input_tokens_seen": 173686970, + "step": 8045, + "time_per_iteration": 2.4504709243774414 + }, + { + "auxiliary_loss_clip": 0.01054706, + "auxiliary_loss_mlp": 0.01001627, + "balance_loss_clip": 1.00718403, + "balance_loss_mlp": 1.0007453, + "epoch": 0.9674742980821259, + "flos": 61892903952000.0, + "grad_norm": 0.7974315181202021, + "language_loss": 0.64795947, + "learning_rate": 1.1048684379370899e-08, + "loss": 0.66852278, + "num_input_tokens_seen": 173747655, + "step": 8046, + "time_per_iteration": 3.0120019912719727 + }, + { + "auxiliary_loss_clip": 0.01130178, + "auxiliary_loss_mlp": 0.01022594, + "balance_loss_clip": 1.04397488, + "balance_loss_mlp": 1.01592386, + "epoch": 0.967594540972765, + "flos": 18697824898560.0, + "grad_norm": 2.1608267924452016, + "language_loss": 0.74414253, + "learning_rate": 1.0967068573445759e-08, + "loss": 0.7656703, + "num_input_tokens_seen": 173765140, + "step": 8047, + "time_per_iteration": 2.4586079120635986 + }, + { + "auxiliary_loss_clip": 0.01135372, + "auxiliary_loss_mlp": 0.01024441, + "balance_loss_clip": 1.04261839, + "balance_loss_mlp": 1.01707411, + "epoch": 0.967714783863404, + "flos": 20777411733120.0, + "grad_norm": 2.233891256598366, + "language_loss": 0.65217477, + "learning_rate": 1.0885754502234945e-08, + "loss": 0.67377293, + "num_input_tokens_seen": 173784800, + "step": 8048, + "time_per_iteration": 2.4674291610717773 + }, + { + "auxiliary_loss_clip": 0.01124635, + "auxiliary_loss_mlp": 0.01024976, + "balance_loss_clip": 1.04402065, + "balance_loss_mlp": 1.0178231, + "epoch": 0.9678350267540432, + "flos": 23185473465600.0, + "grad_norm": 2.1322186559471086, + "language_loss": 0.77977502, + "learning_rate": 1.08047421780737e-08, + "loss": 0.80127108, + "num_input_tokens_seen": 173803990, + "step": 8049, + "time_per_iteration": 2.5113677978515625 + }, + { + "auxiliary_loss_clip": 0.01145498, + "auxiliary_loss_mlp": 0.00761895, + "balance_loss_clip": 1.04440379, + "balance_loss_mlp": 1.00058949, + "epoch": 0.9679552696446823, + "flos": 21726063878400.0, + "grad_norm": 2.1380685239376906, + "language_loss": 0.74191016, + "learning_rate": 1.0724031613251305e-08, + "loss": 0.76098406, + "num_input_tokens_seen": 173821890, + "step": 8050, + "time_per_iteration": 2.4896881580352783 + }, + { + "auxiliary_loss_clip": 0.01159815, + "auxiliary_loss_mlp": 0.01025484, + "balance_loss_clip": 1.04652882, + "balance_loss_mlp": 1.01776528, + "epoch": 0.9680755125353213, + "flos": 26869046129280.0, + "grad_norm": 2.2124597904357124, + "language_loss": 0.66237211, + "learning_rate": 1.0643622820011744e-08, + "loss": 0.68422508, + "num_input_tokens_seen": 173842945, + "step": 8051, + "time_per_iteration": 2.494203805923462 + }, + { + "auxiliary_loss_clip": 0.01170877, + "auxiliary_loss_mlp": 0.01027904, + "balance_loss_clip": 1.04654181, + "balance_loss_mlp": 1.01994681, + "epoch": 0.9681957554259605, + "flos": 28325008010880.0, + "grad_norm": 2.4308596584574715, + "language_loss": 0.67879975, + "learning_rate": 1.0563515810552814e-08, + "loss": 0.70078754, + "num_input_tokens_seen": 173859915, + "step": 8052, + "time_per_iteration": 2.4649057388305664 + }, + { + "auxiliary_loss_clip": 0.01170554, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.0506916, + "balance_loss_mlp": 1.01980662, + "epoch": 0.9683159983165995, + "flos": 20557674282240.0, + "grad_norm": 1.6046980368928612, + "language_loss": 0.73392045, + "learning_rate": 1.0483710597026795e-08, + "loss": 0.75589335, + "num_input_tokens_seen": 173879775, + "step": 8053, + "time_per_iteration": 2.4438889026641846 + }, + { + "auxiliary_loss_clip": 0.01124838, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.04242301, + "balance_loss_mlp": 1.02169621, + "epoch": 0.9684362412072386, + "flos": 24207958016640.0, + "grad_norm": 2.330222451167835, + "language_loss": 0.73990333, + "learning_rate": 1.0404207191540227e-08, + "loss": 0.76144528, + "num_input_tokens_seen": 173900230, + "step": 8054, + "time_per_iteration": 2.5550057888031006 + }, + { + "auxiliary_loss_clip": 0.01166392, + "auxiliary_loss_mlp": 0.01024492, + "balance_loss_clip": 1.04644549, + "balance_loss_mlp": 1.0172267, + "epoch": 0.9685564840978778, + "flos": 22346241125760.0, + "grad_norm": 2.7782934519455216, + "language_loss": 0.74531853, + "learning_rate": 1.0325005606153236e-08, + "loss": 0.76722741, + "num_input_tokens_seen": 173919690, + "step": 8055, + "time_per_iteration": 2.4238998889923096 + }, + { + "auxiliary_loss_clip": 0.01114529, + "auxiliary_loss_mlp": 0.01029242, + "balance_loss_clip": 1.04040241, + "balance_loss_mlp": 1.0221101, + "epoch": 0.9686767269885168, + "flos": 14386389477120.0, + "grad_norm": 2.7346036921529144, + "language_loss": 0.78961277, + "learning_rate": 1.0246105852881104e-08, + "loss": 0.81105047, + "num_input_tokens_seen": 173934790, + "step": 8056, + "time_per_iteration": 2.533811569213867 + }, + { + "auxiliary_loss_clip": 0.01168683, + "auxiliary_loss_mlp": 0.01022538, + "balance_loss_clip": 1.04739261, + "balance_loss_mlp": 1.01499224, + "epoch": 0.9687969698791559, + "flos": 21287630471040.0, + "grad_norm": 2.1526739717901155, + "language_loss": 0.7877506, + "learning_rate": 1.0167507943692476e-08, + "loss": 0.80966282, + "num_input_tokens_seen": 173953875, + "step": 8057, + "time_per_iteration": 2.4269909858703613 + }, + { + "auxiliary_loss_clip": 0.01152111, + "auxiliary_loss_mlp": 0.01033726, + "balance_loss_clip": 1.04731989, + "balance_loss_mlp": 1.02549791, + "epoch": 0.968917212769795, + "flos": 19828328624640.0, + "grad_norm": 2.195353251535549, + "language_loss": 0.71419787, + "learning_rate": 1.008921189051093e-08, + "loss": 0.73605633, + "num_input_tokens_seen": 173971220, + "step": 8058, + "time_per_iteration": 2.4310498237609863 + }, + { + "auxiliary_loss_clip": 0.011693, + "auxiliary_loss_mlp": 0.01026515, + "balance_loss_clip": 1.04826474, + "balance_loss_mlp": 1.01898742, + "epoch": 0.9690374556604341, + "flos": 21681749473920.0, + "grad_norm": 2.6997788462025425, + "language_loss": 0.77506638, + "learning_rate": 1.0011217705213848e-08, + "loss": 0.79702455, + "num_input_tokens_seen": 173989095, + "step": 8059, + "time_per_iteration": 2.4117512702941895 + }, + { + "auxiliary_loss_clip": 0.01149628, + "auxiliary_loss_mlp": 0.01024776, + "balance_loss_clip": 1.04570496, + "balance_loss_mlp": 1.01821685, + "epoch": 0.9691576985510731, + "flos": 32635437851520.0, + "grad_norm": 1.8801408469758973, + "language_loss": 0.74755383, + "learning_rate": 9.933525399632658e-09, + "loss": 0.76929784, + "num_input_tokens_seen": 174007330, + "step": 8060, + "time_per_iteration": 2.5451512336730957 + }, + { + "auxiliary_loss_clip": 0.01134506, + "auxiliary_loss_mlp": 0.01025277, + "balance_loss_clip": 1.04201806, + "balance_loss_mlp": 1.01743317, + "epoch": 0.9692779414417123, + "flos": 35663174040960.0, + "grad_norm": 3.9465462186448614, + "language_loss": 0.64930058, + "learning_rate": 9.856134985553488e-09, + "loss": 0.67089844, + "num_input_tokens_seen": 174027055, + "step": 8061, + "time_per_iteration": 2.5831146240234375 + }, + { + "auxiliary_loss_clip": 0.01167647, + "auxiliary_loss_mlp": 0.01024286, + "balance_loss_clip": 1.04707146, + "balance_loss_mlp": 1.01651955, + "epoch": 0.9693981843323514, + "flos": 28366952117760.0, + "grad_norm": 1.7129474708624965, + "language_loss": 0.73590678, + "learning_rate": 9.77904647471628e-09, + "loss": 0.75782609, + "num_input_tokens_seen": 174050235, + "step": 8062, + "time_per_iteration": 2.493574380874634 + }, + { + "auxiliary_loss_clip": 0.0110466, + "auxiliary_loss_mlp": 0.01024643, + "balance_loss_clip": 1.03977299, + "balance_loss_mlp": 1.01741934, + "epoch": 0.9695184272229904, + "flos": 23622865378560.0, + "grad_norm": 1.4574676847996328, + "language_loss": 0.74111295, + "learning_rate": 9.702259878815454e-09, + "loss": 0.76240599, + "num_input_tokens_seen": 174070560, + "step": 8063, + "time_per_iteration": 2.561349630355835 + }, + { + "auxiliary_loss_clip": 0.01158001, + "auxiliary_loss_mlp": 0.01025171, + "balance_loss_clip": 1.04848719, + "balance_loss_mlp": 1.01751804, + "epoch": 0.9696386701136296, + "flos": 23294677789440.0, + "grad_norm": 2.143551999823273, + "language_loss": 0.74310893, + "learning_rate": 9.625775209499254e-09, + "loss": 0.76494062, + "num_input_tokens_seen": 174090565, + "step": 8064, + "time_per_iteration": 2.4522297382354736 + }, + { + "auxiliary_loss_clip": 0.0111822, + "auxiliary_loss_mlp": 0.01024226, + "balance_loss_clip": 1.03908849, + "balance_loss_mlp": 1.01718092, + "epoch": 0.9697589130042686, + "flos": 15121876360320.0, + "grad_norm": 1.9881645283673244, + "language_loss": 0.74365723, + "learning_rate": 9.549592478370172e-09, + "loss": 0.7650817, + "num_input_tokens_seen": 174108745, + "step": 8065, + "time_per_iteration": 2.4932780265808105 + }, + { + "auxiliary_loss_clip": 0.01154387, + "auxiliary_loss_mlp": 0.01022613, + "balance_loss_clip": 1.04450893, + "balance_loss_mlp": 1.01549613, + "epoch": 0.9698791558949077, + "flos": 18879532824960.0, + "grad_norm": 1.8347470715199556, + "language_loss": 0.79113173, + "learning_rate": 9.473711696985632e-09, + "loss": 0.81290174, + "num_input_tokens_seen": 174128075, + "step": 8066, + "time_per_iteration": 3.1871111392974854 + }, + { + "auxiliary_loss_clip": 0.01138906, + "auxiliary_loss_mlp": 0.01026487, + "balance_loss_clip": 1.04364491, + "balance_loss_mlp": 1.01910758, + "epoch": 0.9699993987855468, + "flos": 17931455297280.0, + "grad_norm": 2.9307455274276677, + "language_loss": 0.75884473, + "learning_rate": 9.398132876856201e-09, + "loss": 0.78049868, + "num_input_tokens_seen": 174147040, + "step": 8067, + "time_per_iteration": 4.022068500518799 + }, + { + "auxiliary_loss_clip": 0.01023072, + "auxiliary_loss_mlp": 0.01002192, + "balance_loss_clip": 1.00768209, + "balance_loss_mlp": 1.00100029, + "epoch": 0.9701196416761859, + "flos": 67182186297600.0, + "grad_norm": 0.7795336309331165, + "language_loss": 0.60844868, + "learning_rate": 9.322856029447379e-09, + "loss": 0.62870133, + "num_input_tokens_seen": 174208225, + "step": 8068, + "time_per_iteration": 3.0260491371154785 + }, + { + "auxiliary_loss_clip": 0.01165327, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.0470885, + "balance_loss_mlp": 1.02043629, + "epoch": 0.970239884566825, + "flos": 24277804012800.0, + "grad_norm": 2.3654589492184743, + "language_loss": 0.79985833, + "learning_rate": 9.247881166178695e-09, + "loss": 0.82178545, + "num_input_tokens_seen": 174226935, + "step": 8069, + "time_per_iteration": 2.4422974586486816 + }, + { + "auxiliary_loss_clip": 0.01134742, + "auxiliary_loss_mlp": 0.01026263, + "balance_loss_clip": 1.04466891, + "balance_loss_mlp": 1.01919103, + "epoch": 0.970360127457464, + "flos": 25301689194240.0, + "grad_norm": 2.248478266231203, + "language_loss": 0.7653088, + "learning_rate": 9.173208298423274e-09, + "loss": 0.78691888, + "num_input_tokens_seen": 174248140, + "step": 8070, + "time_per_iteration": 2.5679781436920166 + }, + { + "auxiliary_loss_clip": 0.0110756, + "auxiliary_loss_mlp": 0.0076243, + "balance_loss_clip": 1.0414803, + "balance_loss_mlp": 1.00059974, + "epoch": 0.9704803703481032, + "flos": 29572473398400.0, + "grad_norm": 1.5675571739206446, + "language_loss": 0.76265132, + "learning_rate": 9.09883743750961e-09, + "loss": 0.78135121, + "num_input_tokens_seen": 174271030, + "step": 8071, + "time_per_iteration": 3.4086062908172607 + }, + { + "auxiliary_loss_clip": 0.01137698, + "auxiliary_loss_mlp": 0.01023099, + "balance_loss_clip": 1.0436784, + "balance_loss_mlp": 1.01599765, + "epoch": 0.9706006132387422, + "flos": 17380046638080.0, + "grad_norm": 2.1504605092004816, + "language_loss": 0.83707047, + "learning_rate": 9.024768594719124e-09, + "loss": 0.85867834, + "num_input_tokens_seen": 174289410, + "step": 8072, + "time_per_iteration": 2.472027540206909 + }, + { + "auxiliary_loss_clip": 0.01125632, + "auxiliary_loss_mlp": 0.01022445, + "balance_loss_clip": 1.04430175, + "balance_loss_mlp": 1.01540565, + "epoch": 0.9707208561293813, + "flos": 18186421011840.0, + "grad_norm": 2.1905240454937247, + "language_loss": 0.72456491, + "learning_rate": 8.95100178128816e-09, + "loss": 0.74604565, + "num_input_tokens_seen": 174308550, + "step": 8073, + "time_per_iteration": 2.5601024627685547 + }, + { + "auxiliary_loss_clip": 0.01140289, + "auxiliary_loss_mlp": 0.01025738, + "balance_loss_clip": 1.04389286, + "balance_loss_mlp": 1.01767397, + "epoch": 0.9708410990200205, + "flos": 31248388212480.0, + "grad_norm": 2.1802214869354053, + "language_loss": 0.70234752, + "learning_rate": 8.877537008407321e-09, + "loss": 0.72400773, + "num_input_tokens_seen": 174328600, + "step": 8074, + "time_per_iteration": 2.5519375801086426 + }, + { + "auxiliary_loss_clip": 0.01144563, + "auxiliary_loss_mlp": 0.01024396, + "balance_loss_clip": 1.04533827, + "balance_loss_mlp": 1.0172646, + "epoch": 0.9709613419106595, + "flos": 30554450386560.0, + "grad_norm": 1.5534238203054866, + "language_loss": 0.68660533, + "learning_rate": 8.804374287221028e-09, + "loss": 0.70829487, + "num_input_tokens_seen": 174349835, + "step": 8075, + "time_per_iteration": 2.582050323486328 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.01025016, + "balance_loss_clip": 1.03727722, + "balance_loss_mlp": 1.01743722, + "epoch": 0.9710815848012986, + "flos": 23730166281600.0, + "grad_norm": 1.6482358708308027, + "language_loss": 0.84381938, + "learning_rate": 8.731513628827958e-09, + "loss": 0.86523616, + "num_input_tokens_seen": 174369200, + "step": 8076, + "time_per_iteration": 2.5404446125030518 + }, + { + "auxiliary_loss_clip": 0.01155827, + "auxiliary_loss_mlp": 0.0102469, + "balance_loss_clip": 1.04663086, + "balance_loss_mlp": 1.01738501, + "epoch": 0.9712018276919377, + "flos": 23761875012480.0, + "grad_norm": 2.206246956857742, + "language_loss": 0.82513309, + "learning_rate": 8.658955044280825e-09, + "loss": 0.84693825, + "num_input_tokens_seen": 174388125, + "step": 8077, + "time_per_iteration": 2.469510316848755 + }, + { + "auxiliary_loss_clip": 0.01149696, + "auxiliary_loss_mlp": 0.01022496, + "balance_loss_clip": 1.04493761, + "balance_loss_mlp": 1.0149678, + "epoch": 0.9713220705825768, + "flos": 23330983461120.0, + "grad_norm": 1.53870487119585, + "language_loss": 0.77385879, + "learning_rate": 8.586698544587268e-09, + "loss": 0.79558074, + "num_input_tokens_seen": 174409735, + "step": 8078, + "time_per_iteration": 2.4790873527526855 + }, + { + "auxiliary_loss_clip": 0.01133109, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.04283881, + "balance_loss_mlp": 1.02069843, + "epoch": 0.9714423134732159, + "flos": 22200946611840.0, + "grad_norm": 2.1185867418630897, + "language_loss": 0.73885357, + "learning_rate": 8.514744140707853e-09, + "loss": 0.76047081, + "num_input_tokens_seen": 174428875, + "step": 8079, + "time_per_iteration": 2.5228824615478516 + }, + { + "auxiliary_loss_clip": 0.01165521, + "auxiliary_loss_mlp": 0.01021161, + "balance_loss_clip": 1.04619658, + "balance_loss_mlp": 1.01410353, + "epoch": 0.971562556363855, + "flos": 20229917656320.0, + "grad_norm": 1.6404285498649962, + "language_loss": 0.76206893, + "learning_rate": 8.443091843558515e-09, + "loss": 0.78393579, + "num_input_tokens_seen": 174447960, + "step": 8080, + "time_per_iteration": 2.4389865398406982 + }, + { + "auxiliary_loss_clip": 0.01134637, + "auxiliary_loss_mlp": 0.01022978, + "balance_loss_clip": 1.04312658, + "balance_loss_mlp": 1.01528323, + "epoch": 0.9716827992544941, + "flos": 24970197553920.0, + "grad_norm": 2.2036708210937435, + "language_loss": 0.64384067, + "learning_rate": 8.37174166400878e-09, + "loss": 0.66541684, + "num_input_tokens_seen": 174463535, + "step": 8081, + "time_per_iteration": 2.54681658744812 + }, + { + "auxiliary_loss_clip": 0.01166794, + "auxiliary_loss_mlp": 0.01021454, + "balance_loss_clip": 1.04770255, + "balance_loss_mlp": 1.01444781, + "epoch": 0.9718030421451331, + "flos": 24681476033280.0, + "grad_norm": 3.912147257488598, + "language_loss": 0.84746623, + "learning_rate": 8.300693612881992e-09, + "loss": 0.8693487, + "num_input_tokens_seen": 174483600, + "step": 8082, + "time_per_iteration": 2.479210138320923 + }, + { + "auxiliary_loss_clip": 0.01151791, + "auxiliary_loss_mlp": 0.00762004, + "balance_loss_clip": 1.04648042, + "balance_loss_mlp": 1.00058484, + "epoch": 0.9719232850357723, + "flos": 22090700793600.0, + "grad_norm": 2.0587012962422873, + "language_loss": 0.81515497, + "learning_rate": 8.22994770095664e-09, + "loss": 0.83429295, + "num_input_tokens_seen": 174502175, + "step": 8083, + "time_per_iteration": 2.449392557144165 + }, + { + "auxiliary_loss_clip": 0.0114036, + "auxiliary_loss_mlp": 0.01028268, + "balance_loss_clip": 1.04821503, + "balance_loss_mlp": 1.0203706, + "epoch": 0.9720435279264114, + "flos": 23656908493440.0, + "grad_norm": 2.737931254648101, + "language_loss": 0.75273955, + "learning_rate": 8.159503938964585e-09, + "loss": 0.77442586, + "num_input_tokens_seen": 174519495, + "step": 8084, + "time_per_iteration": 2.4800314903259277 + }, + { + "auxiliary_loss_clip": 0.01119429, + "auxiliary_loss_mlp": 0.01017743, + "balance_loss_clip": 1.04277217, + "balance_loss_mlp": 1.01083457, + "epoch": 0.9721637708170504, + "flos": 28365910623360.0, + "grad_norm": 1.8130265712589695, + "language_loss": 0.70518458, + "learning_rate": 8.089362337592164e-09, + "loss": 0.7265563, + "num_input_tokens_seen": 174543120, + "step": 8085, + "time_per_iteration": 2.6152031421661377 + }, + { + "auxiliary_loss_clip": 0.01136066, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.04361677, + "balance_loss_mlp": 1.02240467, + "epoch": 0.9722840137076896, + "flos": 29130807767040.0, + "grad_norm": 1.9128017865226312, + "language_loss": 0.72251463, + "learning_rate": 8.019522907479536e-09, + "loss": 0.74416888, + "num_input_tokens_seen": 174563480, + "step": 8086, + "time_per_iteration": 2.53702712059021 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.0463419, + "balance_loss_mlp": 1.01887774, + "epoch": 0.9724042565983286, + "flos": 19243954258560.0, + "grad_norm": 2.06473077390423, + "language_loss": 0.77585709, + "learning_rate": 7.949985659221558e-09, + "loss": 0.79767787, + "num_input_tokens_seen": 174580745, + "step": 8087, + "time_per_iteration": 2.4546830654144287 + }, + { + "auxiliary_loss_clip": 0.01142043, + "auxiliary_loss_mlp": 0.01028648, + "balance_loss_clip": 1.04473186, + "balance_loss_mlp": 1.0219245, + "epoch": 0.9725244994889677, + "flos": 23039676161280.0, + "grad_norm": 2.160216206767064, + "language_loss": 0.78984773, + "learning_rate": 7.880750603366904e-09, + "loss": 0.81155467, + "num_input_tokens_seen": 174599615, + "step": 8088, + "time_per_iteration": 2.5837514400482178 + }, + { + "auxiliary_loss_clip": 0.01132393, + "auxiliary_loss_mlp": 0.01025783, + "balance_loss_clip": 1.04125321, + "balance_loss_mlp": 1.01756036, + "epoch": 0.9726447423796069, + "flos": 23367468700800.0, + "grad_norm": 1.905888991342626, + "language_loss": 0.80054367, + "learning_rate": 7.811817750418282e-09, + "loss": 0.82212543, + "num_input_tokens_seen": 174618375, + "step": 8089, + "time_per_iteration": 2.525035858154297 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.01022034, + "balance_loss_clip": 1.04322004, + "balance_loss_mlp": 1.01420748, + "epoch": 0.9727649852702459, + "flos": 26541648639360.0, + "grad_norm": 2.024073996670743, + "language_loss": 0.80231923, + "learning_rate": 7.743187110833105e-09, + "loss": 0.8237642, + "num_input_tokens_seen": 174641135, + "step": 8090, + "time_per_iteration": 2.548693895339966 + }, + { + "auxiliary_loss_clip": 0.01141563, + "auxiliary_loss_mlp": 0.01021694, + "balance_loss_clip": 1.04344177, + "balance_loss_mlp": 1.01478863, + "epoch": 0.972885228160885, + "flos": 20522338277760.0, + "grad_norm": 1.668833669223421, + "language_loss": 0.80734229, + "learning_rate": 7.674858695022602e-09, + "loss": 0.82897484, + "num_input_tokens_seen": 174659490, + "step": 8091, + "time_per_iteration": 2.4952847957611084 + }, + { + "auxiliary_loss_clip": 0.01170235, + "auxiliary_loss_mlp": 0.01027995, + "balance_loss_clip": 1.04893386, + "balance_loss_mlp": 1.02025795, + "epoch": 0.9730054710515241, + "flos": 17566064196480.0, + "grad_norm": 2.370332409508284, + "language_loss": 0.76071453, + "learning_rate": 7.606832513351591e-09, + "loss": 0.78269684, + "num_input_tokens_seen": 174677440, + "step": 8092, + "time_per_iteration": 2.3930892944335938 + }, + { + "auxiliary_loss_clip": 0.0106309, + "auxiliary_loss_mlp": 0.00752653, + "balance_loss_clip": 1.00736785, + "balance_loss_mlp": 1.0005312, + "epoch": 0.9731257139421632, + "flos": 68972010117120.0, + "grad_norm": 0.8267476659459099, + "language_loss": 0.63968027, + "learning_rate": 7.539108576140264e-09, + "loss": 0.65783775, + "num_input_tokens_seen": 174741550, + "step": 8093, + "time_per_iteration": 3.956737756729126 + }, + { + "auxiliary_loss_clip": 0.01111911, + "auxiliary_loss_mlp": 0.01020979, + "balance_loss_clip": 1.04236782, + "balance_loss_mlp": 1.01459503, + "epoch": 0.9732459568328022, + "flos": 18478841633280.0, + "grad_norm": 1.9146116402534725, + "language_loss": 0.7018137, + "learning_rate": 7.471686893661732e-09, + "loss": 0.72314262, + "num_input_tokens_seen": 174759845, + "step": 8094, + "time_per_iteration": 3.3082833290100098 + }, + { + "auxiliary_loss_clip": 0.01137181, + "auxiliary_loss_mlp": 0.01024665, + "balance_loss_clip": 1.04397249, + "balance_loss_mlp": 1.01752448, + "epoch": 0.9733661997234414, + "flos": 20883886623360.0, + "grad_norm": 1.6650796069642522, + "language_loss": 0.64285171, + "learning_rate": 7.4045674761442636e-09, + "loss": 0.66447014, + "num_input_tokens_seen": 174777175, + "step": 8095, + "time_per_iteration": 2.4891738891601562 + }, + { + "auxiliary_loss_clip": 0.01167358, + "auxiliary_loss_mlp": 0.00761626, + "balance_loss_clip": 1.04776323, + "balance_loss_mlp": 1.00056553, + "epoch": 0.9734864426140805, + "flos": 23766795175680.0, + "grad_norm": 1.6540538607161386, + "language_loss": 0.74497122, + "learning_rate": 7.337750333769488e-09, + "loss": 0.76426101, + "num_input_tokens_seen": 174796980, + "step": 8096, + "time_per_iteration": 2.4517574310302734 + }, + { + "auxiliary_loss_clip": 0.01143679, + "auxiliary_loss_mlp": 0.01024845, + "balance_loss_clip": 1.04053843, + "balance_loss_mlp": 1.01650596, + "epoch": 0.9736066855047195, + "flos": 35042422176000.0, + "grad_norm": 1.7705204721756345, + "language_loss": 0.7284683, + "learning_rate": 7.2712354766737425e-09, + "loss": 0.75015354, + "num_input_tokens_seen": 174817310, + "step": 8097, + "time_per_iteration": 3.3587119579315186 + }, + { + "auxiliary_loss_clip": 0.01119125, + "auxiliary_loss_mlp": 0.01024259, + "balance_loss_clip": 1.04494619, + "balance_loss_mlp": 1.01632011, + "epoch": 0.9737269283953586, + "flos": 20410620001920.0, + "grad_norm": 1.606153319520327, + "language_loss": 0.8105824, + "learning_rate": 7.2050229149469565e-09, + "loss": 0.83201623, + "num_input_tokens_seen": 174837320, + "step": 8098, + "time_per_iteration": 2.5299766063690186 + }, + { + "auxiliary_loss_clip": 0.01128944, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.04120612, + "balance_loss_mlp": 1.01811051, + "epoch": 0.9738471712859977, + "flos": 28911680847360.0, + "grad_norm": 1.8896872748670663, + "language_loss": 0.63154966, + "learning_rate": 7.139112658633984e-09, + "loss": 0.6530953, + "num_input_tokens_seen": 174857470, + "step": 8099, + "time_per_iteration": 2.569882869720459 + }, + { + "auxiliary_loss_clip": 0.0112431, + "auxiliary_loss_mlp": 0.01023085, + "balance_loss_clip": 1.04452825, + "balance_loss_mlp": 1.0161438, + "epoch": 0.9739674141766368, + "flos": 27782326356480.0, + "grad_norm": 3.9781695762352074, + "language_loss": 0.70273763, + "learning_rate": 7.073504717733048e-09, + "loss": 0.72421157, + "num_input_tokens_seen": 174877035, + "step": 8100, + "time_per_iteration": 2.567214012145996 + }, + { + "auxiliary_loss_clip": 0.01021905, + "auxiliary_loss_mlp": 0.01003186, + "balance_loss_clip": 1.00923407, + "balance_loss_mlp": 1.00219631, + "epoch": 0.9740876570672758, + "flos": 68863057188480.0, + "grad_norm": 0.7337492411141538, + "language_loss": 0.57239008, + "learning_rate": 7.008199102196855e-09, + "loss": 0.592641, + "num_input_tokens_seen": 174938460, + "step": 8101, + "time_per_iteration": 3.1409497261047363 + }, + { + "auxiliary_loss_clip": 0.01037729, + "auxiliary_loss_mlp": 0.01002822, + "balance_loss_clip": 1.01139307, + "balance_loss_mlp": 1.00191617, + "epoch": 0.974207899957915, + "flos": 58236622646400.0, + "grad_norm": 0.7957368218686858, + "language_loss": 0.59026486, + "learning_rate": 6.9431958219321464e-09, + "loss": 0.61067039, + "num_input_tokens_seen": 174994625, + "step": 8102, + "time_per_iteration": 3.019454002380371 + }, + { + "auxiliary_loss_clip": 0.01140391, + "auxiliary_loss_mlp": 0.01025815, + "balance_loss_clip": 1.04436147, + "balance_loss_mlp": 1.01843047, + "epoch": 0.9743281428485541, + "flos": 22600057605120.0, + "grad_norm": 2.2935246787173806, + "language_loss": 0.77777207, + "learning_rate": 6.878494886800146e-09, + "loss": 0.79943413, + "num_input_tokens_seen": 175015400, + "step": 8103, + "time_per_iteration": 2.536686420440674 + }, + { + "auxiliary_loss_clip": 0.01141355, + "auxiliary_loss_mlp": 0.01023473, + "balance_loss_clip": 1.04605877, + "balance_loss_mlp": 1.01597476, + "epoch": 0.9744483857391931, + "flos": 20008815488640.0, + "grad_norm": 2.1226952932567866, + "language_loss": 0.76061267, + "learning_rate": 6.814096306615669e-09, + "loss": 0.78226095, + "num_input_tokens_seen": 175033540, + "step": 8104, + "time_per_iteration": 2.475198745727539 + }, + { + "auxiliary_loss_clip": 0.01143732, + "auxiliary_loss_mlp": 0.01024874, + "balance_loss_clip": 1.04224217, + "balance_loss_mlp": 1.01734352, + "epoch": 0.9745686286298323, + "flos": 17675268520320.0, + "grad_norm": 2.3785371736946117, + "language_loss": 0.65099907, + "learning_rate": 6.750000091148011e-09, + "loss": 0.67268515, + "num_input_tokens_seen": 175050835, + "step": 8105, + "time_per_iteration": 2.513545036315918 + }, + { + "auxiliary_loss_clip": 0.01169323, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.04859066, + "balance_loss_mlp": 1.01901305, + "epoch": 0.9746888715204713, + "flos": 29460252332160.0, + "grad_norm": 1.9527767727641934, + "language_loss": 0.72748309, + "learning_rate": 6.686206250120729e-09, + "loss": 0.74944568, + "num_input_tokens_seen": 175072330, + "step": 8106, + "time_per_iteration": 2.5092272758483887 + }, + { + "auxiliary_loss_clip": 0.01129867, + "auxiliary_loss_mlp": 0.01021563, + "balance_loss_clip": 1.040308, + "balance_loss_mlp": 1.01465809, + "epoch": 0.9748091144111104, + "flos": 18479308510080.0, + "grad_norm": 2.632665755280398, + "language_loss": 0.74778044, + "learning_rate": 6.622714793210749e-09, + "loss": 0.76929474, + "num_input_tokens_seen": 175091250, + "step": 8107, + "time_per_iteration": 2.5091066360473633 + }, + { + "auxiliary_loss_clip": 0.01167121, + "auxiliary_loss_mlp": 0.01021351, + "balance_loss_clip": 1.04591608, + "balance_loss_mlp": 1.01436806, + "epoch": 0.9749293573017496, + "flos": 20665154753280.0, + "grad_norm": 2.421771145122187, + "language_loss": 0.78829789, + "learning_rate": 6.559525730050364e-09, + "loss": 0.81018257, + "num_input_tokens_seen": 175111350, + "step": 8108, + "time_per_iteration": 2.429919481277466 + }, + { + "auxiliary_loss_clip": 0.01130044, + "auxiliary_loss_mlp": 0.01024418, + "balance_loss_clip": 1.04461622, + "balance_loss_mlp": 1.01728916, + "epoch": 0.9750496001923886, + "flos": 18478590238080.0, + "grad_norm": 1.84051876271746, + "language_loss": 0.76025534, + "learning_rate": 6.496639070224574e-09, + "loss": 0.78179991, + "num_input_tokens_seen": 175129835, + "step": 8109, + "time_per_iteration": 2.506392240524292 + }, + { + "auxiliary_loss_clip": 0.01156124, + "auxiliary_loss_mlp": 0.0102335, + "balance_loss_clip": 1.04685724, + "balance_loss_mlp": 1.01671886, + "epoch": 0.9751698430830277, + "flos": 19572967860480.0, + "grad_norm": 2.10549227813261, + "language_loss": 0.83716238, + "learning_rate": 6.4340548232739714e-09, + "loss": 0.85895717, + "num_input_tokens_seen": 175146035, + "step": 8110, + "time_per_iteration": 2.4322288036346436 + }, + { + "auxiliary_loss_clip": 0.01128596, + "auxiliary_loss_mlp": 0.01024749, + "balance_loss_clip": 1.04193413, + "balance_loss_mlp": 1.01736379, + "epoch": 0.9752900859736668, + "flos": 23550325862400.0, + "grad_norm": 1.7803606884084135, + "language_loss": 0.79042554, + "learning_rate": 6.371772998692071e-09, + "loss": 0.81195897, + "num_input_tokens_seen": 175165290, + "step": 8111, + "time_per_iteration": 2.571608543395996 + }, + { + "auxiliary_loss_clip": 0.01130747, + "auxiliary_loss_mlp": 0.01020542, + "balance_loss_clip": 1.04178834, + "balance_loss_mlp": 1.01319826, + "epoch": 0.9754103288643059, + "flos": 20303211358080.0, + "grad_norm": 2.9701056340462153, + "language_loss": 0.65282202, + "learning_rate": 6.309793605927094e-09, + "loss": 0.67433488, + "num_input_tokens_seen": 175183610, + "step": 8112, + "time_per_iteration": 2.504584312438965 + }, + { + "auxiliary_loss_clip": 0.01144505, + "auxiliary_loss_mlp": 0.01021403, + "balance_loss_clip": 1.04374743, + "balance_loss_mlp": 1.01380968, + "epoch": 0.975530571754945, + "flos": 19350680544000.0, + "grad_norm": 1.896340130736083, + "language_loss": 0.80093956, + "learning_rate": 6.248116654381297e-09, + "loss": 0.82259864, + "num_input_tokens_seen": 175202080, + "step": 8113, + "time_per_iteration": 2.4790987968444824 + }, + { + "auxiliary_loss_clip": 0.01139432, + "auxiliary_loss_mlp": 0.01025072, + "balance_loss_clip": 1.04094625, + "balance_loss_mlp": 1.01821113, + "epoch": 0.9756508146455841, + "flos": 23583399310080.0, + "grad_norm": 1.7298351923113606, + "language_loss": 0.72716528, + "learning_rate": 6.186742153410751e-09, + "loss": 0.74881029, + "num_input_tokens_seen": 175221575, + "step": 8114, + "time_per_iteration": 2.5182433128356934 + }, + { + "auxiliary_loss_clip": 0.01138895, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.04563236, + "balance_loss_mlp": 1.01884127, + "epoch": 0.9757710575362232, + "flos": 22966921163520.0, + "grad_norm": 1.8974025636350687, + "language_loss": 0.87463748, + "learning_rate": 6.125670112326453e-09, + "loss": 0.89629757, + "num_input_tokens_seen": 175240835, + "step": 8115, + "time_per_iteration": 2.5218911170959473 + }, + { + "auxiliary_loss_clip": 0.01152095, + "auxiliary_loss_mlp": 0.01023948, + "balance_loss_clip": 1.04239368, + "balance_loss_mlp": 1.01654506, + "epoch": 0.9758913004268622, + "flos": 27966009530880.0, + "grad_norm": 1.5954507031822243, + "language_loss": 0.69844133, + "learning_rate": 6.064900540392548e-09, + "loss": 0.72020173, + "num_input_tokens_seen": 175262930, + "step": 8116, + "time_per_iteration": 2.525923490524292 + }, + { + "auxiliary_loss_clip": 0.01135801, + "auxiliary_loss_mlp": 0.01021786, + "balance_loss_clip": 1.04532313, + "balance_loss_mlp": 1.01454401, + "epoch": 0.9760115433175014, + "flos": 22200156512640.0, + "grad_norm": 2.0354376643377354, + "language_loss": 0.78445756, + "learning_rate": 6.0044334468278835e-09, + "loss": 0.80603343, + "num_input_tokens_seen": 175282275, + "step": 8117, + "time_per_iteration": 2.504361152648926 + }, + { + "auxiliary_loss_clip": 0.01115964, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.04332542, + "balance_loss_mlp": 1.01990795, + "epoch": 0.9761317862081405, + "flos": 26250736389120.0, + "grad_norm": 1.818610823924184, + "language_loss": 0.71681732, + "learning_rate": 5.944268840805345e-09, + "loss": 0.73825312, + "num_input_tokens_seen": 175303020, + "step": 8118, + "time_per_iteration": 2.644371271133423 + }, + { + "auxiliary_loss_clip": 0.01118756, + "auxiliary_loss_mlp": 0.0102191, + "balance_loss_clip": 1.04076397, + "balance_loss_mlp": 1.01510358, + "epoch": 0.9762520290987795, + "flos": 26575440359040.0, + "grad_norm": 2.210366859485191, + "language_loss": 0.64018446, + "learning_rate": 5.88440673145163e-09, + "loss": 0.66159111, + "num_input_tokens_seen": 175324070, + "step": 8119, + "time_per_iteration": 2.584740400314331 + }, + { + "auxiliary_loss_clip": 0.01154252, + "auxiliary_loss_mlp": 0.01025578, + "balance_loss_clip": 1.04855394, + "balance_loss_mlp": 1.01808834, + "epoch": 0.9763722719894187, + "flos": 18005036307840.0, + "grad_norm": 1.9823411306313141, + "language_loss": 0.82634741, + "learning_rate": 5.824847127848142e-09, + "loss": 0.84814572, + "num_input_tokens_seen": 175342595, + "step": 8120, + "time_per_iteration": 4.040231227874756 + }, + { + "auxiliary_loss_clip": 0.01115344, + "auxiliary_loss_mlp": 0.01023988, + "balance_loss_clip": 1.04372513, + "balance_loss_mlp": 1.01665044, + "epoch": 0.9764925148800577, + "flos": 22455660931200.0, + "grad_norm": 2.011444024231969, + "language_loss": 0.79002368, + "learning_rate": 5.765590039029433e-09, + "loss": 0.81141698, + "num_input_tokens_seen": 175361915, + "step": 8121, + "time_per_iteration": 2.5652103424072266 + }, + { + "auxiliary_loss_clip": 0.01165976, + "auxiliary_loss_mlp": 0.01025548, + "balance_loss_clip": 1.04733837, + "balance_loss_mlp": 1.01813889, + "epoch": 0.9766127577706968, + "flos": 36757084786560.0, + "grad_norm": 1.6944226832000466, + "language_loss": 0.71032131, + "learning_rate": 5.706635473985422e-09, + "loss": 0.7322365, + "num_input_tokens_seen": 175385785, + "step": 8122, + "time_per_iteration": 2.6030287742614746 + }, + { + "auxiliary_loss_clip": 0.01149469, + "auxiliary_loss_mlp": 0.01024092, + "balance_loss_clip": 1.04401159, + "balance_loss_mlp": 1.01681721, + "epoch": 0.976733000661336, + "flos": 22309971367680.0, + "grad_norm": 3.581921947020156, + "language_loss": 0.85255027, + "learning_rate": 5.6479834416591764e-09, + "loss": 0.87428594, + "num_input_tokens_seen": 175405145, + "step": 8123, + "time_per_iteration": 2.460294723510742 + }, + { + "auxiliary_loss_clip": 0.01151584, + "auxiliary_loss_mlp": 0.00762475, + "balance_loss_clip": 1.04623485, + "balance_loss_mlp": 1.00056076, + "epoch": 0.976853243551975, + "flos": 25810938264960.0, + "grad_norm": 2.0545381799601707, + "language_loss": 0.68479317, + "learning_rate": 5.589633950947803e-09, + "loss": 0.70393378, + "num_input_tokens_seen": 175422645, + "step": 8124, + "time_per_iteration": 3.178044080734253 + }, + { + "auxiliary_loss_clip": 0.01140537, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.0459305, + "balance_loss_mlp": 1.02381623, + "epoch": 0.9769734864426141, + "flos": 21397445326080.0, + "grad_norm": 2.140900854067923, + "language_loss": 0.69798207, + "learning_rate": 5.5315870107035535e-09, + "loss": 0.71970314, + "num_input_tokens_seen": 175440695, + "step": 8125, + "time_per_iteration": 2.492844820022583 + }, + { + "auxiliary_loss_clip": 0.01135768, + "auxiliary_loss_mlp": 0.0102361, + "balance_loss_clip": 1.04626989, + "balance_loss_mlp": 1.01611745, + "epoch": 0.9770937293332532, + "flos": 13990977584640.0, + "grad_norm": 1.761394122809354, + "language_loss": 0.78494847, + "learning_rate": 5.473842629731607e-09, + "loss": 0.80654222, + "num_input_tokens_seen": 175459195, + "step": 8126, + "time_per_iteration": 2.4780080318450928 + }, + { + "auxiliary_loss_clip": 0.01146857, + "auxiliary_loss_mlp": 0.00762271, + "balance_loss_clip": 1.04397666, + "balance_loss_mlp": 1.00067592, + "epoch": 0.9772139722238923, + "flos": 17931994001280.0, + "grad_norm": 2.4700068491656406, + "language_loss": 0.77808577, + "learning_rate": 5.416400816792066e-09, + "loss": 0.79717702, + "num_input_tokens_seen": 175476710, + "step": 8127, + "time_per_iteration": 2.4801204204559326 + }, + { + "auxiliary_loss_clip": 0.01165311, + "auxiliary_loss_mlp": 0.010204, + "balance_loss_clip": 1.04602718, + "balance_loss_mlp": 1.0131526, + "epoch": 0.9773342151145313, + "flos": 20446171488000.0, + "grad_norm": 4.996993962023569, + "language_loss": 0.7827704, + "learning_rate": 5.359261580598407e-09, + "loss": 0.8046276, + "num_input_tokens_seen": 175492550, + "step": 8128, + "time_per_iteration": 2.400819778442383 + }, + { + "auxiliary_loss_clip": 0.01156945, + "auxiliary_loss_mlp": 0.01024705, + "balance_loss_clip": 1.04741549, + "balance_loss_mlp": 1.01690531, + "epoch": 0.9774544580051704, + "flos": 11837306949120.0, + "grad_norm": 2.350311685541444, + "language_loss": 0.78130692, + "learning_rate": 5.302424929819027e-09, + "loss": 0.80312341, + "num_input_tokens_seen": 175506560, + "step": 8129, + "time_per_iteration": 2.405646562576294 + }, + { + "auxiliary_loss_clip": 0.01156844, + "auxiliary_loss_mlp": 0.01028131, + "balance_loss_clip": 1.04361379, + "balance_loss_mlp": 1.02066207, + "epoch": 0.9775747008958096, + "flos": 13479932833920.0, + "grad_norm": 3.262343607544472, + "language_loss": 0.73202038, + "learning_rate": 5.24589087307592e-09, + "loss": 0.75387007, + "num_input_tokens_seen": 175524180, + "step": 8130, + "time_per_iteration": 2.4372546672821045 + }, + { + "auxiliary_loss_clip": 0.01167771, + "auxiliary_loss_mlp": 0.01027498, + "balance_loss_clip": 1.04654706, + "balance_loss_mlp": 1.02038097, + "epoch": 0.9776949437864486, + "flos": 59532314042880.0, + "grad_norm": 1.4825424064491346, + "language_loss": 0.65003979, + "learning_rate": 5.189659418944891e-09, + "loss": 0.67199242, + "num_input_tokens_seen": 175554355, + "step": 8131, + "time_per_iteration": 2.8669605255126953 + }, + { + "auxiliary_loss_clip": 0.01167883, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.04888356, + "balance_loss_mlp": 1.01981723, + "epoch": 0.9778151866770877, + "flos": 21178605715200.0, + "grad_norm": 2.558773225234404, + "language_loss": 0.78121066, + "learning_rate": 5.133730575956674e-09, + "loss": 0.80315858, + "num_input_tokens_seen": 175574025, + "step": 8132, + "time_per_iteration": 2.4702508449554443 + }, + { + "auxiliary_loss_clip": 0.01142103, + "auxiliary_loss_mlp": 0.01020732, + "balance_loss_clip": 1.04423881, + "balance_loss_mlp": 1.0134958, + "epoch": 0.9779354295677268, + "flos": 20886795624960.0, + "grad_norm": 1.9241185850621758, + "language_loss": 0.71898246, + "learning_rate": 5.0781043525953696e-09, + "loss": 0.74061078, + "num_input_tokens_seen": 175592090, + "step": 8133, + "time_per_iteration": 2.5327818393707275 + }, + { + "auxiliary_loss_clip": 0.01132665, + "auxiliary_loss_mlp": 0.01026472, + "balance_loss_clip": 1.04542172, + "balance_loss_mlp": 1.01923943, + "epoch": 0.9780556724583659, + "flos": 23440618748160.0, + "grad_norm": 1.7355659335757294, + "language_loss": 0.7351014, + "learning_rate": 5.0227807572995605e-09, + "loss": 0.75669277, + "num_input_tokens_seen": 175614065, + "step": 8134, + "time_per_iteration": 2.530061960220337 + }, + { + "auxiliary_loss_clip": 0.01140566, + "auxiliary_loss_mlp": 0.01023181, + "balance_loss_clip": 1.04209459, + "balance_loss_mlp": 1.01604342, + "epoch": 0.9781759153490049, + "flos": 20923244951040.0, + "grad_norm": 10.96042819041808, + "language_loss": 0.67302477, + "learning_rate": 4.967759798461646e-09, + "loss": 0.69466227, + "num_input_tokens_seen": 175632410, + "step": 8135, + "time_per_iteration": 2.5110161304473877 + }, + { + "auxiliary_loss_clip": 0.01165715, + "auxiliary_loss_mlp": 0.01022481, + "balance_loss_clip": 1.04790306, + "balance_loss_mlp": 1.01569247, + "epoch": 0.9782961582396441, + "flos": 28293191539200.0, + "grad_norm": 2.1210895368520544, + "language_loss": 0.75139022, + "learning_rate": 4.913041484428282e-09, + "loss": 0.77327216, + "num_input_tokens_seen": 175652885, + "step": 8136, + "time_per_iteration": 2.4865927696228027 + }, + { + "auxiliary_loss_clip": 0.01156293, + "auxiliary_loss_mlp": 0.0102083, + "balance_loss_clip": 1.04737353, + "balance_loss_mlp": 1.01377285, + "epoch": 0.9784164011302832, + "flos": 25552955808000.0, + "grad_norm": 1.774181867697472, + "language_loss": 0.74313807, + "learning_rate": 4.858625823500384e-09, + "loss": 0.76490927, + "num_input_tokens_seen": 175670585, + "step": 8137, + "time_per_iteration": 2.491525411605835 + }, + { + "auxiliary_loss_clip": 0.01156691, + "auxiliary_loss_mlp": 0.01025785, + "balance_loss_clip": 1.04603362, + "balance_loss_mlp": 1.01766109, + "epoch": 0.9785366440209222, + "flos": 29965945956480.0, + "grad_norm": 2.2870794552940064, + "language_loss": 0.73532969, + "learning_rate": 4.80451282393246e-09, + "loss": 0.75715441, + "num_input_tokens_seen": 175690570, + "step": 8138, + "time_per_iteration": 2.524644613265991 + }, + { + "auxiliary_loss_clip": 0.01139622, + "auxiliary_loss_mlp": 0.01021113, + "balance_loss_clip": 1.04471803, + "balance_loss_mlp": 1.0141418, + "epoch": 0.9786568869115614, + "flos": 32343591847680.0, + "grad_norm": 2.7954632431571147, + "language_loss": 0.67347223, + "learning_rate": 4.750702493933722e-09, + "loss": 0.69507957, + "num_input_tokens_seen": 175710455, + "step": 8139, + "time_per_iteration": 2.6153886318206787 + }, + { + "auxiliary_loss_clip": 0.01138703, + "auxiliary_loss_mlp": 0.00762026, + "balance_loss_clip": 1.04581487, + "balance_loss_mlp": 1.00059485, + "epoch": 0.9787771298022004, + "flos": 23331414424320.0, + "grad_norm": 2.38544023484483, + "language_loss": 0.85339922, + "learning_rate": 4.697194841666974e-09, + "loss": 0.87240654, + "num_input_tokens_seen": 175729380, + "step": 8140, + "time_per_iteration": 2.515671730041504 + }, + { + "auxiliary_loss_clip": 0.01156368, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.04673672, + "balance_loss_mlp": 1.0210886, + "epoch": 0.9788973726928395, + "flos": 21468548298240.0, + "grad_norm": 1.931549393757214, + "language_loss": 0.81482172, + "learning_rate": 4.6439898752492764e-09, + "loss": 0.83668017, + "num_input_tokens_seen": 175749520, + "step": 8141, + "time_per_iteration": 2.49373722076416 + }, + { + "auxiliary_loss_clip": 0.01054867, + "auxiliary_loss_mlp": 0.00752674, + "balance_loss_clip": 1.00845122, + "balance_loss_mlp": 1.00053906, + "epoch": 0.9790176155834787, + "flos": 68897459439360.0, + "grad_norm": 0.746676899777296, + "language_loss": 0.63639653, + "learning_rate": 4.591087602751731e-09, + "loss": 0.65447199, + "num_input_tokens_seen": 175811380, + "step": 8142, + "time_per_iteration": 3.161210298538208 + }, + { + "auxiliary_loss_clip": 0.01153331, + "auxiliary_loss_mlp": 0.010244, + "balance_loss_clip": 1.0450449, + "balance_loss_mlp": 1.01727772, + "epoch": 0.9791378584741177, + "flos": 21430877909760.0, + "grad_norm": 1.8141108880232333, + "language_loss": 0.71878719, + "learning_rate": 4.538488032199916e-09, + "loss": 0.74056447, + "num_input_tokens_seen": 175829480, + "step": 8143, + "time_per_iteration": 2.5064706802368164 + }, + { + "auxiliary_loss_clip": 0.01155692, + "auxiliary_loss_mlp": 0.01028427, + "balance_loss_clip": 1.04386306, + "balance_loss_mlp": 1.02099752, + "epoch": 0.9792581013647568, + "flos": 20153032594560.0, + "grad_norm": 2.229272488347639, + "language_loss": 0.6856761, + "learning_rate": 4.486191171572784e-09, + "loss": 0.70751727, + "num_input_tokens_seen": 175846750, + "step": 8144, + "time_per_iteration": 2.4654998779296875 + }, + { + "auxiliary_loss_clip": 0.01157435, + "auxiliary_loss_mlp": 0.01021924, + "balance_loss_clip": 1.04826212, + "balance_loss_mlp": 1.01469731, + "epoch": 0.9793783442553959, + "flos": 23728191033600.0, + "grad_norm": 1.477345287245209, + "language_loss": 0.77590144, + "learning_rate": 4.434197028803766e-09, + "loss": 0.79769498, + "num_input_tokens_seen": 175865975, + "step": 8145, + "time_per_iteration": 2.4916505813598633 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01029805, + "balance_loss_clip": 1.04306018, + "balance_loss_mlp": 1.0213232, + "epoch": 0.979498587146035, + "flos": 23038742407680.0, + "grad_norm": 2.4131724481450503, + "language_loss": 0.82033288, + "learning_rate": 4.3825056117805514e-09, + "loss": 0.84196186, + "num_input_tokens_seen": 175881860, + "step": 8146, + "time_per_iteration": 3.2998340129852295 + }, + { + "auxiliary_loss_clip": 0.01166629, + "auxiliary_loss_mlp": 0.01018579, + "balance_loss_clip": 1.04557955, + "balance_loss_mlp": 1.0110625, + "epoch": 0.979618830036674, + "flos": 14318841951360.0, + "grad_norm": 3.904702823643069, + "language_loss": 0.79110515, + "learning_rate": 4.331116928344425e-09, + "loss": 0.81295717, + "num_input_tokens_seen": 175898175, + "step": 8147, + "time_per_iteration": 3.9176769256591797 + }, + { + "auxiliary_loss_clip": 0.01144441, + "auxiliary_loss_mlp": 0.00762254, + "balance_loss_clip": 1.04321861, + "balance_loss_mlp": 1.00054455, + "epoch": 0.9797390729273132, + "flos": 16727514215040.0, + "grad_norm": 2.1915243297233107, + "language_loss": 0.62798721, + "learning_rate": 4.28003098629115e-09, + "loss": 0.64705408, + "num_input_tokens_seen": 175914310, + "step": 8148, + "time_per_iteration": 2.4618661403656006 + }, + { + "auxiliary_loss_clip": 0.01122865, + "auxiliary_loss_mlp": 0.01022243, + "balance_loss_clip": 1.03797722, + "balance_loss_mlp": 1.01438141, + "epoch": 0.9798593158179523, + "flos": 24532661986560.0, + "grad_norm": 2.6786271150092404, + "language_loss": 0.78758937, + "learning_rate": 4.229247793370305e-09, + "loss": 0.80904043, + "num_input_tokens_seen": 175933435, + "step": 8149, + "time_per_iteration": 2.593855142593384 + }, + { + "auxiliary_loss_clip": 0.01170401, + "auxiliary_loss_mlp": 0.01023324, + "balance_loss_clip": 1.04857278, + "balance_loss_mlp": 1.01568246, + "epoch": 0.9799795587085913, + "flos": 27308808339840.0, + "grad_norm": 1.6530204411439908, + "language_loss": 0.70390767, + "learning_rate": 4.178767357285951e-09, + "loss": 0.72584492, + "num_input_tokens_seen": 175955065, + "step": 8150, + "time_per_iteration": 3.2292869091033936 + }, + { + "auxiliary_loss_clip": 0.0115505, + "auxiliary_loss_mlp": 0.00762401, + "balance_loss_clip": 1.04734349, + "balance_loss_mlp": 1.00070965, + "epoch": 0.9800998015992305, + "flos": 26286575184000.0, + "grad_norm": 1.8292701066506256, + "language_loss": 0.7129932, + "learning_rate": 4.128589685695516e-09, + "loss": 0.73216772, + "num_input_tokens_seen": 175975490, + "step": 8151, + "time_per_iteration": 2.5342957973480225 + }, + { + "auxiliary_loss_clip": 0.01167758, + "auxiliary_loss_mlp": 0.01023428, + "balance_loss_clip": 1.04790831, + "balance_loss_mlp": 1.0166924, + "epoch": 0.9802200444898695, + "flos": 16723635546240.0, + "grad_norm": 3.23009129092809, + "language_loss": 0.84422755, + "learning_rate": 4.078714786211135e-09, + "loss": 0.86613941, + "num_input_tokens_seen": 175991340, + "step": 8152, + "time_per_iteration": 2.374114751815796 + }, + { + "auxiliary_loss_clip": 0.01151087, + "auxiliary_loss_mlp": 0.01021437, + "balance_loss_clip": 1.04567981, + "balance_loss_mlp": 1.01431704, + "epoch": 0.9803402873805086, + "flos": 24900459298560.0, + "grad_norm": 1.7315241309703828, + "language_loss": 0.76367354, + "learning_rate": 4.029142666398977e-09, + "loss": 0.78539872, + "num_input_tokens_seen": 176011505, + "step": 8153, + "time_per_iteration": 2.516803026199341 + }, + { + "auxiliary_loss_clip": 0.0116577, + "auxiliary_loss_mlp": 0.0102543, + "balance_loss_clip": 1.04754615, + "balance_loss_mlp": 1.0185039, + "epoch": 0.9804605302711478, + "flos": 22564937082240.0, + "grad_norm": 10.400411814515202, + "language_loss": 0.80280101, + "learning_rate": 3.979873333778805e-09, + "loss": 0.82471299, + "num_input_tokens_seen": 176029680, + "step": 8154, + "time_per_iteration": 2.4341259002685547 + }, + { + "auxiliary_loss_clip": 0.011437, + "auxiliary_loss_mlp": 0.01026011, + "balance_loss_clip": 1.04588389, + "balance_loss_mlp": 1.0186255, + "epoch": 0.9805807731617868, + "flos": 38905368382080.0, + "grad_norm": 2.2216318539268083, + "language_loss": 0.73939943, + "learning_rate": 3.930906795824862e-09, + "loss": 0.76109654, + "num_input_tokens_seen": 176050355, + "step": 8155, + "time_per_iteration": 2.7003955841064453 + }, + { + "auxiliary_loss_clip": 0.01151086, + "auxiliary_loss_mlp": 0.01020659, + "balance_loss_clip": 1.04438233, + "balance_loss_mlp": 1.01326227, + "epoch": 0.9807010160524259, + "flos": 17821999578240.0, + "grad_norm": 3.554778848417184, + "language_loss": 0.76924264, + "learning_rate": 3.882243059965207e-09, + "loss": 0.79096013, + "num_input_tokens_seen": 176068070, + "step": 8156, + "time_per_iteration": 2.4283907413482666 + }, + { + "auxiliary_loss_clip": 0.0114539, + "auxiliary_loss_mlp": 0.01023577, + "balance_loss_clip": 1.04257143, + "balance_loss_mlp": 1.01637936, + "epoch": 0.980821258943065, + "flos": 13552975140480.0, + "grad_norm": 2.6162925784780215, + "language_loss": 0.65320623, + "learning_rate": 3.833882133582156e-09, + "loss": 0.67489588, + "num_input_tokens_seen": 176083730, + "step": 8157, + "time_per_iteration": 2.4526216983795166 + }, + { + "auxiliary_loss_clip": 0.01155703, + "auxiliary_loss_mlp": 0.01023027, + "balance_loss_clip": 1.04579186, + "balance_loss_mlp": 1.01592815, + "epoch": 0.9809415018337041, + "flos": 21689794120320.0, + "grad_norm": 1.7192273235574265, + "language_loss": 0.7814306, + "learning_rate": 3.785824024012285e-09, + "loss": 0.80321795, + "num_input_tokens_seen": 176102730, + "step": 8158, + "time_per_iteration": 2.4534175395965576 + }, + { + "auxiliary_loss_clip": 0.01133883, + "auxiliary_loss_mlp": 0.01026713, + "balance_loss_clip": 1.04735959, + "balance_loss_mlp": 1.0195694, + "epoch": 0.9810617447243432, + "flos": 23294857357440.0, + "grad_norm": 1.5285062253889283, + "language_loss": 0.78164703, + "learning_rate": 3.738068738545541e-09, + "loss": 0.803253, + "num_input_tokens_seen": 176121815, + "step": 8159, + "time_per_iteration": 2.5020835399627686 + }, + { + "auxiliary_loss_clip": 0.01158533, + "auxiliary_loss_mlp": 0.01026714, + "balance_loss_clip": 1.04739499, + "balance_loss_mlp": 1.01940024, + "epoch": 0.9811819876149822, + "flos": 18332038748160.0, + "grad_norm": 2.7033585601600296, + "language_loss": 0.78628474, + "learning_rate": 3.6906162844265733e-09, + "loss": 0.80813718, + "num_input_tokens_seen": 176138900, + "step": 8160, + "time_per_iteration": 2.428943157196045 + }, + { + "auxiliary_loss_clip": 0.01133939, + "auxiliary_loss_mlp": 0.01025866, + "balance_loss_clip": 1.04269075, + "balance_loss_mlp": 1.01809406, + "epoch": 0.9813022305056214, + "flos": 22601961025920.0, + "grad_norm": 1.7941510429778997, + "language_loss": 0.70771092, + "learning_rate": 3.643466668853845e-09, + "loss": 0.72930896, + "num_input_tokens_seen": 176156925, + "step": 8161, + "time_per_iteration": 2.488337516784668 + }, + { + "auxiliary_loss_clip": 0.01140816, + "auxiliary_loss_mlp": 0.01021853, + "balance_loss_clip": 1.0438149, + "balance_loss_mlp": 1.01416373, + "epoch": 0.9814224733962604, + "flos": 25413335642880.0, + "grad_norm": 1.8991436356210438, + "language_loss": 0.75543654, + "learning_rate": 3.59661989898008e-09, + "loss": 0.77706325, + "num_input_tokens_seen": 176177980, + "step": 8162, + "time_per_iteration": 2.5407230854034424 + }, + { + "auxiliary_loss_clip": 0.01117218, + "auxiliary_loss_mlp": 0.0102498, + "balance_loss_clip": 1.04272771, + "balance_loss_mlp": 1.01776814, + "epoch": 0.9815427162868995, + "flos": 25007185584000.0, + "grad_norm": 1.6944441783424107, + "language_loss": 0.76776028, + "learning_rate": 3.5500759819115934e-09, + "loss": 0.78918231, + "num_input_tokens_seen": 176198345, + "step": 8163, + "time_per_iteration": 2.5474624633789062 + }, + { + "auxiliary_loss_clip": 0.01170396, + "auxiliary_loss_mlp": 0.01022016, + "balance_loss_clip": 1.0495863, + "balance_loss_mlp": 1.01456189, + "epoch": 0.9816629591775387, + "flos": 20662604887680.0, + "grad_norm": 1.981944227481615, + "language_loss": 0.8076272, + "learning_rate": 3.5038349247094034e-09, + "loss": 0.82955128, + "num_input_tokens_seen": 176215605, + "step": 8164, + "time_per_iteration": 2.4154601097106934 + }, + { + "auxiliary_loss_clip": 0.0113811, + "auxiliary_loss_mlp": 0.01023825, + "balance_loss_clip": 1.04167295, + "balance_loss_mlp": 1.01649332, + "epoch": 0.9817832020681777, + "flos": 17712220636800.0, + "grad_norm": 2.269312831297346, + "language_loss": 0.77536494, + "learning_rate": 3.4578967343878994e-09, + "loss": 0.79698431, + "num_input_tokens_seen": 176231810, + "step": 8165, + "time_per_iteration": 2.453925132751465 + }, + { + "auxiliary_loss_clip": 0.01139299, + "auxiliary_loss_mlp": 0.01025645, + "balance_loss_clip": 1.04472971, + "balance_loss_mlp": 1.01857853, + "epoch": 0.9819034449588168, + "flos": 22530032040960.0, + "grad_norm": 3.2797374116411673, + "language_loss": 0.80718303, + "learning_rate": 3.4122614179161733e-09, + "loss": 0.82883251, + "num_input_tokens_seen": 176251770, + "step": 8166, + "time_per_iteration": 2.508246898651123 + }, + { + "auxiliary_loss_clip": 0.01112353, + "auxiliary_loss_mlp": 0.01022173, + "balance_loss_clip": 1.03907728, + "balance_loss_mlp": 1.0149852, + "epoch": 0.9820236878494559, + "flos": 20011221699840.0, + "grad_norm": 1.999599735952239, + "language_loss": 0.78169918, + "learning_rate": 3.36692898221691e-09, + "loss": 0.80304444, + "num_input_tokens_seen": 176270135, + "step": 8167, + "time_per_iteration": 2.5226492881774902 + }, + { + "auxiliary_loss_clip": 0.01153789, + "auxiliary_loss_mlp": 0.01023253, + "balance_loss_clip": 1.04483557, + "balance_loss_mlp": 1.01647925, + "epoch": 0.982143930740095, + "flos": 18807316531200.0, + "grad_norm": 1.8474937447287842, + "language_loss": 0.73353428, + "learning_rate": 3.3218994341668305e-09, + "loss": 0.75530469, + "num_input_tokens_seen": 176289065, + "step": 8168, + "time_per_iteration": 2.4744274616241455 + }, + { + "auxiliary_loss_clip": 0.01167521, + "auxiliary_loss_mlp": 0.01025085, + "balance_loss_clip": 1.04940581, + "balance_loss_mlp": 1.01814699, + "epoch": 0.982264173630734, + "flos": 26578026138240.0, + "grad_norm": 1.5962770190368758, + "language_loss": 0.75347692, + "learning_rate": 3.2771727805971373e-09, + "loss": 0.77540302, + "num_input_tokens_seen": 176310450, + "step": 8169, + "time_per_iteration": 2.4732041358947754 + }, + { + "auxiliary_loss_clip": 0.01105122, + "auxiliary_loss_mlp": 0.01021798, + "balance_loss_clip": 1.03749192, + "balance_loss_mlp": 1.01421595, + "epoch": 0.9823844165213732, + "flos": 22014462176640.0, + "grad_norm": 1.9787952770170127, + "language_loss": 0.77071601, + "learning_rate": 3.232749028292847e-09, + "loss": 0.79198527, + "num_input_tokens_seen": 176327415, + "step": 8170, + "time_per_iteration": 2.5437276363372803 + }, + { + "auxiliary_loss_clip": 0.01169119, + "auxiliary_loss_mlp": 0.01027995, + "balance_loss_clip": 1.04708171, + "balance_loss_mlp": 1.02042198, + "epoch": 0.9825046594120123, + "flos": 21908166854400.0, + "grad_norm": 1.6525139824819888, + "language_loss": 0.88370413, + "learning_rate": 3.188628183992792e-09, + "loss": 0.90567529, + "num_input_tokens_seen": 176347680, + "step": 8171, + "time_per_iteration": 2.435896635055542 + }, + { + "auxiliary_loss_clip": 0.01054984, + "auxiliary_loss_mlp": 0.01000885, + "balance_loss_clip": 1.00766802, + "balance_loss_mlp": 0.99987775, + "epoch": 0.9826249023026513, + "flos": 59494610718720.0, + "grad_norm": 0.7405730586770035, + "language_loss": 0.62525129, + "learning_rate": 3.1448102543902844e-09, + "loss": 0.64581001, + "num_input_tokens_seen": 176411595, + "step": 8172, + "time_per_iteration": 3.793273687362671 + }, + { + "auxiliary_loss_clip": 0.01134086, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.04519892, + "balance_loss_mlp": 1.0224359, + "epoch": 0.9827451451932905, + "flos": 16071031296000.0, + "grad_norm": 2.103149103405032, + "language_loss": 0.67701638, + "learning_rate": 3.1012952461324515e-09, + "loss": 0.69865954, + "num_input_tokens_seen": 176430570, + "step": 8173, + "time_per_iteration": 3.2540249824523926 + }, + { + "auxiliary_loss_clip": 0.0115194, + "auxiliary_loss_mlp": 0.01028591, + "balance_loss_clip": 1.04881907, + "balance_loss_mlp": 1.0209856, + "epoch": 0.9828653880839295, + "flos": 20262775622400.0, + "grad_norm": 2.5425599933409457, + "language_loss": 0.7420193, + "learning_rate": 3.0580831658204575e-09, + "loss": 0.76382458, + "num_input_tokens_seen": 176448150, + "step": 8174, + "time_per_iteration": 3.1652028560638428 + }, + { + "auxiliary_loss_clip": 0.01152708, + "auxiliary_loss_mlp": 0.01022821, + "balance_loss_clip": 1.04737592, + "balance_loss_mlp": 1.0160408, + "epoch": 0.9829856309745686, + "flos": 21616141282560.0, + "grad_norm": 1.6446993045416693, + "language_loss": 0.7803095, + "learning_rate": 3.015174020009281e-09, + "loss": 0.80206478, + "num_input_tokens_seen": 176467475, + "step": 8175, + "time_per_iteration": 2.4740188121795654 + }, + { + "auxiliary_loss_clip": 0.01129176, + "auxiliary_loss_mlp": 0.01022693, + "balance_loss_clip": 1.04302573, + "balance_loss_mlp": 1.0160563, + "epoch": 0.9831058738652078, + "flos": 23764209396480.0, + "grad_norm": 2.6432060662119836, + "language_loss": 0.75161266, + "learning_rate": 2.9725678152086043e-09, + "loss": 0.77313131, + "num_input_tokens_seen": 176486045, + "step": 8176, + "time_per_iteration": 2.5520987510681152 + }, + { + "auxiliary_loss_clip": 0.01127071, + "auxiliary_loss_mlp": 0.010241, + "balance_loss_clip": 1.04210556, + "balance_loss_mlp": 1.01679873, + "epoch": 0.9832261167558468, + "flos": 11320911072000.0, + "grad_norm": 2.6539414917279105, + "language_loss": 0.82363844, + "learning_rate": 2.930264557881257e-09, + "loss": 0.84515017, + "num_input_tokens_seen": 176501230, + "step": 8177, + "time_per_iteration": 3.250739812850952 + }, + { + "auxiliary_loss_clip": 0.01063142, + "auxiliary_loss_mlp": 0.01000865, + "balance_loss_clip": 1.00751042, + "balance_loss_mlp": 0.99987555, + "epoch": 0.9833463596464859, + "flos": 60000304343040.0, + "grad_norm": 0.9424134243566, + "language_loss": 0.58218217, + "learning_rate": 2.8882642544452163e-09, + "loss": 0.6028223, + "num_input_tokens_seen": 176565955, + "step": 8178, + "time_per_iteration": 3.053947687149048 + }, + { + "auxiliary_loss_clip": 0.01129279, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.04337764, + "balance_loss_mlp": 1.01846933, + "epoch": 0.983466602537125, + "flos": 13626699805440.0, + "grad_norm": 2.181613500029796, + "language_loss": 0.74239612, + "learning_rate": 2.8465669112716083e-09, + "loss": 0.7639488, + "num_input_tokens_seen": 176583480, + "step": 8179, + "time_per_iteration": 2.489365816116333 + }, + { + "auxiliary_loss_clip": 0.01154545, + "auxiliary_loss_mlp": 0.00762173, + "balance_loss_clip": 1.0444057, + "balance_loss_mlp": 1.00056422, + "epoch": 0.9835868454277641, + "flos": 22926844563840.0, + "grad_norm": 1.8118605699347958, + "language_loss": 0.76164156, + "learning_rate": 2.8051725346858177e-09, + "loss": 0.78080875, + "num_input_tokens_seen": 176603740, + "step": 8180, + "time_per_iteration": 2.510066509246826 + }, + { + "auxiliary_loss_clip": 0.01168369, + "auxiliary_loss_mlp": 0.01025906, + "balance_loss_clip": 1.04571974, + "balance_loss_mlp": 1.01843452, + "epoch": 0.9837070883184031, + "flos": 27673409341440.0, + "grad_norm": 2.1186569623516887, + "language_loss": 0.70937723, + "learning_rate": 2.7640811309674883e-09, + "loss": 0.73132002, + "num_input_tokens_seen": 176623240, + "step": 8181, + "time_per_iteration": 2.5203425884246826 + }, + { + "auxiliary_loss_clip": 0.01115827, + "auxiliary_loss_mlp": 0.01021551, + "balance_loss_clip": 1.04139018, + "balance_loss_mlp": 1.01436841, + "epoch": 0.9838273312090423, + "flos": 29241951425280.0, + "grad_norm": 1.5691790401033294, + "language_loss": 0.8081367, + "learning_rate": 2.7232927063498557e-09, + "loss": 0.82951051, + "num_input_tokens_seen": 176643615, + "step": 8182, + "time_per_iteration": 2.5776140689849854 + }, + { + "auxiliary_loss_clip": 0.01158963, + "auxiliary_loss_mlp": 0.01022436, + "balance_loss_clip": 1.0478673, + "balance_loss_mlp": 1.01527739, + "epoch": 0.9839475740996814, + "flos": 40110207304320.0, + "grad_norm": 2.12714511432501, + "language_loss": 0.68979353, + "learning_rate": 2.682807267020859e-09, + "loss": 0.71160746, + "num_input_tokens_seen": 176666375, + "step": 8183, + "time_per_iteration": 2.6215717792510986 + }, + { + "auxiliary_loss_clip": 0.01153944, + "auxiliary_loss_mlp": 0.01022759, + "balance_loss_clip": 1.04614604, + "balance_loss_mlp": 1.01503372, + "epoch": 0.9840678169903204, + "flos": 24169389788160.0, + "grad_norm": 1.8870439494420332, + "language_loss": 0.62480426, + "learning_rate": 2.642624819121808e-09, + "loss": 0.64657128, + "num_input_tokens_seen": 176686525, + "step": 8184, + "time_per_iteration": 2.4833874702453613 + }, + { + "auxiliary_loss_clip": 0.01137339, + "auxiliary_loss_mlp": 0.0102681, + "balance_loss_clip": 1.04557943, + "balance_loss_mlp": 1.01966918, + "epoch": 0.9841880598809596, + "flos": 14684484447360.0, + "grad_norm": 2.083227674020249, + "language_loss": 0.61510849, + "learning_rate": 2.6027453687487154e-09, + "loss": 0.63674998, + "num_input_tokens_seen": 176703615, + "step": 8185, + "time_per_iteration": 2.466007947921753 + }, + { + "auxiliary_loss_clip": 0.01139663, + "auxiliary_loss_mlp": 0.01023353, + "balance_loss_clip": 1.04467857, + "balance_loss_mlp": 1.01589036, + "epoch": 0.9843083027715986, + "flos": 22344768668160.0, + "grad_norm": 2.316192554184271, + "language_loss": 0.54062122, + "learning_rate": 2.5631689219509643e-09, + "loss": 0.56225133, + "num_input_tokens_seen": 176722295, + "step": 8186, + "time_per_iteration": 2.4768857955932617 + }, + { + "auxiliary_loss_clip": 0.01142113, + "auxiliary_loss_mlp": 0.0102294, + "balance_loss_clip": 1.04661727, + "balance_loss_mlp": 1.01603174, + "epoch": 0.9844285456622377, + "flos": 21800111765760.0, + "grad_norm": 1.7250016961083863, + "language_loss": 0.83351791, + "learning_rate": 2.523895484732197e-09, + "loss": 0.85516846, + "num_input_tokens_seen": 176741750, + "step": 8187, + "time_per_iteration": 2.492861270904541 + }, + { + "auxiliary_loss_clip": 0.01159712, + "auxiliary_loss_mlp": 0.0102265, + "balance_loss_clip": 1.0462358, + "balance_loss_mlp": 1.01466584, + "epoch": 0.9845487885528769, + "flos": 18035380321920.0, + "grad_norm": 2.110367721231472, + "language_loss": 0.74481094, + "learning_rate": 2.4849250630505357e-09, + "loss": 0.76663458, + "num_input_tokens_seen": 176759995, + "step": 8188, + "time_per_iteration": 2.4516735076904297 + }, + { + "auxiliary_loss_clip": 0.01070586, + "auxiliary_loss_mlp": 0.0102496, + "balance_loss_clip": 1.03611231, + "balance_loss_mlp": 1.01754487, + "epoch": 0.9846690314435159, + "flos": 25228610974080.0, + "grad_norm": 2.4029296156069693, + "language_loss": 0.73222101, + "learning_rate": 2.4462576628172528e-09, + "loss": 0.75317651, + "num_input_tokens_seen": 176778625, + "step": 8189, + "time_per_iteration": 2.6452271938323975 + }, + { + "auxiliary_loss_clip": 0.01149484, + "auxiliary_loss_mlp": 0.01027628, + "balance_loss_clip": 1.04573083, + "balance_loss_mlp": 1.01982868, + "epoch": 0.984789274334155, + "flos": 18552171248640.0, + "grad_norm": 5.94647970010407, + "language_loss": 0.74075502, + "learning_rate": 2.407893289898766e-09, + "loss": 0.76252615, + "num_input_tokens_seen": 176797655, + "step": 8190, + "time_per_iteration": 2.4488613605499268 + }, + { + "auxiliary_loss_clip": 0.01116072, + "auxiliary_loss_mlp": 0.0102173, + "balance_loss_clip": 1.03946376, + "balance_loss_mlp": 1.01419032, + "epoch": 0.984909517224794, + "flos": 27345437233920.0, + "grad_norm": 1.9523306553040032, + "language_loss": 0.83909845, + "learning_rate": 2.3698319501144202e-09, + "loss": 0.86047649, + "num_input_tokens_seen": 176818640, + "step": 8191, + "time_per_iteration": 2.5712978839874268 + }, + { + "auxiliary_loss_clip": 0.011596, + "auxiliary_loss_mlp": 0.0102697, + "balance_loss_clip": 1.04564404, + "balance_loss_mlp": 1.01894093, + "epoch": 0.9850297601154332, + "flos": 18734058743040.0, + "grad_norm": 1.7623163354538098, + "language_loss": 0.73508132, + "learning_rate": 2.3320736492382644e-09, + "loss": 0.75694698, + "num_input_tokens_seen": 176837475, + "step": 8192, + "time_per_iteration": 2.433499574661255 + }, + { + "auxiliary_loss_clip": 0.01165369, + "auxiliary_loss_mlp": 0.0102672, + "balance_loss_clip": 1.04699659, + "balance_loss_mlp": 1.01972842, + "epoch": 0.9851500030060723, + "flos": 22308247514880.0, + "grad_norm": 1.5889813499416583, + "language_loss": 0.68245631, + "learning_rate": 2.29461839299816e-09, + "loss": 0.70437729, + "num_input_tokens_seen": 176857190, + "step": 8193, + "time_per_iteration": 2.4764187335968018 + }, + { + "auxiliary_loss_clip": 0.01127014, + "auxiliary_loss_mlp": 0.0101929, + "balance_loss_clip": 1.04250312, + "balance_loss_mlp": 1.01216102, + "epoch": 0.9852702458967113, + "flos": 26353691746560.0, + "grad_norm": 1.640274470743587, + "language_loss": 0.79916048, + "learning_rate": 2.257466187076229e-09, + "loss": 0.82062352, + "num_input_tokens_seen": 176876395, + "step": 8194, + "time_per_iteration": 2.5692710876464844 + }, + { + "auxiliary_loss_clip": 0.01157269, + "auxiliary_loss_mlp": 0.00761782, + "balance_loss_clip": 1.04461396, + "balance_loss_mlp": 1.00057006, + "epoch": 0.9853904887873505, + "flos": 20883599314560.0, + "grad_norm": 1.8695159273073916, + "language_loss": 0.71045637, + "learning_rate": 2.2206170371081854e-09, + "loss": 0.72964692, + "num_input_tokens_seen": 176894980, + "step": 8195, + "time_per_iteration": 2.4878017902374268 + }, + { + "auxiliary_loss_clip": 0.01139681, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.04299355, + "balance_loss_mlp": 1.02083778, + "epoch": 0.9855107316779895, + "flos": 25263444188160.0, + "grad_norm": 1.683838961162754, + "language_loss": 0.8471719, + "learning_rate": 2.1840709486842247e-09, + "loss": 0.86885917, + "num_input_tokens_seen": 176914600, + "step": 8196, + "time_per_iteration": 2.51877498626709 + }, + { + "auxiliary_loss_clip": 0.01131839, + "auxiliary_loss_mlp": 0.01026357, + "balance_loss_clip": 1.04316044, + "balance_loss_mlp": 1.01894259, + "epoch": 0.9856309745686286, + "flos": 19062102677760.0, + "grad_norm": 2.1508400627968323, + "language_loss": 0.79172707, + "learning_rate": 2.1478279273481335e-09, + "loss": 0.81330907, + "num_input_tokens_seen": 176933085, + "step": 8197, + "time_per_iteration": 2.507251262664795 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01025833, + "balance_loss_clip": 1.04827464, + "balance_loss_mlp": 1.01867461, + "epoch": 0.9857512174592677, + "flos": 34130758060800.0, + "grad_norm": 2.242227059336327, + "language_loss": 0.79769909, + "learning_rate": 2.1118879785981815e-09, + "loss": 0.8195008, + "num_input_tokens_seen": 176953225, + "step": 8198, + "time_per_iteration": 2.5707337856292725 + }, + { + "auxiliary_loss_clip": 0.011371, + "auxiliary_loss_mlp": 0.01025325, + "balance_loss_clip": 1.04412341, + "balance_loss_mlp": 1.01851821, + "epoch": 0.9858714603499068, + "flos": 25994693266560.0, + "grad_norm": 1.6365242058092058, + "language_loss": 0.79326862, + "learning_rate": 2.0762511078862288e-09, + "loss": 0.81489289, + "num_input_tokens_seen": 176973570, + "step": 8199, + "time_per_iteration": 3.2358992099761963 + }, + { + "auxiliary_loss_clip": 0.01144437, + "auxiliary_loss_mlp": 0.01022499, + "balance_loss_clip": 1.04264975, + "balance_loss_mlp": 1.01543021, + "epoch": 0.9859917032405459, + "flos": 23696230907520.0, + "grad_norm": 1.902136182958391, + "language_loss": 0.64696813, + "learning_rate": 2.0409173206186183e-09, + "loss": 0.66863751, + "num_input_tokens_seen": 176992810, + "step": 8200, + "time_per_iteration": 3.278679370880127 + }, + { + "auxiliary_loss_clip": 0.01125019, + "auxiliary_loss_mlp": 0.01023472, + "balance_loss_clip": 1.04659665, + "balance_loss_mlp": 1.01633763, + "epoch": 0.986111946131185, + "flos": 19938287134080.0, + "grad_norm": 2.7169942135067204, + "language_loss": 0.87131417, + "learning_rate": 2.0058866221550617e-09, + "loss": 0.89279902, + "num_input_tokens_seen": 177011050, + "step": 8201, + "time_per_iteration": 2.5061678886413574 + }, + { + "auxiliary_loss_clip": 0.01165332, + "auxiliary_loss_mlp": 0.01022776, + "balance_loss_clip": 1.0441165, + "balance_loss_mlp": 1.01553988, + "epoch": 0.9862321890218241, + "flos": 19828831415040.0, + "grad_norm": 2.3074101586144926, + "language_loss": 0.74943882, + "learning_rate": 1.971159017809976e-09, + "loss": 0.77131987, + "num_input_tokens_seen": 177029340, + "step": 8202, + "time_per_iteration": 2.4736013412475586 + }, + { + "auxiliary_loss_clip": 0.01152386, + "auxiliary_loss_mlp": 0.01024899, + "balance_loss_clip": 1.04575586, + "balance_loss_mlp": 1.01700783, + "epoch": 0.9863524319124631, + "flos": 21652051904640.0, + "grad_norm": 2.0748808664415073, + "language_loss": 0.78066123, + "learning_rate": 1.93673451285159e-09, + "loss": 0.80243409, + "num_input_tokens_seen": 177048390, + "step": 8203, + "time_per_iteration": 2.4382591247558594 + }, + { + "auxiliary_loss_clip": 0.0104657, + "auxiliary_loss_mlp": 0.01001876, + "balance_loss_clip": 1.00820494, + "balance_loss_mlp": 1.0008744, + "epoch": 0.9864726748031023, + "flos": 52769977920000.0, + "grad_norm": 0.7337858627096866, + "language_loss": 0.56543422, + "learning_rate": 1.9026131125019495e-09, + "loss": 0.58591866, + "num_input_tokens_seen": 177105760, + "step": 8204, + "time_per_iteration": 3.7532331943511963 + }, + { + "auxiliary_loss_clip": 0.01148622, + "auxiliary_loss_mlp": 0.01021127, + "balance_loss_clip": 1.04632998, + "balance_loss_mlp": 1.01402521, + "epoch": 0.9865929176937414, + "flos": 23364631526400.0, + "grad_norm": 1.840331795077079, + "language_loss": 0.86984956, + "learning_rate": 1.8687948219371363e-09, + "loss": 0.89154702, + "num_input_tokens_seen": 177124985, + "step": 8205, + "time_per_iteration": 2.4769561290740967 + }, + { + "auxiliary_loss_clip": 0.01171582, + "auxiliary_loss_mlp": 0.01027192, + "balance_loss_clip": 1.04658926, + "balance_loss_mlp": 1.01956248, + "epoch": 0.9867131605843804, + "flos": 21616679986560.0, + "grad_norm": 1.9297934310528602, + "language_loss": 0.88755941, + "learning_rate": 1.835279646287491e-09, + "loss": 0.90954715, + "num_input_tokens_seen": 177142995, + "step": 8206, + "time_per_iteration": 2.405608892440796 + }, + { + "auxiliary_loss_clip": 0.01159581, + "auxiliary_loss_mlp": 0.01030511, + "balance_loss_clip": 1.04690027, + "balance_loss_mlp": 1.022995, + "epoch": 0.9868334034750196, + "flos": 22271403139200.0, + "grad_norm": 1.922046770400577, + "language_loss": 0.76199174, + "learning_rate": 1.8020675906371685e-09, + "loss": 0.78389263, + "num_input_tokens_seen": 177162390, + "step": 8207, + "time_per_iteration": 2.46624493598938 + }, + { + "auxiliary_loss_clip": 0.01107297, + "auxiliary_loss_mlp": 0.0102555, + "balance_loss_clip": 1.03986263, + "balance_loss_mlp": 1.01867151, + "epoch": 0.9869536463656586, + "flos": 25809573548160.0, + "grad_norm": 1.8591490625332225, + "language_loss": 0.74900973, + "learning_rate": 1.7691586600243612e-09, + "loss": 0.77033818, + "num_input_tokens_seen": 177181290, + "step": 8208, + "time_per_iteration": 2.5776174068450928 + }, + { + "auxiliary_loss_clip": 0.01137909, + "auxiliary_loss_mlp": 0.01022097, + "balance_loss_clip": 1.04716575, + "balance_loss_mlp": 1.01433074, + "epoch": 0.9870738892562977, + "flos": 16398500613120.0, + "grad_norm": 2.857375820502851, + "language_loss": 0.86851382, + "learning_rate": 1.7365528594415202e-09, + "loss": 0.89011389, + "num_input_tokens_seen": 177195360, + "step": 8209, + "time_per_iteration": 2.4374730587005615 + }, + { + "auxiliary_loss_clip": 0.01157532, + "auxiliary_loss_mlp": 0.00762375, + "balance_loss_clip": 1.0451448, + "balance_loss_mlp": 1.00056338, + "epoch": 0.9871941321469369, + "flos": 35481358373760.0, + "grad_norm": 2.6194458169615724, + "language_loss": 0.67598844, + "learning_rate": 1.7042501938346888e-09, + "loss": 0.69518751, + "num_input_tokens_seen": 177218090, + "step": 8210, + "time_per_iteration": 2.583091974258423 + }, + { + "auxiliary_loss_clip": 0.0112689, + "auxiliary_loss_mlp": 0.01023043, + "balance_loss_clip": 1.03915417, + "balance_loss_mlp": 1.01556849, + "epoch": 0.9873143750375759, + "flos": 21434217874560.0, + "grad_norm": 2.0494536979948927, + "language_loss": 0.7625469, + "learning_rate": 1.6722506681043913e-09, + "loss": 0.78404623, + "num_input_tokens_seen": 177237050, + "step": 8211, + "time_per_iteration": 2.486260175704956 + }, + { + "auxiliary_loss_clip": 0.01142552, + "auxiliary_loss_mlp": 0.01026756, + "balance_loss_clip": 1.04444838, + "balance_loss_mlp": 1.01977015, + "epoch": 0.987434617928215, + "flos": 16326499800960.0, + "grad_norm": 2.1870468861861596, + "language_loss": 0.69389594, + "learning_rate": 1.640554287104745e-09, + "loss": 0.71558899, + "num_input_tokens_seen": 177255325, + "step": 8212, + "time_per_iteration": 2.464951276779175 + }, + { + "auxiliary_loss_clip": 0.01126627, + "auxiliary_loss_mlp": 0.01023244, + "balance_loss_clip": 1.03894484, + "balance_loss_mlp": 1.01571047, + "epoch": 0.9875548608188541, + "flos": 17851984456320.0, + "grad_norm": 4.121327741680285, + "language_loss": 0.80409837, + "learning_rate": 1.609161055644348e-09, + "loss": 0.82559705, + "num_input_tokens_seen": 177271250, + "step": 8213, + "time_per_iteration": 2.4860217571258545 + }, + { + "auxiliary_loss_clip": 0.01160877, + "auxiliary_loss_mlp": 0.01023398, + "balance_loss_clip": 1.04615378, + "balance_loss_mlp": 1.01605511, + "epoch": 0.9876751037094932, + "flos": 26132876887680.0, + "grad_norm": 1.964416167054375, + "language_loss": 0.68160248, + "learning_rate": 1.5780709784849467e-09, + "loss": 0.7034452, + "num_input_tokens_seen": 177288270, + "step": 8214, + "time_per_iteration": 2.5204732418060303 + }, + { + "auxiliary_loss_clip": 0.01102972, + "auxiliary_loss_mlp": 0.01027213, + "balance_loss_clip": 1.04415894, + "balance_loss_mlp": 1.01965499, + "epoch": 0.9877953466001322, + "flos": 15991344973440.0, + "grad_norm": 1.8969398109921847, + "language_loss": 0.82171023, + "learning_rate": 1.5472840603436565e-09, + "loss": 0.84301209, + "num_input_tokens_seen": 177305500, + "step": 8215, + "time_per_iteration": 2.5644190311431885 + }, + { + "auxiliary_loss_clip": 0.01145389, + "auxiliary_loss_mlp": 0.01025093, + "balance_loss_clip": 1.04663384, + "balance_loss_mlp": 1.01776135, + "epoch": 0.9879155894907714, + "flos": 18806777827200.0, + "grad_norm": 2.759621478740113, + "language_loss": 0.78138304, + "learning_rate": 1.5168003058900757e-09, + "loss": 0.80308783, + "num_input_tokens_seen": 177323500, + "step": 8216, + "time_per_iteration": 2.4678776264190674 + }, + { + "auxiliary_loss_clip": 0.01124784, + "auxiliary_loss_mlp": 0.01029395, + "balance_loss_clip": 1.04123402, + "balance_loss_mlp": 1.02238512, + "epoch": 0.9880358323814105, + "flos": 22382044007040.0, + "grad_norm": 2.002049369338471, + "language_loss": 0.92064029, + "learning_rate": 1.4866197197491715e-09, + "loss": 0.94218206, + "num_input_tokens_seen": 177342860, + "step": 8217, + "time_per_iteration": 2.5309698581695557 + }, + { + "auxiliary_loss_clip": 0.01158869, + "auxiliary_loss_mlp": 0.00763021, + "balance_loss_clip": 1.04690528, + "balance_loss_mlp": 1.00059593, + "epoch": 0.9881560752720495, + "flos": 15668831733120.0, + "grad_norm": 3.2521073249387955, + "language_loss": 0.78376389, + "learning_rate": 1.4567423064988371e-09, + "loss": 0.80298281, + "num_input_tokens_seen": 177360210, + "step": 8218, + "time_per_iteration": 2.489686965942383 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.0102488, + "balance_loss_clip": 1.04597735, + "balance_loss_mlp": 1.01719642, + "epoch": 0.9882763181626887, + "flos": 21500113374720.0, + "grad_norm": 1.9479579485016307, + "language_loss": 0.77967119, + "learning_rate": 1.4271680706718913e-09, + "loss": 0.80159736, + "num_input_tokens_seen": 177377885, + "step": 8219, + "time_per_iteration": 2.410869598388672 + }, + { + "auxiliary_loss_clip": 0.01159815, + "auxiliary_loss_mlp": 0.0102855, + "balance_loss_clip": 1.04918253, + "balance_loss_mlp": 1.0207057, + "epoch": 0.9883965610533277, + "flos": 28034598551040.0, + "grad_norm": 2.435637914108635, + "language_loss": 0.82488263, + "learning_rate": 1.3978970167543013e-09, + "loss": 0.84676623, + "num_input_tokens_seen": 177398065, + "step": 8220, + "time_per_iteration": 2.515953302383423 + }, + { + "auxiliary_loss_clip": 0.01131905, + "auxiliary_loss_mlp": 0.01026815, + "balance_loss_clip": 1.04370368, + "balance_loss_mlp": 1.01919198, + "epoch": 0.9885168039439668, + "flos": 14098601710080.0, + "grad_norm": 3.3638123299716054, + "language_loss": 0.77643692, + "learning_rate": 1.3689291491867372e-09, + "loss": 0.79802418, + "num_input_tokens_seen": 177416380, + "step": 8221, + "time_per_iteration": 2.464137554168701 + }, + { + "auxiliary_loss_clip": 0.01169206, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.04685903, + "balance_loss_mlp": 1.02396142, + "epoch": 0.988637046834606, + "flos": 26432013352320.0, + "grad_norm": 2.4788698007333143, + "language_loss": 0.73647016, + "learning_rate": 1.3402644723636836e-09, + "loss": 0.75848031, + "num_input_tokens_seen": 177438410, + "step": 8222, + "time_per_iteration": 2.4789505004882812 + }, + { + "auxiliary_loss_clip": 0.01135627, + "auxiliary_loss_mlp": 0.01024933, + "balance_loss_clip": 1.04691935, + "balance_loss_mlp": 1.01778054, + "epoch": 0.988757289725245, + "flos": 25229113764480.0, + "grad_norm": 2.76177580663477, + "language_loss": 0.83658719, + "learning_rate": 1.311902990633218e-09, + "loss": 0.8581928, + "num_input_tokens_seen": 177457375, + "step": 8223, + "time_per_iteration": 2.5508809089660645 + }, + { + "auxiliary_loss_clip": 0.01132524, + "auxiliary_loss_mlp": 0.01020269, + "balance_loss_clip": 1.04063988, + "balance_loss_mlp": 1.0136205, + "epoch": 0.9888775326158841, + "flos": 26359042872960.0, + "grad_norm": 1.637776053344963, + "language_loss": 0.71448207, + "learning_rate": 1.2838447082978987e-09, + "loss": 0.73600996, + "num_input_tokens_seen": 177478530, + "step": 8224, + "time_per_iteration": 2.5532851219177246 + }, + { + "auxiliary_loss_clip": 0.0115102, + "auxiliary_loss_mlp": 0.01024329, + "balance_loss_clip": 1.04477394, + "balance_loss_mlp": 1.01656294, + "epoch": 0.9889977755065231, + "flos": 24316120846080.0, + "grad_norm": 3.356412700405251, + "language_loss": 0.8296752, + "learning_rate": 1.2560896296143208e-09, + "loss": 0.85142869, + "num_input_tokens_seen": 177496995, + "step": 8225, + "time_per_iteration": 2.4576194286346436 + }, + { + "auxiliary_loss_clip": 0.01168093, + "auxiliary_loss_mlp": 0.01023241, + "balance_loss_clip": 1.04659653, + "balance_loss_mlp": 1.01608872, + "epoch": 0.9891180183971623, + "flos": 18951066760320.0, + "grad_norm": 2.5723688176599513, + "language_loss": 0.82244617, + "learning_rate": 1.2286377587926722e-09, + "loss": 0.84435952, + "num_input_tokens_seen": 177513785, + "step": 8226, + "time_per_iteration": 3.2042288780212402 + }, + { + "auxiliary_loss_clip": 0.01167024, + "auxiliary_loss_mlp": 0.01022997, + "balance_loss_clip": 1.04633772, + "balance_loss_mlp": 1.01540303, + "epoch": 0.9892382612878013, + "flos": 26176580760960.0, + "grad_norm": 2.3339034959642833, + "language_loss": 0.74881291, + "learning_rate": 1.2014890999973992e-09, + "loss": 0.77071309, + "num_input_tokens_seen": 177530705, + "step": 8227, + "time_per_iteration": 3.9138870239257812 + }, + { + "auxiliary_loss_clip": 0.01165275, + "auxiliary_loss_mlp": 0.01025571, + "balance_loss_clip": 1.04529214, + "balance_loss_mlp": 1.01867449, + "epoch": 0.9893585041784404, + "flos": 25449605400960.0, + "grad_norm": 2.1256205645051933, + "language_loss": 0.78385609, + "learning_rate": 1.1746436573472073e-09, + "loss": 0.80576456, + "num_input_tokens_seen": 177552440, + "step": 8228, + "time_per_iteration": 2.473114490509033 + }, + { + "auxiliary_loss_clip": 0.01147716, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.04469502, + "balance_loss_mlp": 1.01979291, + "epoch": 0.9894787470690796, + "flos": 20189302352640.0, + "grad_norm": 1.923150308749261, + "language_loss": 0.69419312, + "learning_rate": 1.1481014349141726e-09, + "loss": 0.71594191, + "num_input_tokens_seen": 177569660, + "step": 8229, + "time_per_iteration": 2.4942822456359863 + }, + { + "auxiliary_loss_clip": 0.01140172, + "auxiliary_loss_mlp": 0.01023164, + "balance_loss_clip": 1.04441524, + "balance_loss_mlp": 1.01477814, + "epoch": 0.9895989899597186, + "flos": 24644308435200.0, + "grad_norm": 1.8066284364469956, + "language_loss": 0.84529001, + "learning_rate": 1.121862436724852e-09, + "loss": 0.86692333, + "num_input_tokens_seen": 177588500, + "step": 8230, + "time_per_iteration": 2.522102117538452 + }, + { + "auxiliary_loss_clip": 0.01155987, + "auxiliary_loss_mlp": 0.01027387, + "balance_loss_clip": 1.0490582, + "balance_loss_mlp": 1.01996934, + "epoch": 0.9897192328503577, + "flos": 21799034357760.0, + "grad_norm": 1.5973946056242037, + "language_loss": 0.70528162, + "learning_rate": 1.0959266667598388e-09, + "loss": 0.72711533, + "num_input_tokens_seen": 177607315, + "step": 8231, + "time_per_iteration": 3.226381778717041 + }, + { + "auxiliary_loss_clip": 0.0113101, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.04618359, + "balance_loss_mlp": 1.02125514, + "epoch": 0.9898394757409968, + "flos": 21325229032320.0, + "grad_norm": 2.201739118746864, + "language_loss": 0.74335742, + "learning_rate": 1.0702941289533196e-09, + "loss": 0.76496077, + "num_input_tokens_seen": 177625990, + "step": 8232, + "time_per_iteration": 2.5317962169647217 + }, + { + "auxiliary_loss_clip": 0.01125063, + "auxiliary_loss_mlp": 0.01024166, + "balance_loss_clip": 1.04379499, + "balance_loss_mlp": 1.01756191, + "epoch": 0.9899597186316359, + "flos": 18545024442240.0, + "grad_norm": 2.8896096533815006, + "language_loss": 0.88576066, + "learning_rate": 1.0449648271939615e-09, + "loss": 0.90725291, + "num_input_tokens_seen": 177642335, + "step": 8233, + "time_per_iteration": 2.495853900909424 + }, + { + "auxiliary_loss_clip": 0.01115876, + "auxiliary_loss_mlp": 0.00762177, + "balance_loss_clip": 1.04407287, + "balance_loss_mlp": 1.00062859, + "epoch": 0.990079961522275, + "flos": 23766723348480.0, + "grad_norm": 2.76041777860802, + "language_loss": 0.72725701, + "learning_rate": 1.0199387653240243e-09, + "loss": 0.7460376, + "num_input_tokens_seen": 177662025, + "step": 8234, + "time_per_iteration": 2.577036142349243 + }, + { + "auxiliary_loss_clip": 0.01133392, + "auxiliary_loss_mlp": 0.01019879, + "balance_loss_clip": 1.04435647, + "balance_loss_mlp": 1.01281011, + "epoch": 0.9902002044129141, + "flos": 16399182971520.0, + "grad_norm": 2.303659877070589, + "language_loss": 0.70714641, + "learning_rate": 9.952159471400267e-10, + "loss": 0.72867918, + "num_input_tokens_seen": 177679065, + "step": 8235, + "time_per_iteration": 2.477562427520752 + }, + { + "auxiliary_loss_clip": 0.01154572, + "auxiliary_loss_mlp": 0.00762029, + "balance_loss_clip": 1.04453111, + "balance_loss_mlp": 1.00074172, + "epoch": 0.9903204473035532, + "flos": 22559657783040.0, + "grad_norm": 1.7769158677946908, + "language_loss": 0.8412655, + "learning_rate": 9.707963763923022e-10, + "loss": 0.86043155, + "num_input_tokens_seen": 177698115, + "step": 8236, + "time_per_iteration": 2.474961519241333 + }, + { + "auxiliary_loss_clip": 0.01136222, + "auxiliary_loss_mlp": 0.01022745, + "balance_loss_clip": 1.04121029, + "balance_loss_mlp": 1.01517224, + "epoch": 0.9904406901941922, + "flos": 16144001775360.0, + "grad_norm": 1.7839247276405528, + "language_loss": 0.79130411, + "learning_rate": 9.466800567854427e-10, + "loss": 0.81289375, + "num_input_tokens_seen": 177716715, + "step": 8237, + "time_per_iteration": 2.4853551387786865 + }, + { + "auxiliary_loss_clip": 0.01127051, + "auxiliary_loss_mlp": 0.01023145, + "balance_loss_clip": 1.04012048, + "balance_loss_mlp": 1.01534319, + "epoch": 0.9905609330848314, + "flos": 26651499408000.0, + "grad_norm": 2.1268477840871833, + "language_loss": 0.68074143, + "learning_rate": 9.228669919778553e-10, + "loss": 0.70224345, + "num_input_tokens_seen": 177735640, + "step": 8238, + "time_per_iteration": 2.56010103225708 + }, + { + "auxiliary_loss_clip": 0.01135135, + "auxiliary_loss_mlp": 0.01023417, + "balance_loss_clip": 1.0426302, + "balance_loss_mlp": 1.01520371, + "epoch": 0.9906811759754705, + "flos": 23111820627840.0, + "grad_norm": 2.1326355431635693, + "language_loss": 0.79652917, + "learning_rate": 8.993571855817617e-10, + "loss": 0.81811476, + "num_input_tokens_seen": 177754470, + "step": 8239, + "time_per_iteration": 2.4847805500030518 + }, + { + "auxiliary_loss_clip": 0.01153524, + "auxiliary_loss_mlp": 0.01022487, + "balance_loss_clip": 1.04494739, + "balance_loss_mlp": 1.01518583, + "epoch": 0.9908014188661095, + "flos": 22090593052800.0, + "grad_norm": 2.0402561788084532, + "language_loss": 0.74917817, + "learning_rate": 8.761506411638642e-10, + "loss": 0.77093828, + "num_input_tokens_seen": 177773935, + "step": 8240, + "time_per_iteration": 2.4563562870025635 + }, + { + "auxiliary_loss_clip": 0.01136626, + "auxiliary_loss_mlp": 0.01026933, + "balance_loss_clip": 1.04414487, + "balance_loss_mlp": 1.01967907, + "epoch": 0.9909216617567487, + "flos": 19242948677760.0, + "grad_norm": 1.8223575580024955, + "language_loss": 0.73792607, + "learning_rate": 8.53247362244236e-10, + "loss": 0.75956166, + "num_input_tokens_seen": 177792745, + "step": 8241, + "time_per_iteration": 2.482393980026245 + }, + { + "auxiliary_loss_clip": 0.01141039, + "auxiliary_loss_mlp": 0.01023161, + "balance_loss_clip": 1.04505301, + "balance_loss_mlp": 1.01585948, + "epoch": 0.9910419046473877, + "flos": 23621213352960.0, + "grad_norm": 2.440833877725719, + "language_loss": 0.68479657, + "learning_rate": 8.306473522976532e-10, + "loss": 0.70643854, + "num_input_tokens_seen": 177812150, + "step": 8242, + "time_per_iteration": 2.509937047958374 + }, + { + "auxiliary_loss_clip": 0.01168773, + "auxiliary_loss_mlp": 0.01021663, + "balance_loss_clip": 1.04873443, + "balance_loss_mlp": 1.01418018, + "epoch": 0.9911621475380268, + "flos": 22711380831360.0, + "grad_norm": 2.1090548906803446, + "language_loss": 0.71604657, + "learning_rate": 8.083506147522623e-10, + "loss": 0.73795092, + "num_input_tokens_seen": 177831545, + "step": 8243, + "time_per_iteration": 2.477202892303467 + }, + { + "auxiliary_loss_clip": 0.01146445, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.04278612, + "balance_loss_mlp": 1.02246392, + "epoch": 0.991282390428666, + "flos": 13516956777600.0, + "grad_norm": 2.0346413910674284, + "language_loss": 0.85337728, + "learning_rate": 7.863571529906909e-10, + "loss": 0.87513918, + "num_input_tokens_seen": 177847130, + "step": 8244, + "time_per_iteration": 2.4446234703063965 + }, + { + "auxiliary_loss_clip": 0.01054858, + "auxiliary_loss_mlp": 0.01000367, + "balance_loss_clip": 1.00787866, + "balance_loss_mlp": 0.99942535, + "epoch": 0.991402633319305, + "flos": 61830492071040.0, + "grad_norm": 0.7269899223576223, + "language_loss": 0.59655035, + "learning_rate": 7.646669703489372e-10, + "loss": 0.61710262, + "num_input_tokens_seen": 177911440, + "step": 8245, + "time_per_iteration": 3.13742733001709 + }, + { + "auxiliary_loss_clip": 0.01059261, + "auxiliary_loss_mlp": 0.01023264, + "balance_loss_clip": 1.03386903, + "balance_loss_mlp": 1.01592708, + "epoch": 0.9915228762099441, + "flos": 18770148933120.0, + "grad_norm": 1.8167541609638762, + "language_loss": 0.57455891, + "learning_rate": 7.432800701177023e-10, + "loss": 0.59538418, + "num_input_tokens_seen": 177929440, + "step": 8246, + "time_per_iteration": 2.8410513401031494 + }, + { + "auxiliary_loss_clip": 0.01045088, + "auxiliary_loss_mlp": 0.01001357, + "balance_loss_clip": 1.00847054, + "balance_loss_mlp": 1.00036204, + "epoch": 0.9916431191005832, + "flos": 65936660244480.0, + "grad_norm": 0.7841382846919347, + "language_loss": 0.57774627, + "learning_rate": 7.221964555415017e-10, + "loss": 0.59821069, + "num_input_tokens_seen": 177989100, + "step": 8247, + "time_per_iteration": 3.209969997406006 + }, + { + "auxiliary_loss_clip": 0.01136156, + "auxiliary_loss_mlp": 0.010199, + "balance_loss_clip": 1.04417753, + "balance_loss_mlp": 1.01331377, + "epoch": 0.9917633619912223, + "flos": 16581573256320.0, + "grad_norm": 1.9772248721115984, + "language_loss": 0.75016767, + "learning_rate": 7.01416129818222e-10, + "loss": 0.77172828, + "num_input_tokens_seen": 178006720, + "step": 8248, + "time_per_iteration": 2.476958990097046 + }, + { + "auxiliary_loss_clip": 0.01131996, + "auxiliary_loss_mlp": 0.01023931, + "balance_loss_clip": 1.04314506, + "balance_loss_mlp": 1.01665378, + "epoch": 0.9918836048818613, + "flos": 25411108999680.0, + "grad_norm": 2.751623819765739, + "language_loss": 0.58576548, + "learning_rate": 6.809390961006745e-10, + "loss": 0.60732478, + "num_input_tokens_seen": 178026850, + "step": 8249, + "time_per_iteration": 2.556936264038086 + }, + { + "auxiliary_loss_clip": 0.01139352, + "auxiliary_loss_mlp": 0.01026675, + "balance_loss_clip": 1.04494286, + "balance_loss_mlp": 1.01937902, + "epoch": 0.9920038477725005, + "flos": 25046867134080.0, + "grad_norm": 2.1872700459048615, + "language_loss": 0.68763542, + "learning_rate": 6.607653574948191e-10, + "loss": 0.70929575, + "num_input_tokens_seen": 178047630, + "step": 8250, + "time_per_iteration": 2.5221664905548096 + }, + { + "auxiliary_loss_clip": 0.01143625, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.04131937, + "balance_loss_mlp": 1.01878667, + "epoch": 0.9921240906631396, + "flos": 21829773421440.0, + "grad_norm": 1.7110142157515211, + "language_loss": 0.81643224, + "learning_rate": 6.408949170613187e-10, + "loss": 0.83812559, + "num_input_tokens_seen": 178066895, + "step": 8251, + "time_per_iteration": 2.460099697113037 + }, + { + "auxiliary_loss_clip": 0.01137904, + "auxiliary_loss_mlp": 0.01026716, + "balance_loss_clip": 1.04368043, + "balance_loss_mlp": 1.01846099, + "epoch": 0.9922443335537786, + "flos": 24864225454080.0, + "grad_norm": 2.02866047688021, + "language_loss": 0.81602097, + "learning_rate": 6.213277778144288e-10, + "loss": 0.83766711, + "num_input_tokens_seen": 178088540, + "step": 8252, + "time_per_iteration": 3.7620036602020264 + }, + { + "auxiliary_loss_clip": 0.01100475, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.0391047, + "balance_loss_mlp": 1.02083349, + "epoch": 0.9923645764444178, + "flos": 21613088626560.0, + "grad_norm": 2.302721636881168, + "language_loss": 0.66715372, + "learning_rate": 6.020639427224416e-10, + "loss": 0.68844539, + "num_input_tokens_seen": 178106185, + "step": 8253, + "time_per_iteration": 3.3820929527282715 + }, + { + "auxiliary_loss_clip": 0.01138971, + "auxiliary_loss_mlp": 0.01021947, + "balance_loss_clip": 1.0444932, + "balance_loss_mlp": 1.01475835, + "epoch": 0.9924848193350568, + "flos": 25001798544000.0, + "grad_norm": 1.9695820544885474, + "language_loss": 0.72688174, + "learning_rate": 5.831034147076864e-10, + "loss": 0.74849093, + "num_input_tokens_seen": 178123435, + "step": 8254, + "time_per_iteration": 3.3004164695739746 + }, + { + "auxiliary_loss_clip": 0.01049946, + "auxiliary_loss_mlp": 0.01000579, + "balance_loss_clip": 1.00719261, + "balance_loss_mlp": 0.99965554, + "epoch": 0.9926050622256959, + "flos": 68912543151360.0, + "grad_norm": 0.6861013532374817, + "language_loss": 0.55743039, + "learning_rate": 5.644461966463065e-10, + "loss": 0.57793564, + "num_input_tokens_seen": 178191045, + "step": 8255, + "time_per_iteration": 3.122847557067871 + }, + { + "auxiliary_loss_clip": 0.01139287, + "auxiliary_loss_mlp": 0.01020593, + "balance_loss_clip": 1.04458475, + "balance_loss_mlp": 1.0138104, + "epoch": 0.9927253051163349, + "flos": 20923675914240.0, + "grad_norm": 1.8145098134372848, + "language_loss": 0.75564885, + "learning_rate": 5.460922913687049e-10, + "loss": 0.77724767, + "num_input_tokens_seen": 178210135, + "step": 8256, + "time_per_iteration": 2.5001537799835205 + }, + { + "auxiliary_loss_clip": 0.01109775, + "auxiliary_loss_mlp": 0.00762872, + "balance_loss_clip": 1.03879619, + "balance_loss_mlp": 1.00064909, + "epoch": 0.9928455480069741, + "flos": 22308211601280.0, + "grad_norm": 2.090954555961724, + "language_loss": 0.75222409, + "learning_rate": 5.280417016593208e-10, + "loss": 0.77095056, + "num_input_tokens_seen": 178229925, + "step": 8257, + "time_per_iteration": 3.298884391784668 + }, + { + "auxiliary_loss_clip": 0.01151201, + "auxiliary_loss_mlp": 0.00761416, + "balance_loss_clip": 1.04748654, + "balance_loss_mlp": 1.00067365, + "epoch": 0.9929657908976132, + "flos": 17383889393280.0, + "grad_norm": 1.6033359434179786, + "language_loss": 0.74702382, + "learning_rate": 5.102944302559642e-10, + "loss": 0.76615, + "num_input_tokens_seen": 178247420, + "step": 8258, + "time_per_iteration": 2.443937301635742 + }, + { + "auxiliary_loss_clip": 0.01105076, + "auxiliary_loss_mlp": 0.01026748, + "balance_loss_clip": 1.04143119, + "balance_loss_mlp": 1.01902974, + "epoch": 0.9930860337882522, + "flos": 22674680110080.0, + "grad_norm": 3.081076223361422, + "language_loss": 0.7984637, + "learning_rate": 4.9285047985137e-10, + "loss": 0.81978196, + "num_input_tokens_seen": 178266840, + "step": 8259, + "time_per_iteration": 2.5939669609069824 + }, + { + "auxiliary_loss_clip": 0.01156016, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.04519081, + "balance_loss_mlp": 1.02238607, + "epoch": 0.9932062766788914, + "flos": 28147789284480.0, + "grad_norm": 1.6882326614350056, + "language_loss": 0.74299824, + "learning_rate": 4.757098530916436e-10, + "loss": 0.76485753, + "num_input_tokens_seen": 178287285, + "step": 8260, + "time_per_iteration": 2.531508445739746 + }, + { + "auxiliary_loss_clip": 0.01158553, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.04798007, + "balance_loss_mlp": 1.02098489, + "epoch": 0.9933265195695304, + "flos": 20156659868160.0, + "grad_norm": 8.497460667728078, + "language_loss": 0.77172053, + "learning_rate": 4.5887255257670563e-10, + "loss": 0.79359126, + "num_input_tokens_seen": 178304325, + "step": 8261, + "time_per_iteration": 2.4440345764160156 + }, + { + "auxiliary_loss_clip": 0.01168735, + "auxiliary_loss_mlp": 0.01026401, + "balance_loss_clip": 1.0467248, + "balance_loss_mlp": 1.01900411, + "epoch": 0.9934467624601695, + "flos": 21362037494400.0, + "grad_norm": 2.1932889956473844, + "language_loss": 0.7679503, + "learning_rate": 4.4233858086117906e-10, + "loss": 0.78990161, + "num_input_tokens_seen": 178322850, + "step": 8262, + "time_per_iteration": 2.443607807159424 + }, + { + "auxiliary_loss_clip": 0.01110683, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.04494786, + "balance_loss_mlp": 1.02061439, + "epoch": 0.9935670053508087, + "flos": 19756040503680.0, + "grad_norm": 2.1473812948528335, + "language_loss": 0.67858607, + "learning_rate": 4.261079404528356e-10, + "loss": 0.69997597, + "num_input_tokens_seen": 178342330, + "step": 8263, + "time_per_iteration": 2.564164876937866 + }, + { + "auxiliary_loss_clip": 0.01151404, + "auxiliary_loss_mlp": 0.01023516, + "balance_loss_clip": 1.04519451, + "balance_loss_mlp": 1.01579165, + "epoch": 0.9936872482414477, + "flos": 21978838863360.0, + "grad_norm": 2.008624633184713, + "language_loss": 0.68761051, + "learning_rate": 4.1018063381437205e-10, + "loss": 0.70935965, + "num_input_tokens_seen": 178362715, + "step": 8264, + "time_per_iteration": 2.4713144302368164 + }, + { + "auxiliary_loss_clip": 0.01047583, + "auxiliary_loss_mlp": 0.01003178, + "balance_loss_clip": 1.00839055, + "balance_loss_mlp": 1.00226021, + "epoch": 0.9938074911320868, + "flos": 69810667839360.0, + "grad_norm": 0.8685938692636785, + "language_loss": 0.61116558, + "learning_rate": 3.9455666336141167e-10, + "loss": 0.63167316, + "num_input_tokens_seen": 178426495, + "step": 8265, + "time_per_iteration": 3.091525077819824 + }, + { + "auxiliary_loss_clip": 0.01167284, + "auxiliary_loss_mlp": 0.01024645, + "balance_loss_clip": 1.04753327, + "balance_loss_mlp": 1.0176239, + "epoch": 0.9939277340227259, + "flos": 15084170058240.0, + "grad_norm": 2.8981061351531094, + "language_loss": 0.83039385, + "learning_rate": 3.7923603146450267e-10, + "loss": 0.85231316, + "num_input_tokens_seen": 178442555, + "step": 8266, + "time_per_iteration": 2.3886373043060303 + }, + { + "auxiliary_loss_clip": 0.01126853, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.0409863, + "balance_loss_mlp": 1.0195148, + "epoch": 0.994047976913365, + "flos": 17712364291200.0, + "grad_norm": 1.8693809162376847, + "language_loss": 0.80595827, + "learning_rate": 3.642187404473418e-10, + "loss": 0.82749808, + "num_input_tokens_seen": 178460715, + "step": 8267, + "time_per_iteration": 2.512758731842041 + }, + { + "auxiliary_loss_clip": 0.01152536, + "auxiliary_loss_mlp": 0.01019192, + "balance_loss_clip": 1.04352975, + "balance_loss_mlp": 1.01196146, + "epoch": 0.994168219804004, + "flos": 19171558396800.0, + "grad_norm": 2.1092569176172185, + "language_loss": 0.85964561, + "learning_rate": 3.495047925885508e-10, + "loss": 0.88136286, + "num_input_tokens_seen": 178479050, + "step": 8268, + "time_per_iteration": 2.4375224113464355 + }, + { + "auxiliary_loss_clip": 0.01137197, + "auxiliary_loss_mlp": 0.01026073, + "balance_loss_clip": 1.04234719, + "balance_loss_mlp": 1.01846099, + "epoch": 0.9942884626946432, + "flos": 17851589406720.0, + "grad_norm": 2.140988269110184, + "language_loss": 0.82836187, + "learning_rate": 3.350941901199e-10, + "loss": 0.84999454, + "num_input_tokens_seen": 178495970, + "step": 8269, + "time_per_iteration": 2.451788902282715 + }, + { + "auxiliary_loss_clip": 0.01143571, + "auxiliary_loss_mlp": 0.01025051, + "balance_loss_clip": 1.04467583, + "balance_loss_mlp": 1.01773787, + "epoch": 0.9944087055852823, + "flos": 18796578364800.0, + "grad_norm": 3.1622171892068143, + "language_loss": 0.83346772, + "learning_rate": 3.2098693522764066e-10, + "loss": 0.85515392, + "num_input_tokens_seen": 178509170, + "step": 8270, + "time_per_iteration": 2.451991319656372 + }, + { + "auxiliary_loss_clip": 0.01145614, + "auxiliary_loss_mlp": 0.00762139, + "balance_loss_clip": 1.04514551, + "balance_loss_mlp": 1.00060701, + "epoch": 0.9945289484759213, + "flos": 20996969616000.0, + "grad_norm": 2.1230273283877716, + "language_loss": 0.81242931, + "learning_rate": 3.071830300516165e-10, + "loss": 0.83150685, + "num_input_tokens_seen": 178527000, + "step": 8271, + "time_per_iteration": 2.488334894180298 + }, + { + "auxiliary_loss_clip": 0.01159943, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.04639077, + "balance_loss_mlp": 1.01909804, + "epoch": 0.9946491913665605, + "flos": 14756952136320.0, + "grad_norm": 2.5919184646275957, + "language_loss": 0.70764506, + "learning_rate": 2.9368247668615234e-10, + "loss": 0.72951311, + "num_input_tokens_seen": 178545590, + "step": 8272, + "time_per_iteration": 2.463449239730835 + }, + { + "auxiliary_loss_clip": 0.01172017, + "auxiliary_loss_mlp": 0.01026925, + "balance_loss_clip": 1.04856467, + "balance_loss_mlp": 1.0193193, + "epoch": 0.9947694342571995, + "flos": 12669931186560.0, + "grad_norm": 2.779523124531342, + "language_loss": 0.61325175, + "learning_rate": 2.804852771789434e-10, + "loss": 0.63524115, + "num_input_tokens_seen": 178558890, + "step": 8273, + "time_per_iteration": 2.3740410804748535 + }, + { + "auxiliary_loss_clip": 0.01164315, + "auxiliary_loss_mlp": 0.01025442, + "balance_loss_clip": 1.04563022, + "balance_loss_mlp": 1.01824784, + "epoch": 0.9948896771478386, + "flos": 18843442634880.0, + "grad_norm": 1.9596610401899996, + "language_loss": 0.55768418, + "learning_rate": 2.675914335321661e-10, + "loss": 0.57958174, + "num_input_tokens_seen": 178577645, + "step": 8274, + "time_per_iteration": 2.397969961166382 + }, + { + "auxiliary_loss_clip": 0.01159349, + "auxiliary_loss_mlp": 0.01024501, + "balance_loss_clip": 1.04631507, + "balance_loss_mlp": 1.01620376, + "epoch": 0.9950099200384778, + "flos": 24900207903360.0, + "grad_norm": 2.1227658225396455, + "language_loss": 0.78518879, + "learning_rate": 2.550009477018111e-10, + "loss": 0.80702728, + "num_input_tokens_seen": 178596415, + "step": 8275, + "time_per_iteration": 2.49664306640625 + }, + { + "auxiliary_loss_clip": 0.01139205, + "auxiliary_loss_mlp": 0.00762298, + "balance_loss_clip": 1.04511118, + "balance_loss_mlp": 1.00062513, + "epoch": 0.9951301629291168, + "flos": 23733613987200.0, + "grad_norm": 2.385834997626654, + "language_loss": 0.63191015, + "learning_rate": 2.4271382159790634e-10, + "loss": 0.65092516, + "num_input_tokens_seen": 178613845, + "step": 8276, + "time_per_iteration": 2.5168216228485107 + }, + { + "auxiliary_loss_clip": 0.01110905, + "auxiliary_loss_mlp": 0.01028412, + "balance_loss_clip": 1.04250872, + "balance_loss_mlp": 1.02106595, + "epoch": 0.9952504058197559, + "flos": 22236893147520.0, + "grad_norm": 1.7382875597930345, + "language_loss": 0.86035007, + "learning_rate": 2.3073005708429406e-10, + "loss": 0.88174325, + "num_input_tokens_seen": 178633490, + "step": 8277, + "time_per_iteration": 2.593047618865967 + }, + { + "auxiliary_loss_clip": 0.01125167, + "auxiliary_loss_mlp": 0.01021549, + "balance_loss_clip": 1.046332, + "balance_loss_mlp": 1.01458132, + "epoch": 0.995370648710395, + "flos": 21211032718080.0, + "grad_norm": 2.1491285969986142, + "language_loss": 0.72149444, + "learning_rate": 2.190496559788535e-10, + "loss": 0.74296165, + "num_input_tokens_seen": 178651775, + "step": 8278, + "time_per_iteration": 2.522277593612671 + }, + { + "auxiliary_loss_clip": 0.01138744, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.04508591, + "balance_loss_mlp": 1.02176321, + "epoch": 0.9954908916010341, + "flos": 14866731077760.0, + "grad_norm": 3.588224508547678, + "language_loss": 0.76549459, + "learning_rate": 2.0767262005372265e-10, + "loss": 0.78717113, + "num_input_tokens_seen": 178669290, + "step": 8279, + "time_per_iteration": 3.2960801124572754 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01023668, + "balance_loss_clip": 1.04286158, + "balance_loss_mlp": 1.01710856, + "epoch": 0.9956111344916732, + "flos": 19208259118080.0, + "grad_norm": 2.2338680566778093, + "language_loss": 0.75648141, + "learning_rate": 1.965989510346322e-10, + "loss": 0.77803946, + "num_input_tokens_seen": 178688410, + "step": 8280, + "time_per_iteration": 3.3436014652252197 + }, + { + "auxiliary_loss_clip": 0.01105971, + "auxiliary_loss_mlp": 0.01026634, + "balance_loss_clip": 1.04099989, + "balance_loss_mlp": 1.01846266, + "epoch": 0.9957313773823123, + "flos": 20047060494720.0, + "grad_norm": 2.0046026175872353, + "language_loss": 0.70604873, + "learning_rate": 1.8582865060134955e-10, + "loss": 0.72737479, + "num_input_tokens_seen": 178706600, + "step": 8281, + "time_per_iteration": 3.2476279735565186 + }, + { + "auxiliary_loss_clip": 0.01063296, + "auxiliary_loss_mlp": 0.01001106, + "balance_loss_clip": 1.00747359, + "balance_loss_mlp": 1.00010455, + "epoch": 0.9958516202729514, + "flos": 57483253768320.0, + "grad_norm": 0.8454102904722606, + "language_loss": 0.55700755, + "learning_rate": 1.7536172038790098e-10, + "loss": 0.57765156, + "num_input_tokens_seen": 178766910, + "step": 8282, + "time_per_iteration": 3.0720016956329346 + }, + { + "auxiliary_loss_clip": 0.01141016, + "auxiliary_loss_mlp": 0.01023468, + "balance_loss_clip": 1.04640603, + "balance_loss_mlp": 1.01619065, + "epoch": 0.9959718631635904, + "flos": 27782900974080.0, + "grad_norm": 2.026608207870529, + "language_loss": 0.69481134, + "learning_rate": 1.651981619819054e-10, + "loss": 0.71645623, + "num_input_tokens_seen": 178784060, + "step": 8283, + "time_per_iteration": 2.5934648513793945 + }, + { + "auxiliary_loss_clip": 0.01117447, + "auxiliary_loss_mlp": 0.01023288, + "balance_loss_clip": 1.04254365, + "balance_loss_mlp": 1.01576328, + "epoch": 0.9960921060542296, + "flos": 24024095274240.0, + "grad_norm": 2.5940161612722927, + "language_loss": 0.70290983, + "learning_rate": 1.5533797692546257e-10, + "loss": 0.72431719, + "num_input_tokens_seen": 178802795, + "step": 8284, + "time_per_iteration": 3.311424493789673 + }, + { + "auxiliary_loss_clip": 0.01148962, + "auxiliary_loss_mlp": 0.01021424, + "balance_loss_clip": 1.04305542, + "balance_loss_mlp": 1.01388144, + "epoch": 0.9962123489448687, + "flos": 18697393935360.0, + "grad_norm": 2.1954076979305617, + "language_loss": 0.83746666, + "learning_rate": 1.4578116671404296e-10, + "loss": 0.8591705, + "num_input_tokens_seen": 178821075, + "step": 8285, + "time_per_iteration": 2.429844617843628 + }, + { + "auxiliary_loss_clip": 0.01149614, + "auxiliary_loss_mlp": 0.01028716, + "balance_loss_clip": 1.04686165, + "balance_loss_mlp": 1.02129519, + "epoch": 0.9963325918355077, + "flos": 20010754823040.0, + "grad_norm": 2.5913083275624307, + "language_loss": 0.71280408, + "learning_rate": 1.3652773279759777e-10, + "loss": 0.73458731, + "num_input_tokens_seen": 178837725, + "step": 8286, + "time_per_iteration": 2.4406073093414307 + }, + { + "auxiliary_loss_clip": 0.01152671, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.04445612, + "balance_loss_mlp": 1.02154148, + "epoch": 0.9964528347261468, + "flos": 33108488991360.0, + "grad_norm": 1.6307409904728571, + "language_loss": 0.63058066, + "learning_rate": 1.2757767657989305e-10, + "loss": 0.65239596, + "num_input_tokens_seen": 178861515, + "step": 8287, + "time_per_iteration": 2.5597989559173584 + }, + { + "auxiliary_loss_clip": 0.01152427, + "auxiliary_loss_mlp": 0.01022734, + "balance_loss_clip": 1.0458076, + "balance_loss_mlp": 1.01574242, + "epoch": 0.9965730776167859, + "flos": 23109342589440.0, + "grad_norm": 1.9753063735360898, + "language_loss": 0.86788201, + "learning_rate": 1.1893099941850948e-10, + "loss": 0.88963366, + "num_input_tokens_seen": 178880410, + "step": 8288, + "time_per_iteration": 2.469120979309082 + }, + { + "auxiliary_loss_clip": 0.01142016, + "auxiliary_loss_mlp": 0.01026841, + "balance_loss_clip": 1.0412364, + "balance_loss_mlp": 1.01922977, + "epoch": 0.996693320507425, + "flos": 22965843755520.0, + "grad_norm": 3.4868697694599273, + "language_loss": 0.77714133, + "learning_rate": 1.105877026252866e-10, + "loss": 0.79882991, + "num_input_tokens_seen": 178898740, + "step": 8289, + "time_per_iteration": 2.486619472503662 + }, + { + "auxiliary_loss_clip": 0.01169205, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.04634356, + "balance_loss_mlp": 1.01892304, + "epoch": 0.996813563398064, + "flos": 13222740476160.0, + "grad_norm": 2.605570291837353, + "language_loss": 0.7202673, + "learning_rate": 1.0254778746565663e-10, + "loss": 0.74222952, + "num_input_tokens_seen": 178914015, + "step": 8290, + "time_per_iteration": 2.367690086364746 + }, + { + "auxiliary_loss_clip": 0.01124073, + "auxiliary_loss_mlp": 0.01029945, + "balance_loss_clip": 1.04394031, + "balance_loss_mlp": 1.02251554, + "epoch": 0.9969338062887032, + "flos": 14647855553280.0, + "grad_norm": 2.2685300841543663, + "language_loss": 0.73424971, + "learning_rate": 9.481125515953259e-11, + "loss": 0.75578988, + "num_input_tokens_seen": 178932075, + "step": 8291, + "time_per_iteration": 2.577449083328247 + }, + { + "auxiliary_loss_clip": 0.01110914, + "auxiliary_loss_mlp": 0.01026456, + "balance_loss_clip": 1.03790665, + "balance_loss_mlp": 1.01883876, + "epoch": 0.9970540491793423, + "flos": 25735741142400.0, + "grad_norm": 1.8821016106515884, + "language_loss": 0.79751408, + "learning_rate": 8.737810688064228e-11, + "loss": 0.81888783, + "num_input_tokens_seen": 178951910, + "step": 8292, + "time_per_iteration": 2.585144519805908 + }, + { + "auxiliary_loss_clip": 0.01120336, + "auxiliary_loss_mlp": 0.01030097, + "balance_loss_clip": 1.04196119, + "balance_loss_mlp": 1.02082264, + "epoch": 0.9971742920699813, + "flos": 21470236237440.0, + "grad_norm": 8.211445806854316, + "language_loss": 0.7872864, + "learning_rate": 8.024834375608414e-11, + "loss": 0.80879074, + "num_input_tokens_seen": 178970500, + "step": 8293, + "time_per_iteration": 2.504213571548462 + }, + { + "auxiliary_loss_clip": 0.01063244, + "auxiliary_loss_mlp": 0.01001021, + "balance_loss_clip": 1.00749946, + "balance_loss_mlp": 1.00003719, + "epoch": 0.9972945349606205, + "flos": 72211223629440.0, + "grad_norm": 0.8154591824814386, + "language_loss": 0.6272887, + "learning_rate": 7.342196686788149e-11, + "loss": 0.64793134, + "num_input_tokens_seen": 179023665, + "step": 8294, + "time_per_iteration": 2.8936257362365723 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.01026361, + "balance_loss_clip": 1.04637837, + "balance_loss_mlp": 1.01925027, + "epoch": 0.9974147778512595, + "flos": 19678293515520.0, + "grad_norm": 1.9956350637904627, + "language_loss": 0.68806058, + "learning_rate": 6.689897725142834e-11, + "loss": 0.70969087, + "num_input_tokens_seen": 179043140, + "step": 8295, + "time_per_iteration": 2.4997565746307373 + }, + { + "auxiliary_loss_clip": 0.0114093, + "auxiliary_loss_mlp": 0.01026345, + "balance_loss_clip": 1.04359174, + "balance_loss_mlp": 1.01894224, + "epoch": 0.9975350207418986, + "flos": 15960821391360.0, + "grad_norm": 2.383925902900279, + "language_loss": 0.88485873, + "learning_rate": 6.067937589615545e-11, + "loss": 0.90653145, + "num_input_tokens_seen": 179061215, + "step": 8296, + "time_per_iteration": 2.531463861465454 + }, + { + "auxiliary_loss_clip": 0.0104655, + "auxiliary_loss_mlp": 0.01001485, + "balance_loss_clip": 1.00854683, + "balance_loss_mlp": 1.00061476, + "epoch": 0.9976552636325378, + "flos": 59961879768960.0, + "grad_norm": 0.7558585747896637, + "language_loss": 0.57671577, + "learning_rate": 5.476316374575241e-11, + "loss": 0.59719616, + "num_input_tokens_seen": 179124700, + "step": 8297, + "time_per_iteration": 3.083962917327881 + }, + { + "auxiliary_loss_clip": 0.01169955, + "auxiliary_loss_mlp": 0.01023741, + "balance_loss_clip": 1.047786, + "balance_loss_mlp": 1.0155158, + "epoch": 0.9977755065231768, + "flos": 22487872452480.0, + "grad_norm": 1.9303269260897653, + "language_loss": 0.72446823, + "learning_rate": 4.9150341697723476e-11, + "loss": 0.74640524, + "num_input_tokens_seen": 179144590, + "step": 8298, + "time_per_iteration": 2.4544506072998047 + }, + { + "auxiliary_loss_clip": 0.01136778, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.04561353, + "balance_loss_mlp": 1.02261424, + "epoch": 0.9978957494138159, + "flos": 26030280666240.0, + "grad_norm": 1.4955247522580175, + "language_loss": 0.66283566, + "learning_rate": 4.384091060338768e-11, + "loss": 0.68450773, + "num_input_tokens_seen": 179165060, + "step": 8299, + "time_per_iteration": 2.5299713611602783 + }, + { + "auxiliary_loss_clip": 0.01150731, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.04537475, + "balance_loss_mlp": 1.01938534, + "epoch": 0.998015992304455, + "flos": 22637835734400.0, + "grad_norm": 2.2246990808447635, + "language_loss": 0.73276329, + "learning_rate": 3.883487126810081e-11, + "loss": 0.75454026, + "num_input_tokens_seen": 179184320, + "step": 8300, + "time_per_iteration": 2.4876785278320312 + }, + { + "auxiliary_loss_clip": 0.011437, + "auxiliary_loss_mlp": 0.01025224, + "balance_loss_clip": 1.04303217, + "balance_loss_mlp": 1.01774371, + "epoch": 0.9981362351950941, + "flos": 18223444955520.0, + "grad_norm": 1.7985351304142945, + "language_loss": 0.79391563, + "learning_rate": 3.41322244516995e-11, + "loss": 0.81560487, + "num_input_tokens_seen": 179202265, + "step": 8301, + "time_per_iteration": 2.4474427700042725 + }, + { + "auxiliary_loss_clip": 0.01095714, + "auxiliary_loss_mlp": 0.01024486, + "balance_loss_clip": 1.03924632, + "balance_loss_mlp": 1.01702642, + "epoch": 0.9982564780857331, + "flos": 33474095573760.0, + "grad_norm": 1.7375110068190003, + "language_loss": 0.62998229, + "learning_rate": 2.9732970866946925e-11, + "loss": 0.65118432, + "num_input_tokens_seen": 179222145, + "step": 8302, + "time_per_iteration": 2.6560375690460205 + }, + { + "auxiliary_loss_clip": 0.01111225, + "auxiliary_loss_mlp": 0.01024487, + "balance_loss_clip": 1.03839111, + "balance_loss_mlp": 1.01647568, + "epoch": 0.9983767209763723, + "flos": 15523465392000.0, + "grad_norm": 12.982498054181093, + "language_loss": 0.77706778, + "learning_rate": 2.563711118175327e-11, + "loss": 0.7984249, + "num_input_tokens_seen": 179239030, + "step": 8303, + "time_per_iteration": 2.5360257625579834 + }, + { + "auxiliary_loss_clip": 0.01123494, + "auxiliary_loss_mlp": 0.01026342, + "balance_loss_clip": 1.04504108, + "balance_loss_mlp": 1.01884079, + "epoch": 0.9984969638670114, + "flos": 19974377324160.0, + "grad_norm": 1.8569844469931258, + "language_loss": 0.83655494, + "learning_rate": 2.184464601717728e-11, + "loss": 0.85805333, + "num_input_tokens_seen": 179257345, + "step": 8304, + "time_per_iteration": 2.5932915210723877 + }, + { + "auxiliary_loss_clip": 0.01158963, + "auxiliary_loss_mlp": 0.01025912, + "balance_loss_clip": 1.04854941, + "balance_loss_mlp": 1.01821685, + "epoch": 0.9986172067576504, + "flos": 20375750874240.0, + "grad_norm": 2.4128369071775895, + "language_loss": 0.77390945, + "learning_rate": 1.8355575948758585e-11, + "loss": 0.79575825, + "num_input_tokens_seen": 179275330, + "step": 8305, + "time_per_iteration": 2.439441680908203 + }, + { + "auxiliary_loss_clip": 0.0113773, + "auxiliary_loss_mlp": 0.01025571, + "balance_loss_clip": 1.04166389, + "balance_loss_mlp": 1.01780415, + "epoch": 0.9987374496482896, + "flos": 23727903724800.0, + "grad_norm": 1.9344777736895724, + "language_loss": 0.72889698, + "learning_rate": 1.5169901505407424e-11, + "loss": 0.75052989, + "num_input_tokens_seen": 179292395, + "step": 8306, + "time_per_iteration": 3.233381509780884 + }, + { + "auxiliary_loss_clip": 0.01138721, + "auxiliary_loss_mlp": 0.01021239, + "balance_loss_clip": 1.04519188, + "balance_loss_mlp": 1.01405644, + "epoch": 0.9988576925389286, + "flos": 25044029959680.0, + "grad_norm": 1.8641656673416098, + "language_loss": 0.73693192, + "learning_rate": 1.228762317073695e-11, + "loss": 0.75853145, + "num_input_tokens_seen": 179311225, + "step": 8307, + "time_per_iteration": 4.0200300216674805 + }, + { + "auxiliary_loss_clip": 0.01137334, + "auxiliary_loss_mlp": 0.01024149, + "balance_loss_clip": 1.04335272, + "balance_loss_mlp": 1.0168829, + "epoch": 0.9989779354295677, + "flos": 31285627637760.0, + "grad_norm": 1.8264754622270722, + "language_loss": 0.78881001, + "learning_rate": 9.70874138195299e-12, + "loss": 0.8104248, + "num_input_tokens_seen": 179333135, + "step": 8308, + "time_per_iteration": 2.558802366256714 + }, + { + "auxiliary_loss_clip": 0.01168325, + "auxiliary_loss_mlp": 0.01023342, + "balance_loss_clip": 1.04597247, + "balance_loss_mlp": 1.01616597, + "epoch": 0.9990981783202069, + "flos": 19573398823680.0, + "grad_norm": 3.9703300332941294, + "language_loss": 0.74527323, + "learning_rate": 7.433256530076093e-12, + "loss": 0.76718998, + "num_input_tokens_seen": 179353090, + "step": 8309, + "time_per_iteration": 2.425372838973999 + }, + { + "auxiliary_loss_clip": 0.01114758, + "auxiliary_loss_mlp": 0.01022589, + "balance_loss_clip": 1.04025328, + "balance_loss_mlp": 1.01589561, + "epoch": 0.9992184212108459, + "flos": 17199667514880.0, + "grad_norm": 2.2408095689159055, + "language_loss": 0.75648904, + "learning_rate": 5.46116896038562e-12, + "loss": 0.77786249, + "num_input_tokens_seen": 179367500, + "step": 8310, + "time_per_iteration": 2.5084097385406494 + }, + { + "auxiliary_loss_clip": 0.01134156, + "auxiliary_loss_mlp": 0.01026309, + "balance_loss_clip": 1.04283786, + "balance_loss_mlp": 1.01898623, + "epoch": 0.999338664101485, + "flos": 46497853681920.0, + "grad_norm": 1.986742554785348, + "language_loss": 0.61802042, + "learning_rate": 3.792478972197699e-12, + "loss": 0.63962507, + "num_input_tokens_seen": 179388085, + "step": 8311, + "time_per_iteration": 3.397172212600708 + }, + { + "auxiliary_loss_clip": 0.01165116, + "auxiliary_loss_mlp": 0.01019533, + "balance_loss_clip": 1.04483366, + "balance_loss_mlp": 1.01220798, + "epoch": 0.9994589069921241, + "flos": 15158253859200.0, + "grad_norm": 2.4531028822860863, + "language_loss": 0.70041096, + "learning_rate": 2.4271868181990895e-12, + "loss": 0.72225738, + "num_input_tokens_seen": 179405250, + "step": 8312, + "time_per_iteration": 2.409008026123047 + }, + { + "auxiliary_loss_clip": 0.01154574, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.0449214, + "balance_loss_mlp": 1.01814449, + "epoch": 0.9995791498827632, + "flos": 12531460256640.0, + "grad_norm": 2.2859681715320592, + "language_loss": 0.80857158, + "learning_rate": 1.3652927060014973e-12, + "loss": 0.83037543, + "num_input_tokens_seen": 179420845, + "step": 8313, + "time_per_iteration": 2.4202191829681396 + }, + { + "auxiliary_loss_clip": 0.01128153, + "auxiliary_loss_mlp": 0.01027172, + "balance_loss_clip": 1.04265857, + "balance_loss_mlp": 1.01920319, + "epoch": 0.9996993927734023, + "flos": 19245175320960.0, + "grad_norm": 4.967788661839031, + "language_loss": 0.63895774, + "learning_rate": 6.067967965872612e-13, + "loss": 0.66051102, + "num_input_tokens_seen": 179440455, + "step": 8314, + "time_per_iteration": 2.492452621459961 + }, + { + "auxiliary_loss_clip": 0.01125633, + "auxiliary_loss_mlp": 0.01026475, + "balance_loss_clip": 1.04511559, + "balance_loss_mlp": 1.01926327, + "epoch": 0.9998196356640414, + "flos": 62952804518400.0, + "grad_norm": 1.7134160026445546, + "language_loss": 0.77040625, + "learning_rate": 1.5169920497548615e-13, + "loss": 0.79192734, + "num_input_tokens_seen": 179465075, + "step": 8315, + "time_per_iteration": 2.8896384239196777 + }, + { + "auxiliary_loss_clip": 0.01107275, + "auxiliary_loss_mlp": 0.010158, + "balance_loss_clip": 1.02629113, + "balance_loss_mlp": 1.01144695, + "epoch": 0.9999398785546805, + "flos": 50922375073920.0, + "grad_norm": 1.1256922983541644, + "language_loss": 0.54950082, + "learning_rate": 0.0, + "loss": 0.57073164, + "num_input_tokens_seen": 179513955, + "step": 8316, + "time_per_iteration": 3.046828508377075 + }, + { + "epoch": 0.9999398785546805, + "num_input_tokens_seen": 179513955, + "step": 8316, + "total_flos": 6.996749092776837e+17, + "train_loss": 0.7888866581198343, + "train_runtime": 23328.2862, + "train_samples_per_second": 14.259, + "train_steps_per_second": 0.356 + } + ], + "logging_steps": 1.0, + "max_steps": 8316, + "num_input_tokens_seen": 179513955, + "num_train_epochs": 1, + "save_steps": 1664, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.996749092776837e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}